Diffstat (limited to 'xlators/performance')
51 files changed, 17544 insertions, 11364 deletions
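Most of the io-cache changes in this diff combine a tree-wide reindentation with a move from free-form gf_log() calls to structured gf_smsg() logging keyed by per-component message IDs, which the new io-cache-messages.h declares. The condensed sketch below illustrates that pattern as it appears in the hunks that follow; it lists only two of the IDs the patch actually declares, and ioc_example_log() is a hypothetical helper added purely for illustration, not part of the patch.

/* Editorial sketch, not part of the patch: condensed io-cache logging pattern. */
#include <errno.h>
#include <glusterfs/glfs-message-id.h>
#include <glusterfs/logging.h>
#include <glusterfs/xlator.h>

/* Declare the component's message IDs once; the list is append-only so an
 * ID is never reused for a different message. */
GLFS_MSGID(IO_CACHE, IO_CACHE_MSG_NO_MEMORY, IO_CACHE_MSG_SET_FD_FAILED);

/* Each ID is paired with a fixed message string via a matching _STR macro. */
#define IO_CACHE_MSG_NO_MEMORY_STR "out of memory"
#define IO_CACHE_MSG_SET_FD_FAILED_STR "failed to set fd ctx"

/* Hypothetical helper (illustration only). The old style passed the whole
 * message as a format string, e.g.
 *     gf_log (this->name, GF_LOG_ERROR, "out of memory");
 * the new style passes an errno value, a message ID and optional
 * NULL-terminated key=value arguments. */
static void
ioc_example_log(xlator_t *this, const char *path)
{
    gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL);
    gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, IO_CACHE_MSG_SET_FD_FAILED,
            "path=%s", path, NULL);
}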
diff --git a/xlators/performance/Makefile.am b/xlators/performance/Makefile.am
index f99e11829db..e95725acb8c 100644
--- a/xlators/performance/Makefile.am
+++ b/xlators/performance/Makefile.am
@@ -1,3 +1,4 @@
-SUBDIRS = write-behind read-ahead io-threads io-cache symlink-cache quick-read md-cache open-behind
+SUBDIRS = write-behind read-ahead readdir-ahead io-threads io-cache \
+ quick-read md-cache open-behind nl-cache
 
 CLEANFILES =
diff --git a/xlators/performance/io-cache/src/Makefile.am b/xlators/performance/io-cache/src/Makefile.am
index 155be9988c9..bfa34ce5502 100644
--- a/xlators/performance/io-cache/src/Makefile.am
+++ b/xlators/performance/io-cache/src/Makefile.am
@@ -1,14 +1,15 @@
 xlator_LTLIBRARIES = io-cache.la
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
 
-io_cache_la_LDFLAGS = -module -avoid-version
+io_cache_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 
 io_cache_la_SOURCES = io-cache.c page.c ioc-inode.c
 io_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
 
-noinst_HEADERS = io-cache.h ioc-mem-types.h
+noinst_HEADERS = io-cache.h ioc-mem-types.h io-cache-messages.h
 
 AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
 -I$(CONTRIBDIR)/rbtree
 
 AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/performance/io-cache/src/io-cache-messages.h b/xlators/performance/io-cache/src/io-cache-messages.h
new file mode 100644
index 00000000000..38ad0b14d0e
--- /dev/null
+++ b/xlators/performance/io-cache/src/io-cache-messages.h
@@ -0,0 +1,69 @@
+/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _IO_CACHE_MESSAGES_H_
+#define _IO_CACHE_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(IO_CACHE, IO_CACHE_MSG_ENFORCEMENT_FAILED,
+ IO_CACHE_MSG_INVALID_ARGUMENT,
+ IO_CACHE_MSG_XLATOR_CHILD_MISCONFIGURED, IO_CACHE_MSG_NO_MEMORY,
+ IO_CACHE_MSG_VOL_MISCONFIGURED, IO_CACHE_MSG_INODE_NULL,
+ IO_CACHE_MSG_PAGE_WAIT_VALIDATE, IO_CACHE_MSG_STR_COVERSION_FAILED,
+ IO_CACHE_MSG_WASTED_COPY, IO_CACHE_MSG_SET_FD_FAILED,
+ IO_CACHE_MSG_TABLE_NULL, IO_CACHE_MSG_MEMORY_INIT_FAILED,
+ IO_CACHE_MSG_NO_CACHE_SIZE_OPT, IO_CACHE_MSG_NOT_RECONFIG_CACHE_SIZE,
+ IO_CACHE_MSG_CREATE_MEM_POOL_FAILED,
+ IO_CACHE_MSG_ALLOC_MEM_POOL_FAILED, IO_CACHE_MSG_NULL_PAGE_WAIT,
+ IO_CACHE_MSG_FRAME_NULL, IO_CACHE_MSG_PAGE_FAULT,
+ IO_CACHE_MSG_SERVE_READ_REQUEST, IO_CACHE_MSG_LOCAL_NULL,
+ IO_CACHE_MSG_DEFAULTING_TO_OLD);
+
+#define IO_CACHE_MSG_NO_MEMORY_STR "out of memory"
+#define IO_CACHE_MSG_ENFORCEMENT_FAILED_STR "inode context is NULL"
+#define IO_CACHE_MSG_SET_FD_FAILED_STR "failed to set fd ctx"
+#define IO_CACHE_MSG_TABLE_NULL_STR "table is NULL"
+#define IO_CACHE_MSG_MEMORY_INIT_FAILED_STR "Memory accounting init failed"
+#define IO_CACHE_MSG_NO_CACHE_SIZE_OPT_STR "could not get cache-size option"
+#define IO_CACHE_MSG_INVALID_ARGUMENT_STR \
+ "file size is greater than the max size"
+#define IO_CACHE_MSG_NOT_RECONFIG_CACHE_SIZE_STR "Not reconfiguring cache-size"
+#define IO_CACHE_MSG_XLATOR_CHILD_MISCONFIGURED_STR \
+ "FATAL: io-cache not configured with exactly one child"
+#define IO_CACHE_MSG_VOL_MISCONFIGURED_STR "dangling volume. check volfile"
+#define IO_CACHE_MSG_CREATE_MEM_POOL_FAILED_STR \
+ "failed to create local_t's memory pool"
+#define IO_CACHE_MSG_ALLOC_MEM_POOL_FAILED_STR "Unable to allocate mem_pool"
+#define IO_CACHE_MSG_STR_COVERSION_FAILED_STR \
+ "asprintf failed while converting prt to str"
+#define IO_CACHE_MSG_INODE_NULL_STR "ioc_inode is NULL"
+#define IO_CACHE_MSG_PAGE_WAIT_VALIDATE_STR \
+ "cache validate called without any page waiting to be validated"
+#define IO_CACHE_MSG_NULL_PAGE_WAIT_STR "asked to wait on a NULL page"
+#define IO_CACHE_MSG_WASTED_COPY_STR "wasted copy"
+#define IO_CACHE_MSG_FRAME_NULL_STR "frame>root>rsp_refs is null"
+#define IO_CACHE_MSG_PAGE_FAULT_STR "page fault on a NULL frame"
+#define IO_CACHE_MSG_SERVE_READ_REQUEST_STR \
+ "NULL page has been provided to serve read request"
+#define IO_CACHE_MSG_LOCAL_NULL_STR "local is NULL"
+#define IO_CACHE_MSG_DEFAULTING_TO_OLD_STR \
+ "minimum size of file that can be cached is greater than maximum size. " \
+ "Hence Defaulting to old value"
+#endif /* _IO_CACHE_MESSAGES_H_ */
diff --git a/xlators/performance/io-cache/src/io-cache.c b/xlators/performance/io-cache/src/io-cache.c
index f56a574564a..9375d29c17f 100644
--- a/xlators/performance/io-cache/src/io-cache.c
+++ b/xlators/performance/io-cache/src/io-cache.c
@@ -8,40 +8,38 @@ cases as published by the Free Software Foundation.
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" +#include <math.h> +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> #include "io-cache.h" #include "ioc-mem-types.h" -#include "statedump.h" +#include <glusterfs/statedump.h> #include <assert.h> #include <sys/time.h> - +#include "io-cache-messages.h" int ioc_log2_page_size; uint32_t -ioc_get_priority (ioc_table_t *table, const char *path); +ioc_get_priority(ioc_table_t *table, const char *path); struct volume_options options[]; - -static inline uint32_t -ioc_hashfn (void *data, int len) +static uint32_t +ioc_hashfn(void *data, int len) { - off_t offset; + off_t offset; - offset = *(off_t *) data; + offset = *(off_t *)data; - return (offset >> ioc_log2_page_size); + return (offset >> ioc_log2_page_size); } -static inline ioc_inode_t * +/* TODO: This function is not used, uncomment when we find a + usage for this function. + +static ioc_inode_t * ioc_inode_reupdate (ioc_inode_t *ioc_inode) { ioc_table_t *table = NULL; @@ -54,7 +52,8 @@ ioc_inode_reupdate (ioc_inode_t *ioc_inode) return ioc_inode; } -static inline ioc_inode_t * + +static ioc_inode_t * ioc_get_inode (dict_t *dict, char *name) { ioc_inode_t *ioc_inode = NULL; @@ -77,22 +76,74 @@ ioc_get_inode (dict_t *dict, char *name) return ioc_inode; } +*/ -int32_t -ioc_inode_need_revalidate (ioc_inode_t *ioc_inode) +int +ioc_update_pages(call_frame_t *frame, ioc_inode_t *ioc_inode, + struct iovec *vector, int32_t count, int op_ret, off_t offset) { - int8_t need_revalidate = 0; - struct timeval tv = {0,}; - ioc_table_t *table = NULL; + size_t size = 0; + off_t rounded_offset = 0, rounded_end = 0, trav_offset = 0, + write_offset = 0; + off_t page_offset = 0, page_end = 0; + ioc_page_t *trav = NULL; - table = ioc_inode->table; + size = iov_length(vector, count); + size = min(size, op_ret); - gettimeofday (&tv, NULL); + rounded_offset = gf_floor(offset, ioc_inode->table->page_size); + rounded_end = gf_roof(offset + size, ioc_inode->table->page_size); - if (time_elapsed (&tv, &ioc_inode->cache.tv) >= table->cache_timeout) - need_revalidate = 1; + trav_offset = rounded_offset; + ioc_inode_lock(ioc_inode); + { + while (trav_offset < rounded_end) { + trav = __ioc_page_get(ioc_inode, trav_offset); + if (trav && trav->ready) { + if (trav_offset == rounded_offset) + page_offset = offset - rounded_offset; + else + page_offset = 0; + + if ((trav_offset + ioc_inode->table->page_size) >= + rounded_end) { + page_end = trav->size - (rounded_end - (offset + size)); + } else { + page_end = trav->size; + } + + iov_range_copy(trav->vector, trav->count, page_offset, vector, + count, write_offset, page_end - page_offset); + } else if (trav) { + if (!trav->waitq) + ioc_inode->table->cache_used -= __ioc_page_destroy(trav); + } + + if (trav_offset == rounded_offset) + write_offset += (ioc_inode->table->page_size - + (offset - rounded_offset)); + else + write_offset += ioc_inode->table->page_size; + + trav_offset += ioc_inode->table->page_size; + } + } + ioc_inode_unlock(ioc_inode); - return need_revalidate; + return 0; +} + +static gf_boolean_t +ioc_inode_need_revalidate(ioc_inode_t *ioc_inode) +{ + ioc_table_t *table = NULL; + + GF_ASSERT(ioc_inode); + table = ioc_inode->table; + GF_ASSERT(table); + + return (gf_time() - ioc_inode->cache.last_revalidate >= + table->cache_timeout); } /* @@ -103,193 +154,201 @@ ioc_inode_need_revalidate 
(ioc_inode_t *ioc_inode) * assumes lock is held */ int64_t -__ioc_inode_flush (ioc_inode_t *ioc_inode) +__ioc_inode_flush(ioc_inode_t *ioc_inode) { - ioc_page_t *curr = NULL, *next = NULL; - int64_t destroy_size = 0; - int64_t ret = 0; + ioc_page_t *curr = NULL, *next = NULL; + int64_t destroy_size = 0; + int64_t ret = 0; - list_for_each_entry_safe (curr, next, &ioc_inode->cache.page_lru, - page_lru) { - ret = __ioc_page_destroy (curr); + list_for_each_entry_safe(curr, next, &ioc_inode->cache.page_lru, page_lru) + { + ret = __ioc_page_destroy(curr); - if (ret != -1) - destroy_size += ret; - } + if (ret != -1) + destroy_size += ret; + } - return destroy_size; + return destroy_size; } void -ioc_inode_flush (ioc_inode_t *ioc_inode) +ioc_inode_flush(ioc_inode_t *ioc_inode) { - int64_t destroy_size = 0; + int64_t destroy_size = 0; - ioc_inode_lock (ioc_inode); - { - destroy_size = __ioc_inode_flush (ioc_inode); - } - ioc_inode_unlock (ioc_inode); + ioc_inode_lock(ioc_inode); + { + destroy_size = __ioc_inode_flush(ioc_inode); + } + ioc_inode_unlock(ioc_inode); - if (destroy_size) { - ioc_table_lock (ioc_inode->table); - { - ioc_inode->table->cache_used -= destroy_size; - } - ioc_table_unlock (ioc_inode->table); + if (destroy_size) { + ioc_table_lock(ioc_inode->table); + { + ioc_inode->table->cache_used -= destroy_size; } + ioc_table_unlock(ioc_inode->table); + } - return; + return; } int32_t -ioc_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, dict_t *xdata) +ioc_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preop, + struct iatt *postop, dict_t *xdata) { - STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop, - xdata); - return 0; + STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, preop, postop, xdata); + return 0; } int32_t -ioc_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid, dict_t *xdata) +ioc_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) { - uint64_t ioc_inode = 0; + uint64_t ioc_inode = 0; - inode_ctx_get (loc->inode, this, &ioc_inode); + inode_ctx_get(loc->inode, this, &ioc_inode); - if (ioc_inode - && ((valid & GF_SET_ATTR_ATIME) - || (valid & GF_SET_ATTR_MTIME))) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + if (ioc_inode && + ((valid & GF_SET_ATTR_ATIME) || (valid & GF_SET_ATTR_MTIME))) + ioc_inode_flush((ioc_inode_t *)(long)ioc_inode); - STACK_WIND (frame, ioc_setattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->setattr, loc, stbuf, valid, xdata); + STACK_WIND(frame, ioc_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); - return 0; + return 0; } int32_t -ioc_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *stbuf, dict_t *xdata, struct iatt *postparent) +ioc_inode_update(xlator_t *this, inode_t *inode, char *path, struct iatt *iabuf) { - ioc_inode_t *ioc_inode = NULL; - ioc_table_t *table = NULL; - uint8_t cache_still_valid = 0; - uint64_t tmp_ioc_inode = 0; - uint32_t weight = 0xffffffff; - const char *path = NULL; - ioc_local_t *local = NULL; - - if (op_ret != 0) - goto out; + ioc_table_t *table = NULL; + uint64_t tmp_ioc_inode = 0; + ioc_inode_t *ioc_inode = NULL; + uint32_t weight = 0xffffffff; + gf_boolean_t cache_still_valid = _gf_false; - local = 
frame->local; - if (local == NULL) { - op_ret = -1; - op_errno = EINVAL; - goto out; - } + if (!this || !inode) + goto out; - if (!this || !this->private) { - op_ret = -1; - op_errno = EINVAL; - goto out; + table = this->private; + + LOCK(&inode->lock); + { + (void)__inode_ctx_get(inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; + + if (!ioc_inode) { + weight = ioc_get_priority(table, path); + + ioc_inode = ioc_inode_create(table, inode, weight); + + (void)__inode_ctx_put(inode, this, (uint64_t)(long)ioc_inode); } + } + UNLOCK(&inode->lock); - table = this->private; + ioc_inode_lock(ioc_inode); + { + if (ioc_inode->cache.mtime == 0) { + ioc_inode->cache.mtime = iabuf->ia_mtime; + ioc_inode->cache.mtime_nsec = iabuf->ia_mtime_nsec; + } - path = local->file_loc.path; + ioc_inode->ia_size = iabuf->ia_size; + } + ioc_inode_unlock(ioc_inode); - LOCK (&inode->lock); - { - __inode_ctx_get (inode, this, &tmp_ioc_inode); - ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; + cache_still_valid = ioc_cache_still_valid(ioc_inode, iabuf); - if (!ioc_inode) { - weight = ioc_get_priority (table, path); + if (!cache_still_valid) { + ioc_inode_flush(ioc_inode); + } - ioc_inode = ioc_inode_update (table, inode, - weight); + ioc_table_lock(ioc_inode->table); + { + list_move_tail(&ioc_inode->inode_lru, + &table->inode_lru[ioc_inode->weight]); + } + ioc_table_unlock(ioc_inode->table); - __inode_ctx_put (inode, this, - (uint64_t)(long)ioc_inode); - } - } - UNLOCK (&inode->lock); +out: + return 0; +} - ioc_inode_lock (ioc_inode); - { - if (ioc_inode->cache.mtime == 0) { - ioc_inode->cache.mtime = stbuf->ia_mtime; - ioc_inode->cache.mtime_nsec = stbuf->ia_mtime_nsec; - } +int32_t +ioc_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xdata, struct iatt *postparent) +{ + ioc_local_t *local = NULL; - ioc_inode->ia_size = stbuf->ia_size; - } - ioc_inode_unlock (ioc_inode); + if (op_ret != 0) + goto out; - cache_still_valid = ioc_cache_still_valid (ioc_inode, - stbuf); + local = frame->local; + if (local == NULL) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } - if (!cache_still_valid) { - ioc_inode_flush (ioc_inode); - } + if (!this || !this->private) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } - ioc_table_lock (ioc_inode->table); - { - list_move_tail (&ioc_inode->inode_lru, - &table->inode_lru[ioc_inode->weight]); - } - ioc_table_unlock (ioc_inode->table); + ioc_inode_update(this, inode, (char *)local->file_loc.path, stbuf); out: - if (frame->local != NULL) { - local = frame->local; - loc_wipe (&local->file_loc); - } + if (frame->local != NULL) { + local = frame->local; + loc_wipe(&local->file_loc); + } - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, stbuf, - xdata, postparent); - return 0; + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, stbuf, xdata, + postparent); + return 0; } int32_t -ioc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xdata) +ioc_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - ioc_local_t *local = NULL; - int32_t op_errno = -1, ret = -1; + ioc_local_t *local = NULL; + int32_t op_errno = -1, ret = -1; - local = mem_get0 (this->local_pool); - if (local == NULL) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto unwind; - } + local = mem_get0(this->local_pool); + if (local == NULL) { + op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_ERROR, 0, 
IO_CACHE_MSG_NO_MEMORY, NULL); + goto unwind; + } - ret = loc_copy (&local->file_loc, loc); - if (ret != 0) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto unwind; - } + ret = loc_copy(&local->file_loc, loc); + if (ret != 0) { + op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_NO_MEMORY, NULL); + goto unwind; + } - frame->local = local; + frame->local = local; - STACK_WIND (frame, ioc_lookup_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->lookup, loc, xdata); + STACK_WIND(frame, ioc_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); - return 0; + return 0; unwind: - STACK_UNWIND_STRICT (lookup, frame, -1, op_errno, NULL, NULL, - NULL, NULL); + if (local != NULL) { + loc_wipe(&local->file_loc); + mem_put(local); + } - return 0; + STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + + return 0; } /* @@ -301,29 +360,29 @@ unwind: * */ int32_t -ioc_forget (xlator_t *this, inode_t *inode) +ioc_forget(xlator_t *this, inode_t *inode) { - uint64_t ioc_inode = 0; + uint64_t ioc_inode = 0; - inode_ctx_get (inode, this, &ioc_inode); + inode_ctx_get(inode, this, &ioc_inode); - if (ioc_inode) - ioc_inode_destroy ((ioc_inode_t *)(long)ioc_inode); + if (ioc_inode) + ioc_inode_destroy((ioc_inode_t *)(long)ioc_inode); - return 0; + return 0; } static int32_t ioc_invalidate(xlator_t *this, inode_t *inode) { - ioc_inode_t *ioc_inode = NULL; + uint64_t ioc_inode = 0; - inode_ctx_get(inode, this, (uint64_t *) &ioc_inode); + inode_ctx_get(inode, this, &ioc_inode); - if (ioc_inode) - ioc_inode_flush(ioc_inode); + if (ioc_inode) + ioc_inode_flush((ioc_inode_t *)(uintptr_t)ioc_inode); - return 0; + return 0; } /* @@ -338,104 +397,103 @@ ioc_invalidate(xlator_t *this, inode_t *inode) * */ int32_t -ioc_cache_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *stbuf, - dict_t *xdata) +ioc_cache_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + dict_t *xdata) { - ioc_local_t *local = NULL; - ioc_inode_t *ioc_inode = NULL; - size_t destroy_size = 0; - struct iatt *local_stbuf = NULL; - - local = frame->local; - ioc_inode = local->inode; - local_stbuf = stbuf; - - if ((op_ret == -1) || - ((op_ret >= 0) && !ioc_cache_still_valid(ioc_inode, stbuf))) { - gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG, - "cache for inode(%p) is invalid. flushing all pages", - ioc_inode); - /* NOTE: only pages with no waiting frames are flushed by - * ioc_inode_flush. page_fault will be generated for all - * the pages which have waiting frames by ioc_inode_wakeup() - */ - ioc_inode_lock (ioc_inode); - { - destroy_size = __ioc_inode_flush (ioc_inode); - if (op_ret >= 0) { - ioc_inode->cache.mtime = stbuf->ia_mtime; - ioc_inode->cache.mtime_nsec - = stbuf->ia_mtime_nsec; - } - } - ioc_inode_unlock (ioc_inode); - local_stbuf = NULL; - } - - if (destroy_size) { - ioc_table_lock (ioc_inode->table); - { - ioc_inode->table->cache_used -= destroy_size; - } - ioc_table_unlock (ioc_inode->table); + ioc_local_t *local = NULL; + ioc_inode_t *ioc_inode = NULL; + size_t destroy_size = 0; + struct iatt *local_stbuf = NULL; + + local = frame->local; + ioc_inode = local->inode; + local_stbuf = stbuf; + + if ((op_ret == -1) || + ((op_ret >= 0) && !ioc_cache_still_valid(ioc_inode, stbuf))) { + gf_msg_debug(ioc_inode->table->xl->name, 0, + "cache for inode(%p) is invalid. 
flushing all pages", + ioc_inode); + /* NOTE: only pages with no waiting frames are flushed by + * ioc_inode_flush. page_fault will be generated for all + * the pages which have waiting frames by ioc_inode_wakeup() + */ + ioc_inode_lock(ioc_inode); + { + destroy_size = __ioc_inode_flush(ioc_inode); + if (op_ret >= 0) { + ioc_inode->cache.mtime = stbuf->ia_mtime; + ioc_inode->cache.mtime_nsec = stbuf->ia_mtime_nsec; + } + } + ioc_inode_unlock(ioc_inode); + local_stbuf = NULL; + } + + if (destroy_size) { + ioc_table_lock(ioc_inode->table); + { + ioc_inode->table->cache_used -= destroy_size; } + ioc_table_unlock(ioc_inode->table); + } - if (op_ret < 0) - local_stbuf = NULL; + if (op_ret < 0) + local_stbuf = NULL; - ioc_inode_lock (ioc_inode); - { - gettimeofday (&ioc_inode->cache.tv, NULL); - } - ioc_inode_unlock (ioc_inode); + ioc_inode_lock(ioc_inode); + { + ioc_inode->cache.last_revalidate = gf_time(); + } + ioc_inode_unlock(ioc_inode); - ioc_inode_wakeup (frame, ioc_inode, local_stbuf); + ioc_inode_wakeup(frame, ioc_inode, local_stbuf); - /* any page-fault initiated by ioc_inode_wakeup() will have its own - * fd_ref on fd, safe to unref validate frame's private copy - */ - fd_unref (local->fd); + /* any page-fault initiated by ioc_inode_wakeup() will have its own + * fd_ref on fd, safe to unref validate frame's private copy + */ + fd_unref(local->fd); + dict_unref(local->xattr_req); - STACK_DESTROY (frame->root); + STACK_DESTROY(frame->root); - return 0; + return 0; } int32_t -ioc_wait_on_inode (ioc_inode_t *ioc_inode, ioc_page_t *page) +ioc_wait_on_inode(ioc_inode_t *ioc_inode, ioc_page_t *page) { - ioc_waitq_t *waiter = NULL, *trav = NULL; - uint32_t page_found = 0; - int32_t ret = 0; + ioc_waitq_t *waiter = NULL, *trav = NULL; + uint32_t page_found = 0; + int32_t ret = 0; - trav = ioc_inode->waitq; + trav = ioc_inode->waitq; - while (trav) { - if (trav->data == page) { - page_found = 1; - break; - } - trav = trav->next; + while (trav) { + if (trav->data == page) { + page_found = 1; + break; } + trav = trav->next; + } - if (!page_found) { - waiter = GF_CALLOC (1, sizeof (ioc_waitq_t), - gf_ioc_mt_ioc_waitq_t); - if (waiter == NULL) { - gf_log (ioc_inode->table->xl->name, GF_LOG_ERROR, - "out of memory"); - ret = -ENOMEM; - goto out; - } - - waiter->data = page; - waiter->next = ioc_inode->waitq; - ioc_inode->waitq = waiter; + if (!page_found) { + waiter = GF_CALLOC(1, sizeof(ioc_waitq_t), gf_ioc_mt_ioc_waitq_t); + if (waiter == NULL) { + gf_smsg(ioc_inode->table->xl->name, GF_LOG_ERROR, ENOMEM, + IO_CACHE_MSG_NO_MEMORY, NULL); + ret = -ENOMEM; + goto out; } + waiter->data = page; + waiter->next = ioc_inode->waitq; + ioc_inode->waitq = waiter; + } + out: - return ret; + return ret; } /* @@ -447,74 +505,77 @@ out: * */ int32_t -ioc_cache_validate (call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd, - ioc_page_t *page) +ioc_cache_validate(call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd, + ioc_page_t *page) { - call_frame_t *validate_frame = NULL; - ioc_local_t *validate_local = NULL; - ioc_local_t *local = NULL; - int32_t ret = 0; - - local = frame->local; - validate_local = mem_get0 (THIS->local_pool); - if (validate_local == NULL) { - ret = -1; - local->op_ret = -1; - local->op_errno = ENOMEM; - gf_log (ioc_inode->table->xl->name, GF_LOG_ERROR, - "out of memory"); - goto out; - } - - validate_frame = copy_frame (frame); - if (validate_frame == NULL) { - ret = -1; - local->op_ret = -1; - local->op_errno = ENOMEM; - mem_put (validate_local); - gf_log (ioc_inode->table->xl->name, 
GF_LOG_ERROR, - "out of memory"); - goto out; - } - - validate_local->fd = fd_ref (fd); - validate_local->inode = ioc_inode; - validate_frame->local = validate_local; - - STACK_WIND (validate_frame, ioc_cache_validate_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->fstat, fd, NULL); + call_frame_t *validate_frame = NULL; + ioc_local_t *validate_local = NULL; + ioc_local_t *local = NULL; + int32_t ret = 0; + + local = frame->local; + validate_local = mem_get0(THIS->local_pool); + if (validate_local == NULL) { + ret = -1; + local->op_ret = -1; + local->op_errno = ENOMEM; + gf_smsg(ioc_inode->table->xl->name, GF_LOG_ERROR, 0, + IO_CACHE_MSG_NO_MEMORY, NULL); + goto out; + } + + validate_frame = copy_frame(frame); + if (validate_frame == NULL) { + ret = -1; + local->op_ret = -1; + local->op_errno = ENOMEM; + mem_put(validate_local); + gf_smsg(ioc_inode->table->xl->name, GF_LOG_ERROR, 0, + IO_CACHE_MSG_NO_MEMORY, NULL); + goto out; + } + + validate_local->fd = fd_ref(fd); + validate_local->inode = ioc_inode; + if (local && local->xattr_req) + validate_local->xattr_req = dict_ref(local->xattr_req); + validate_frame->local = validate_local; + + STACK_WIND(validate_frame, ioc_cache_validate_cbk, FIRST_CHILD(frame->this), + FIRST_CHILD(frame->this)->fops->fstat, fd, + validate_local->xattr_req); out: - return ret; + return ret; } -static inline uint32_t -is_match (const char *path, const char *pattern) +static uint32_t +is_match(const char *path, const char *pattern) { - int32_t ret = 0; + int32_t ret = 0; - ret = fnmatch (pattern, path, FNM_NOESCAPE); + ret = fnmatch(pattern, path, FNM_NOESCAPE); - return (ret == 0); + return (ret == 0); } uint32_t -ioc_get_priority (ioc_table_t *table, const char *path) +ioc_get_priority(ioc_table_t *table, const char *path) { - uint32_t priority = 1; - struct ioc_priority *curr = NULL; + uint32_t priority = 1; + struct ioc_priority *curr = NULL; - if (list_empty(&table->priority_list)) - return priority; + if (list_empty(&table->priority_list) || !path) + return priority; - priority = 0; - list_for_each_entry (curr, &table->priority_list, list) { - if (is_match (path, curr->pattern)) - priority = curr->priority; - } + priority = 0; + list_for_each_entry(curr, &table->priority_list, list) + { + if (is_match(path, curr->pattern)) + priority = curr->priority; + } - return priority; + return priority; } /* @@ -529,75 +590,68 @@ ioc_get_priority (ioc_table_t *table, const char *path) * */ int32_t -ioc_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, fd_t *fd, dict_t *xdata) +ioc_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) { - uint64_t tmp_ioc_inode = 0; - ioc_local_t *local = NULL; - ioc_table_t *table = NULL; - ioc_inode_t *ioc_inode = NULL; - uint32_t weight = 0xffffffff; + uint64_t tmp_ioc_inode = 0; + ioc_local_t *local = NULL; + ioc_table_t *table = NULL; + ioc_inode_t *ioc_inode = NULL; + + local = frame->local; + if (!this || !this->private) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + table = this->private; + + if (op_ret != -1) { + inode_ctx_get(fd->inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; - local = frame->local; - if (!this || !this->private) { - op_ret = -1; - op_errno = EINVAL; - goto out; + // TODO: see why inode context is NULL and handle it. 
+ if (!ioc_inode) { + gf_smsg(this->name, GF_LOG_ERROR, EINVAL, + IO_CACHE_MSG_ENFORCEMENT_FAILED, "inode-gfid=%s", + uuid_utoa(fd->inode->gfid), NULL); + goto out; } - table = this->private; - - if (op_ret != -1) { - inode_ctx_get (fd->inode, this, &tmp_ioc_inode); - ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; - - //TODO: see why inode context is NULL and handle it. - if (!ioc_inode) { - gf_log (this->name, GF_LOG_ERROR, "inode context is " - "NULL (%s)", uuid_utoa (fd->inode->gfid)); - goto out; - } - - ioc_table_lock (ioc_inode->table); - { - list_move_tail (&ioc_inode->inode_lru, - &table->inode_lru[ioc_inode->weight]); - } - ioc_table_unlock (ioc_inode->table); + ioc_table_lock(ioc_inode->table); + { + list_move_tail(&ioc_inode->inode_lru, + &table->inode_lru[ioc_inode->weight]); + } + ioc_table_unlock(ioc_inode->table); - ioc_inode_lock (ioc_inode); - { - if ((table->min_file_size > ioc_inode->ia_size) - || ((table->max_file_size > 0) - && (table->max_file_size < ioc_inode->ia_size))) { - fd_ctx_set (fd, this, 1); - } - } - ioc_inode_unlock (ioc_inode); - - /* If O_DIRECT open, we disable caching on it */ - if ((local->flags & O_DIRECT)){ - /* O_DIRECT is only for one fd, not the inode - * as a whole - */ - fd_ctx_set (fd, this, 1); - } + ioc_inode_lock(ioc_inode); + { + if ((table->min_file_size > ioc_inode->ia_size) || + ((table->max_file_size > 0) && + (table->max_file_size < ioc_inode->ia_size))) { + fd_ctx_set(fd, this, 1); + } + } + ioc_inode_unlock(ioc_inode); - /* weight = 0, we disable caching on it */ - if (weight == 0) { - /* we allow a pattern-matched cache disable this way - */ - fd_ctx_set (fd, this, 1); - } + /* If O_DIRECT open, we disable caching on it */ + if ((local->flags & O_DIRECT)) { + /* O_DIRECT is only for one fd, not the inode + * as a whole + */ + fd_ctx_set(fd, this, 1); } + } out: - mem_put (local); - frame->local = NULL; + mem_put(local); + frame->local = NULL; - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata); - return 0; + return 0; } /* @@ -614,184 +668,175 @@ out: * */ int32_t -ioc_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, - inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +ioc_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - ioc_local_t *local = NULL; - ioc_table_t *table = NULL; - ioc_inode_t *ioc_inode = NULL; - uint32_t weight = 0xffffffff; - const char *path = NULL; - int ret = -1; - - local = frame->local; - if (!this || !this->private) { - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - table = this->private; - path = local->file_loc.path; - - if (op_ret != -1) { - /* assign weight */ - weight = ioc_get_priority (table, path); - - ioc_inode = ioc_inode_update (table, inode, weight); - - ioc_inode_lock (ioc_inode); - { - ioc_inode->cache.mtime = buf->ia_mtime; - ioc_inode->cache.mtime_nsec = buf->ia_mtime_nsec; - ioc_inode->ia_size = buf->ia_size; - - if ((table->min_file_size > ioc_inode->ia_size) - || ((table->max_file_size > 0) - && (table->max_file_size < ioc_inode->ia_size))) { - ret = fd_ctx_set (fd, this, 1); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set fd ctx", - local->file_loc.path); - } - } - ioc_inode_unlock (ioc_inode); - - inode_ctx_put 
(fd->inode, this, - (uint64_t)(long)ioc_inode); - - /* If O_DIRECT open, we disable caching on it */ - if (local->flags & O_DIRECT) { - /* - * O_DIRECT is only for one fd, not the inode - * as a whole */ - ret = fd_ctx_set (fd, this, 1); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set fd ctx", - local->file_loc.path); - } - - /* if weight == 0, we disable caching on it */ - if (!weight) { - /* we allow a pattern-matched cache disable this way */ - ret = fd_ctx_set (fd, this, 1); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set fd ctx", - local->file_loc.path); - } - - } + ioc_local_t *local = NULL; + ioc_table_t *table = NULL; + ioc_inode_t *ioc_inode = NULL; + uint32_t weight = 0xffffffff; + const char *path = NULL; + int ret = -1; + + local = frame->local; + if (!this || !this->private) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + table = this->private; + path = local->file_loc.path; + + if (op_ret != -1) { + /* assign weight */ + weight = ioc_get_priority(table, path); + + ioc_inode = ioc_inode_create(table, inode, weight); + + ioc_inode_lock(ioc_inode); + { + ioc_inode->cache.mtime = buf->ia_mtime; + ioc_inode->cache.mtime_nsec = buf->ia_mtime_nsec; + ioc_inode->ia_size = buf->ia_size; + + if ((table->min_file_size > ioc_inode->ia_size) || + ((table->max_file_size > 0) && + (table->max_file_size < ioc_inode->ia_size))) { + ret = fd_ctx_set(fd, this, 1); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, + IO_CACHE_MSG_SET_FD_FAILED, "path=%s", + local->file_loc.path, NULL); + } + } + ioc_inode_unlock(ioc_inode); + + inode_ctx_put(fd->inode, this, (uint64_t)(long)ioc_inode); + + /* If O_DIRECT open, we disable caching on it */ + if (local->flags & O_DIRECT) { + /* + * O_DIRECT is only for one fd, not the inode + * as a whole */ + ret = fd_ctx_set(fd, this, 1); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, + IO_CACHE_MSG_SET_FD_FAILED, "path=%s", + local->file_loc.path, NULL); + } + + /* if weight == 0, we disable caching on it */ + if (!weight) { + /* we allow a pattern-matched cache disable this way */ + ret = fd_ctx_set(fd, this, 1); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, + IO_CACHE_MSG_SET_FD_FAILED, "path=%s", + local->file_loc.path, NULL); + } + } out: - frame->local = NULL; - mem_put (local); + frame->local = NULL; + mem_put(local); - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent, xdata); + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); - return 0; + return 0; } - int32_t -ioc_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +ioc_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - ioc_local_t *local = NULL; - ioc_table_t *table = NULL; - ioc_inode_t *ioc_inode = NULL; - uint32_t weight = 0xffffffff; - const char *path = NULL; + ioc_local_t *local = NULL; + ioc_table_t *table = NULL; + ioc_inode_t *ioc_inode = NULL; + uint32_t weight = 0xffffffff; + const char *path = NULL; - local = frame->local; - if (!this || !this->private) { - op_ret = -1; - op_errno = EINVAL; - goto out; - } + local = frame->local; + if (!this || !this->private) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } - table = 
this->private; - path = local->file_loc.path; + table = this->private; + path = local->file_loc.path; - if (op_ret != -1) { - /* assign weight */ - weight = ioc_get_priority (table, path); + if (op_ret != -1) { + /* assign weight */ + weight = ioc_get_priority(table, path); - ioc_inode = ioc_inode_update (table, inode, weight); + ioc_inode = ioc_inode_create(table, inode, weight); - ioc_inode_lock (ioc_inode); - { - ioc_inode->cache.mtime = buf->ia_mtime; - ioc_inode->cache.mtime_nsec = buf->ia_mtime_nsec; - ioc_inode->ia_size = buf->ia_size; - } - ioc_inode_unlock (ioc_inode); - - inode_ctx_put (inode, this, - (uint64_t)(long)ioc_inode); + ioc_inode_lock(ioc_inode); + { + ioc_inode->cache.mtime = buf->ia_mtime; + ioc_inode->cache.mtime_nsec = buf->ia_mtime_nsec; + ioc_inode->ia_size = buf->ia_size; } + ioc_inode_unlock(ioc_inode); + + inode_ctx_put(inode, this, (uint64_t)(long)ioc_inode); + } out: - frame->local = NULL; + frame->local = NULL; - loc_wipe (&local->file_loc); - mem_put (local); + loc_wipe(&local->file_loc); + mem_put(local); - STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); - return 0; + STACK_UNWIND_STRICT(mknod, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; } - int -ioc_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, mode_t umask, dict_t *xdata) +ioc_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) { - ioc_local_t *local = NULL; - int32_t op_errno = -1, ret = -1; + ioc_local_t *local = NULL; + int32_t op_errno = -1, ret = -1; - local = mem_get0 (this->local_pool); - if (local == NULL) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto unwind; - } + local = mem_get0(this->local_pool); + if (local == NULL) { + op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_NO_MEMORY, NULL); + goto unwind; + } - ret = loc_copy (&local->file_loc, loc); - if (ret != 0) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto unwind; - } + ret = loc_copy(&local->file_loc, loc); + if (ret != 0) { + op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_NO_MEMORY, NULL); + goto unwind; + } - frame->local = local; + frame->local = local; - STACK_WIND (frame, ioc_mknod_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mknod, - loc, mode, rdev, umask, xdata); - return 0; + STACK_WIND(frame, ioc_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); + return 0; unwind: - if (local != NULL) { - loc_wipe (&local->file_loc); - mem_put (local); - } + if (local != NULL) { + loc_wipe(&local->file_loc); + mem_put(local); + } - STACK_UNWIND_STRICT (mknod, frame, -1, op_errno, NULL, NULL, - NULL, NULL, NULL); + STACK_UNWIND_STRICT(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); - return 0; + return 0; } - /* * ioc_open - open fop for io cache * @frame: @@ -801,30 +846,28 @@ unwind: * */ int32_t -ioc_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, dict_t *xdata) +ioc_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { + ioc_local_t *local = NULL; - ioc_local_t *local = NULL; - - local = mem_get0 (this->local_pool); - if (local == NULL) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - STACK_UNWIND_STRICT (open, frame, -1, ENOMEM, NULL, NULL); - return 0; - } + local = 
mem_get0(this->local_pool); + if (local == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL); + STACK_UNWIND_STRICT(open, frame, -1, ENOMEM, NULL, NULL); + return 0; + } - local->flags = flags; - local->file_loc.path = loc->path; - local->file_loc.inode = loc->inode; + local->flags = flags; + local->file_loc.path = loc->path; + local->file_loc.inode = loc->inode; - frame->local = local; + frame->local = local; - STACK_WIND (frame, ioc_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, loc, flags, fd, - xdata); + STACK_WIND(frame, ioc_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); - return 0; + return 0; } /* @@ -838,32 +881,29 @@ ioc_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, * */ int32_t -ioc_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +ioc_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - ioc_local_t *local = NULL; - - local = mem_get0 (this->local_pool); - if (local == NULL) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - STACK_UNWIND_STRICT (create, frame, -1, ENOMEM, NULL, NULL, - NULL, NULL, NULL, NULL); - return 0; - } - - local->flags = flags; - local->file_loc.path = loc->path; - frame->local = local; - - STACK_WIND (frame, ioc_create_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, loc, flags, mode, - umask, fd, xdata); + ioc_local_t *local = NULL; + local = mem_get0(this->local_pool); + if (local == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL); + STACK_UNWIND_STRICT(create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; -} + } + local->flags = flags; + local->file_loc.path = loc->path; + frame->local = local; + STACK_WIND(frame, ioc_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; +} /* * ioc_release - release fop for io cache @@ -874,49 +914,26 @@ ioc_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, * */ int32_t -ioc_release (xlator_t *this, fd_t *fd) +ioc_release(xlator_t *this, fd_t *fd) { - return 0; + return 0; } -/* - * ioc_readv_disabled_cbk - * @frame: - * @cookie: - * @this: - * @op_ret: - * @op_errno: - * @vector: - * @count: - * - */ int32_t -ioc_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, - struct iobref *iobref, dict_t *xdata) +ioc_need_prune(ioc_table_t *table) { - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, - stbuf, iobref, xdata); - return 0; -} - + int64_t cache_difference = 0; -int32_t -ioc_need_prune (ioc_table_t *table) -{ - int64_t cache_difference = 0; - - ioc_table_lock (table); - { - cache_difference = table->cache_used - table->cache_size; - } - ioc_table_unlock (table); + ioc_table_lock(table); + { + cache_difference = table->cache_used - table->cache_size; + } + ioc_table_unlock(table); - if (cache_difference > 0) - return 1; - else - return 0; + if (cache_difference > 0) + return 1; + else + return 0; } /* @@ -928,158 +945,151 @@ ioc_need_prune (ioc_table_t *table) * */ void -ioc_dispatch_requests (call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd, - off_t offset, size_t size) +ioc_dispatch_requests(call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd, + off_t offset, 
size_t size) { - ioc_local_t *local = NULL; - ioc_table_t *table = NULL; - ioc_page_t *trav = NULL; - ioc_waitq_t *waitq = NULL; - off_t rounded_offset = 0; - off_t rounded_end = 0; - off_t trav_offset = 0; - int32_t fault = 0; - size_t trav_size = 0; - off_t local_offset = 0; - int32_t ret = -1; - int8_t need_validate = 0; - int8_t might_need_validate = 0; /* - * if a page exists, do we need - * to validate it? - */ - local = frame->local; - table = ioc_inode->table; - - rounded_offset = floor (offset, table->page_size); - rounded_end = roof (offset + size, table->page_size); - trav_offset = rounded_offset; - - /* once a frame does read, it should be waiting on something */ - local->wait_count++; - - /* Requested region can fall in three different pages, - * 1. Ready - region is already in cache, we just have to serve it. - * 2. In-transit - page fault has been generated on this page, we need - * to wait till the page is ready - * 3. Fault - page is not in cache, we have to generate a page fault - */ + ioc_local_t *local = NULL; + ioc_table_t *table = NULL; + ioc_page_t *trav = NULL; + ioc_waitq_t *waitq = NULL; + off_t rounded_offset = 0; + off_t rounded_end = 0; + off_t trav_offset = 0; + int32_t fault = 0; + size_t trav_size = 0; + off_t local_offset = 0; + int32_t ret = -1; + int8_t need_validate = 0; + int8_t might_need_validate = 0; /* + * if a page exists, do we need + * to validate it? + */ + local = frame->local; + table = ioc_inode->table; + + rounded_offset = gf_floor(offset, table->page_size); + rounded_end = gf_roof(offset + size, table->page_size); + trav_offset = rounded_offset; + + /* once a frame does read, it should be waiting on something */ + local->wait_count++; + + /* Requested region can fall in three different pages, + * 1. Ready - region is already in cache, we just have to serve it. + * 2. In-transit - page fault has been generated on this page, we need + * to wait till the page is ready + * 3. 
Fault - page is not in cache, we have to generate a page fault + */ + + might_need_validate = ioc_inode_need_revalidate(ioc_inode); + + while (trav_offset < rounded_end) { + ioc_inode_lock(ioc_inode); + { + /* look for requested region in the cache */ + trav = __ioc_page_get(ioc_inode, trav_offset); - might_need_validate = ioc_inode_need_revalidate (ioc_inode); + local_offset = max(trav_offset, offset); + trav_size = min(((offset + size) - local_offset), table->page_size); - while (trav_offset < rounded_end) { - ioc_inode_lock (ioc_inode); - { - /* look for requested region in the cache */ - trav = __ioc_page_get (ioc_inode, trav_offset); - - local_offset = max (trav_offset, offset); - trav_size = min (((offset+size) - local_offset), - table->page_size); - - if (!trav) { - /* page not in cache, we need to generate page - * fault - */ - trav = __ioc_page_create (ioc_inode, - trav_offset); - fault = 1; - if (!trav) { - gf_log (frame->this->name, - GF_LOG_CRITICAL, - "out of memory"); - local->op_ret = -1; - local->op_errno = ENOMEM; - goto out; - } - } + if (!trav) { + /* page not in cache, we need to generate page + * fault + */ + trav = __ioc_page_create(ioc_inode, trav_offset); + fault = 1; + if (!trav) { + gf_smsg(frame->this->name, GF_LOG_CRITICAL, ENOMEM, + IO_CACHE_MSG_NO_MEMORY, NULL); + local->op_ret = -1; + local->op_errno = ENOMEM; + ioc_inode_unlock(ioc_inode); + goto out; + } + } + + __ioc_wait_on_page(trav, frame, local_offset, trav_size); + + if (trav->ready) { + /* page found in cache */ + if (!might_need_validate && !ioc_inode->waitq) { + /* fresh enough */ + gf_msg_trace(frame->this->name, 0, + "cache hit for " + "trav_offset=%" PRId64 + "/local_" + "offset=%" PRId64 "", + trav_offset, local_offset); + waitq = __ioc_page_wakeup(trav, trav->op_errno); + } else { + /* if waitq already exists, fstat + * revalidate is + * already on the way + */ + if (!ioc_inode->waitq) { + need_validate = 1; + } + + ret = ioc_wait_on_inode(ioc_inode, trav); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + need_validate = 0; - __ioc_wait_on_page (trav, frame, local_offset, - trav_size); - - if (trav->ready) { - /* page found in cache */ - if (!might_need_validate && !ioc_inode->waitq) { - /* fresh enough */ - gf_log (frame->this->name, GF_LOG_TRACE, - "cache hit for trav_offset=%" - PRId64"/local_offset=%"PRId64"", - trav_offset, local_offset); - waitq = __ioc_page_wakeup (trav, - trav->op_errno); - } else { - /* if waitq already exists, fstat - * revalidate is - * already on the way - */ - if (!ioc_inode->waitq) { - need_validate = 1; - } - - ret = ioc_wait_on_inode (ioc_inode, - trav); - if (ret < 0) { - local->op_ret = -1; - local->op_errno = -ret; - need_validate = 0; - - waitq = __ioc_page_wakeup (trav, - trav->op_errno); - ioc_inode_unlock (ioc_inode); - - ioc_waitq_return (waitq); - waitq = NULL; - goto out; - } - } - } + waitq = __ioc_page_wakeup(trav, trav->op_errno); + ioc_inode_unlock(ioc_inode); + ioc_waitq_return(waitq); + waitq = NULL; + goto out; + } } - ioc_inode_unlock (ioc_inode); + } + } + ioc_inode_unlock(ioc_inode); - ioc_waitq_return (waitq); - waitq = NULL; + ioc_waitq_return(waitq); + waitq = NULL; - if (fault) { - fault = 0; - /* new page created, increase the table->cache_used */ - ioc_page_fault (ioc_inode, frame, fd, trav_offset); - } + if (fault) { + fault = 0; + /* new page created, increase the table->cache_used */ + ioc_page_fault(ioc_inode, frame, fd, trav_offset); + } - if (need_validate) { - need_validate = 0; - gf_log (frame->this->name, 
GF_LOG_TRACE, - "sending validate request for " - "inode(%s) at offset=%"PRId64"", - uuid_utoa (fd->inode->gfid), trav_offset); - ret = ioc_cache_validate (frame, ioc_inode, fd, trav); - if (ret == -1) { - ioc_inode_lock (ioc_inode); - { - waitq = __ioc_page_wakeup (trav, - trav->op_errno); - } - ioc_inode_unlock (ioc_inode); - - ioc_waitq_return (waitq); - waitq = NULL; - goto out; - } + if (need_validate) { + need_validate = 0; + gf_msg_trace(frame->this->name, 0, + "sending validate request for " + "inode(%s) at offset=%" PRId64 "", + uuid_utoa(fd->inode->gfid), trav_offset); + ret = ioc_cache_validate(frame, ioc_inode, fd, trav); + if (ret == -1) { + ioc_inode_lock(ioc_inode); + { + waitq = __ioc_page_wakeup(trav, trav->op_errno); } + ioc_inode_unlock(ioc_inode); - trav_offset += table->page_size; + ioc_waitq_return(waitq); + waitq = NULL; + goto out; + } } + trav_offset += table->page_size; + } + out: - ioc_frame_return (frame); + ioc_frame_return(frame); - if (ioc_need_prune (ioc_inode->table)) { - ioc_prune (ioc_inode->table); - } + if (ioc_need_prune(ioc_inode->table)) { + ioc_prune(ioc_inode->table); + } - return; + return; } - /* * ioc_readv - * @@ -1091,103 +1101,108 @@ out: * */ int32_t -ioc_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset, uint32_t flags, dict_t *xdata) +ioc_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - uint64_t tmp_ioc_inode = 0; - ioc_inode_t *ioc_inode = NULL; - ioc_local_t *local = NULL; - uint32_t weight = 0; - ioc_table_t *table = NULL; - int32_t op_errno = -1; - - if (!this) { - goto out; - } - - inode_ctx_get (fd->inode, this, &tmp_ioc_inode); - ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; - if (!ioc_inode) { - /* caching disabled, go ahead with normal readv */ - STACK_WIND (frame, ioc_readv_disabled_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->readv, fd, size, - offset, flags, xdata); - return 0; - } + uint64_t tmp_ioc_inode = 0; + ioc_inode_t *ioc_inode = NULL; + ioc_local_t *local = NULL; + uint32_t weight = 0; + ioc_table_t *table = NULL; + int32_t op_errno = EINVAL; + + if (!this) { + goto out; + } + + inode_ctx_get(fd->inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; + if (!ioc_inode) { + /* caching disabled, go ahead with normal readv */ + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); + return 0; + } + if (flags & O_DIRECT) { + /* disable caching for this fd, if O_DIRECT is used */ + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); + return 0; + } - table = this->private; + table = this->private; - if (!table) { - gf_log (this->name, GF_LOG_ERROR, "table is null"); - op_errno = EINVAL; - goto out; - } + if (!table) { + gf_smsg(this->name, GF_LOG_ERROR, EINVAL, IO_CACHE_MSG_TABLE_NULL, + NULL); + op_errno = EINVAL; + goto out; + } - ioc_inode_lock (ioc_inode); - { - if (!ioc_inode->cache.page_table) { - ioc_inode->cache.page_table - = rbthash_table_init - (IOC_PAGE_TABLE_BUCKET_COUNT, - ioc_hashfn, NULL, 0, - table->mem_pool); - - if (ioc_inode->cache.page_table == NULL) { - op_errno = ENOMEM; - ioc_inode_unlock (ioc_inode); - goto out; - } - } - } - ioc_inode_unlock (ioc_inode); - - if (!fd_ctx_get (fd, this, NULL)) { - /* disable caching for this fd, go ahead with normal readv */ - STACK_WIND (frame, ioc_readv_disabled_cbk, - FIRST_CHILD (frame->this), - 
FIRST_CHILD (frame->this)->fops->readv, fd, size, - offset, flags, xdata); - return 0; - } + ioc_inode_lock(ioc_inode); + { + if (!ioc_inode->cache.page_table) { + ioc_inode->cache.page_table = rbthash_table_init( + this->ctx, IOC_PAGE_TABLE_BUCKET_COUNT, ioc_hashfn, NULL, 0, + table->mem_pool); - local = mem_get0 (this->local_pool); - if (local == NULL) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); + if (ioc_inode->cache.page_table == NULL) { op_errno = ENOMEM; + ioc_inode_unlock(ioc_inode); goto out; + } } + } + ioc_inode_unlock(ioc_inode); - INIT_LIST_HEAD (&local->fill_list); - - frame->local = local; - local->pending_offset = offset; - local->pending_size = size; - local->offset = offset; - local->size = size; - local->inode = ioc_inode; - - gf_log (this->name, GF_LOG_TRACE, - "NEW REQ (%p) offset = %"PRId64" && size = %"GF_PRI_SIZET"", - frame, offset, size); - - weight = ioc_inode->weight; - - ioc_table_lock (ioc_inode->table); - { - list_move_tail (&ioc_inode->inode_lru, - &ioc_inode->table->inode_lru[weight]); - } - ioc_table_unlock (ioc_inode->table); - - ioc_dispatch_requests (frame, ioc_inode, fd, offset, size); + if (!fd_ctx_get(fd, this, NULL)) { + /* disable caching for this fd, go ahead with normal readv */ + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); return 0; + } + + local = mem_get0(this->local_pool); + if (local == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL); + op_errno = ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&local->fill_list); + + frame->local = local; + local->pending_offset = offset; + local->pending_size = size; + local->offset = offset; + local->size = size; + local->inode = ioc_inode; + local->xattr_req = dict_ref(xdata); + + gf_msg_trace(this->name, 0, + "NEW REQ (%p) offset " + "= %" PRId64 " && size = %" GF_PRI_SIZET "", + frame, offset, size); + + weight = ioc_inode->weight; + + ioc_table_lock(ioc_inode->table); + { + list_move_tail(&ioc_inode->inode_lru, + &ioc_inode->table->inode_lru[weight]); + } + ioc_table_unlock(ioc_inode->table); + + ioc_dispatch_requests(frame, ioc_inode, fd, offset, size); + return 0; out: - STACK_UNWIND_STRICT (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, - NULL); - return 0; + STACK_UNWIND_STRICT(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); + return 0; } /* @@ -1201,22 +1216,31 @@ out: * */ int32_t -ioc_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +ioc_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - ioc_local_t *local = NULL; - uint64_t ioc_inode = 0; - - local = frame->local; - inode_ctx_get (local->fd->inode, this, &ioc_inode); - - if (ioc_inode) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); - - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, - xdata); - return 0; + ioc_local_t *local = NULL; + uint64_t ioc_inode = 0; + + local = frame->local; + frame->local = NULL; + inode_ctx_get(local->fd->inode, this, &ioc_inode); + + if (op_ret >= 0) { + ioc_update_pages(frame, (ioc_inode_t *)(long)ioc_inode, local->vector, + local->op_ret, op_ret, local->offset); + } + + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + if (local->iobref) { + iobref_unref(local->iobref); + GF_FREE(local->vector); + } + + mem_put(local); + 
return 0; } /* @@ -1231,34 +1255,38 @@ ioc_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * */ int32_t -ioc_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - uint32_t flags, struct iobref *iobref, dict_t *xdata) +ioc_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - ioc_local_t *local = NULL; - uint64_t ioc_inode = 0; + ioc_local_t *local = NULL; + uint64_t ioc_inode = 0; - local = mem_get0 (this->local_pool); - if (local == NULL) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); + local = mem_get0(this->local_pool); + if (local == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL); - STACK_UNWIND_STRICT (writev, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; - } + STACK_UNWIND_STRICT(writev, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } - /* TODO: why is it not fd_ref'ed */ - local->fd = fd; - frame->local = local; + /* TODO: why is it not fd_ref'ed */ + local->fd = fd; + frame->local = local; - inode_ctx_get (fd->inode, this, &ioc_inode); - if (ioc_inode) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + inode_ctx_get(fd->inode, this, &ioc_inode); + if (ioc_inode) { + local->iobref = iobref_ref(iobref); + local->vector = iov_dup(vector, count); + local->op_ret = count; + local->offset = offset; + } - STACK_WIND (frame, ioc_writev_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, - flags, iobref, xdata); + STACK_WIND(frame, ioc_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); - return 0; + return 0; } /* @@ -1273,17 +1301,15 @@ ioc_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, * */ int32_t -ioc_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +ioc_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, - postbuf, xdata); - return 0; + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } - /* * ioc_ftruncate_cbk - * @@ -1296,17 +1322,15 @@ ioc_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * */ int32_t -ioc_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +ioc_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - - STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, - postbuf, xdata); - return 0; + STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } - /* * ioc_truncate - * @@ -1317,19 +1341,19 @@ ioc_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * */ int32_t -ioc_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, - dict_t *xdata) +ioc_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - uint64_t ioc_inode = 0; + uint64_t ioc_inode = 0; - inode_ctx_get (loc->inode, this, &ioc_inode); + inode_ctx_get(loc->inode, this, &ioc_inode); - if 
(ioc_inode) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + if (ioc_inode) + ioc_inode_flush((ioc_inode_t *)(long)ioc_inode); - STACK_WIND (frame, ioc_truncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); - return 0; + STACK_WIND(frame, ioc_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; } /* @@ -1342,680 +1366,719 @@ ioc_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, * */ int32_t -ioc_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - dict_t *xdata) +ioc_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - uint64_t ioc_inode = 0; + uint64_t ioc_inode = 0; - inode_ctx_get (fd->inode, this, &ioc_inode); + inode_ctx_get(fd->inode, this, &ioc_inode); - if (ioc_inode) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + if (ioc_inode) + ioc_inode_flush((ioc_inode_t *)(long)ioc_inode); - STACK_WIND (frame, ioc_ftruncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); - return 0; + STACK_WIND(frame, ioc_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; } int32_t -ioc_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct gf_flock *lock, dict_t *xdata) +ioc_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { - STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock, xdata); - return 0; + STACK_UNWIND_STRICT(lk, frame, op_ret, op_errno, lock, xdata); + return 0; } int32_t -ioc_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct gf_flock *lock, dict_t *xdata) +ioc_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) { - ioc_inode_t *ioc_inode = NULL; - uint64_t tmp_inode = 0; - - inode_ctx_get (fd->inode, this, &tmp_inode); - ioc_inode = (ioc_inode_t *)(long)tmp_inode; - if (!ioc_inode) { - gf_log (this->name, GF_LOG_DEBUG, - "inode context is NULL: returning EBADFD"); - STACK_UNWIND_STRICT (lk, frame, -1, EBADFD, NULL, NULL); - return 0; - } + ioc_inode_t *ioc_inode = NULL; + uint64_t tmp_inode = 0; + + inode_ctx_get(fd->inode, this, &tmp_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_inode; + if (!ioc_inode) { + gf_msg_debug(this->name, EBADFD, + "inode context is NULL: returning EBADFD"); + STACK_UNWIND_STRICT(lk, frame, -1, EBADFD, NULL, NULL); + return 0; + } - ioc_inode_lock (ioc_inode); - { - gettimeofday (&ioc_inode->cache.tv, NULL); - } - ioc_inode_unlock (ioc_inode); + ioc_inode_lock(ioc_inode); + { + ioc_inode->cache.last_revalidate = gf_time(); + } + ioc_inode_unlock(ioc_inode); - STACK_WIND (frame, ioc_lk_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->lk, fd, cmd, lock, xdata); + STACK_WIND(frame, ioc_lk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lk, fd, cmd, lock, xdata); - return 0; + return 0; } int -ioc_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries, dict_t *xdata) +ioc_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *entries, dict_t *xdata) { - gf_dirent_t *entry = NULL; + gf_dirent_t *entry = NULL; + char *path = NULL; + fd_t *fd = NULL; - if (op_ret <= 0) - goto unwind; + fd = frame->local; + frame->local = NULL; - list_for_each_entry (entry, &entries->list, list) { - /* TODO: 
fill things */ - } + if (op_ret <= 0) + goto unwind; + + list_for_each_entry(entry, &entries->list, list) + { + inode_path(fd->inode, entry->d_name, &path); + ioc_inode_update(this, entry->inode, path, &entry->d_stat); + GF_FREE(path); + path = NULL; + } unwind: - STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); + STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata); - return 0; + return 0; } + int -ioc_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, dict_t *dict) +ioc_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict) { - STACK_WIND (frame, ioc_readdirp_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, - fd, size, offset, dict); + frame->local = fd; - return 0; + STACK_WIND(frame, ioc_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict); + + return 0; } static int32_t ioc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *pre, - struct iatt *post, dict_t *xdata) + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) { - STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, pre, post, xdata); - return 0; + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, pre, post, xdata); + return 0; } static int32_t ioc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata) + size_t len, dict_t *xdata) { - uint64_t ioc_inode = 0; + uint64_t ioc_inode = 0; - inode_ctx_get (fd->inode, this, &ioc_inode); + inode_ctx_get(fd->inode, this, &ioc_inode); - if (ioc_inode) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + if (ioc_inode) + ioc_inode_flush((ioc_inode_t *)(long)ioc_inode); - STACK_WIND(frame, ioc_discard_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); - return 0; + STACK_WIND(frame, ioc_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; } - - -int32_t -ioc_get_priority_list (const char *opt_str, struct list_head *first) +static int32_t +ioc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) { - int32_t max_pri = 1; - char *tmp_str = NULL; - char *tmp_str1 = NULL; - char *tmp_str2 = NULL; - char *dup_str = NULL; - char *stripe_str = NULL; - char *pattern = NULL; - char *priority = NULL; - char *string = NULL; - struct ioc_priority *curr = NULL, *tmp = NULL; - - string = gf_strdup (opt_str); - if (string == NULL) { - max_pri = -1; - goto out; - } - - /* Get the pattern for cache priority. 
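The reworked ioc_readdirp_cbk above walks every entry returned by a single readdirp call and primes the cache with the entry's stat data instead of discarding it. A minimal standalone sketch of that per-entry priming idea, with hypothetical names (dir_entry, cache_prime) standing in for the GlusterFS structures:

/* Standalone sketch (not GlusterFS code): walk the entries returned by one
 * directory listing and prime a cache entry for each, instead of throwing
 * the stat data away.  The real callback builds the path with inode_path()
 * and calls ioc_inode_update(). */
#include <stdio.h>

struct dir_entry {
    const char *name;
    long size;                 /* stands in for entry->d_stat */
};

/* hypothetical cache hook */
static void cache_prime(const char *path, long size)
{
    printf("prime cache: %s (%ld bytes)\n", path, size);
}

int main(void)
{
    struct dir_entry entries[] = { { "a.txt", 10 }, { "b.jpg", 2048 } };
    char path[256];
    size_t i;

    for (i = 0; i < sizeof(entries) / sizeof(entries[0]); i++) {
        snprintf(path, sizeof(path), "/dir/%s", entries[i].name);
        cache_prime(path, entries[i].size);
    }
    return 0;
}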
- * "option priority *.jpg:1,abc*:2" etc - */ - /* TODO: inode_lru in table is statically hard-coded to 5, - * should be changed to run-time configuration - */ - stripe_str = strtok_r (string, ",", &tmp_str); - while (stripe_str) { - curr = GF_CALLOC (1, sizeof (struct ioc_priority), - gf_ioc_mt_ioc_priority); - if (curr == NULL) { - max_pri = -1; - goto out; - } - - list_add_tail (&curr->list, first); - - dup_str = gf_strdup (stripe_str); - if (dup_str == NULL) { - max_pri = -1; - goto out; - } - - pattern = strtok_r (dup_str, ":", &tmp_str1); - if (!pattern) { - max_pri = -1; - goto out; - } - - priority = strtok_r (NULL, ":", &tmp_str1); - if (!priority) { - max_pri = -1; - goto out; - } + STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, pre, post, xdata); + return 0; +} - gf_log ("io-cache", GF_LOG_TRACE, - "ioc priority : pattern %s : priority %s", - pattern, - priority); +static int32_t +ioc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + uint64_t ioc_inode = 0; - curr->pattern = gf_strdup (pattern); - if (curr->pattern == NULL) { - max_pri = -1; - goto out; - } + inode_ctx_get(fd->inode, this, &ioc_inode); - curr->priority = strtol (priority, &tmp_str2, 0); - if (tmp_str2 && (*tmp_str2)) { - max_pri = -1; - goto out; - } else { - max_pri = max (max_pri, curr->priority); - } + if (ioc_inode) + ioc_inode_flush((ioc_inode_t *)(long)ioc_inode); - GF_FREE (dup_str); - dup_str = NULL; + STACK_WIND(frame, ioc_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; +} - stripe_str = strtok_r (NULL, ",", &tmp_str); - } +int32_t +ioc_get_priority_list(const char *opt_str, struct list_head *first) +{ + int32_t max_pri = 1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *tmp_str2 = NULL; + char *dup_str = NULL; + char *stripe_str = NULL; + char *pattern = NULL; + char *priority = NULL; + char *string = NULL; + struct ioc_priority *curr = NULL, *tmp = NULL; + + string = gf_strdup(opt_str); + if (string == NULL) { + max_pri = -1; + goto out; + } + + /* Get the pattern for cache priority. 
+ * "option priority *.jpg:1,abc*:2" etc + */ + /* TODO: inode_lru in table is statically hard-coded to 5, + * should be changed to run-time configuration + */ + stripe_str = strtok_r(string, ",", &tmp_str); + while (stripe_str) { + curr = GF_CALLOC(1, sizeof(struct ioc_priority), + gf_ioc_mt_ioc_priority); + if (curr == NULL) { + max_pri = -1; + goto out; + } + + list_add_tail(&curr->list, first); + + dup_str = gf_strdup(stripe_str); + if (dup_str == NULL) { + max_pri = -1; + goto out; + } + + pattern = strtok_r(dup_str, ":", &tmp_str1); + if (!pattern) { + max_pri = -1; + goto out; + } + + priority = strtok_r(NULL, ":", &tmp_str1); + if (!priority) { + max_pri = -1; + goto out; + } + + gf_msg_trace("io-cache", 0, "ioc priority : pattern %s : priority %s", + pattern, priority); + + curr->pattern = gf_strdup(pattern); + if (curr->pattern == NULL) { + max_pri = -1; + goto out; + } + + curr->priority = strtol(priority, &tmp_str2, 0); + if (tmp_str2 && (*tmp_str2)) { + max_pri = -1; + goto out; + } else { + max_pri = max(max_pri, curr->priority); + } + + GF_FREE(dup_str); + dup_str = NULL; + + stripe_str = strtok_r(NULL, ",", &tmp_str); + } out: - GF_FREE (string); + GF_FREE(string); - GF_FREE (dup_str); + GF_FREE(dup_str); - if (max_pri == -1) { - list_for_each_entry_safe (curr, tmp, first, list) { - list_del_init (&curr->list); - GF_FREE (curr->pattern); - GF_FREE (curr); - } + if (max_pri == -1) { + list_for_each_entry_safe(curr, tmp, first, list) + { + list_del_init(&curr->list); + GF_FREE(curr->pattern); + GF_FREE(curr); } + } - return max_pri; + return max_pri; } int32_t -mem_acct_init (xlator_t *this) +mem_acct_init(xlator_t *this) { - int ret = -1; + int ret = -1; - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_ioc_mt_end + 1); + if (!this) + return ret; - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } + ret = xlator_mem_acct_init(this, gf_ioc_mt_end + 1); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, + IO_CACHE_MSG_MEMORY_INIT_FAILED, NULL); return ret; -} + } + return ret; +} static gf_boolean_t -check_cache_size_ok (xlator_t *this, uint64_t cache_size) +check_cache_size_ok(xlator_t *this, uint64_t cache_size) { - gf_boolean_t ret = _gf_true; - uint64_t total_mem = 0; - uint64_t max_cache_size = 0; - volume_option_t *opt = NULL; - - GF_ASSERT (this); - opt = xlator_volume_option_get (this, "cache-size"); - if (!opt) { - ret = _gf_false; - gf_log (this->name, GF_LOG_ERROR, - "could not get cache-size option"); - goto out; - } - - total_mem = get_mem_size (); - if (-1 == total_mem) - max_cache_size = opt->max; - else - max_cache_size = total_mem; - - gf_log (this->name, GF_LOG_DEBUG, "Max cache size is %"PRIu64, - max_cache_size); - - if (cache_size > max_cache_size) { - ret = _gf_false; - gf_log (this->name, GF_LOG_ERROR, "Cache size %"PRIu64 - " is greater than the max size of %"PRIu64, - cache_size, max_cache_size); - goto out; - } + gf_boolean_t ret = _gf_true; + uint64_t total_mem = 0; + uint64_t max_cache_size = 0; + volume_option_t *opt = NULL; + + GF_ASSERT(this); + opt = xlator_volume_option_get(this, "cache-size"); + if (!opt) { + ret = _gf_false; + gf_smsg(this->name, GF_LOG_ERROR, EINVAL, + IO_CACHE_MSG_NO_CACHE_SIZE_OPT, NULL); + goto out; + } + + total_mem = get_mem_size(); + if (-1 == total_mem) + max_cache_size = opt->max; + else + max_cache_size = total_mem; + + gf_msg_debug(this->name, 0, "Max cache size is %" PRIu64, max_cache_size); + + if (cache_size > max_cache_size) 
{ + ret = _gf_false; + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_INVALID_ARGUMENT, + "Cache-size=%" PRIu64, cache_size, "max-size=%" PRIu64, + max_cache_size, NULL); + goto out; + } out: - return ret; + return ret; } int -reconfigure (xlator_t *this, dict_t *options) +reconfigure(xlator_t *this, dict_t *options) { - data_t *data = NULL; - ioc_table_t *table = NULL; - int ret = -1; - uint64_t cache_size_new = 0; - if (!this || !this->private) - goto out; + data_t *data = NULL; + ioc_table_t *table = NULL; + int ret = -1; + uint64_t cache_size_new = 0; + if (!this || !this->private) + goto out; - table = this->private; + table = this->private; - ioc_table_lock (table); - { - GF_OPTION_RECONF ("cache-timeout", table->cache_timeout, - options, int32, unlock); + ioc_table_lock(table); + { + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, + unlock); - data = dict_get (options, "priority"); - if (data) { - char *option_list = data_to_str (data); + GF_OPTION_RECONF("cache-timeout", table->cache_timeout, options, int32, + unlock); - gf_log (this->name, GF_LOG_TRACE, - "option path %s", option_list); - /* parse the list of pattern:priority */ - table->max_pri = ioc_get_priority_list (option_list, - &table->priority_list); + data = dict_get(options, "priority"); + if (data) { + char *option_list = data_to_str(data); - if (table->max_pri == -1) { - goto unlock; - } - table->max_pri ++; - } + gf_msg_trace(this->name, 0, "option path %s", option_list); + /* parse the list of pattern:priority */ + table->max_pri = ioc_get_priority_list(option_list, + &table->priority_list); - GF_OPTION_RECONF ("max-file-size", table->max_file_size, - options, size, unlock); + if (table->max_pri == -1) { + goto unlock; + } + table->max_pri++; + } - GF_OPTION_RECONF ("min-file-size", table->min_file_size, - options, size, unlock); + GF_OPTION_RECONF("max-file-size", table->max_file_size, options, + size_uint64, unlock); - if ((table->max_file_size >= 0) && - (table->min_file_size > table->max_file_size)) { - gf_log (this->name, GF_LOG_ERROR, "minimum size (%" - PRIu64") of a file that can be cached is " - "greater than maximum size (%"PRIu64"). 
" - "Hence Defaulting to old value", - table->min_file_size, table->max_file_size); - goto unlock; - } + GF_OPTION_RECONF("min-file-size", table->min_file_size, options, + size_uint64, unlock); - GF_OPTION_RECONF ("cache-size", cache_size_new, - options, size, unlock); - if (!check_cache_size_ok (this, cache_size_new)) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "Not reconfiguring cache-size"); - goto unlock; - } - table->cache_size = cache_size_new; + if ((table->max_file_size <= UINT64_MAX) && + (table->min_file_size > table->max_file_size)) { + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_DEFAULTING_TO_OLD, + "minimum-size=%" PRIu64, table->min_file_size, + "maximum-size=%" PRIu64, table->max_file_size, NULL); + goto unlock; + } - ret = 0; + GF_OPTION_RECONF("cache-size", cache_size_new, options, size_uint64, + unlock); + if (!check_cache_size_ok(this, cache_size_new)) { + ret = -1; + gf_smsg(this->name, GF_LOG_ERROR, 0, + IO_CACHE_MSG_NOT_RECONFIG_CACHE_SIZE, NULL); + goto unlock; } + table->cache_size = cache_size_new; + + ret = 0; + } unlock: - ioc_table_unlock (table); + ioc_table_unlock(table); out: - return ret; + return ret; } - /* * init - * @this: * */ int32_t -init (xlator_t *this) +init(xlator_t *this) { - ioc_table_t *table = NULL; - dict_t *xl_options = NULL; - uint32_t index = 0; - int32_t ret = -1; - glusterfs_ctx_t *ctx = NULL; - data_t *data = 0; - uint32_t num_pages = 0; - - xl_options = this->options; - - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: io-cache not configured with exactly " - "one child"); - goto out; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - table = (void *) GF_CALLOC (1, sizeof (*table), gf_ioc_mt_ioc_table_t); - if (table == NULL) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto out; - } - - table->xl = this; - table->page_size = this->ctx->page_size; - - GF_OPTION_INIT ("cache-size", table->cache_size, size, out); - - GF_OPTION_INIT ("cache-timeout", table->cache_timeout, int32, out); - - GF_OPTION_INIT ("min-file-size", table->min_file_size, size, out); - - GF_OPTION_INIT ("max-file-size", table->max_file_size, size, out); - - if (!check_cache_size_ok (this, table->cache_size)) { - ret = -1; - goto out; - } - - INIT_LIST_HEAD (&table->priority_list); - table->max_pri = 1; - data = dict_get (xl_options, "priority"); - if (data) { - char *option_list = data_to_str (data); - gf_log (this->name, GF_LOG_TRACE, - "option path %s", option_list); - /* parse the list of pattern:priority */ - table->max_pri = ioc_get_priority_list (option_list, - &table->priority_list); - - if (table->max_pri == -1) { - goto out; - } - } - table->max_pri ++; - - INIT_LIST_HEAD (&table->inodes); - - if ((table->max_file_size >= 0) - && (table->min_file_size > table->max_file_size)) { - gf_log ("io-cache", GF_LOG_ERROR, "minimum size (%" - PRIu64") of a file that can be cached is " - "greater than maximum size (%"PRIu64")", - table->min_file_size, table->max_file_size); - goto out; - } - - table->inode_lru = GF_CALLOC (table->max_pri, - sizeof (struct list_head), - gf_ioc_mt_list_head); - if (table->inode_lru == NULL) { - goto out; - } - - for (index = 0; index < (table->max_pri); index++) - INIT_LIST_HEAD (&table->inode_lru[index]); - - this->local_pool = mem_pool_new (ioc_local_t, 64); - if (!this->local_pool) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "failed to create local_t's memory pool"); - goto out; - } - - 
pthread_mutex_init (&table->table_lock, NULL); - this->private = table; - - num_pages = (table->cache_size / table->page_size) - + ((table->cache_size % table->page_size) - ? 1 : 0); - - table->mem_pool = mem_pool_new (rbthash_entry_t, num_pages); - if (!table->mem_pool) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to allocate mem_pool"); - goto out; - } - - ret = 0; - - ctx = this->ctx; - ioc_log2_page_size = log_base2 (ctx->page_size); + ioc_table_t *table = NULL; + dict_t *xl_options = NULL; + uint32_t index = 0; + int32_t ret = -1; + glusterfs_ctx_t *ctx = NULL; + data_t *data = 0; + uint32_t num_pages = 0; + + xl_options = this->options; + + if (!this->children || this->children->next) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + IO_CACHE_MSG_XLATOR_CHILD_MISCONFIGURED, NULL); + goto out; + } + + if (!this->parents) { + gf_smsg(this->name, GF_LOG_WARNING, 0, IO_CACHE_MSG_VOL_MISCONFIGURED, + NULL); + } + + table = (void *)GF_CALLOC(1, sizeof(*table), gf_ioc_mt_ioc_table_t); + if (table == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL); + goto out; + } + + table->xl = this; + table->page_size = this->ctx->page_size; + + GF_OPTION_INIT("pass-through", this->pass_through, bool, out); + + GF_OPTION_INIT("cache-size", table->cache_size, size_uint64, out); + + GF_OPTION_INIT("cache-timeout", table->cache_timeout, int32, out); + + GF_OPTION_INIT("min-file-size", table->min_file_size, size_uint64, out); + + GF_OPTION_INIT("max-file-size", table->max_file_size, size_uint64, out); + + if (!check_cache_size_ok(this, table->cache_size)) { + ret = -1; + goto out; + } + + INIT_LIST_HEAD(&table->priority_list); + table->max_pri = 1; + data = dict_get(xl_options, "priority"); + if (data) { + char *option_list = data_to_str(data); + gf_msg_trace(this->name, 0, "option path %s", option_list); + /* parse the list of pattern:priority */ + table->max_pri = ioc_get_priority_list(option_list, + &table->priority_list); + + if (table->max_pri == -1) { + goto out; + } + } + table->max_pri++; + + INIT_LIST_HEAD(&table->inodes); + + if ((table->max_file_size <= UINT64_MAX) && + (table->min_file_size > table->max_file_size)) { + gf_smsg("io-cache", GF_LOG_ERROR, 0, IO_CACHE_MSG_INVALID_ARGUMENT, + "minimum-size=%" PRIu64, table->min_file_size, + "maximum-size=%" PRIu64, table->max_file_size, NULL); + goto out; + } + + table->inode_lru = GF_CALLOC(table->max_pri, sizeof(struct list_head), + gf_ioc_mt_list_head); + if (table->inode_lru == NULL) { + goto out; + } + + for (index = 0; index < (table->max_pri); index++) + INIT_LIST_HEAD(&table->inode_lru[index]); + + this->local_pool = mem_pool_new(ioc_local_t, 64); + if (!this->local_pool) { + ret = -1; + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, + IO_CACHE_MSG_CREATE_MEM_POOL_FAILED, NULL); + goto out; + } + + pthread_mutex_init(&table->table_lock, NULL); + this->private = table; + + num_pages = (table->cache_size / table->page_size) + + ((table->cache_size % table->page_size) ? 
1 : 0); + + table->mem_pool = mem_pool_new(rbthash_entry_t, num_pages); + if (!table->mem_pool) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, + IO_CACHE_MSG_ALLOC_MEM_POOL_FAILED, NULL); + goto out; + } + + ret = 0; + + ctx = this->ctx; + ioc_log2_page_size = log_base2(ctx->page_size); out: - if (ret == -1) { - if (table != NULL) { - GF_FREE (table->inode_lru); - GF_FREE (table); - } + if (ret == -1) { + if (table != NULL) { + GF_FREE(table->inode_lru); + GF_FREE(table); } + } - return ret; + return ret; } void -ioc_page_waitq_dump (ioc_page_t *page, char *prefix) +ioc_page_waitq_dump(ioc_page_t *page, char *prefix) { - ioc_waitq_t *trav = NULL; - call_frame_t *frame = NULL; - int32_t i = 0; - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; - - trav = page->waitq; - - while (trav) { - frame = trav->data; - sprintf (key, "waitq.frame[%d]", i++); - gf_proc_dump_write (key, "%"PRId64, frame->root->unique); - - trav = trav->next; - } + ioc_waitq_t *trav = NULL; + call_frame_t *frame = NULL; + int32_t i = 0; + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + + trav = page->waitq; + + while (trav) { + frame = trav->data; + sprintf(key, "waitq.frame[%d]", i++); + gf_proc_dump_write(key, "%" PRId64, frame->root->unique); + + trav = trav->next; + } } void -__ioc_inode_waitq_dump (ioc_inode_t *ioc_inode, char *prefix) +__ioc_inode_waitq_dump(ioc_inode_t *ioc_inode, char *prefix) { - ioc_waitq_t *trav = NULL; - ioc_page_t *page = NULL; - int32_t i = 0; - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; + ioc_waitq_t *trav = NULL; + ioc_page_t *page = NULL; + int32_t i = 0; + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; - trav = ioc_inode->waitq; + trav = ioc_inode->waitq; - while (trav) { - page = trav->data; + while (trav) { + page = trav->data; - sprintf (key, "cache-validation-waitq.page[%d].offset", i++); - gf_proc_dump_write (key, "%"PRId64, page->offset); + sprintf(key, "cache-validation-waitq.page[%d].offset", i++); + gf_proc_dump_write(key, "%" PRId64, page->offset); - trav = trav->next; - } + trav = trav->next; + } } void -__ioc_page_dump (ioc_page_t *page, char *prefix) +__ioc_page_dump(ioc_page_t *page, char *prefix) { + int ret = -1; - int ret = -1; - - if (!page) - return; - /* ioc_page_lock can be used to hold the mutex. But in statedump - * its better to use trylock to avoid deadlocks. - */ - ret = pthread_mutex_trylock (&page->page_lock); - if (ret) - goto out; - { - gf_proc_dump_write ("offset", "%"PRId64, page->offset); - gf_proc_dump_write ("size", "%"PRId64, page->size); - gf_proc_dump_write ("dirty", "%s", page->dirty ? "yes" : "no"); - gf_proc_dump_write ("ready", "%s", page->ready ? "yes" : "no"); - ioc_page_waitq_dump (page, prefix); - } - pthread_mutex_unlock (&page->page_lock); + if (!page) + return; + /* ioc_page_lock can be used to hold the mutex. But in statedump + * its better to use trylock to avoid deadlocks. + */ + ret = pthread_mutex_trylock(&page->page_lock); + if (ret) + goto out; + { + gf_proc_dump_write("offset", "%" PRId64, page->offset); + gf_proc_dump_write("size", "%" GF_PRI_SIZET, page->size); + gf_proc_dump_write("dirty", "%s", page->dirty ? "yes" : "no"); + gf_proc_dump_write("ready", "%s", page->ready ? 
"yes" : "no"); + ioc_page_waitq_dump(page, prefix); + } + pthread_mutex_unlock(&page->page_lock); out: - if (ret && page) - gf_proc_dump_write ("Unable to dump the page information", - "(Lock acquisition failed) %p", page); + if (ret && page) + gf_proc_dump_write("Unable to dump the page information", + "(Lock acquisition failed) %p", page); - return; + return; } void -__ioc_cache_dump (ioc_inode_t *ioc_inode, char *prefix) +__ioc_cache_dump(ioc_inode_t *ioc_inode, char *prefix) { - off_t offset = 0; - ioc_table_t *table = NULL; - ioc_page_t *page = NULL; - int i = 0; - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; - char timestr[256] = {0, }; - - if ((ioc_inode == NULL) || (prefix == NULL)) { - goto out; - } - - table = ioc_inode->table; - - if (ioc_inode->cache.tv.tv_sec) { - gf_time_fmt (timestr, sizeof timestr, - ioc_inode->cache.tv.tv_sec, gf_timefmt_FT); - snprintf (timestr + strlen (timestr), sizeof timestr - strlen (timestr), - ".%"GF_PRI_SUSECONDS, ioc_inode->cache.tv.tv_usec); - - gf_proc_dump_write ("last-cache-validation-time", "%s", - timestr); - } - - for (offset = 0; offset < ioc_inode->ia_size; - offset += table->page_size) { - page = __ioc_page_get (ioc_inode, offset); - if (page == NULL) { - continue; - } - - sprintf (key, "inode.cache.page[%d]", i++); - __ioc_page_dump (page, key); - } + off_t offset = 0; + ioc_table_t *table = NULL; + ioc_page_t *page = NULL; + int i = 0; + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + + if ((ioc_inode == NULL) || (prefix == NULL)) { + goto out; + } + + table = ioc_inode->table; + + if (ioc_inode->cache.last_revalidate) { + gf_time_fmt(timestr, sizeof timestr, ioc_inode->cache.last_revalidate, + gf_timefmt_FT); + + gf_proc_dump_write("last-cache-validation-time", "%s", timestr); + } + + for (offset = 0; offset < ioc_inode->ia_size; offset += table->page_size) { + page = __ioc_page_get(ioc_inode, offset); + if (page == NULL) { + continue; + } + + sprintf(key, "inode.cache.page[%d]", i++); + __ioc_page_dump(page, key); + } out: - return; + return; } - int -ioc_inode_dump (xlator_t *this, inode_t *inode) +ioc_inode_dump(xlator_t *this, inode_t *inode) { - - char *path = NULL; - int ret = -1; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - uint64_t tmp_ioc_inode = 0; - ioc_inode_t *ioc_inode = NULL; - gf_boolean_t section_added = _gf_false; - char uuid_str[64] = {0,}; - - if (this == NULL || inode == NULL) - goto out; - - gf_proc_dump_build_key (key_prefix, "io-cache", "inode"); - - inode_ctx_get (inode, this, &tmp_ioc_inode); - ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; - if (ioc_inode == NULL) - goto out; - - /* Similar to ioc_page_dump function its better to use - * pthread_mutex_trylock and not to use gf_log in statedump - * to avoid deadlocks. 
- */ - ret = pthread_mutex_trylock (&ioc_inode->inode_lock); - if (ret) - goto out; - - { - if (uuid_is_null (ioc_inode->inode->gfid)) - goto unlock; - - gf_proc_dump_add_section (key_prefix); - section_added = _gf_true; - - __inode_path (ioc_inode->inode, NULL, &path); - - gf_proc_dump_write ("inode.weight", "%d", ioc_inode->weight); - - if (path) { - gf_proc_dump_write ("path", "%s", path); - GF_FREE (path); - } - - gf_proc_dump_write ("uuid", "%s", uuid_utoa_r - (ioc_inode->inode->gfid, uuid_str)); - __ioc_cache_dump (ioc_inode, key_prefix); - __ioc_inode_waitq_dump (ioc_inode, key_prefix); - } + char *path = NULL; + int ret = -1; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + uint64_t tmp_ioc_inode = 0; + ioc_inode_t *ioc_inode = NULL; + gf_boolean_t section_added = _gf_false; + char uuid_str[64] = { + 0, + }; + + if (this == NULL || inode == NULL) + goto out; + + gf_proc_dump_build_key(key_prefix, "io-cache", "inode"); + + inode_ctx_get(inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; + if (ioc_inode == NULL) + goto out; + + /* Similar to ioc_page_dump function its better to use + * pthread_mutex_trylock and not to use gf_log in statedump + * to avoid deadlocks. + */ + ret = pthread_mutex_trylock(&ioc_inode->inode_lock); + if (ret) + goto out; + + { + if (gf_uuid_is_null(ioc_inode->inode->gfid)) + goto unlock; + + gf_proc_dump_add_section("%s", key_prefix); + section_added = _gf_true; + + __inode_path(ioc_inode->inode, NULL, &path); + + gf_proc_dump_write("inode.weight", "%d", ioc_inode->weight); + + if (path) { + gf_proc_dump_write("path", "%s", path); + GF_FREE(path); + } + + gf_proc_dump_write("uuid", "%s", + uuid_utoa_r(ioc_inode->inode->gfid, uuid_str)); + __ioc_cache_dump(ioc_inode, key_prefix); + __ioc_inode_waitq_dump(ioc_inode, key_prefix); + } unlock: - pthread_mutex_unlock (&ioc_inode->inode_lock); + pthread_mutex_unlock(&ioc_inode->inode_lock); out: - if (ret && ioc_inode) { - if (section_added == _gf_false) - gf_proc_dump_add_section (key_prefix); - gf_proc_dump_write ("Unable to print the status of ioc_inode", - "(Lock acquisition failed) %s", - uuid_utoa (inode->gfid)); - } - return ret; + if (ret && ioc_inode) { + if (section_added == _gf_false) + gf_proc_dump_add_section("%s", key_prefix); + gf_proc_dump_write("Unable to print the status of ioc_inode", + "(Lock acquisition failed) %s", + uuid_utoa(inode->gfid)); + } + return ret; } int -ioc_priv_dump (xlator_t *this) +ioc_priv_dump(xlator_t *this) { - ioc_table_t *priv = NULL; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - int ret = -1; - gf_boolean_t add_section = _gf_false; - - if (!this || !this->private) - goto out; - - priv = this->private; - - gf_proc_dump_build_key (key_prefix, "io-cache", "priv"); - gf_proc_dump_add_section (key_prefix); - add_section = _gf_true; - - ret = pthread_mutex_trylock (&priv->table_lock); - if (ret) - goto out; - { - gf_proc_dump_write ("page_size", "%ld", priv->page_size); - gf_proc_dump_write ("cache_size", "%ld", priv->cache_size); - gf_proc_dump_write ("cache_used", "%ld", priv->cache_used); - gf_proc_dump_write ("inode_count", "%u", priv->inode_count); - gf_proc_dump_write ("cache_timeout", "%u", priv->cache_timeout); - gf_proc_dump_write ("min-file-size", "%u", priv->min_file_size); - gf_proc_dump_write ("max-file-size", "%u", priv->max_file_size); - } - pthread_mutex_unlock (&priv->table_lock); + ioc_table_t *priv = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + int ret = -1; + gf_boolean_t add_section = _gf_false; + + if 
(!this || !this->private) + goto out; + + priv = this->private; + + gf_proc_dump_build_key(key_prefix, "io-cache", "priv"); + gf_proc_dump_add_section("%s", key_prefix); + add_section = _gf_true; + + ret = pthread_mutex_trylock(&priv->table_lock); + if (ret) + goto out; + { + gf_proc_dump_write("page_size", "%" PRIu64, priv->page_size); + gf_proc_dump_write("cache_size", "%" PRIu64, priv->cache_size); + gf_proc_dump_write("cache_used", "%" PRIu64, priv->cache_used); + gf_proc_dump_write("inode_count", "%u", priv->inode_count); + gf_proc_dump_write("cache_timeout", "%u", priv->cache_timeout); + gf_proc_dump_write("min-file-size", "%" PRIu64, priv->min_file_size); + gf_proc_dump_write("max-file-size", "%" PRIu64, priv->max_file_size); + } + pthread_mutex_unlock(&priv->table_lock); out: - if (ret && priv) { - if (!add_section) { - gf_proc_dump_build_key (key_prefix, "xlator." - "performance.io-cache", "priv"); - gf_proc_dump_add_section (key_prefix); - } - gf_proc_dump_write ("Unable to dump the state of private " - "structure of io-cache xlator", "(Lock " - "acquisition failed) %s", this->name); - } - - return 0; + if (ret && priv) { + if (!add_section) { + gf_proc_dump_build_key(key_prefix, + "xlator." + "performance.io-cache", + "priv"); + gf_proc_dump_add_section("%s", key_prefix); + } + gf_proc_dump_write( + "Unable to dump the state of private " + "structure of io-cache xlator", + "(Lock " + "acquisition failed) %s", + this->name); + } + + return 0; } /* @@ -2025,106 +2088,144 @@ out: * */ void -fini (xlator_t *this) +fini(xlator_t *this) { - ioc_table_t *table = NULL; - struct ioc_priority *curr = NULL, *tmp = NULL; - int i = 0; - - table = this->private; - - if (table == NULL) - return; - - this->private = NULL; - - if (table->mem_pool != NULL) { - mem_pool_destroy (table->mem_pool); - table->mem_pool = NULL; - } - - list_for_each_entry_safe (curr, tmp, &table->priority_list, list) { - list_del_init (&curr->list); - GF_FREE (curr->pattern); - GF_FREE (curr); - } - - for (i = 0; i < table->max_pri; i++) { - GF_ASSERT (list_empty (&table->inode_lru[i])); - } + ioc_table_t *table = NULL; + struct ioc_priority *curr = NULL, *tmp = NULL; - GF_ASSERT (list_empty (&table->inodes)); - pthread_mutex_destroy (&table->table_lock); - GF_FREE (table); + table = this->private; - this->private = NULL; + if (table == NULL) return; + + this->private = NULL; + + if (table->mem_pool != NULL) { + mem_pool_destroy(table->mem_pool); + table->mem_pool = NULL; + } + + list_for_each_entry_safe(curr, tmp, &table->priority_list, list) + { + list_del_init(&curr->list); + GF_FREE(curr->pattern); + GF_FREE(curr); + } + + /* inode_lru and inodes list can be empty in case fini() is + * called soon after init()? Hence commenting the below asserts. 
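The teardown loop above walks the priority list with the *_safe iterator because each node is freed while the list is being traversed. A standalone sketch of the same save-the-successor-before-freeing rule, using a plain singly linked list in place of the kernel-style list_head:

/* Standalone sketch: the next pointer must be captured before the current
 * node is released, otherwise the walk dereferences freed memory. */
#include <stdlib.h>
#include <string.h>

struct prio {
    char *pattern;
    struct prio *next;
};

static void free_priorities(struct prio *head)
{
    struct prio *curr = head;
    struct prio *tmp;

    while (curr != NULL) {
        tmp = curr->next;      /* save the successor first */
        free(curr->pattern);
        free(curr);            /* now curr may be released safely */
        curr = tmp;
    }
}

int main(void)
{
    struct prio *p = calloc(1, sizeof(*p));

    if (p != NULL) {
        p->pattern = strdup("*.jpg");
        free_priorities(p);
    }
    return 0;
}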
+ */ + /*for (i = 0; i < table->max_pri; i++) { + GF_ASSERT (list_empty (&table->inode_lru[i])); + } + + GF_ASSERT (list_empty (&table->inodes)); + */ + pthread_mutex_destroy(&table->table_lock); + GF_FREE(table); + + this->private = NULL; + return; } struct xlator_fops fops = { - .open = ioc_open, - .create = ioc_create, - .readv = ioc_readv, - .writev = ioc_writev, - .truncate = ioc_truncate, - .ftruncate = ioc_ftruncate, - .lookup = ioc_lookup, - .lk = ioc_lk, - .setattr = ioc_setattr, - .mknod = ioc_mknod, - - .readdirp = ioc_readdirp, - .discard = ioc_discard, + .open = ioc_open, + .create = ioc_create, + .readv = ioc_readv, + .writev = ioc_writev, + .truncate = ioc_truncate, + .ftruncate = ioc_ftruncate, + .lookup = ioc_lookup, + .lk = ioc_lk, + .setattr = ioc_setattr, + .mknod = ioc_mknod, + + .readdirp = ioc_readdirp, + .discard = ioc_discard, + .zerofill = ioc_zerofill, }; - struct xlator_dumpops dumpops = { - .priv = ioc_priv_dump, - .inodectx = ioc_inode_dump, + .priv = ioc_priv_dump, + .inodectx = ioc_inode_dump, }; struct xlator_cbks cbks = { - .forget = ioc_forget, - .release = ioc_release, - .invalidate = ioc_invalidate, + .forget = ioc_forget, + .release = ioc_release, + .invalidate = ioc_invalidate, }; struct volume_options options[] = { - { .key = {"priority"}, - .type = GF_OPTION_TYPE_PRIORITY_LIST, - .default_value = "", - .description = "Assigns priority to filenames with specific " - "patterns so that when a page needs to be ejected " - "out of the cache, the page of a file whose " - "priority is the lowest will be ejected earlier" - }, - { .key = {"cache-timeout", "force-revalidate-timeout"}, - .type = GF_OPTION_TYPE_INT, - .min = 0, - .max = 60, - .default_value = "1", - .description = "The cached data for a file will be retained till " - "'cache-refresh-timeout' seconds, after which data " - "re-validation is performed." - }, - { .key = {"cache-size"}, - .type = GF_OPTION_TYPE_SIZET, - .min = 4 * GF_UNIT_MB, - .max = 32 * GF_UNIT_GB, - .default_value = "32MB", - .description = "Size of the read cache." - }, - { .key = {"min-file-size"}, - .type = GF_OPTION_TYPE_SIZET, - .default_value = "0", - .description = "Minimum file size which would be cached by the " - "io-cache translator." - }, - { .key = {"max-file-size"}, - .type = GF_OPTION_TYPE_SIZET, - .default_value = "0", - .description = "Maximum file size which would be cached by the " - "io-cache translator." 
- }, - { .key = {NULL} }, + { + .key = {"io-cache"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable io-cache", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + {.key = {"priority"}, + .type = GF_OPTION_TYPE_PRIORITY_LIST, + .default_value = "", + .description = "Assigns priority to filenames with specific " + "patterns so that when a page needs to be ejected " + "out of the cache, the page of a file whose " + "priority is the lowest will be ejected earlier", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"cache-timeout", "force-revalidate-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 60, + .default_value = "1", + .description = "The cached data for a file will be retained for " + "'cache-refresh-timeout' seconds, after which data " + "re-validation is performed.", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"cache-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 4 * GF_UNIT_MB, + .max = INFINITY, + .default_value = "32MB", + .description = "Size of the read cache.", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"min-file-size"}, + .type = GF_OPTION_TYPE_SIZET, + .default_value = "0", + .description = "Minimum file size which would be cached by the " + "io-cache translator.", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"max-file-size"}, + .type = GF_OPTION_TYPE_SIZET, + .default_value = "0", + .description = "Maximum file size which would be cached by the " + "io-cache translator.", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"io-cache"}, + .description = "Enable/Disable io cache translator"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "io-cache", + .category = GF_MAINTAINED, }; diff --git a/xlators/performance/io-cache/src/io-cache.h b/xlators/performance/io-cache/src/io-cache.h index 46d758a6608..14923c75edc 100644 --- a/xlators/performance/io-cache/src/io-cache.h +++ b/xlators/performance/io-cache/src/io-cache.h @@ -11,27 +11,19 @@ #ifndef __IO_CACHE_H #define __IO_CACHE_H -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include <sys/types.h> -#include "compat-errno.h" - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "common-utils.h" -#include "call-stub.h" -#include "rbthash.h" -#include "hashfn.h" +#include <glusterfs/compat-errno.h> + +#include <glusterfs/glusterfs.h> +#include <glusterfs/dict.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/rbthash.h> #include <sys/time.h> #include <fnmatch.h> +#include "io-cache-messages.h" -#define IOC_PAGE_SIZE (1024 * 128) /* 128KB */ -#define IOC_CACHE_SIZE (32 * 1024 * 1024) +#define IOC_PAGE_SIZE (1024 * 128) /* 128KB */ +#define IOC_CACHE_SIZE (32 * 1024 * 1024) #define IOC_PAGE_TABLE_BUCKET_COUNT 1 struct ioc_table; @@ -40,9 +32,9 @@ struct ioc_page; struct 
ioc_inode; struct ioc_priority { - struct list_head list; - char *pattern; - uint32_t priority; + struct list_head list; + char *pattern; + uint32_t priority; }; /* @@ -53,10 +45,10 @@ struct ioc_priority { * @data: pointer to the frame which is waiting */ struct ioc_waitq { - struct ioc_waitq *next; - void *data; - off_t pending_offset; - size_t pending_size; + struct ioc_waitq *next; + void *data; + off_t pending_offset; + size_t pending_size; }; /* @@ -64,39 +56,41 @@ struct ioc_waitq { * */ struct ioc_fill { - struct list_head list; /* list of ioc_fill structures of a frame */ - off_t offset; - size_t size; - struct iovec *vector; - int32_t count; - struct iobref *iobref; + struct list_head list; /* list of ioc_fill structures of a frame */ + off_t offset; + size_t size; + struct iovec *vector; + int32_t count; + struct iobref *iobref; }; struct ioc_local { - mode_t mode; - int32_t flags; - loc_t file_loc; - off_t offset; - size_t size; - int32_t op_ret; - int32_t op_errno; - struct list_head fill_list; /* list of ioc_fill structures */ - off_t pending_offset; /* - * offset from this frame should - * continue - */ - size_t pending_size; /* - * size of data this frame is waiting - * on - */ - struct ioc_inode *inode; - int32_t wait_count; - pthread_mutex_t local_lock; - struct ioc_waitq *waitq; - void *stub; - fd_t *fd; - int32_t need_xattr; - dict_t *xattr_req; + mode_t mode; + int32_t flags; + loc_t file_loc; + off_t offset; + size_t size; + int32_t op_ret; + int32_t op_errno; + struct list_head fill_list; /* list of ioc_fill structures */ + off_t pending_offset; /* + * offset from this frame should + * continue + */ + size_t pending_size; /* + * size of data this frame is waiting + * on + */ + struct ioc_inode *inode; + int32_t wait_count; + pthread_mutex_t local_lock; + struct ioc_waitq *waitq; + void *stub; + fd_t *fd; + struct iovec *vector; + struct iobref *iobref; + int32_t need_xattr; + dict_t *xattr_req; }; /* @@ -104,71 +98,69 @@ struct ioc_local { * */ struct ioc_page { - struct list_head page_lru; - struct ioc_inode *inode; /* inode this page belongs to */ - struct ioc_priority *priority; - char dirty; - char ready; - struct iovec *vector; - int32_t count; - off_t offset; - size_t size; - struct ioc_waitq *waitq; - struct iobref *iobref; - pthread_mutex_t page_lock; - int32_t op_errno; - char stale; + struct list_head page_lru; + struct ioc_inode *inode; /* inode this page belongs to */ + struct ioc_priority *priority; + char dirty; + char ready; + struct iovec *vector; + int32_t count; + off_t offset; + size_t size; + struct ioc_waitq *waitq; + struct iobref *iobref; + pthread_mutex_t page_lock; + int32_t op_errno; + char stale; }; struct ioc_cache { - rbthash_table_t *page_table; - struct list_head page_lru; - time_t mtime; /* - * seconds component of file mtime - */ - time_t mtime_nsec; /* - * nanosecond component of file mtime - */ - struct timeval tv; /* - * time-stamp at last re-validate - */ + rbthash_table_t *page_table; + struct list_head page_lru; + time_t mtime; /* + * seconds component of file mtime + */ + time_t mtime_nsec; /* + * nanosecond component of file mtime + */ + time_t last_revalidate; /* timestamp at last re-validate */ }; struct ioc_inode { - struct ioc_table *table; - off_t ia_size; - struct ioc_cache cache; - struct list_head inode_list; /* - * list of inodes, maintained by - * io-cache translator - */ - struct list_head inode_lru; - struct ioc_waitq *waitq; - pthread_mutex_t inode_lock; - uint32_t weight; /* - * weight of the inode, increases - * 
on each read - */ - inode_t *inode; + struct ioc_table *table; + off_t ia_size; + struct ioc_cache cache; + struct list_head inode_list; /* + * list of inodes, maintained by + * io-cache translator + */ + struct list_head inode_lru; + struct ioc_waitq *waitq; + pthread_mutex_t inode_lock; + uint32_t weight; /* + * weight of the inode, increases + * on each read + */ + inode_t *inode; }; struct ioc_table { - uint64_t page_size; - uint64_t cache_size; - uint64_t cache_used; - uint64_t min_file_size; - uint64_t max_file_size; - struct list_head inodes; /* list of inodes cached */ - struct list_head active; - struct list_head *inode_lru; - struct list_head priority_list; - int32_t readv_count; - pthread_mutex_t table_lock; - xlator_t *xl; - uint32_t inode_count; - int32_t cache_timeout; - int32_t max_pri; - struct mem_pool *mem_pool; + uint64_t page_size; + uint64_t cache_size; + uint64_t cache_used; + uint64_t min_file_size; + uint64_t max_file_size; + struct list_head inodes; /* list of inodes cached */ + struct list_head active; + struct list_head *inode_lru; + struct list_head priority_list; + int32_t readv_count; + pthread_mutex_t table_lock; + xlator_t *xl; + uint32_t inode_count; + int32_t cache_timeout; + int32_t max_pri; + struct mem_pool *mem_pool; }; typedef struct ioc_table ioc_table_t; @@ -179,154 +171,136 @@ typedef struct ioc_waitq ioc_waitq_t; typedef struct ioc_fill ioc_fill_t; void * -str_to_ptr (char *string); +str_to_ptr(char *string); char * -ptr_to_str (void *ptr); +ptr_to_str(void *ptr); int32_t -ioc_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, - struct iobref *iobref, dict_t *xdata); +ioc_readv_disabled_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata); ioc_page_t * -__ioc_page_get (ioc_inode_t *ioc_inode, off_t offset); +__ioc_page_get(ioc_inode_t *ioc_inode, off_t offset); ioc_page_t * -__ioc_page_create (ioc_inode_t *ioc_inode, off_t offset); +__ioc_page_create(ioc_inode_t *ioc_inode, off_t offset); void -ioc_page_fault (ioc_inode_t *ioc_inode, call_frame_t *frame, fd_t *fd, - off_t offset); +ioc_page_fault(ioc_inode_t *ioc_inode, call_frame_t *frame, fd_t *fd, + off_t offset); void -__ioc_wait_on_page (ioc_page_t *page, call_frame_t *frame, off_t offset, - size_t size); +__ioc_wait_on_page(ioc_page_t *page, call_frame_t *frame, off_t offset, + size_t size); ioc_waitq_t * -__ioc_page_wakeup (ioc_page_t *page, int32_t op_errno); +__ioc_page_wakeup(ioc_page_t *page, int32_t op_errno); void -ioc_page_flush (ioc_page_t *page); +ioc_page_flush(ioc_page_t *page); ioc_waitq_t * -__ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno); +__ioc_page_error(ioc_page_t *page, int32_t op_ret, int32_t op_errno); void -ioc_frame_return (call_frame_t *frame); +ioc_frame_return(call_frame_t *frame); void -ioc_waitq_return (ioc_waitq_t *waitq); +ioc_waitq_return(ioc_waitq_t *waitq); int32_t -ioc_frame_fill (ioc_page_t *page, call_frame_t *frame, off_t offset, - size_t size, int32_t op_errno); - -#define ioc_inode_lock(ioc_inode) \ - do { \ - gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, \ - "locked inode(%p)", ioc_inode); \ - pthread_mutex_lock (&ioc_inode->inode_lock); \ - } while (0) - - -#define ioc_inode_unlock(ioc_inode) \ - do { \ - gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, \ - "unlocked 
inode(%p)", ioc_inode); \ - pthread_mutex_unlock (&ioc_inode->inode_lock); \ - } while (0) - - -#define ioc_table_lock(table) \ - do { \ - gf_log (table->xl->name, GF_LOG_TRACE, \ - "locked table(%p)", table); \ - pthread_mutex_lock (&table->table_lock); \ - } while (0) - - -#define ioc_table_unlock(table) \ - do { \ - gf_log (table->xl->name, GF_LOG_TRACE, \ - "unlocked table(%p)", table); \ - pthread_mutex_unlock (&table->table_lock); \ - } while (0) - - -#define ioc_local_lock(local) \ - do { \ - gf_log (local->inode->table->xl->name, GF_LOG_TRACE, \ - "locked local(%p)", local); \ - pthread_mutex_lock (&local->local_lock); \ - } while (0) - - -#define ioc_local_unlock(local) \ - do { \ - gf_log (local->inode->table->xl->name, GF_LOG_TRACE, \ - "unlocked local(%p)", local); \ - pthread_mutex_unlock (&local->local_lock); \ - } while (0) - - -#define ioc_page_lock(page) \ - do { \ - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, \ - "locked page(%p)", page); \ - pthread_mutex_lock (&page->page_lock); \ - } while (0) - - -#define ioc_page_unlock(page) \ - do { \ - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, \ - "unlocked page(%p)", page); \ - pthread_mutex_unlock (&page->page_lock); \ - } while (0) - - -static inline uint64_t -time_elapsed (struct timeval *now, - struct timeval *then) -{ - uint64_t sec = now->tv_sec - then->tv_sec; - - if (sec) - return sec; - - return 0; -} +ioc_frame_fill(ioc_page_t *page, call_frame_t *frame, off_t offset, size_t size, + int32_t op_errno); + +#define ioc_inode_lock(ioc_inode) \ + do { \ + gf_msg_trace(ioc_inode->table->xl->name, 0, "locked inode(%p)", \ + ioc_inode); \ + pthread_mutex_lock(&ioc_inode->inode_lock); \ + } while (0) + +#define ioc_inode_unlock(ioc_inode) \ + do { \ + gf_msg_trace(ioc_inode->table->xl->name, 0, "unlocked inode(%p)", \ + ioc_inode); \ + pthread_mutex_unlock(&ioc_inode->inode_lock); \ + } while (0) + +#define ioc_table_lock(table) \ + do { \ + gf_msg_trace(table->xl->name, 0, "locked table(%p)", table); \ + pthread_mutex_lock(&table->table_lock); \ + } while (0) + +#define ioc_table_unlock(table) \ + do { \ + gf_msg_trace(table->xl->name, 0, "unlocked table(%p)", table); \ + pthread_mutex_unlock(&table->table_lock); \ + } while (0) + +#define ioc_local_lock(local) \ + do { \ + gf_msg_trace(local->inode->table->xl->name, 0, "locked local(%p)", \ + local); \ + pthread_mutex_lock(&local->local_lock); \ + } while (0) + +#define ioc_local_unlock(local) \ + do { \ + gf_msg_trace(local->inode->table->xl->name, 0, "unlocked local(%p)", \ + local); \ + pthread_mutex_unlock(&local->local_lock); \ + } while (0) + +#define ioc_page_lock(page) \ + do { \ + gf_msg_trace(page->inode->table->xl->name, 0, "locked page(%p)", \ + page); \ + pthread_mutex_lock(&page->page_lock); \ + } while (0) + +#define ioc_page_unlock(page) \ + do { \ + gf_msg_trace(page->inode->table->xl->name, 0, "unlocked page(%p)", \ + page); \ + pthread_mutex_unlock(&page->page_lock); \ + } while (0) ioc_inode_t * -ioc_inode_search (ioc_table_t *table, inode_t *inode); +ioc_inode_search(ioc_table_t *table, inode_t *inode); void -ioc_inode_destroy (ioc_inode_t *ioc_inode); +ioc_inode_destroy(ioc_inode_t *ioc_inode); + +int32_t +ioc_inode_update(xlator_t *this, inode_t *inode, char *path, + struct iatt *iabuf); ioc_inode_t * -ioc_inode_update (ioc_table_t *table, inode_t *inode, uint32_t weight); +ioc_inode_create(ioc_table_t *table, inode_t *inode, uint32_t weight); int64_t -__ioc_page_destroy (ioc_page_t *page); +__ioc_page_destroy(ioc_page_t *page); int64_t 
-__ioc_inode_flush (ioc_inode_t *ioc_inode); +__ioc_inode_flush(ioc_inode_t *ioc_inode); void -ioc_inode_flush (ioc_inode_t *ioc_inode); +ioc_inode_flush(ioc_inode_t *ioc_inode); void -ioc_inode_wakeup (call_frame_t *frame, ioc_inode_t *ioc_inode, - struct iatt *stbuf); +ioc_inode_wakeup(call_frame_t *frame, ioc_inode_t *ioc_inode, + struct iatt *stbuf); int8_t -ioc_cache_still_valid (ioc_inode_t *ioc_inode, struct iatt *stbuf); +ioc_cache_still_valid(ioc_inode_t *ioc_inode, struct iatt *stbuf); int32_t -ioc_prune (ioc_table_t *table); +ioc_prune(ioc_table_t *table); int32_t -ioc_need_prune (ioc_table_t *table); +ioc_need_prune(ioc_table_t *table); #endif /* __IO_CACHE_H */ diff --git a/xlators/performance/io-cache/src/ioc-inode.c b/xlators/performance/io-cache/src/ioc-inode.c index 86a54bb14ca..97767d85285 100644 --- a/xlators/performance/io-cache/src/ioc-inode.c +++ b/xlators/performance/io-cache/src/ioc-inode.c @@ -8,11 +8,6 @@ cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include "io-cache.h" #include "ioc-mem-types.h" @@ -24,145 +19,140 @@ extern int ioc_log2_page_size; * */ void * -str_to_ptr (char *string) +str_to_ptr(char *string) { - void *ptr = NULL; + void *ptr = NULL; - GF_VALIDATE_OR_GOTO ("io-cache", string, out); + GF_VALIDATE_OR_GOTO("io-cache", string, out); - ptr = (void *)strtoul (string, NULL, 16); + ptr = (void *)strtoul(string, NULL, 16); out: - return ptr; + return ptr; } - /* * ptr_to_str - convert a pointer to string * @ptr: pointer * */ char * -ptr_to_str (void *ptr) +ptr_to_str(void *ptr) { - int ret = 0; - char *str = NULL; + int ret = 0; + char *str = NULL; - GF_VALIDATE_OR_GOTO ("io-cache", ptr, out); + GF_VALIDATE_OR_GOTO("io-cache", ptr, out); - ret = gf_asprintf (&str, "%p", ptr); - if (-1 == ret) { - gf_log ("io-cache", GF_LOG_WARNING, - "asprintf failed while converting ptr to str"); - str = NULL; - goto out; - } + ret = gf_asprintf(&str, "%p", ptr); + if (-1 == ret) { + gf_smsg("io-cache", GF_LOG_WARNING, 0, + IO_CACHE_MSG_STR_COVERSION_FAILED, NULL); + str = NULL; + goto out; + } out: - return str; + return str; } - void -ioc_inode_wakeup (call_frame_t *frame, ioc_inode_t *ioc_inode, - struct iatt *stbuf) +ioc_inode_wakeup(call_frame_t *frame, ioc_inode_t *ioc_inode, + struct iatt *stbuf) { - ioc_waitq_t *waiter = NULL, *waited = NULL; - ioc_waitq_t *page_waitq = NULL; - int8_t cache_still_valid = 1; - ioc_local_t *local = NULL; - int8_t need_fault = 0; - ioc_page_t *waiter_page = NULL; - - GF_VALIDATE_OR_GOTO ("io-cache", frame, out); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, out); - - if (ioc_inode == NULL) { - local->op_ret = -1; - local->op_errno = EINVAL; - gf_log (frame->this->name, GF_LOG_WARNING, "ioc_inode is NULL"); - goto out; - } - - ioc_inode_lock (ioc_inode); - { - waiter = ioc_inode->waitq; - ioc_inode->waitq = NULL; - } - ioc_inode_unlock (ioc_inode); - - if (stbuf) - cache_still_valid = ioc_cache_still_valid (ioc_inode, stbuf); - else - cache_still_valid = 0; - + ioc_waitq_t *waiter = NULL, *waited = NULL; + ioc_waitq_t *page_waitq = NULL; + int8_t cache_still_valid = 1; + ioc_local_t *local = NULL; + int8_t need_fault = 0; + ioc_page_t *waiter_page = NULL; + + GF_VALIDATE_OR_GOTO("io-cache", frame, out); + + local = frame->local; + GF_VALIDATE_OR_GOTO(frame->this->name, local, out); + + if (ioc_inode == NULL) { + local->op_ret = -1; + local->op_errno = EINVAL; + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, 
IO_CACHE_MSG_INODE_NULL, + NULL); + goto out; + } + + if (stbuf) + cache_still_valid = ioc_cache_still_valid(ioc_inode, stbuf); + else + cache_still_valid = 0; + + ioc_inode_lock(ioc_inode); + { + waiter = ioc_inode->waitq; if (!waiter) { - gf_log (frame->this->name, GF_LOG_WARNING, - "cache validate called without any " - "page waiting to be validated"); + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + IO_CACHE_MSG_PAGE_WAIT_VALIDATE, NULL); + + ioc_inode_unlock(ioc_inode); + goto out; } while (waiter) { - waiter_page = waiter->data; - page_waitq = NULL; - - if (waiter_page) { - if (cache_still_valid) { - /* cache valid, wake up page */ - ioc_inode_lock (ioc_inode); - { - page_waitq = - __ioc_page_wakeup (waiter_page, - waiter_page->op_errno); - } - ioc_inode_unlock (ioc_inode); - if (page_waitq) - ioc_waitq_return (page_waitq); - } else { - /* cache invalid, generate page fault and set - * page->ready = 0, to avoid double faults - */ - ioc_inode_lock (ioc_inode); - { - if (waiter_page->ready) { - waiter_page->ready = 0; - need_fault = 1; - } else { - gf_log (frame->this->name, - GF_LOG_TRACE, - "validate frame(%p) is " - "waiting for in-transit" - " page = %p", frame, - waiter_page); - } - } - ioc_inode_unlock (ioc_inode); - - if (need_fault) { - need_fault = 0; - ioc_page_fault (ioc_inode, frame, - local->fd, - waiter_page->offset); - } - } + waiter_page = waiter->data; + ioc_inode->waitq = waiter->next; + page_waitq = NULL; + + if (waiter_page) { + if (cache_still_valid) { + /* cache valid, wake up page */ + page_waitq = __ioc_page_wakeup(waiter_page, + waiter_page->op_errno); + if (page_waitq) { + ioc_inode_unlock(ioc_inode); + ioc_waitq_return(page_waitq); + ioc_inode_lock(ioc_inode); + } + } else { + /* cache invalid, generate page fault and set + * page->ready = 0, to avoid double faults + */ + if (waiter_page->ready) { + waiter_page->ready = 0; + need_fault = 1; + } else { + gf_msg_trace(frame->this->name, 0, + "validate " + "frame(%p) is " + "waiting for " + "in-transit" + " page = %p", + frame, waiter_page); + } + + if (need_fault) { + need_fault = 0; + ioc_inode_unlock(ioc_inode); + ioc_page_fault(ioc_inode, frame, local->fd, + waiter_page->offset); + ioc_inode_lock(ioc_inode); + } } + } - waited = waiter; - waiter = waiter->next; + waited = waiter; + waiter = ioc_inode->waitq; - waited->data = NULL; - GF_FREE (waited); + waited->data = NULL; + GF_FREE(waited); } + } + ioc_inode_unlock(ioc_inode); out: - return; + return; } - /* - * ioc_inode_update - create a new ioc_inode_t structure and add it to + * ioc_inode_create - create a new ioc_inode_t structure and add it to * the table table. 
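The reworked wakeup path above unlinks one waiter at a time under the inode lock and drops the lock around the calls that can re-enter the translator (ioc_waitq_return, ioc_page_fault). A minimal standalone sketch of that unlock-call-relock shape, with hypothetical names (waiter, notify_waiter, drain):

/* Standalone sketch: pop one waiter while holding the lock, then drop the
 * lock for the call that may block or re-enter, and re-take it before the
 * next pop. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct waiter {
    int id;
    struct waiter *next;
};

static void notify_waiter(struct waiter *w)   /* may block or re-enter */
{
    printf("woke waiter %d\n", w->id);
}

static void drain(pthread_mutex_t *lock, struct waiter **queue)
{
    struct waiter *w;

    pthread_mutex_lock(lock);
    while ((w = *queue) != NULL) {
        *queue = w->next;                 /* unlink while still locked */
        pthread_mutex_unlock(lock);       /* drop the lock for the call */
        notify_waiter(w);
        free(w);
        pthread_mutex_lock(lock);         /* re-take before the next pop */
    }
    pthread_mutex_unlock(lock);
}

int main(void)
{
    pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    struct waiter *q = calloc(1, sizeof(*q));

    if (q != NULL)
        drain(&lock, &q);
    return 0;
}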
fill in the fields which are derived * from inode_t corresponding to the file * @@ -172,40 +162,37 @@ out: * not for external reference */ ioc_inode_t * -ioc_inode_update (ioc_table_t *table, inode_t *inode, uint32_t weight) +ioc_inode_create(ioc_table_t *table, inode_t *inode, uint32_t weight) { - ioc_inode_t *ioc_inode = NULL; + ioc_inode_t *ioc_inode = NULL; - GF_VALIDATE_OR_GOTO ("io-cache", table, out); + GF_VALIDATE_OR_GOTO("io-cache", table, out); - ioc_inode = GF_CALLOC (1, sizeof (ioc_inode_t), gf_ioc_mt_ioc_inode_t); - if (ioc_inode == NULL) { - goto out; - } + ioc_inode = GF_CALLOC(1, sizeof(ioc_inode_t), gf_ioc_mt_ioc_inode_t); + if (ioc_inode == NULL) { + goto out; + } - ioc_inode->inode = inode; - ioc_inode->table = table; - INIT_LIST_HEAD (&ioc_inode->cache.page_lru); - pthread_mutex_init (&ioc_inode->inode_lock, NULL); - ioc_inode->weight = weight; - - ioc_table_lock (table); - { - table->inode_count++; - list_add (&ioc_inode->inode_list, &table->inodes); - list_add_tail (&ioc_inode->inode_lru, - &table->inode_lru[weight]); - } - ioc_table_unlock (table); + ioc_inode->inode = inode; + ioc_inode->table = table; + INIT_LIST_HEAD(&ioc_inode->cache.page_lru); + pthread_mutex_init(&ioc_inode->inode_lock, NULL); + ioc_inode->weight = weight; + + ioc_table_lock(table); + { + table->inode_count++; + list_add(&ioc_inode->inode_list, &table->inodes); + list_add_tail(&ioc_inode->inode_lru, &table->inode_lru[weight]); + } + ioc_table_unlock(table); - gf_log (table->xl->name, GF_LOG_TRACE, - "adding to inode_lru[%d]", weight); + gf_msg_trace(table->xl->name, 0, "adding to inode_lru[%d]", weight); out: - return ioc_inode; + return ioc_inode; } - /* * ioc_inode_destroy - destroy an ioc_inode_t object. * @@ -214,27 +201,27 @@ out: * to be called only from ioc_forget. 
*/ void -ioc_inode_destroy (ioc_inode_t *ioc_inode) +ioc_inode_destroy(ioc_inode_t *ioc_inode) { - ioc_table_t *table = NULL; + ioc_table_t *table = NULL; - GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); + GF_VALIDATE_OR_GOTO("io-cache", ioc_inode, out); - table = ioc_inode->table; + table = ioc_inode->table; - ioc_table_lock (table); - { - table->inode_count--; - list_del (&ioc_inode->inode_list); - list_del (&ioc_inode->inode_lru); - } - ioc_table_unlock (table); + ioc_table_lock(table); + { + table->inode_count--; + list_del(&ioc_inode->inode_list); + list_del(&ioc_inode->inode_lru); + } + ioc_table_unlock(table); - ioc_inode_flush (ioc_inode); - rbthash_table_destroy (ioc_inode->cache.page_table); + ioc_inode_flush(ioc_inode); + rbthash_table_destroy(ioc_inode->cache.page_table); - pthread_mutex_destroy (&ioc_inode->inode_lock); - GF_FREE (ioc_inode); + pthread_mutex_destroy(&ioc_inode->inode_lock); + GF_FREE(ioc_inode); out: - return; + return; } diff --git a/xlators/performance/io-cache/src/ioc-mem-types.h b/xlators/performance/io-cache/src/ioc-mem-types.h index 9b68f9fce5f..20c9a12021e 100644 --- a/xlators/performance/io-cache/src/ioc-mem-types.h +++ b/xlators/performance/io-cache/src/ioc-mem-types.h @@ -11,19 +11,19 @@ #ifndef __IOC_MT_H__ #define __IOC_MT_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_ioc_mem_types_ { - gf_ioc_mt_iovec = gf_common_mt_end + 1, - gf_ioc_mt_ioc_table_t, - gf_ioc_mt_char, - gf_ioc_mt_ioc_waitq_t, - gf_ioc_mt_ioc_priority, - gf_ioc_mt_list_head, - gf_ioc_mt_call_pool_t, - gf_ioc_mt_ioc_inode_t, - gf_ioc_mt_ioc_fill_t, - gf_ioc_mt_ioc_newpage_t, - gf_ioc_mt_end + gf_ioc_mt_iovec = gf_common_mt_end + 1, + gf_ioc_mt_ioc_table_t, + gf_ioc_mt_char, + gf_ioc_mt_ioc_waitq_t, + gf_ioc_mt_ioc_priority, + gf_ioc_mt_list_head, + gf_ioc_mt_call_pool_t, + gf_ioc_mt_ioc_inode_t, + gf_ioc_mt_ioc_fill_t, + gf_ioc_mt_ioc_newpage_t, + gf_ioc_mt_end }; #endif diff --git a/xlators/performance/io-cache/src/page.c b/xlators/performance/io-cache/src/page.c index b2e20ba659f..84b1ae6cb20 100644 --- a/xlators/performance/io-cache/src/page.c +++ b/xlators/performance/io-cache/src/page.c @@ -8,81 +8,73 @@ cases as published by the Free Software Foundation. 
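Every allocation in the translator is tagged with one of the gf_ioc_mt_* types listed above, and mem_acct_init registers gf_ioc_mt_end + 1 accounting slots for them. A standalone sketch of that per-type accounting idea, with tracked_calloc as a hypothetical stand-in for GF_CALLOC:

/* Standalone sketch: one usage counter per entry of a module-local enum,
 * which is what sizing the accounting table with gf_ioc_mt_end + 1 gives. */
#include <stdio.h>
#include <stdlib.h>

enum mem_type {
    MT_TABLE = 0,
    MT_INODE,
    MT_PAGE,
    MT_END            /* number of slots to register */
};

static size_t mem_used[MT_END];

static void *tracked_calloc(size_t nmemb, size_t size, enum mem_type type)
{
    void *ptr = calloc(nmemb, size);

    if (ptr != NULL)
        mem_used[type] += nmemb * size;   /* account against the type */
    return ptr;
}

int main(void)
{
    void *inode = tracked_calloc(1, 128, MT_INODE);

    printf("inode-type bytes: %zu\n", mem_used[MT_INODE]);
    free(inode);
    return 0;
}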
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> #include "io-cache.h" #include "ioc-mem-types.h" #include <assert.h> #include <sys/time.h> - +#include "io-cache-messages.h" char -ioc_empty (struct ioc_cache *cache) +ioc_empty(struct ioc_cache *cache) { - char is_empty = -1; + char is_empty = -1; - GF_VALIDATE_OR_GOTO ("io-cache", cache, out); + GF_VALIDATE_OR_GOTO("io-cache", cache, out); - is_empty = list_empty (&cache->page_lru); + is_empty = list_empty(&cache->page_lru); out: - return is_empty; + return is_empty; } - ioc_page_t * -__ioc_page_get (ioc_inode_t *ioc_inode, off_t offset) +__ioc_page_get(ioc_inode_t *ioc_inode, off_t offset) { - ioc_page_t *page = NULL; - ioc_table_t *table = NULL; - off_t rounded_offset = 0; + ioc_page_t *page = NULL; + ioc_table_t *table = NULL; + off_t rounded_offset = 0; - GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); + GF_VALIDATE_OR_GOTO("io-cache", ioc_inode, out); - table = ioc_inode->table; - GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); + table = ioc_inode->table; + GF_VALIDATE_OR_GOTO("io-cache", ioc_inode, out); - rounded_offset = floor (offset, table->page_size); + rounded_offset = gf_floor(offset, table->page_size); - page = rbthash_get (ioc_inode->cache.page_table, &rounded_offset, - sizeof (rounded_offset)); + page = rbthash_get(ioc_inode->cache.page_table, &rounded_offset, + sizeof(rounded_offset)); - if (page != NULL) { - /* push the page to the end of the lru list */ - list_move_tail (&page->page_lru, &ioc_inode->cache.page_lru); - } + if (page != NULL) { + /* push the page to the end of the lru list */ + list_move_tail(&page->page_lru, &ioc_inode->cache.page_lru); + } out: - return page; + return page; } - ioc_page_t * -ioc_page_get (ioc_inode_t *ioc_inode, off_t offset) +ioc_page_get(ioc_inode_t *ioc_inode, off_t offset) { - ioc_page_t *page = NULL; + ioc_page_t *page = NULL; - if (ioc_inode == NULL) { - goto out; - } + if (ioc_inode == NULL) { + goto out; + } - ioc_inode_lock (ioc_inode); - { - page = __ioc_page_get (ioc_inode, offset); - } - ioc_inode_unlock (ioc_inode); + ioc_inode_lock(ioc_inode); + { + page = __ioc_page_get(ioc_inode, offset); + } + ioc_inode_unlock(ioc_inode); out: - return page; + return page; } - /* * __ioc_page_destroy - * @@ -90,103 +82,108 @@ out: * */ int64_t -__ioc_page_destroy (ioc_page_t *page) +__ioc_page_destroy(ioc_page_t *page) { - int64_t page_size = 0; - - GF_VALIDATE_OR_GOTO ("io-cache", page, out); - - if (page->iobref) - page_size = iobref_size (page->iobref); - - if (page->waitq) { - /* frames waiting on this page, do not destroy this page */ - page_size = -1; - page->stale = 1; - } else { - rbthash_remove (page->inode->cache.page_table, &page->offset, - sizeof (page->offset)); - list_del (&page->page_lru); - - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, - "destroying page = %p, offset = %"PRId64" " - "&& inode = %p", - page, page->offset, page->inode); - - if (page->vector){ - iobref_unref (page->iobref); - GF_FREE (page->vector); - page->vector = NULL; - } - - page->inode = NULL; + int64_t page_size = 0; + + GF_VALIDATE_OR_GOTO("io-cache", page, out); + + if (page->iobref) + page_size = iobref_size(page->iobref); + + if (page->waitq) { + /* frames waiting on this page, do not destroy this page */ + page_size = -1; + page->stale = 1; + } else { + 
rbthash_remove(page->inode->cache.page_table, &page->offset, + sizeof(page->offset)); + list_del(&page->page_lru); + + gf_msg_trace(page->inode->table->xl->name, 0, + "destroying page = %p, offset = %" PRId64 + " " + "&& inode = %p", + page, page->offset, page->inode); + + if (page->vector) { + iobref_unref(page->iobref); + GF_FREE(page->vector); + page->vector = NULL; } - if (page_size != -1) { - pthread_mutex_destroy (&page->page_lock); - GF_FREE (page); - } + page->inode = NULL; + } + + if (page_size != -1) { + pthread_mutex_destroy(&page->page_lock); + GF_FREE(page); + } out: - return page_size; + return page_size; } - int64_t -ioc_page_destroy (ioc_page_t *page) +ioc_page_destroy(ioc_page_t *page) { - int64_t ret = 0; + int64_t ret = 0; + struct ioc_inode *inode = NULL; - if (page == NULL) { - goto out; - } + if (page == NULL) { + goto out; + } - ioc_inode_lock (page->inode); - { - ret = __ioc_page_destroy (page); - } - ioc_inode_unlock (page->inode); + ioc_inode_lock(page->inode); + { + inode = page->inode; + ret = __ioc_page_destroy(page); + } + ioc_inode_unlock(inode); out: - return ret; + return ret; } int32_t -__ioc_inode_prune (ioc_inode_t *curr, uint64_t *size_pruned, - uint64_t size_to_prune, uint32_t index) +__ioc_inode_prune(ioc_inode_t *curr, uint64_t *size_pruned, + uint64_t size_to_prune, uint32_t index) { - ioc_page_t *page = NULL, *next = NULL; - int32_t ret = 0; - ioc_table_t *table = NULL; + ioc_page_t *page = NULL, *next = NULL; + int32_t ret = 0; + ioc_table_t *table = NULL; - if (curr == NULL) { - goto out; - } + if (curr == NULL) { + goto out; + } - table = curr->table; + table = curr->table; - list_for_each_entry_safe (page, next, &curr->cache.page_lru, page_lru) { - *size_pruned += page->size; - ret = __ioc_page_destroy (page); + list_for_each_entry_safe(page, next, &curr->cache.page_lru, page_lru) + { + *size_pruned += page->size; + ret = __ioc_page_destroy(page); - if (ret != -1) - table->cache_used -= ret; + if (ret != -1) + table->cache_used -= ret; - gf_log (table->xl->name, GF_LOG_TRACE, - "index = %d && table->cache_used = %"PRIu64" && table->" - "cache_size = %"PRIu64, index, table->cache_used, - table->cache_size); + gf_msg_trace(table->xl->name, 0, + "index = %d && " + "table->cache_used = %" PRIu64 + " && table->" + "cache_size = %" PRIu64, + index, table->cache_used, table->cache_size); - if ((*size_pruned) >= size_to_prune) - break; - } + if ((*size_pruned) >= size_to_prune) + break; + } - if (ioc_empty (&curr->cache)) { - list_del_init (&curr->inode_lru); - } + if (ioc_empty(&curr->cache)) { + list_del_init(&curr->inode_lru); + } out: - return 0; + return 0; } /* * ioc_prune - prune the cache. 
we have a limit to the number of pages we @@ -196,46 +193,44 @@ out: * */ int32_t -ioc_prune (ioc_table_t *table) +ioc_prune(ioc_table_t *table) { - ioc_inode_t *curr = NULL, *next_ioc_inode = NULL; - int32_t index = 0; - uint64_t size_to_prune = 0; - uint64_t size_pruned = 0; + ioc_inode_t *curr = NULL, *next_ioc_inode = NULL; + int32_t index = 0; + uint64_t size_to_prune = 0; + uint64_t size_pruned = 0; + + GF_VALIDATE_OR_GOTO("io-cache", table, out); + + ioc_table_lock(table); + { + size_to_prune = table->cache_used - table->cache_size; + /* take out the least recently used inode */ + for (index = 0; index < table->max_pri; index++) { + list_for_each_entry_safe(curr, next_ioc_inode, + &table->inode_lru[index], inode_lru) + { + /* prune page-by-page for this inode, till + * we reach the equilibrium */ + ioc_inode_lock(curr); + { + __ioc_inode_prune(curr, &size_pruned, size_to_prune, index); + } + ioc_inode_unlock(curr); - GF_VALIDATE_OR_GOTO ("io-cache", table, out); + if (size_pruned >= size_to_prune) + break; + } /* list_for_each_entry_safe (curr...) */ - ioc_table_lock (table); - { - size_to_prune = table->cache_used - table->cache_size; - /* take out the least recently used inode */ - for (index=0; index < table->max_pri; index++) { - list_for_each_entry_safe (curr, next_ioc_inode, - &table->inode_lru[index], - inode_lru) { - /* prune page-by-page for this inode, till - * we reach the equilibrium */ - ioc_inode_lock (curr); - { - __ioc_inode_prune (curr, &size_pruned, - size_to_prune, - index); - } - ioc_inode_unlock (curr); - - if (size_pruned >= size_to_prune) - break; - } /* list_for_each_entry_safe (curr...) */ - - if (size_pruned >= size_to_prune) - break; - } /* for(index=0;...) */ - - } /* ioc_inode_table locked region end */ - ioc_table_unlock (table); + if (size_pruned >= size_to_prune) + break; + } /* for(index=0;...) 
*/ + + } /* ioc_inode_table locked region end */ + ioc_table_unlock(table); out: - return 0; + return 0; } /* @@ -246,47 +241,46 @@ out: * */ ioc_page_t * -__ioc_page_create (ioc_inode_t *ioc_inode, off_t offset) +__ioc_page_create(ioc_inode_t *ioc_inode, off_t offset) { - ioc_table_t *table = NULL; - ioc_page_t *page = NULL; - off_t rounded_offset = 0; - ioc_page_t *newpage = NULL; + ioc_table_t *table = NULL; + ioc_page_t *page = NULL; + off_t rounded_offset = 0; + ioc_page_t *newpage = NULL; - GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); + GF_VALIDATE_OR_GOTO("io-cache", ioc_inode, out); - table = ioc_inode->table; - GF_VALIDATE_OR_GOTO ("io-cache", table, out); + table = ioc_inode->table; + GF_VALIDATE_OR_GOTO("io-cache", table, out); - rounded_offset = floor (offset, table->page_size); + rounded_offset = gf_floor(offset, table->page_size); - newpage = GF_CALLOC (1, sizeof (*newpage), gf_ioc_mt_ioc_newpage_t); - if (newpage == NULL) { - goto out; - } + newpage = GF_CALLOC(1, sizeof(*newpage), gf_ioc_mt_ioc_newpage_t); + if (newpage == NULL) { + goto out; + } - if (!ioc_inode) { - GF_FREE (newpage); - newpage = NULL; - goto out; - } + if (!ioc_inode) { + GF_FREE(newpage); + newpage = NULL; + goto out; + } - newpage->offset = rounded_offset; - newpage->inode = ioc_inode; - pthread_mutex_init (&newpage->page_lock, NULL); + newpage->offset = rounded_offset; + newpage->inode = ioc_inode; + pthread_mutex_init(&newpage->page_lock, NULL); - rbthash_insert (ioc_inode->cache.page_table, newpage, &rounded_offset, - sizeof (rounded_offset)); + rbthash_insert(ioc_inode->cache.page_table, newpage, &rounded_offset, + sizeof(rounded_offset)); - list_add_tail (&newpage->page_lru, &ioc_inode->cache.page_lru); + list_add_tail(&newpage->page_lru, &ioc_inode->cache.page_lru); - page = newpage; + page = newpage; - gf_log ("io-cache", GF_LOG_TRACE, - "returning new page %p", page); + gf_msg_trace("io-cache", 0, "returning new page %p", page); out: - return page; + return page; } /* @@ -299,54 +293,55 @@ out: * */ void -__ioc_wait_on_page (ioc_page_t *page, call_frame_t *frame, off_t offset, - size_t size) +__ioc_wait_on_page(ioc_page_t *page, call_frame_t *frame, off_t offset, + size_t size) { - ioc_waitq_t *waitq = NULL; - ioc_local_t *local = NULL; - - GF_VALIDATE_OR_GOTO ("io-cache", frame, out); - local = frame->local; - - GF_VALIDATE_OR_GOTO (frame->this->name, local, out); - - if (page == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - gf_log (frame->this->name, GF_LOG_WARNING, - "asked to wait on a NULL page"); - } - - waitq = GF_CALLOC (1, sizeof (*waitq), gf_ioc_mt_ioc_waitq_t); - if (waitq == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto out; - } - - gf_log (frame->this->name, GF_LOG_TRACE, - "frame(%p) waiting on page = %p, offset=%"PRId64", " - "size=%"GF_PRI_SIZET"", - frame, page, offset, size); - - waitq->data = frame; - waitq->next = page->waitq; - waitq->pending_offset = offset; - waitq->pending_size = size; - page->waitq = waitq; - /* one frame can wait only once on a given page, - * local->wait_count is number of pages a frame is waiting on */ - ioc_local_lock (local); - { - local->wait_count++; - } - ioc_local_unlock (local); + ioc_waitq_t *waitq = NULL; + ioc_local_t *local = NULL; + + GF_VALIDATE_OR_GOTO("io-cache", frame, out); + local = frame->local; + + GF_VALIDATE_OR_GOTO(frame->this->name, local, out); + + if (page == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + 
IO_CACHE_MSG_NULL_PAGE_WAIT, NULL); + goto out; + } + + waitq = GF_CALLOC(1, sizeof(*waitq), gf_ioc_mt_ioc_waitq_t); + if (waitq == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + gf_msg_trace(frame->this->name, 0, + "frame(%p) waiting on page = %p, offset=%" PRId64 + ", " + "size=%" GF_PRI_SIZET "", + frame, page, offset, size); + + waitq->data = frame; + waitq->next = page->waitq; + waitq->pending_offset = offset; + waitq->pending_size = size; + page->waitq = waitq; + /* one frame can wait only once on a given page, + * local->wait_count is number of pages a frame is waiting on */ + ioc_local_lock(local); + { + local->wait_count++; + } + ioc_local_unlock(local); out: - return; + return; } - /* * ioc_cache_still_valid - see if cached pages ioc_inode are still valid * against given stbuf @@ -357,11 +352,11 @@ out: * assumes ioc_inode is locked */ int8_t -ioc_cache_still_valid (ioc_inode_t *ioc_inode, struct iatt *stbuf) +ioc_cache_still_valid(ioc_inode_t *ioc_inode, struct iatt *stbuf) { - int8_t cache_still_valid = 1; + int8_t cache_still_valid = 1; - GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); + GF_VALIDATE_OR_GOTO("io-cache", ioc_inode, out); #if 0 if (!stbuf || (stbuf->ia_mtime != ioc_inode->cache.mtime) || @@ -369,9 +364,9 @@ ioc_cache_still_valid (ioc_inode_t *ioc_inode, struct iatt *stbuf) cache_still_valid = 0; #else - if (!stbuf || (stbuf->ia_mtime != ioc_inode->cache.mtime) - || (stbuf->ia_mtime_nsec != ioc_inode->cache.mtime_nsec)) - cache_still_valid = 0; + if (!stbuf || (stbuf->ia_mtime != ioc_inode->cache.mtime) || + (stbuf->ia_mtime_nsec != ioc_inode->cache.mtime_nsec)) + cache_still_valid = 0; #endif @@ -384,182 +379,173 @@ ioc_cache_still_valid (ioc_inode_t *ioc_inode, struct iatt *stbuf) #endif out: - return cache_still_valid; + return cache_still_valid; } - void -ioc_waitq_return (ioc_waitq_t *waitq) +ioc_waitq_return(ioc_waitq_t *waitq) { - ioc_waitq_t *trav = NULL; - ioc_waitq_t *next = NULL; - call_frame_t *frame = NULL; + ioc_waitq_t *trav = NULL; + ioc_waitq_t *next = NULL; + call_frame_t *frame = NULL; - for (trav = waitq; trav; trav = next) { - next = trav->next; + for (trav = waitq; trav; trav = next) { + next = trav->next; - frame = trav->data; - ioc_frame_return (frame); - GF_FREE (trav); - } + frame = trav->data; + ioc_frame_return(frame); + GF_FREE(trav); + } } - int -ioc_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref, - dict_t *xdata) +ioc_fault_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { - ioc_local_t *local = NULL; - off_t offset = 0; - ioc_inode_t *ioc_inode = NULL; - ioc_table_t *table = NULL; - ioc_page_t *page = NULL; - int32_t destroy_size = 0; - size_t page_size = 0; - ioc_waitq_t *waitq = NULL; - size_t iobref_page_size = 0; - char zero_filled = 0; - - GF_ASSERT (frame); - - local = frame->local; - GF_ASSERT (local); - - offset = local->pending_offset; - ioc_inode = local->inode; - GF_ASSERT (ioc_inode); + ioc_local_t *local = NULL; + off_t offset = 0; + ioc_inode_t *ioc_inode = NULL; + ioc_table_t *table = NULL; + ioc_page_t *page = NULL; + int32_t destroy_size = 0; + size_t page_size = 0; + ioc_waitq_t *waitq = NULL; + size_t iobref_page_size = 0; + char zero_filled = 0; + + GF_ASSERT(frame); + + local = frame->local; + GF_ASSERT(local); + 
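/* Editor's note, not part of the patch: the wait-queue code above parks a
 * reader frame on every in-transit page it needs and unwinds it only once the
 * last page is served. A self-contained sketch of that wait_count protocol,
 * using plain pthreads and simplified names rather than io-cache's types: */
#include <pthread.h>

typedef struct demo_local {
    pthread_mutex_t lock;
    int wait_count;                 /* pages this request still waits on */
} demo_local_t;

/* mirrors __ioc_wait_on_page(): one increment per page the frame waits on */
static void
demo_wait_on_page(demo_local_t *local)
{
    pthread_mutex_lock(&local->lock);
    local->wait_count++;
    pthread_mutex_unlock(&local->lock);
}

/* mirrors ioc_frame_return(): returns 1 when the last outstanding page has
 * been filled or errored out, i.e. when the caller may unwind the reply */
static int
demo_frame_return(demo_local_t *local)
{
    int remaining;

    pthread_mutex_lock(&local->lock);
    remaining = --local->wait_count;
    pthread_mutex_unlock(&local->lock);

    return remaining == 0;
}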
+ offset = local->pending_offset; + ioc_inode = local->inode; + GF_ASSERT(ioc_inode); + + table = ioc_inode->table; + GF_ASSERT(table); + + zero_filled = ((op_ret >= 0) && (stbuf->ia_mtime == 0)); + + ioc_inode_lock(ioc_inode); + { + if (op_ret == -1 || + !(zero_filled || ioc_cache_still_valid(ioc_inode, stbuf))) { + gf_msg_trace(ioc_inode->table->xl->name, 0, + "cache for inode(%p) is invalid. flushing " + "all pages", + ioc_inode); + destroy_size = __ioc_inode_flush(ioc_inode); + } - table = ioc_inode->table; - GF_ASSERT (table); + if ((op_ret >= 0) && !zero_filled) { + ioc_inode->cache.mtime = stbuf->ia_mtime; + ioc_inode->cache.mtime_nsec = stbuf->ia_mtime_nsec; + } - zero_filled = ((op_ret >=0) && (stbuf->ia_mtime == 0)); + ioc_inode->cache.last_revalidate = gf_time(); - ioc_inode_lock (ioc_inode); - { - if (op_ret == -1 || !(zero_filled || - ioc_cache_still_valid(ioc_inode, - stbuf))) { - gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, - "cache for inode(%p) is invalid. flushing " - "all pages", ioc_inode); - destroy_size = __ioc_inode_flush (ioc_inode); + if (op_ret < 0) { + /* error, readv returned -1 */ + page = __ioc_page_get(ioc_inode, offset); + if (page) + waitq = __ioc_page_error(page, op_ret, op_errno); + } else { + gf_msg_trace(ioc_inode->table->xl->name, 0, "op_ret = %d", op_ret); + page = __ioc_page_get(ioc_inode, offset); + if (!page) { + /* page was flushed */ + /* some serious bug ? */ + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + IO_CACHE_MSG_WASTED_COPY, "offset=%" PRId64, offset, + "page-size=%" PRId64, table->page_size, "ioc_inode=%p", + ioc_inode, NULL); + } else { + if (page->vector) { + iobref_unref(page->iobref); + GF_FREE(page->vector); + page->vector = NULL; + page->iobref = NULL; } - if ((op_ret >= 0) && !zero_filled) { - ioc_inode->cache.mtime = stbuf->ia_mtime; - ioc_inode->cache.mtime_nsec = stbuf->ia_mtime_nsec; + /* keep a copy of the page for our cache */ + page->vector = iov_dup(vector, count); + if (page->vector == NULL) { + page = __ioc_page_get(ioc_inode, offset); + if (page != NULL) + waitq = __ioc_page_error(page, -1, ENOMEM); + goto unlock; } - gettimeofday (&ioc_inode->cache.tv, NULL); - - if (op_ret < 0) { - /* error, readv returned -1 */ - page = __ioc_page_get (ioc_inode, offset); - if (page) - waitq = __ioc_page_error (page, op_ret, - op_errno); + page->count = count; + if (iobref) { + page->iobref = iobref_ref(iobref); } else { - gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, - "op_ret = %d", op_ret); - page = __ioc_page_get (ioc_inode, offset); - if (!page) { - /* page was flushed */ - /* some serious bug ? */ - gf_log (frame->this->name, GF_LOG_WARNING, - "wasted copy: %"PRId64"[+%"PRId64"] " - "ioc_inode=%p", offset, - table->page_size, ioc_inode); - } else { - if (page->vector) { - iobref_unref (page->iobref); - GF_FREE (page->vector); - page->vector = NULL; - } - - /* keep a copy of the page for our cache */ - page->vector = iov_dup (vector, count); - if (page->vector == NULL) { - page = __ioc_page_get (ioc_inode, - offset); - if (page != NULL) - waitq = __ioc_page_error (page, - -1, - ENOMEM); - goto unlock; - } - - page->count = count; - if (iobref) { - page->iobref = iobref_ref (iobref); - } else { - /* TODO: we have got a response to - * our request and no data */ - gf_log (frame->this->name, - GF_LOG_CRITICAL, - "frame>root>rsp_refs is null"); - } /* if(frame->root->rsp_refs) */ - - /* page->size should indicate exactly how - * much the readv call to the child - * translator returned. 
earlier op_ret - * from child translator was used, which - * gave rise to a bug where reads from - * io-cached volume were resulting in 0 - * byte replies */ - page_size = iov_length(vector, count); - page->size = page_size; - page->op_errno = op_errno; - - iobref_page_size = iobref_size (page->iobref); - - if (page->waitq) { - /* wake up all the frames waiting on - * this page, including - * the frame which triggered fault */ - waitq = __ioc_page_wakeup (page, - op_errno); - } /* if(page->waitq) */ - } /* if(!page)...else */ - } /* if(op_ret < 0)...else */ - } /* ioc_inode locked region end */ + /* TODO: we have got a response to + * our request and no data */ + gf_smsg(frame->this->name, GF_LOG_CRITICAL, ENOMEM, + IO_CACHE_MSG_FRAME_NULL, NULL); + } /* if(frame->root->rsp_refs) */ + + /* page->size should indicate exactly how + * much the readv call to the child + * translator returned. earlier op_ret + * from child translator was used, which + * gave rise to a bug where reads from + * io-cached volume were resulting in 0 + * byte replies */ + page_size = iov_length(vector, count); + page->size = page_size; + page->op_errno = op_errno; + + iobref_page_size = iobref_size(page->iobref); + + if (page->waitq) { + /* wake up all the frames waiting on + * this page, including + * the frame which triggered fault */ + waitq = __ioc_page_wakeup(page, op_errno); + } /* if(page->waitq) */ + } /* if(!page)...else */ + } /* if(op_ret < 0)...else */ + } /* ioc_inode locked region end */ unlock: - ioc_inode_unlock (ioc_inode); + ioc_inode_unlock(ioc_inode); - ioc_waitq_return (waitq); + ioc_waitq_return(waitq); - if (iobref_page_size) { - ioc_table_lock (table); - { - table->cache_used += iobref_page_size; - } - ioc_table_unlock (table); + if (iobref_page_size) { + ioc_table_lock(table); + { + table->cache_used += iobref_page_size; } + ioc_table_unlock(table); + } - if (destroy_size) { - ioc_table_lock (table); - { - table->cache_used -= destroy_size; - } - ioc_table_unlock (table); + if (destroy_size) { + ioc_table_lock(table); + { + table->cache_used -= destroy_size; } + ioc_table_unlock(table); + } - if (ioc_need_prune (ioc_inode->table)) { - ioc_prune (ioc_inode->table); - } + if (ioc_need_prune(ioc_inode->table)) { + ioc_prune(ioc_inode->table); + } - gf_log (frame->this->name, GF_LOG_TRACE, "fault frame %p returned", - frame); - pthread_mutex_destroy (&local->local_lock); + gf_msg_trace(frame->this->name, 0, "fault frame %p returned", frame); + pthread_mutex_destroy(&local->local_lock); - fd_unref (local->fd); + fd_unref(local->fd); + if (local->xattr_req) + dict_unref(local->xattr_req); - STACK_DESTROY (frame->root); - return 0; + STACK_DESTROY(frame->root); + return 0; } - /* * ioc_page_fault - * @@ -570,219 +556,212 @@ unlock: * */ void -ioc_page_fault (ioc_inode_t *ioc_inode, call_frame_t *frame, fd_t *fd, - off_t offset) +ioc_page_fault(ioc_inode_t *ioc_inode, call_frame_t *frame, fd_t *fd, + off_t offset) { - ioc_table_t *table = NULL; - call_frame_t *fault_frame = NULL; - ioc_local_t *fault_local = NULL; - int32_t op_ret = -1, op_errno = -1; - ioc_waitq_t *waitq = NULL; - ioc_page_t *page = NULL; - - GF_ASSERT (ioc_inode); - if (frame == NULL) { - op_ret = -1; - op_errno = EINVAL; - gf_log ("io-cache", GF_LOG_WARNING, - "page fault on a NULL frame"); - goto err; - } - - table = ioc_inode->table; - fault_frame = copy_frame (frame); - if (fault_frame == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto err; - } - - fault_local = mem_get0 (THIS->local_pool); - if (fault_local == NULL) { - 
op_ret = -1; - op_errno = ENOMEM; - STACK_DESTROY (fault_frame->root); - goto err; - } - - /* NOTE: copy_frame() means, the frame the fop whose fd_ref we - * are using till now won't be valid till we get reply from server. - * we unref this fd, in fault_cbk */ - fault_local->fd = fd_ref (fd); - - fault_frame->local = fault_local; - pthread_mutex_init (&fault_local->local_lock, NULL); - - INIT_LIST_HEAD (&fault_local->fill_list); - fault_local->pending_offset = offset; - fault_local->pending_size = table->page_size; - fault_local->inode = ioc_inode; - - gf_log (frame->this->name, GF_LOG_TRACE, - "stack winding page fault for offset = %"PRId64" with " - "frame %p", offset, fault_frame); - - STACK_WIND (fault_frame, ioc_fault_cbk, FIRST_CHILD(fault_frame->this), - FIRST_CHILD(fault_frame->this)->fops->readv, fd, - table->page_size, offset, 0, NULL); - return; + ioc_table_t *table = NULL; + call_frame_t *fault_frame = NULL; + ioc_local_t *fault_local = NULL; + ioc_local_t *local = NULL; + int32_t op_ret = -1, op_errno = -1; + ioc_waitq_t *waitq = NULL; + ioc_page_t *page = NULL; + + GF_ASSERT(ioc_inode); + if (frame == NULL) { + op_ret = -1; + op_errno = EINVAL; + gf_smsg("io-cache", GF_LOG_WARNING, EINVAL, IO_CACHE_MSG_PAGE_FAULT, + NULL); + goto err; + } + + table = ioc_inode->table; + fault_frame = copy_frame(frame); + if (fault_frame == NULL) { + op_ret = -1; + op_errno = ENOMEM; + goto err; + } + + local = frame->local; + fault_local = mem_get0(THIS->local_pool); + if (fault_local == NULL) { + op_ret = -1; + op_errno = ENOMEM; + STACK_DESTROY(fault_frame->root); + goto err; + } + + /* NOTE: copy_frame() means, the frame the fop whose fd_ref we + * are using till now won't be valid till we get reply from server. + * we unref this fd, in fault_cbk */ + fault_local->fd = fd_ref(fd); + + fault_frame->local = fault_local; + pthread_mutex_init(&fault_local->local_lock, NULL); + + INIT_LIST_HEAD(&fault_local->fill_list); + fault_local->pending_offset = offset; + fault_local->pending_size = table->page_size; + fault_local->inode = ioc_inode; + + if (local && local->xattr_req) + fault_local->xattr_req = dict_ref(local->xattr_req); + + gf_msg_trace(frame->this->name, 0, + "stack winding page fault for offset = %" PRId64 + " with " + "frame %p", + offset, fault_frame); + + STACK_WIND(fault_frame, ioc_fault_cbk, FIRST_CHILD(fault_frame->this), + FIRST_CHILD(fault_frame->this)->fops->readv, fd, + table->page_size, offset, 0, fault_local->xattr_req); + return; err: - ioc_inode_lock (ioc_inode); - { - page = __ioc_page_get (ioc_inode, offset); - if (page != NULL) { - waitq = __ioc_page_error (page, op_ret, op_errno); - } + ioc_inode_lock(ioc_inode); + { + page = __ioc_page_get(ioc_inode, offset); + if (page != NULL) { + waitq = __ioc_page_error(page, op_ret, op_errno); } - ioc_inode_unlock (ioc_inode); + } + ioc_inode_unlock(ioc_inode); - if (waitq != NULL) { - ioc_waitq_return (waitq); - } + if (waitq != NULL) { + ioc_waitq_return(waitq); + } } - int32_t -__ioc_frame_fill (ioc_page_t *page, call_frame_t *frame, off_t offset, - size_t size, int32_t op_errno) +__ioc_frame_fill(ioc_page_t *page, call_frame_t *frame, off_t offset, + size_t size, int32_t op_errno) { - ioc_local_t *local = NULL; - ioc_fill_t *fill = NULL; - off_t src_offset = 0; - off_t dst_offset = 0; - ssize_t copy_size = 0; - ioc_inode_t *ioc_inode = NULL; - ioc_fill_t *new = NULL; - int8_t found = 0; - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("io-cache", frame, out); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, 
local, out); - - if (page == NULL) { - gf_log (frame->this->name, GF_LOG_WARNING, - "NULL page has been provided to serve read request"); - local->op_ret = -1; - local->op_errno = EINVAL; - goto out; + ioc_local_t *local = NULL; + ioc_fill_t *fill = NULL; + off_t src_offset = 0; + off_t dst_offset = 0; + ssize_t copy_size = 0; + ioc_inode_t *ioc_inode = NULL; + ioc_fill_t *new = NULL; + int8_t found = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("io-cache", frame, out); + + local = frame->local; + GF_VALIDATE_OR_GOTO(frame->this->name, local, out); + + if (page == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + IO_CACHE_MSG_SERVE_READ_REQUEST, NULL); + local->op_ret = -1; + local->op_errno = EINVAL; + goto out; + } + + ioc_inode = page->inode; + + gf_msg_trace(frame->this->name, 0, + "frame (%p) offset = %" PRId64 " && size = %" GF_PRI_SIZET + " " + "&& page->size = %" GF_PRI_SIZET " && wait_count = %d", + frame, offset, size, page->size, local->wait_count); + + /* immediately move this page to the end of the page_lru list */ + list_move_tail(&page->page_lru, &ioc_inode->cache.page_lru); + /* fill local->pending_size bytes from local->pending_offset */ + if (local->op_ret != -1) { + local->op_errno = op_errno; + + if (page->size == 0) { + goto done; } - ioc_inode = page->inode; - - gf_log (frame->this->name, GF_LOG_TRACE, - "frame (%p) offset = %"PRId64" && size = %"GF_PRI_SIZET" " - "&& page->size = %"GF_PRI_SIZET" && wait_count = %d", - frame, offset, size, page->size, local->wait_count); - - /* immediately move this page to the end of the page_lru list */ - list_move_tail (&page->page_lru, &ioc_inode->cache.page_lru); - /* fill local->pending_size bytes from local->pending_offset */ - if (local->op_ret != -1) { - local->op_errno = op_errno; - - if (page->size == 0) { - goto done; - } + if (offset > page->offset) + /* offset is offset in file, convert it to offset in + * page */ + src_offset = offset - page->offset; + /*FIXME: since offset is the offset within page is the + * else case valid? */ + else + /* local->pending_offset is in previous page. do not + * fill until we have filled all previous pages */ + dst_offset = page->offset - offset; + + /* we have to copy from offset to either end of this page + * or till the requested size */ + copy_size = min(page->size - src_offset, size - dst_offset); + + if (copy_size < 0) { + /* if page contains fewer bytes and the required offset + is beyond the page size in the page */ + copy_size = src_offset = 0; + } - if (offset > page->offset) - /* offset is offset in file, convert it to offset in - * page */ - src_offset = offset - page->offset; - /*FIXME: since offset is the offset within page is the - * else case valid? */ - else - /* local->pending_offset is in previous page. 
do not - * fill until we have filled all previous pages */ - dst_offset = page->offset - offset; - - /* we have to copy from offset to either end of this page - * or till the requested size */ - copy_size = min (page->size - src_offset, - size - dst_offset); - - if (copy_size < 0) { - /* if page contains fewer bytes and the required offset - is beyond the page size in the page */ - copy_size = src_offset = 0; - } + gf_msg_trace(page->inode->table->xl->name, 0, + "copy_size = %" GF_PRI_SIZET + " && src_offset = " + "%" PRId64 " && dst_offset = %" PRId64 "", + copy_size, src_offset, dst_offset); - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, - "copy_size = %"GF_PRI_SIZET" && src_offset = " - "%"PRId64" && dst_offset = %"PRId64"", - copy_size, src_offset, dst_offset); + { + new = GF_CALLOC(1, sizeof(*new), gf_ioc_mt_ioc_fill_t); + if (new == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + new->offset = page->offset; + new->size = copy_size; + new->iobref = iobref_ref(page->iobref); + new->count = iov_subset(page->vector, page->count, src_offset, + copy_size, &new->vector, 0); + if (new->count < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; + iobref_unref(new->iobref); + GF_FREE(new); + goto out; + } + + /* add the ioc_fill to fill_list for this frame */ + if (list_empty(&local->fill_list)) { + /* if list is empty, then this is the first + * time we are filling frame, add the + * ioc_fill_t to the end of list */ + list_add_tail(&new->list, &local->fill_list); + } else { + found = 0; + /* list is not empty, we need to look for + * where this offset fits in list */ + list_for_each_entry(fill, &local->fill_list, list) { - new = GF_CALLOC (1, sizeof (*new), - gf_ioc_mt_ioc_fill_t); - if (new == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto out; - } - - new->offset = page->offset; - new->size = copy_size; - new->iobref = iobref_ref (page->iobref); - new->count = iov_subset (page->vector, page->count, - src_offset, - src_offset + copy_size, - NULL); - - new->vector = GF_CALLOC (new->count, - sizeof (struct iovec), - gf_ioc_mt_iovec); - if (new->vector == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - - iobref_unref (new->iobref); - GF_FREE (new); - goto out; - } - - new->count = iov_subset (page->vector, page->count, - src_offset, - src_offset + copy_size, - new->vector); - - /* add the ioc_fill to fill_list for this frame */ - if (list_empty (&local->fill_list)) { - /* if list is empty, then this is the first - * time we are filling frame, add the - * ioc_fill_t to the end of list */ - list_add_tail (&new->list, &local->fill_list); - } else { - found = 0; - /* list is not empty, we need to look for - * where this offset fits in list */ - list_for_each_entry (fill, &local->fill_list, - list) { - if (fill->offset > new->offset) { - found = 1; - break; - } - } - - if (found) { - list_add_tail (&new->list, - &fill->list); - } else { - list_add_tail (&new->list, - &local->fill_list); - } - } + if (fill->offset > new->offset) { + found = 1; + break; + } } - local->op_ret += copy_size; + if (found) { + list_add_tail(&new->list, &fill->list); + } else { + list_add_tail(&new->list, &local->fill_list); + } + } } + local->op_ret += copy_size; + } + done: - ret = 0; + ret = 0; out: - return ret; + return ret; } /* @@ -795,103 +774,109 @@ out: * */ static void -ioc_frame_unwind (call_frame_t *frame) +ioc_frame_unwind(call_frame_t *frame) { - ioc_local_t *local = NULL; - ioc_fill_t *fill = NULL, *next = NULL; - int32_t count = 0; - 
struct iovec *vector = NULL; - int32_t copied = 0; - struct iobref *iobref = NULL; - struct iatt stbuf = {0,}; - int32_t op_ret = 0, op_errno = 0; - - GF_ASSERT (frame); - - local = frame->local; - if (local == NULL) { - gf_log (frame->this->name, GF_LOG_WARNING, - "local is NULL"); - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - - if (local->op_ret < 0) { - op_ret = local->op_ret; - op_errno = local->op_errno; - goto unwind; - } - - // ioc_local_lock (local); - iobref = iobref_new (); - if (iobref == NULL) { + ioc_local_t *local = NULL; + ioc_fill_t *fill = NULL, *next = NULL; + int32_t count = 0; + struct iovec *vector = NULL; + int32_t copied = 0; + struct iobref *iobref = NULL; + struct iatt stbuf = { + 0, + }; + int32_t op_ret = 0, op_errno = 0; + + GF_ASSERT(frame); + + local = frame->local; + if (local == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM, + IO_CACHE_MSG_LOCAL_NULL, NULL); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + if (local->op_ret < 0) { + op_ret = local->op_ret; + op_errno = local->op_errno; + goto unwind; + } + + // ioc_local_lock (local); + iobref = iobref_new(); + if (iobref == NULL) { + op_ret = -1; + op_errno = ENOMEM; + } + + if (list_empty(&local->fill_list)) { + gf_msg_trace(frame->this->name, 0, + "frame(%p) has 0 entries in local->fill_list " + "(offset = %" PRId64 " && size = %" GF_PRI_SIZET ")", + frame, local->offset, local->size); + } + + list_for_each_entry(fill, &local->fill_list, list) { count += fill->count; } + + vector = GF_CALLOC(count, sizeof(*vector), gf_ioc_mt_iovec); + if (vector == NULL) { + op_ret = -1; + op_errno = ENOMEM; + } + + list_for_each_entry_safe(fill, next, &local->fill_list, list) + { + /* # TODO: check why this if clause is needed at all. */ + if ((vector != NULL) && (iobref != NULL)) { + memcpy(((char *)vector) + copied, fill->vector, + fill->count * sizeof(*vector)); + + copied += (fill->count * sizeof(*vector)); + + if (iobref_merge(iobref, fill->iobref)) { op_ret = -1; op_errno = ENOMEM; + } } - if (list_empty (&local->fill_list)) { - gf_log (frame->this->name, GF_LOG_TRACE, - "frame(%p) has 0 entries in local->fill_list " - "(offset = %"PRId64" && size = %"GF_PRI_SIZET")", - frame, local->offset, local->size); - } - - list_for_each_entry (fill, &local->fill_list, list) { - count += fill->count; - } + list_del(&fill->list); + iobref_unref(fill->iobref); + GF_FREE(fill->vector); + GF_FREE(fill); + } - vector = GF_CALLOC (count, sizeof (*vector), gf_ioc_mt_iovec); - if (vector == NULL) { - op_ret = -1; - op_errno = ENOMEM; - } - - list_for_each_entry_safe (fill, next, &local->fill_list, list) { - if ((vector != NULL) && (iobref != NULL)) { - memcpy (((char *)vector) + copied, - fill->vector, - fill->count * sizeof (*vector)); - - copied += (fill->count * sizeof (*vector)); - - iobref_merge (iobref, fill->iobref); - } - - list_del (&fill->list); - iobref_unref (fill->iobref); - GF_FREE (fill->vector); - GF_FREE (fill); - } - - if (op_ret != -1) { - op_ret = iov_length (vector, count); - } + if (op_ret != -1) { + op_ret = iov_length(vector, count); + } unwind: - gf_log (frame->this->name, GF_LOG_TRACE, - "frame(%p) unwinding with op_ret=%d", frame, op_ret); - - // ioc_local_unlock (local); - - frame->local = NULL; - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, - count, &stbuf, iobref, NULL); - - if (iobref != NULL) { - iobref_unref (iobref); - } - - if (vector != NULL) { - GF_FREE (vector); - vector = NULL; - } - - pthread_mutex_destroy (&local->local_lock); - if (local) - 
mem_put (local); - - return; + gf_msg_trace(frame->this->name, 0, "frame(%p) unwinding with op_ret=%d", + frame, op_ret); + + // ioc_local_unlock (local); + + frame->local = NULL; + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, &stbuf, + iobref, NULL); + + if (iobref != NULL) { + iobref_unref(iobref); + } + + if (vector != NULL) { + GF_FREE(vector); + vector = NULL; + } + + if (local) { + if (local->xattr_req) + dict_unref(local->xattr_req); + pthread_mutex_destroy(&local->local_lock); + mem_put(local); + } + return; } /* @@ -901,27 +886,27 @@ unwind: * to be called only when a frame is waiting on an in-transit page */ void -ioc_frame_return (call_frame_t *frame) +ioc_frame_return(call_frame_t *frame) { - ioc_local_t *local = NULL; - int32_t wait_count = 0; + ioc_local_t *local = NULL; + int32_t wait_count = 0; - GF_ASSERT (frame); + GF_ASSERT(frame); - local = frame->local; - GF_ASSERT (local->wait_count > 0); + local = frame->local; + GF_ASSERT(local->wait_count > 0); - ioc_local_lock (local); - { - wait_count = --local->wait_count; - } - ioc_local_unlock (local); + ioc_local_lock(local); + { + wait_count = --local->wait_count; + } + ioc_local_unlock(local); - if (!wait_count) { - ioc_frame_unwind (frame); - } + if (!wait_count) { + ioc_frame_unwind(frame); + } - return; + return; } /* @@ -931,41 +916,39 @@ ioc_frame_return (call_frame_t *frame) * to be called only when a frame is waiting on an in-transit page */ ioc_waitq_t * -__ioc_page_wakeup (ioc_page_t *page, int32_t op_errno) +__ioc_page_wakeup(ioc_page_t *page, int32_t op_errno) { - ioc_waitq_t *waitq = NULL, *trav = NULL; - call_frame_t *frame = NULL; - int32_t ret = -1; + ioc_waitq_t *waitq = NULL, *trav = NULL; + call_frame_t *frame = NULL; + int32_t ret = -1; - GF_VALIDATE_OR_GOTO ("io-cache", page, out); + GF_VALIDATE_OR_GOTO("io-cache", page, out); - waitq = page->waitq; - page->waitq = NULL; + waitq = page->waitq; + page->waitq = NULL; - page->ready = 1; + page->ready = 1; - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, - "page is %p && waitq = %p", page, waitq); + gf_msg_trace(page->inode->table->xl->name, 0, "page is %p && waitq = %p", + page, waitq); - for (trav = waitq; trav; trav = trav->next) { - frame = trav->data; - ret = __ioc_frame_fill (page, frame, trav->pending_offset, - trav->pending_size, op_errno); - if (ret == -1) { - break; - } + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; + ret = __ioc_frame_fill(page, frame, trav->pending_offset, + trav->pending_size, op_errno); + if (ret == -1) { + break; } + } - if (page->stale) { - __ioc_page_destroy (page); - } + if (page->stale) { + __ioc_page_destroy(page); + } out: - return waitq; + return waitq; } - - /* * ioc_page_error - * @page: @@ -974,46 +957,45 @@ out: * */ ioc_waitq_t * -__ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno) +__ioc_page_error(ioc_page_t *page, int32_t op_ret, int32_t op_errno) { - ioc_waitq_t *waitq = NULL, *trav = NULL; - call_frame_t *frame = NULL; - int64_t ret = 0; - ioc_table_t *table = NULL; - ioc_local_t *local = NULL; - - GF_VALIDATE_OR_GOTO ("io-cache", page, out); + ioc_waitq_t *waitq = NULL, *trav = NULL; + call_frame_t *frame = NULL; + int64_t ret = 0; + ioc_table_t *table = NULL; + ioc_local_t *local = NULL; - waitq = page->waitq; - page->waitq = NULL; + GF_VALIDATE_OR_GOTO("io-cache", page, out); - gf_log (page->inode->table->xl->name, GF_LOG_WARNING, - "page error for page = %p & waitq = %p", page, waitq); + waitq = page->waitq; + page->waitq = NULL; - for 
(trav = waitq; trav; trav = trav->next) { + gf_msg_debug(page->inode->table->xl->name, 0, + "page error for page = %p & waitq = %p", page, waitq); - frame = trav->data; + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; - local = frame->local; - ioc_local_lock (local); - { - if (local->op_ret != -1) { - local->op_ret = op_ret; - local->op_errno = op_errno; - } - } - ioc_local_unlock (local); + local = frame->local; + ioc_local_lock(local); + { + if (local->op_ret != -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } } + ioc_local_unlock(local); + } - table = page->inode->table; - ret = __ioc_page_destroy (page); + table = page->inode->table; + ret = __ioc_page_destroy(page); - if (ret != -1) { - table->cache_used -= ret; - } + if (ret != -1) { + table->cache_used -= ret; + } out: - return waitq; + return waitq; } /* @@ -1024,20 +1006,22 @@ out: * */ ioc_waitq_t * -ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno) +ioc_page_error(ioc_page_t *page, int32_t op_ret, int32_t op_errno) { - ioc_waitq_t *waitq = NULL; + ioc_waitq_t *waitq = NULL; + struct ioc_inode *inode = NULL; - if (page == NULL) { - goto out; - } + if (page == NULL) { + goto out; + } - ioc_inode_lock (page->inode); - { - waitq = __ioc_page_error (page, op_ret, op_errno); - } - ioc_inode_unlock (page->inode); + ioc_inode_lock(page->inode); + { + inode = page->inode; + waitq = __ioc_page_error(page, op_ret, op_errno); + } + ioc_inode_unlock(inode); out: - return waitq; + return waitq; } diff --git a/xlators/performance/io-threads/src/Makefile.am b/xlators/performance/io-threads/src/Makefile.am index d63042e7c05..7570cf41ed2 100644 --- a/xlators/performance/io-threads/src/Makefile.am +++ b/xlators/performance/io-threads/src/Makefile.am @@ -1,14 +1,15 @@ xlator_LTLIBRARIES = io-threads.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -io_threads_la_LDFLAGS = -module -avoid-version +io_threads_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) io_threads_la_SOURCES = io-threads.c io_threads_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = io-threads.h iot-mem-types.h +noinst_HEADERS = io-threads.h iot-mem-types.h io-threads-messages.h -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/performance/io-threads/src/io-threads-messages.h b/xlators/performance/io-threads/src/io-threads-messages.h new file mode 100644 index 00000000000..6229c353f96 --- /dev/null +++ b/xlators/performance/io-threads/src/io-threads-messages.h @@ -0,0 +1,41 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _IO_THREADS_MESSAGES_H_ +#define _IO_THREADS_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
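 *
 * Editor's note, not part of the patch: appending a hypothetical new ID
 * (IO_THREADS_MSG_NEW_EXAMPLE is made up for illustration) would only ever
 * extend the tail of the GLFS_MSGID() list below, never reorder or reuse
 * existing identifiers:
 *
 *   GLFS_MSGID(IO_THREADS, IO_THREADS_MSG_INIT_FAILED,
 *              ...existing identifiers kept exactly in place...,
 *              IO_THREADS_MSG_WORKER_THREAD_INIT_FAILED,
 *              IO_THREADS_MSG_NEW_EXAMPLE);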
+ */ + +GLFS_MSGID(IO_THREADS, IO_THREADS_MSG_INIT_FAILED, + IO_THREADS_MSG_XLATOR_CHILD_MISCONFIGURED, IO_THREADS_MSG_NO_MEMORY, + IO_THREADS_MSG_VOL_MISCONFIGURED, IO_THREADS_MSG_SIZE_NOT_SET, + IO_THREADS_MSG_OUT_OF_MEMORY, IO_THREADS_MSG_PTHREAD_INIT_FAILED, + IO_THREADS_MSG_WORKER_THREAD_INIT_FAILED); + +#define IO_THREADS_MSG_INIT_FAILED_STR "Thread attribute initialization failed" +#define IO_THREADS_MSG_SIZE_NOT_SET_STR "Using default thread stack size" +#define IO_THREADS_MSG_NO_MEMORY_STR "Memory accounting init failed" +#define IO_THREADS_MSG_XLATOR_CHILD_MISCONFIGURED_STR \ + "FATAL: iot not configured with exactly one child" +#define IO_THREADS_MSG_VOL_MISCONFIGURED_STR "dangling volume. check volfile" +#define IO_THREADS_MSG_OUT_OF_MEMORY_STR "out of memory" +#define IO_THREADS_MSG_PTHREAD_INIT_FAILED_STR "init failed" +#define IO_THREADS_MSG_WORKER_THREAD_INIT_FAILED_STR \ + "cannot initialize worker threads, exiting init" +#endif /* _IO_THREADS_MESSAGES_H_ */ diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c index c6bdc375439..3d24cc97f4b 100644 --- a/xlators/performance/io-threads/src/io-threads.c +++ b/xlators/performance/io-threads/src/io-threads.c @@ -8,258 +8,300 @@ cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "call-stub.h" -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" +#include <glusterfs/call-stub.h> +#include <glusterfs/defaults.h> +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> #include "io-threads.h" +#include <signal.h> #include <stdlib.h> #include <sys/time.h> #include <time.h> -#include "locking.h" +#include <glusterfs/locking.h> +#include "io-threads-messages.h" +#include <glusterfs/timespec.h> -void *iot_worker (void *arg); -int iot_workers_scale (iot_conf_t *conf); -int __iot_workers_scale (iot_conf_t *conf); +void * +iot_worker(void *arg); +int +iot_workers_scale(iot_conf_t *conf); +int +__iot_workers_scale(iot_conf_t *conf); struct volume_options options[]; +#define IOT_FOP(name, frame, this, args...) 
\ + do { \ + call_stub_t *__stub = NULL; \ + int __ret = -1; \ + \ + __stub = fop_##name##_stub(frame, default_##name##_resume, args); \ + if (!__stub) { \ + __ret = -ENOMEM; \ + goto out; \ + } \ + \ + __ret = iot_schedule(frame, this, __stub); \ + \ + out: \ + if (__ret < 0) { \ + default_##name##_failure_cbk(frame, -__ret); \ + if (__stub != NULL) { \ + call_stub_destroy(__stub); \ + } \ + } \ + } while (0) + +iot_client_ctx_t * +iot_get_ctx(xlator_t *this, client_t *client) +{ + iot_client_ctx_t *ctx = NULL; + iot_client_ctx_t *setted_ctx = NULL; + int i; + + if (client_ctx_get(client, this, (void **)&ctx) != 0) { + ctx = GF_MALLOC(GF_FOP_PRI_MAX * sizeof(*ctx), gf_iot_mt_client_ctx_t); + if (ctx) { + for (i = 0; i < GF_FOP_PRI_MAX; ++i) { + INIT_LIST_HEAD(&ctx[i].clients); + INIT_LIST_HEAD(&ctx[i].reqs); + } + setted_ctx = client_ctx_set(client, this, ctx); + if (ctx != setted_ctx) { + GF_FREE(ctx); + ctx = setted_ctx; + } + } + } + + return ctx; +} + call_stub_t * -__iot_dequeue (iot_conf_t *conf, int *pri, struct timespec *sleep) -{ - call_stub_t *stub = NULL; - int i = 0; - struct timeval curtv = {0,}, difftv = {0,}; - - *pri = -1; - sleep->tv_sec = 0; - sleep->tv_nsec = 0; - for (i = 0; i < IOT_PRI_MAX; i++) { - if (list_empty (&conf->reqs[i]) || - (conf->ac_iot_count[i] >= conf->ac_iot_limit[i])) - continue; - - if (i == IOT_PRI_LEAST) { - pthread_mutex_lock(&conf->throttle.lock); - if (!conf->throttle.sample_time.tv_sec) { - /* initialize */ - gettimeofday(&conf->throttle.sample_time, NULL); - } else { - /* - * Maintain a running count of least priority - * operations that are handled over a particular - * time interval. The count is provided via - * state dump and is used as a measure against - * least priority op throttling. - */ - gettimeofday(&curtv, NULL); - timersub(&curtv, &conf->throttle.sample_time, - &difftv); - if (difftv.tv_sec >= IOT_LEAST_THROTTLE_DELAY) { - conf->throttle.cached_rate = - conf->throttle.sample_cnt; - conf->throttle.sample_cnt = 0; - conf->throttle.sample_time = curtv; - } - - /* - * If we're over the configured rate limit, - * provide an absolute time to the caller that - * represents the soonest we're allowed to - * return another least priority request. - */ - if (conf->throttle.rate_limit && - conf->throttle.sample_cnt >= - conf->throttle.rate_limit) { - struct timeval delay; - delay.tv_sec = IOT_LEAST_THROTTLE_DELAY; - delay.tv_usec = 0; - - timeradd(&conf->throttle.sample_time, - &delay, &curtv); - TIMEVAL_TO_TIMESPEC(&curtv, sleep); - - pthread_mutex_unlock( - &conf->throttle.lock); - break; - } - } - conf->throttle.sample_cnt++; - pthread_mutex_unlock(&conf->throttle.lock); - } - - stub = list_entry (conf->reqs[i].next, call_stub_t, list); - conf->ac_iot_count[i]++; - *pri = i; - break; +__iot_dequeue(iot_conf_t *conf, int *pri) +{ + call_stub_t *stub = NULL; + int i = 0; + iot_client_ctx_t *ctx; + + *pri = -1; + for (i = 0; i < GF_FOP_PRI_MAX; i++) { + if (conf->ac_iot_count[i] >= conf->ac_iot_limit[i]) { + continue; } - if (!stub) - return NULL; + if (list_empty(&conf->clients[i])) { + continue; + } - conf->queue_size--; - conf->queue_sizes[*pri]--; - list_del_init (&stub->list); + /* Get the first per-client queue for this priority. */ + ctx = list_first_entry(&conf->clients[i], iot_client_ctx_t, clients); + if (!ctx) { + continue; + } - return stub; -} + if (list_empty(&ctx->reqs)) { + continue; + } + /* Get the first request on that queue. 
*/ + stub = list_first_entry(&ctx->reqs, call_stub_t, list); + list_del_init(&stub->list); + if (list_empty(&ctx->reqs)) { + list_del_init(&ctx->clients); + } else { + list_rotate_left(&conf->clients[i]); + } + + conf->ac_iot_count[i]++; + conf->queue_marked[i] = _gf_false; + *pri = i; + break; + } + + if (!stub) + return NULL; + + conf->queue_size--; + conf->queue_sizes[*pri]--; + + return stub; +} void -__iot_enqueue (iot_conf_t *conf, call_stub_t *stub, int pri) +__iot_enqueue(iot_conf_t *conf, call_stub_t *stub, int pri) { - if (pri < 0 || pri >= IOT_PRI_MAX) - pri = IOT_PRI_MAX-1; + client_t *client = stub->frame->root->client; + iot_client_ctx_t *ctx; - list_add_tail (&stub->list, &conf->reqs[pri]); + if (pri < 0 || pri >= GF_FOP_PRI_MAX) + pri = GF_FOP_PRI_MAX - 1; - conf->queue_size++; - conf->queue_sizes[pri]++; + if (client) { + ctx = iot_get_ctx(THIS, client); + if (ctx) { + ctx = &ctx[pri]; + } + } else { + ctx = NULL; + } + if (!ctx) { + ctx = &conf->no_client[pri]; + } - return; -} + if (list_empty(&ctx->reqs)) { + list_add_tail(&ctx->clients, &conf->clients[pri]); + } + list_add_tail(&stub->list, &ctx->reqs); + conf->queue_size++; + GF_ATOMIC_INC(conf->stub_cnt); + conf->queue_sizes[pri]++; +} void * -iot_worker (void *data) -{ - iot_conf_t *conf = NULL; - xlator_t *this = NULL; - call_stub_t *stub = NULL; - struct timespec sleep_till = {0, }; - int ret = 0; - int pri = -1; - char timeout = 0; - char bye = 0; - struct timespec sleep = {0,}; - - conf = data; - this = conf->this; - THIS = this; - - for (;;) { - sleep_till.tv_sec = time (NULL) + conf->idle_time; - - pthread_mutex_lock (&conf->mutex); - { - if (pri != -1) { - conf->ac_iot_count[pri]--; - pri = -1; - } - while (conf->queue_size == 0) { - conf->sleep_count++; - - ret = pthread_cond_timedwait (&conf->cond, - &conf->mutex, - &sleep_till); - conf->sleep_count--; - - if (ret == ETIMEDOUT) { - timeout = 1; - break; - } - } - - if (timeout) { - if (conf->curr_count > IOT_MIN_THREADS) { - conf->curr_count--; - bye = 1; - gf_log (conf->this->name, GF_LOG_DEBUG, - "timeout, terminated. 
conf->curr_count=%d", - conf->curr_count); - } else { - timeout = 0; - } - } - - stub = __iot_dequeue (conf, &pri, &sleep); - if (!stub && (sleep.tv_sec || sleep.tv_nsec)) { - pthread_cond_timedwait(&conf->cond, - &conf->mutex, &sleep); - pthread_mutex_unlock(&conf->mutex); - continue; - } +iot_worker(void *data) +{ + iot_conf_t *conf = NULL; + xlator_t *this = NULL; + call_stub_t *stub = NULL; + struct timespec sleep_till = { + 0, + }; + int ret = 0; + int pri = -1; + gf_boolean_t bye = _gf_false; + + conf = data; + this = conf->this; + THIS = this; + + for (;;) { + pthread_mutex_lock(&conf->mutex); + { + if (pri != -1) { + conf->ac_iot_count[pri]--; + pri = -1; + } + while (conf->queue_size == 0) { + if (conf->down) { + bye = _gf_true; /*Avoid sleep*/ + break; } - pthread_mutex_unlock (&conf->mutex); - if (stub) /* guard against spurious wakeups */ - call_resume (stub); + clock_gettime(CLOCK_REALTIME_COARSE, &sleep_till); + sleep_till.tv_sec += conf->idle_time; - if (bye) - break; - } + conf->sleep_count++; + ret = pthread_cond_timedwait(&conf->cond, &conf->mutex, + &sleep_till); + conf->sleep_count--; - if (pri != -1) { - pthread_mutex_lock (&conf->mutex); - { - conf->ac_iot_count[pri]--; + if (conf->down || ret == ETIMEDOUT) { + bye = _gf_true; + break; + } + } + + if (bye) { + if (conf->down || conf->curr_count > IOT_MIN_THREADS) { + conf->curr_count--; + if (conf->curr_count == 0) + pthread_cond_broadcast(&conf->cond); + gf_msg_debug(conf->this->name, 0, + "terminated. " + "conf->curr_count=%d", + conf->curr_count); + } else { + bye = _gf_false; } - pthread_mutex_unlock (&conf->mutex); + } + + if (!bye) + stub = __iot_dequeue(conf, &pri); } - return NULL; -} + pthread_mutex_unlock(&conf->mutex); + + if (stub) { /* guard against spurious wakeups */ + if (stub->poison) { + gf_log(this->name, GF_LOG_INFO, "Dropping poisoned request %p.", + stub); + call_stub_destroy(stub); + } else { + call_resume(stub); + } + GF_ATOMIC_DEC(conf->stub_cnt); + } + stub = NULL; + + if (bye) + break; + } + return NULL; +} int -do_iot_schedule (iot_conf_t *conf, call_stub_t *stub, int pri) +do_iot_schedule(iot_conf_t *conf, call_stub_t *stub, int pri) { - int ret = 0; + int ret = 0; - pthread_mutex_lock (&conf->mutex); - { - __iot_enqueue (conf, stub, pri); + pthread_mutex_lock(&conf->mutex); + { + __iot_enqueue(conf, stub, pri); - pthread_cond_signal (&conf->cond); + pthread_cond_signal(&conf->cond); - ret = __iot_workers_scale (conf); - } - pthread_mutex_unlock (&conf->mutex); + ret = __iot_workers_scale(conf); + } + pthread_mutex_unlock(&conf->mutex); - return ret; + return ret; } -char* -iot_get_pri_meaning (iot_pri_t pri) -{ - char *name = NULL; - switch (pri) { - case IOT_PRI_HI: - name = "fast"; - break; - case IOT_PRI_NORMAL: - name = "normal"; - break; - case IOT_PRI_LO: - name = "slow"; - break; - case IOT_PRI_LEAST: - name = "least priority"; - break; - case IOT_PRI_MAX: - name = "invalid"; - break; - } - return name; +char * +iot_get_pri_meaning(gf_fop_pri_t pri) +{ + char *name = NULL; + switch (pri) { + case GF_FOP_PRI_HI: + name = "fast"; + break; + case GF_FOP_PRI_NORMAL: + name = "normal"; + break; + case GF_FOP_PRI_LO: + name = "slow"; + break; + case GF_FOP_PRI_LEAST: + name = "least"; + break; + case GF_FOP_PRI_MAX: + name = "invalid"; + break; + case GF_FOP_PRI_UNSPEC: + name = "unspecified"; + break; + } + return name; } int -iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) +iot_schedule(call_frame_t *frame, xlator_t *this, call_stub_t *stub) { - int ret = -1; - 
iot_pri_t pri = IOT_PRI_MAX - 1; - iot_conf_t *conf = this->private; + int ret = -1; + gf_fop_pri_t pri = GF_FOP_PRI_MAX - 1; + iot_conf_t *conf = this->private; - if ((frame->root->pid < GF_CLIENT_PID_MAX) && conf->least_priority) { - pri = IOT_PRI_LEAST; - goto out; - } + if ((frame->root->pid < GF_CLIENT_PID_MAX) && + (frame->root->pid != GF_CLIENT_PID_NO_ROOT_SQUASH) && + conf->least_priority) { + pri = GF_FOP_PRI_LEAST; + goto out; + } - switch (stub->fop) { + switch (stub->fop) { case GF_FOP_OPEN: case GF_FOP_STAT: case GF_FOP_FSTAT: @@ -270,8 +312,12 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) case GF_FOP_STATFS: case GF_FOP_READDIR: case GF_FOP_READDIRP: - pri = IOT_PRI_HI; - break; + case GF_FOP_GETACTIVELK: + case GF_FOP_SETACTIVELK: + case GF_FOP_ICREATE: + case GF_FOP_NAMELINK: + pri = GF_FOP_PRI_HI; + break; case GF_FOP_CREATE: case GF_FOP_FLUSH: @@ -280,6 +326,7 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) case GF_FOP_FINODELK: case GF_FOP_ENTRYLK: case GF_FOP_FENTRYLK: + case GF_FOP_LEASE: case GF_FOP_UNLINK: case GF_FOP_SETATTR: case GF_FOP_FSETATTR: @@ -295,8 +342,9 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) case GF_FOP_FSETXATTR: case GF_FOP_REMOVEXATTR: case GF_FOP_FREMOVEXATTR: - pri = IOT_PRI_NORMAL; - break; + case GF_FOP_PUT: + pri = GF_FOP_PRI_NORMAL; + break; case GF_FOP_READ: case GF_FOP_WRITE: @@ -307,2607 +355,1236 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) case GF_FOP_XATTROP: case GF_FOP_FXATTROP: case GF_FOP_RCHECKSUM: - case GF_FOP_FALLOCATE: - case GF_FOP_DISCARD: - pri = IOT_PRI_LO; - break; + case GF_FOP_FALLOCATE: + case GF_FOP_DISCARD: + case GF_FOP_ZEROFILL: + case GF_FOP_SEEK: + pri = GF_FOP_PRI_LO; + break; - case GF_FOP_NULL: case GF_FOP_FORGET: case GF_FOP_RELEASE: case GF_FOP_RELEASEDIR: case GF_FOP_GETSPEC: - case GF_FOP_MAXVALUE: - //fail compilation on missing fop - //new fop must choose priority. 
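/* Editor's note, not part of the patch: iot_schedule() above boils down to a
 * fop -> priority classifier plus a least-priority override for internal
 * clients. A condensed sketch of that shape; demo_classify() is illustrative
 * only, the real switch above covers every fop explicitly and also exempts
 * the no-root-squash client, and the header location is an assumption: */
#include <glusterfs/glusterfs.h>  /* glusterfs_fop_t, GF_CLIENT_PID_MAX */

static gf_fop_pri_t
demo_classify(glusterfs_fop_t fop, pid_t pid, gf_boolean_t least_priority)
{
    /* internal clients use special pid values below GF_CLIENT_PID_MAX and
     * may be throttled to the least-priority queue */
    if (pid < GF_CLIENT_PID_MAX && least_priority)
        return GF_FOP_PRI_LEAST;

    switch (fop) {
    case GF_FOP_STAT:       /* cheap metadata fops are served first */
    case GF_FOP_READDIRP:
        return GF_FOP_PRI_HI;
    case GF_FOP_READ:       /* bulk data fops go to the slow queue  */
    case GF_FOP_WRITE:
        return GF_FOP_PRI_LO;
    default:                /* everything else runs at normal priority */
        return GF_FOP_PRI_NORMAL;
    }
}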
- break; - } -out: - ret = do_iot_schedule (this->private, stub, pri); - gf_log (this->name, GF_LOG_DEBUG, "%s scheduled as %s fop", - gf_fop_list[stub->fop], iot_get_pri_meaning (pri)); - return ret; -} - -int -iot_lookup_cbk (call_frame_t *frame, void * cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xdata, - struct iatt *postparent) -{ - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, xdata, - postparent); - return 0; -} - - -int -iot_lookup_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xdata) -{ - STACK_WIND (frame, iot_lookup_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->lookup, - loc, xdata); - return 0; -} - - -int -iot_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_lookup_stub (frame, iot_lookup_wrapper, loc, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create lookup stub (out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - if (stub != NULL) { - call_stub_destroy (stub); - } - STACK_UNWIND_STRICT (lookup, frame, -1, -ret, NULL, NULL, NULL, - NULL); - } - - return 0; -} - - -int -iot_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, dict_t *xdata) -{ - STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop, - xdata); - return 0; -} - - -int -iot_setattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid, dict_t *xdata) -{ - STACK_WIND (frame, iot_setattr_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->setattr, - loc, stbuf, valid, xdata); - return 0; -} - - -int -iot_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_setattr_stub (frame, iot_setattr_wrapper, loc, stbuf, valid, - xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "Cannot create setattr stub" - "(Out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - if (stub != NULL) { - call_stub_destroy (stub); - } - - STACK_UNWIND_STRICT (setattr, frame, -1, -ret, NULL, NULL, NULL); - } - - return 0; -} - - -int -iot_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, dict_t *xdata) -{ - STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno, preop, postop, - xdata); - return 0; -} - - -int -iot_fsetattr_wrapper (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata) -{ - STACK_WIND (frame, iot_fsetattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsetattr, fd, stbuf, valid, - xdata); - return 0; -} - - -int -iot_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fsetattr_stub (frame, iot_fsetattr_wrapper, fd, stbuf, - valid, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create fsetattr stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fsetattr, frame, -1, -ret, NULL, NULL, - NULL); - if (stub != NULL) { - call_stub_destroy 
(stub); - } - } - return 0; -} - - -int -iot_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, xdata); - return 0; -} - - -int -iot_access_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t mask, dict_t *xdata) -{ - STACK_WIND (frame, iot_access_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->access, loc, mask, xdata); - return 0; -} - - -int -iot_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, - dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_access_stub (frame, iot_access_wrapper, loc, mask, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create access stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); + break; + case GF_FOP_IPC: + default: + return -EINVAL; + } out: - if (ret < 0) { - STACK_UNWIND_STRICT (access, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + gf_msg_debug(this->name, 0, "%s scheduled as %s priority fop", + gf_fop_list[stub->fop], iot_get_pri_meaning(pri)); + if (this->private) + ret = do_iot_schedule(this->private, stub, pri); + return ret; } - int -iot_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, const char *path, - struct iatt *stbuf, dict_t *xdata) +iot_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, path, stbuf, - xdata); - return 0; + IOT_FOP(lookup, frame, this, loc, xdata); + return 0; } - int -iot_readlink_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - size_t size, dict_t *xdata) +iot_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) { - STACK_WIND (frame, iot_readlink_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->readlink, - loc, size, xdata); - return 0; + IOT_FOP(setattr, frame, this, loc, stbuf, valid, xdata); + return 0; } - int -iot_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, dict_t *xdata) +iot_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_readlink_stub (frame, iot_readlink_wrapper, loc, size, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create readlink stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (readlink, frame, -1, -ret, NULL, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; + IOT_FOP(fsetattr, frame, this, fd, stbuf, valid, xdata); + return 0; } - int -iot_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); - return 0; -} - - -int -iot_mknod_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, mode_t umask, dict_t *xdata) +iot_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) { - STACK_WIND (frame, iot_mknod_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->mknod, loc, mode, rdev, umask, - 
xdata); - return 0; + IOT_FOP(access, frame, this, loc, mask, xdata); + return 0; } - int -iot_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, mode_t umask, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_mknod_stub (frame, iot_mknod_wrapper, loc, mode, rdev, - umask, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create mknod stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (mknod, frame, -1, -ret, NULL, NULL, NULL, - NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} - - -int -iot_mkdir_cbk (call_frame_t *frame, void * cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +iot_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) { - STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); - return 0; + IOT_FOP(readlink, frame, this, loc, size, xdata); + return 0; } - int -iot_mkdir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - mode_t umask, dict_t *xdata) +iot_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) { - STACK_WIND (frame, iot_mkdir_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->mkdir, loc, mode, umask, xdata); - return 0; + IOT_FOP(mknod, frame, this, loc, mode, rdev, umask, xdata); + return 0; } - int -iot_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - mode_t umask, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_mkdir_stub (frame, iot_mkdir_wrapper, loc, mode, umask, - xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create mkdir stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (mkdir, frame, -1, -ret, NULL, NULL, NULL, - NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} - - -int -iot_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, preparent, - postparent, xdata); - return 0; -} - - -int -iot_rmdir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata) +iot_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { - STACK_WIND (frame, iot_rmdir_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->rmdir, loc, flags, xdata); - return 0; + IOT_FOP(mkdir, frame, this, loc, mode, umask, xdata); + return 0; } - int -iot_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_rmdir_stub (frame, iot_rmdir_wrapper, loc, flags, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create rmdir stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (rmdir, frame, -1, -ret, NULL, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} - - -int -iot_symlink_cbk (call_frame_t *frame, void * cookie, xlator_t *this, - int32_t 
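The one-line IOT_FOP() handlers introduced above replace the hand-written cbk/wrapper/stub/schedule boilerplate being deleted. The macro itself is presumably defined in io-threads.h and is not part of this hunk; the following is only a plausible sketch of the pattern it expands to, assuming the generic fop_<name>_stub(), default_<name>_resume() and default_<name>_failure_cbk() helpers from libglusterfs:

#define IOT_FOP(name, frame, this, args...)                                  \
    do {                                                                     \
        call_stub_t *__stub = NULL;                                          \
        int __ret = -1;                                                      \
                                                                             \
        /* park the fop in a stub so a worker thread can resume it later */  \
        __stub = fop_##name##_stub(frame, default_##name##_resume, args);    \
        if (!__stub) {                                                       \
            __ret = -ENOMEM;                                                 \
            goto out;                                                        \
        }                                                                    \
                                                                             \
        __ret = iot_schedule(frame, this, __stub);                           \
                                                                             \
    out:                                                                     \
        if (__ret < 0) {                                                     \
            /* could not queue it: unwind with the error, drop the stub */   \
            default_##name##_failure_cbk(frame, -__ret);                     \
            if (__stub)                                                      \
                call_stub_destroy(__stub);                                   \
        }                                                                    \
    } while (0)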
op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +iot_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { - STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); - return 0; + IOT_FOP(rmdir, frame, this, loc, flags, xdata); + return 0; } - int -iot_symlink_wrapper (call_frame_t *frame, xlator_t *this, const char *linkname, - loc_t *loc, mode_t umask, dict_t *xdata) +iot_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) { - STACK_WIND (frame, iot_symlink_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->symlink, linkname, loc, umask, - xdata); - return 0; + IOT_FOP(symlink, frame, this, linkname, loc, umask, xdata); + return 0; } - int -iot_symlink (call_frame_t *frame, xlator_t *this, const char *linkname, - loc_t *loc, mode_t umask, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_symlink_stub (frame, iot_symlink_wrapper, linkname, loc, - umask, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create symlink stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (symlink, frame, -1, -ret, NULL, NULL, NULL, - NULL, NULL); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; -} - - -int -iot_rename_cbk (call_frame_t *frame, void * cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent, - dict_t *xdata) -{ - STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf, preoldparent, - postoldparent, prenewparent, postnewparent, xdata); - return 0; -} - - -int -iot_rename_wrapper (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc, dict_t *xdata) +iot_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - STACK_WIND (frame, iot_rename_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->rename, oldloc, newloc, xdata); - return 0; + IOT_FOP(rename, frame, this, oldloc, newloc, xdata); + return 0; } - int -iot_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, - dict_t *xdata) +iot_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_rename_stub (frame, iot_rename_wrapper, oldloc, newloc, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_DEBUG, "cannot create rename stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (rename, frame, -1, -ret, NULL, NULL, NULL, - NULL, NULL, NULL); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; + IOT_FOP(open, frame, this, loc, flags, fd, xdata); + return 0; } - int -iot_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, fd_t *fd, dict_t *xdata) +iot_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); - return 0; + IOT_FOP(create, frame, this, loc, flags, mode, umask, fd, xdata); + return 0; } - int -iot_open_wrapper (call_frame_t * frame, xlator_t * this, loc_t 
*loc, - int32_t flags, fd_t * fd, dict_t *xdata) +iot_put(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, uint32_t flags, struct iovec *vector, int32_t count, + off_t offset, struct iobref *iobref, dict_t *xattr, dict_t *xdata) { - STACK_WIND (frame, iot_open_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->open, loc, flags, fd, - xdata); - return 0; + IOT_FOP(put, frame, this, loc, mode, umask, flags, vector, count, offset, + iobref, xattr, xdata); + return 0; } - int -iot_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_open_stub (frame, iot_open_wrapper, loc, flags, fd, - xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create open call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (open, frame, -1, -ret, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; -} - - -int -iot_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, - struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +iot_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, stbuf, - preparent, postparent, xdata); - return 0; + IOT_FOP(readv, frame, this, fd, size, offset, flags, xdata); + return 0; } - int -iot_create_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, mode_t mode, mode_t umask, fd_t *fd, - dict_t *xdata) +iot_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - STACK_WIND (frame, iot_create_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - loc, flags, mode, umask, fd, xdata); - return 0; + IOT_FOP(flush, frame, this, fd, xdata); + return 0; } - -int -iot_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_create_stub (frame, iot_create_wrapper, loc, flags, mode, - umask, fd, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create \"create\" call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (create, frame, -1, -ret, NULL, NULL, NULL, - NULL, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; -} - - int -iot_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref, - dict_t *xdata) +iot_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, - stbuf, iobref, xdata); - - return 0; + IOT_FOP(fsync, frame, this, fd, datasync, xdata); + return 0; } - int -iot_readv_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) +iot_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - STACK_WIND (frame, iot_readv_cbk, - FIRST_CHILD(this), - 
FIRST_CHILD(this)->fops->readv, - fd, size, offset, flags, xdata); - return 0; + IOT_FOP(writev, frame, this, fd, vector, count, offset, flags, iobref, + xdata); + return 0; } - int -iot_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) +iot_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_readv_stub (frame, iot_readv_wrapper, fd, size, offset, - flags, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create readv call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (readv, frame, -1, -ret, NULL, -1, NULL, - NULL, NULL); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(lk, frame, this, fd, cmd, flock, xdata); + return 0; } - int -iot_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +iot_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata); - return 0; + IOT_FOP(stat, frame, this, loc, xdata); + return 0; } - int -iot_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +iot_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - STACK_WIND (frame, iot_flush_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, - fd, xdata); - return 0; + IOT_FOP(fstat, frame, this, fd, xdata); + return 0; } - int -iot_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_flush_stub (frame, iot_flush_wrapper, fd, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create flush_cbk call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (flush, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} - - -int -iot_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +iot_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, - xdata); - return 0; + IOT_FOP(truncate, frame, this, loc, offset, xdata); + return 0; } - int -iot_fsync_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync, dict_t *xdata) +iot_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - STACK_WIND (frame, iot_fsync_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsync, - fd, datasync, xdata); - return 0; + IOT_FOP(ftruncate, frame, this, fd, offset, xdata); + return 0; } - int -iot_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, +iot_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fsync_stub (frame, iot_fsync_wrapper, fd, datasync, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fsync_cbk call stub" - "(out of memory)"); - ret = -1; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fsync, frame, -1, -ret, NULL, 
NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(unlink, frame, this, loc, xflag, xdata); + return 0; } - int -iot_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +iot_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, - xdata); - return 0; + IOT_FOP(link, frame, this, oldloc, newloc, xdata); + return 0; } - int -iot_writev_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, - off_t offset, uint32_t flags, struct iobref *iobref, - dict_t *xdata) -{ - STACK_WIND (frame, iot_writev_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, - fd, vector, count, offset, flags, iobref, xdata); - return 0; -} - - -int -iot_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - uint32_t flags, struct iobref *iobref, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_writev_stub (frame, iot_writev_wrapper, fd, vector, - count, offset, flags, iobref, xdata); - - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create writev call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (writev, frame, -1, -ret, NULL, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; -} - - -int32_t -iot_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *flock, +iot_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, dict_t *xdata) { - STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, flock, xdata); - return 0; + IOT_FOP(opendir, frame, this, loc, fd, xdata); + return 0; } - int -iot_lk_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t cmd, struct gf_flock *flock, dict_t *xdata) +iot_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) { - STACK_WIND (frame, iot_lk_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lk, - fd, cmd, flock, xdata); - return 0; + IOT_FOP(fsyncdir, frame, this, fd, datasync, xdata); + return 0; } - int -iot_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct gf_flock *flock, dict_t *xdata) +iot_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_lk_stub (frame, iot_lk_wrapper, fd, cmd, flock, xdata); - - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fop_lk call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (lk, frame, -1, -ret, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(statfs, frame, this, loc, xdata); + return 0; } - int -iot_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) +iot_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { - STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata); - return 0; + IOT_FOP(setxattr, frame, this, loc, dict, flags, xdata); + return 0; } - -int -iot_stat_wrapper (call_frame_t *frame, 
xlator_t *this, loc_t *loc, dict_t *xdata) -{ - STACK_WIND (frame, iot_stat_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, - loc, xdata); - return 0; -} - - int -iot_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +iot_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_stat_stub (frame, iot_stat_wrapper, loc, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fop_stat call stub" - "(out of memory)"); - ret = -1; - goto out; - } - - ret = iot_schedule (frame, this, stub); + iot_conf_t *conf = NULL; + dict_t *depths = NULL; + int i = 0; + int32_t op_ret = 0; + int32_t op_errno = 0; -out: - if (ret < 0) { - STACK_UNWIND_STRICT (stat, frame, -1, -ret, NULL, NULL); + conf = this->private; - if (stub != NULL) { - call_stub_destroy (stub); - } + if (name && strcmp(name, IO_THREADS_QUEUE_SIZE_KEY) == 0) { + /* + * We explicitly do not want a reference count + * for this dict in this translator + */ + depths = dict_new(); + if (!depths) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind_special_getxattr; } - return 0; -} - - -int -iot_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) -{ - STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf, xdata); - return 0; -} - - -int -iot_fstat_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) -{ - STACK_WIND (frame, iot_fstat_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fstat, - fd, xdata); - return 0; -} - - -int -iot_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fstat_stub (frame, iot_fstat_wrapper, fd, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fop_fstat call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fstat, frame, -1, -ret, NULL, NULL); - if (stub != NULL) { - call_stub_destroy (stub); - } + for (i = 0; i < GF_FOP_PRI_MAX; i++) { + if (dict_set_int32(depths, (char *)fop_pri_to_string(i), + conf->queue_sizes[i]) != 0) { + dict_unref(depths); + depths = NULL; + goto unwind_special_getxattr; + } } - return 0; -} + unwind_special_getxattr: + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, depths, xdata); + if (depths) + dict_unref(depths); + return 0; + } -int -iot_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, - postbuf, xdata); - return 0; + IOT_FOP(getxattr, frame, this, loc, name, xdata); + return 0; } - int -iot_truncate_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset, dict_t *xdata) -{ - STACK_WIND (frame, iot_truncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, - loc, offset, xdata); - return 0; -} - - -int -iot_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, +iot_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata) { - call_stub_t *stub; - int ret = -1; - - stub = fop_truncate_stub (frame, iot_truncate_wrapper, loc, offset, - xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fop_stat call stub" - "(out of memory)"); - ret = -ENOMEM; - goto 
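The special-cased getxattr above does not forward the request when the key matches IO_THREADS_QUEUE_SIZE_KEY; it answers directly with a dict that maps each priority's name (via fop_pri_to_string()) to its current queue depth as an int32. A rough sketch of how a caller's getxattr callback could walk that reply (illustrative only; the actual key string behind IO_THREADS_QUEUE_SIZE_KEY is defined in io-threads.h and is not shown in this hunk, and print_queue_depth is a made-up example name):

static int
print_queue_depth(dict_t *d, char *key, data_t *value, void *data)
{
    /* each entry is <priority name> -> int32 queue depth */
    gf_log("io-threads-example", GF_LOG_INFO, "%s queue depth: %d", key,
           data_to_int32(value));
    return 0;
}

/* ...inside the caller's getxattr cbk, once op_ret >= 0... */
if (dict)
    dict_foreach(dict, print_queue_depth, NULL);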
out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (truncate, frame, -1, -ret, NULL, NULL, - NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; + IOT_FOP(fgetxattr, frame, this, fd, name, xdata); + return 0; } - int -iot_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, - postbuf, xdata); - return 0; -} - - -int -iot_ftruncate_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset, dict_t *xdata) -{ - STACK_WIND (frame, iot_ftruncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, - fd, offset, xdata); - return 0; -} - - -int -iot_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_ftruncate_stub (frame, iot_ftruncate_wrapper, fd, offset, - xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fop_ftruncate call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (ftruncate, frame, -1, -ret, NULL, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} - - - -int -iot_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +iot_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) { - STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, preparent, - postparent, xdata); - return 0; + IOT_FOP(fsetxattr, frame, this, fd, dict, flags, xdata); + return 0; } - int -iot_unlink_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t xflag, dict_t *xdata) +iot_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - STACK_WIND (frame, iot_unlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, - loc, xflag, xdata); - return 0; + IOT_FOP(removexattr, frame, this, loc, name, xdata); + return 0; } - int -iot_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag, - dict_t *xdata) +iot_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_unlink_stub (frame, iot_unlink_wrapper, loc, xflag, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fop_unlink call stub" - "(out of memory)"); - ret = -1; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (unlink, frame, -1, -ret, NULL, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; + IOT_FOP(fremovexattr, frame, this, fd, name, xdata); + return 0; } - int -iot_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) +iot_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); - return 0; + IOT_FOP(readdirp, frame, this, fd, size, offset, xdata); + return 0; } - int -iot_link_wrapper 
(call_frame_t *frame, xlator_t *this, loc_t *old, loc_t *new, - dict_t *xdata) +iot_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - STACK_WIND (frame, iot_link_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->link, old, new, xdata); - - return 0; + IOT_FOP(readdir, frame, this, fd, size, offset, xdata); + return 0; } - int -iot_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, - dict_t *xdata) +iot_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *lock, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_link_stub (frame, iot_link_wrapper, oldloc, newloc, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create link stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (link, frame, -1, -ret, NULL, NULL, NULL, - NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(inodelk, frame, this, volume, loc, cmd, lock, xdata); + return 0; } - int -iot_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +iot_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *lock, dict_t *xdata) { - STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, xdata); - return 0; + IOT_FOP(finodelk, frame, this, volume, fd, cmd, lock, xdata); + return 0; } - int -iot_opendir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, - dict_t *xdata) +iot_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { - STACK_WIND (frame, iot_opendir_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->opendir, loc, fd, xdata); - return 0; + IOT_FOP(entrylk, frame, this, volume, loc, basename, cmd, type, xdata); + return 0; } - int -iot_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, +iot_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_opendir_stub (frame, iot_opendir_wrapper, loc, fd, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create opendir stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (opendir, frame, -1, -ret, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(fentrylk, frame, this, volume, fd, basename, cmd, type, xdata); + return 0; } - int -iot_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +iot_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, xdata); - return 0; + IOT_FOP(xattrop, frame, this, loc, optype, xattr, xdata); + return 0; } - int -iot_fsyncdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - int datasync, dict_t *xdata) +iot_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - STACK_WIND (frame, iot_fsyncdir_cbk, FIRST_CHILD (this), - FIRST_CHILD 
(this)->fops->fsyncdir, fd, datasync, xdata); - return 0; + IOT_FOP(fxattrop, frame, this, fd, optype, xattr, xdata); + return 0; } - -int -iot_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, - dict_t *xdata) +int32_t +iot_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + int32_t len, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fsyncdir_stub (frame, iot_fsyncdir_wrapper, fd, datasync, - xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create fsyncdir stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fsyncdir, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(rchecksum, frame, this, fd, offset, len, xdata); + return 0; } - int -iot_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct statvfs *buf, - dict_t *xdata) +iot_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) { - STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata); - return 0; + IOT_FOP(fallocate, frame, this, fd, mode, offset, len, xdata); + return 0; } - int -iot_statfs_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xdata) +iot_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { - STACK_WIND (frame, iot_statfs_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->statfs, loc, xdata); - return 0; + IOT_FOP(discard, frame, this, fd, offset, len, xdata); + return 0; } - int -iot_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +iot_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_statfs_stub (frame, iot_statfs_wrapper, loc, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create statfs stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (statfs, frame, -1, -ret, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(zerofill, frame, this, fd, offset, len, xdata); + return 0; } - int -iot_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +iot_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) { - STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata); - return 0; + IOT_FOP(seek, frame, this, fd, offset, what, xdata); + return 0; } - int -iot_setxattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *dict, int32_t flags, dict_t *xdata) +iot_lease(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct gf_lease *lease, dict_t *xdata) { - STACK_WIND (frame, iot_setxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->setxattr, loc, dict, flags, xdata); - return 0; + IOT_FOP(lease, frame, this, loc, lease, xdata); + return 0; } - int -iot_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags, dict_t *xdata) +iot_getactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_setxattr_stub (frame, iot_setxattr_wrapper, loc, dict, - flags, xdata); - if 
(!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create setxattr stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (setxattr, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(getactivelk, frame, this, loc, xdata); + return 0; } - int -iot_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +iot_setactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc, + lock_migration_info_t *locklist, dict_t *xdata) { - STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + IOT_FOP(setactivelk, frame, this, loc, locklist, xdata); + return 0; } - int -iot_getxattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) +__iot_workers_scale(iot_conf_t *conf) { - STACK_WIND (frame, iot_getxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->getxattr, loc, name, xdata); - return 0; -} + int scale = 0; + int diff = 0; + pthread_t thread; + int ret = 0; + int i = 0; + for (i = 0; i < GF_FOP_PRI_MAX; i++) + scale += min(conf->queue_sizes[i], conf->ac_iot_limit[i]); -int -iot_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_getxattr_stub (frame, iot_getxattr_wrapper, loc, name, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create getxattr stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + if (scale < IOT_MIN_THREADS) + scale = IOT_MIN_THREADS; - ret = iot_schedule (frame, this, stub); + if (scale > conf->max_count) + scale = conf->max_count; -out: - if (ret < 0) { - STACK_UNWIND_STRICT (getxattr, frame, -1, -ret, NULL, NULL); + if (conf->curr_count < scale) { + diff = scale - conf->curr_count; + } - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} + while (diff) { + diff--; + ret = gf_thread_create(&thread, &conf->w_attr, iot_worker, conf, + "iotwr%03hx", conf->curr_count & 0x3ff); + if (ret == 0) { + pthread_detach(thread); + conf->curr_count++; + gf_msg_debug(conf->this->name, 0, + "scaled threads to %d (queue_size=%d/%d)", + conf->curr_count, conf->queue_size, scale); + } else { + break; + } + } -int -iot_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) -{ - STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + return diff; } - int -iot_fgetxattr_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name, dict_t *xdata) +iot_workers_scale(iot_conf_t *conf) { - STACK_WIND (frame, iot_fgetxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fgetxattr, fd, name, xdata); - return 0; -} + int ret = -1; + if (conf == NULL) { + ret = -EINVAL; + goto out; + } -int -iot_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fgetxattr_stub (frame, iot_fgetxattr_wrapper, fd, name, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create fgetxattr stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + pthread_mutex_lock(&conf->mutex); + { + ret = __iot_workers_scale(conf); + } + pthread_mutex_unlock(&conf->mutex); - ret = iot_schedule (frame, this, stub); out: - if (ret < 0) { - 
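For reference, the thread target computed by the new __iot_workers_scale() above is the sum, over the priority queues, of min(queue depth, per-priority thread limit), clamped between IOT_MIN_THREADS and the configured thread-count. A small standalone illustration with made-up numbers (it assumes IOT_MIN_THREADS is 1 and four priority levels, as in the GlusterFS headers; verify there):

#include <stdio.h>

#define GF_FOP_PRI_MAX 4 /* HI, NORMAL, LO, LEAST */
#define IOT_MIN_THREADS 1
#define min(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
    /* hypothetical pending requests per priority queue */
    int queue_sizes[GF_FOP_PRI_MAX] = {40, 12, 0, 3};
    /* hypothetical per-priority caps (ac_iot_limit) */
    int ac_iot_limit[GF_FOP_PRI_MAX] = {16, 16, 16, 1};
    int max_count = 16; /* the "thread-count" option */
    int scale = 0, i;

    for (i = 0; i < GF_FOP_PRI_MAX; i++)
        scale += min(queue_sizes[i], ac_iot_limit[i]); /* 16+12+0+1 = 29 */

    if (scale < IOT_MIN_THREADS)
        scale = IOT_MIN_THREADS;
    if (scale > max_count)
        scale = max_count; /* never exceed thread-count: 29 -> 16 */

    printf("worker threads wanted: %d\n", scale);
    return 0;
}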
STACK_UNWIND_STRICT (fgetxattr, frame, -1, -ret, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} - - -int -iot_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata); - return 0; + return ret; } - int -iot_fsetxattr_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - dict_t *dict, int32_t flags, dict_t *xdata) +set_stack_size(iot_conf_t *conf) { - STACK_WIND (frame, iot_fsetxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsetxattr, fd, dict, flags, - xdata); - return 0; -} + int err = 0; + size_t stacksize = IOT_THREAD_STACK_SIZE; + xlator_t *this = NULL; + this = THIS; -int -iot_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, - int32_t flags, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fsetxattr_stub (frame, iot_fsetxattr_wrapper, fd, dict, - flags, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create fsetxattr stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + err = pthread_attr_init(&conf->w_attr); + if (err != 0) { + gf_smsg(this->name, GF_LOG_ERROR, err, IO_THREADS_MSG_INIT_FAILED, + NULL); + return err; + } - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fsetxattr, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } + err = pthread_attr_setstacksize(&conf->w_attr, stacksize); + if (err == EINVAL) { + err = pthread_attr_getstacksize(&conf->w_attr, &stacksize); + if (!err) { + gf_smsg(this->name, GF_LOG_WARNING, 0, IO_THREADS_MSG_SIZE_NOT_SET, + "size=%zd", stacksize, NULL); + } else { + gf_smsg(this->name, GF_LOG_WARNING, 0, IO_THREADS_MSG_SIZE_NOT_SET, + NULL); + err = 0; } - return 0; -} - + } -int -iot_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata); - return 0; + conf->stack_size = stacksize; + return err; } - -int -iot_removexattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) +int32_t +mem_acct_init(xlator_t *this) { - STACK_WIND (frame, iot_removexattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->removexattr, loc, name, xdata); - return 0; -} + int ret = -1; + if (!this) + return ret; -int -iot_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_removexattr_stub (frame, iot_removexattr_wrapper, loc, - name, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR,"cannot get removexattr fop" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + ret = xlator_mem_acct_init(this, gf_iot_mt_end + 1); - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (removexattr, frame, -1, -ret, NULL); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_THREADS_MSG_NO_MEMORY, + NULL); + return ret; + } - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + return ret; } int -iot_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +iot_priv_dump(xlator_t *this) { - STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata); - return 0; -} - + iot_conf_t *conf = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + char 
key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; -int -iot_fremovexattr_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name, dict_t *xdata) -{ - STACK_WIND (frame, iot_fremovexattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fremovexattr, fd, name, xdata); + if (!this) return 0; -} - -int -iot_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fremovexattr_stub (frame, iot_fremovexattr_wrapper, fd, - name, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR,"cannot get fremovexattr fop" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fremovexattr, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } + conf = this->private; + if (!conf) return 0; -} + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); -int -iot_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, - dict_t *xdata) -{ - STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); - return 0; -} + gf_proc_dump_add_section("%s", key_prefix); + gf_proc_dump_write("maximum_threads_count", "%d", conf->max_count); + gf_proc_dump_write("current_threads_count", "%d", conf->curr_count); + gf_proc_dump_write("sleep_count", "%d", conf->sleep_count); + gf_proc_dump_write("idle_time", "%d", conf->idle_time); + gf_proc_dump_write("stack_size", "%zd", conf->stack_size); + gf_proc_dump_write("max_high_priority_threads", "%d", + conf->ac_iot_limit[GF_FOP_PRI_HI]); + gf_proc_dump_write("max_normal_priority_threads", "%d", + conf->ac_iot_limit[GF_FOP_PRI_NORMAL]); + gf_proc_dump_write("max_low_priority_threads", "%d", + conf->ac_iot_limit[GF_FOP_PRI_LO]); + gf_proc_dump_write("max_least_priority_threads", "%d", + conf->ac_iot_limit[GF_FOP_PRI_LEAST]); + gf_proc_dump_write("current_high_priority_threads", "%d", + conf->ac_iot_count[GF_FOP_PRI_HI]); + gf_proc_dump_write("current_normal_priority_threads", "%d", + conf->ac_iot_count[GF_FOP_PRI_NORMAL]); + gf_proc_dump_write("current_low_priority_threads", "%d", + conf->ac_iot_count[GF_FOP_PRI_LO]); + gf_proc_dump_write("current_least_priority_threads", "%d", + conf->ac_iot_count[GF_FOP_PRI_LEAST]); + for (i = 0; i < GF_FOP_PRI_MAX; i++) { + if (!conf->queue_sizes[i]) + continue; + snprintf(key, sizeof(key), "%s_priority_queue_length", + iot_get_pri_meaning(i)); + gf_proc_dump_write(key, "%d", conf->queue_sizes[i]); + } -int -iot_readdirp_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset, dict_t *xdata) -{ - STACK_WIND (frame, iot_readdirp_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->readdirp, fd, size, offset, xdata); - return 0; + return 0; } - -int -iot_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_readdirp_stub (frame, iot_readdirp_wrapper, fd, size, - offset, xdata); - if (!stub) { - gf_log (this->private, GF_LOG_ERROR,"cannot get readdir stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (readdirp, frame, -1, -ret, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); +/* + * We use a decay model to keep track and make sure we're not spawning new + * threads too 
often. Each increment adds a large value to a counter, and that + counter keeps ticking back down to zero over a fairly long period. For + example, let's use ONE_WEEK=604800 seconds, and we want to detect when we + have N=3 increments during that time. Thus, our threshold is + (N-1)*ONE_WEEK. To see how it works, look at three examples. + * + * (a) Two events close together, then one more almost a week later. The + first two events push our counter to 2*ONE_WEEK plus a bit. At the third + event, we decay down to ONE_WEEK plus a bit and then add ONE_WEEK for the + new event, exceeding our threshold. + * + * (b) One event, then two more almost a week later. At the time of the + second and third events, the counter is already non-zero, so when we add + 2*ONE_WEEK we exceed again. + * + * (c) Three events, spaced three days apart. At the time of the second + event, we decay down to approximately ONE_WEEK*4/7 and then add another + ONE_WEEK. At the third event, we decay again down to ONE_WEEK*8/7 and add + another ONE_WEEK, so boom. + * + * Note that in all three cases if that last event came a day later our counter + would have decayed a bit more and we would *not* exceed our threshold. It's + not exactly the same as a precise "three in one week" limit, but it's very + close and it allows the same kind of tweaking while requiring only constant + space - no arrays of variable length N to allocate or maintain. All we need + (for each queue) is the value plus the time of the last update. + */ + +typedef struct { + time_t update_time; + uint32_t value; +} threshold_t; +/* + * Variables so that I can hack these for testing. + * TBD: make these tunable? + */ +static uint32_t THRESH_SECONDS = 604800; +static uint32_t THRESH_EVENTS = 3; +static uint32_t THRESH_LIMIT = 1209600; /* SECONDS * (EVENTS-1) */ + +static void +iot_apply_event(xlator_t *this, threshold_t *thresh) +{ + time_t delta, now = gf_time(); + + /* Refresh for manual testing/debugging. It's cheap. */ + THRESH_LIMIT = THRESH_SECONDS * (THRESH_EVENTS - 1); + + if (thresh->value && thresh->update_time) { + delta = now - thresh->update_time; + /* Be careful about underflow. */ + if (thresh->value <= delta) { + thresh->value = 0; + } else { + thresh->value -= delta; + } + } + + thresh->value += THRESH_SECONDS; + if (thresh->value >= THRESH_LIMIT) { + gf_log(this->name, GF_LOG_EMERG, "watchdog firing too often"); + /* + * The default action for SIGTRAP is to dump core, but the fact + * that it's distinct from other signals we use means that + * there are other possibilities as well (e.g. drop into gdb or + * invoke a special handler). + */ + kill(getpid(), SIGTRAP); + } + + thresh->update_time = now; +} + +static void * +iot_watchdog(void *arg) +{ + xlator_t *this = arg; + iot_conf_t *priv = this->private; + int i; + int bad_times[GF_FOP_PRI_MAX] = { + 0, + }; + threshold_t thresholds[GF_FOP_PRI_MAX] = {{ + 0, + }}; + + for (;;) { + sleep(max(priv->watchdog_secs / 5, 1)); + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + pthread_mutex_lock(&priv->mutex); + for (i = 0; i < GF_FOP_PRI_MAX; ++i) { + if (priv->queue_marked[i]) { + if (++bad_times[i] >= 5) { + gf_log(this->name, GF_LOG_WARNING, "queue %d stalled", i); + iot_apply_event(this, &thresholds[i]); + /* + * We might not get here if the event + * put us over our threshold. 
+ */ + ++(priv->ac_iot_limit[i]); + bad_times[i] = 0; } + } else { + bad_times[i] = 0; + } + priv->queue_marked[i] = (priv->queue_sizes[i] > 0); } - return 0; -} + pthread_mutex_unlock(&priv->mutex); + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + } - -int -iot_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, - dict_t *xdata) -{ - STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, entries, xdata); - return 0; + /* NOTREACHED */ + return NULL; } - -int -iot_readdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset, dict_t *xdata) +static void +start_iot_watchdog(xlator_t *this) { - STACK_WIND (frame, iot_readdir_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->readdir, fd, size, offset, xdata); - return 0; -} + iot_conf_t *priv = this->private; + int ret; + if (priv->watchdog_running) { + return; + } -int -iot_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_readdir_stub (frame, iot_readdir_wrapper, fd, size, offset, - xdata); - if (!stub) { - gf_log (this->private, GF_LOG_ERROR,"cannot get readdir stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (readdir, frame, -1, -ret, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} - -int -iot_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, xdata); - return 0; + ret = pthread_create(&priv->watchdog_thread, NULL, iot_watchdog, this); + if (ret == 0) { + priv->watchdog_running = _gf_true; + } else { + gf_log(this->name, GF_LOG_WARNING, + "pthread_create(iot_watchdog) failed"); + } } - -int -iot_inodelk_wrapper (call_frame_t *frame, xlator_t *this, const char *volume, - loc_t *loc, int32_t cmd, struct gf_flock *lock, - dict_t *xdata) +static void +stop_iot_watchdog(xlator_t *this) { - STACK_WIND (frame, iot_inodelk_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->inodelk, volume, loc, cmd, lock, - xdata); - return 0; -} - + iot_conf_t *priv = this->private; -int -iot_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock, - dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; + if (!priv->watchdog_running) { + return; + } - stub = fop_inodelk_stub (frame, iot_inodelk_wrapper, - volume, loc, cmd, lock, xdata); - if (!stub) { - ret = -ENOMEM; - goto out; - } + if (pthread_cancel(priv->watchdog_thread) != 0) { + gf_log(this->name, GF_LOG_WARNING, + "pthread_cancel(iot_watchdog) failed"); + } - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (inodelk, frame, -1, -ret, NULL); + if (pthread_join(priv->watchdog_thread, NULL) != 0) { + gf_log(this->name, GF_LOG_WARNING, "pthread_join(iot_watchdog) failed"); + } - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + /* Failure probably means it's already dead. 
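The decay arithmetic described in the comment above can be checked in isolation. The standalone sketch below replays scenario (c) from that comment (three events spaced three days apart, ONE_WEEK = 604800, N = 3) against a simplified copy of the iot_apply_event() logic; it is illustrative only and not part of this patch:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define ONE_WEEK 604800u
#define LIMIT (2u * ONE_WEEK) /* (N - 1) * ONE_WEEK with N = 3 */

int
main(void)
{
    uint32_t events[] = {0, 3 * 86400u, 6 * 86400u}; /* three days apart */
    uint32_t value = 0;
    uint32_t last = 0;
    int i;

    for (i = 0; i < 3; i++) {
        uint32_t delta = events[i] - last;

        /* decay since the previous event: 0, then 345600 (ONE_WEEK*4/7),
         * then 691200 (ONE_WEEK*8/7) */
        value = (value <= delta) ? 0 : value - delta;
        /* apply the new event, as iot_apply_event() does */
        value += ONE_WEEK;
        last = events[i];

        printf("event %d: value=%" PRIu32 "%s\n", i + 1, value,
               value >= LIMIT ? " -> over threshold, watchdog would fire" : "");
    }
    return 0;
}

Running it prints 604800, 950400 and 1296000; only the last value crosses the 1209600 threshold, matching the "so boom" outcome of example (c).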
*/ + priv->watchdog_running = _gf_false; } int -iot_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +reconfigure(xlator_t *this, dict_t *options) { - STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno, xdata); - return 0; -} + iot_conf_t *conf = NULL; + int ret = -1; + conf = this->private; + if (!conf) + goto out; -int -iot_finodelk_wrapper (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, - struct gf_flock *lock, dict_t *xdata) -{ - STACK_WIND (frame, iot_finodelk_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->finodelk, volume, fd, cmd, lock, - xdata); - return 0; -} + GF_OPTION_RECONF("thread-count", conf->max_count, options, int32, out); + GF_OPTION_RECONF("high-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_HI], + options, int32, out); -int -iot_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock, - dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; + GF_OPTION_RECONF("normal-prio-threads", + conf->ac_iot_limit[GF_FOP_PRI_NORMAL], options, int32, + out); - stub = fop_finodelk_stub (frame, iot_finodelk_wrapper, - volume, fd, cmd, lock, xdata); - if (!stub) { - gf_log (this->private, GF_LOG_ERROR,"cannot get finodelk stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + GF_OPTION_RECONF("low-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_LO], + options, int32, out); - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (finodelk, frame, -1, -ret, NULL); + GF_OPTION_RECONF("least-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_LEAST], + options, int32, out); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} + GF_OPTION_RECONF("enable-least-priority", conf->least_priority, options, + bool, out); -int -iot_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, xdata); - return 0; -} + GF_OPTION_RECONF("cleanup-disconnected-reqs", + conf->cleanup_disconnected_reqs, options, bool, out); + GF_OPTION_RECONF("watchdog-secs", conf->watchdog_secs, options, int32, out); -int -iot_entrylk_wrapper (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) -{ - STACK_WIND (frame, iot_entrylk_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->entrylk, - volume, loc, basename, cmd, type, xdata); - return 0; -} - + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out); -int -iot_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_entrylk_stub (frame, iot_entrylk_wrapper, - volume, loc, basename, cmd, type, xdata); - if (!stub) { - gf_log (this->private, GF_LOG_ERROR,"cannot get entrylk stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + if (conf->watchdog_secs > 0) { + start_iot_watchdog(this); + } else { + stop_iot_watchdog(this); + } - ret = iot_schedule (frame, this, stub); + ret = 0; out: - if (ret < 0) { - STACK_UNWIND_STRICT (entrylk, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + return ret; } int -iot_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t 
*xdata) +init(xlator_t *this) { - STACK_UNWIND_STRICT (fentrylk, frame, op_ret, op_errno, xdata); - return 0; -} + iot_conf_t *conf = NULL; + int ret = -1; + int i = 0; + if (!this->children || this->children->next) { + gf_smsg("io-threads", GF_LOG_ERROR, 0, + IO_THREADS_MSG_XLATOR_CHILD_MISCONFIGURED, NULL); + goto out; + } -int -iot_fentrylk_wrapper (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) -{ - STACK_WIND (frame, iot_fentrylk_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fentrylk, - volume, fd, basename, cmd, type, xdata); - return 0; -} + if (!this->parents) { + gf_smsg(this->name, GF_LOG_WARNING, 0, IO_THREADS_MSG_VOL_MISCONFIGURED, + NULL); + } + conf = (void *)GF_CALLOC(1, sizeof(*conf), gf_iot_mt_iot_conf_t); + if (conf == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_THREADS_MSG_OUT_OF_MEMORY, + NULL); + goto out; + } -int -iot_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fentrylk_stub (frame, iot_fentrylk_wrapper, - volume, fd, basename, cmd, type, xdata); - if (!stub) { - gf_log (this->private, GF_LOG_ERROR,"cannot get fentrylk stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + if ((ret = pthread_cond_init(&conf->cond, NULL)) != 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_THREADS_MSG_PTHREAD_INIT_FAILED, + "pthread_cond_init ret=%d", ret, NULL); + goto out; + } + conf->cond_inited = _gf_true; - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fentrylk, frame, -1, -ret, NULL); + if ((ret = pthread_mutex_init(&conf->mutex, NULL)) != 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_THREADS_MSG_PTHREAD_INIT_FAILED, + "pthread_mutex_init ret=%d", ret, NULL); + goto out; + } + conf->mutex_inited = _gf_true; - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} + ret = set_stack_size(conf); + if (ret != 0) + goto out; -int -iot_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata) -{ - STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, xattr, xdata); - return 0; -} + ret = -1; + GF_OPTION_INIT("thread-count", conf->max_count, int32, out); -int -iot_xattrop_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) -{ - STACK_WIND (frame, iot_xattrop_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->xattrop, loc, optype, xattr, xdata); - return 0; -} + GF_OPTION_INIT("high-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_HI], + int32, out); + GF_OPTION_INIT("normal-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_NORMAL], + int32, out); -int -iot_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; + GF_OPTION_INIT("low-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_LO], int32, + out); - stub = fop_xattrop_stub (frame, iot_xattrop_wrapper, loc, optype, - xattr, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create xattrop stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + GF_OPTION_INIT("least-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_LEAST], + int32, out); - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (xattrop, frame, -1, 
-ret, NULL, NULL); + GF_OPTION_INIT("idle-time", conf->idle_time, int32, out); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} + GF_OPTION_INIT("enable-least-priority", conf->least_priority, bool, out); + GF_OPTION_INIT("cleanup-disconnected-reqs", conf->cleanup_disconnected_reqs, + bool, out); -int -iot_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata) -{ - STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, xattr, xdata); - return 0; -} + GF_OPTION_INIT("pass-through", this->pass_through, bool, out); -int -iot_fxattrop_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) -{ - STACK_WIND (frame, iot_fxattrop_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fxattrop, fd, optype, xattr, xdata); - return 0; -} + conf->this = this; + GF_ATOMIC_INIT(conf->stub_cnt, 0); + for (i = 0; i < GF_FOP_PRI_MAX; i++) { + INIT_LIST_HEAD(&conf->clients[i]); + INIT_LIST_HEAD(&conf->no_client[i].clients); + INIT_LIST_HEAD(&conf->no_client[i].reqs); + } -int -iot_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fxattrop_stub (frame, iot_fxattrop_wrapper, fd, optype, - xattr, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create fxattrop stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + if (!this->pass_through) { + ret = iot_workers_scale(conf); - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fxattrop, frame, -1, -ret, NULL, NULL); - if (stub != NULL) { - call_stub_destroy (stub); - } + if (ret == -1) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + IO_THREADS_MSG_WORKER_THREAD_INIT_FAILED, NULL); + goto out; } - return 0; -} + } + this->private = conf; -int32_t -iot_rchecksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, uint32_t weak_checksum, - uint8_t *strong_checksum, dict_t *xdata) -{ - STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno, weak_checksum, - strong_checksum, xdata); - return 0; -} + conf->watchdog_secs = 0; + GF_OPTION_INIT("watchdog-secs", conf->watchdog_secs, int32, out); + if (conf->watchdog_secs > 0) { + start_iot_watchdog(this); + } - -int32_t -iot_rchecksum_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset, int32_t len, dict_t *xdata) -{ - STACK_WIND (frame, iot_rchecksum_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata); - return 0; -} - - -int32_t -iot_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - int32_t len, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_rchecksum_stub (frame, iot_rchecksum_wrapper, fd, offset, - len, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create rchecksum stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); + ret = 0; out: - if (ret < 0) { - STACK_UNWIND_STRICT (rchecksum, frame, -1, -ret, -1, NULL, NULL); - if (stub != NULL) { - call_stub_destroy (stub); - } - } + if (ret) + GF_FREE(conf); - return 0; + return ret; } -int -iot_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, dict_t *xdata) +static void +iot_exit_threads(iot_conf_t *conf) { - STACK_UNWIND_STRICT 
(fallocate, frame, op_ret, op_errno, preop, postop, - xdata); - return 0; + pthread_mutex_lock(&conf->mutex); + { + conf->down = _gf_true; + /*Let all the threads know that xl is going down*/ + pthread_cond_broadcast(&conf->cond); + while (conf->curr_count) /*Wait for threads to exit*/ + pthread_cond_wait(&conf->cond, &conf->mutex); + } + pthread_mutex_unlock(&conf->mutex); } - int -iot_fallocate_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, - off_t offset, size_t len, dict_t *xdata) +notify(xlator_t *this, int32_t event, void *data, ...) { - STACK_WIND (frame, iot_fallocate_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fallocate, fd, mode, offset, len, - xdata); - return 0; -} - + iot_conf_t *conf = this->private; + xlator_t *victim = data; + uint64_t stub_cnt = 0; + struct timespec sleep_till = { + 0, + }; -int -iot_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, - off_t offset, size_t len, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fallocate_stub(frame, iot_fallocate_wrapper, fd, mode, offset, - len, xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create fallocate stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fallocate, frame, -1, -ret, NULL, NULL, - NULL); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} - -int -iot_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, dict_t *xdata) -{ - STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, preop, postop, - xdata); - return 0; -} - - -int -iot_discard_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata) -{ - STACK_WIND (frame, iot_discard_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->discard, fd, offset, len, xdata); - return 0; -} - - -int -iot_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_discard_stub(frame, iot_discard_wrapper, fd, offset, len, - xdata); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create discard stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (discard, frame, -1, -ret, NULL, NULL, - NULL); - if (stub != NULL) { - call_stub_destroy (stub); + if (GF_EVENT_PARENT_DOWN == event) { + if (victim->cleanup_starting) { + /* Wait for draining stub from queue before notify PARENT_DOWN */ + stub_cnt = GF_ATOMIC_GET(conf->stub_cnt); + if (stub_cnt) { + timespec_now_realtime(&sleep_till); + sleep_till.tv_sec += 1; + pthread_mutex_lock(&conf->mutex); + { + while (stub_cnt) { + (void)pthread_cond_timedwait(&conf->cond, &conf->mutex, + &sleep_till); + stub_cnt = GF_ATOMIC_GET(conf->stub_cnt); + } } - } - return 0; -} + pthread_mutex_unlock(&conf->mutex); + } -int -__iot_workers_scale (iot_conf_t *conf) -{ - int scale = 0; - int diff = 0; - pthread_t thread; - int ret = 0; - int i = 0; - - for (i = 0; i < IOT_PRI_MAX; i++) - scale += min (conf->queue_sizes[i], conf->ac_iot_limit[i]); - - if (scale < IOT_MIN_THREADS) - scale = IOT_MIN_THREADS; - - if (scale > conf->max_count) - scale = conf->max_count; - - if (conf->curr_count < scale) { - diff = scale - conf->curr_count; + gf_log(this->name, GF_LOG_INFO, + "Notify 
GF_EVENT_PARENT_DOWN for brick %s", victim->name); + } else { + iot_exit_threads(conf); } + } - while (diff) { - diff --; - - ret = pthread_create (&thread, &conf->w_attr, iot_worker, conf); - if (ret == 0) { - conf->curr_count++; - gf_log (conf->this->name, GF_LOG_DEBUG, - "scaled threads to %d (queue_size=%d/%d)", - conf->curr_count, conf->queue_size, scale); - } else { - break; - } - } - - return diff; -} - - -int -iot_workers_scale (iot_conf_t *conf) -{ - int ret = -1; - - if (conf == NULL) { - ret = -EINVAL; - goto out; + if (GF_EVENT_CHILD_DOWN == event) { + if (victim->cleanup_starting) { + iot_exit_threads(conf); + gf_log(this->name, GF_LOG_INFO, + "Notify GF_EVENT_CHILD_DOWN for brick %s", victim->name); } + } - pthread_mutex_lock (&conf->mutex); - { - ret = __iot_workers_scale (conf); - } - pthread_mutex_unlock (&conf->mutex); + default_notify(this, event, data); -out: - return ret; + return 0; } - void -set_stack_size (iot_conf_t *conf) -{ - int err = 0; - size_t stacksize = IOT_THREAD_STACK_SIZE; - xlator_t *this = NULL; - - this = THIS; - - pthread_attr_init (&conf->w_attr); - err = pthread_attr_setstacksize (&conf->w_attr, stacksize); - if (err == EINVAL) { - err = pthread_attr_getstacksize (&conf->w_attr, &stacksize); - if (!err) - gf_log (this->name, GF_LOG_WARNING, - "Using default thread stack size %zd", - stacksize); - else - gf_log (this->name, GF_LOG_WARNING, - "Using default thread stack size"); - } - - conf->stack_size = stacksize; -} - - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_iot_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - -int -iot_priv_dump (xlator_t *this) +fini(xlator_t *this) { - iot_conf_t *conf = NULL; - char key_prefix[GF_DUMP_MAX_BUF_LEN]; + iot_conf_t *conf = this->private; - if (!this) - return 0; + if (!conf) + return; - conf = this->private; - if (!conf) - return 0; + if (conf->mutex_inited && conf->cond_inited) + iot_exit_threads(conf); - snprintf (key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, - this->name); + if (conf->cond_inited) + pthread_cond_destroy(&conf->cond); - gf_proc_dump_add_section(key_prefix); + if (conf->mutex_inited) + pthread_mutex_destroy(&conf->mutex); - gf_proc_dump_write("maximum_threads_count", "%d", conf->max_count); - gf_proc_dump_write("current_threads_count", "%d", conf->curr_count); - gf_proc_dump_write("sleep_count", "%d", conf->sleep_count); - gf_proc_dump_write("idle_time", "%d", conf->idle_time); - gf_proc_dump_write("stack_size", "%zd", conf->stack_size); - gf_proc_dump_write("high_priority_threads", "%d", - conf->ac_iot_limit[IOT_PRI_HI]); - gf_proc_dump_write("normal_priority_threads", "%d", - conf->ac_iot_limit[IOT_PRI_NORMAL]); - gf_proc_dump_write("low_priority_threads", "%d", - conf->ac_iot_limit[IOT_PRI_LO]); - gf_proc_dump_write("least_priority_threads", "%d", - conf->ac_iot_limit[IOT_PRI_LEAST]); + stop_iot_watchdog(this); - gf_proc_dump_write("cached least rate", "%u", - conf->throttle.cached_rate); - gf_proc_dump_write("least rate limit", "%u", conf->throttle.rate_limit); + GF_FREE(conf); - return 0; + this->private = NULL; + return; } int -reconfigure (xlator_t *this, dict_t *options) +iot_client_destroy(xlator_t *this, client_t *client) { - iot_conf_t *conf = NULL; - int ret = -1; - - conf = this->private; - if (!conf) - goto out; + void *tmp = NULL; - GF_OPTION_RECONF ("thread-count", conf->max_count, 
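The iot_exit_threads() helper above, shared by notify() (on PARENT_DOWN/CHILD_DOWN for a brick being cleaned up) and fini(), is a standard condition-variable shutdown handshake: set a "down" flag under the queue mutex, broadcast so every sleeping worker re-evaluates the flag, then wait on the same condition until the running-thread count reaches zero. A minimal standalone sketch of that handshake with plain pthreads; the pool type and field names are placeholders, not the iot_conf_t fields from the patch:

#include <pthread.h>
#include <stdbool.h>

struct worker_pool {
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    int curr_count; /* number of worker threads still running */
    bool down;      /* set once, when the pool is shutting down */
};

/* Each worker, before exiting, must do (under the mutex):
 *     pool->curr_count--; pthread_cond_broadcast(&pool->cond);      */
static void pool_shutdown(struct worker_pool *pool)
{
    pthread_mutex_lock(&pool->mutex);
    pool->down = true;
    /* Wake every worker sleeping on the queue so it sees 'down'. */
    pthread_cond_broadcast(&pool->cond);
    /* Wait until the last worker has exited. */
    while (pool->curr_count > 0)
        pthread_cond_wait(&pool->cond, &pool->mutex);
    pthread_mutex_unlock(&pool->mutex);
}

Note that fini() above destroys the mutex and condvar only when the matching mutex_inited/cond_inited flags were set, so a partially failed init() never destroys objects that were never initialized.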
options, int32, out); + if (client_ctx_del(client, this, &tmp) == 0) { + GF_FREE(tmp); + } - GF_OPTION_RECONF ("high-prio-threads", - conf->ac_iot_limit[IOT_PRI_HI], options, int32, out); - - GF_OPTION_RECONF ("normal-prio-threads", - conf->ac_iot_limit[IOT_PRI_NORMAL], options, int32, - out); - - GF_OPTION_RECONF ("low-prio-threads", - conf->ac_iot_limit[IOT_PRI_LO], options, int32, out); - - GF_OPTION_RECONF ("least-prio-threads", - conf->ac_iot_limit[IOT_PRI_LEAST], options, int32, - out); - GF_OPTION_RECONF ("enable-least-priority", conf->least_priority, - options, bool, out); - - GF_OPTION_RECONF("least-rate-limit", conf->throttle.rate_limit, options, - int32, out); - - ret = 0; -out: - return ret; + return 0; } +static int +iot_disconnect_cbk(xlator_t *this, client_t *client) +{ + int i; + call_stub_t *curr; + call_stub_t *next; + iot_conf_t *conf = this->private; + iot_client_ctx_t *ctx; -int -init (xlator_t *this) -{ - iot_conf_t *conf = NULL; - int ret = -1; - int i = 0; - - if (!this->children || this->children->next) { - gf_log ("io-threads", GF_LOG_ERROR, - "FATAL: iot not configured with exactly one child"); - goto out; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - conf = (void *) GF_CALLOC (1, sizeof (*conf), - gf_iot_mt_iot_conf_t); - if (conf == NULL) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory"); - goto out; - } - - if ((ret = pthread_cond_init(&conf->cond, NULL)) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "pthread_cond_init failed (%d)", ret); - goto out; - } - - if ((ret = pthread_mutex_init(&conf->mutex, NULL)) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "pthread_mutex_init failed (%d)", ret); - goto out; - } - - set_stack_size (conf); - - GF_OPTION_INIT ("thread-count", conf->max_count, int32, out); - - GF_OPTION_INIT ("high-prio-threads", - conf->ac_iot_limit[IOT_PRI_HI], int32, out); - - GF_OPTION_INIT ("normal-prio-threads", - conf->ac_iot_limit[IOT_PRI_NORMAL], int32, out); - - GF_OPTION_INIT ("low-prio-threads", - conf->ac_iot_limit[IOT_PRI_LO], int32, out); - - GF_OPTION_INIT ("least-prio-threads", - conf->ac_iot_limit[IOT_PRI_LEAST], int32, out); - - GF_OPTION_INIT ("idle-time", conf->idle_time, int32, out); - GF_OPTION_INIT ("enable-least-priority", conf->least_priority, - bool, out); - - GF_OPTION_INIT("least-rate-limit", conf->throttle.rate_limit, int32, - out); - if ((ret = pthread_mutex_init(&conf->throttle.lock, NULL)) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "pthread_mutex_init failed (%d)", ret); - goto out; - } - - conf->this = this; - - for (i = 0; i < IOT_PRI_MAX; i++) { - INIT_LIST_HEAD (&conf->reqs[i]); - } - - ret = iot_workers_scale (conf); + if (!conf || !conf->cleanup_disconnected_reqs) { + goto out; + } - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "cannot initialize worker threads, exiting init"); - goto out; + pthread_mutex_lock(&conf->mutex); + for (i = 0; i < GF_FOP_PRI_MAX; i++) { + ctx = &conf->no_client[i]; + list_for_each_entry_safe(curr, next, &ctx->reqs, list) + { + if (curr->frame->root->client != client) { + continue; + } + gf_log(this->name, GF_LOG_INFO, + "poisoning %s fop at %p for client %s", + gf_fop_list[curr->fop], curr, client->client_uid); + curr->poison = _gf_true; } + } + pthread_mutex_unlock(&conf->mutex); - this->private = conf; - ret = 0; out: - if (ret) - GF_FREE (conf); - - return ret; -} - - -void -fini (xlator_t *this) -{ - iot_conf_t *conf = this->private; - - GF_FREE (conf); - - this->private = NULL; - return; + 
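iot_disconnect_cbk() above implements the cleanup-disconnected-reqs behaviour by walking the queued call stubs and "poisoning" the ones owned by the client that just disconnected, so workers discard them instead of executing work nobody is waiting for. A simplified sketch of the same marking pass, using a plain singly linked list and a string client id in place of the GlusterFS call stub and client objects:

#include <pthread.h>
#include <stdbool.h>
#include <string.h>

struct request {
    struct request *next;
    const char *client_uid; /* owner of this queued request */
    bool poison;            /* true: worker must drop, not execute */
};

/* Mark every queued request owned by client_uid as poisoned.
 * queue_lock protects the list; workers test req->poison after dequeue. */
static void poison_client_requests(struct request *head,
                                   pthread_mutex_t *queue_lock,
                                   const char *client_uid)
{
    pthread_mutex_lock(queue_lock);
    for (struct request *req = head; req; req = req->next) {
        if (strcmp(req->client_uid, client_uid) == 0)
            req->poison = true;
    }
    pthread_mutex_unlock(queue_lock);
}

Marking rather than unlinking keeps all dequeueing in one place (the worker), which is presumably why the patch flags the stubs here instead of removing them from the queue.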
return 0; } struct xlator_dumpops dumpops = { - .priv = iot_priv_dump, + .priv = iot_priv_dump, }; struct xlator_fops fops = { - .open = iot_open, - .create = iot_create, - .readv = iot_readv, - .writev = iot_writev, - .flush = iot_flush, - .fsync = iot_fsync, - .lk = iot_lk, - .stat = iot_stat, - .fstat = iot_fstat, - .truncate = iot_truncate, - .ftruncate = iot_ftruncate, - .unlink = iot_unlink, - .lookup = iot_lookup, - .setattr = iot_setattr, - .fsetattr = iot_fsetattr, - .access = iot_access, - .readlink = iot_readlink, - .mknod = iot_mknod, - .mkdir = iot_mkdir, - .rmdir = iot_rmdir, - .symlink = iot_symlink, - .rename = iot_rename, - .link = iot_link, - .opendir = iot_opendir, - .fsyncdir = iot_fsyncdir, - .statfs = iot_statfs, - .setxattr = iot_setxattr, - .getxattr = iot_getxattr, - .fgetxattr = iot_fgetxattr, - .fsetxattr = iot_fsetxattr, - .removexattr = iot_removexattr, - .fremovexattr = iot_fremovexattr, - .readdir = iot_readdir, - .readdirp = iot_readdirp, - .inodelk = iot_inodelk, - .finodelk = iot_finodelk, - .entrylk = iot_entrylk, - .fentrylk = iot_fentrylk, - .xattrop = iot_xattrop, - .fxattrop = iot_fxattrop, - .rchecksum = iot_rchecksum, - .fallocate = iot_fallocate, - .discard = iot_discard, + .open = iot_open, + .create = iot_create, + .readv = iot_readv, + .writev = iot_writev, + .flush = iot_flush, + .fsync = iot_fsync, + .lk = iot_lk, + .stat = iot_stat, + .fstat = iot_fstat, + .truncate = iot_truncate, + .ftruncate = iot_ftruncate, + .unlink = iot_unlink, + .lookup = iot_lookup, + .setattr = iot_setattr, + .fsetattr = iot_fsetattr, + .access = iot_access, + .readlink = iot_readlink, + .mknod = iot_mknod, + .mkdir = iot_mkdir, + .rmdir = iot_rmdir, + .symlink = iot_symlink, + .rename = iot_rename, + .link = iot_link, + .opendir = iot_opendir, + .fsyncdir = iot_fsyncdir, + .statfs = iot_statfs, + .setxattr = iot_setxattr, + .getxattr = iot_getxattr, + .fgetxattr = iot_fgetxattr, + .fsetxattr = iot_fsetxattr, + .removexattr = iot_removexattr, + .fremovexattr = iot_fremovexattr, + .readdir = iot_readdir, + .readdirp = iot_readdirp, + .inodelk = iot_inodelk, + .finodelk = iot_finodelk, + .entrylk = iot_entrylk, + .fentrylk = iot_fentrylk, + .xattrop = iot_xattrop, + .fxattrop = iot_fxattrop, + .rchecksum = iot_rchecksum, + .fallocate = iot_fallocate, + .discard = iot_discard, + .zerofill = iot_zerofill, + .seek = iot_seek, + .lease = iot_lease, + .getactivelk = iot_getactivelk, + .setactivelk = iot_setactivelk, + .put = iot_put, }; -struct xlator_cbks cbks; +struct xlator_cbks cbks = { + .client_destroy = iot_client_destroy, + .client_disconnect = iot_disconnect_cbk, +}; struct volume_options options[] = { - { .key = {"thread-count"}, - .type = GF_OPTION_TYPE_INT, - .min = IOT_MIN_THREADS, - .max = IOT_MAX_THREADS, - .default_value = "16", - .description = "Number of threads in IO threads translator which " - "perform concurrent IO operations" - - }, - { .key = {"high-prio-threads"}, - .type = GF_OPTION_TYPE_INT, - .min = IOT_MIN_THREADS, - .max = IOT_MAX_THREADS, - .default_value = "16", - .description = "Max number of threads in IO threads translator which " - "perform high priority IO operations at a given time" - - }, - { .key = {"normal-prio-threads"}, - .type = GF_OPTION_TYPE_INT, - .min = IOT_MIN_THREADS, - .max = IOT_MAX_THREADS, - .default_value = "16", - .description = "Max number of threads in IO threads translator which " - "perform normal priority IO operations at a given time" - - }, - { .key = {"low-prio-threads"}, - .type = GF_OPTION_TYPE_INT, - .min = 
IOT_MIN_THREADS, - .max = IOT_MAX_THREADS, - .default_value = "16", - .description = "Max number of threads in IO threads translator which " - "perform low priority IO operations at a given time" - - }, - { .key = {"least-prio-threads"}, - .type = GF_OPTION_TYPE_INT, - .min = IOT_MIN_THREADS, - .max = IOT_MAX_THREADS, - .default_value = "1", - .description = "Max number of threads in IO threads translator which " - "perform least priority IO operations at a given time" - }, - { .key = {"enable-least-priority"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - .description = "Enable/Disable least priority" - }, - {.key = {"idle-time"}, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = 0x7fffffff, - .default_value = "120", - }, - {.key = {"least-rate-limit"}, - .type = GF_OPTION_TYPE_INT, - .min = 0, - .max = INT_MAX, - .default_value = "0", - .description = "Max number of least priority operations to handle " - "per-second" - }, - { .key = {NULL}, - }, + {.key = {"thread-count"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "16", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"io-threads"}, + /*.option = "thread-count"*/ + .description = "Number of threads in IO threads translator which " + "perform concurrent IO operations" + + }, + {.key = {"high-prio-threads"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "16", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"io-threads"}, + .description = "Max number of threads in IO threads translator which " + "perform high priority IO operations at a given time" + + }, + {.key = {"normal-prio-threads"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "16", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"io-threads"}, + .description = "Max number of threads in IO threads translator which " + "perform normal priority IO operations at a given time" + + }, + {.key = {"low-prio-threads"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "16", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"io-threads"}, + .description = "Max number of threads in IO threads translator which " + "perform low priority IO operations at a given time" + + }, + {.key = {"least-prio-threads"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "1", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"io-threads"}, + .description = "Max number of threads in IO threads translator which " + "perform least priority IO operations at a given time"}, + {.key = {"enable-least-priority"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = SITE_H_ENABLE_LEAST_PRIORITY, + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"io-threads"}, + .description = "Enable/Disable least priority"}, + { + .key = {"idle-time"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 0x7fffffff, + .default_value = "120", + }, + {.key = {"watchdog-secs"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = 0, + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"io-threads"}, + .description = "Number of seconds a queue must be stalled before " + "starting an 'emergency' thread."}, + {.key = {"cleanup-disconnected-reqs"}, + 
.type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"io-threads"}, + .description = "'Poison' queued requests when a client disconnects"}, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"io-threads"}, + .description = "Enable/Disable io threads translator"}, + { + .key = {NULL}, + }, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "io-threads", + .category = GF_MAINTAINED, }; diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h index 1a9dee9ae2c..f54d2f4912d 100644 --- a/xlators/performance/io-threads/src/io-threads.h +++ b/xlators/performance/io-threads/src/io-threads.h @@ -11,80 +11,74 @@ #ifndef __IOT_H #define __IOT_H -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - - -#include "compat-errno.h" -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "common-utils.h" -#include "list.h" +#include <glusterfs/compat-errno.h> +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/list.h> #include <stdlib.h> -#include "locking.h" +#include <glusterfs/locking.h> #include "iot-mem-types.h" #include <semaphore.h> -#include "statedump.h" - +#include <glusterfs/statedump.h> struct iot_conf; -#define MAX_IDLE_SKEW 4 /* In secs */ -#define skew_sec_idle_time(sec) ((sec) + (random () % MAX_IDLE_SKEW)) -#define IOT_DEFAULT_IDLE 120 /* In secs. */ - -#define IOT_MIN_THREADS 1 -#define IOT_DEFAULT_THREADS 16 -#define IOT_MAX_THREADS 64 - +#define MAX_IDLE_SKEW 4 /* In secs */ +#define skew_sec_idle_time(sec) ((sec) + (random() % MAX_IDLE_SKEW)) +#define IOT_DEFAULT_IDLE 120 /* In secs. 
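The skew_sec_idle_time() macro above adds a random offset of up to MAX_IDLE_SKEW seconds to each worker's idle timeout, so idle workers that were created together do not all time out and exit in the same second. A sketch of how such a skewed timeout is typically fed to pthread_cond_timedwait(); the wait helper itself is illustrative, not code from this patch:

#include <pthread.h>
#include <stdlib.h>
#include <time.h>

#define MAX_IDLE_SKEW 4 /* seconds */
#define skew_sec_idle_time(sec) ((sec) + (random() % MAX_IDLE_SKEW))

/* Wait for work for roughly idle_secs, de-synchronized across workers.
 * Caller holds *mutex; returns ETIMEDOUT when the idle period expires. */
static int wait_for_work(pthread_cond_t *cond, pthread_mutex_t *mutex,
                         int idle_secs)
{
    struct timespec sleep_till;

    clock_gettime(CLOCK_REALTIME, &sleep_till);
    sleep_till.tv_sec += skew_sec_idle_time(idle_secs);

    return pthread_cond_timedwait(cond, mutex, &sleep_till);
}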
*/ -#define IOT_THREAD_STACK_SIZE ((size_t)(1024*1024)) +#define IOT_MIN_THREADS 1 +#define IOT_DEFAULT_THREADS 16 +#define IOT_MAX_THREADS 64 +#define IOT_THREAD_STACK_SIZE ((size_t)(256 * 1024)) -typedef enum { - IOT_PRI_HI = 0, /* low latency */ - IOT_PRI_NORMAL, /* normal */ - IOT_PRI_LO, /* bulk */ - IOT_PRI_LEAST, /* least */ - IOT_PRI_MAX, -} iot_pri_t; - -#define IOT_LEAST_THROTTLE_DELAY 1 /* sample interval in seconds */ -struct iot_least_throttle { - struct timeval sample_time; /* timestamp of current sample */ - uint32_t sample_cnt; /* sample count for active interval */ - uint32_t cached_rate; /* the most recently measured rate */ - int32_t rate_limit; /* user-specified rate limit */ - pthread_mutex_t lock; -}; +typedef struct { + struct list_head clients; + struct list_head reqs; +} iot_client_ctx_t; struct iot_conf { - pthread_mutex_t mutex; - pthread_cond_t cond; - - int32_t max_count; /* configured maximum */ - int32_t curr_count; /* actual number of threads running */ - int32_t sleep_count; - - int32_t idle_time; /* in seconds */ - - struct list_head reqs[IOT_PRI_MAX]; - - int32_t ac_iot_limit[IOT_PRI_MAX]; - int32_t ac_iot_count[IOT_PRI_MAX]; - int queue_sizes[IOT_PRI_MAX]; - int queue_size; - pthread_attr_t w_attr; - gf_boolean_t least_priority; /*Enable/Disable least-priority */ - - xlator_t *this; - size_t stack_size; - - struct iot_least_throttle throttle; + pthread_mutex_t mutex; + pthread_cond_t cond; + + int32_t max_count; /* configured maximum */ + int32_t curr_count; /* actual number of threads running */ + int32_t sleep_count; + + int32_t idle_time; /* in seconds */ + + struct list_head clients[GF_FOP_PRI_MAX]; + /* + * It turns out that there are several ways a frame can get to us + * without having an associated client (server_first_lookup was the + * first one I hit). Instead of trying to update all such callers, + * we use this to queue them. + */ + iot_client_ctx_t no_client[GF_FOP_PRI_MAX]; + + int32_t ac_iot_limit[GF_FOP_PRI_MAX]; + int32_t ac_iot_count[GF_FOP_PRI_MAX]; + int queue_sizes[GF_FOP_PRI_MAX]; + int32_t queue_size; + gf_atomic_t stub_cnt; + pthread_attr_t w_attr; + gf_boolean_t least_priority; /*Enable/Disable least-priority */ + + xlator_t *this; + size_t stack_size; + gf_boolean_t down; /*PARENT_DOWN event is notified*/ + gf_boolean_t mutex_inited; + gf_boolean_t cond_inited; + + int32_t watchdog_secs; + gf_boolean_t watchdog_running; + pthread_t watchdog_thread; + gf_boolean_t queue_marked[GF_FOP_PRI_MAX]; + gf_boolean_t cleanup_disconnected_reqs; }; typedef struct iot_conf iot_conf_t; diff --git a/xlators/performance/io-threads/src/iot-mem-types.h b/xlators/performance/io-threads/src/iot-mem-types.h index 4fa8302d1f4..29565f34dd4 100644 --- a/xlators/performance/io-threads/src/iot-mem-types.h +++ b/xlators/performance/io-threads/src/iot-mem-types.h @@ -8,15 +8,14 @@ cases as published by the Free Software Foundation. 
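The new iot_conf layout above replaces the old flat per-priority request lists with one queue per client, grouped by priority, plus a no_client bucket for frames that arrive without an associated client (the in-code comment names server_first_lookup as one such case). A rough plain-C picture of that two-level structure; list_node, PRI_MAX and the field names are stand-ins for the GlusterFS list_head and GF_FOP_PRI_MAX:

#define PRI_MAX 4 /* hi, normal, lo, least */

struct list_node {
    struct list_node *next, *prev;
};

/* One queue of pending requests per client, linked into a per-priority
 * list of clients that currently have queued work. */
struct client_ctx {
    struct list_node clients; /* membership in the per-priority client list */
    struct list_node reqs;    /* this client's pending requests */
};

struct pool_conf {
    struct list_node clients[PRI_MAX];    /* clients with queued work */
    struct client_ctx no_client[PRI_MAX]; /* frames without a client */
};

With this layout the scheduler can rotate between the clients of a priority level instead of draining one busy client's backlog first, and it can still queue client-less frames through the no_client fallback.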
*/ - #ifndef __IOT_MEM_TYPES_H__ #define __IOT_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_iot_mem_types_ { - gf_iot_mt_iot_conf_t = gf_common_mt_end + 1, - gf_iot_mt_end + gf_iot_mt_iot_conf_t = gf_common_mt_end + 1, + gf_iot_mt_client_ctx_t, + gf_iot_mt_end }; #endif - diff --git a/xlators/performance/md-cache/src/Makefile.am b/xlators/performance/md-cache/src/Makefile.am index 8c9f5a8582f..447ff0f30f0 100644 --- a/xlators/performance/md-cache/src/Makefile.am +++ b/xlators/performance/md-cache/src/Makefile.am @@ -1,14 +1,15 @@ xlator_LTLIBRARIES = md-cache.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -md_cache_la_LDFLAGS = -module -avoid-version +md_cache_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) md_cache_la_SOURCES = md-cache.c md_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = md-cache-mem-types.h +noinst_HEADERS = md-cache-mem-types.h md-cache-messages.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ -I$(CONTRIBDIR)/rbtree AM_CFLAGS = -Wall $(GF_CFLAGS) @@ -23,3 +24,6 @@ stat-prefetch-compat: install-exec-local: stat-prefetch-compat + +uninstall-local: + rm -f $(DESTDIR)$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance/stat-prefetch.so diff --git a/xlators/performance/md-cache/src/md-cache-mem-types.h b/xlators/performance/md-cache/src/md-cache-mem-types.h index 6634cf962a5..47a07005717 100644 --- a/xlators/performance/md-cache/src/md-cache-mem-types.h +++ b/xlators/performance/md-cache/src/md-cache-mem-types.h @@ -8,17 +8,16 @@ cases as published by the Free Software Foundation. */ - #ifndef __MDC_MEM_TYPES_H__ #define __MDC_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_mdc_mem_types_ { - gf_mdc_mt_mdc_local_t = gf_common_mt_end + 1, - gf_mdc_mt_md_cache_t, - gf_mdc_mt_mdc_conf_t, - gf_mdc_mt_end + gf_mdc_mt_mdc_local_t = gf_common_mt_end + 1, + gf_mdc_mt_md_cache_t, + gf_mdc_mt_mdc_conf_t, + gf_mdc_mt_mdc_ipc, + gf_mdc_mt_end }; #endif - diff --git a/xlators/performance/md-cache/src/md-cache-messages.h b/xlators/performance/md-cache/src/md-cache-messages.h new file mode 100644 index 00000000000..f367bad1991 --- /dev/null +++ b/xlators/performance/md-cache/src/md-cache-messages.h @@ -0,0 +1,29 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _MD_CACHE_MESSAGES_H_ +#define _MD_CACHE_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
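The comment above (and the GLFS_MSGID() list that follows) enforces an append-only discipline because message IDs are assigned by position: deleting an entry would shift every later ID, and old log files would no longer map to the right message. A hypothetical plain-C illustration of the same rule, assuming the macro ultimately boils down to consecutive values from a per-component base; the numbers here are made up:

/* Hypothetical component base; real values come from glfs-message-id.h. */
#define EXAMPLE_COMP_BASE 110000

enum example_msgid {
    EXAMPLE_MSG_FIRST = EXAMPLE_COMP_BASE, /* 110000 */
    EXAMPLE_MSG_OBSOLETE,                  /* 110001: unused, kept so that */
    EXAMPLE_MSG_THIRD,                     /* 110002 ...later IDs don't shift */
    /* append new IDs here only */
};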
+ */ + +GLFS_MSGID(MD_CACHE, MD_CACHE_MSG_NO_MEMORY, MD_CACHE_MSG_DISCARD_UPDATE, + MD_CACHE_MSG_CACHE_UPDATE, MD_CACHE_MSG_IPC_UPCALL_FAILED, + MD_CACHE_MSG_NO_XATTR_CACHE); + +#endif /* _MD_CACHE_MESSAGES_H_ */ diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c index 2f52cbe5807..a405be51f02 100644 --- a/xlators/performance/md-cache/src/md-cache.c +++ b/xlators/performance/md-cache/src/md-cache.c @@ -8,2083 +8,4013 @@ cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/defaults.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/syncop.h> #include "md-cache-mem-types.h" +#include <glusterfs/compat-errno.h> +#include <glusterfs/glusterfs-acl.h> +#include <glusterfs/defaults.h> +#include <glusterfs/upcall-utils.h> #include <assert.h> #include <sys/time.h> - +#include "md-cache-messages.h" +#include <glusterfs/statedump.h> +#include <glusterfs/atomic.h> /* TODO: - cache symlink() link names and nuke symlink-cache - send proper postbuf in setattr_cbk even when op_ret = -1 */ - -struct mdc_conf { - int timeout; - gf_boolean_t cache_posix_acl; - gf_boolean_t cache_selinux; - gf_boolean_t force_readdirp; +struct mdc_statfs_cache { + pthread_mutex_t lock; + time_t last_refreshed; /* (time_t)-1 if not yet initialized. */ + struct statvfs buf; }; +struct mdc_statistics { + gf_atomic_t stat_hit; /* No. of times lookup/stat was served from + mdc */ -static struct mdc_key { - const char *name; - int load; - int check; -} mdc_keys[] = { - { - .name = "system.posix_acl_access", - .load = 0, - .check = 1, - }, - { - .name = "system.posix_acl_default", - .load = 0, - .check = 1, - }, - { - .name = GF_SELINUX_XATTR_KEY, - .load = 0, - .check = 1, - }, - { - .name = "security.capability", - .load = 0, - .check = 1, - }, - { - .name = "gfid-req", - .load = 0, - .check = 1, - }, - { - .name = NULL, - .load = 0, - .check = 0, - } -}; + gf_atomic_t stat_miss; /* No. of times valid stat wasn't present in + mdc */ + gf_atomic_t xattr_hit; /* No. of times getxattr was served from mdc, + Note: this doesn't count the xattr served + from lookup */ -static uint64_t -gfid_to_ino (uuid_t gfid) -{ - uint64_t ino = 0; - int i = 0, j = 0; - - for (i = 15; i > (15 - 8); i--) { - ino += (uint64_t)(gfid[i]) << j; - j += 8; - } + gf_atomic_t xattr_miss; /* No. of times xattr req was WIND from mdc */ + gf_atomic_t negative_lookup; /* No. of negative lookups */ + gf_atomic_t nameless_lookup; /* No. of negative lookups that were sent + to bricks */ - return ino; -} + gf_atomic_t stat_invals; /* No. of invalidates received from upcall */ + gf_atomic_t xattr_invals; /* No. of invalidates received from upcall */ + gf_atomic_t need_lookup; /* No. 
of lookups issued, because other + xlators requested for explicit lookup */ +}; +struct mdc_conf { + uint32_t timeout; + gf_boolean_t cache_posix_acl; + gf_boolean_t cache_glusterfs_acl; + gf_boolean_t cache_selinux; + gf_boolean_t cache_capability; + gf_boolean_t cache_ima; + gf_boolean_t force_readdirp; + gf_boolean_t cache_swift_metadata; + gf_boolean_t cache_samba_metadata; + gf_boolean_t mdc_invalidation; + gf_boolean_t global_invalidation; + + time_t last_child_down; + gf_lock_t lock; + struct mdc_statistics mdc_counter; + gf_boolean_t cache_statfs; + struct mdc_statfs_cache statfs_cache; + char *mdc_xattr_str; + gf_atomic_int32_t generation; +}; struct mdc_local; typedef struct mdc_local mdc_local_t; -#define MDC_STACK_UNWIND(fop, frame, params ...) do { \ - mdc_local_t *__local = NULL; \ - xlator_t *__xl = NULL; \ - if (frame) { \ - __xl = frame->this; \ - __local = frame->local; \ - frame->local = NULL; \ - } \ - STACK_UNWIND_STRICT (fop, frame, params); \ - mdc_local_wipe (__xl, __local); \ - } while (0) - +#define MDC_STACK_UNWIND(fop, frame, params...) \ + do { \ + mdc_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + if (frame) { \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + mdc_local_wipe(__xl, __local); \ + } while (0) struct md_cache { - ia_prot_t md_prot; - uint32_t md_nlink; - uint32_t md_uid; - uint32_t md_gid; - uint32_t md_atime; - uint32_t md_atime_nsec; - uint32_t md_mtime; - uint32_t md_mtime_nsec; - uint32_t md_ctime; - uint32_t md_ctime_nsec; - uint64_t md_rdev; - uint64_t md_size; - uint64_t md_blocks; - dict_t *xattr; - char *linkname; - time_t ia_time; - time_t xa_time; - gf_lock_t lock; + ia_prot_t md_prot; + uint32_t md_nlink; + uint32_t md_uid; + uint32_t md_gid; + uint32_t md_atime_nsec; + uint32_t md_mtime_nsec; + uint32_t md_ctime_nsec; + int64_t md_atime; + int64_t md_mtime; + int64_t md_ctime; + uint64_t md_rdev; + uint64_t md_size; + uint64_t md_blocks; + uint64_t generation; + dict_t *xattr; + char *linkname; + time_t ia_time; + time_t xa_time; + gf_boolean_t need_lookup; + gf_boolean_t valid; + gf_boolean_t gen_rollover; + gf_boolean_t invalidation_rollover; + gf_lock_t lock; }; - struct mdc_local { - loc_t loc; - loc_t loc2; - fd_t *fd; - char *linkname; - dict_t *xattr; + loc_t loc; + loc_t loc2; + fd_t *fd; + char *linkname; + char *key; + dict_t *xattr; + uint64_t incident_time; + bool update_cache; }; - int -__mdc_inode_ctx_get (xlator_t *this, inode_t *inode, struct md_cache **mdc_p) +__mdc_inode_ctx_get(xlator_t *this, inode_t *inode, struct md_cache **mdc_p) { - int ret = 0; - struct md_cache *mdc = NULL; - uint64_t mdc_int = 0; + int ret = 0; + struct md_cache *mdc = NULL; + uint64_t mdc_int = 0; - ret = __inode_ctx_get (inode, this, &mdc_int); - mdc = (void *) (long) (mdc_int); - if (ret == 0 && mdc_p) - *mdc_p = mdc; + ret = __inode_ctx_get(inode, this, &mdc_int); + mdc = (void *)(long)(mdc_int); + if (ret == 0 && mdc_p) + *mdc_p = mdc; - return ret; + return ret; } - int -mdc_inode_ctx_get (xlator_t *this, inode_t *inode, struct md_cache **mdc_p) +mdc_inode_ctx_get(xlator_t *this, inode_t *inode, struct md_cache **mdc_p) { - int ret; + int ret = -1; + + if (!inode) + goto out; - LOCK(&inode->lock); - { - ret = __mdc_inode_ctx_get (this, inode, mdc_p); - } - UNLOCK(&inode->lock); + LOCK(&inode->lock); + { + ret = __mdc_inode_ctx_get(this, inode, mdc_p); + } + UNLOCK(&inode->lock); - return ret; +out: + return ret; } +uint64_t 
+__mdc_inc_generation(xlator_t *this, struct md_cache *mdc) +{ + uint64_t gen = 0, rollover; + struct mdc_conf *conf = NULL; -int -__mdc_inode_ctx_set (xlator_t *this, inode_t *inode, struct md_cache *mdc) + conf = this->private; + + gen = GF_ATOMIC_INC(conf->generation); + if (gen == 0) { + mdc->gen_rollover = !mdc->gen_rollover; + gen = GF_ATOMIC_INC(conf->generation); + mdc->ia_time = 0; + mdc->generation = 0; + } + + rollover = mdc->gen_rollover; + gen |= (rollover << 32); + return gen; +} + +uint64_t +mdc_inc_generation(xlator_t *this, inode_t *inode) { - int ret = 0; - uint64_t mdc_int = 0; + struct mdc_conf *conf = NULL; + uint64_t gen = 0; + struct md_cache *mdc = NULL; + + conf = this->private; - mdc_int = (long) mdc; - ret = __inode_ctx_set2 (inode, this, &mdc_int, 0); + mdc_inode_ctx_get(this, inode, &mdc); - return ret; + if (mdc) { + LOCK(&mdc->lock); + { + gen = __mdc_inc_generation(this, mdc); + } + UNLOCK(&mdc->lock); + } else { + gen = GF_ATOMIC_INC(conf->generation); + if (gen == 0) { + gen = GF_ATOMIC_INC(conf->generation); + } + } + + return gen; } +uint64_t +mdc_get_generation(xlator_t *this, inode_t *inode) +{ + struct mdc_conf *conf = NULL; + uint64_t gen = 0; + struct md_cache *mdc = NULL; + + conf = this->private; + + mdc_inode_ctx_get(this, inode, &mdc); + + if (mdc) { + LOCK(&mdc->lock); + { + gen = mdc->generation; + } + UNLOCK(&mdc->lock); + } else + gen = GF_ATOMIC_GET(conf->generation); + + return gen; +} int -mdc_inode_ctx_set (xlator_t *this, inode_t *inode, struct md_cache *mdc) +__mdc_inode_ctx_set(xlator_t *this, inode_t *inode, struct md_cache *mdc) { - int ret; + int ret = 0; + uint64_t mdc_int = 0; - LOCK(&inode->lock); - { - ret = __mdc_inode_ctx_set (this, inode, mdc); - } - UNLOCK(&inode->lock); + mdc_int = (long)mdc; + ret = __inode_ctx_set(inode, this, &mdc_int); - return ret; + return ret; } +int +mdc_inode_ctx_set(xlator_t *this, inode_t *inode, struct md_cache *mdc) +{ + int ret; + + LOCK(&inode->lock); + { + ret = __mdc_inode_ctx_set(this, inode, mdc); + } + UNLOCK(&inode->lock); + + return ret; +} mdc_local_t * -mdc_local_get (call_frame_t *frame) +mdc_local_get(call_frame_t *frame, inode_t *inode) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = frame->local; - if (local) - goto out; + local = frame->local; + if (local) + goto out; - local = GF_CALLOC (sizeof (*local), 1, gf_mdc_mt_mdc_local_t); - if (!local) - goto out; + local = GF_CALLOC(sizeof(*local), 1, gf_mdc_mt_mdc_local_t); + if (!local) + goto out; - frame->local = local; + local->incident_time = mdc_get_generation(frame->this, inode); + frame->local = local; out: - return local; + return local; } - void -mdc_local_wipe (xlator_t *this, mdc_local_t *local) +mdc_local_wipe(xlator_t *this, mdc_local_t *local) { - if (!local) - return; + if (!local) + return; - loc_wipe (&local->loc); + loc_wipe(&local->loc); - loc_wipe (&local->loc2); + loc_wipe(&local->loc2); - if (local->fd) - fd_unref (local->fd); + if (local->fd) + fd_unref(local->fd); - GF_FREE (local->linkname); + GF_FREE(local->linkname); - if (local->xattr) - dict_unref (local->xattr); + GF_FREE(local->key); - GF_FREE (local); - return; -} + if (local->xattr) + dict_unref(local->xattr); + GF_FREE(local); + return; +} int -mdc_inode_wipe (xlator_t *this, inode_t *inode) +mdc_inode_wipe(xlator_t *this, inode_t *inode) { - int ret = 0; - uint64_t mdc_int = 0; - struct md_cache *mdc = NULL; + int ret = 0; + uint64_t mdc_int = 0; + struct md_cache *mdc = NULL; - ret = inode_ctx_del (inode, this, &mdc_int); - 
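__mdc_inc_generation() above keeps a 32-bit generation counter per translator and a per-inode rollover flag; the flag flips whenever the counter wraps, and the two are packed into one 64-bit "incident" value so a later consumer can tell whether a cached update belongs to the current epoch. A condensed standalone version of that scheme using C11 atomics in place of GF_ATOMIC_*; the names are illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic uint32_t generation; /* per-translator counter in the patch */

struct cache_entry {
    bool rollover;       /* flipped every time the 32-bit counter wraps */
    uint32_t generation; /* generation recorded at the last cache update */
};

/* Return a 64-bit incident id: rollover flag in the upper half, counter
 * in the lower half.  Zero in the low half means the counter wrapped. */
static uint64_t next_generation(struct cache_entry *e)
{
    uint32_t gen = atomic_fetch_add(&generation, 1) + 1; /* new value */

    if (gen == 0) {                /* counter wrapped around */
        e->rollover = !e->rollover;
        gen = atomic_fetch_add(&generation, 1) + 1;
        e->generation = 0;         /* cached generations are now stale */
    }
    return ((uint64_t)e->rollover << 32) | gen;
}

/* An update stamped with 'incident' may refresh the entry only if it was
 * taken in the current rollover epoch and is not older than the entry. */
static bool update_is_current(const struct cache_entry *e, uint64_t incident)
{
    bool rollover = (incident >> 32) != 0;

    return rollover == e->rollover && (uint32_t)incident >= e->generation;
}

mdc_inode_iatt_set_validate() further down applies exactly this test (same rollover, incident generation not older than the cached one) before accepting an iatt into the cache.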
if (ret != 0) - goto out; + ret = inode_ctx_del(inode, this, &mdc_int); + if (ret != 0) + goto out; - mdc = (void *) (long) mdc_int; + mdc = (void *)(long)mdc_int; - if (mdc->xattr) - dict_unref (mdc->xattr); + if (mdc->xattr) + dict_unref(mdc->xattr); - GF_FREE (mdc->linkname); + GF_FREE(mdc->linkname); - GF_FREE (mdc); + GF_FREE(mdc); - ret = 0; + ret = 0; out: - return ret; + return ret; } - struct md_cache * -mdc_inode_prep (xlator_t *this, inode_t *inode) -{ - int ret = 0; - struct md_cache *mdc = NULL; - - LOCK (&inode->lock); - { - ret = __mdc_inode_ctx_get (this, inode, &mdc); - if (ret == 0) - goto unlock; - - mdc = GF_CALLOC (sizeof (*mdc), 1, gf_mdc_mt_md_cache_t); - if (!mdc) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - goto unlock; - } +mdc_inode_prep(xlator_t *this, inode_t *inode) +{ + int ret = 0; + struct md_cache *mdc = NULL; + + LOCK(&inode->lock); + { + ret = __mdc_inode_ctx_get(this, inode, &mdc); + if (ret == 0) + goto unlock; + + mdc = GF_CALLOC(sizeof(*mdc), 1, gf_mdc_mt_md_cache_t); + if (!mdc) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY, + "out of memory"); + goto unlock; + } - LOCK_INIT (&mdc->lock); + LOCK_INIT(&mdc->lock); - ret = __mdc_inode_ctx_set (this, inode, mdc); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - GF_FREE (mdc); - mdc = NULL; - } + ret = __mdc_inode_ctx_set(this, inode, mdc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY, + "out of memory"); + GF_FREE(mdc); + mdc = NULL; } + } unlock: - UNLOCK (&inode->lock); + UNLOCK(&inode->lock); - return mdc; + return mdc; } - +/* Cache is valid if: + * - It is not cached before any brick was down. Brick down case is handled by + * invalidating all the cache when any brick went down. + * - The cache time is not expired + */ static gf_boolean_t -is_md_cache_iatt_valid (xlator_t *this, struct md_cache *mdc) -{ - struct mdc_conf *conf = NULL; - time_t now = 0; - gf_boolean_t ret = _gf_true; - conf = this->private; +__is_cache_valid(xlator_t *this, time_t mdc_time) +{ + gf_boolean_t ret = _gf_true; + struct mdc_conf *conf = NULL; + uint32_t timeout = 0; + time_t last_child_down = 0; + + conf = this->private; + + /* conf->lock here is not taken deliberately, so that the multi + * threaded IO doesn't contend on a global lock. While updating + * the variable, the lock is taken, so that at least the writes are + * intact. The read of last_child_down may return junk, but that + * is for a very short period of time. 
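__is_cache_valid(), whose body continues just below, treats a cached timestamp as usable only if it was actually set, post-dates the last child-down event (any brick going down invalidates everything cached before it), and has not outlived the configured timeout. The same logic as a small self-contained helper, with time() standing in for gf_time():

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

static bool cache_time_valid(time_t cached_at, time_t last_child_down,
                             uint32_t timeout)
{
    if (cached_at == 0)                    /* never cached */
        return false;
    if (last_child_down != 0 && cached_at < last_child_down)
        return false;                      /* cached before a brick went down */
    return time(NULL) < cached_at + (time_t)timeout;
}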
+ */ + last_child_down = conf->last_child_down; + timeout = conf->timeout; + + if ((mdc_time == 0) || + ((last_child_down != 0) && (mdc_time < last_child_down))) { + ret = _gf_false; + goto out; + } + + if (gf_time() >= (mdc_time + timeout)) { + ret = _gf_false; + } - time (&now); +out: + return ret; +} - LOCK (&mdc->lock); - { - if (now >= (mdc->ia_time + conf->timeout)) - ret = _gf_false; +static gf_boolean_t +is_md_cache_iatt_valid(xlator_t *this, struct md_cache *mdc) +{ + gf_boolean_t ret = _gf_true; + + LOCK(&mdc->lock); + { + if (mdc->valid == _gf_false) { + ret = mdc->valid; + } else { + ret = __is_cache_valid(this, mdc->ia_time); + if (ret == _gf_false) { + mdc->ia_time = 0; + mdc->generation = 0; + } } - UNLOCK (&mdc->lock); + } + UNLOCK(&mdc->lock); - return ret; + return ret; } - static gf_boolean_t -is_md_cache_xatt_valid (xlator_t *this, struct md_cache *mdc) +is_md_cache_xatt_valid(xlator_t *this, struct md_cache *mdc) { - struct mdc_conf *conf = NULL; - time_t now = 0; - gf_boolean_t ret = _gf_true; - - conf = this->private; + gf_boolean_t ret = _gf_true; - time (&now); - - LOCK (&mdc->lock); - { - if (now >= (mdc->xa_time + conf->timeout)) - ret = _gf_false; - } - UNLOCK (&mdc->lock); + LOCK(&mdc->lock); + { + ret = __is_cache_valid(this, mdc->xa_time); + if (ret == _gf_false) + mdc->xa_time = 0; + } + UNLOCK(&mdc->lock); - return ret; + return ret; } - void -mdc_from_iatt (struct md_cache *mdc, struct iatt *iatt) -{ - mdc->md_prot = iatt->ia_prot; - mdc->md_nlink = iatt->ia_nlink; - mdc->md_uid = iatt->ia_uid; - mdc->md_gid = iatt->ia_gid; - mdc->md_atime = iatt->ia_atime; - mdc->md_atime_nsec = iatt->ia_atime_nsec; - mdc->md_mtime = iatt->ia_mtime; - mdc->md_mtime_nsec = iatt->ia_mtime_nsec; - mdc->md_ctime = iatt->ia_ctime; - mdc->md_ctime_nsec = iatt->ia_ctime_nsec; - mdc->md_rdev = iatt->ia_rdev; - mdc->md_size = iatt->ia_size; - mdc->md_blocks = iatt->ia_blocks; +mdc_from_iatt(struct md_cache *mdc, struct iatt *iatt) +{ + mdc->md_prot = iatt->ia_prot; + mdc->md_nlink = iatt->ia_nlink; + mdc->md_uid = iatt->ia_uid; + mdc->md_gid = iatt->ia_gid; + mdc->md_atime = iatt->ia_atime; + mdc->md_atime_nsec = iatt->ia_atime_nsec; + mdc->md_mtime = iatt->ia_mtime; + mdc->md_mtime_nsec = iatt->ia_mtime_nsec; + mdc->md_ctime = iatt->ia_ctime; + mdc->md_ctime_nsec = iatt->ia_ctime_nsec; + mdc->md_rdev = iatt->ia_rdev; + mdc->md_size = iatt->ia_size; + mdc->md_blocks = iatt->ia_blocks; } - void -mdc_to_iatt (struct md_cache *mdc, struct iatt *iatt) +mdc_to_iatt(struct md_cache *mdc, struct iatt *iatt) { - iatt->ia_prot = mdc->md_prot; - iatt->ia_nlink = mdc->md_nlink; - iatt->ia_uid = mdc->md_uid; - iatt->ia_gid = mdc->md_gid; - iatt->ia_atime = mdc->md_atime; - iatt->ia_atime_nsec = mdc->md_atime_nsec; - iatt->ia_mtime = mdc->md_mtime; - iatt->ia_mtime_nsec = mdc->md_mtime_nsec; - iatt->ia_ctime = mdc->md_ctime; - iatt->ia_ctime_nsec = mdc->md_ctime_nsec; - iatt->ia_rdev = mdc->md_rdev; - iatt->ia_size = mdc->md_size; - iatt->ia_blocks = mdc->md_blocks; + iatt->ia_prot = mdc->md_prot; + iatt->ia_nlink = mdc->md_nlink; + iatt->ia_uid = mdc->md_uid; + iatt->ia_gid = mdc->md_gid; + iatt->ia_atime = mdc->md_atime; + iatt->ia_atime_nsec = mdc->md_atime_nsec; + iatt->ia_mtime = mdc->md_mtime; + iatt->ia_mtime_nsec = mdc->md_mtime_nsec; + iatt->ia_ctime = mdc->md_ctime; + iatt->ia_ctime_nsec = mdc->md_ctime_nsec; + iatt->ia_rdev = mdc->md_rdev; + iatt->ia_size = mdc->md_size; + iatt->ia_blocks = mdc->md_blocks; } - int mdc_inode_iatt_set_validate(xlator_t *this, inode_t *inode, struct 
iatt *prebuf, - struct iatt *iatt) -{ - int ret = -1; - struct md_cache *mdc = NULL; + struct iatt *iatt, gf_boolean_t update_time, + uint64_t incident_time) +{ + int ret = 0; + struct md_cache *mdc = NULL; + uint32_t rollover = 0; + uint64_t gen = 0; + gf_boolean_t update_xa_time = _gf_false; + struct mdc_conf *conf = this->private; + + mdc = mdc_inode_prep(this, inode); + if (!mdc) { + ret = -1; + goto out; + } + + rollover = incident_time >> 32; + incident_time = (incident_time & 0xffffffff); + + LOCK(&mdc->lock); + { + if (!iatt || !iatt->ia_ctime) { + gf_msg_callingfn("md-cache", GF_LOG_TRACE, 0, 0, + "invalidating iatt(NULL)" + "(%s)", + uuid_utoa(inode->gfid)); + mdc->ia_time = 0; + mdc->valid = 0; + + gen = __mdc_inc_generation(this, mdc); + mdc->generation = (gen & 0xffffffff); + goto unlock; + } - mdc = mdc_inode_prep (this, inode); - if (!mdc) - goto out; + /* There could be a race in invalidation, where the + * invalidations in order A, B reaches md-cache in the order + * B, A. Hence, make sure the invalidation A is discarded if + * it comes after B. ctime of a file is always in ascending + * order unlike atime and mtime(which can be changed by user + * to any date), also ctime gets updates when atime/mtime + * changes, hence check for ctime only. + */ + if (mdc->md_ctime > iatt->ia_ctime) { + gf_msg_callingfn(this->name, GF_LOG_DEBUG, EINVAL, + MD_CACHE_MSG_DISCARD_UPDATE, + "discarding the iatt validate " + "request (%s)", + uuid_utoa(inode->gfid)); + ret = -1; + goto unlock; + } + if ((mdc->md_ctime == iatt->ia_ctime) && + (mdc->md_ctime_nsec > iatt->ia_ctime_nsec)) { + gf_msg_callingfn(this->name, GF_LOG_DEBUG, EINVAL, + MD_CACHE_MSG_DISCARD_UPDATE, + "discarding the iatt validate " + "request(ctime_nsec) (%s)", + uuid_utoa(inode->gfid)); + ret = -1; + goto unlock; + } - LOCK (&mdc->lock); - { - if (!iatt || !iatt->ia_ctime) { - mdc->ia_time = 0; - goto unlock; + /* + * Invalidate the inode if the mtime or ctime has changed + * and the prebuf doesn't match the value we have cached. + * TODO: writev returns with a NULL iatt due to + * performance/write-behind, causing invalidation on writes. + */ + if ((iatt->ia_mtime != mdc->md_mtime) || + (iatt->ia_mtime_nsec != mdc->md_mtime_nsec) || + (iatt->ia_ctime != mdc->md_ctime) || + (iatt->ia_ctime_nsec != mdc->md_ctime_nsec)) { + if (conf->global_invalidation && + (!prebuf || (prebuf->ia_mtime != mdc->md_mtime) || + (prebuf->ia_mtime_nsec != mdc->md_mtime_nsec) || + (prebuf->ia_ctime != mdc->md_ctime) || + (prebuf->ia_ctime_nsec != mdc->md_ctime_nsec))) { + if (IA_ISREG(inode->ia_type)) { + gf_msg("md-cache", GF_LOG_TRACE, 0, + MD_CACHE_MSG_DISCARD_UPDATE, + "prebuf doesn't match the value we have cached," + " invalidate the inode(%s)", + uuid_utoa(inode->gfid)); + + inode_invalidate(inode); } + } else { + update_xa_time = _gf_true; + } + } - /* - * Invalidate the inode if the mtime or ctime has changed - * and the prebuf doesn't match the value we have cached. - * TODO: writev returns with a NULL iatt due to - * performance/write-behind, causing invalidation on writes. 
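The two discard checks above rely on ctime being effectively monotonic for a file: unlike atime/mtime it cannot be set backwards by the user, and it advances whenever the other timestamps change, so an update carrying an older (ctime, ctime_nsec) pair must have been reordered on its way to md-cache and is simply dropped. The comparison as a standalone helper; the parameter names are illustrative:

#include <stdbool.h>
#include <stdint.h>

static bool update_is_stale(int64_t cached_ctime, uint32_t cached_ctime_nsec,
                            int64_t new_ctime, uint32_t new_ctime_nsec)
{
    if (new_ctime < cached_ctime)
        return true;
    if (new_ctime == cached_ctime && new_ctime_nsec < cached_ctime_nsec)
        return true;
    return false;
}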
- */ - if (IA_ISREG(inode->ia_type) && - ((iatt->ia_mtime != mdc->md_mtime) || - (iatt->ia_ctime != mdc->md_ctime))) - if (!prebuf || (prebuf->ia_ctime != mdc->md_ctime) || - (prebuf->ia_mtime != mdc->md_mtime)) - inode_invalidate(inode); - - mdc_from_iatt (mdc, iatt); - - time (&mdc->ia_time); + if ((mdc->gen_rollover == rollover) && + (incident_time >= mdc->generation)) { + mdc_from_iatt(mdc, iatt); + mdc->valid = _gf_true; + if (update_time) { + mdc->ia_time = gf_time(); + if (mdc->xa_time && update_xa_time) + mdc->xa_time = mdc->ia_time; + } + + gf_msg_callingfn( + "md-cache", GF_LOG_TRACE, 0, MD_CACHE_MSG_CACHE_UPDATE, + "Updated iatt(%s)" + " time:%lld generation=%lld", + uuid_utoa(iatt->ia_gfid), (unsigned long long)mdc->ia_time, + (unsigned long long)mdc->generation); + } else { + gf_msg_callingfn("md-cache", GF_LOG_TRACE, 0, 0, + "not updating cache (%s)" + "mdc-rollover=%u rollover=%u " + "mdc-generation=%llu " + "mdc-ia_time=%llu incident_time=%llu ", + uuid_utoa(iatt->ia_gfid), mdc->gen_rollover, + rollover, (unsigned long long)mdc->generation, + (unsigned long long)mdc->ia_time, + (unsigned long long)incident_time); } + } unlock: - UNLOCK (&mdc->lock); - ret = 0; + UNLOCK(&mdc->lock); + out: - return ret; + return ret; } -int mdc_inode_iatt_set(xlator_t *this, inode_t *inode, struct iatt *iatt) +int +mdc_inode_iatt_set(xlator_t *this, inode_t *inode, struct iatt *iatt, + uint64_t incident_time) { - return mdc_inode_iatt_set_validate(this, inode, NULL, iatt); + return mdc_inode_iatt_set_validate(this, inode, NULL, iatt, _gf_true, + incident_time); } int -mdc_inode_iatt_get (xlator_t *this, inode_t *inode, struct iatt *iatt) +mdc_inode_iatt_get(xlator_t *this, inode_t *inode, struct iatt *iatt) { - int ret = -1; - struct md_cache *mdc = NULL; + int ret = -1; + struct md_cache *mdc = NULL; - if (mdc_inode_ctx_get (this, inode, &mdc) != 0) - goto out; + if (mdc_inode_ctx_get(this, inode, &mdc) != 0) { + gf_msg_trace("md-cache", 0, "mdc_inode_ctx_get failed (%s)", + uuid_utoa(inode->gfid)); + goto out; + } - if (!is_md_cache_iatt_valid (this, mdc)) - goto out; + if (!is_md_cache_iatt_valid(this, mdc)) { + gf_msg_trace("md-cache", 0, "iatt cache not valid for (%s)", + uuid_utoa(inode->gfid)); + goto out; + } - LOCK (&mdc->lock); - { - mdc_to_iatt (mdc, iatt); - } - UNLOCK (&mdc->lock); + LOCK(&mdc->lock); + { + mdc_to_iatt(mdc, iatt); + } + UNLOCK(&mdc->lock); - uuid_copy (iatt->ia_gfid, inode->gfid); - iatt->ia_ino = gfid_to_ino (inode->gfid); - iatt->ia_dev = 42; - iatt->ia_type = inode->ia_type; + gf_uuid_copy(iatt->ia_gfid, inode->gfid); + iatt->ia_ino = gfid_to_ino(inode->gfid); + iatt->ia_dev = 42; + iatt->ia_type = inode->ia_type; - ret = 0; + ret = 0; out: - return ret; + return ret; } struct updatedict { - dict_t *dict; - int ret; + dict_t *dict; + int ret; }; static int +is_mdc_key_satisfied(xlator_t *this, const char *key) +{ + int ret = 0; + char *pattern = NULL; + struct mdc_conf *conf = this->private; + char *mdc_xattr_str = NULL; + char *tmp = NULL; + char *tmp1 = NULL; + + if (!key) + goto out; + + /* conf->mdc_xattr_str, is never freed and is hence safely used outside + * of lock*/ + tmp1 = conf->mdc_xattr_str; + if (!tmp1) + goto out; + + mdc_xattr_str = gf_strdup(tmp1); + if (!mdc_xattr_str) + goto out; + + pattern = strtok_r(mdc_xattr_str, ",", &tmp); + while (pattern) { + gf_strTrim(&pattern); + if (fnmatch(pattern, key, 0) == 0) { + ret = 1; + break; + } else { + gf_msg_trace("md-cache", 0, + "xattr key %s doesn't satisfy " + "caching requirements", + key); + } 
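is_mdc_key_satisfied() here decides whether an xattr key is worth caching by matching it against the configured pattern string (conf->mdc_xattr_str): a private copy is tokenized with strtok_r(), since strtok_r() modifies its input, and each pattern is compared with fnmatch(). The same idea as a self-contained helper; whitespace trimming is left out for brevity:

#include <fnmatch.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

/* Return true if 'key' matches any pattern in a comma-separated list,
 * e.g. "user.*,security.capability". */
static bool key_in_pattern_list(const char *list, const char *key)
{
    bool found = false;
    char *copy = strdup(list);
    char *save = NULL;

    if (!copy)
        return false;

    for (char *pat = strtok_r(copy, ",", &save); pat;
         pat = strtok_r(NULL, ",", &save)) {
        if (fnmatch(pat, key, 0) == 0) {
            found = true;
            break;
        }
    }
    free(copy);
    return found;
}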
+ pattern = strtok_r(NULL, ",", &tmp); + } + GF_FREE(mdc_xattr_str); +out: + return ret; +} + +static int updatefn(dict_t *dict, char *key, data_t *value, void *data) { - struct updatedict *u = data; - const char *mdc_key; - int i = 0; - - for (mdc_key = mdc_keys[i].name; (mdc_key = mdc_keys[i].name); i++) { - if (!mdc_keys[i].check) - continue; - if (strcmp(mdc_key, key)) - continue; - - if (!u->dict) { - u->dict = dict_new(); - if (!u->dict) { - u->ret = -1; - return -1; - } - } - - if (dict_set(u->dict, key, value) < 0) { - u->ret = -1; - return -1; - } - - break; - } - return 0; + struct updatedict *u = data; + + if (is_mdc_key_satisfied(THIS, key)) { + if (!u->dict) { + u->dict = dict_new(); + if (!u->dict) { + u->ret = -1; + return -1; + } + } + + if (dict_set(u->dict, key, value) < 0) { + u->ret = -1; + return -1; + } + } + return 0; } static int mdc_dict_update(dict_t **tgt, dict_t *src) { - struct updatedict u = { - .dict = *tgt, - .ret = 0, - }; + struct updatedict u = { + .dict = *tgt, + .ret = 0, + }; - dict_foreach(src, updatefn, &u); + dict_foreach(src, updatefn, &u); - if (*tgt) - return u.ret; + if (*tgt) + return u.ret; - if ((u.ret < 0) && u.dict) { - dict_unref(u.dict); - return u.ret; - } + if ((u.ret < 0) && u.dict) { + dict_unref(u.dict); + return u.ret; + } - *tgt = u.dict; + *tgt = u.dict; - return u.ret; + return u.ret; } int -mdc_inode_xatt_set (xlator_t *this, inode_t *inode, dict_t *dict) +mdc_inode_xatt_set(xlator_t *this, inode_t *inode, dict_t *dict) { - int ret = -1; - struct md_cache *mdc = NULL; - dict_t *newdict = NULL; + int ret = -1; + struct md_cache *mdc = NULL; + dict_t *newdict = NULL; - mdc = mdc_inode_prep (this, inode); - if (!mdc) - goto out; + mdc = mdc_inode_prep(this, inode); + if (!mdc) + goto out; - if (!dict) - goto out; + if (!dict) { + gf_msg_trace("md-cache", 0, + "mdc_inode_xatt_set failed (%s) " + "dict NULL", + uuid_utoa(inode->gfid)); + goto out; + } - LOCK (&mdc->lock); - { - if (mdc->xattr) { - dict_unref (mdc->xattr); - mdc->xattr = NULL; - } + LOCK(&mdc->lock); + { + if (mdc->xattr) { + gf_msg_trace("md-cache", 0, + "deleting the old xattr " + "cache (%s)", + uuid_utoa(inode->gfid)); + dict_unref(mdc->xattr); + mdc->xattr = NULL; + } - ret = mdc_dict_update(&newdict, dict); - if (ret < 0) { - UNLOCK(&mdc->lock); - goto out; - } + ret = mdc_dict_update(&newdict, dict); + if (ret < 0) { + UNLOCK(&mdc->lock); + goto out; + } - if (newdict) - mdc->xattr = newdict; + if (newdict) + mdc->xattr = newdict; - time (&mdc->xa_time); - } - UNLOCK (&mdc->lock); - ret = 0; + mdc->xa_time = gf_time(); + gf_msg_trace("md-cache", 0, "xatt cache set for (%s) time:%lld", + uuid_utoa(inode->gfid), (long long)mdc->xa_time); + } + UNLOCK(&mdc->lock); + ret = 0; out: - return ret; + return ret; } - int -mdc_inode_xatt_update (xlator_t *this, inode_t *inode, dict_t *dict) +mdc_inode_xatt_update(xlator_t *this, inode_t *inode, dict_t *dict) { - int ret = -1; - struct md_cache *mdc = NULL; + int ret = -1; + struct md_cache *mdc = NULL; - mdc = mdc_inode_prep (this, inode); - if (!mdc) - goto out; + mdc = mdc_inode_prep(this, inode); + if (!mdc) + goto out; - if (!dict) - goto out; + if (!dict) + goto out; - LOCK (&mdc->lock); - { - ret = mdc_dict_update(&mdc->xattr, dict); - if (ret < 0) { - UNLOCK(&mdc->lock); - goto out; - } - - time (&mdc->xa_time); + LOCK(&mdc->lock); + { + ret = mdc_dict_update(&mdc->xattr, dict); + if (ret < 0) { + UNLOCK(&mdc->lock); + goto out; } - UNLOCK (&mdc->lock); + } + UNLOCK(&mdc->lock); - ret = 0; + ret = 0; out: - return 
ret; + return ret; } +int +mdc_inode_xatt_unset(xlator_t *this, inode_t *inode, char *name) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + mdc = mdc_inode_prep(this, inode); + if (!mdc) + goto out; + + if (!name || !mdc->xattr) + goto out; + + LOCK(&mdc->lock); + { + dict_del(mdc->xattr, name); + } + UNLOCK(&mdc->lock); + + ret = 0; +out: + return ret; +} int -mdc_inode_xatt_get (xlator_t *this, inode_t *inode, dict_t **dict) +mdc_inode_xatt_get(xlator_t *this, inode_t *inode, dict_t **dict) { - int ret = -1; - struct md_cache *mdc = NULL; + int ret = -1; + struct md_cache *mdc = NULL; - if (mdc_inode_ctx_get (this, inode, &mdc) != 0) - goto out; + if (mdc_inode_ctx_get(this, inode, &mdc) != 0) { + gf_msg_trace("md-cache", 0, "mdc_inode_ctx_get failed (%s)", + uuid_utoa(inode->gfid)); + goto out; + } - if (!is_md_cache_xatt_valid (this, mdc)) - goto out; + if (!is_md_cache_xatt_valid(this, mdc)) { + gf_msg_trace("md-cache", 0, "xattr cache not valid for (%s)", + uuid_utoa(inode->gfid)); + goto out; + } - LOCK (&mdc->lock); - { - ret = 0; - /* Missing xattr only means no keys were there, i.e - a negative cache for the "loaded" keys - */ - if (!mdc->xattr) - goto unlock; - - if (dict) - *dict = dict_ref (mdc->xattr); + LOCK(&mdc->lock); + { + ret = 0; + /* Missing xattr only means no keys were there, i.e + a negative cache for the "loaded" keys + */ + if (!mdc->xattr) { + gf_msg_trace("md-cache", 0, "xattr not present (%s)", + uuid_utoa(inode->gfid)); + goto unlock; } + + if (dict) + *dict = dict_ref(mdc->xattr); + } unlock: - UNLOCK (&mdc->lock); + UNLOCK(&mdc->lock); out: - return ret; + return ret; } +gf_boolean_t +mdc_inode_reset_need_lookup(xlator_t *this, inode_t *inode) +{ + struct md_cache *mdc = NULL; + gf_boolean_t need = _gf_false; + + if (mdc_inode_ctx_get(this, inode, &mdc) != 0) + goto out; + + LOCK(&mdc->lock); + { + need = mdc->need_lookup; + mdc->need_lookup = _gf_false; + } + UNLOCK(&mdc->lock); + +out: + return need; +} void -mdc_load_reqs (xlator_t *this, dict_t *dict) +mdc_inode_set_need_lookup(xlator_t *this, inode_t *inode, gf_boolean_t need) { - const char *mdc_key = NULL; - int i = 0; - int ret = 0; + struct md_cache *mdc = NULL; + + if (mdc_inode_ctx_get(this, inode, &mdc) != 0) + goto out; - for (mdc_key = mdc_keys[i].name; (mdc_key = mdc_keys[i].name); i++) { - if (!mdc_keys[i].load) - continue; - ret = dict_set_int8 (dict, (char *)mdc_key, 0); - if (ret) - return; - } + LOCK(&mdc->lock); + { + mdc->need_lookup = need; + } + UNLOCK(&mdc->lock); + +out: + return; } +void +mdc_inode_iatt_invalidate(xlator_t *this, inode_t *inode) +{ + struct md_cache *mdc = NULL; + uint32_t gen = 0; -struct checkpair { - int ret; - dict_t *rsp; -}; + if (mdc_inode_ctx_get(this, inode, &mdc) != 0) + goto out; + gen = mdc_inc_generation(this, inode) & 0xffffffff; -static int -is_mdc_key_satisfied (const char *key) + LOCK(&mdc->lock); + { + mdc->ia_time = 0; + mdc->valid = _gf_false; + mdc->generation = gen; + } + UNLOCK(&mdc->lock); + +out: + return; +} + +int +mdc_inode_xatt_invalidate(xlator_t *this, inode_t *inode) { - const char *mdc_key = NULL; - int i = 0; + int ret = -1; + struct md_cache *mdc = NULL; - if (!key) - return 0; + if (mdc_inode_ctx_get(this, inode, &mdc) != 0) + goto out; - for (mdc_key = mdc_keys[i].name; (mdc_key = mdc_keys[i].name); i++) { - if (!mdc_keys[i].load) - continue; - if (strcmp (mdc_key, key) == 0) - return 1; - } + LOCK(&mdc->lock); + { + mdc->xa_time = 0; + } + UNLOCK(&mdc->lock); + +out: + return ret; +} + +static int 
+mdc_update_gfid_stat(xlator_t *this, struct iatt *iatt) +{ + int ret = 0; + inode_table_t *itable = NULL; + inode_t *inode = NULL; + + itable = ((xlator_t *)this->graph->top)->itable; + inode = inode_find(itable, iatt->ia_gfid); + if (!inode) { + ret = -1; + goto out; + } + ret = mdc_inode_iatt_set_validate(this, inode, NULL, iatt, _gf_true, + mdc_inc_generation(this, inode)); +out: + return ret; +} + +static bool +mdc_load_reqs(xlator_t *this, dict_t *dict) +{ + struct mdc_conf *conf = this->private; + char *pattern = NULL; + char *mdc_xattr_str = NULL; + char *tmp = NULL; + char *tmp1 = NULL; + int ret = 0; + bool loaded = false; + + tmp1 = conf->mdc_xattr_str; + if (!tmp1) + goto out; + + mdc_xattr_str = gf_strdup(tmp1); + if (!mdc_xattr_str) + goto out; + + pattern = strtok_r(mdc_xattr_str, ",", &tmp); + while (pattern) { + gf_strTrim(&pattern); + ret = dict_set_int8(dict, pattern, 0); + if (ret) { + conf->mdc_xattr_str = NULL; + gf_msg("md-cache", GF_LOG_ERROR, 0, MD_CACHE_MSG_NO_XATTR_CACHE, + "Disabled cache for xattrs, dict_set failed"); + goto out; + } + pattern = strtok_r(NULL, ",", &tmp); + } + + loaded = true; + +out: + GF_FREE(mdc_xattr_str); - return 0; + return loaded; } +struct checkpair { + int ret; + dict_t *rsp; +}; static int -checkfn (dict_t *this, char *key, data_t *value, void *data) +checkfn(dict_t *this, char *key, data_t *value, void *data) { - struct checkpair *pair = data; + struct checkpair *pair = data; - if (!is_mdc_key_satisfied (key)) - pair->ret = 0; + if (!is_mdc_key_satisfied(THIS, key)) + pair->ret = 0; - return 0; + return 0; } - int -mdc_xattr_satisfied (xlator_t *this, dict_t *req, dict_t *rsp) +mdc_xattr_satisfied(xlator_t *this, dict_t *req, dict_t *rsp) { - struct checkpair pair = { - .ret = 1, - .rsp = rsp, - }; + struct checkpair pair = { + .ret = 1, + .rsp = rsp, + }; - dict_foreach (req, checkfn, &pair); + dict_foreach(req, checkfn, &pair); - return pair.ret; + return pair.ret; } +static void +mdc_cache_statfs(xlator_t *this, struct statvfs *buf) +{ + struct mdc_conf *conf = this->private; + + pthread_mutex_lock(&conf->statfs_cache.lock); + { + memcpy(&conf->statfs_cache.buf, buf, sizeof(struct statvfs)); + conf->statfs_cache.last_refreshed = gf_time(); + } + pthread_mutex_unlock(&conf->statfs_cache.lock); +} int -mdc_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *stbuf, dict_t *dict, struct iatt *postparent) +mdc_load_statfs_info_from_cache(xlator_t *this, struct statvfs **buf) { - mdc_local_t *local = NULL; + struct mdc_conf *conf = this->private; + uint32_t cache_age = 0; + int ret = 0; + + if (!buf || !conf) { + ret = -1; + goto err; + } + + *buf = NULL; + + pthread_mutex_lock(&conf->statfs_cache.lock); + { + /* Skip if the cache is not initialized. */ + if (conf->statfs_cache.last_refreshed == (time_t)-1) { + ret = -1; + goto unlock; + } + + cache_age = (gf_time() - conf->statfs_cache.last_refreshed); + + gf_log(this->name, GF_LOG_DEBUG, "STATFS cache age = %u secs", + cache_age); + if (cache_age > conf->timeout) { + /* Expire the cache. 
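The statfs cache above is a single mutex-protected statvfs buffer plus a refresh timestamp; a load succeeds only if the cache has ever been filled and its age is still within the timeout, otherwise the caller falls through to a real STATFS. A self-contained sketch of the same store/load pair; unlike the patch, which hands back a pointer to the cached buffer, this version copies the result out:

#include <pthread.h>
#include <string.h>
#include <sys/statvfs.h>
#include <time.h>

/* One cached statvfs result.  last_refreshed == (time_t)-1 means "never
 * filled", matching the sentinel used by the patch. */
struct statfs_cache {
    pthread_mutex_t lock;
    time_t last_refreshed;
    struct statvfs buf;
};

static void statfs_cache_store(struct statfs_cache *c,
                               const struct statvfs *buf)
{
    pthread_mutex_lock(&c->lock);
    memcpy(&c->buf, buf, sizeof(c->buf));
    c->last_refreshed = time(NULL);
    pthread_mutex_unlock(&c->lock);
}

/* Returns 0 and fills *out on a hit; -1 if the cache is cold or expired. */
static int statfs_cache_load(struct statfs_cache *c, struct statvfs *out,
                             unsigned timeout)
{
    int ret = -1;

    pthread_mutex_lock(&c->lock);
    if (c->last_refreshed != (time_t)-1 &&
        (unsigned)(time(NULL) - c->last_refreshed) <= timeout) {
        *out = c->buf;
        ret = 0;
    }
    pthread_mutex_unlock(&c->lock);
    return ret;
}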
*/ + gf_log(this->name, GF_LOG_DEBUG, + "Cache age %u secs exceeded timeout %u secs", cache_age, + conf->timeout); + ret = -1; + goto unlock; + } + + *buf = &conf->statfs_cache.buf; + } +unlock: + pthread_mutex_unlock(&conf->statfs_cache.lock); +err: + return ret; +} - local = frame->local; +static dict_t * +mdc_prepare_request(xlator_t *this, mdc_local_t *local, dict_t *xdata) +{ + if (xdata != NULL) { + dict_ref(xdata); + } - if (op_ret != 0) - goto out; + if (local == NULL) { + return xdata; + } - if (!local) - goto out; + if (xdata == NULL) { + xdata = dict_new(); + if (xdata == NULL) { + local->update_cache = false; - if (local->loc.parent) { - mdc_inode_iatt_set (this, local->loc.parent, postparent); + return NULL; } + } + + local->update_cache = mdc_load_reqs(this, xdata); + + return xdata; +} - if (local->loc.inode) { - mdc_inode_iatt_set (this, local->loc.inode, stbuf); - mdc_inode_xatt_set (this, local->loc.inode, dict); +int +mdc_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct statvfs *buf, + dict_t *xdata) +{ + struct mdc_conf *conf = this->private; + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) { + mdc_inode_iatt_invalidate(this, local->loc.inode); } + + goto out; + } + + if (conf && conf->cache_statfs) { + mdc_cache_statfs(this, buf); + } + out: - MDC_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, - dict, postparent); - return 0; + MDC_STACK_UNWIND(statfs, frame, op_ret, op_errno, buf, xdata); + + return 0; } +int +mdc_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int ret = 0, op_ret = 0, op_errno = 0; + struct statvfs *buf = NULL; + mdc_local_t *local = NULL; + struct mdc_conf *conf = this->private; + + local = mdc_local_get(frame, loc->inode); + if (!local) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + loc_copy(&local->loc, loc); + + if (!conf) { + goto uncached; + } + + if (!conf->cache_statfs) { + goto uncached; + } + + ret = mdc_load_statfs_info_from_cache(this, &buf); + if (ret == 0 && buf) { + op_ret = 0; + op_errno = 0; + goto out; + } + +uncached: + STACK_WIND(frame, mdc_statfs_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, loc, xdata); + return 0; + +out: + MDC_STACK_UNWIND(statfs, frame, op_ret, op_errno, buf, xdata); + return 0; +} int -mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xdata) +mdc_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *dict, struct iatt *postparent) { - int ret = 0; - struct iatt stbuf = {0, }; - struct iatt postparent = {0, }; - dict_t *xattr_rsp = NULL; - dict_t *xattr_alloc = NULL; - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; + struct mdc_conf *conf = this->private; + local = frame->local; - local = mdc_local_get (frame); - if (!local) - goto uncached; + if (!local) + goto out; - loc_copy (&local->loc, loc); + if (op_ret != 0) { + if (op_errno == ENOENT) + GF_ATOMIC_INC(conf->mdc_counter.negative_lookup); - ret = mdc_inode_iatt_get (this, loc->inode, &stbuf); - if (ret != 0) - goto uncached; + if (op_errno == ESTALE) { + /* if op_errno is ENOENT, fuse-bridge will unlink the + * dentry + */ + if (local->loc.parent) + mdc_inode_iatt_invalidate(this, local->loc.parent); + else + mdc_inode_iatt_invalidate(this, local->loc.inode); + } - if (xdata) { - ret = mdc_inode_xatt_get (this, 
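
Illustration only, not part of the patch: mdc_cache_statfs and mdc_load_statfs_info_from_cache above form a single-entry, time-bounded cache guarded by a mutex. A standalone model of that behaviour, with hypothetical names (statfs_cache, cache_store, cache_load) and plain libc calls:

#include <pthread.h>
#include <string.h>
#include <sys/statvfs.h>
#include <time.h>

struct statfs_cache {
    pthread_mutex_t lock;
    time_t last_refreshed;   /* (time_t)-1 means never filled */
    struct statvfs buf;
};

static void
cache_store(struct statfs_cache *c, const struct statvfs *buf)
{
    pthread_mutex_lock(&c->lock);
    memcpy(&c->buf, buf, sizeof(*buf));
    c->last_refreshed = time(NULL);
    pthread_mutex_unlock(&c->lock);
}

/* Returns 0 and fills *out while the entry is younger than 'timeout'
 * seconds; -1 means the caller has to issue a real statfs downwards. */
static int
cache_load(struct statfs_cache *c, struct statvfs *out, time_t timeout)
{
    int ret = -1;

    pthread_mutex_lock(&c->lock);
    if (c->last_refreshed != (time_t)-1 &&
        time(NULL) - c->last_refreshed <= timeout) {
        *out = c->buf;
        ret = 0;
    }
    pthread_mutex_unlock(&c->lock);

    return ret;
}

Serving a slightly stale statvfs for up to the configured timeout is the deliberate trade-off; when the entry has aged out, the caller falls through to the uncached path exactly as mdc_statfs does.
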
loc->inode, &xattr_rsp); - if (ret != 0) - goto uncached; + goto out; + } - if (!mdc_xattr_satisfied (this, xdata, xattr_rsp)) - goto uncached; + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postparent, + local->incident_time); + } + + if (local->loc.inode) { + mdc_inode_iatt_set(this, local->loc.inode, stbuf, local->incident_time); + if (local->update_cache) { + mdc_inode_xatt_set(this, local->loc.inode, dict); + } + } +out: + MDC_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, dict, + postparent); + return 0; +} + +int +mdc_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int ret = 0; + struct iatt stbuf = { + 0, + }; + struct iatt postparent = { + 0, + }; + dict_t *xattr_rsp = NULL; + mdc_local_t *local = NULL; + struct mdc_conf *conf = this->private; + + local = mdc_local_get(frame, loc->inode); + if (!local) { + GF_ATOMIC_INC(conf->mdc_counter.stat_miss); + goto uncached; + } + + loc_copy(&local->loc, loc); + + if (!inode_is_linked(loc->inode)) { + GF_ATOMIC_INC(conf->mdc_counter.stat_miss); + goto uncached; + } + + if (mdc_inode_reset_need_lookup(this, loc->inode)) { + GF_ATOMIC_INC(conf->mdc_counter.need_lookup); + goto uncached; + } + + ret = mdc_inode_iatt_get(this, loc->inode, &stbuf); + if (ret != 0) { + GF_ATOMIC_INC(conf->mdc_counter.stat_miss); + goto uncached; + } + + if (xdata) { + ret = mdc_inode_xatt_get(this, loc->inode, &xattr_rsp); + if (ret != 0) { + GF_ATOMIC_INC(conf->mdc_counter.xattr_miss); + goto uncached; } - MDC_STACK_UNWIND (lookup, frame, 0, 0, loc->inode, &stbuf, - xattr_rsp, &postparent); + if (!mdc_xattr_satisfied(this, xdata, xattr_rsp)) { + GF_ATOMIC_INC(conf->mdc_counter.xattr_miss); + goto uncached; + } + } - if (xattr_rsp) - dict_unref (xattr_rsp); + GF_ATOMIC_INC(conf->mdc_counter.stat_hit); + MDC_STACK_UNWIND(lookup, frame, 0, 0, loc->inode, &stbuf, xattr_rsp, + &postparent); - return 0; + if (xattr_rsp) + dict_unref(xattr_rsp); + + return 0; uncached: - if (!xdata) - xdata = xattr_alloc = dict_new (); - if (xdata) - mdc_load_reqs (this, xdata); - - STACK_WIND (frame, mdc_lookup_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->lookup, loc, xdata); - - if (xattr_rsp) - dict_unref (xattr_rsp); - if (xattr_alloc) - dict_unref (xattr_alloc); - return 0; -} + xdata = mdc_prepare_request(this, local, xdata); + + STACK_WIND(frame, mdc_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + + if (xattr_rsp) + dict_unref(xattr_rsp); + if (xdata != NULL) { + dict_unref(xdata); + } + + return 0; +} int -mdc_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) +mdc_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - if (op_ret != 0) - goto out; + local = frame->local; + if (!local) + goto out; - local = frame->local; - if (!local) - goto out; + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) { + mdc_inode_iatt_invalidate(this, local->loc.inode); + } - mdc_inode_iatt_set (this, local->loc.inode, buf); + goto out; + } + + mdc_inode_iatt_set(this, local->loc.inode, buf, local->incident_time); + if (local->update_cache) { + mdc_inode_xatt_set(this, local->loc.inode, xdata); + } out: - MDC_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); + MDC_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata); - return 0; + return 0; } - int 
-mdc_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +mdc_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - int ret; - struct iatt stbuf; - mdc_local_t *local = NULL; + int ret; + struct iatt stbuf; + mdc_local_t *local = NULL; + struct mdc_conf *conf = this->private; - local = mdc_local_get (frame); - if (!local) - goto uncached; + local = mdc_local_get(frame, loc->inode); + if (!local) + goto uncached; - loc_copy (&local->loc, loc); + loc_copy(&local->loc, loc); - ret = mdc_inode_iatt_get (this, loc->inode, &stbuf); - if (ret != 0) - goto uncached; + if (!inode_is_linked(loc->inode)) { + GF_ATOMIC_INC(conf->mdc_counter.stat_miss); + goto uncached; + } - MDC_STACK_UNWIND (stat, frame, 0, 0, &stbuf, xdata); + ret = mdc_inode_iatt_get(this, loc->inode, &stbuf); + if (ret != 0) + goto uncached; - return 0; + GF_ATOMIC_INC(conf->mdc_counter.stat_hit); + MDC_STACK_UNWIND(stat, frame, 0, 0, &stbuf, xdata); + + return 0; uncached: - STACK_WIND (frame, mdc_stat_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat, - loc, xdata); - return 0; -} + xdata = mdc_prepare_request(this, local, xdata); + GF_ATOMIC_INC(conf->mdc_counter.stat_miss); + STACK_WIND(frame, mdc_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + + if (xdata != NULL) { + dict_unref(xdata); + } + + return 0; +} int -mdc_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - dict_t *xdata) +mdc_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - if (op_ret != 0) - goto out; + local = frame->local; + if (!local) + goto out; - local = frame->local; - if (!local) - goto out; + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) { + mdc_inode_iatt_invalidate(this, local->fd->inode); + } + + goto out; + } - mdc_inode_iatt_set (this, local->fd->inode, buf); + mdc_inode_iatt_set(this, local->fd->inode, buf, local->incident_time); + if (local->update_cache) { + mdc_inode_xatt_set(this, local->fd->inode, xdata); + } out: - MDC_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); + MDC_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata); - return 0; + return 0; } - int -mdc_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +mdc_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - int ret; - struct iatt stbuf; - mdc_local_t *local = NULL; + int ret; + struct iatt stbuf; + mdc_local_t *local = NULL; + struct mdc_conf *conf = this->private; - local = mdc_local_get (frame); - if (!local) - goto uncached; + local = mdc_local_get(frame, fd->inode); + if (!local) + goto uncached; - local->fd = fd_ref (fd); + local->fd = __fd_ref(fd); - ret = mdc_inode_iatt_get (this, fd->inode, &stbuf); - if (ret != 0) - goto uncached; + ret = mdc_inode_iatt_get(this, fd->inode, &stbuf); + if (ret != 0) + goto uncached; - MDC_STACK_UNWIND (fstat, frame, 0, 0, &stbuf, xdata); + GF_ATOMIC_INC(conf->mdc_counter.stat_hit); + MDC_STACK_UNWIND(fstat, frame, 0, 0, &stbuf, xdata); - return 0; + return 0; uncached: - STACK_WIND (frame, mdc_fstat_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat, - fd, xdata); - return 0; -} + xdata = mdc_prepare_request(this, local, xdata); + + GF_ATOMIC_INC(conf->mdc_counter.stat_miss); + STACK_WIND(frame, mdc_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + if 
(xdata != NULL) { + dict_unref(xdata); + } + + return 0; +} int -mdc_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +mdc_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret != 0) - goto out; + if (!local) + goto out; - if (!local) - goto out; + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->loc.inode); - mdc_inode_iatt_set_validate(this, local->loc.inode, prebuf, postbuf); + goto out; + } + + mdc_inode_iatt_set_validate(this, local->loc.inode, prebuf, postbuf, + _gf_true, local->incident_time); out: - MDC_STACK_UNWIND (truncate, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + MDC_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata); - return 0; + return 0; } - int -mdc_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset, dict_t *xdata) +mdc_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = mdc_local_get (frame); + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + local->loc.inode = inode_ref(loc->inode); + } - local->loc.inode = inode_ref (loc->inode); - - STACK_WIND (frame, mdc_truncate_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate, - loc, offset, xdata); - return 0; + STACK_WIND(frame, mdc_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; } - int -mdc_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +mdc_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; + + local = frame->local; - local = frame->local; + if (!local) + goto out; - if (op_ret != 0) - goto out; + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); - if (!local) - goto out; + goto out; + } - mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf, + _gf_true, local->incident_time); out: - MDC_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + MDC_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); - return 0; + return 0; } - int -mdc_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset, dict_t *xdata) +mdc_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = mdc_local_get (frame); + local = mdc_local_get(frame, fd->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + } - local->fd = fd_ref (fd); - - STACK_WIND (frame, mdc_ftruncate_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate, - fd, offset, xdata); - return 0; + STACK_WIND(frame, mdc_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; } - int -mdc_mknod_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +mdc_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - mdc_local_t *local = NULL; - - local = frame->local; + mdc_local_t *local = NULL; - if (op_ret != 0) - goto out; + local = frame->local; - if (!local) - goto out; + if (!local) + goto out; - if (local->loc.parent) { - mdc_inode_iatt_set (this, local->loc.parent, postparent); + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) { + mdc_inode_iatt_invalidate(this, local->loc.parent); } - if (local->loc.inode) { - mdc_inode_iatt_set (this, local->loc.inode, buf); - mdc_inode_xatt_set (this, local->loc.inode, local->xattr); - } + goto out; + } + + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postparent, + local->incident_time); + } + + if (local->loc.inode) { + mdc_inode_iatt_set(this, local->loc.inode, buf, local->incident_time); + } out: - MDC_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); - return 0; + MDC_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; } +int +mdc_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + loc_copy(&local->loc, loc); + local->xattr = dict_ref(xdata); + } + + STACK_WIND(frame, mdc_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); + return 0; +} int -mdc_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, - mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) +mdc_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = mdc_local_get (frame); + local = frame->local; - loc_copy (&local->loc, loc); - local->xattr = dict_ref (xdata); + if (!local) + goto out; - STACK_WIND (frame, mdc_mknod_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, - loc, mode, rdev, umask, xdata); - return 0; + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) { + mdc_inode_iatt_invalidate(this, local->loc.parent); + } + + goto out; + } + + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postparent, + local->incident_time); + } + + if (local->loc.inode) { + mdc_inode_iatt_set(this, local->loc.inode, buf, local->incident_time); + } +out: + MDC_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; } +int +mdc_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + loc_copy(&local->loc, loc); + local->xattr = dict_ref(xdata); + } + + STACK_WIND(frame, mdc_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); + return 0; +} int -mdc_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt 
*preparent, +mdc_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret != 0) - goto out; + if (!local) + goto out; - if (!local) - goto out; + if (op_ret != 0) { + /* if errno is ESTALE, parent is not present, which implies even + * child is not present. Also, man 2 unlink states unlink can + * return ENOENT if a component in pathname does not + * exist or is a dangling symbolic link. So, invalidate both + * parent and child for both errno + */ - if (local->loc.parent) { - mdc_inode_iatt_set (this, local->loc.parent, postparent); + if ((op_errno == ENOENT) || (op_errno == ESTALE)) { + mdc_inode_iatt_invalidate(this, local->loc.inode); + mdc_inode_iatt_invalidate(this, local->loc.parent); } - if (local->loc.inode) { - mdc_inode_iatt_set (this, local->loc.inode, buf); - mdc_inode_xatt_set (this, local->loc.inode, local->xattr); - } + goto out; + } + + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postparent, + local->incident_time); + } + + if (local->loc.inode) { + mdc_inode_iatt_set(this, local->loc.inode, NULL, local->incident_time); + } + out: - MDC_STACK_UNWIND (mkdir, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); - return 0; + MDC_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, postparent, + xdata); + return 0; } - int -mdc_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, - mode_t mode, mode_t umask, dict_t *xdata) +mdc_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag, + dict_t *xdata) { - mdc_local_t *local = NULL; - - local = mdc_local_get (frame); + mdc_local_t *local = NULL; - loc_copy (&local->loc, loc); - local->xattr = dict_ref (xdata); + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + loc_copy(&local->loc, loc); + } - STACK_WIND (frame, mdc_mkdir_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, - loc, mode, umask, xdata); - return 0; + STACK_WIND(frame, mdc_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); + return 0; } - int -mdc_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +mdc_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret != 0) - goto out; + if (!local) + goto out; - if (!local) - goto out; + if (op_ret != 0) { + /* if errno is ESTALE, parent is not present, which implies even + * child is not present. Also, man 2 rmdir states rmdir can + * return ENOENT if a directory component in pathname does not + * exist or is a dangling symbolic link. 
So, invalidate both + * parent and child for both errno + */ - if (local->loc.parent) { - mdc_inode_iatt_set (this, local->loc.parent, postparent); + if ((op_errno == ESTALE) || (op_errno == ENOENT)) { + mdc_inode_iatt_invalidate(this, local->loc.inode); + mdc_inode_iatt_invalidate(this, local->loc.parent); } - if (local->loc.inode) { - mdc_inode_iatt_set (this, local->loc.inode, NULL); - } + goto out; + } + + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postparent, + local->incident_time); + } out: - MDC_STACK_UNWIND (unlink, frame, op_ret, op_errno, - preparent, postparent, xdata); - return 0; + MDC_STACK_UNWIND(rmdir, frame, op_ret, op_errno, preparent, postparent, + xdata); + return 0; } - int -mdc_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag, - dict_t *xdata) +mdc_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flag, + dict_t *xdata) { - mdc_local_t *local = NULL; - - local = mdc_local_get (frame); + mdc_local_t *local = NULL; - loc_copy (&local->loc, loc); + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + loc_copy(&local->loc, loc); + } - STACK_WIND (frame, mdc_unlink_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, - loc, xflag, xdata); - return 0; + STACK_WIND(frame, mdc_rmdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, loc, flag, xdata); + return 0; } - int -mdc_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +mdc_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - mdc_local_t *local = NULL; - - local = frame->local; + mdc_local_t *local = NULL; - if (op_ret != 0) - goto out; + local = frame->local; - if (!local) - goto out; + if (!local) + goto out; - if (local->loc.parent) { - mdc_inode_iatt_set (this, local->loc.parent, postparent); + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) { + mdc_inode_iatt_invalidate(this, local->loc.parent); } + goto out; + } + + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postparent, + local->incident_time); + } + + if (local->loc.inode) { + mdc_inode_iatt_set(this, local->loc.inode, buf, local->incident_time); + } out: - MDC_STACK_UNWIND (rmdir, frame, op_ret, op_errno, - preparent, postparent, xdata); - return 0; + MDC_STACK_UNWIND(symlink, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; } - int -mdc_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flag, - dict_t *xdata) +mdc_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; + char *name; - local = mdc_local_get (frame); + name = gf_strdup(linkname); + if (name == NULL) { + goto wind; + } + local = mdc_local_get(frame, loc->inode); + if (local == NULL) { + GF_FREE(name); + goto wind; + } - loc_copy (&local->loc, loc); + loc_copy(&local->loc, loc); + local->linkname = name; - STACK_WIND (frame, mdc_rmdir_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir, - loc, flag, xdata); - return 0; +wind: + STACK_WIND(frame, mdc_symlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata); + return 0; } - int -mdc_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t 
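
A recurring pattern in the failure paths of these callbacks, shown here as a standalone sketch rather than the xlator's code: ENOENT or ESTALE from below means the cached attributes of the entry, and for name operations its parent as well, can no longer be trusted, so both are dropped. The names (entry_went_stale, cache_invalidate, cached_inode) are hypothetical.

#include <errno.h>
#include <stdbool.h>

struct cached_inode;  /* opaque stand-in for an inode with cached iatt */

/* Stand-in; the real xlator resets the cached timestamps under a lock. */
static void
cache_invalidate(struct cached_inode *in)
{
    (void)in;
}

/* Decide, from the failed fop's errno, whether cached attributes must go. */
static bool
entry_went_stale(int op_errno)
{
    return op_errno == ENOENT || op_errno == ESTALE;
}

static void
on_name_fop_failure(int op_errno, struct cached_inode *child,
                    struct cached_inode *parent)
{
    if (!entry_went_stale(op_errno))
        return;

    /* The name may have vanished underneath us: drop both sides. */
    if (child)
        cache_invalidate(child);
    if (parent)
        cache_invalidate(parent);
}
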
op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +mdc_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = frame->local; + local = frame->local; + if (!local) + goto out; - if (op_ret != 0) - goto out; + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) { + mdc_inode_iatt_invalidate(this, local->loc.inode); + mdc_inode_iatt_invalidate(this, local->loc2.parent); + } - if (!local) - goto out; + goto out; + } - if (local->loc.parent) { - mdc_inode_iatt_set (this, local->loc.parent, postparent); - } + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postoldparent, + local->incident_time); + } - if (local->loc.inode) { - mdc_inode_iatt_set (this, local->loc.inode, buf); - } + if (local->loc.inode) { + /* TODO: fix dht_rename() not to return linkfile + attributes before setting attributes here + */ + + mdc_inode_iatt_set(this, local->loc.inode, NULL, local->incident_time); + } + + if (local->loc2.parent) { + mdc_inode_iatt_set(this, local->loc2.parent, postnewparent, + local->incident_time); + } out: - MDC_STACK_UNWIND (symlink, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); - return 0; + MDC_STACK_UNWIND(rename, frame, op_ret, op_errno, buf, preoldparent, + postoldparent, prenewparent, postnewparent, xdata); + return 0; } - int -mdc_symlink (call_frame_t *frame, xlator_t *this, const char *linkname, - loc_t *loc, mode_t umask, dict_t *xdata) +mdc_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - mdc_local_t *local = NULL; - - local = mdc_local_get (frame); + mdc_local_t *local = NULL; - loc_copy (&local->loc, loc); + local = mdc_local_get(frame, oldloc->inode); + if (local != NULL) { + loc_copy(&local->loc, oldloc); + loc_copy(&local->loc2, newloc); + } - local->linkname = gf_strdup (linkname); - - STACK_WIND (frame, mdc_symlink_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, - linkname, loc, umask, xdata); - return 0; + STACK_WIND(frame, mdc_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; } - int -mdc_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent, - dict_t *xdata) +mdc_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret != 0) - goto out; + if (!local) + goto out; - if (!local) - goto out; - - if (local->loc.parent) { - mdc_inode_iatt_set (this, local->loc.parent, postoldparent); + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) { + mdc_inode_iatt_invalidate(this, local->loc.inode); + mdc_inode_iatt_invalidate(this, local->loc2.parent); } - if (local->loc.inode) { - /* TODO: fix dht_rename() not to return linkfile - attributes before setting attributes here - */ + goto out; + } - mdc_inode_iatt_set (this, 
local->loc.inode, NULL); - } + if (local->loc.inode) { + mdc_inode_iatt_set(this, local->loc.inode, buf, local->incident_time); + } - if (local->loc2.parent) { - mdc_inode_iatt_set (this, local->loc2.parent, postnewparent); - } + if (local->loc2.parent) { + mdc_inode_iatt_set(this, local->loc2.parent, postparent, + local->incident_time); + } out: - MDC_STACK_UNWIND (rename, frame, op_ret, op_errno, buf, - preoldparent, postoldparent, prenewparent, - postnewparent, xdata); - return 0; + MDC_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; } - int -mdc_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) +mdc_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = mdc_local_get (frame); + local = mdc_local_get(frame, oldloc->inode); + if (local != NULL) { + loc_copy(&local->loc, oldloc); + loc_copy(&local->loc2, newloc); + } - loc_copy (&local->loc, oldloc); - loc_copy (&local->loc2, newloc); - - STACK_WIND (frame, mdc_rename_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, - oldloc, newloc, xdata); - return 0; + STACK_WIND(frame, mdc_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; } - int -mdc_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, - struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +mdc_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - mdc_local_t *local = NULL; - - local = frame->local; + mdc_local_t *local = NULL; - if (op_ret != 0) - goto out; + local = frame->local; - if (!local) - goto out; + if (!local) + goto out; - if (local->loc.inode) { - mdc_inode_iatt_set (this, local->loc.inode, buf); + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) { + mdc_inode_iatt_invalidate(this, local->loc.parent); } - if (local->loc2.parent) { - mdc_inode_iatt_set (this, local->loc2.parent, postparent); - } + goto out; + } + + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postparent, + local->incident_time); + } + + if (local->loc.inode) { + mdc_inode_iatt_set(this, inode, buf, local->incident_time); + } out: - MDC_STACK_UNWIND (link, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); - return 0; + MDC_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, buf, preparent, + postparent, xdata); + return 0; } - int -mdc_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) +mdc_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = mdc_local_get (frame); + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + loc_copy(&local->loc, loc); + local->xattr = dict_ref(xdata); + } - loc_copy (&local->loc, oldloc); - loc_copy (&local->loc2, newloc); - - STACK_WIND (frame, mdc_link_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, - oldloc, newloc, xdata); - return 0; + STACK_WIND(frame, mdc_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; } - -int -mdc_create_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +static int +mdc_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret != 0) - goto out; + if (!local) + goto out; - if (!local) - goto out; + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->loc.inode); + goto out; + } - if (local->loc.parent) { - mdc_inode_iatt_set (this, local->loc.parent, postparent); - } + if (local->fd->flags & O_TRUNC) { + /* O_TRUNC modifies file size. Hence invalidate the + * cache entry to fetch latest attributes. */ + mdc_inode_iatt_invalidate(this, local->fd->inode); + } - if (local->loc.inode) { - mdc_inode_iatt_set (this, inode, buf); - mdc_inode_xatt_set (this, local->loc.inode, local->xattr); - } out: - MDC_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent, xdata); - return 0; + MDC_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata); + return 0; } - -int -mdc_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, - mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +static int +mdc_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, + dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = mdc_local_get (frame); + if (!fd || !IA_ISREG(fd->inode->ia_type) || !(fd->flags & O_TRUNC)) { + goto out; + } - loc_copy (&local->loc, loc); - local->xattr = dict_ref (xdata); + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + } - STACK_WIND (frame, mdc_create_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, - loc, flags, mode, umask, fd, xdata); - return 0; +out: + STACK_WIND(frame, mdc_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; } - int -mdc_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, - struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) +mdc_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = frame->local; + local = frame->local; + if (!local) + goto out; - if (op_ret != 0) - goto out; + if (op_ret < 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } - if (!local) - goto out; - - mdc_inode_iatt_set (this, local->fd->inode, stbuf); + mdc_inode_iatt_set(this, local->fd->inode, stbuf, local->incident_time); out: - MDC_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, - stbuf, iobref, xdata); + MDC_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); - return 0; + return 0; } - int -mdc_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) +mdc_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = mdc_local_get 
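
For illustration only: mdc_open above allocates a local, and later invalidates the cached stat, only when a regular file is opened with O_TRUNC, since that is the one open mode that changes the file size behind the cache's back. A standalone sketch of that decision, with hypothetical helper names:

#include <fcntl.h>
#include <stdbool.h>

/* Hypothetical cache hook; the real xlator invalidates its cached iatt. */
static void
stat_cache_invalidate(const char *path)
{
    (void)path;
}

/* True when an open with these flags can alter the file size, so the
 * cached attributes have to be refreshed once the open succeeds. */
static bool
open_invalidates_stat(int flags, bool is_regular_file)
{
    return is_regular_file && (flags & O_TRUNC);
}

static void
after_open(const char *path, int flags, bool is_regular_file)
{
    if (open_invalidates_stat(flags, is_regular_file))
        stat_cache_invalidate(path);  /* size is now 0; old stat is stale */
}
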
(frame); + local = mdc_local_get(frame, fd->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + } - local->fd = fd_ref (fd); - - STACK_WIND (frame, mdc_readv_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, - fd, size, offset, flags, xdata); - return 0; + STACK_WIND(frame, mdc_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + return 0; } - int -mdc_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +mdc_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - mdc_local_t *local = NULL; - - local = frame->local; + mdc_local_t *local = NULL; - if (op_ret == -1) - goto out; + local = frame->local; + if (!local) + goto out; - if (!local) - goto out; + if (op_ret == -1) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } - mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf, + _gf_true, local->incident_time); out: - MDC_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + MDC_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata); - return 0; + return 0; } - int -mdc_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, - int count, off_t offset, uint32_t flags, struct iobref *iobref, - dict_t *xdata) +mdc_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - mdc_local_t *local = NULL; - - local = mdc_local_get (frame); + mdc_local_t *local = NULL; - local->fd = fd_ref (fd); + local = mdc_local_get(frame, fd->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + } - STACK_WIND (frame, mdc_writev_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, - fd, vector, count, offset, flags, iobref, xdata); - return 0; + STACK_WIND(frame, mdc_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; } - int -mdc_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +mdc_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - mdc_local_t *local = NULL; - - local = frame->local; + mdc_local_t *local = NULL; - if (op_ret != 0) { - mdc_inode_iatt_set (this, local->loc.inode, NULL); - goto out; - } + local = frame->local; + if (!local) + goto out; - if (!local) - goto out; + if (op_ret != 0) { + mdc_inode_iatt_set(this, local->loc.inode, NULL, local->incident_time); + goto out; + } - mdc_inode_iatt_set_validate(this, local->loc.inode, prebuf, postbuf); + mdc_inode_iatt_set_validate(this, local->loc.inode, prebuf, postbuf, + _gf_true, local->incident_time); + mdc_inode_xatt_update(this, local->loc.inode, xdata); out: - MDC_STACK_UNWIND (setattr, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + MDC_STACK_UNWIND(setattr, frame, op_ret, op_errno, prebuf, postbuf, xdata); - return 0; + return 0; } - int -mdc_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int valid, dict_t *xdata) 
+mdc_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int valid, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; + dict_t *xattr_alloc = NULL; + int ret = 0; + struct mdc_conf *conf = this->private; - local = mdc_local_get (frame); + local = mdc_local_get(frame, loc->inode); + if (local == NULL) { + goto wind; + } - loc_copy (&local->loc, loc); + loc_copy(&local->loc, loc); - STACK_WIND (frame, mdc_setattr_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->setattr, - loc, stbuf, valid, xdata); - return 0; -} + if ((valid & GF_SET_ATTR_MODE) && conf->cache_glusterfs_acl) { + if (!xdata) + xdata = xattr_alloc = dict_new(); + if (xdata) { + ret = dict_set_int8(xdata, GF_POSIX_ACL_ACCESS, 0); + if (!ret) + ret = dict_set_int8(xdata, GF_POSIX_ACL_DEFAULT, 0); + if (ret) + mdc_inode_xatt_invalidate(this, local->loc.inode); + } + } + if ((valid & GF_SET_ATTR_MODE) && conf->cache_posix_acl) { + if (!xdata) + xdata = xattr_alloc = dict_new(); + if (xdata) { + ret = dict_set_int8(xdata, POSIX_ACL_ACCESS_XATTR, 0); + if (!ret) + ret = dict_set_int8(xdata, POSIX_ACL_DEFAULT_XATTR, 0); + if (ret) + mdc_inode_xatt_invalidate(this, local->loc.inode); + } + } + +wind: + STACK_WIND(frame, mdc_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + + if (xattr_alloc) + dict_unref(xattr_alloc); + return 0; +} int -mdc_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +mdc_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - mdc_local_t *local = NULL; - - local = frame->local; + mdc_local_t *local = NULL; - if (op_ret != 0) - goto out; + local = frame->local; + if (!local) + goto out; - if (!local) - goto out; + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } - mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf, + _gf_true, local->incident_time); + mdc_inode_xatt_update(this, local->fd->inode, xdata); out: - MDC_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + MDC_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, prebuf, postbuf, xdata); - return 0; + return 0; } - int -mdc_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int valid, dict_t *xdata) +mdc_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int valid, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; + dict_t *xattr_alloc = NULL; + int ret = 0; + struct mdc_conf *conf = this->private; - local = mdc_local_get (frame); + local = mdc_local_get(frame, fd->inode); + if (local == NULL) { + goto wind; + } - local->fd = fd_ref (fd); + local->fd = __fd_ref(fd); - STACK_WIND (frame, mdc_fsetattr_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetattr, - fd, stbuf, valid, xdata); - return 0; -} + if ((valid & GF_SET_ATTR_MODE) && conf->cache_glusterfs_acl) { + if (!xdata) + xdata = xattr_alloc = dict_new(); + if (xdata) { + ret = dict_set_int8(xdata, GF_POSIX_ACL_ACCESS, 0); + if (!ret) + ret = dict_set_int8(xdata, GF_POSIX_ACL_DEFAULT, 0); + if (ret) + mdc_inode_xatt_invalidate(this, local->fd->inode); + } + } + + if ((valid & GF_SET_ATTR_MODE) && 
conf->cache_posix_acl) { + if (!xdata) + xdata = xattr_alloc = dict_new(); + if (xdata) { + ret = dict_set_int8(xdata, POSIX_ACL_ACCESS_XATTR, 0); + if (!ret) + ret = dict_set_int8(xdata, POSIX_ACL_DEFAULT_XATTR, 0); + if (ret) + mdc_inode_xatt_invalidate(this, local->fd->inode); + } + } + +wind: + STACK_WIND(frame, mdc_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + if (xattr_alloc) + dict_unref(xattr_alloc); + return 0; +} int -mdc_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +mdc_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - mdc_local_t *local = NULL; - - local = frame->local; + mdc_local_t *local = NULL; - if (op_ret != 0) - goto out; + local = frame->local; + if (!local) + goto out; - if (!local) - goto out; + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } - mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf, + _gf_true, local->incident_time); out: - MDC_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + MDC_STACK_UNWIND(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); - return 0; + return 0; } +int +mdc_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, fd->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + } + + STACK_WIND(frame, mdc_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; +} int -mdc_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, - dict_t *xdata) +mdc_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; + struct iatt prestat = { + 0, + }; + struct iatt poststat = { + 0, + }; + int ret = 0; - local = mdc_local_get (frame); + local = frame->local; + if (!local) + goto out; - local->fd = fd_ref (fd); + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->loc.inode); + goto out; + } - STACK_WIND (frame, mdc_fsync_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, - fd, datasync, xdata); - return 0; -} + mdc_inode_xatt_update(this, local->loc.inode, local->xattr); + + ret = dict_get_iatt(xdata, GF_PRESTAT, &prestat); + if (ret >= 0) { + ret = dict_get_iatt(xdata, GF_POSTSTAT, &poststat); + mdc_inode_iatt_set_validate(this, local->loc.inode, &prestat, &poststat, + _gf_true, local->incident_time); + } + if (ret < 0) + mdc_inode_iatt_invalidate(this, local->loc.inode); + +out: + MDC_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata); + + return 0; +} int -mdc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +mdc_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr, + int flags, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = frame->local; + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + loc_copy(&local->loc, loc); + local->xattr = dict_ref(xattr); + } - if (op_ret != 0) - goto out; + 
STACK_WIND(frame, mdc_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, xattr, flags, xdata); - if (!local) - goto out; + return 0; +} - mdc_inode_xatt_update (this, local->loc.inode, local->xattr); +int +mdc_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + struct iatt prestat = { + 0, + }; + struct iatt poststat = { + 0, + }; + int ret = 0; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } + + mdc_inode_xatt_update(this, local->fd->inode, local->xattr); + + ret = dict_get_iatt(xdata, GF_PRESTAT, &prestat); + if (ret >= 0) { + ret = dict_get_iatt(xdata, GF_POSTSTAT, &poststat); + mdc_inode_iatt_set_validate(this, local->fd->inode, &prestat, &poststat, + _gf_true, local->incident_time); + } + + if (ret < 0) + mdc_inode_iatt_invalidate(this, local->fd->inode); out: - MDC_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); + MDC_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata); - return 0; + return 0; } - int -mdc_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xattr, int flags, dict_t *xdata) +mdc_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr, + int flags, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = mdc_local_get (frame); + local = mdc_local_get(frame, fd->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + local->xattr = dict_ref(xattr); + } - loc_copy (&local->loc, loc); - local->xattr = dict_ref (xattr); + STACK_WIND(frame, mdc_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, xattr, flags, xdata); - STACK_WIND (frame, mdc_setxattr_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr, - loc, xattr, flags, xdata); - return 0; + return 0; } - int -mdc_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +mdc_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - local = frame->local; + local = frame->local; + if (!local) + goto out; - if (op_ret != 0) - goto out; + if (op_ret < 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->loc.inode); + goto out; + } - if (!local) - goto out; + if (dict_get(xattr, "glusterfs.skip-cache")) { + gf_msg(this->name, GF_LOG_DEBUG, 0, 0, + "Skipping xattr update due to empty value"); + goto out; + } - mdc_inode_xatt_update (this, local->fd->inode, local->xattr); + if (local->update_cache) { + mdc_inode_xatt_set(this, local->loc.inode, xdata); + } out: - MDC_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata); + MDC_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata); - return 0; + return 0; } - int -mdc_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - dict_t *xattr, int flags, dict_t *xdata) +mdc_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, + dict_t *xdata) { - mdc_local_t *local = NULL; + int ret; + int op_errno = ENODATA; + mdc_local_t *local = NULL; + dict_t *xattr = NULL; + struct mdc_conf *conf = this->private; + gf_boolean_t key_satisfied = _gf_false; - local = mdc_local_get (frame); + local = mdc_local_get(frame, loc->inode); + if (!local) { + goto 
uncached; + } - local->fd = fd_ref (fd); - local->xattr = dict_ref (xattr); + loc_copy(&local->loc, loc); - STACK_WIND (frame, mdc_fsetxattr_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetxattr, - fd, xattr, flags, xdata); - return 0; + if (!is_mdc_key_satisfied(this, key)) { + goto uncached; + } + key_satisfied = _gf_true; + + ret = mdc_inode_xatt_get(this, loc->inode, &xattr); + if (ret != 0) + goto uncached; + + if (!xattr || !dict_get(xattr, (char *)key)) { + ret = -1; + op_errno = ENODATA; + } + + GF_ATOMIC_INC(conf->mdc_counter.xattr_hit); + MDC_STACK_UNWIND(getxattr, frame, ret, op_errno, xattr, xdata); + + if (xattr) + dict_unref(xattr); + + return 0; + +uncached: + if (key_satisfied) { + xdata = mdc_prepare_request(this, local, xdata); + } + + GF_ATOMIC_INC(conf->mdc_counter.xattr_miss); + STACK_WIND(frame, mdc_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, key, xdata); + + if (key_satisfied && (xdata != NULL)) { + dict_unref(xdata); + } + + return 0; } int -mdc_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, +mdc_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; - if (op_ret != 0) - goto out; + local = frame->local; + if (!local) + goto out; - local = frame->local; - if (!local) - goto out; + if (op_ret < 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } - mdc_inode_xatt_update (this, local->loc.inode, xattr); + if (dict_get(xattr, "glusterfs.skip-cache")) { + gf_msg(this->name, GF_LOG_DEBUG, 0, 0, + "Skipping xattr update due to empty value"); + goto out; + } + + if (local->update_cache) { + mdc_inode_xatt_set(this, local->fd->inode, xdata); + } out: - MDC_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); + MDC_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata); - return 0; + return 0; } - int -mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, +mdc_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, dict_t *xdata) { - int ret; - int op_errno = ENODATA; - mdc_local_t *local = NULL; - dict_t *xattr = NULL; + int ret; + mdc_local_t *local = NULL; + dict_t *xattr = NULL; + int op_errno = ENODATA; + struct mdc_conf *conf = this->private; + gf_boolean_t key_satisfied = _gf_true; - local = mdc_local_get (frame); - if (!local) - goto uncached; + local = mdc_local_get(frame, fd->inode); + if (!local) + goto uncached; - loc_copy (&local->loc, loc); + local->fd = __fd_ref(fd); - if (!is_mdc_key_satisfied (key)) - goto uncached; + if (!is_mdc_key_satisfied(this, key)) { + key_satisfied = _gf_false; + goto uncached; + } - ret = mdc_inode_xatt_get (this, loc->inode, &xattr); - if (ret != 0) - goto uncached; + ret = mdc_inode_xatt_get(this, fd->inode, &xattr); + if (ret != 0) + goto uncached; - if (!xattr || dict_get (xattr, (char *)key)) { - ret = -1; - op_errno = ENODATA; - } + if (!xattr || !dict_get(xattr, (char *)key)) { + ret = -1; + op_errno = ENODATA; + } - MDC_STACK_UNWIND (getxattr, frame, ret, op_errno, xattr, xdata); + GF_ATOMIC_INC(conf->mdc_counter.xattr_hit); + MDC_STACK_UNWIND(fgetxattr, frame, ret, op_errno, xattr, xdata); - return 0; + if (xattr) + dict_unref(xattr); + + return 0; uncached: - STACK_WIND (frame, mdc_getxattr_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->getxattr, 
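
As a simplified illustration, not the patch itself: mdc_getxattr above can answer entirely from the cached dictionary, and an absent key in an otherwise valid cache is failed with ENODATA instead of being wound downwards, so negative lookups are served from cache too. The sketch below models that three-way outcome with a hypothetical xattr_view type and a linear search in place of a dict:

#include <errno.h>
#include <stddef.h>
#include <string.h>

/* Hypothetical cached key list for one inode's xattrs. */
struct xattr_view {
    const char **keys;
    size_t nkeys;
};

/* Returns 0 if the key is cached, -ENODATA if the cache is authoritative
 * but the key is absent, and -EAGAIN when the caller must wind downwards. */
static int
cached_getxattr(const struct xattr_view *view, const char *key)
{
    size_t i;

    if (!view)
        return -EAGAIN;   /* nothing cached: ask the real filesystem */

    for (i = 0; i < view->nkeys; i++) {
        if (strcmp(view->keys[i], key) == 0)
            return 0;     /* positive hit */
    }

    return -ENODATA;      /* negative hit: key known to be absent */
}

In the xlator the cache is only treated as authoritative for keys listed by is_mdc_key_satisfied; everything else always takes the uncached path.
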
- loc, key, xdata); - return 0; -} + if (key_satisfied) { + xdata = mdc_prepare_request(this, local, xdata); + } + + GF_ATOMIC_INC(conf->mdc_counter.xattr_miss); + STACK_WIND(frame, mdc_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, key, xdata); + + if (key_satisfied && (xdata != NULL)) { + dict_unref(xdata); + } + + return 0; +} + +int +mdc_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + struct iatt prestat = { + 0, + }; + struct iatt poststat = { + 0, + }; + int ret = 0; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->loc.inode); + goto out; + } + + if (local->key) + mdc_inode_xatt_unset(this, local->loc.inode, local->key); + else + mdc_inode_xatt_invalidate(this, local->loc.inode); + + ret = dict_get_iatt(xdata, GF_PRESTAT, &prestat); + if (ret >= 0) { + ret = dict_get_iatt(xdata, GF_POSTSTAT, &poststat); + mdc_inode_iatt_set_validate(this, local->loc.inode, &prestat, &poststat, + _gf_true, local->incident_time); + } + + if (ret < 0) + mdc_inode_iatt_invalidate(this, local->loc.inode); +out: + MDC_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata); + return 0; +} int -mdc_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, - dict_t *xdata) +mdc_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - mdc_local_t *local = NULL; + mdc_local_t *local = NULL; + int op_errno = ENODATA; + int ret = 0; + dict_t *xattr = NULL; + struct mdc_conf *conf = this->private; + char *name2; + + name2 = gf_strdup(name); + if (name2 == NULL) { + goto uncached; + } - if (op_ret != 0) - goto out; + local = mdc_local_get(frame, loc->inode); + if (local == NULL) { + GF_FREE(name2); + goto uncached; + } - local = frame->local; - if (!local) - goto out; + loc_copy(&local->loc, loc); + local->key = name2; - mdc_inode_xatt_update (this, local->fd->inode, xattr); + if (!is_mdc_key_satisfied(this, name)) + goto uncached; + + ret = mdc_inode_xatt_get(this, loc->inode, &xattr); + if (ret != 0) + goto uncached; + + GF_ATOMIC_INC(conf->mdc_counter.xattr_hit); + + if (!xattr || !dict_get(xattr, (char *)name)) { + ret = -1; + op_errno = ENODATA; + + MDC_STACK_UNWIND(removexattr, frame, ret, op_errno, xdata); + } else { + STACK_WIND(frame, mdc_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + } + + if (xattr) + dict_unref(xattr); + + return 0; + +uncached: + GF_ATOMIC_INC(conf->mdc_counter.xattr_miss); + STACK_WIND(frame, mdc_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; +} + +int +mdc_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + struct iatt prestat = { + 0, + }; + struct iatt poststat = { + 0, + }; + int ret = 0; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } + + if (local->key) + mdc_inode_xatt_unset(this, local->fd->inode, local->key); + else + mdc_inode_xatt_invalidate(this, local->fd->inode); + + ret = dict_get_iatt(xdata, GF_PRESTAT, &prestat); + if (ret >= 0) { + ret = dict_get_iatt(xdata, 
GF_POSTSTAT, &poststat); + mdc_inode_iatt_set_validate(this, local->fd->inode, &prestat, &poststat, + _gf_true, local->incident_time); + } + + if (ret < 0) + mdc_inode_iatt_invalidate(this, local->fd->inode); out: - MDC_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, xdata); + MDC_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata); - return 0; + return 0; } - int -mdc_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, - dict_t *xdata) +mdc_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { - int ret; - mdc_local_t *local = NULL; - dict_t *xattr = NULL; - int op_errno = ENODATA; + mdc_local_t *local = NULL; + int op_errno = ENODATA; + int ret = 0; + dict_t *xattr = NULL; + struct mdc_conf *conf = this->private; + char *name2; - local = mdc_local_get (frame); - if (!local) - goto uncached; + name2 = gf_strdup(name); + if (name2 == NULL) { + goto uncached; + } - local->fd = fd_ref (fd); + local = mdc_local_get(frame, fd->inode); + if (local == NULL) { + GF_FREE(name2); + goto uncached; + } - if (!is_mdc_key_satisfied (key)) - goto uncached; + local->fd = __fd_ref(fd); + local->key = name2; - ret = mdc_inode_xatt_get (this, fd->inode, &xattr); - if (ret != 0) - goto uncached; + if (!is_mdc_key_satisfied(this, name)) + goto uncached; - if (!xattr || dict_get (xattr, (char *)key)) { - ret = -1; - op_errno = ENODATA; - } + ret = mdc_inode_xatt_get(this, fd->inode, &xattr); + if (ret != 0) + goto uncached; - MDC_STACK_UNWIND (fgetxattr, frame, ret, op_errno, xattr, xdata); + GF_ATOMIC_INC(conf->mdc_counter.xattr_hit); - return 0; + if (!xattr || !dict_get(xattr, (char *)name)) { + ret = -1; + op_errno = ENODATA; + + MDC_STACK_UNWIND(fremovexattr, frame, ret, op_errno, xdata); + } else { + STACK_WIND(frame, mdc_fremovexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + } + + if (xattr) + dict_unref(xattr); + + return 0; uncached: - STACK_WIND (frame, mdc_fgetxattr_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->fgetxattr, - fd, key, xdata); - return 0; + GF_ATOMIC_INC(conf->mdc_counter.xattr_miss); + STACK_WIND(frame, mdc_fremovexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + return 0; } +int32_t +mdc_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret == 0) + goto out; + + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->loc.inode); + +out: + MDC_STACK_UNWIND(opendir, frame, op_ret, op_errno, fd, xdata); + return 0; +} + +int +mdc_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + loc_copy(&local->loc, loc); + } + + /* Tell readdir-ahead to include these keys in xdata when it + * internally issues readdirp() in it's opendir_cbk */ + xdata = mdc_prepare_request(this, local, xdata); + + STACK_WIND(frame, mdc_opendir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, loc, fd, xdata); + + if (xdata != NULL) { + dict_unref(xdata); + } + + return 0; +} int -mdc_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries, dict_t *xdata) +mdc_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, 
gf_dirent_t *entries, dict_t *xdata) { - gf_dirent_t *entry = NULL; + gf_dirent_t *entry = NULL; + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto unwind; - if (op_ret <= 0) - goto unwind; + if (op_ret <= 0) { + if ((op_ret == -1) && ((op_errno == ENOENT) || (op_errno == ESTALE))) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto unwind; + } - list_for_each_entry (entry, &entries->list, list) { - if (!entry->inode) - continue; - mdc_inode_iatt_set (this, entry->inode, &entry->d_stat); - mdc_inode_xatt_set (this, entry->inode, entry->dict); + list_for_each_entry(entry, &entries->list, list) + { + if (!entry->inode) + continue; + mdc_inode_iatt_set(this, entry->inode, &entry->d_stat, + local->incident_time); + if (local->update_cache) { + mdc_inode_xatt_set(this, entry->inode, entry->dict); } + } unwind: - STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); - return 0; + MDC_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; } - int -mdc_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset, dict_t *xdata) +mdc_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - dict_t *xattr_alloc = NULL; + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, fd->inode); + if (!local) + goto out; + + local->fd = __fd_ref(fd); + + xdata = mdc_prepare_request(this, local, xdata); + + STACK_WIND(frame, mdc_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, xdata); - if (!xdata) - xdata = xattr_alloc = dict_new (); - if (xdata) - mdc_load_reqs (this, xdata); + if (xdata != NULL) { + dict_unref(xdata); + } - STACK_WIND (frame, mdc_readdirp_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->readdirp, - fd, size, offset, xdata); - if (xattr_alloc) - dict_unref (xattr_alloc); - return 0; + return 0; +out: + MDC_STACK_UNWIND(readdirp, frame, -1, ENOMEM, NULL, NULL); + return 0; } int mdc_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, - int op_errno, gf_dirent_t *entries, dict_t *xdata) + int op_errno, gf_dirent_t *entries, dict_t *xdata) { - STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, entries, xdata); - return 0; + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret == 0) + goto out; + + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->fd->inode); +out: + MDC_STACK_UNWIND(readdir, frame, op_ret, op_errno, entries, xdata); + return 0; } int -mdc_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset, dict_t *xdata) +mdc_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - int need_unref = 0; - struct mdc_conf *conf = this->private; + mdc_local_t *local = NULL; + struct mdc_conf *conf = this->private; - if (!conf->force_readdirp) { - STACK_WIND(frame, mdc_readdir_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdir, fd, size, offset, - xdata); - return 0; - } + local = mdc_local_get(frame, fd->inode); + if (!local) + goto unwind; - if (!xdata) { - xdata = dict_new (); - need_unref = 1; - } + local->fd = __fd_ref(fd); + + if (!conf->force_readdirp) { + STACK_WIND(frame, mdc_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata); + return 0; + } - if (xdata) - mdc_load_reqs (this, xdata); + xdata = mdc_prepare_request(this, local, xdata); - STACK_WIND(frame, 
mdc_readdirp_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdirp, fd, size, offset, - xdata); + STACK_WIND(frame, mdc_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, xdata); - if (need_unref && xdata) - dict_unref (xdata); + if (xdata != NULL) { + dict_unref(xdata); + } - return 0; + return 0; +unwind: + MDC_STACK_UNWIND(readdir, frame, -1, ENOMEM, NULL, NULL); + return 0; } int mdc_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - mdc_local_t *local = NULL; - - local = frame->local; + mdc_local_t *local = NULL; - if (op_ret != 0) - goto out; + local = frame->local; + if (!local) + goto out; - if (!local) - goto out; + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } - mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf, + _gf_true, local->incident_time); out: - MDC_STACK_UNWIND (fallocate, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + MDC_STACK_UNWIND(fallocate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); - return 0; + return 0; } -int mdc_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, - off_t offset, size_t len, dict_t *xdata) +int +mdc_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) { - mdc_local_t *local; + mdc_local_t *local; - local = mdc_local_get(frame); - local->fd = fd_ref(fd); + local = mdc_local_get(frame, fd->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + } - STACK_WIND(frame, mdc_fallocate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, - xdata); + STACK_WIND(frame, mdc_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, + xdata); - return 0; + return 0; } int mdc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - mdc_local_t *local = NULL; - - local = frame->local; + mdc_local_t *local = NULL; - if (op_ret != 0) - goto out; + local = frame->local; + if (!local) + goto out; - if (!local) - goto out; + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } - mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf, + _gf_true, local->incident_time); out: - MDC_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + MDC_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, postbuf, xdata); - return 0; + return 0; } -int mdc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata) +int +mdc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { - mdc_local_t *local; + mdc_local_t *local; - local = mdc_local_get(frame); - local->fd = fd_ref(fd); + local = mdc_local_get(frame, fd->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + } - STACK_WIND(frame, mdc_discard_cbk, 
FIRST_CHILD(this), - FIRST_CHILD(this)->fops->discard, fd, offset, len, - xdata); + STACK_WIND(frame, mdc_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); - return 0; + return 0; } int -mdc_forget (xlator_t *this, inode_t *inode) +mdc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - mdc_inode_wipe (this, inode); + mdc_local_t *local = NULL; - return 0; -} + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf, + _gf_true, local->incident_time); + +out: + MDC_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} int -is_strpfx (const char *str1, const char *str2) +mdc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) { - /* is one of the string a prefix of the other? */ - int i; + mdc_local_t *local; + + local = mdc_local_get(frame, fd->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + } + + STACK_WIND(frame, mdc_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + + return 0; +} + +int32_t +mdc_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, const char *path, + struct iatt *buf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret == 0) + goto out; - for (i = 0; str1[i] == str2[i]; i++) { - if (!str1[i] || !str2[i]) - break; - } + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->loc.inode); - return !(str1[i] && str2[i]); +out: + MDC_STACK_UNWIND(readlink, frame, op_ret, op_errno, path, buf, xdata); + return 0; +} + +int32_t +mdc_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, loc->inode); + if (!local) + goto unwind; + + loc_copy(&local->loc, loc); + + STACK_WIND(frame, mdc_readlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, loc, size, xdata); + return 0; + +unwind: + MDC_STACK_UNWIND(readlink, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; } +int32_t +mdc_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret == 0) + goto out; + + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + +out: + MDC_STACK_UNWIND(fsyncdir, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +mdc_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, fd->inode); + if (!local) + goto unwind; + + local->fd = __fd_ref(fd); + + STACK_WIND(frame, mdc_fsyncdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsyncdir, fd, flags, xdata); + return 0; + +unwind: + MDC_STACK_UNWIND(fsyncdir, frame, -1, ENOMEM, NULL); + return 0; +} + +int32_t +mdc_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto 
out; + + if (op_ret == 0) + goto out; + + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->loc.inode); + +out: + MDC_STACK_UNWIND(access, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +mdc_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, loc->inode); + if (!local) + goto unwind; + + loc_copy(&local->loc, loc); + + STACK_WIND(frame, mdc_access_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->access, loc, mask, xdata); + return 0; + +unwind: + MDC_STACK_UNWIND(access, frame, -1, ENOMEM, NULL); + return 0; +} + +int +mdc_priv_dump(xlator_t *this) +{ + struct mdc_conf *conf = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + + conf = this->private; + + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); + gf_proc_dump_add_section("%s", key_prefix); + + gf_proc_dump_write("stat_hit_count", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.stat_hit)); + gf_proc_dump_write("stat_miss_count", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.stat_miss)); + gf_proc_dump_write("xattr_hit_count", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.xattr_hit)); + gf_proc_dump_write("xattr_miss_count", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.xattr_miss)); + gf_proc_dump_write("nameless_lookup_count", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.nameless_lookup)); + gf_proc_dump_write("negative_lookup_count", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.negative_lookup)); + gf_proc_dump_write("stat_invalidations_received", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.stat_invals)); + gf_proc_dump_write("xattr_invalidations_received", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.xattr_invals)); + + return 0; +} + +static int32_t +mdc_dump_metrics(xlator_t *this, int fd) +{ + struct mdc_conf *conf = NULL; + + conf = this->private; + if (!conf) + goto out; + + dprintf(fd, "%s.stat_cache_hit_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->mdc_counter.stat_hit)); + dprintf(fd, "%s.stat_cache_miss_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->mdc_counter.stat_miss)); + dprintf(fd, "%s.xattr_cache_hit_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->mdc_counter.xattr_hit)); + dprintf(fd, "%s.xattr_cache_miss_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->mdc_counter.xattr_miss)); + dprintf(fd, "%s.nameless_lookup_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->mdc_counter.nameless_lookup)); + dprintf(fd, "%s.negative_lookup_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->mdc_counter.negative_lookup)); + dprintf(fd, "%s.stat_cache_invalidations_received %" PRId64 "\n", + this->name, GF_ATOMIC_GET(conf->mdc_counter.stat_invals)); + dprintf(fd, "%s.xattr_cache_invalidations_received %" PRId64 "\n", + this->name, GF_ATOMIC_GET(conf->mdc_counter.xattr_invals)); +out: + return 0; +} + +int +mdc_forget(xlator_t *this, inode_t *inode) +{ + mdc_inode_wipe(this, inode); + + return 0; +} int -mdc_key_load_set (struct mdc_key *keys, char *pattern, gf_boolean_t val) +is_strpfx(const char *str1, const char *str2) { - struct mdc_key *key = NULL; + /* is one of the string a prefix of the other? 
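+ * For example, both is_strpfx ("security.", "security.selinux") and
+ * is_strpfx ("security.selinux", "security.") return 1, and identical
+ * strings match as well; only strings that diverge before either one
+ * ends return 0.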
*/ + int i; - for (key = keys; key->name; key++) { - if (is_strpfx (key->name, pattern)) - key->load = val; - } + for (i = 0; str1[i] == str2[i]; i++) { + if (!str1[i] || !str2[i]) + break; + } - return 0; + return !(str1[i] && str2[i]); } +static int +mdc_key_unload_all(struct mdc_conf *conf) +{ + conf->mdc_xattr_str = NULL; + + return 0; +} int -reconfigure (xlator_t *this, dict_t *options) +mdc_xattr_list_populate(struct mdc_conf *conf, char *tmp_str) { - struct mdc_conf *conf = NULL; + char *mdc_xattr_str = NULL; + size_t max_size = 0; + int ret = 0; + + max_size = SLEN( + "security.capability,security.selinux,security." + "ima," POSIX_ACL_ACCESS_XATTR "," POSIX_ACL_DEFAULT_XATTR + "," GF_POSIX_ACL_ACCESS "," GF_POSIX_ACL_DEFAULT + "," + "user.swift.metadata,user.DOSATTRIB,user.DosStream.*" + ",user.org.netatalk.Metadata,security.NTACL," + "user.org.netatalk.ResourceFork") + + strlen(tmp_str) + 5; /*Some buffer bytes*/ + + mdc_xattr_str = GF_MALLOC(max_size, gf_common_mt_char); + GF_CHECK_ALLOC(mdc_xattr_str, ret, out); + mdc_xattr_str[0] = '\0'; + + if (conf->cache_capability) + strcat(mdc_xattr_str, "security.capability,"); + + if (conf->cache_selinux) + strcat(mdc_xattr_str, "security.selinux,"); - conf = this->private; + if (conf->cache_ima) + strcat(mdc_xattr_str, "security.ima,"); - GF_OPTION_RECONF ("md-cache-timeout", conf->timeout, options, int32, out); + if (conf->cache_posix_acl) + strcat(mdc_xattr_str, + POSIX_ACL_ACCESS_XATTR "," POSIX_ACL_DEFAULT_XATTR ","); - GF_OPTION_RECONF ("cache-selinux", conf->cache_selinux, options, bool, out); - mdc_key_load_set (mdc_keys, "security.", conf->cache_selinux); + if (conf->cache_glusterfs_acl) + strcat(mdc_xattr_str, GF_POSIX_ACL_ACCESS "," GF_POSIX_ACL_DEFAULT ","); - GF_OPTION_RECONF ("cache-posix-acl", conf->cache_posix_acl, options, bool, out); - mdc_key_load_set (mdc_keys, "system.posix_acl_", conf->cache_posix_acl); + if (conf->cache_swift_metadata) + strcat(mdc_xattr_str, "user.swift.metadata,"); - GF_OPTION_RECONF("force-readdirp", conf->force_readdirp, options, bool, out); + if (conf->cache_samba_metadata) + strcat(mdc_xattr_str, + "user.DOSATTRIB,user.DosStream.*," + "user.org.netatalk.Metadata,user.org.netatalk." 
+ "ResourceFork,security.NTACL,"); + + strcat(mdc_xattr_str, tmp_str); + + LOCK(&conf->lock); + { + /* This is not freed, else is_mdc_key_satisfied, which is + * called by every fop has to take lock, and will lead to + * lock contention + */ + conf->mdc_xattr_str = mdc_xattr_str; + } + UNLOCK(&conf->lock); out: - return 0; + return ret; } -int32_t -mem_acct_init (xlator_t *this) +struct set { + inode_t *inode; + xlator_t *this; +}; + +static int +mdc_inval_xatt(dict_t *d, char *k, data_t *v, void *tmp) +{ + struct set *tmp1 = NULL; + int ret = 0; + + tmp1 = (struct set *)tmp; + ret = mdc_inode_xatt_unset(tmp1->this, tmp1->inode, k); + return ret; +} + +static int +mdc_invalidate(xlator_t *this, void *data) +{ + struct gf_upcall *up_data = NULL; + struct gf_upcall_cache_invalidation *up_ci = NULL; + inode_t *inode = NULL; + int ret = 0; + struct set tmp = { + 0, + }; + inode_table_t *itable = NULL; + struct mdc_conf *conf = this->private; + uint64_t gen = 0; + + up_data = (struct gf_upcall *)data; + + if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION) + goto out; + + up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; + + itable = ((xlator_t *)this->graph->top)->itable; + inode = inode_find(itable, up_data->gfid); + if (!inode) { + ret = -1; + goto out; + } + + if (up_ci->flags & UP_PARENT_DENTRY_FLAGS) { + mdc_update_gfid_stat(this, &up_ci->p_stat); + if (up_ci->flags & UP_RENAME_FLAGS) + mdc_update_gfid_stat(this, &up_ci->oldp_stat); + } + + if (up_ci->flags & UP_EXPLICIT_LOOKUP) { + mdc_inode_set_need_lookup(this, inode, _gf_true); + goto out; + } + + if (up_ci->flags & + (UP_NLINK | UP_RENAME_FLAGS | UP_FORGET | UP_INVAL_ATTR)) { + mdc_inode_iatt_invalidate(this, inode); + mdc_inode_xatt_invalidate(this, inode); + GF_ATOMIC_INC(conf->mdc_counter.stat_invals); + goto out; + } + + if (up_ci->flags & IATT_UPDATE_FLAGS) { + gen = mdc_inc_generation(this, inode); + ret = mdc_inode_iatt_set_validate(this, inode, NULL, &up_ci->stat, + _gf_false, gen); + /* one of the scenarios where ret < 0 is when this invalidate + * is older than the current stat, in that case do not + * update the xattrs as well + */ + if (ret < 0) + goto out; + GF_ATOMIC_INC(conf->mdc_counter.stat_invals); + } + + if (up_ci->flags & UP_XATTR) { + if (up_ci->dict) + ret = mdc_inode_xatt_update(this, inode, up_ci->dict); + else + ret = mdc_inode_xatt_invalidate(this, inode); + + GF_ATOMIC_INC(conf->mdc_counter.xattr_invals); + } else if (up_ci->flags & UP_XATTR_RM) { + tmp.inode = inode; + tmp.this = this; + ret = dict_foreach(up_ci->dict, mdc_inval_xatt, &tmp); + + GF_ATOMIC_INC(conf->mdc_counter.xattr_invals); + } + +out: + if (inode) + inode_unref(inode); + + return ret; +} + +struct mdc_ipc { + xlator_t *this; + dict_t *xattr; +}; + +static int +mdc_send_xattrs_cbk(int ret, call_frame_t *frame, void *data) { - int ret = -1; + struct mdc_ipc *tmp = data; - ret = xlator_mem_acct_init (this, gf_mdc_mt_end + 1); - return ret; + if (ret < 0) { + mdc_key_unload_all(THIS->private); + gf_msg("md-cache", GF_LOG_INFO, 0, MD_CACHE_MSG_NO_XATTR_CACHE, + "Disabled cache for all xattrs, as registering for " + "xattr cache invalidation failed"); + } + STACK_DESTROY(frame->root); + dict_unref(tmp->xattr); + GF_FREE(tmp); + + return 0; +} + +static int +mdc_send_xattrs(void *data) +{ + int ret = 0; + struct mdc_ipc *tmp = data; + + ret = syncop_ipc(FIRST_CHILD(tmp->this), GF_IPC_TARGET_UPCALL, tmp->xattr, + NULL); + DECODE_SYNCOP_ERR(ret); + if (ret < 0) { + gf_msg(tmp->this->name, GF_LOG_WARNING, errno, + 
MD_CACHE_MSG_IPC_UPCALL_FAILED, + "Registering the list " + "of xattrs that needs invalidaton, with upcall, failed"); + } + + return ret; +} + +static int +mdc_register_xattr_inval(xlator_t *this) +{ + dict_t *xattr = NULL; + int ret = 0; + struct mdc_conf *conf = NULL; + call_frame_t *frame = NULL; + struct mdc_ipc *data = NULL; + + conf = this->private; + + LOCK(&conf->lock); + { + if (!conf->mdc_invalidation) { + UNLOCK(&conf->lock); + goto out; + } + } + UNLOCK(&conf->lock); + + xattr = dict_new(); + if (!xattr) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, MD_CACHE_MSG_NO_MEMORY, + "dict_new failed"); + ret = -1; + goto out; + } + + if (!mdc_load_reqs(this, xattr)) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, MD_CACHE_MSG_NO_MEMORY, + "failed to populate cache entries"); + ret = -1; + goto out; + } + + frame = create_frame(this, this->ctx->pool); + if (!frame) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY, + "failed to create the frame"); + ret = -1; + goto out; + } + + data = GF_CALLOC(1, sizeof(struct mdc_ipc), gf_mdc_mt_mdc_ipc); + if (!data) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY, + "failed to allocate memory"); + ret = -1; + goto out; + } + + data->this = this; + data->xattr = xattr; + ret = synctask_new(this->ctx->env, mdc_send_xattrs, mdc_send_xattrs_cbk, + frame, data); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, errno, + MD_CACHE_MSG_IPC_UPCALL_FAILED, + "Registering the list " + "of xattrs that needs invalidaton, with upcall, failed"); + } + +out: + if (ret < 0) { + mdc_key_unload_all(conf); + if (xattr) + dict_unref(xattr); + if (frame) + STACK_DESTROY(frame->root); + GF_FREE(data); + gf_msg(this->name, GF_LOG_INFO, 0, MD_CACHE_MSG_NO_XATTR_CACHE, + "Disabled cache for all xattrs, as registering for " + "xattr cache invalidation failed"); + } + + return ret; } int -init (xlator_t *this) +mdc_reconfigure(xlator_t *this, dict_t *options) { - struct mdc_conf *conf = NULL; + struct mdc_conf *conf = NULL; + int timeout = 0, ret = 0; + char *tmp_str = NULL; + + conf = this->private; + + GF_OPTION_RECONF("md-cache-timeout", timeout, options, int32, out); + + GF_OPTION_RECONF("cache-selinux", conf->cache_selinux, options, bool, out); + + GF_OPTION_RECONF("cache-capability-xattrs", conf->cache_capability, options, + bool, out); + + GF_OPTION_RECONF("cache-ima-xattrs", conf->cache_ima, options, bool, out); - conf = GF_CALLOC (sizeof (*conf), 1, gf_mdc_mt_mdc_conf_t); - if (!conf) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory"); - return -1; - } + GF_OPTION_RECONF("cache-posix-acl", conf->cache_posix_acl, options, bool, + out); - GF_OPTION_INIT ("md-cache-timeout", conf->timeout, int32, out); + GF_OPTION_RECONF("cache-glusterfs-acl", conf->cache_glusterfs_acl, options, + bool, out); - GF_OPTION_INIT ("cache-selinux", conf->cache_selinux, bool, out); - mdc_key_load_set (mdc_keys, "security.", conf->cache_selinux); + GF_OPTION_RECONF("cache-swift-metadata", conf->cache_swift_metadata, + options, bool, out); - GF_OPTION_INIT ("cache-posix-acl", conf->cache_posix_acl, bool, out); - mdc_key_load_set (mdc_keys, "system.posix_acl_", conf->cache_posix_acl); + GF_OPTION_RECONF("cache-samba-metadata", conf->cache_samba_metadata, + options, bool, out); - GF_OPTION_INIT("force-readdirp", conf->force_readdirp, bool, out); + GF_OPTION_RECONF("force-readdirp", conf->force_readdirp, options, bool, + out); + + GF_OPTION_RECONF("cache-invalidation", conf->mdc_invalidation, options, + bool, out); + + 
GF_OPTION_RECONF("global-cache-invalidation", conf->global_invalidation, + options, bool, out); + + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out); + + GF_OPTION_RECONF("md-cache-statfs", conf->cache_statfs, options, bool, out); + + GF_OPTION_RECONF("xattr-cache-list", tmp_str, options, str, out); + + ret = mdc_xattr_list_populate(conf, tmp_str); + if (ret < 0) + goto out; + + /* If timeout is greater than 60s (default before the patch that added + * cache invalidation support was added) then, cache invalidation + * feature for md-cache needs to be enabled, if not set timeout to the + * previous max which is 60s + */ + if ((timeout > 60) && (!conf->mdc_invalidation)) { + conf->timeout = 60; + goto out; + } + conf->timeout = timeout; + + ret = mdc_register_xattr_inval(this); out: - this->private = conf; + return ret; +} - return 0; +int32_t +mdc_mem_acct_init(xlator_t *this) +{ + return xlator_mem_acct_init(this, gf_mdc_mt_end + 1); } +int +mdc_init(xlator_t *this) +{ + struct mdc_conf *conf = NULL; + uint32_t timeout = 0; + char *tmp_str = NULL; + + conf = GF_CALLOC(sizeof(*conf), 1, gf_mdc_mt_mdc_conf_t); + if (!conf) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY, + "out of memory"); + return -1; + } + + LOCK_INIT(&conf->lock); + + GF_OPTION_INIT("md-cache-timeout", timeout, uint32, out); + + GF_OPTION_INIT("cache-selinux", conf->cache_selinux, bool, out); + + GF_OPTION_INIT("cache-capability-xattrs", conf->cache_capability, bool, + out); + + GF_OPTION_INIT("cache-ima-xattrs", conf->cache_ima, bool, out); + + GF_OPTION_INIT("cache-posix-acl", conf->cache_posix_acl, bool, out); + + GF_OPTION_INIT("cache-glusterfs-acl", conf->cache_glusterfs_acl, bool, out); + + GF_OPTION_INIT("cache-swift-metadata", conf->cache_swift_metadata, bool, + out); + + GF_OPTION_INIT("cache-samba-metadata", conf->cache_samba_metadata, bool, + out); + + GF_OPTION_INIT("force-readdirp", conf->force_readdirp, bool, out); + + GF_OPTION_INIT("cache-invalidation", conf->mdc_invalidation, bool, out); + + GF_OPTION_INIT("global-cache-invalidation", conf->global_invalidation, bool, + out); + + GF_OPTION_INIT("pass-through", this->pass_through, bool, out); + + pthread_mutex_init(&conf->statfs_cache.lock, NULL); + GF_OPTION_INIT("md-cache-statfs", conf->cache_statfs, bool, out); + + GF_OPTION_INIT("xattr-cache-list", tmp_str, str, out); + mdc_xattr_list_populate(conf, tmp_str); + + conf->last_child_down = gf_time(); + conf->statfs_cache.last_refreshed = (time_t)-1; + + /* initialize gf_atomic_t counters */ + GF_ATOMIC_INIT(conf->mdc_counter.stat_hit, 0); + GF_ATOMIC_INIT(conf->mdc_counter.stat_miss, 0); + GF_ATOMIC_INIT(conf->mdc_counter.xattr_hit, 0); + GF_ATOMIC_INIT(conf->mdc_counter.xattr_miss, 0); + GF_ATOMIC_INIT(conf->mdc_counter.negative_lookup, 0); + GF_ATOMIC_INIT(conf->mdc_counter.nameless_lookup, 0); + GF_ATOMIC_INIT(conf->mdc_counter.stat_invals, 0); + GF_ATOMIC_INIT(conf->mdc_counter.xattr_invals, 0); + GF_ATOMIC_INIT(conf->mdc_counter.need_lookup, 0); + GF_ATOMIC_INIT(conf->generation, 0); + + /* If timeout is greater than 60s (default before the patch that added + * cache invalidation support was added) then, cache invalidation + * feature for md-cache needs to be enabled, if not set timeout to the + * previous max which is 60s + */ + if ((timeout > 60) && (!conf->mdc_invalidation)) { + conf->timeout = 60; + goto out; + } + conf->timeout = timeout; + +out: + this->private = conf; + + return 0; +} void -fini (xlator_t *this) +mdc_update_child_down_time(xlator_t 
*this, time_t now) { - return; + struct mdc_conf *conf = NULL; + + conf = this->private; + + LOCK(&conf->lock); + { + conf->last_child_down = now; + } + UNLOCK(&conf->lock); } +int +mdc_notify(xlator_t *this, int event, void *data, ...) +{ + int ret = 0; + struct mdc_conf *conf = NULL; + + conf = this->private; + switch (event) { + case GF_EVENT_CHILD_DOWN: + case GF_EVENT_SOME_DESCENDENT_DOWN: + mdc_update_child_down_time(this, gf_time()); + break; + case GF_EVENT_UPCALL: + if (conf->mdc_invalidation) + ret = mdc_invalidate(this, data); + break; + case GF_EVENT_CHILD_UP: + case GF_EVENT_SOME_DESCENDENT_UP: + ret = mdc_register_xattr_inval(this); + break; + default: + break; + } + + if (default_notify(this, event, data) != 0) + ret = -1; + + return ret; +} -struct xlator_fops fops = { - .lookup = mdc_lookup, - .stat = mdc_stat, - .fstat = mdc_fstat, - .truncate = mdc_truncate, - .ftruncate = mdc_ftruncate, - .mknod = mdc_mknod, - .mkdir = mdc_mkdir, - .unlink = mdc_unlink, - .rmdir = mdc_rmdir, - .symlink = mdc_symlink, - .rename = mdc_rename, - .link = mdc_link, - .create = mdc_create, - .readv = mdc_readv, - .writev = mdc_writev, - .setattr = mdc_setattr, - .fsetattr = mdc_fsetattr, - .fsync = mdc_fsync, - .setxattr = mdc_setxattr, - .fsetxattr = mdc_fsetxattr, - .getxattr = mdc_getxattr, - .fgetxattr = mdc_fgetxattr, - .readdirp = mdc_readdirp, - .readdir = mdc_readdir, - .fallocate = mdc_fallocate, - .discard = mdc_discard, +void +mdc_fini(xlator_t *this) +{ + GF_FREE(this->private); +} + +struct xlator_fops mdc_fops = { + .lookup = mdc_lookup, + .stat = mdc_stat, + .fstat = mdc_fstat, + .truncate = mdc_truncate, + .ftruncate = mdc_ftruncate, + .mknod = mdc_mknod, + .mkdir = mdc_mkdir, + .unlink = mdc_unlink, + .rmdir = mdc_rmdir, + .symlink = mdc_symlink, + .rename = mdc_rename, + .link = mdc_link, + .create = mdc_create, + .open = mdc_open, + .readv = mdc_readv, + .writev = mdc_writev, + .setattr = mdc_setattr, + .fsetattr = mdc_fsetattr, + .fsync = mdc_fsync, + .setxattr = mdc_setxattr, + .fsetxattr = mdc_fsetxattr, + .getxattr = mdc_getxattr, + .fgetxattr = mdc_fgetxattr, + .removexattr = mdc_removexattr, + .fremovexattr = mdc_fremovexattr, + .opendir = mdc_opendir, + .readdirp = mdc_readdirp, + .readdir = mdc_readdir, + .fallocate = mdc_fallocate, + .discard = mdc_discard, + .zerofill = mdc_zerofill, + .statfs = mdc_statfs, + .readlink = mdc_readlink, + .fsyncdir = mdc_fsyncdir, + .access = mdc_access, }; +struct xlator_cbks mdc_cbks = { + .forget = mdc_forget, +}; + +struct xlator_dumpops mdc_dumpops = { + .priv = mdc_priv_dump, +}; -struct xlator_cbks cbks = { - .forget = mdc_forget, +struct volume_options mdc_options[] = { + { + .key = {"md-cache"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable md-cache", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + { + .key = {"cache-selinux"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache selinux xattr(security.selinux) on client side", + }, + { + .key = {"cache-capability-xattrs"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .op_version = {GD_OP_VERSION_3_10_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache capability xattr(security.capability) on " + "client side", + }, + { + .key = {"cache-ima-xattrs"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .op_version = 
{GD_OP_VERSION_3_10_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache Linux integrity subsystem xattr(security.ima) " + "on client side", + }, + { + .key = {"cache-swift-metadata"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_3_7_10}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache swift metadata (user.swift.metadata xattr)", + }, + { + .key = {"cache-samba-metadata"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache samba metadata (user.DOSATTRIB, security.NTACL," + " org.netatalk.Metadata, org.netatalk.ResourceFork, " + "and user.DosStream. xattrs)", + }, + { + .key = {"cache-posix-acl"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache posix ACL xattrs (system.posix_acl_access, " + "system.posix_acl_default) on client side", + }, + { + .key = {"cache-glusterfs-acl"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache virtual glusterfs ACL xattrs " + "(glusterfs.posix.acl, glusterfs.posix.default_acl) " + "on client side", + }, + { + .key = {"md-cache-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 600, + .default_value = SITE_H_MD_CACHE_TIMEOUT, + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Time period after which cache has to be refreshed", + }, + { + .key = {"force-readdirp"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Convert all readdir requests to readdirplus to " + "collect stat info on each entry.", + }, + { + .key = {"cache-invalidation"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "When \"on\", invalidates/updates the metadata cache," + " on receiving the cache-invalidation notifications", + }, + { + .key = {"global-cache-invalidation"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = + "When \"on\", purges all read caches in kernel and glusterfs stack " + "whenever a stat change is detected. Stat changes can be detected " + "while processing responses to file operations (fop) or through " + "upcall notifications. Since purging caches can be an expensive " + "operation, it's advised to have this option \"on\" only when a " + "file " + "can be accessed from multiple different Glusterfs mounts and " + "caches across these different mounts are required to be coherent. " + "If a file is not accessed across different mounts " + "(simple example is having only one mount for a volume), its " + "advised to keep " + "this option \"off\" as all file modifications go through caches " + "keeping them " + "coherent. 
This option overrides value of " + "performance.cache-invalidation.", + }, + { + .key = {"md-cache-statfs"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache statfs information of filesystem on the client", + }, + { + .key = {"xattr-cache-list"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "A comma separated list of xattrs that shall be " + "cached by md-cache. The only wildcard allowed is '*'", + }, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"md-cache"}, + .description = "Enable/Disable md cache translator"}, + {.key = {NULL}}, }; -struct volume_options options[] = { - { .key = {"cache-selinux"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "false", - }, - { .key = {"cache-posix-acl"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "false", - }, - { .key = {"md-cache-timeout"}, - .type = GF_OPTION_TYPE_INT, - .min = 0, - .max = 60, - .default_value = "1", - .description = "Time period after which cache has to be refreshed", - }, - { .key = {"force-readdirp"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "true", - .description = "Convert all readdir requests to readdirplus to " - "collect stat info on each entry.", - }, - { .key = {NULL} }, +xlator_api_t xlator_api = { + .init = mdc_init, + .fini = mdc_fini, + .notify = mdc_notify, + .reconfigure = mdc_reconfigure, + .mem_acct_init = mdc_mem_acct_init, + .dump_metrics = mdc_dump_metrics, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &mdc_dumpops, + .fops = &mdc_fops, + .cbks = &mdc_cbks, + .options = mdc_options, + .identifier = "md-cache", + .category = GF_MAINTAINED, }; diff --git a/xlators/performance/symlink-cache/Makefile.am b/xlators/performance/nl-cache/Makefile.am index d471a3f9243..a985f42a877 100644 --- a/xlators/performance/symlink-cache/Makefile.am +++ b/xlators/performance/nl-cache/Makefile.am @@ -1,3 +1,3 @@ SUBDIRS = src -CLEANFILES = +CLEANFILES = diff --git a/xlators/performance/nl-cache/src/Makefile.am b/xlators/performance/nl-cache/src/Makefile.am new file mode 100644 index 00000000000..c44ce871627 --- /dev/null +++ b/xlators/performance/nl-cache/src/Makefile.am @@ -0,0 +1,12 @@ +xlator_LTLIBRARIES = nl-cache.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance +nl_cache_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) +nl_cache_la_SOURCES = nl-cache.c nl-cache-helper.c +nl_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +noinst_HEADERS = nl-cache.h nl-cache-mem-types.h nl-cache-messages.h +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(CONTRIBDIR)/timer-wheel + +AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS) +CLEANFILES = diff --git a/xlators/performance/nl-cache/src/nl-cache-helper.c b/xlators/performance/nl-cache/src/nl-cache-helper.c new file mode 100644 index 00000000000..29b99b5b8ea --- /dev/null +++ b/xlators/performance/nl-cache/src/nl-cache-helper.c @@ -0,0 +1,1201 @@ +/* + * Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. 
+ * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#include "nl-cache.h" +#include "timer-wheel.h" +#include <glusterfs/statedump.h> + +/* Caching guidelines: + * This xlator serves negative lookup(ENOENT lookups) from the cache, + * there by making create faster. + * What is cached? + * Negative lookup cache is stored for each directory, and has 2 entries: + * - Negative entries: Populated only when lookup/stat returns ENOENT. + * Fuse mostly sends only one lookup before create, hence negative entry + * cache is almost useless. But for SMB access, multiple lookups/stats + * are sent before creating the file. Hence the negative entry cache. + * It can exist even when the positive entry cache is invalid. It also + * has the entries that were deleted from this directory. + * Freed on receiving upcall(with dentry change flag) or on expiring + * timeout of the cache. + * + * - Positive entries: Populated as a part of readdirp, and as a part of + * mkdir followed by creates inside that directory. Lookups and other + * fops do not populate the positive entry (as it can grow long and is + * of no value add) + * Freed on receiving upcall(with dentry change flag) or on expiring + * timeout of the cache. + * + * Data structures to store cache? + * The cache of any directory is stored in the inode_ctx of the directory. + * Negative entries are stored as list of strings. + * Search - O(n) + * Add - O(1) + * Delete - O(n) - as it has to be searched before deleting + * Positive entries are stored as a list, each list node has a pointer + * to the inode of the positive entry or the name of the entry. + * Since the client side inode table already will have inodes for + * positive entries, we just take a ref of that inode and store as + * positive entry cache. In cases like hardlinks and readdirp where + * inode is NULL, we store the names. + * Name Search - O(n) + * Inode Search - O(1) - Actually complexity of inode_find() + * Name/inode Add - O(1) + * Name Delete - O(n) + * Inode Delete - O(1) + * + * Locking order: + * + * TODO: + * - Fill Positive entries on readdir/p, after which in lookup_cbk check if the + * name is in PE and replace it with inode. + * - fini, PARENET_DOWN, disable caching + * - Virtual setxattr to dump the inode_ctx, to ease debugging + * - Handle dht_nuke xattr: clear all cache + * - Special handling for .meta and .trashcan? 
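+ *
+ * Illustrative fast path (a sketch of the expected caller in nl-cache.c,
+ * not part of this helper file): a lookup can be short-circuited when
+ * the parent directory has a valid cache:
+ *
+ *   if (loc->parent && nlc_is_negative_lookup (this, loc)) {
+ *       // serve ENOENT from the cache, no wind to the bricks
+ *       STACK_UNWIND_STRICT (lookup, frame, -1, ENOENT,
+ *                            NULL, NULL, NULL, NULL);
+ *       return 0;
+ *   }
+ *   // and when a wound lookup does return ENOENT, the cbk side is
+ *   // expected to record it via nlc_dir_add_ne (this, loc->parent,
+ *   // loc->name).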
+ */ + +int +__nlc_inode_ctx_timer_start(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx); +int +__nlc_add_to_lru(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx); +void +nlc_remove_from_lru(xlator_t *this, inode_t *inode); +void +__nlc_inode_ctx_timer_delete(xlator_t *this, nlc_ctx_t *nlc_ctx); +gf_boolean_t +__nlc_search_ne(nlc_ctx_t *nlc_ctx, const char *name); +void +__nlc_free_pe(xlator_t *this, nlc_ctx_t *nlc_ctx, nlc_pe_t *pe); +void +__nlc_free_ne(xlator_t *this, nlc_ctx_t *nlc_ctx, nlc_ne_t *ne); + +static int32_t +nlc_get_cache_timeout(xlator_t *this) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + + /* Cache timeout is generally not meant to be changed often, + * once set, hence not within locks */ + return conf->cache_timeout; +} + +static gf_boolean_t +__nlc_is_cache_valid(xlator_t *this, nlc_ctx_t *nlc_ctx) +{ + nlc_conf_t *conf = NULL; + time_t last_val_time; + gf_boolean_t ret = _gf_false; + + GF_VALIDATE_OR_GOTO(this->name, nlc_ctx, out); + + conf = this->private; + + LOCK(&conf->lock); + { + last_val_time = conf->last_child_down; + } + UNLOCK(&conf->lock); + + if ((last_val_time <= nlc_ctx->cache_time) && (nlc_ctx->cache_time != 0)) + ret = _gf_true; +out: + return ret; +} + +void +nlc_update_child_down_time(xlator_t *this, time_t now) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + + LOCK(&conf->lock); + { + conf->last_child_down = now; + } + UNLOCK(&conf->lock); + + return; +} + +void +nlc_disable_cache(xlator_t *this) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + + LOCK(&conf->lock); + { + conf->disable_cache = _gf_true; + } + UNLOCK(&conf->lock); + + return; +} + +static int +__nlc_inode_ctx_get(xlator_t *this, inode_t *inode, nlc_ctx_t **nlc_ctx_p) +{ + int ret = 0; + nlc_ctx_t *nlc_ctx = NULL; + uint64_t nlc_ctx_int = 0; + uint64_t nlc_pe_int = 0; + + ret = __inode_ctx_get2(inode, this, &nlc_ctx_int, &nlc_pe_int); + if (ret == 0 && nlc_ctx_p) { + nlc_ctx = (void *)(long)(nlc_ctx_int); + *nlc_ctx_p = nlc_ctx; + } + return ret; +} + +static int +nlc_inode_ctx_set(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx, + nlc_pe_t *nlc_pe_p) +{ + uint64_t ctx1, ctx2; + int ret = -1; + + ctx1 = (uint64_t)(uintptr_t)nlc_ctx; + ctx2 = (uint64_t)(uintptr_t)nlc_pe_p; + + /* The caller may choose to set one of the ctxs, hence check + * if the ctx1/2 is non zero and then send the address. If we + * blindly send the address of both the ctxs, it may reset the + * ctx the caller had sent NULL(intended as leave untouched) for.*/ + LOCK(&inode->lock); + { + ret = __inode_ctx_set2(inode, this, ctx1 ? &ctx1 : 0, ctx2 ? 
&ctx2 : 0); + } + UNLOCK(&inode->lock); + return ret; +} + +static void +nlc_inode_ctx_get(xlator_t *this, inode_t *inode, nlc_ctx_t **nlc_ctx_p) +{ + int ret = 0; + + LOCK(&inode->lock); + { + ret = __nlc_inode_ctx_get(this, inode, nlc_ctx_p); + if (ret < 0) + gf_msg_debug(this->name, 0, + "inode ctx get failed for " + "inode:%p", + inode); + } + UNLOCK(&inode->lock); + + return; +} + +static void +__nlc_inode_clear_entries(xlator_t *this, nlc_ctx_t *nlc_ctx) +{ + nlc_pe_t *pe = NULL; + nlc_pe_t *tmp = NULL; + nlc_ne_t *ne = NULL; + nlc_ne_t *tmp1 = NULL; + + if (!nlc_ctx) + goto out; + + if (IS_PE_VALID(nlc_ctx->state)) + list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list) + { + __nlc_free_pe(this, nlc_ctx, pe); + } + + if (IS_NE_VALID(nlc_ctx->state)) + list_for_each_entry_safe(ne, tmp1, &nlc_ctx->ne, list) + { + __nlc_free_ne(this, nlc_ctx, ne); + } + + nlc_ctx->cache_time = 0; + nlc_ctx->state = 0; + GF_ASSERT(nlc_ctx->cache_size == sizeof(*nlc_ctx)); + GF_ASSERT(nlc_ctx->refd_inodes == 0); +out: + return; +} + +static void +nlc_init_invalid_ctx(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx) +{ + nlc_conf_t *conf = NULL; + int ret = -1; + + conf = this->private; + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + if (__nlc_is_cache_valid(this, nlc_ctx)) + goto unlock; + + /* The cache/nlc_ctx can be invalid for 2 reasons: + * - Because of a child-down/timer expiry, cache is + * invalid but the nlc_ctx is not yet cleaned up. + * - nlc_ctx is cleaned up, because of invalidations + * or lru prune etc.*/ + + /* If the cache is present but invalid, clear the cache and + * reset the timer. */ + __nlc_inode_clear_entries(this, nlc_ctx); + + /* If timer is present, then it is already part of lru as well + * Hence reset the timer and return.*/ + if (nlc_ctx->timer) { + gf_tw_mod_timer_pending(conf->timer_wheel, nlc_ctx->timer, + conf->cache_timeout); + nlc_ctx->cache_time = gf_time(); + goto unlock; + } + + /* If timer was NULL, the nlc_ctx is already cleanedup, + * and we need to start timer and add to lru, so that it is + * ready to cache entries a fresh */ + ret = __nlc_inode_ctx_timer_start(this, inode, nlc_ctx); + if (ret < 0) + goto unlock; + + ret = __nlc_add_to_lru(this, inode, nlc_ctx); + if (ret < 0) { + __nlc_inode_ctx_timer_delete(this, nlc_ctx); + goto unlock; + } + } +unlock: + UNLOCK(&nlc_ctx->lock); +out: + return; +} + +static nlc_ctx_t * +nlc_inode_ctx_get_set(xlator_t *this, inode_t *inode, nlc_ctx_t **nlc_ctx_p) +{ + uint64_t ctx; + int ret = 0; + nlc_ctx_t *nlc_ctx = NULL; + nlc_conf_t *conf = NULL; + + conf = this->private; + + LOCK(&inode->lock); + { + ret = __nlc_inode_ctx_get(this, inode, &nlc_ctx); + if (nlc_ctx) + goto unlock; + + nlc_ctx = GF_CALLOC(sizeof(*nlc_ctx), 1, gf_nlc_mt_nlc_ctx_t); + if (!nlc_ctx) + goto unlock; + + LOCK_INIT(&nlc_ctx->lock); + INIT_LIST_HEAD(&nlc_ctx->pe); + INIT_LIST_HEAD(&nlc_ctx->ne); + + ret = __nlc_inode_ctx_timer_start(this, inode, nlc_ctx); + if (ret < 0) + goto unlock; + + ret = __nlc_add_to_lru(this, inode, nlc_ctx); + if (ret < 0) { + __nlc_inode_ctx_timer_delete(this, nlc_ctx); + goto unlock; + } + + ctx = (uint64_t)(uintptr_t)nlc_ctx; + ret = __inode_ctx_set2(inode, this, &ctx, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, NLC_MSG_NO_MEMORY, + "inode ctx set failed"); + __nlc_inode_ctx_timer_delete(this, nlc_ctx); + nlc_remove_from_lru(this, inode); + goto unlock; + } + + /*TODO: also sizeof (gf_tw_timer_list) + nlc_timer_data_t ?*/ + nlc_ctx->cache_size = sizeof(*nlc_ctx); + 
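+ /* The context is charged its own size up front; __nlc_add_pe() and
+  * __nlc_add_ne() grow cache_size and conf->current_cache_size as
+  * entries are cached, the __nlc_free_*() helpers shrink them again,
+  * and nlc_lru_prune() checks current_cache_size against
+  * conf->cache_size (and refd_inodes against the inode limit) to
+  * decide when to evict a directory's cache. */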
GF_ATOMIC_ADD(conf->current_cache_size, nlc_ctx->cache_size); + } +unlock: + UNLOCK(&inode->lock); + + if (ret == 0 && nlc_ctx_p) { + *nlc_ctx_p = nlc_ctx; + nlc_init_invalid_ctx(this, inode, nlc_ctx); + } + + if (ret < 0 && nlc_ctx) { + LOCK_DESTROY(&nlc_ctx->lock); + GF_FREE(nlc_ctx); + nlc_ctx = NULL; + goto out; + } + +out: + return nlc_ctx; +} + +nlc_local_t * +nlc_local_init(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + loc_t *loc, loc_t *loc2) +{ + nlc_local_t *local = NULL; + + local = GF_CALLOC(sizeof(*local), 1, gf_nlc_mt_nlc_local_t); + if (!local) + goto out; + + if (loc) + loc_copy(&local->loc, loc); + if (loc2) + loc_copy(&local->loc2, loc2); + + local->fop = fop; + frame->local = local; +out: + return local; +} + +void +nlc_local_wipe(xlator_t *this, nlc_local_t *local) +{ + if (!local) + goto out; + + loc_wipe(&local->loc); + + loc_wipe(&local->loc2); + + GF_FREE(local); +out: + return; +} + +static void +__nlc_set_dir_state(nlc_ctx_t *nlc_ctx, uint64_t new_state) +{ + nlc_ctx->state |= new_state; + + return; +} + +void +nlc_set_dir_state(xlator_t *this, inode_t *inode, uint64_t state) +{ + nlc_ctx_t *nlc_ctx = NULL; + + if (inode->ia_type != IA_IFDIR) { + gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL, + "inode is not of type dir"); + goto out; + } + + nlc_inode_ctx_get_set(this, inode, &nlc_ctx); + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + __nlc_set_dir_state(nlc_ctx, state); + } + UNLOCK(&nlc_ctx->lock); +out: + return; +} + +static void +nlc_cache_timeout_handler(struct gf_tw_timer_list *timer, void *data, + unsigned long calltime) +{ + nlc_timer_data_t *tmp = data; + nlc_ctx_t *nlc_ctx = NULL; + + nlc_inode_ctx_get(tmp->this, tmp->inode, &nlc_ctx); + if (!nlc_ctx) + goto out; + + /* Taking nlc_ctx->lock will lead to deadlock, hence updating + * the cache is invalid outside of lock, instead of clear_cache. + * Since cache_time is assigned outside of lock, the value can + * be invalid for short time, this may result in false negative + * which is better than deadlock */ + nlc_ctx->cache_time = 0; +out: + return; +} + +void +__nlc_inode_ctx_timer_delete(xlator_t *this, nlc_ctx_t *nlc_ctx) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + + if (nlc_ctx->timer) + gf_tw_del_timer(conf->timer_wheel, nlc_ctx->timer); + + if (nlc_ctx->timer_data) { + inode_unref(nlc_ctx->timer_data->inode); + GF_FREE(nlc_ctx->timer_data); + nlc_ctx->timer_data = NULL; + } + + GF_FREE(nlc_ctx->timer); + nlc_ctx->timer = NULL; + + return; +} + +int +__nlc_inode_ctx_timer_start(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx) +{ + struct gf_tw_timer_list *timer = NULL; + nlc_timer_data_t *tmp = NULL; + nlc_conf_t *conf = NULL; + int ret = -1; + + conf = this->private; + + /* We are taking inode_table->lock within inode->lock + * as the only other caller which takes inode->lock within + * inode_table->lock and cause deadlock is inode_table_destroy. + * Hopefully, there can be no fop when inode_table_destroy is + * being called. 
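+ *
+ * Note that expiry of the timer registered below does not free the
+ * cached entries by itself: nlc_cache_timeout_handler() only zeroes
+ * cache_time (it cannot take nlc_ctx->lock there), and the stale
+ * entries are reclaimed lazily by nlc_init_invalid_ctx() on the next
+ * access or by nlc_lru_prune().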
*/ + tmp = GF_CALLOC(1, sizeof(*tmp), gf_nlc_mt_nlc_timer_data_t); + if (!tmp) + goto out; + tmp->inode = inode_ref(inode); + tmp->this = this; + + timer = GF_CALLOC(1, sizeof(*timer), gf_common_mt_tw_timer_list); + if (!timer) + goto out; + + INIT_LIST_HEAD(&timer->entry); + timer->expires = nlc_get_cache_timeout(this); + timer->function = nlc_cache_timeout_handler; + timer->data = tmp; + nlc_ctx->timer = timer; + nlc_ctx->timer_data = tmp; + gf_tw_add_timer(conf->timer_wheel, timer); + + nlc_ctx->cache_time = gf_time(); + gf_msg_trace(this->name, 0, + "Registering timer:%p, inode:%p, " + "gfid:%s", + timer, inode, uuid_utoa(inode->gfid)); + + ret = 0; + +out: + if (ret < 0) { + if (tmp && tmp->inode) + inode_unref(tmp->inode); + GF_FREE(tmp); + GF_FREE(timer); + } + + return ret; +} + +int +__nlc_add_to_lru(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx) +{ + nlc_lru_node_t *lru_ino = NULL; + uint64_t nlc_pe_int = 0; + nlc_conf_t *conf = NULL; + int ret = -1; + + conf = this->private; + + lru_ino = GF_CALLOC(1, sizeof(*lru_ino), gf_nlc_mt_nlc_lru_node); + if (!lru_ino) + goto out; + + INIT_LIST_HEAD(&lru_ino->list); + lru_ino->inode = inode_ref(inode); + LOCK(&conf->lock); + { + list_add_tail(&lru_ino->list, &conf->lru); + } + UNLOCK(&conf->lock); + + nlc_ctx->refd_inodes = 0; + ret = __inode_ctx_get2(inode, this, NULL, &nlc_pe_int); + if (nlc_pe_int == 0) + GF_ATOMIC_ADD(conf->refd_inodes, 1); + + ret = 0; + +out: + return ret; +} + +void +nlc_remove_from_lru(xlator_t *this, inode_t *inode) +{ + nlc_lru_node_t *lru_node = NULL; + nlc_lru_node_t *tmp = NULL; + nlc_lru_node_t *tmp1 = NULL; + nlc_conf_t *conf = NULL; + + conf = this->private; + + LOCK(&conf->lock); + { + list_for_each_entry_safe(lru_node, tmp, &conf->lru, list) + { + if (inode == lru_node->inode) { + list_del(&lru_node->list); + tmp1 = lru_node; + break; + } + } + } + UNLOCK(&conf->lock); + + if (tmp1) { + inode_unref(tmp1->inode); + GF_FREE(tmp1); + } + + return; +} + +void +nlc_lru_prune(xlator_t *this, inode_t *inode) +{ + nlc_lru_node_t *lru_node = NULL; + nlc_lru_node_t *prune_node = NULL; + nlc_lru_node_t *tmp = NULL; + nlc_conf_t *conf = NULL; + + conf = this->private; + + LOCK(&conf->lock); + { + if ((GF_ATOMIC_GET(conf->refd_inodes) < conf->inode_limit) && + (GF_ATOMIC_GET(conf->current_cache_size) < conf->cache_size)) + goto unlock; + + list_for_each_entry_safe(lru_node, tmp, &conf->lru, list) + { + list_del(&lru_node->list); + prune_node = lru_node; + goto unlock; + } + } +unlock: + UNLOCK(&conf->lock); + + if (prune_node) { + nlc_inode_clear_cache(this, prune_node->inode, NLC_LRU_PRUNE); + inode_unref(prune_node->inode); + GF_FREE(prune_node); + } + return; +} + +void +nlc_clear_all_cache(xlator_t *this) +{ + nlc_conf_t *conf = NULL; + struct list_head clear_list; + nlc_lru_node_t *prune_node = NULL; + nlc_lru_node_t *tmp = NULL; + + conf = this->private; + + INIT_LIST_HEAD(&clear_list); + + LOCK(&conf->lock); + { + list_replace_init(&conf->lru, &clear_list); + } + UNLOCK(&conf->lock); + + list_for_each_entry_safe(prune_node, tmp, &clear_list, list) + { + list_del(&prune_node->list); + nlc_inode_clear_cache(this, prune_node->inode, NLC_LRU_PRUNE); + inode_unref(prune_node->inode); + GF_FREE(prune_node); + } + + return; +} + +void +__nlc_free_pe(xlator_t *this, nlc_ctx_t *nlc_ctx, nlc_pe_t *pe) +{ + uint64_t pe_int = 0; + nlc_conf_t *conf = NULL; + uint64_t nlc_ctx_int = 0; + + conf = this->private; + + if (pe->inode) { + inode_ctx_reset1(pe->inode, this, &pe_int); + inode_ctx_get2(pe->inode, this, 
&nlc_ctx_int, NULL); + inode_unref(pe->inode); + } + list_del(&pe->list); + + nlc_ctx->cache_size -= sizeof(*pe) + sizeof(pe->name); + GF_ATOMIC_SUB(conf->current_cache_size, (sizeof(*pe) + sizeof(pe->name))); + + nlc_ctx->refd_inodes -= 1; + if (nlc_ctx_int == 0) + GF_ATOMIC_SUB(conf->refd_inodes, 1); + + GF_FREE(pe->name); + GF_FREE(pe); + + return; +} + +void +__nlc_free_ne(xlator_t *this, nlc_ctx_t *nlc_ctx, nlc_ne_t *ne) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + + list_del(&ne->list); + GF_FREE(ne->name); + GF_FREE(ne); + + nlc_ctx->cache_size -= sizeof(*ne) + sizeof(ne->name); + GF_ATOMIC_SUB(conf->current_cache_size, (sizeof(*ne) + sizeof(ne->name))); + + return; +} + +void +nlc_inode_clear_cache(xlator_t *this, inode_t *inode, int reason) +{ + nlc_ctx_t *nlc_ctx = NULL; + + nlc_inode_ctx_get(this, inode, &nlc_ctx); + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + __nlc_inode_ctx_timer_delete(this, nlc_ctx); + + __nlc_inode_clear_entries(this, nlc_ctx); + } + UNLOCK(&nlc_ctx->lock); + + if (reason != NLC_LRU_PRUNE) + nlc_remove_from_lru(this, inode); + +out: + return; +} + +static void +__nlc_del_pe(xlator_t *this, nlc_ctx_t *nlc_ctx, inode_t *entry_ino, + const char *name, gf_boolean_t multilink) +{ + nlc_pe_t *pe = NULL; + nlc_pe_t *tmp = NULL; + gf_boolean_t found = _gf_false; + uint64_t pe_int = 0; + + if (!IS_PE_VALID(nlc_ctx->state)) + goto out; + + if (!entry_ino) + goto name_search; + + /* If there are hardlinks first search names, followed by inodes */ + if (multilink) { + list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list) + { + if (pe->name && (strcmp(pe->name, name) == 0)) { + found = _gf_true; + goto out; + } + } + inode_ctx_reset1(entry_ino, this, &pe_int); + if (pe_int) { + pe = (void *)(long)(pe_int); + found = _gf_true; + goto out; + } + goto out; + } + + inode_ctx_reset1(entry_ino, this, &pe_int); + if (pe_int) { + pe = (void *)(long)(pe_int); + found = _gf_true; + goto out; + } + +name_search: + list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list) + { + if (pe->name && (strcmp(pe->name, name) == 0)) { + found = _gf_true; + break; + /* TODO: can there be duplicates? */ + } + } + +out: + if (found) + __nlc_free_pe(this, nlc_ctx, pe); + + return; +} + +static void +__nlc_del_ne(xlator_t *this, nlc_ctx_t *nlc_ctx, const char *name) +{ + nlc_ne_t *ne = NULL; + nlc_ne_t *tmp = NULL; + + if (!IS_NE_VALID(nlc_ctx->state)) + goto out; + + list_for_each_entry_safe(ne, tmp, &nlc_ctx->ne, list) + { + if (strcmp(ne->name, name) == 0) { + __nlc_free_ne(this, nlc_ctx, ne); + break; + } + } +out: + return; +} + +static void +__nlc_add_pe(xlator_t *this, nlc_ctx_t *nlc_ctx, inode_t *entry_ino, + const char *name) +{ + nlc_pe_t *pe = NULL; + int ret = -1; + nlc_conf_t *conf = NULL; + uint64_t nlc_ctx_int = 0; + + conf = this->private; + + /* TODO: There can be no duplicate entries, as it is added only + during create. 
In case there arises duplicate entries, search PE + found = __nlc_search (entries, name, _gf_false); + can use bit vector to have simple search than sequential search */ + + pe = GF_CALLOC(sizeof(*pe), 1, gf_nlc_mt_nlc_pe_t); + if (!pe) + goto out; + + if (entry_ino) { + pe->inode = inode_ref(entry_ino); + nlc_inode_ctx_set(this, entry_ino, NULL, pe); + } else if (name) { + pe->name = gf_strdup(name); + if (!pe->name) + goto out; + } + + list_add(&pe->list, &nlc_ctx->pe); + + nlc_ctx->cache_size += sizeof(*pe) + sizeof(pe->name); + GF_ATOMIC_ADD(conf->current_cache_size, (sizeof(*pe) + sizeof(pe->name))); + + nlc_ctx->refd_inodes += 1; + inode_ctx_get2(entry_ino, this, &nlc_ctx_int, NULL); + if (nlc_ctx_int == 0) + GF_ATOMIC_ADD(conf->refd_inodes, 1); + + ret = 0; +out: + if (ret) + GF_FREE(pe); + + return; +} + +static void +__nlc_add_ne(xlator_t *this, nlc_ctx_t *nlc_ctx, const char *name) +{ + nlc_ne_t *ne = NULL; + int ret = -1; + nlc_conf_t *conf = NULL; + + conf = this->private; + + /* TODO: search ne before adding to get rid of duplicate entries + found = __nlc_search (entries, name, _gf_false); + can use bit vector to have faster search than sequential search */ + + ne = GF_CALLOC(sizeof(*ne), 1, gf_nlc_mt_nlc_ne_t); + if (!ne) + goto out; + + ne->name = gf_strdup(name); + if (!ne->name) + goto out; + + list_add(&ne->list, &nlc_ctx->ne); + + nlc_ctx->cache_size += sizeof(*ne) + sizeof(ne->name); + GF_ATOMIC_ADD(conf->current_cache_size, (sizeof(*ne) + sizeof(ne->name))); + ret = 0; +out: + if (ret) + GF_FREE(ne); + + return; +} + +void +nlc_dir_add_ne(xlator_t *this, inode_t *inode, const char *name) +{ + nlc_ctx_t *nlc_ctx = NULL; + + if (inode->ia_type != IA_IFDIR) { + gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL, + "inode is not of type dir"); + goto out; + } + + nlc_inode_ctx_get_set(this, inode, &nlc_ctx); + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + /* There is one possibility where we need to search before + * adding NE: when there are two parallel lookups on a non + * existent file */ + if (!__nlc_search_ne(nlc_ctx, name)) { + __nlc_add_ne(this, nlc_ctx, name); + __nlc_set_dir_state(nlc_ctx, NLC_NE_VALID); + } + } + UNLOCK(&nlc_ctx->lock); +out: + return; +} + +void +nlc_dir_remove_pe(xlator_t *this, inode_t *parent, inode_t *entry_ino, + const char *name, gf_boolean_t multilink) +{ + nlc_ctx_t *nlc_ctx = NULL; + + if (parent->ia_type != IA_IFDIR) { + gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL, + "inode is not of type dir"); + goto out; + } + + nlc_inode_ctx_get(this, parent, &nlc_ctx); + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + if (!__nlc_is_cache_valid(this, nlc_ctx)) + goto unlock; + + __nlc_del_pe(this, nlc_ctx, entry_ino, name, multilink); + __nlc_add_ne(this, nlc_ctx, name); + __nlc_set_dir_state(nlc_ctx, NLC_NE_VALID); + } +unlock: + UNLOCK(&nlc_ctx->lock); +out: + return; +} + +void +nlc_dir_add_pe(xlator_t *this, inode_t *inode, inode_t *entry_ino, + const char *name) +{ + nlc_ctx_t *nlc_ctx = NULL; + + if (inode->ia_type != IA_IFDIR) { + gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL, + "inode is not of type dir"); + goto out; + } + + nlc_inode_ctx_get_set(this, inode, &nlc_ctx); + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + __nlc_del_ne(this, nlc_ctx, name); + __nlc_add_pe(this, nlc_ctx, entry_ino, name); + if (!IS_PE_VALID(nlc_ctx->state)) + __nlc_set_dir_state(nlc_ctx, NLC_PE_PARTIAL); + } + UNLOCK(&nlc_ctx->lock); +out: + return; +} + +gf_boolean_t 
+__nlc_search_ne(nlc_ctx_t *nlc_ctx, const char *name) +{ + gf_boolean_t found = _gf_false; + nlc_ne_t *ne = NULL; + nlc_ne_t *tmp = NULL; + + if (!IS_NE_VALID(nlc_ctx->state)) + goto out; + + list_for_each_entry_safe(ne, tmp, &nlc_ctx->ne, list) + { + if (strcmp(ne->name, name) == 0) { + found = _gf_true; + break; + } + } +out: + return found; +} + +static gf_boolean_t +__nlc_search_pe(nlc_ctx_t *nlc_ctx, const char *name) +{ + gf_boolean_t found = _gf_false; + nlc_pe_t *pe = NULL; + nlc_pe_t *tmp = NULL; + + if (!IS_PE_VALID(nlc_ctx->state)) + goto out; + + list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list) + { + if (pe->name && (strcmp(pe->name, name) == 0)) { + found = _gf_true; + break; + } + } +out: + return found; +} + +static char * +__nlc_get_pe(nlc_ctx_t *nlc_ctx, const char *name, + gf_boolean_t case_insensitive) +{ + char *found = NULL; + nlc_pe_t *pe = NULL; + nlc_pe_t *tmp = NULL; + + if (!IS_PE_VALID(nlc_ctx->state)) + goto out; + + if (case_insensitive) { + list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list) + { + if (pe->name && (strcasecmp(pe->name, name) == 0)) { + found = pe->name; + break; + } + } + } else { + list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list) + { + if (pe->name && (strcmp(pe->name, name) == 0)) { + found = pe->name; + break; + } + } + } +out: + return found; +} + +gf_boolean_t +nlc_is_negative_lookup(xlator_t *this, loc_t *loc) +{ + nlc_ctx_t *nlc_ctx = NULL; + inode_t *inode = NULL; + gf_boolean_t neg_entry = _gf_false; + + inode = loc->parent; + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + if (inode->ia_type != IA_IFDIR) { + gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL, + "inode is not of type dir"); + goto out; + } + + nlc_inode_ctx_get(this, inode, &nlc_ctx); + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + if (!__nlc_is_cache_valid(this, nlc_ctx)) + goto unlock; + + if (__nlc_search_ne(nlc_ctx, loc->name)) { + neg_entry = _gf_true; + goto unlock; + } + if ((nlc_ctx->state & NLC_PE_FULL) && + !__nlc_search_pe(nlc_ctx, loc->name)) { + neg_entry = _gf_true; + goto unlock; + } + } +unlock: + UNLOCK(&nlc_ctx->lock); + +out: + return neg_entry; +} + +gf_boolean_t +nlc_get_real_file_name(xlator_t *this, loc_t *loc, const char *fname, + int32_t *op_ret, int32_t *op_errno, dict_t *dict) +{ + nlc_ctx_t *nlc_ctx = NULL; + inode_t *inode = NULL; + gf_boolean_t hit = _gf_false; + char *found_file = NULL; + int ret = 0; + + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, fname, out); + GF_VALIDATE_OR_GOTO(this->name, op_ret, out); + GF_VALIDATE_OR_GOTO(this->name, op_errno, out); + GF_VALIDATE_OR_GOTO(this->name, dict, out); + + inode = loc->inode; + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + if (inode->ia_type != IA_IFDIR) { + gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL, + "inode is not of type dir"); + goto out; + } + + nlc_inode_ctx_get(this, inode, &nlc_ctx); + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + if (!__nlc_is_cache_valid(this, nlc_ctx)) + goto unlock; + + found_file = __nlc_get_pe(nlc_ctx, fname, _gf_true); + if (found_file) { + ret = dict_set_dynstr(dict, GF_XATTR_GET_REAL_FILENAME_KEY, + gf_strdup(found_file)); + if (ret < 0) + goto unlock; + *op_ret = strlen(found_file) + 1; + hit = _gf_true; + goto unlock; + } + if (!found_file && (nlc_ctx->state & NLC_PE_FULL)) { + *op_ret = -1; + *op_errno = ENOENT; + hit = _gf_true; + goto unlock; + } + } +unlock: + UNLOCK(&nlc_ctx->lock); + +out: + return hit; +} + +void 
+nlc_dump_inodectx(xlator_t *this, inode_t *inode) +{ + int32_t ret = -1; + char *path = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + char uuid_str[64] = { + 0, + }; + nlc_ctx_t *nlc_ctx = NULL; + nlc_pe_t *pe = NULL; + nlc_pe_t *tmp = NULL; + nlc_ne_t *ne = NULL; + nlc_ne_t *tmp1 = NULL; + + nlc_inode_ctx_get(this, inode, &nlc_ctx); + + if (!nlc_ctx) + goto out; + + ret = TRY_LOCK(&nlc_ctx->lock); + if (!ret) { + gf_proc_dump_build_key(key_prefix, "xlator.performance.nl-cache", + "nlc_inode"); + gf_proc_dump_add_section("%s", key_prefix); + + __inode_path(inode, NULL, &path); + if (path != NULL) { + gf_proc_dump_write("path", "%s", path); + GF_FREE(path); + } + + uuid_utoa_r(inode->gfid, uuid_str); + + gf_proc_dump_write("inode", "%p", inode); + gf_proc_dump_write("gfid", "%s", uuid_str); + + gf_proc_dump_write("state", "%" PRIu64, nlc_ctx->state); + gf_proc_dump_write("timer", "%p", nlc_ctx->timer); + gf_proc_dump_write("cache-time", "%ld", nlc_ctx->cache_time); + gf_proc_dump_write("cache-size", "%zu", nlc_ctx->cache_size); + gf_proc_dump_write("refd-inodes", "%" PRIu64, nlc_ctx->refd_inodes); + + if (IS_PE_VALID(nlc_ctx->state)) + list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list) + { + gf_proc_dump_write("pe", "%p, %p, %s", pe, pe->inode, pe->name); + } + + if (IS_NE_VALID(nlc_ctx->state)) + list_for_each_entry_safe(ne, tmp1, &nlc_ctx->ne, list) + { + gf_proc_dump_write("ne", "%s", ne->name); + } + + UNLOCK(&nlc_ctx->lock); + } + + if (ret && nlc_ctx) + gf_proc_dump_write("Unable to dump the inode information", + "(Lock acquisition failed) %p (gfid: %s)", nlc_ctx, + uuid_str); +out: + return; +} diff --git a/xlators/performance/nl-cache/src/nl-cache-mem-types.h b/xlators/performance/nl-cache/src/nl-cache-mem-types.h new file mode 100644 index 00000000000..93a17b3fd5a --- /dev/null +++ b/xlators/performance/nl-cache/src/nl-cache-mem-types.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. + * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#ifndef __NL_CACHE_MEM_TYPES_H__ +#define __NL_CACHE_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> + +enum gf_nlc_mem_types_ { + gf_nlc_mt_nlc_conf_t = gf_common_mt_end + 1, + gf_nlc_mt_nlc_ctx_t, + gf_nlc_mt_nlc_local_t, + gf_nlc_mt_nlc_pe_t, + gf_nlc_mt_nlc_ne_t, + gf_nlc_mt_nlc_timer_data_t, + gf_nlc_mt_nlc_lru_node, + gf_nlc_mt_end +}; + +#endif /* __NL_CACHE_MEM_TYPES_H__ */ diff --git a/xlators/performance/nl-cache/src/nl-cache-messages.h b/xlators/performance/nl-cache/src/nl-cache-messages.h new file mode 100644 index 00000000000..222d709e133 --- /dev/null +++ b/xlators/performance/nl-cache/src/nl-cache-messages.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. + * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#ifndef __NL_CACHE_MESSAGES_H__ +#define __NL_CACHE_MESSAGES_H__ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. 
If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(NLC, NLC_MSG_NO_MEMORY, NLC_MSG_EINVAL, NLC_MSG_NO_TIMER_WHEEL, + NLC_MSG_DICT_FAILURE); + +#endif /* __NL_CACHE_MESSAGES_H__ */ diff --git a/xlators/performance/nl-cache/src/nl-cache.c b/xlators/performance/nl-cache/src/nl-cache.c new file mode 100644 index 00000000000..33a7c471663 --- /dev/null +++ b/xlators/performance/nl-cache/src/nl-cache.c @@ -0,0 +1,840 @@ +/* + * Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. + * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#include "nl-cache.h" +#include <glusterfs/statedump.h> +#include <glusterfs/upcall-utils.h> + +static void +nlc_dentry_op(call_frame_t *frame, xlator_t *this, gf_boolean_t multilink) +{ + nlc_local_t *local = frame->local; + + GF_VALIDATE_OR_GOTO(this->name, local, out); + + switch (local->fop) { + case GF_FOP_MKDIR: + nlc_set_dir_state(this, local->loc.inode, NLC_PE_FULL); + /*fall-through*/ + case GF_FOP_MKNOD: + case GF_FOP_CREATE: + case GF_FOP_SYMLINK: + nlc_dir_add_pe(this, local->loc.parent, local->loc.inode, + local->loc.name); + break; + case GF_FOP_LINK: + nlc_dir_add_pe(this, local->loc2.parent, NULL, local->loc2.name); + break; + case GF_FOP_RMDIR: + nlc_inode_clear_cache(this, local->loc.inode, _gf_false); + /*fall-through*/ + case GF_FOP_UNLINK: + nlc_dir_remove_pe(this, local->loc.parent, local->loc.inode, + local->loc.name, multilink); + break; + case GF_FOP_RENAME: + /* TBD: Should these be atomic ? In case of rename, the + * newloc->inode can be NULL, and hence use oldloc->inode */ + nlc_dir_remove_pe(this, local->loc2.parent, local->loc2.inode, + local->loc2.name, _gf_false); + + /*TODO: Remove old dentry from destination before adding this pe*/ + nlc_dir_add_pe(this, local->loc.parent, local->loc2.inode, + local->loc.name); + + default: + return; + } + + nlc_lru_prune(this, NULL); +out: + return; +} + +#define NLC_FOP(_name, _op, loc1, loc2, frame, this, args...) \ + do { \ + nlc_local_t *__local = NULL; \ + nlc_conf_t *conf = NULL; \ + \ + conf = this->private; \ + \ + if (!IS_PEC_ENABLED(conf)) \ + goto disabled; \ + \ + __local = nlc_local_init(frame, this, _op, loc1, loc2); \ + GF_VALIDATE_OR_GOTO(this->name, __local, err); \ + \ + STACK_WIND(frame, nlc_##_name##_cbk, FIRST_CHILD(this), \ + FIRST_CHILD(this)->fops->_name, args); \ + break; \ + disabled: \ + default_##_name##_resume(frame, this, args); \ + break; \ + err: \ + default_##_name##_failure_cbk(frame, ENOMEM); \ + break; \ + } while (0) + +#define NLC_FOP_CBK(_name, multilink, frame, cookie, this, op_ret, op_errno, \ + args...) 
\ + do { \ + nlc_conf_t *conf = NULL; \ + \ + if (op_ret != 0) \ + goto out; \ + \ + conf = this->private; \ + \ + if (op_ret < 0 || !IS_PEC_ENABLED(conf)) \ + goto out; \ + nlc_dentry_op(frame, this, multilink); \ + out: \ + NLC_STACK_UNWIND(_name, frame, op_ret, op_errno, args); \ + } while (0) + +static int32_t +nlc_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + NLC_FOP_CBK(rename, _gf_false, frame, cookie, this, op_ret, op_errno, buf, + preoldparent, postoldparent, prenewparent, postnewparent, + xdata); + return 0; +} + +static int32_t +nlc_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + NLC_FOP(rename, GF_FOP_RENAME, newloc, oldloc, frame, this, oldloc, newloc, + xdata); + return 0; +} + +static int32_t +nlc_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + NLC_FOP_CBK(mknod, _gf_false, frame, cookie, this, op_ret, op_errno, inode, + buf, preparent, postparent, xdata); + return 0; +} + +static int32_t +nlc_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + NLC_FOP(mknod, GF_FOP_MKNOD, loc, NULL, frame, this, loc, mode, rdev, umask, + xdata); + return 0; +} + +static int32_t +nlc_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + NLC_FOP_CBK(create, _gf_false, frame, cookie, this, op_ret, op_errno, fd, + inode, buf, preparent, postparent, xdata); + return 0; +} + +static int32_t +nlc_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + NLC_FOP(create, GF_FOP_CREATE, loc, NULL, frame, this, loc, flags, mode, + umask, fd, xdata); + return 0; +} + +static int32_t +nlc_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + NLC_FOP_CBK(mkdir, _gf_false, frame, cookie, this, op_ret, op_errno, inode, + buf, preparent, postparent, xdata); + return 0; +} + +static int32_t +nlc_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + NLC_FOP(mkdir, GF_FOP_MKDIR, loc, NULL, frame, this, loc, mode, umask, + xdata); + return 0; +} + +static int32_t +nlc_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + nlc_local_t *local = NULL; + nlc_conf_t *conf = NULL; + + local = frame->local; + conf = this->private; + + if (!local) + goto out; + + /* Donot add to pe, this may lead to duplicate entry and + * requires search before adding if list of strings */ + if (op_ret < 0 && op_errno == ENOENT) { + nlc_dir_add_ne(this, local->loc.parent, local->loc.name); + GF_ATOMIC_INC(conf->nlc_counter.nlc_miss); + } + +out: + NLC_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + return 0; +} + +static int32_t +nlc_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, 
dict_t *xdata) +{ + nlc_local_t *local = NULL; + nlc_conf_t *conf = NULL; + inode_t *inode = NULL; + + if (loc_is_nameless(loc)) + goto wind; + + local = nlc_local_init(frame, this, GF_FOP_LOOKUP, loc, NULL); + if (!local) + goto err; + + conf = this->private; + + inode = inode_grep(loc->inode->table, loc->parent, loc->name); + if (inode) { + inode_unref(inode); + goto wind; + } + + if (nlc_is_negative_lookup(this, loc)) { + GF_ATOMIC_INC(conf->nlc_counter.nlc_hit); + gf_msg_trace(this->name, 0, + "Serving negative lookup from " + "cache:%s", + loc->name); + goto unwind; + } + +wind: + STACK_WIND(frame, nlc_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + return 0; +unwind: + NLC_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL); + return 0; +err: + NLC_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL); + return 0; +} + +static int32_t +nlc_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + NLC_FOP_CBK(rmdir, _gf_false, frame, cookie, this, op_ret, op_errno, + preparent, postparent, xdata); + return 0; +} + +static int32_t +nlc_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + dict_t *xdata) +{ + NLC_FOP(rmdir, GF_FOP_RMDIR, loc, NULL, frame, this, loc, flags, xdata); + return 0; +} + +static int32_t +nlc_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, out); + + if (!IS_PEC_ENABLED(conf)) + goto out; + + if (op_ret < 0 && op_errno == ENOENT) { + GF_ATOMIC_INC(conf->nlc_counter.getrealfilename_miss); + } + +out: + NLC_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +static int32_t +nlc_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, + dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + dict_t *dict = NULL; + nlc_local_t *local = NULL; + gf_boolean_t hit = _gf_false; + const char *fname = NULL; + nlc_conf_t *conf = NULL; + + conf = this->private; + + if (!IS_PEC_ENABLED(conf)) + goto wind; + + if (!key || (strncmp(key, GF_XATTR_GET_REAL_FILENAME_KEY, + SLEN(GF_XATTR_GET_REAL_FILENAME_KEY)) != 0)) + goto wind; + + local = nlc_local_init(frame, this, GF_FOP_GETXATTR, loc, NULL); + if (!local) + goto err; + + if (loc->inode && key) { + dict = dict_new(); + if (!dict) + goto err; + + fname = key + SLEN(GF_XATTR_GET_REAL_FILENAME_KEY); + hit = nlc_get_real_file_name(this, loc, fname, &op_ret, &op_errno, + dict); + if (hit) + goto unwind; + else + dict_unref(dict); + } + + STACK_WIND(frame, nlc_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, key, xdata); + return 0; +wind: + STACK_WIND(frame, default_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, key, xdata); + return 0; +unwind: + GF_ATOMIC_INC(conf->nlc_counter.getrealfilename_hit); + NLC_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, NULL); + dict_unref(dict); + return 0; +err: + NLC_STACK_UNWIND(getxattr, frame, -1, ENOMEM, NULL, NULL); + return 0; +} + +static int32_t +nlc_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + NLC_FOP_CBK(symlink, _gf_false, frame, cookie, this, op_ret, 
op_errno, + inode, buf, preparent, postparent, xdata); + return 0; +} + +static int32_t +nlc_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + NLC_FOP(symlink, GF_FOP_SYMLINK, loc, NULL, frame, this, linkpath, loc, + umask, xdata); + return 0; +} + +static int32_t +nlc_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + NLC_FOP_CBK(link, _gf_false, frame, cookie, this, op_ret, op_errno, inode, + buf, preparent, postparent, xdata); + return 0; +} + +static int32_t +nlc_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + NLC_FOP(link, GF_FOP_LINK, oldloc, newloc, frame, this, oldloc, newloc, + xdata); + return 0; +} + +static int32_t +nlc_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + uint32_t link_count = 0; + gf_boolean_t multilink = _gf_false; + + if (xdata && !dict_get_uint32(xdata, GET_LINK_COUNT, &link_count)) { + if (link_count > 1) + multilink = _gf_true; + } else { + /* Don't touch cache if we don't know enough */ + gf_msg(this->name, GF_LOG_WARNING, 0, NLC_MSG_DICT_FAILURE, + "Failed to get GET_LINK_COUNT from dict"); + NLC_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, postparent, + xdata); + return 0; + } + + NLC_FOP_CBK(unlink, multilink, frame, cookie, this, op_ret, op_errno, + preparent, postparent, xdata); + return 0; +} + +static int32_t +nlc_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + dict_t *xdata) +{ + nlc_conf_t *conf = NULL; + gf_boolean_t new_dict = _gf_false; + + conf = this->private; + + if (!IS_PEC_ENABLED(conf)) + goto do_fop; + + if (!xdata) { + xdata = dict_new(); + if (xdata) + new_dict = _gf_true; + } + + if (xdata && dict_set_uint32(xdata, GET_LINK_COUNT, 0)) { + gf_msg(this->name, GF_LOG_WARNING, 0, NLC_MSG_DICT_FAILURE, + "Failed to set GET_LINK_COUNT in dict"); + goto err; + } + +do_fop: + NLC_FOP(unlink, GF_FOP_UNLINK, loc, NULL, frame, this, loc, flags, xdata); + + if (new_dict) + dict_unref(xdata); + return 0; +} + +static int32_t +nlc_invalidate(xlator_t *this, void *data) +{ + struct gf_upcall *up_data = NULL; + struct gf_upcall_cache_invalidation *up_ci = NULL; + inode_t *inode = NULL; + inode_t *parent1 = NULL; + inode_t *parent2 = NULL; + int ret = 0; + inode_table_t *itable = NULL; + nlc_conf_t *conf = NULL; + + up_data = (struct gf_upcall *)data; + + if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION) + goto out; + + conf = this->private; + if (!conf) + goto out; + + up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; + + /*TODO: Add he inodes found as a member in gf_upcall_cache_invalidation + * so that it prevents subsequent xlators from doing inode_find again + */ + itable = ((xlator_t *)this->graph->top)->itable; + inode = inode_find(itable, up_data->gfid); + if (!inode) { + ret = -1; + goto out; + } + + if ((!((up_ci->flags & UP_TIMES) && inode->ia_type == IA_IFDIR)) && + (!(up_ci->flags & UP_PARENT_DENTRY_FLAGS))) { + goto out; + } + + if (!gf_uuid_is_null(up_ci->p_stat.ia_gfid)) { + parent1 = inode_find(itable, up_ci->p_stat.ia_gfid); + if (!parent1) { + ret = -1; + goto out; + } + } + + if (!gf_uuid_is_null(up_ci->oldp_stat.ia_gfid)) { + parent2 = inode_find(itable, up_ci->oldp_stat.ia_gfid); + if (!parent2) { + ret 
= -1; + goto out; + } + } + + /* TODO: get enough data in upcall so that we do not invalidate but + * update */ + if (inode && inode->ia_type == IA_IFDIR) + nlc_inode_clear_cache(this, inode, NLC_NONE); + if (parent1) + nlc_inode_clear_cache(this, parent1, NLC_NONE); + if (parent2) + nlc_inode_clear_cache(this, parent2, NLC_NONE); + + GF_ATOMIC_INC(conf->nlc_counter.nlc_invals); + +out: + if (inode) + inode_unref(inode); + if (parent1) + inode_unref(parent1); + if (parent2) + inode_unref(parent2); + + return ret; +} + +int +nlc_notify(xlator_t *this, int event, void *data, ...) +{ + int ret = 0; + + switch (event) { + case GF_EVENT_CHILD_DOWN: + case GF_EVENT_SOME_DESCENDENT_DOWN: + case GF_EVENT_CHILD_UP: + case GF_EVENT_SOME_DESCENDENT_UP: + nlc_update_child_down_time(this, gf_time()); + /* TODO: nlc_clear_all_cache (this); else + lru prune will lazily clear it*/ + break; + case GF_EVENT_UPCALL: + ret = nlc_invalidate(this, data); + break; + case GF_EVENT_PARENT_DOWN: + nlc_disable_cache(this); + nlc_clear_all_cache(this); + default: + break; + } + + if (default_notify(this, event, data) != 0) + ret = -1; + + return ret; +} + +static int32_t +nlc_forget(xlator_t *this, inode_t *inode) +{ + uint64_t pe_int = 0; + uint64_t nlc_ctx_int = 0; + nlc_ctx_t *nlc_ctx = NULL; + nlc_conf_t *conf = NULL; + + conf = this->private; + + inode_ctx_reset1(inode, this, &pe_int); + GF_ASSERT(pe_int == 0); + + nlc_inode_clear_cache(this, inode, NLC_NONE); + inode_ctx_reset0(inode, this, &nlc_ctx_int); + nlc_ctx = (void *)(long)nlc_ctx_int; + if (nlc_ctx) { + GF_FREE(nlc_ctx); + GF_ATOMIC_SUB(conf->current_cache_size, sizeof(*nlc_ctx)); + } + + return 0; +} + +static int32_t +nlc_inodectx(xlator_t *this, inode_t *inode) +{ + nlc_dump_inodectx(this, inode); + return 0; +} + +static int32_t +nlc_priv_dump(xlator_t *this) +{ + nlc_conf_t *conf = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + + conf = this->private; + + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); + gf_proc_dump_add_section("%s", key_prefix); + + gf_proc_dump_write("negative_lookup_hit_count", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.nlc_hit)); + gf_proc_dump_write("negative_lookup_miss_count", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.nlc_miss)); + gf_proc_dump_write("get_real_filename_hit_count", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.getrealfilename_hit)); + gf_proc_dump_write("get_real_filename_miss_count", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.getrealfilename_miss)); + gf_proc_dump_write("nameless_lookup_count", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.nameless_lookup)); + gf_proc_dump_write("inodes_with_positive_dentry_cache", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.pe_inode_cnt)); + gf_proc_dump_write("inodes_with_negative_dentry_cache", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.ne_inode_cnt)); + gf_proc_dump_write("dentry_invalidations_received", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.nlc_invals)); + gf_proc_dump_write("cache_limit", "%" PRIu64, conf->cache_size); + gf_proc_dump_write("consumed_cache_size", "%" PRId64, + GF_ATOMIC_GET(conf->current_cache_size)); + gf_proc_dump_write("inode_limit", "%" PRIu64, conf->inode_limit); + gf_proc_dump_write("consumed_inodes", "%" PRId64, + GF_ATOMIC_GET(conf->refd_inodes)); + + return 0; +} + +static int32_t +nlc_dump_metrics(xlator_t *this, int fd) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + + dprintf(fd, "%s.negative_lookup_hit_count %" PRId64 "\n", this->name, + 
GF_ATOMIC_GET(conf->nlc_counter.nlc_hit)); + dprintf(fd, "%s.negative_lookup_miss_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->nlc_counter.nlc_miss)); + dprintf(fd, "%s.get_real_filename_hit_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->nlc_counter.getrealfilename_hit)); + dprintf(fd, "%s.get_real_filename_miss_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->nlc_counter.getrealfilename_miss)); + dprintf(fd, "%s.nameless_lookup_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->nlc_counter.nameless_lookup)); + dprintf(fd, "%s.inodes_with_positive_dentry_cache %" PRId64 "\n", + this->name, GF_ATOMIC_GET(conf->nlc_counter.pe_inode_cnt)); + dprintf(fd, "%s.inodes_with_negative_dentry_cache %" PRId64 "\n", + this->name, GF_ATOMIC_GET(conf->nlc_counter.ne_inode_cnt)); + dprintf(fd, "%s.dentry_invalidations_received %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->nlc_counter.nlc_invals)); + dprintf(fd, "%s.cache_limit %" PRIu64 "\n", this->name, conf->cache_size); + dprintf(fd, "%s.consumed_cache_size %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->current_cache_size)); + dprintf(fd, "%s.inode_limit %" PRIu64 "\n", this->name, conf->inode_limit); + dprintf(fd, "%s.consumed_inodes %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->refd_inodes)); + + return 0; +} + +void +nlc_fini(xlator_t *this) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + GF_FREE(conf); + + glusterfs_ctx_tw_put(this->ctx); + + return; +} + +int32_t +nlc_mem_acct_init(xlator_t *this) +{ + int ret = -1; + + ret = xlator_mem_acct_init(this, gf_nlc_mt_end + 1); + return ret; +} + +int32_t +nlc_reconfigure(xlator_t *this, dict_t *options) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + + GF_OPTION_RECONF("nl-cache-timeout", conf->cache_timeout, options, int32, + out); + GF_OPTION_RECONF("nl-cache-positive-entry", conf->positive_entry_cache, + options, bool, out); + GF_OPTION_RECONF("nl-cache-limit", conf->cache_size, options, size_uint64, + out); + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out); + +out: + return 0; +} + +int32_t +nlc_init(xlator_t *this) +{ + nlc_conf_t *conf = NULL; + int ret = -1; + inode_table_t *itable = NULL; + + conf = GF_CALLOC(sizeof(*conf), 1, gf_nlc_mt_nlc_conf_t); + if (!conf) + goto out; + + GF_OPTION_INIT("nl-cache-timeout", conf->cache_timeout, int32, out); + GF_OPTION_INIT("nl-cache-positive-entry", conf->positive_entry_cache, bool, + out); + GF_OPTION_INIT("nl-cache-limit", conf->cache_size, size_uint64, out); + GF_OPTION_INIT("pass-through", this->pass_through, bool, out); + + /* Since the positive entries are stored as list of refs on + * existing inodes, we should not overflow the inode lru_limit. + * Hence keep the limit of inodes that are refed by this xlator, + * to 80% of inode_table->lru_limit. In fuse where the limit is + * infinite, take 131072 as lru limit (as in gfapi). 
*/ + itable = ((xlator_t *)this->graph->top)->itable; + if (itable && itable->lru_limit) + conf->inode_limit = itable->lru_limit * 80 / 100; + else + conf->inode_limit = 131072 * 80 / 100; + + LOCK_INIT(&conf->lock); + GF_ATOMIC_INIT(conf->current_cache_size, 0); + GF_ATOMIC_INIT(conf->refd_inodes, 0); + GF_ATOMIC_INIT(conf->nlc_counter.nlc_hit, 0); + GF_ATOMIC_INIT(conf->nlc_counter.nlc_miss, 0); + GF_ATOMIC_INIT(conf->nlc_counter.nameless_lookup, 0); + GF_ATOMIC_INIT(conf->nlc_counter.getrealfilename_hit, 0); + GF_ATOMIC_INIT(conf->nlc_counter.getrealfilename_miss, 0); + GF_ATOMIC_INIT(conf->nlc_counter.pe_inode_cnt, 0); + GF_ATOMIC_INIT(conf->nlc_counter.ne_inode_cnt, 0); + GF_ATOMIC_INIT(conf->nlc_counter.nlc_invals, 0); + + INIT_LIST_HEAD(&conf->lru); + conf->last_child_down = gf_time(); + + conf->timer_wheel = glusterfs_ctx_tw_get(this->ctx); + if (!conf->timer_wheel) { + gf_msg(this->name, GF_LOG_ERROR, 0, NLC_MSG_NO_TIMER_WHEEL, + "Initing the global timer wheel failed"); + goto out; + } + + this->private = conf; + + ret = 0; +out: + if (ret < 0) + GF_FREE(conf); + + return ret; +} + +struct xlator_fops nlc_fops = { + .rename = nlc_rename, + .mknod = nlc_mknod, + .create = nlc_create, + .mkdir = nlc_mkdir, + .lookup = nlc_lookup, + .rmdir = nlc_rmdir, + .getxattr = nlc_getxattr, + .symlink = nlc_symlink, + .link = nlc_link, + .unlink = nlc_unlink, + /* TODO: + .readdir = nlc_readdir, + .readdirp = nlc_readdirp, + .seek = nlc_seek, + .opendir = nlc_opendir, */ +}; + +struct xlator_cbks nlc_cbks = { + .forget = nlc_forget, +}; + +struct xlator_dumpops nlc_dumpops = { + .inodectx = nlc_inodectx, + .priv = nlc_priv_dump, +}; + +struct volume_options nlc_options[] = { + { + .key = {"nl-cache"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable nl-cache", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + { + .key = {"nl-cache-positive-entry"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache the name of the files/directories that was" + " looked up and are present in a directory", + }, + { + .key = {"nl-cache-limit"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .default_value = "131072", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "the value over which caching will be disabled for" + "a while and the cache is cleared based on LRU", + }, + { + .key = {"nl-cache-timeout"}, + .type = GF_OPTION_TYPE_TIME, + .min = 0, + .default_value = "60", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Time period after which cache has to be refreshed", + }, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"nl-cache"}, + .description = "Enable/Disable nl cache translator"}, + + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = nlc_init, + .fini = nlc_fini, + .notify = nlc_notify, + .reconfigure = nlc_reconfigure, + .mem_acct_init = nlc_mem_acct_init, + .dump_metrics = nlc_dump_metrics, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &nlc_dumpops, + .fops = &nlc_fops, + .cbks = &nlc_cbks, + .options = nlc_options, + .identifier = "nl-cache", + .category = 
GF_TECH_PREVIEW, +}; diff --git a/xlators/performance/nl-cache/src/nl-cache.h b/xlators/performance/nl-cache/src/nl-cache.h new file mode 100644 index 00000000000..85fcc176342 --- /dev/null +++ b/xlators/performance/nl-cache/src/nl-cache.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. + * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#ifndef __NL_CACHE_H__ +#define __NL_CACHE_H__ + +#include "nl-cache-mem-types.h" +#include "nl-cache-messages.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include <glusterfs/atomic.h> + +#define NLC_INVALID 0x0000 +#define NLC_PE_FULL 0x0001 +#define NLC_PE_PARTIAL 0x0002 +#define NLC_NE_VALID 0x0004 + +#define IS_PE_VALID(state) \ + ((state != NLC_INVALID) && (state & (NLC_PE_FULL | NLC_PE_PARTIAL))) +#define IS_NE_VALID(state) ((state != NLC_INVALID) && (state & NLC_NE_VALID)) + +#define IS_PEC_ENABLED(conf) (conf->positive_entry_cache) +#define IS_CACHE_ENABLED(conf) ((!conf->cache_disabled)) + +#define NLC_STACK_UNWIND(fop, frame, params...) \ + do { \ + nlc_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + if (frame) { \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + nlc_local_wipe(__xl, __local); \ + } while (0) + +enum nlc_cache_clear_reason { + NLC_NONE = 0, + NLC_LRU_PRUNE, +}; + +struct nlc_ne { + struct list_head list; + char *name; +}; +typedef struct nlc_ne nlc_ne_t; + +struct nlc_pe { + struct list_head list; + inode_t *inode; + char *name; +}; +typedef struct nlc_pe nlc_pe_t; + +struct nlc_timer_data { + inode_t *inode; + xlator_t *this; +}; +typedef struct nlc_timer_data nlc_timer_data_t; + +struct nlc_lru_node { + inode_t *inode; + struct list_head list; +}; +typedef struct nlc_lru_node nlc_lru_node_t; + +struct nlc_ctx { + struct list_head pe; /* list of positive entries */ + struct list_head ne; /* list of negative entries */ + uint64_t state; + time_t cache_time; + struct gf_tw_timer_list *timer; + nlc_timer_data_t *timer_data; + size_t cache_size; + uint64_t refd_inodes; + gf_lock_t lock; +}; +typedef struct nlc_ctx nlc_ctx_t; + +struct nlc_local { + loc_t loc; + loc_t loc2; + inode_t *inode; + inode_t *parent; + fd_t *fd; + char *linkname; + glusterfs_fop_t fop; +}; +typedef struct nlc_local nlc_local_t; + +struct nlc_statistics { + gf_atomic_t nlc_hit; /* No. of times lookup/stat was served from this xl */ + gf_atomic_t nlc_miss; /* No. of times negative lookups were sent to disk */ + /* More granular counters */ + gf_atomic_t nameless_lookup; + gf_atomic_t getrealfilename_hit; + gf_atomic_t getrealfilename_miss; + gf_atomic_t pe_inode_cnt; + gf_atomic_t ne_inode_cnt; + gf_atomic_t nlc_invals; /* No. 
of invalidates received from upcall*/ +}; + +struct nlc_conf { + int32_t cache_timeout; + gf_boolean_t positive_entry_cache; + gf_boolean_t negative_entry_cache; + gf_boolean_t disable_cache; + uint64_t cache_size; + gf_atomic_t current_cache_size; + uint64_t inode_limit; + gf_atomic_t refd_inodes; + struct tvec_base *timer_wheel; + time_t last_child_down; + struct list_head lru; + gf_lock_t lock; + struct nlc_statistics nlc_counter; +}; +typedef struct nlc_conf nlc_conf_t; + +gf_boolean_t +nlc_get_real_file_name(xlator_t *this, loc_t *loc, const char *fname, + int32_t *op_ret, int32_t *op_errno, dict_t *dict); + +gf_boolean_t +nlc_is_negative_lookup(xlator_t *this, loc_t *loc); + +void +nlc_set_dir_state(xlator_t *this, inode_t *inode, uint64_t state); + +void +nlc_dir_add_pe(xlator_t *this, inode_t *inode, inode_t *entry_ino, + const char *name); + +void +nlc_dir_remove_pe(xlator_t *this, inode_t *inode, inode_t *entry_ino, + const char *name, gf_boolean_t multilink); + +void +nlc_dir_add_ne(xlator_t *this, inode_t *inode, const char *name); + +void +nlc_local_wipe(xlator_t *this, nlc_local_t *local); + +nlc_local_t * +nlc_local_init(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + loc_t *loc, loc_t *loc2); + +void +nlc_update_child_down_time(xlator_t *this, time_t now); + +void +nlc_inode_clear_cache(xlator_t *this, inode_t *inode, int reason); + +void +nlc_dump_inodectx(xlator_t *this, inode_t *inode); + +void +nlc_clear_all_cache(xlator_t *this); + +void +nlc_disable_cache(xlator_t *this); + +void +nlc_lru_prune(xlator_t *this, inode_t *inode); + +#endif /* __NL_CACHE_H__ */ diff --git a/xlators/performance/open-behind/src/Makefile.am b/xlators/performance/open-behind/src/Makefile.am index 12528570783..41930dcd67d 100644 --- a/xlators/performance/open-behind/src/Makefile.am +++ b/xlators/performance/open-behind/src/Makefile.am @@ -1,14 +1,15 @@ xlator_LTLIBRARIES = open-behind.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -open_behind_la_LDFLAGS = -module -avoid-version +open_behind_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) open_behind_la_SOURCES = open-behind.c open_behind_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = open-behind-mem-types.h +noinst_HEADERS = open-behind-mem-types.h open-behind-messages.h -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/performance/open-behind/src/open-behind-mem-types.h b/xlators/performance/open-behind/src/open-behind-mem-types.h index 1e94296f424..6c1ab2e19d2 100644 --- a/xlators/performance/open-behind/src/open-behind-mem-types.h +++ b/xlators/performance/open-behind/src/open-behind-mem-types.h @@ -11,11 +11,12 @@ #ifndef __OB_MEM_TYPES_H__ #define __OB_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_ob_mem_types_ { - gf_ob_mt_fd_t = gf_common_mt_end + 1, - gf_ob_mt_conf_t, - gf_ob_mt_end + gf_ob_mt_fd_t = gf_common_mt_end + 1, + gf_ob_mt_conf_t, + gf_ob_mt_inode_t, + gf_ob_mt_end }; #endif diff --git a/xlators/performance/open-behind/src/open-behind-messages.h b/xlators/performance/open-behind/src/open-behind-messages.h new file mode 100644 index 00000000000..0e789177684 --- /dev/null +++ b/xlators/performance/open-behind/src/open-behind-messages.h @@ -0,0 +1,32 @@ +/*Copyright (c) 2015 Red Hat, Inc. 
<http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _OPEN_BEHIND_MESSAGES_H_ +#define _OPEN_BEHIND_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(OPEN_BEHIND, OPEN_BEHIND_MSG_XLATOR_CHILD_MISCONFIGURED, + OPEN_BEHIND_MSG_VOL_MISCONFIGURED, OPEN_BEHIND_MSG_NO_MEMORY, + OPEN_BEHIND_MSG_FAILED, OPEN_BEHIND_MSG_BAD_STATE); + +#define OPEN_BEHIND_MSG_FAILED_STR "Failed to submit fop" +#define OPEN_BEHIND_MSG_BAD_STATE_STR "Unexpected state" + +#endif /* _OPEN_BEHIND_MESSAGES_H_ */ diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c index 26b684e888c..600c3b62ffe 100644 --- a/xlators/performance/open-behind/src/open-behind.c +++ b/xlators/performance/open-behind/src/open-behind.c @@ -9,968 +9,1093 @@ */ #include "open-behind-mem-types.h" -#include "xlator.h" -#include "statedump.h" -#include "call-stub.h" -#include "defaults.h" +#include <glusterfs/xlator.h> +#include <glusterfs/statedump.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/defaults.h> +#include "open-behind-messages.h" +#include <glusterfs/glusterfs-acl.h> + +/* Note: The initial design of open-behind was made to cover the simple case + * of open, read, close for small files. This pattern combined with + * quick-read can do the whole operation without a single request to the + * bricks (except the initial lookup). + * + * The way to do this has been improved, but the logic remains the same. + * Basically, this means that any operation sent to the fd or the inode + * that it's not a read, causes the open request to be sent to the + * bricks, and all future operations will be executed synchronously, + * including opens (it's reset once all fd's are closed). + */ typedef struct ob_conf { - gf_boolean_t use_anonymous_fd; /* use anonymous FDs wherever safe - e.g - fstat() readv() - - whereas for fops like writev(), lk(), - the fd is important for side effects - like mandatory locks - */ - gf_boolean_t lazy_open; /* delay backend open as much as possible */ + gf_boolean_t use_anonymous_fd; /* use anonymous FDs wherever safe + e.g - fstat() readv() + + whereas for fops like writev(), lk(), + the fd is important for side effects + like mandatory locks + */ + gf_boolean_t lazy_open; /* delay backend open as much as possible */ + gf_boolean_t read_after_open; /* instead of sending readvs on + anonymous fds, open the file + first and then send readv i.e + similar to what writev does + */ } ob_conf_t; - -typedef struct ob_fd { - call_frame_t *open_frame; - loc_t loc; - dict_t *xdata; - int flags; - int op_errno; - struct list_head list; -} ob_fd_t; - - -ob_fd_t * -__ob_fd_ctx_get (xlator_t *this, fd_t *fd) +/* A negative state represents an errno value negated. In this case the + * current operation cannot be processed. 
*/ +typedef enum _ob_state { + /* There are no opens on the inode or the first open is already + * completed. The current operation can be sent directly. */ + OB_STATE_READY = 0, + + /* There's an open pending and it has been triggered. The current + * operation should be "stubbified" and processed with + * ob_stub_dispatch(). */ + OB_STATE_OPEN_TRIGGERED, + + /* There's an open pending but it has not been triggered. The current + * operation can be processed directly but using an anonymous fd. */ + OB_STATE_OPEN_PENDING, + + /* The current operation is the first open on the inode. */ + OB_STATE_FIRST_OPEN +} ob_state_t; + +typedef struct ob_inode { + /* List of stubs pending on the first open. Once the first open is + * complete, all these stubs will be resubmitted, and dependencies + * will be checked again. */ + struct list_head resume_fops; + + /* The inode this object references. */ + inode_t *inode; + + /* The fd from the first open sent to this inode. It will be set + * from the moment the open is processed until the open if fully + * executed or closed before actually opened. It's NULL in all + * other cases. */ + fd_t *first_fd; + + /* The stub from the first open operation. When open fop starts + * being processed, it's assigned the OB_OPEN_PREPARING value + * until the actual stub is created. This is necessary to avoid + * creating the stub inside a locked region. Once the stub is + * successfully created, it's assigned here. This value is set + * to NULL once the stub is resumed. */ + call_stub_t *first_open; + + /* The total number of currently open fd's on this inode. */ + int32_t open_count; + + /* This flag is set as soon as we know that the open will be + * sent to the bricks, even before the stub is ready. */ + bool triggered; +} ob_inode_t; + +/* Dummy pointer used temporarily while the actual open stub is being created */ +#define OB_OPEN_PREPARING ((call_stub_t *)-1) + +#define OB_POST_COMMON(_fop, _xl, _frame, _fd, _args...) \ + case OB_STATE_FIRST_OPEN: \ + gf_smsg((_xl)->name, GF_LOG_ERROR, EINVAL, OPEN_BEHIND_MSG_BAD_STATE, \ + "fop=%s", #_fop, "state=%d", __ob_state, NULL); \ + default_##_fop##_failure_cbk(_frame, EINVAL); \ + break; \ + case OB_STATE_READY: \ + default_##_fop(_frame, _xl, ##_args); \ + break; \ + case OB_STATE_OPEN_TRIGGERED: { \ + call_stub_t *__ob_stub = fop_##_fop##_stub(_frame, ob_##_fop, \ + ##_args); \ + if (__ob_stub != NULL) { \ + ob_stub_dispatch(_xl, __ob_inode, _fd, __ob_stub); \ + break; \ + } \ + __ob_state = -ENOMEM; \ + } \ + default: \ + gf_smsg((_xl)->name, GF_LOG_ERROR, -__ob_state, \ + OPEN_BEHIND_MSG_FAILED, "fop=%s", #_fop, NULL); \ + default_##_fop##_failure_cbk(_frame, -__ob_state) + +#define OB_POST_FD(_fop, _xl, _frame, _fd, _trigger, _args...) \ + do { \ + ob_inode_t *__ob_inode; \ + fd_t *__first_fd; \ + ob_state_t __ob_state = ob_open_and_resume_fd( \ + _xl, _fd, 0, true, _trigger, &__ob_inode, &__first_fd); \ + switch (__ob_state) { \ + case OB_STATE_OPEN_PENDING: \ + if (!(_trigger)) { \ + fd_t *__ob_fd = fd_anonymous_with_flags((_fd)->inode, \ + (_fd)->flags); \ + if (__ob_fd != NULL) { \ + default_##_fop(_frame, _xl, ##_args); \ + fd_unref(__ob_fd); \ + break; \ + } \ + __ob_state = -ENOMEM; \ + } \ + OB_POST_COMMON(_fop, _xl, _frame, __first_fd, ##_args); \ + } \ + } while (0) + +#define OB_POST_FLUSH(_xl, _frame, _fd, _args...) 
\ + do { \ + ob_inode_t *__ob_inode; \ + fd_t *__first_fd; \ + ob_state_t __ob_state = ob_open_and_resume_fd( \ + _xl, _fd, 0, true, false, &__ob_inode, &__first_fd); \ + switch (__ob_state) { \ + case OB_STATE_OPEN_PENDING: \ + default_flush_cbk(_frame, NULL, _xl, 0, 0, NULL); \ + break; \ + OB_POST_COMMON(flush, _xl, _frame, __first_fd, ##_args); \ + } \ + } while (0) + +#define OB_POST_INODE(_fop, _xl, _frame, _inode, _trigger, _args...) \ + do { \ + ob_inode_t *__ob_inode; \ + fd_t *__first_fd; \ + ob_state_t __ob_state = ob_open_and_resume_inode( \ + _xl, _inode, NULL, 0, true, _trigger, &__ob_inode, &__first_fd); \ + switch (__ob_state) { \ + case OB_STATE_OPEN_PENDING: \ + OB_POST_COMMON(_fop, _xl, _frame, __first_fd, ##_args); \ + } \ + } while (0) + +static ob_inode_t * +ob_inode_get_locked(xlator_t *this, inode_t *inode) { - uint64_t value = 0; - int ret = -1; - ob_fd_t *ob_fd = NULL; - - ret = __fd_ctx_get (fd, this, &value); - if (ret) - return NULL; - - ob_fd = (void *) ((long) value); + ob_inode_t *ob_inode = NULL; + uint64_t value = 0; + + if ((__inode_ctx_get(inode, this, &value) == 0) && (value != 0)) { + return (ob_inode_t *)(uintptr_t)value; + } + + ob_inode = GF_CALLOC(1, sizeof(*ob_inode), gf_ob_mt_inode_t); + if (ob_inode != NULL) { + ob_inode->inode = inode; + INIT_LIST_HEAD(&ob_inode->resume_fops); + + value = (uint64_t)(uintptr_t)ob_inode; + if (__inode_ctx_set(inode, this, &value) < 0) { + GF_FREE(ob_inode); + ob_inode = NULL; + } + } - return ob_fd; + return ob_inode; } - -ob_fd_t * -ob_fd_ctx_get (xlator_t *this, fd_t *fd) +static ob_state_t +ob_open_and_resume_inode(xlator_t *xl, inode_t *inode, fd_t *fd, + int32_t open_count, bool synchronous, bool trigger, + ob_inode_t **pob_inode, fd_t **pfd) { - ob_fd_t *ob_fd = NULL; + ob_conf_t *conf; + ob_inode_t *ob_inode; + call_stub_t *open_stub; - LOCK (&fd->lock); - { - ob_fd = __ob_fd_ctx_get (this, fd); - } - UNLOCK (&fd->lock); + if (inode == NULL) { + return OB_STATE_READY; + } - return ob_fd; -} + conf = xl->private; + *pfd = NULL; -int -__ob_fd_ctx_set (xlator_t *this, fd_t *fd, ob_fd_t *ob_fd) -{ - uint64_t value = 0; - int ret = -1; + LOCK(&inode->lock); + { + ob_inode = ob_inode_get_locked(xl, inode); + if (ob_inode == NULL) { + UNLOCK(&inode->lock); - value = (long) ((void *) ob_fd); + return -ENOMEM; + } + *pob_inode = ob_inode; + + ob_inode->open_count += open_count; + + /* If first_fd is not NULL, it means that there's a previous open not + * yet completed. */ + if (ob_inode->first_fd != NULL) { + *pfd = ob_inode->first_fd; + /* If the current request doesn't trigger the open and it hasn't + * been triggered yet, we can continue without issuing the open + * only if the current request belongs to the same fd as the + * first one. */ + if (!trigger && !ob_inode->triggered && + (ob_inode->first_fd == fd)) { + UNLOCK(&inode->lock); + + return OB_STATE_OPEN_PENDING; + } + + /* We need to issue the open. It could have already been triggered + * before. In this case open_stub will be NULL. Or the initial open + * may not be completely ready yet. In this case open_stub will be + * OB_OPEN_PREPARING. */ + open_stub = ob_inode->first_open; + ob_inode->first_open = NULL; + ob_inode->triggered = true; + + UNLOCK(&inode->lock); + + if ((open_stub != NULL) && (open_stub != OB_OPEN_PREPARING)) { + call_resume(open_stub); + } + + return OB_STATE_OPEN_TRIGGERED; + } - ret = __fd_ctx_set (fd, this, value); + /* There's no pending open. Only opens can be non synchronous, so all + * regular fops will be processed directly. 
For non synchronous opens, + * we'll still process them normally (i.e. synchornous) if there are + * more file descriptors open. */ + if (synchronous || (ob_inode->open_count > open_count)) { + UNLOCK(&inode->lock); - return ret; -} + return OB_STATE_READY; + } + *pfd = fd; -int -ob_fd_ctx_set (xlator_t *this, fd_t *fd, ob_fd_t *ob_fd) -{ - int ret = -1; + /* This is the first open. We keep a reference on the fd and set + * first_open stub to OB_OPEN_PREPARING until the actual stub can + * be assigned (we don't create the stub here to avoid doing memory + * allocations inside the mutex). */ + ob_inode->first_fd = __fd_ref(fd); + ob_inode->first_open = OB_OPEN_PREPARING; - LOCK (&fd->lock); - { - ret = __ob_fd_ctx_set (this, fd, ob_fd); - } - UNLOCK (&fd->lock); + /* If lazy_open is not set, we'll need to immediately send the open, + * so we set triggered right now. */ + ob_inode->triggered = !conf->lazy_open; + } + UNLOCK(&inode->lock); - return ret; + return OB_STATE_FIRST_OPEN; } - -ob_fd_t * -ob_fd_new (void) +static ob_state_t +ob_open_and_resume_fd(xlator_t *xl, fd_t *fd, int32_t open_count, + bool synchronous, bool trigger, ob_inode_t **pob_inode, + fd_t **pfd) { - ob_fd_t *ob_fd = NULL; - - ob_fd = GF_CALLOC (1, sizeof (*ob_fd), gf_ob_mt_fd_t); + uint64_t err; - INIT_LIST_HEAD (&ob_fd->list); + if ((fd_ctx_get(fd, xl, &err) == 0) && (err != 0)) { + return (ob_state_t)-err; + } - return ob_fd; + return ob_open_and_resume_inode(xl, fd->inode, fd, open_count, synchronous, + trigger, pob_inode, pfd); } - -void -ob_fd_free (ob_fd_t *ob_fd) +static ob_state_t +ob_open_behind(xlator_t *xl, fd_t *fd, int32_t flags, ob_inode_t **pob_inode, + fd_t **pfd) { - loc_wipe (&ob_fd->loc); - - if (ob_fd->xdata) - dict_unref (ob_fd->xdata); + bool synchronous; - if (ob_fd->open_frame) - STACK_DESTROY (ob_fd->open_frame->root); + /* TODO: If O_CREAT, O_APPEND, O_WRONLY or O_DIRECT are specified, shouldn't + * we also execute this open synchronously ? */ + synchronous = (flags & O_TRUNC) != 0; - GF_FREE (ob_fd); + return ob_open_and_resume_fd(xl, fd, 1, synchronous, true, pob_inode, pfd); } - -int -ob_wake_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd_ret, dict_t *xdata) +static int32_t +ob_stub_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, + call_stub_t *stub) { - fd_t *fd = NULL; - struct list_head list; - ob_fd_t *ob_fd = NULL; - call_stub_t *stub = NULL, *tmp = NULL; - - fd = frame->local; - frame->local = NULL; - - INIT_LIST_HEAD (&list); - - LOCK (&fd->lock); - { - ob_fd = __ob_fd_ctx_get (this, fd); - - list_splice_init (&ob_fd->list, &list); - - if (op_ret < 0) { - /* mark fd BAD for ever */ - ob_fd->op_errno = op_errno; - } else { - __fd_ctx_del (fd, this, NULL); - ob_fd_free (ob_fd); - } - } - UNLOCK (&fd->lock); - - list_for_each_entry_safe (stub, tmp, &list, list) { - list_del_init (&stub->list); - - if (op_ret < 0) - call_unwind_error (stub, -1, op_errno); - else - call_resume (stub); - } - - fd_unref (fd); + LOCK(&ob_inode->inode->lock); + { + /* We only queue a stub if the open has not been completed or + * cancelled. 
*/ + if (ob_inode->first_fd == fd) { + list_add_tail(&stub->list, &ob_inode->resume_fops); + stub = NULL; + } + } + UNLOCK(&ob_inode->inode->lock); - STACK_DESTROY (frame->root); + if (stub != NULL) { + call_resume(stub); + } - return 0; + return 0; } - -int -ob_fd_wake (xlator_t *this, fd_t *fd) +static void +ob_open_destroy(call_stub_t *stub, fd_t *fd) { - call_frame_t *frame = NULL; - ob_fd_t *ob_fd = NULL; - - LOCK (&fd->lock); - { - ob_fd = __ob_fd_ctx_get (this, fd); - if (!ob_fd) - goto unlock; - - frame = ob_fd->open_frame; - ob_fd->open_frame = NULL; - } -unlock: - UNLOCK (&fd->lock); - - if (frame) { - frame->local = fd_ref (fd); - - STACK_WIND (frame, ob_wake_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->open, - &ob_fd->loc, ob_fd->flags, fd, ob_fd->xdata); - } - - return 0; + stub->frame->local = NULL; + STACK_DESTROY(stub->frame->root); + call_stub_destroy(stub); + fd_unref(fd); } - -int -open_and_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) +static int32_t +ob_open_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, + call_stub_t *stub) { - ob_fd_t *ob_fd = NULL; - int op_errno = 0; - - if (!fd) - goto nofd; - - LOCK (&fd->lock); - { - ob_fd = __ob_fd_ctx_get (this, fd); - if (!ob_fd) - goto unlock; - - if (ob_fd->op_errno) { - op_errno = ob_fd->op_errno; - goto unlock; - } - - list_add_tail (&stub->list, &ob_fd->list); - } -unlock: - UNLOCK (&fd->lock); - -nofd: - if (op_errno) - call_unwind_error (stub, -1, op_errno); - else if (ob_fd) - ob_fd_wake (this, fd); - else - call_resume (stub); + bool closed; + + LOCK(&ob_inode->inode->lock); + { + closed = ob_inode->first_fd != fd; + if (!closed) { + if (ob_inode->triggered) { + ob_inode->first_open = NULL; + } else { + ob_inode->first_open = stub; + stub = NULL; + } + } + } + UNLOCK(&ob_inode->inode->lock); + + if (stub != NULL) { + if (closed) { + ob_open_destroy(stub, fd); + } else { + call_resume(stub); + } + } - return 0; + return 0; } - -int -ob_open_behind (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, - fd_t *fd, dict_t *xdata) +static void +ob_resume_pending(struct list_head *list) { - ob_fd_t *ob_fd = NULL; - int ret = -1; - ob_conf_t *conf = NULL; - - - conf = this->private; - - if (flags & O_TRUNC) { - STACK_WIND (frame, default_open_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->open, - loc, flags, fd, xdata); - return 0; - } - - ob_fd = ob_fd_new (); - if (!ob_fd) - goto enomem; - - ob_fd->open_frame = copy_frame (frame); - if (!ob_fd->open_frame) - goto enomem; - ret = loc_copy (&ob_fd->loc, loc); - if (ret) - goto enomem; - - ob_fd->flags = flags; - if (xdata) - ob_fd->xdata = dict_ref (xdata); + call_stub_t *stub; - ret = ob_fd_ctx_set (this, fd, ob_fd); - if (ret) - goto enomem; + while (!list_empty(list)) { + stub = list_first_entry(list, call_stub_t, list); + list_del_init(&stub->list); - fd_ref (fd); - - STACK_UNWIND_STRICT (open, frame, 0, 0, fd, xdata); - - if (!conf->lazy_open) - ob_fd_wake (this, fd); - - fd_unref (fd); - - return 0; -enomem: - if (ob_fd) { - if (ob_fd->open_frame) - STACK_DESTROY (ob_fd->open_frame->root); - loc_wipe (&ob_fd->loc); - if (ob_fd->xdata) - dict_unref (ob_fd->xdata); - GF_FREE (ob_fd); - } - - return -1; + call_resume(stub); + } } - -int -ob_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, - fd_t *fd, dict_t *xdata) -{ - fd_t *old_fd = NULL; - int ret = -1; - int op_errno = 0; - call_stub_t *stub = NULL; - - old_fd = fd_lookup (fd->inode, 0); - if (old_fd) { - /* open-behind only when this is the first FD */ - stub = 
fop_open_stub (frame, default_open_resume, - loc, flags, fd, xdata); - if (!stub) { - op_errno = ENOMEM; - fd_unref (old_fd); - goto err; - } - - open_and_resume (this, old_fd, stub); - - fd_unref (old_fd); - - return 0; - } - - ret = ob_open_behind (frame, this, loc, flags, fd, xdata); - if (ret) { - op_errno = ENOMEM; - goto err; - } - - return 0; -err: - gf_log (this->name, GF_LOG_ERROR, "%s: %s", loc->path, - strerror (op_errno)); - - STACK_UNWIND_STRICT (open, frame, -1, op_errno, 0, 0); - - return 0; -} - - -fd_t * -ob_get_wind_fd (xlator_t *this, fd_t *fd) +static void +ob_open_completed(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, int32_t op_ret, + int32_t op_errno) { - ob_conf_t *conf = NULL; - ob_fd_t *ob_fd = NULL; - - conf = this->private; - - ob_fd = ob_fd_ctx_get (this, fd); + struct list_head list; + + INIT_LIST_HEAD(&list); + + if (op_ret < 0) { + fd_ctx_set(fd, xl, op_errno <= 0 ? EIO : op_errno); + } + + LOCK(&ob_inode->inode->lock); + { + /* Only update the fields if the file has not been closed before + * getting here. */ + if (ob_inode->first_fd == fd) { + list_splice_init(&ob_inode->resume_fops, &list); + ob_inode->first_fd = NULL; + ob_inode->first_open = NULL; + ob_inode->triggered = false; + } + } + UNLOCK(&ob_inode->inode->lock); - if (ob_fd && conf->use_anonymous_fd) - return fd_anonymous (fd->inode); + ob_resume_pending(&list); - return fd_ref (fd); + fd_unref(fd); } - -int -ob_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) +static int32_t +ob_open_cbk(call_frame_t *frame, void *cookie, xlator_t *xl, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) { - call_stub_t *stub = NULL; - fd_t *wind_fd = NULL; + ob_inode_t *ob_inode; - wind_fd = ob_get_wind_fd (this, fd); + ob_inode = frame->local; + frame->local = NULL; - stub = fop_readv_stub (frame, default_readv_resume, wind_fd, - size, offset, flags, xdata); - fd_unref (wind_fd); + ob_open_completed(xl, ob_inode, cookie, op_ret, op_errno); - if (!stub) - goto err; + STACK_DESTROY(frame->root); - open_and_resume (this, wind_fd, stub); - - return 0; -err: - STACK_UNWIND_STRICT (readv, frame, -1, ENOMEM, 0, 0, 0, 0, 0); - - return 0; + return 0; } - -int -ob_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov, - int count, off_t offset, uint32_t flags, struct iobref *iobref, - dict_t *xdata) +static int32_t +ob_open_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + fd_t *fd, dict_t *xdata) { - call_stub_t *stub = NULL; - - stub = fop_writev_stub (frame, default_writev_resume, fd, iov, count, - offset, flags, iobref, xdata); - if (!stub) - goto err; + STACK_WIND_COOKIE(frame, ob_open_cbk, fd, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); - open_and_resume (this, fd, stub); - - return 0; -err: - STACK_UNWIND_STRICT (writev, frame, -1, ENOMEM, 0, 0, 0); - - return 0; + return 0; } - -int -ob_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +static int32_t +ob_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, + dict_t *xdata) { - call_stub_t *stub = NULL; - fd_t *wind_fd = NULL; + ob_inode_t *ob_inode; + call_frame_t *open_frame; + call_stub_t *stub; + fd_t *first_fd; + ob_state_t state; + + state = ob_open_behind(this, fd, flags, &ob_inode, &first_fd); + if (state == OB_STATE_READY) { + /* There's no pending open, but there are other file descriptors opened + * or the current flags require a synchronous open. 
*/ + return default_open(frame, this, loc, flags, fd, xdata); + } + + if (state == OB_STATE_OPEN_TRIGGERED) { + /* The first open is in progress (either because it was already issued + * or because this request triggered it). We try to create a new stub + * to retry the operation once the initial open completes. */ + stub = fop_open_stub(frame, ob_open, loc, flags, fd, xdata); + if (stub != NULL) { + return ob_stub_dispatch(this, ob_inode, first_fd, stub); + } - wind_fd = ob_get_wind_fd (this, fd); + state = -ENOMEM; + } + + if (state == OB_STATE_FIRST_OPEN) { + /* We try to create a stub for the new open. A new frame needs to be + * used because the current one may be destroyed soon after sending + * the open's reply. */ + open_frame = copy_frame(frame); + if (open_frame != NULL) { + stub = fop_open_stub(open_frame, ob_open_resume, loc, flags, fd, + xdata); + if (stub != NULL) { + open_frame->local = ob_inode; + + /* TODO: Previous version passed xdata back to the caller, but + * probably this doesn't make sense since it won't contain + * any requested data. I think it would be better to pass + * NULL for xdata. */ + default_open_cbk(frame, NULL, this, 0, 0, fd, xdata); + + return ob_open_dispatch(this, ob_inode, first_fd, stub); + } + + STACK_DESTROY(open_frame->root); + } - stub = fop_fstat_stub (frame, default_fstat_resume, wind_fd, xdata); + /* In case of error, simulate a regular completion but with an error + * code. */ + ob_open_completed(this, ob_inode, first_fd, -1, ENOMEM); - fd_unref (wind_fd); + state = -ENOMEM; + } - if (!stub) - goto err; + /* In case of failure we need to decrement the number of open files because + * ob_fdclose() won't be called. */ - open_and_resume (this, wind_fd, stub); + LOCK(&fd->inode->lock); + { + ob_inode->open_count--; + } + UNLOCK(&fd->inode->lock); - return 0; -err: - STACK_UNWIND_STRICT (fstat, frame, -1, ENOMEM, 0, 0); + gf_smsg(this->name, GF_LOG_ERROR, -state, OPEN_BEHIND_MSG_FAILED, "fop=%s", + "open", "path=%s", loc->path, NULL); - return 0; + return default_open_failure_cbk(frame, -state); } - -int -ob_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +static int32_t +ob_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - call_stub_t *stub = NULL; - ob_fd_t *ob_fd = NULL; - gf_boolean_t unwind = _gf_false; - - LOCK (&fd->lock); - { - ob_fd = __ob_fd_ctx_get (this, fd); - if (ob_fd && ob_fd->open_frame) - /* if open() was never wound to backend, - no need to wind flush() either. - */ - unwind = _gf_true; - } - UNLOCK (&fd->lock); - - if (unwind) - goto unwind; + ob_inode_t *ob_inode; + call_stub_t *stub; + fd_t *first_fd; + ob_state_t state; + + /* Create requests are never delayed. We always send them synchronously. */ + state = ob_open_and_resume_fd(this, fd, 1, true, true, &ob_inode, + &first_fd); + if (state == OB_STATE_READY) { + /* There's no pending open, but there are other file descriptors opened + * so we simply forward the request synchronously. */ + return default_create(frame, this, loc, flags, mode, umask, fd, xdata); + } + + if (state == OB_STATE_OPEN_TRIGGERED) { + /* The first open is in progress (either because it was already issued + * or because this request triggered it). We try to create a new stub + * to retry the operation once the initial open completes. 
*/ + stub = fop_create_stub(frame, ob_create, loc, flags, mode, umask, fd, + xdata); + if (stub != NULL) { + return ob_stub_dispatch(this, ob_inode, first_fd, stub); + } - stub = fop_flush_stub (frame, default_flush_resume, fd, xdata); - if (!stub) - goto err; + state = -ENOMEM; + } - open_and_resume (this, fd, stub); + /* Since we forced a synchronous request, OB_STATE_FIRST_OPEN will never + * be returned by ob_open_and_resume_fd(). If we are here it can only be + * because there has been a problem. */ - return 0; -err: - STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM, 0); + /* In case of failure we need to decrement the number of open files because + * ob_fdclose() won't be called. */ - return 0; + LOCK(&fd->inode->lock); + { + ob_inode->open_count--; + } + UNLOCK(&fd->inode->lock); -unwind: - STACK_UNWIND_STRICT (flush, frame, 0, 0, 0); + gf_smsg(this->name, GF_LOG_ERROR, -state, OPEN_BEHIND_MSG_FAILED, "fop=%s", + "create", "path=%s", loc->path, NULL); - return 0; + return default_create_failure_cbk(frame, -state); } - -int -ob_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int flag, - dict_t *xdata) +static int32_t +ob_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - call_stub_t *stub = NULL; - - stub = fop_fsync_stub (frame, default_fsync_resume, fd, flag, xdata); - if (!stub) - goto err; + ob_conf_t *conf = this->private; + bool trigger = conf->read_after_open || !conf->use_anonymous_fd; - open_and_resume (this, fd, stub); + OB_POST_FD(readv, this, frame, fd, trigger, fd, size, offset, flags, xdata); - return 0; -err: - STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0); - - return 0; + return 0; } - -int -ob_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd, - struct gf_flock *flock, dict_t *xdata) +static int32_t +ob_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov, + int count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - call_stub_t *stub = NULL; - - stub = fop_lk_stub (frame, default_lk_resume, fd, cmd, flock, xdata); - if (!stub) - goto err; - - open_and_resume (this, fd, stub); - - return 0; -err: - STACK_UNWIND_STRICT (lk, frame, -1, ENOMEM, 0, 0); + OB_POST_FD(writev, this, frame, fd, true, fd, iov, count, offset, flags, + iobref, xdata); - return 0; + return 0; } -int -ob_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - dict_t *xdata) +static int32_t +ob_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - call_stub_t *stub = NULL; - - stub = fop_ftruncate_stub (frame, default_ftruncate_resume, fd, offset, - xdata); - if (!stub) - goto err; - - open_and_resume (this, fd, stub); + ob_conf_t *conf = this->private; + bool trigger = !conf->use_anonymous_fd; - return 0; -err: - STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOMEM, 0, 0, 0); + OB_POST_FD(fstat, this, frame, fd, trigger, fd, xdata); - return 0; + return 0; } - -int -ob_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr, - int flags, dict_t *xdata) +static int32_t +ob_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) { - call_stub_t *stub = NULL; + ob_conf_t *conf = this->private; + bool trigger = !conf->use_anonymous_fd; - stub = fop_fsetxattr_stub (frame, default_fsetxattr_resume, fd, xattr, - flags, xdata); - if (!stub) - goto err; + OB_POST_FD(seek, this, frame, fd, trigger, fd, offset, what, xdata); - open_and_resume (this, fd, stub); - - return 0; -err: - 
STACK_UNWIND_STRICT (fsetxattr, frame, -1, ENOMEM, 0); - - return 0; + return 0; } - -int -ob_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, - dict_t *xdata) +static int32_t +ob_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - call_stub_t *stub = NULL; - - stub = fop_fgetxattr_stub (frame, default_fgetxattr_resume, fd, name, - xdata); - if (!stub) - goto err; + OB_POST_FLUSH(this, frame, fd, fd, xdata); - open_and_resume (this, fd, stub); - - return 0; -err: - STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOMEM, 0, 0); - - return 0; + return 0; } - -int -ob_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name, dict_t *xdata) +static int32_t +ob_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int flag, dict_t *xdata) { - call_stub_t *stub = NULL; - - stub = fop_fremovexattr_stub (frame, default_fremovexattr_resume, fd, - name, xdata); - if (!stub) - goto err; + OB_POST_FD(fsync, this, frame, fd, true, fd, flag, xdata); - open_and_resume (this, fd, stub); + return 0; +} - return 0; -err: - STACK_UNWIND_STRICT (fremovexattr, frame, -1, ENOMEM, 0); +static int32_t +ob_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd, + struct gf_flock *flock, dict_t *xdata) +{ + OB_POST_FD(lk, this, frame, fd, true, fd, cmd, flock, xdata); - return 0; + return 0; } - -int -ob_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, - int cmd, struct gf_flock *flock, dict_t *xdata) +static int32_t +ob_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - call_stub_t *stub = NULL; - - stub = fop_finodelk_stub (frame, default_finodelk_resume, volume, fd, - cmd, flock, xdata); - if (!stub) - goto err; + OB_POST_FD(ftruncate, this, frame, fd, true, fd, offset, xdata); - open_and_resume (this, fd, stub); + return 0; +} - return 0; -err: - STACK_UNWIND_STRICT (finodelk, frame, -1, ENOMEM, 0); +static int32_t +ob_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr, + int flags, dict_t *xdata) +{ + OB_POST_FD(fsetxattr, this, frame, fd, true, fd, xattr, flags, xdata); - return 0; + return 0; } - -int -ob_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, entrylk_type type, - dict_t *xdata) +static int32_t +ob_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) { - call_stub_t *stub = NULL; - - stub = fop_fentrylk_stub (frame, default_fentrylk_resume, volume, fd, - basename, cmd, type, xdata); - if (!stub) - goto err; + OB_POST_FD(fgetxattr, this, frame, fd, true, fd, name, xdata); - open_and_resume (this, fd, stub); + return 0; +} - return 0; -err: - STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOMEM, 0); +static int32_t +ob_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + OB_POST_FD(fremovexattr, this, frame, fd, true, fd, name, xdata); - return 0; + return 0; } - -int -ob_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +static int32_t +ob_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int cmd, struct gf_flock *flock, dict_t *xdata) { - call_stub_t *stub = NULL; - - stub = fop_fxattrop_stub (frame, default_fxattrop_resume, fd, optype, - xattr, xdata); - if (!stub) - goto err; + OB_POST_FD(finodelk, this, frame, fd, true, volume, fd, cmd, flock, xdata); - open_and_resume (this, fd, stub); + return 0; +} 
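/* The fd-based handlers above and below all route through OB_POST_FD(),
 * whose definition sits earlier in open-behind.c and is outside this
 * excerpt. Inferring from the removed "build a call stub, open_and_resume()"
 * code and from ob_stub_dispatch()/ob_open_and_resume_fd() shown in this
 * hunk, the dispatch pattern is roughly the sketch below. The exact
 * expansion, the middle arguments passed to ob_open_and_resume_fd(), and the
 * ENOMEM / OB_STATE_FIRST_OPEN handling are assumptions for illustration
 * only, not the macro actually added by this patch. */
#define OB_POST_FD_SKETCH(_fop, _xl, _frame, _fd, _trigger, ...)              \
    do {                                                                      \
        ob_inode_t *__ob_inode = NULL;                                        \
        fd_t *__first_fd = NULL;                                              \
        ob_state_t __state = ob_open_and_resume_fd(                           \
            _xl, _fd, 0, true, _trigger, &__ob_inode, &__first_fd);           \
        if (__state == OB_STATE_READY) {                                      \
            /* no delayed open pending: forward the fop synchronously */      \
            default_##_fop(_frame, _xl, __VA_ARGS__);                         \
        } else {                                                              \
            /* park the fop in a stub until the background open completes */  \
            call_stub_t *__stub = fop_##_fop##_stub(                          \
                _frame, default_##_fop##_resume, __VA_ARGS__);                \
            ob_stub_dispatch(_xl, __ob_inode, __first_fd, __stub);            \
        }                                                                     \
    } while (0)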
- return 0; -err: - STACK_UNWIND_STRICT (fxattrop, frame, -1, ENOMEM, 0, 0); +static int32_t +ob_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + OB_POST_FD(fentrylk, this, frame, fd, true, volume, fd, basename, cmd, type, + xdata); - return 0; + return 0; } - -int -ob_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *iatt, int valid, dict_t *xdata) +static int32_t +ob_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - call_stub_t *stub = NULL; - - stub = fop_fsetattr_stub (frame, default_fsetattr_resume, fd, - iatt, valid, xdata); - if (!stub) - goto err; + OB_POST_FD(fxattrop, this, frame, fd, true, fd, optype, xattr, xdata); - open_and_resume (this, fd, stub); + return 0; +} - return 0; -err: - STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOMEM, 0, 0, 0); +static int32_t +ob_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *iatt, + int valid, dict_t *xdata) +{ + OB_POST_FD(fsetattr, this, frame, fd, true, fd, iatt, valid, xdata); - return 0; + return 0; } -int +static int32_t ob_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, - off_t offset, size_t len, dict_t *xdata) + off_t offset, size_t len, dict_t *xdata) { - call_stub_t *stub; - - stub = fop_fallocate_stub(frame, default_fallocate_resume, fd, mode, - offset, len, xdata); - if (!stub) - goto err; - - open_and_resume(this, fd, stub); + OB_POST_FD(fallocate, this, frame, fd, true, fd, mode, offset, len, xdata); - return 0; -err: - STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; + return 0; } -int +static int32_t ob_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata) + size_t len, dict_t *xdata) { - call_stub_t *stub; + OB_POST_FD(discard, this, frame, fd, true, fd, offset, len, xdata); - stub = fop_discard_stub(frame, default_discard_resume, fd, offset, len, - xdata); - if (!stub) - goto err; + return 0; +} - open_and_resume(this, fd, stub); +static int32_t +ob_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + OB_POST_FD(zerofill, this, frame, fd, true, fd, offset, len, xdata); - return 0; -err: - STACK_UNWIND_STRICT(discard, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; + return 0; } -int -ob_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, - dict_t *xdata) +static int32_t +ob_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) { - fd_t *fd = NULL; - call_stub_t *stub = NULL; + OB_POST_INODE(unlink, this, frame, loc->inode, true, loc, xflags, xdata); - stub = fop_unlink_stub (frame, default_unlink_resume, loc, - xflags, xdata); - if (!stub) - goto err; + return 0; +} - fd = fd_lookup (loc->inode, 0); +static int32_t +ob_rename(call_frame_t *frame, xlator_t *this, loc_t *src, loc_t *dst, + dict_t *xdata) +{ + OB_POST_INODE(rename, this, frame, dst->inode, true, src, dst, xdata); - open_and_resume (this, fd, stub); + return 0; +} - return 0; -err: - STACK_UNWIND_STRICT (unlink, frame, -1, ENOMEM, 0, 0, 0); +static int32_t +ob_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + OB_POST_INODE(setattr, this, frame, loc->inode, true, loc, stbuf, valid, + xdata); - return 0; + return 0; } - -int -ob_rename (call_frame_t *frame, xlator_t *this, loc_t *src, 
loc_t *dst, - dict_t *xdata) +static int32_t +ob_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { - fd_t *fd = NULL; - call_stub_t *stub = NULL; + if (dict_get(dict, POSIX_ACL_DEFAULT_XATTR) || + dict_get(dict, POSIX_ACL_ACCESS_XATTR) || + dict_get(dict, GF_SELINUX_XATTR_KEY)) { + return default_setxattr(frame, this, loc, dict, flags, xdata); + } - stub = fop_rename_stub (frame, default_rename_resume, src, dst, xdata); - if (!stub) - goto err; + OB_POST_INODE(setxattr, this, frame, loc->inode, true, loc, dict, flags, + xdata); - if (dst->inode) - fd = fd_lookup (dst->inode, 0); + return 0; +} - open_and_resume (this, fd, stub); +static void +ob_fdclose(xlator_t *this, fd_t *fd) +{ + struct list_head list; + ob_inode_t *ob_inode; + call_stub_t *stub; + + INIT_LIST_HEAD(&list); + stub = NULL; + + LOCK(&fd->inode->lock); + { + ob_inode = ob_inode_get_locked(this, fd->inode); + if (ob_inode != NULL) { + ob_inode->open_count--; + + /* If this fd is the same as ob_inode->first_fd, it means that + * the initial open has not fully completed. We'll try to cancel + * it. */ + if (ob_inode->first_fd == fd) { + if (ob_inode->first_open == OB_OPEN_PREPARING) { + /* In this case ob_open_dispatch() has not been called yet. + * We clear first_fd and first_open to allow that function + * to know that the open is not really needed. This also + * allows other requests to work as expected if they + * arrive before the dispatch function is called. If there + * are pending fops, we can directly process them here. + * (note that there shouldn't be any fd related fops, but + * if there are, it's fine if they fail). */ + ob_inode->first_fd = NULL; + ob_inode->first_open = NULL; + ob_inode->triggered = false; + list_splice_init(&ob_inode->resume_fops, &list); + } else if (!ob_inode->triggered) { + /* If the open has already been dispatched, we can only + * cancel it if it has not been triggered. Otherwise we + * simply wait until it completes. While it's not triggered, + * first_open must be a valid stub and there can't be any + * pending fops. 
*/ + GF_ASSERT((ob_inode->first_open != NULL) && + list_empty(&ob_inode->resume_fops)); + + ob_inode->first_fd = NULL; + stub = ob_inode->first_open; + ob_inode->first_open = NULL; + } + } + } + } + UNLOCK(&fd->inode->lock); - return 0; -err: - STACK_UNWIND_STRICT (rename, frame, -1, ENOMEM, 0, 0, 0, 0, 0, 0); + if (stub != NULL) { + ob_open_destroy(stub, fd); + } - return 0; + ob_resume_pending(&list); } - int -ob_release (xlator_t *this, fd_t *fd) +ob_forget(xlator_t *this, inode_t *inode) { - ob_fd_t *ob_fd = NULL; + ob_inode_t *ob_inode; + uint64_t value = 0; - ob_fd = ob_fd_ctx_get (this, fd); + if ((inode_ctx_del(inode, this, &value) == 0) && (value != 0)) { + ob_inode = (ob_inode_t *)(uintptr_t)value; + GF_FREE(ob_inode); + } - ob_fd_free (ob_fd); - - return 0; + return 0; } - int -ob_priv_dump (xlator_t *this) +ob_priv_dump(xlator_t *this) { - ob_conf_t *conf = NULL; - char key_prefix[GF_DUMP_MAX_BUF_LEN]; + ob_conf_t *conf = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; - conf = this->private; + conf = this->private; - if (!conf) - return -1; + if (!conf) + return -1; - gf_proc_dump_build_key (key_prefix, "xlator.performance.open-behind", - "priv"); + gf_proc_dump_build_key(key_prefix, "xlator.performance.open-behind", + "priv"); - gf_proc_dump_add_section (key_prefix); + gf_proc_dump_add_section("%s", key_prefix); - gf_proc_dump_write ("use_anonymous_fd", "%d", conf->use_anonymous_fd); + gf_proc_dump_write("use_anonymous_fd", "%d", conf->use_anonymous_fd); - gf_proc_dump_write ("lazy_open", "%d", conf->lazy_open); + gf_proc_dump_write("lazy_open", "%d", conf->lazy_open); - return 0; + return 0; } - int -ob_fdctx_dump (xlator_t *this, fd_t *fd) +ob_fdctx_dump(xlator_t *this, fd_t *fd) { - ob_fd_t *ob_fd = NULL; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - int ret = 0; - - ret = TRY_LOCK (&fd->lock); - if (ret) - return 0; - - ob_fd = __ob_fd_ctx_get (this, fd); - if (!ob_fd) { - UNLOCK (&fd->lock); - return 0; - } - - gf_proc_dump_build_key (key_prefix, "xlator.performance.open-behind", - "file"); - gf_proc_dump_add_section (key_prefix); - - gf_proc_dump_write ("fd", "%p", fd); - - gf_proc_dump_write ("open_frame", "%p", ob_fd->open_frame); + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + uint64_t value = 0; + int ret = 0, error = 0; + + ret = TRY_LOCK(&fd->lock); + if (ret) + return 0; - gf_proc_dump_write ("open_frame.root.unique", "%p", - ob_fd->open_frame->root->unique); + if ((__fd_ctx_get(fd, this, &value) == 0) && (value != 0)) { + error = (int32_t)value; + } - gf_proc_dump_write ("loc.path", "%s", ob_fd->loc.path); + gf_proc_dump_build_key(key_prefix, "xlator.performance.open-behind", + "file"); + gf_proc_dump_add_section("%s", key_prefix); - gf_proc_dump_write ("loc.ino", "%s", uuid_utoa (ob_fd->loc.gfid)); + gf_proc_dump_write("fd", "%p", fd); - gf_proc_dump_write ("flags", "%p", ob_fd->open_frame); + gf_proc_dump_write("error", "%d", error); - UNLOCK (&fd->lock); + UNLOCK(&fd->lock); - return 0; + return 0; } - int -mem_acct_init (xlator_t *this) +mem_acct_init(xlator_t *this) { - int ret = -1; + int ret = -1; - ret = xlator_mem_acct_init (this, gf_ob_mt_end + 1); + ret = xlator_mem_acct_init(this, gf_ob_mt_end + 1); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "Memory accounting failed"); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, OPEN_BEHIND_MSG_NO_MEMORY, + "Memory accounting failed"); - return ret; + return ret; } - int -reconfigure (xlator_t *this, dict_t *options) +reconfigure(xlator_t *this, dict_t *options) { - ob_conf_t *conf = NULL; - 
int ret = -1; + ob_conf_t *conf = NULL; + int ret = -1; + + conf = this->private; - conf = this->private; + GF_OPTION_RECONF("use-anonymous-fd", conf->use_anonymous_fd, options, bool, + out); - GF_OPTION_RECONF ("use-anonymous-fd", conf->use_anonymous_fd, options, - bool, out); + GF_OPTION_RECONF("lazy-open", conf->lazy_open, options, bool, out); - GF_OPTION_RECONF ("lazy-open", conf->lazy_open, options, bool, out); + GF_OPTION_RECONF("read-after-open", conf->read_after_open, options, bool, + out); - ret = 0; + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out); + ret = 0; out: - return ret; + return ret; } - int -init (xlator_t *this) +init(xlator_t *this) { - ob_conf_t *conf = NULL; + ob_conf_t *conf = NULL; + + if (!this->children || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, 0, + OPEN_BEHIND_MSG_XLATOR_CHILD_MISCONFIGURED, + "FATAL: volume (%s) not configured with exactly one " + "child", + this->name); + return -1; + } - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: volume (%s) not configured with exactly one " - "child", this->name); - return -1; - } + if (!this->parents) + gf_msg(this->name, GF_LOG_WARNING, 0, OPEN_BEHIND_MSG_VOL_MISCONFIGURED, + "dangling volume. check volfile "); + + conf = GF_CALLOC(1, sizeof(*conf), gf_ob_mt_conf_t); + if (!conf) + goto err; - if (!this->parents) - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); + GF_OPTION_INIT("use-anonymous-fd", conf->use_anonymous_fd, bool, err); - conf = GF_CALLOC (1, sizeof (*conf), gf_ob_mt_conf_t); - if (!conf) - goto err; + GF_OPTION_INIT("lazy-open", conf->lazy_open, bool, err); - GF_OPTION_INIT ("use-anonymous-fd", conf->use_anonymous_fd, bool, err); + GF_OPTION_INIT("read-after-open", conf->read_after_open, bool, err); - GF_OPTION_INIT ("lazy-open", conf->lazy_open, bool, err); + GF_OPTION_INIT("pass-through", this->pass_through, bool, err); - this->private = conf; + this->private = conf; - return 0; + return 0; err: - if (conf) - GF_FREE (conf); + if (conf) + GF_FREE(conf); - return -1; + return -1; } - void -fini (xlator_t *this) +fini(xlator_t *this) { - ob_conf_t *conf = NULL; + ob_conf_t *conf = NULL; - conf = this->private; + conf = this->private; - GF_FREE (conf); + GF_FREE(conf); - return; + return; } - struct xlator_fops fops = { - .open = ob_open, - .readv = ob_readv, - .writev = ob_writev, - .flush = ob_flush, - .fsync = ob_fsync, - .fstat = ob_fstat, - .ftruncate = ob_ftruncate, - .fsetxattr = ob_fsetxattr, - .fgetxattr = ob_fgetxattr, - .fremovexattr = ob_fremovexattr, - .finodelk = ob_finodelk, - .fentrylk = ob_fentrylk, - .fxattrop = ob_fxattrop, - .fsetattr = ob_fsetattr, - .fallocate = ob_fallocate, - .discard = ob_discard, - .unlink = ob_unlink, - .rename = ob_rename, - .lk = ob_lk, + .open = ob_open, + .create = ob_create, + .readv = ob_readv, + .writev = ob_writev, + .flush = ob_flush, + .fsync = ob_fsync, + .fstat = ob_fstat, + .seek = ob_seek, + .ftruncate = ob_ftruncate, + .fsetxattr = ob_fsetxattr, + .setxattr = ob_setxattr, + .fgetxattr = ob_fgetxattr, + .fremovexattr = ob_fremovexattr, + .finodelk = ob_finodelk, + .fentrylk = ob_fentrylk, + .fxattrop = ob_fxattrop, + .fsetattr = ob_fsetattr, + .setattr = ob_setattr, + .fallocate = ob_fallocate, + .discard = ob_discard, + .zerofill = ob_zerofill, + .unlink = ob_unlink, + .rename = ob_rename, + .lk = ob_lk, }; struct xlator_cbks cbks = { - .release = ob_release, + .fdclose = ob_fdclose, + .forget = ob_forget, }; struct xlator_dumpops 
dumpops = { - .priv = ob_priv_dump, - .fdctx = ob_fdctx_dump, + .priv = ob_priv_dump, + .fdctx = ob_fdctx_dump, }; - struct volume_options options[] = { - { .key = {"use-anonymous-fd"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "yes", - .description = "For read operations, use anonymous FD when " - "original FD is open-behind and not yet opened in the backend.", - }, - { .key = {"lazy-open"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "yes", - .description = "Perform open in the backend only when a necessary " - "FOP arrives (e.g writev on the FD, unlink of the file). When option " - "is disabled, perform backend open right after unwinding open().", - }, - { .key = {NULL} } + { + .key = {"open-behind"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable open-behind", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + { + .key = {"use-anonymous-fd"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .description = + "For read operations, use anonymous FD when " + "original FD is open-behind and not yet opened in the backend.", + }, + { + .key = {"lazy-open"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "yes", + .description = + "Perform open in the backend only when a necessary " + "FOP arrives (e.g writev on the FD, unlink of the file). When " + "option " + "is disabled, perform backend open right after unwinding open().", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT, + .tags = {}, + /* option_validation_fn validate_fn; */ + }, + { + .key = {"read-after-open"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "yes", + .description = "read is sent only after actual open happens and real " + "fd is obtained, instead of doing on anonymous fd " + "(similar to write)", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT, + .tags = {}, + /* option_validation_fn validate_fn; */ + }, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"open-behind"}, + .description = "Enable/Disable open behind translator"}, + {.key = {NULL}} + +}; +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "open-behind", + .category = GF_MAINTAINED, }; diff --git a/xlators/performance/quick-read/src/Makefile.am b/xlators/performance/quick-read/src/Makefile.am index 4906f408abc..8eb6cece738 100644 --- a/xlators/performance/quick-read/src/Makefile.am +++ b/xlators/performance/quick-read/src/Makefile.am @@ -1,14 +1,15 @@ xlator_LTLIBRARIES = quick-read.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -quick_read_la_LDFLAGS = -module -avoid-version +quick_read_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) quick_read_la_SOURCES = quick-read.c quick_read_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = quick-read.h quick-read-mem-types.h +noinst_HEADERS = quick-read.h quick-read-mem-types.h quick-read-messages.h -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git 
a/xlators/performance/quick-read/src/quick-read-mem-types.h b/xlators/performance/quick-read/src/quick-read-mem-types.h index 78547f64116..e4aef8549ff 100644 --- a/xlators/performance/quick-read/src/quick-read-mem-types.h +++ b/xlators/performance/quick-read/src/quick-read-mem-types.h @@ -11,17 +11,13 @@ #ifndef __QR_MEM_TYPES_H__ #define __QR_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_qr_mem_types_ { - gf_qr_mt_qr_inode_t = gf_common_mt_end + 1, - gf_qr_mt_content_t, - gf_qr_mt_qr_fd_ctx_t, - gf_qr_mt_iovec, - gf_qr_mt_qr_conf_t, - gf_qr_mt_qr_priority_t, - gf_qr_mt_qr_private_t, - gf_qr_mt_qr_unlink_ctx_t, - gf_qr_mt_end + gf_qr_mt_qr_inode_t = gf_common_mt_end + 1, + gf_qr_mt_content_t, + gf_qr_mt_qr_priority_t, + gf_qr_mt_qr_private_t, + gf_qr_mt_end }; #endif diff --git a/xlators/performance/quick-read/src/quick-read-messages.h b/xlators/performance/quick-read/src/quick-read-messages.h new file mode 100644 index 00000000000..da9724a3c9c --- /dev/null +++ b/xlators/performance/quick-read/src/quick-read-messages.h @@ -0,0 +1,31 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _QUICK_READ_MESSAGES_H_ +#define _QUICK_READ_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(QUICK_READ, QUICK_READ_MSG_ENFORCEMENT_FAILED, + QUICK_READ_MSG_INVALID_ARGUMENT, + QUICK_READ_MSG_XLATOR_CHILD_MISCONFIGURED, QUICK_READ_MSG_NO_MEMORY, + QUICK_READ_MSG_VOL_MISCONFIGURED, QUICK_READ_MSG_DICT_SET_FAILED, + QUICK_READ_MSG_INVALID_CONFIG, QUICK_READ_MSG_LRU_NOT_EMPTY); + +#endif /* _QUICK_READ_MESSAGES_H_ */ diff --git a/xlators/performance/quick-read/src/quick-read.c b/xlators/performance/quick-read/src/quick-read.c index 445ea8658ea..7fe4b3c3a4b 100644 --- a/xlators/performance/quick-read/src/quick-read.c +++ b/xlators/performance/quick-read/src/quick-read.c @@ -8,1140 +8,1637 @@ cases as published by the Free Software Foundation. 
*/ +#include <math.h> #include "quick-read.h" -#include "statedump.h" +#include <glusterfs/statedump.h> +#include "quick-read-messages.h" +#include <glusterfs/upcall-utils.h> +#include <glusterfs/atomic.h> -qr_inode_t *qr_inode_ctx_get (xlator_t *this, inode_t *inode); -void __qr_inode_prune (qr_inode_table_t *table, qr_inode_t *qr_inode); +typedef struct qr_local { + inode_t *inode; + uint64_t incident_gen; + fd_t *fd; +} qr_local_t; +qr_inode_t * +qr_inode_ctx_get(xlator_t *this, inode_t *inode); -int -__qr_inode_ctx_set (xlator_t *this, inode_t *inode, qr_inode_t *qr_inode) +void +__qr_inode_prune_data(xlator_t *this, qr_inode_table_t *table, + qr_inode_t *qr_inode); + +void +qr_local_wipe(qr_local_t *local) { - uint64_t value = 0; - int ret = -1; + if (!local) + goto out; - value = (long) qr_inode; + if (local->inode) + inode_unref(local->inode); - ret = __inode_ctx_set (inode, this, &value); + if (local->fd) + fd_unref(local->fd); - return ret; + GF_FREE(local); +out: + return; } +uint64_t +__qr_get_generation(xlator_t *this, qr_inode_t *qr_inode) +{ + uint64_t gen = 0, rollover; + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + + priv = this->private; + table = &priv->table; + + gen = GF_ATOMIC_INC(priv->generation); + if (gen == 0) { + qr_inode->gen_rollover = !qr_inode->gen_rollover; + gen = GF_ATOMIC_INC(priv->generation); + __qr_inode_prune_data(this, table, qr_inode); + qr_inode->gen = qr_inode->invalidation_time = gen - 1; + } + + rollover = qr_inode->gen_rollover; + gen |= (rollover << 32); + return gen; +} -qr_inode_t * -__qr_inode_ctx_get (xlator_t *this, inode_t *inode) +uint64_t +qr_get_generation(xlator_t *this, inode_t *inode) { - qr_inode_t *qr_inode = NULL; - uint64_t value = 0; - int ret = -1; + qr_inode_t *qr_inode = NULL; + uint64_t gen = 0; + qr_inode_table_t *table = NULL; + qr_private_t *priv = NULL; + + priv = this->private; + table = &priv->table; + + qr_inode = qr_inode_ctx_get(this, inode); + + if (qr_inode) { + LOCK(&table->lock); + { + gen = __qr_get_generation(this, qr_inode); + } + UNLOCK(&table->lock); + } else { + gen = GF_ATOMIC_INC(priv->generation); + if (gen == 0) { + gen = GF_ATOMIC_INC(priv->generation); + } + } + + return gen; +} - ret = __inode_ctx_get (inode, this, &value); - if (ret) - return NULL; +qr_local_t * +qr_local_get(xlator_t *this, inode_t *inode) +{ + qr_local_t *local = NULL; - qr_inode = (void *) ((long) value); + local = GF_CALLOC(1, sizeof(*local), gf_common_mt_char); + if (!local) + goto out; - return qr_inode; + local->incident_gen = qr_get_generation(this, inode); +out: + return local; } +#define QR_STACK_UNWIND(fop, frame, params...) 
\ + do { \ + qr_local_t *__local = NULL; \ + if (frame) { \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + qr_local_wipe(__local); \ + } while (0) + +void +__qr_inode_prune(xlator_t *this, qr_inode_table_t *table, qr_inode_t *qr_inode, + uint64_t gen); + +int +__qr_inode_ctx_set(xlator_t *this, inode_t *inode, qr_inode_t *qr_inode) +{ + uint64_t value = 0; + int ret = -1; + + value = (long)qr_inode; + + ret = __inode_ctx_set(inode, this, &value); + + return ret; +} qr_inode_t * -qr_inode_ctx_get (xlator_t *this, inode_t *inode) +__qr_inode_ctx_get(xlator_t *this, inode_t *inode) { - qr_inode_t *qr_inode = NULL; + qr_inode_t *qr_inode = NULL; + uint64_t value = 0; + int ret = -1; - LOCK (&inode->lock); - { - qr_inode = __qr_inode_ctx_get (this, inode); - } - UNLOCK (&inode->lock); + ret = __inode_ctx_get(inode, this, &value); + if (ret) + return NULL; - return qr_inode; + qr_inode = (void *)((long)value); + + return qr_inode; } +qr_inode_t * +qr_inode_ctx_get(xlator_t *this, inode_t *inode) +{ + qr_inode_t *qr_inode = NULL; + + if (inode == NULL) + goto out; + + LOCK(&inode->lock); + { + qr_inode = __qr_inode_ctx_get(this, inode); + } + UNLOCK(&inode->lock); + +out: + return qr_inode; +} qr_inode_t * -qr_inode_new (xlator_t *this, inode_t *inode) +qr_inode_new(xlator_t *this, inode_t *inode) { - qr_inode_t *qr_inode = NULL; + qr_inode_t *qr_inode = NULL; - qr_inode = GF_CALLOC (1, sizeof (*qr_inode), gf_qr_mt_qr_inode_t); - if (!qr_inode) - return NULL; + qr_inode = GF_CALLOC(1, sizeof(*qr_inode), gf_qr_mt_qr_inode_t); + if (!qr_inode) + return NULL; - INIT_LIST_HEAD (&qr_inode->lru); + INIT_LIST_HEAD(&qr_inode->lru); - qr_inode->priority = 0; /* initial priority */ + qr_inode->priority = 0; /* initial priority */ - return qr_inode; + return qr_inode; } - qr_inode_t * -qr_inode_ctx_get_or_new (xlator_t *this, inode_t *inode) +qr_inode_ctx_get_or_new(xlator_t *this, inode_t *inode) { - qr_inode_t *qr_inode = NULL; - int ret = -1; - qr_private_t *priv = NULL; - - priv = this->private; - - LOCK (&inode->lock); - { - qr_inode = __qr_inode_ctx_get (this, inode); - if (qr_inode) - goto unlock; - - qr_inode = qr_inode_new (this, inode); - if (!qr_inode) - goto unlock; - - ret = __qr_inode_ctx_set (this, inode, qr_inode); - if (ret) { - __qr_inode_prune (&priv->table, qr_inode); - GF_FREE (qr_inode); - } - } + qr_inode_t *qr_inode = NULL; + int ret = -1; + qr_private_t *priv = NULL; + + priv = this->private; + + LOCK(&inode->lock); + { + qr_inode = __qr_inode_ctx_get(this, inode); + if (qr_inode) + goto unlock; + + qr_inode = qr_inode_new(this, inode); + if (!qr_inode) + goto unlock; + + ret = __qr_inode_ctx_set(this, inode, qr_inode); + if (ret) { + __qr_inode_prune(this, &priv->table, qr_inode, 0); + GF_FREE(qr_inode); + qr_inode = NULL; + } + } unlock: - UNLOCK (&inode->lock); + UNLOCK(&inode->lock); - return qr_inode; + return qr_inode; } - uint32_t -qr_get_priority (qr_conf_t *conf, const char *path) +qr_get_priority(qr_conf_t *conf, const char *path) { - uint32_t priority = 0; - struct qr_priority *curr = NULL; + uint32_t priority = 0; + struct qr_priority *curr = NULL; - list_for_each_entry (curr, &conf->priority_list, list) { - if (fnmatch (curr->pattern, path, FNM_NOESCAPE) == 0) - priority = curr->priority; - } + list_for_each_entry(curr, &conf->priority_list, list) + { + if (fnmatch(curr->pattern, path, FNM_NOESCAPE) == 0) + priority = curr->priority; + } - return priority; + return priority; } - void -__qr_inode_register 
(qr_inode_table_t *table, qr_inode_t *qr_inode) +__qr_inode_register(xlator_t *this, qr_inode_table_t *table, + qr_inode_t *qr_inode) { - if (!qr_inode->data) - return; + qr_private_t *priv = NULL; - if (list_empty (&qr_inode->lru)) - /* first time addition of this qr_inode into table */ - table->cache_used += qr_inode->size; - else - list_del_init (&qr_inode->lru); + if (!qr_inode->data) + return; - list_add_tail (&qr_inode->lru, &table->lru[qr_inode->priority]); -} + priv = this->private; + if (!priv) + return; + if (list_empty(&qr_inode->lru)) + /* first time addition of this qr_inode into table */ + table->cache_used += qr_inode->size; + else + list_del_init(&qr_inode->lru); + + list_add_tail(&qr_inode->lru, &table->lru[qr_inode->priority]); + + GF_ATOMIC_INC(priv->qr_counter.files_cached); + + return; +} void -qr_inode_set_priority (xlator_t *this, inode_t *inode, const char *path) +qr_inode_set_priority(xlator_t *this, inode_t *inode, const char *path) { - uint32_t priority = 0; - qr_inode_table_t *table = NULL; - qr_inode_t *qr_inode = NULL; - qr_private_t *priv = NULL; - qr_conf_t *conf = NULL; - - qr_inode = qr_inode_ctx_get (this, inode); - if (!qr_inode) - return; - - priv = this->private; - table = &priv->table; - conf = &priv->conf; - - if (path) - priority = qr_get_priority (conf, path); - else - /* retain existing priority, just bump LRU */ - priority = qr_inode->priority; - - LOCK (&table->lock); - { - qr_inode->priority = priority; - - __qr_inode_register (table, qr_inode); - } - UNLOCK (&table->lock); -} + uint32_t priority = 0; + qr_inode_table_t *table = NULL; + qr_inode_t *qr_inode = NULL; + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + + qr_inode = qr_inode_ctx_get(this, inode); + if (!qr_inode) + return; + priv = this->private; + table = &priv->table; + conf = &priv->conf; + + if (path) + priority = qr_get_priority(conf, path); + else + /* retain existing priority, just bump LRU */ + priority = qr_inode->priority; + + LOCK(&table->lock); + { + qr_inode->priority = priority; + + __qr_inode_register(this, table, qr_inode); + } + UNLOCK(&table->lock); +} -/* To be called with priv->table.lock held */ void -__qr_inode_prune (qr_inode_table_t *table, qr_inode_t *qr_inode) +__qr_inode_prune_data(xlator_t *this, qr_inode_table_t *table, + qr_inode_t *qr_inode) { - GF_FREE (qr_inode->data); - qr_inode->data = NULL; + qr_private_t *priv = NULL; - if (!list_empty (&qr_inode->lru)) { - table->cache_used -= qr_inode->size; - qr_inode->size = 0; + priv = this->private; - list_del_init (&qr_inode->lru); - } + GF_FREE(qr_inode->data); + qr_inode->data = NULL; - memset (&qr_inode->buf, 0, sizeof (qr_inode->buf)); -} + if (!list_empty(&qr_inode->lru)) { + table->cache_used -= qr_inode->size; + qr_inode->size = 0; + + list_del_init(&qr_inode->lru); + GF_ATOMIC_DEC(priv->qr_counter.files_cached); + } + + memset(&qr_inode->buf, 0, sizeof(qr_inode->buf)); +} +/* To be called with priv->table.lock held */ void -qr_inode_prune (xlator_t *this, inode_t *inode) +__qr_inode_prune(xlator_t *this, qr_inode_table_t *table, qr_inode_t *qr_inode, + uint64_t gen) { - qr_private_t *priv = NULL; - qr_inode_table_t *table = NULL; - qr_inode_t *qr_inode = NULL; - - qr_inode = qr_inode_ctx_get (this, inode); - if (!qr_inode) - return; - - priv = this->private; - table = &priv->table; - - LOCK (&table->lock); - { - __qr_inode_prune (table, qr_inode); - } - UNLOCK (&table->lock); + __qr_inode_prune_data(this, table, qr_inode); + if (gen) + qr_inode->gen = gen; + qr_inode->invalidation_time = 
__qr_get_generation(this, qr_inode); } +void +qr_inode_prune(xlator_t *this, inode_t *inode, uint64_t gen) +{ + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + qr_inode_t *qr_inode = NULL; + + qr_inode = qr_inode_ctx_get(this, inode); + if (!qr_inode) + return; + + priv = this->private; + table = &priv->table; + + LOCK(&table->lock); + { + __qr_inode_prune(this, table, qr_inode, gen); + } + UNLOCK(&table->lock); +} /* To be called with priv->table.lock held */ void -__qr_cache_prune (qr_inode_table_t *table, qr_conf_t *conf) +__qr_cache_prune(xlator_t *this, qr_inode_table_t *table, qr_conf_t *conf) { - qr_inode_t *curr = NULL; - qr_inode_t *next = NULL; - int index = 0; - size_t size_pruned = 0; + qr_inode_t *curr = NULL; + qr_inode_t *next = NULL; + int index = 0; + size_t size_pruned = 0; - for (index = 0; index < conf->max_pri; index++) { - list_for_each_entry_safe (curr, next, &table->lru[index], lru) { + for (index = 0; index < conf->max_pri; index++) { + list_for_each_entry_safe(curr, next, &table->lru[index], lru) + { + size_pruned += curr->size; - size_pruned += curr->size; + __qr_inode_prune(this, table, curr, 0); - __qr_inode_prune (table, curr); - - if (table->cache_used < conf->cache_size) - return; - } + if (table->cache_used < conf->cache_size) + return; } + } - return; + return; } - void -qr_cache_prune (xlator_t *this) +qr_cache_prune(xlator_t *this) { - qr_private_t *priv = NULL; - qr_conf_t *conf = NULL; - qr_inode_table_t *table = NULL; - - priv = this->private; - table = &priv->table; - conf = &priv->conf; - - LOCK (&table->lock); - { - if (table->cache_used > conf->cache_size) - __qr_cache_prune (table, conf); - } - UNLOCK (&table->lock); + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + qr_inode_table_t *table = NULL; + + priv = this->private; + table = &priv->table; + conf = &priv->conf; + + LOCK(&table->lock); + { + if (table->cache_used > conf->cache_size) + __qr_cache_prune(this, table, conf); + } + UNLOCK(&table->lock); } - void * -qr_content_extract (dict_t *xdata) +qr_content_extract(dict_t *xdata) { - data_t *data = NULL; - void *content = NULL; + data_t *data = NULL; + void *content = NULL; + int ret = 0; - data = dict_get (xdata, GF_CONTENT_KEY); - if (!data) - return NULL; + ret = dict_get_with_ref(xdata, GF_CONTENT_KEY, &data); + if (ret < 0 || !data) + return NULL; - content = GF_CALLOC (1, data->len, gf_qr_mt_content_t); - if (!content) - return NULL; + content = GF_MALLOC(data->len, gf_qr_mt_content_t); + if (!content) + goto out; - memcpy (content, data->data, data->len); + memcpy(content, data->data, data->len); - return content; +out: + data_unref(data); + return content; } - void -qr_content_update (xlator_t *this, qr_inode_t *qr_inode, void *data, - struct iatt *buf) +qr_content_update(xlator_t *this, qr_inode_t *qr_inode, void *data, + struct iatt *buf, uint64_t gen) { - qr_private_t *priv = NULL; - qr_inode_table_t *table = NULL; + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + uint32_t rollover = 0; - priv = this->private; - table = &priv->table; + rollover = gen >> 32; + gen = gen & 0xffffffff; - LOCK (&table->lock); - { - __qr_inode_prune (table, qr_inode); + priv = this->private; + table = &priv->table; - qr_inode->data = data; - qr_inode->size = buf->ia_size; + LOCK(&table->lock); + { + if ((rollover != qr_inode->gen_rollover) || + (gen && qr_inode->gen && (qr_inode->gen >= gen))) + goto unlock; - qr_inode->ia_mtime = buf->ia_mtime; - qr_inode->ia_mtime_nsec = buf->ia_mtime_nsec; + if ((qr_inode->data 
== NULL) && (qr_inode->invalidation_time >= gen)) + goto unlock; - qr_inode->buf = *buf; + __qr_inode_prune(this, table, qr_inode, gen); - gettimeofday (&qr_inode->last_refresh, NULL); + qr_inode->data = data; + data = NULL; + qr_inode->size = buf->ia_size; - __qr_inode_register (table, qr_inode); - } - UNLOCK (&table->lock); + qr_inode->ia_mtime = buf->ia_mtime; + qr_inode->ia_mtime_nsec = buf->ia_mtime_nsec; + qr_inode->ia_ctime = buf->ia_ctime; + qr_inode->ia_ctime_nsec = buf->ia_ctime_nsec; - qr_cache_prune (this); -} + qr_inode->buf = *buf; + qr_inode->last_refresh = gf_time(); + __qr_inode_register(this, table, qr_inode); + } +unlock: + UNLOCK(&table->lock); + + if (data) + GF_FREE(data); + + qr_cache_prune(this); +} gf_boolean_t -qr_size_fits (qr_conf_t *conf, struct iatt *buf) +qr_size_fits(qr_conf_t *conf, struct iatt *buf) { - return (buf->ia_size <= conf->max_file_size); + return (buf->ia_size <= conf->max_file_size); } +gf_boolean_t +qr_mtime_equal(qr_inode_t *qr_inode, struct iatt *buf) +{ + return (qr_inode->ia_mtime == buf->ia_mtime && + qr_inode->ia_mtime_nsec == buf->ia_mtime_nsec); +} gf_boolean_t -qr_mtime_equal (qr_inode_t *qr_inode, struct iatt *buf) +qr_ctime_equal(qr_inode_t *qr_inode, struct iatt *buf) { - return (qr_inode->ia_mtime == buf->ia_mtime && - qr_inode->ia_mtime_nsec == buf->ia_mtime_nsec); + return (qr_inode->ia_ctime == buf->ia_ctime && + qr_inode->ia_ctime_nsec == buf->ia_ctime_nsec); } +gf_boolean_t +qr_time_equal(qr_conf_t *conf, qr_inode_t *qr_inode, struct iatt *buf) +{ + if (conf->ctime_invalidation) + return qr_ctime_equal(qr_inode, buf); + else + return qr_mtime_equal(qr_inode, buf); +} void -__qr_content_refresh (xlator_t *this, qr_inode_t *qr_inode, struct iatt *buf) +__qr_content_refresh(xlator_t *this, qr_inode_t *qr_inode, struct iatt *buf, + uint64_t gen) { - qr_private_t *priv = NULL; - qr_inode_table_t *table = NULL; - qr_conf_t *conf = NULL; - - priv = this->private; - table = &priv->table; - conf = &priv->conf; - - if (qr_size_fits (conf, buf) && qr_mtime_equal (qr_inode, buf)) { - qr_inode->buf = *buf; - - gettimeofday (&qr_inode->last_refresh, NULL); - - __qr_inode_register (table, qr_inode); - } else { - __qr_inode_prune (table, qr_inode); - } - - return; + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + qr_conf_t *conf = NULL; + uint32_t rollover = 0; + + rollover = gen >> 32; + gen = gen & 0xffffffff; + + priv = this->private; + table = &priv->table; + conf = &priv->conf; + + /* allow for rollover of frame->root->unique */ + if ((rollover != qr_inode->gen_rollover) || + (gen && qr_inode->gen && (qr_inode->gen >= gen))) + goto done; + + if ((qr_inode->data == NULL) && (qr_inode->invalidation_time >= gen)) + goto done; + + qr_inode->gen = gen; + + if (qr_size_fits(conf, buf) && qr_time_equal(conf, qr_inode, buf)) { + qr_inode->buf = *buf; + qr_inode->last_refresh = gf_time(); + __qr_inode_register(this, table, qr_inode); + } else { + __qr_inode_prune(this, table, qr_inode, gen); + } + +done: + return; } - void -qr_content_refresh (xlator_t *this, qr_inode_t *qr_inode, struct iatt *buf) +qr_content_refresh(xlator_t *this, qr_inode_t *qr_inode, struct iatt *buf, + uint64_t gen) { - qr_private_t *priv = NULL; - qr_inode_table_t *table = NULL; + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; - priv = this->private; - table = &priv->table; + priv = this->private; + table = &priv->table; - LOCK (&table->lock); - { - __qr_content_refresh (this, qr_inode, buf); - } - UNLOCK (&table->lock); + LOCK(&table->lock); 
+ { + __qr_content_refresh(this, qr_inode, buf, gen); + } + UNLOCK(&table->lock); } - gf_boolean_t -__qr_cache_is_fresh (xlator_t *this, qr_inode_t *qr_inode) +__qr_cache_is_fresh(xlator_t *this, qr_inode_t *qr_inode) { - qr_conf_t *conf = NULL; - qr_private_t *priv = NULL; - struct timeval now; - struct timeval diff; + qr_conf_t *conf = NULL; + qr_private_t *priv = NULL; - priv = this->private; - conf = &priv->conf; + priv = this->private; + conf = &priv->conf; - gettimeofday (&now, NULL); + if (qr_inode->last_refresh < priv->last_child_down) + return _gf_false; - timersub (&now, &qr_inode->last_refresh, &diff); + if (gf_time() - qr_inode->last_refresh >= conf->cache_timeout) + return _gf_false; - if (diff.tv_sec >= conf->cache_timeout) - return _gf_false; - - return _gf_true; + return _gf_true; } - int -qr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode_ret, - struct iatt *buf, dict_t *xdata, struct iatt *postparent) +qr_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode_ret, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) { - void *content = NULL; - qr_inode_t *qr_inode = NULL; - inode_t *inode = NULL; - - inode = frame->local; - frame->local = NULL; - - if (op_ret == -1) { - qr_inode_prune (this, inode); - goto out; - } - - if (dict_get (xdata, "sh-failed")) { - qr_inode_prune (this, inode); - goto out; - } - - content = qr_content_extract (xdata); - - if (content) { - /* new content came along, always replace old content */ - qr_inode = qr_inode_ctx_get_or_new (this, inode); - if (!qr_inode) - /* no harm done */ - goto out; - - qr_content_update (this, qr_inode, content, buf); - } else { - /* purge old content if necessary */ - qr_inode = qr_inode_ctx_get (this, inode); - if (!qr_inode) - /* usual path for large files */ - goto out; - - qr_content_refresh (this, qr_inode, buf); - } -out: - if (inode) - inode_unref (inode); + void *content = NULL; + qr_inode_t *qr_inode = NULL; + inode_t *inode = NULL; + qr_local_t *local = NULL; + + local = frame->local; + inode = local->inode; + + if (op_ret == -1) { + qr_inode_prune(this, inode, local->incident_gen); + goto out; + } + + if (dict_get(xdata, GLUSTERFS_BAD_INODE)) { + qr_inode_prune(this, inode, local->incident_gen); + goto out; + } + + if (dict_get(xdata, "sh-failed")) { + qr_inode_prune(this, inode, local->incident_gen); + goto out; + } + + content = qr_content_extract(xdata); + + if (content) { + /* new content came along, always replace old content */ + qr_inode = qr_inode_ctx_get_or_new(this, inode); + if (!qr_inode) { + /* no harm done */ + GF_FREE(content); + goto out; + } - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode_ret, - buf, xdata, postparent); - return 0; -} + qr_content_update(this, qr_inode, content, buf, local->incident_gen); + } else { + /* purge old content if necessary */ + qr_inode = qr_inode_ctx_get(this, inode); + if (!qr_inode) + /* usual path for large files */ + goto out; + qr_content_refresh(this, qr_inode, buf, local->incident_gen); + } +out: + QR_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode_ret, buf, xdata, + postparent); + return 0; +} int -qr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +qr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - qr_private_t *priv = NULL; - qr_conf_t *conf = NULL; - qr_inode_t *qr_inode = NULL; - int ret = -1; - dict_t *new_xdata = NULL; - - priv = this->private; - 
conf = &priv->conf; - - qr_inode = qr_inode_ctx_get (this, loc->inode); - if (qr_inode && qr_inode->data) - /* cached. only validate in qr_lookup_cbk */ - goto wind; - - if (!xdata) - xdata = new_xdata = dict_new (); - - if (!xdata) - goto wind; - - ret = 0; - if (conf->max_file_size) - ret = dict_set (xdata, GF_CONTENT_KEY, - data_from_uint64 (conf->max_file_size)); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "cannot set key in request dict (%s)", - loc->path); + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + qr_inode_t *qr_inode = NULL; + int ret = -1; + dict_t *new_xdata = NULL; + qr_local_t *local = NULL; + + priv = this->private; + conf = &priv->conf; + local = qr_local_get(this, loc->inode); + local->inode = inode_ref(loc->inode); + frame->local = local; + + qr_inode = qr_inode_ctx_get(this, loc->inode); + if (qr_inode && qr_inode->data) + /* cached. only validate in qr_lookup_cbk */ + goto wind; + + if (!xdata) + xdata = new_xdata = dict_new(); + + if (!xdata) + goto wind; + + ret = 0; + if (conf->max_file_size) + ret = dict_set(xdata, GF_CONTENT_KEY, + data_from_uint64(conf->max_file_size)); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, QUICK_READ_MSG_DICT_SET_FAILED, + "cannot set key in request dict (%s)", loc->path); wind: - frame->local = inode_ref (loc->inode); + STACK_WIND(frame, qr_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); - STACK_WIND (frame, qr_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, xdata); + if (new_xdata) + dict_unref(new_xdata); - if (new_xdata) - dict_unref (new_xdata); - - return 0; + return 0; } - int -qr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries, dict_t *xdata) +qr_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *entries, dict_t *xdata) { - gf_dirent_t *entry = NULL; - qr_inode_t *qr_inode = NULL; + gf_dirent_t *entry = NULL; + qr_inode_t *qr_inode = NULL; + qr_local_t *local = NULL; - if (op_ret <= 0) - goto unwind; + local = frame->local; - list_for_each_entry (entry, &entries->list, list) { - if (!entry->inode) - continue; + if (op_ret <= 0) + goto unwind; - qr_inode = qr_inode_ctx_get (this, entry->inode); - if (!qr_inode) - /* no harm */ - continue; + list_for_each_entry(entry, &entries->list, list) + { + if (!entry->inode) + continue; - qr_content_refresh (this, qr_inode, &entry->d_stat); - } + qr_inode = qr_inode_ctx_get(this, entry->inode); + if (!qr_inode) + /* no harm */ + continue; + + qr_content_refresh(this, qr_inode, &entry->d_stat, local->incident_gen); + } unwind: - STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); - return 0; + QR_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; } - int -qr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset, dict_t *xdata) +qr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - STACK_WIND (frame, qr_readdirp_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->readdirp, - fd, size, offset, xdata); - return 0; -} + qr_local_t *local = NULL; + local = qr_local_get(this, NULL); + frame->local = local; + + STACK_WIND(frame, qr_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, xdata); + return 0; +} int -qr_readv_cached (call_frame_t *frame, qr_inode_t *qr_inode, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) 
+qr_readv_cached(call_frame_t *frame, qr_inode_t *qr_inode, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - xlator_t *this = NULL; - qr_private_t *priv = NULL; - qr_inode_table_t *table = NULL; - int op_ret = -1; - struct iobuf *iobuf = NULL; - struct iobref *iobref = NULL; - struct iovec iov = {0, }; - struct iatt buf = {0, }; + xlator_t *this = NULL; + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + int op_ret = -1; + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + struct iovec iov = { + 0, + }; + struct iatt buf = { + 0, + }; + + this = frame->this; + priv = this->private; + table = &priv->table; + + LOCK(&table->lock); + { + if (!qr_inode->data) + goto unlock; + + if (offset >= qr_inode->size) + goto unlock; + + if (!__qr_cache_is_fresh(this, qr_inode)) + goto unlock; + + op_ret = min(size, (qr_inode->size - offset)); + + iobuf = iobuf_get2(this->ctx->iobuf_pool, op_ret); + if (!iobuf) { + op_ret = -1; + goto unlock; + } - this = frame->this; - priv = this->private; - table = &priv->table; + iobref = iobref_new(); + if (!iobref) { + op_ret = -1; + goto unlock; + } - LOCK (&table->lock); - { - op_ret = -1; + iobref_add(iobref, iobuf); - if (!qr_inode->data) - goto unlock; + memcpy(iobuf->ptr, qr_inode->data + offset, op_ret); - if (offset >= qr_inode->size) - goto unlock; + buf = qr_inode->buf; - if (!__qr_cache_is_fresh (this, qr_inode)) - goto unlock; + /* bump LRU */ + __qr_inode_register(frame->this, table, qr_inode); + } +unlock: + UNLOCK(&table->lock); - op_ret = min (size, (qr_inode->size - offset)); + if (op_ret >= 0) { + iov.iov_base = iobuf->ptr; + iov.iov_len = op_ret; - iobuf = iobuf_get2 (this->ctx->iobuf_pool, op_ret); - if (!iobuf) { - op_ret = -1; - goto unlock; - } + GF_ATOMIC_INC(priv->qr_counter.cache_hit); + STACK_UNWIND_STRICT(readv, frame, op_ret, 0, &iov, 1, &buf, iobref, + xdata); + } else { + GF_ATOMIC_INC(priv->qr_counter.cache_miss); + } - iobref = iobref_new (); - if (!iobref) { - op_ret = -1; - iobuf_unref (iobuf); - goto unlock; - } + if (iobuf) + iobuf_unref(iobuf); - iobref_add (iobref, iobuf); + if (iobref) + iobref_unref(iobref); - memcpy (iobuf->ptr, qr_inode->data + offset, op_ret); + return op_ret; +} - buf = qr_inode->buf; +int +qr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + qr_inode_t *qr_inode = NULL; - /* bump LRU */ - __qr_inode_register (table, qr_inode); - } -unlock: - UNLOCK (&table->lock); + qr_inode = qr_inode_ctx_get(this, fd->inode); + if (!qr_inode) + goto wind; - if (op_ret > 0) { - iov.iov_base = iobuf->ptr; - iov.iov_len = op_ret; + if (qr_readv_cached(frame, qr_inode, size, offset, flags, xdata) < 0) + goto wind; + + return 0; +wind: + STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + return 0; +} - STACK_UNWIND_STRICT (readv, frame, op_ret, 0, &iov, 1, - &buf, iobref, xdata); - } +int32_t +qr_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + qr_local_t *local = NULL; - if (iobuf) - iobuf_unref (iobuf); + local = frame->local; - if (iobref) - iobref_unref (iobref); + qr_inode_prune(this, local->fd->inode, local->incident_gen); - return op_ret; + QR_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; } - int -qr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t 
flags, dict_t *xdata) +qr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov, + int count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - qr_inode_t *qr_inode = NULL; + qr_local_t *local = NULL; - qr_inode = qr_inode_ctx_get (this, fd->inode); - if (!qr_inode) - goto wind; + local = qr_local_get(this, fd->inode); + local->fd = fd_ref(fd); - if (qr_readv_cached (frame, qr_inode, size, offset, flags, xdata) <= 0) - goto wind; + frame->local = local; - return 0; -wind: - STACK_WIND (frame, default_readv_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv, - fd, size, offset, flags, xdata); - return 0; + STACK_WIND(frame, qr_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, iov, count, offset, flags, + iobref, xdata); + return 0; } +int32_t +qr_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + qr_local_t *local = NULL; + + local = frame->local; + qr_inode_prune(this, local->inode, local->incident_gen); + + QR_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} int -qr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov, - int count, off_t offset, uint32_t flags, struct iobref *iobref, - dict_t *xdata) +qr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - qr_inode_prune (this, fd->inode); + qr_local_t *local = NULL; + + local = qr_local_get(this, loc->inode); + local->inode = inode_ref(loc->inode); + frame->local = local; - STACK_WIND (frame, default_writev_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev, - fd, iov, count, offset, flags, iobref, xdata); - return 0; + STACK_WIND(frame, qr_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; } +int32_t +qr_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + qr_local_t *local = NULL; + + local = frame->local; + qr_inode_prune(this, local->fd->inode, local->incident_gen); + + QR_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} int -qr_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, - dict_t *xdata) +qr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - qr_inode_prune (this, loc->inode); + qr_local_t *local = NULL; + + local = qr_local_get(this, fd->inode); + local->fd = fd_ref(fd); + frame->local = local; - STACK_WIND (frame, default_truncate_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->truncate, - loc, offset, xdata); - return 0; + STACK_WIND(frame, qr_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; } +int32_t +qr_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + qr_local_t *local = NULL; -int -qr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - dict_t *xdata) + local = frame->local; + qr_inode_prune(this, local->fd->inode, local->incident_gen); + + QR_STACK_UNWIND(fallocate, frame, op_ret, op_errno, pre, post, xdata); + return 0; +} + +static int +qr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int keep_size, + off_t offset, size_t len, dict_t *xdata) { - qr_inode_prune (this, 
fd->inode); + qr_local_t *local = NULL; - STACK_WIND (frame, default_ftruncate_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->ftruncate, - fd, offset, xdata); - return 0; + local = qr_local_get(this, fd->inode); + local->fd = fd_ref(fd); + frame->local = local; + + STACK_WIND(frame, qr_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset, len, + xdata); + return 0; } +int32_t +qr_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + qr_local_t *local = NULL; + + local = frame->local; + qr_inode_prune(this, local->fd->inode, local->incident_gen); -int -qr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, - fd_t *fd, dict_t *xdata) + QR_STACK_UNWIND(discard, frame, op_ret, op_errno, pre, post, xdata); + return 0; +} + +static int +qr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { - qr_inode_set_priority (this, fd->inode, loc->path); + qr_local_t *local = NULL; - STACK_WIND (frame, default_open_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->open, - loc, flags, fd, xdata); - return 0; + local = qr_local_get(this, fd->inode); + local->fd = fd_ref(fd); + frame->local = local; + + STACK_WIND(frame, qr_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; } -int -qr_forget (xlator_t *this, inode_t *inode) +int32_t +qr_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) { - qr_inode_t *qr_inode = NULL; + qr_local_t *local = NULL; - qr_inode = qr_inode_ctx_get (this, inode); + local = frame->local; + qr_inode_prune(this, local->fd->inode, local->incident_gen); - if (!qr_inode) - return 0; + QR_STACK_UNWIND(zerofill, frame, op_ret, op_errno, pre, post, xdata); + return 0; +} - qr_inode_prune (this, inode); +static int +qr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + qr_local_t *local = NULL; - GF_FREE (qr_inode); + local = qr_local_get(this, fd->inode); + local->fd = fd_ref(fd); + frame->local = local; - return 0; + STACK_WIND(frame, qr_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; } +int +qr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, + dict_t *xdata) +{ + qr_inode_set_priority(this, fd->inode, loc->path); -int32_t -qr_inodectx_dump (xlator_t *this, inode_t *inode) + STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +} + +int +qr_forget(xlator_t *this, inode_t *inode) { - qr_inode_t *qr_inode = NULL; - int32_t ret = -1; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - char buf[256] = {0, }; + qr_inode_t *qr_inode = NULL; - qr_inode = qr_inode_ctx_get (this, inode); - if (!qr_inode) - goto out; + qr_inode = qr_inode_ctx_get(this, inode); - gf_proc_dump_build_key (key_prefix, "xlator.performance.quick-read", - "inodectx"); - gf_proc_dump_add_section (key_prefix); + if (!qr_inode) + return 0; - gf_proc_dump_write ("entire-file-cached", "%s", qr_inode->data ? 
"yes" : "no"); + qr_inode_prune(this, inode, qr_get_generation(this, inode)); - if (qr_inode->last_refresh.tv_sec) { - gf_time_fmt (buf, sizeof buf, qr_inode->last_refresh.tv_sec, - gf_timefmt_FT); - snprintf (buf + strlen (buf), sizeof buf - strlen (buf), - ".%"GF_PRI_SUSECONDS, qr_inode->last_refresh.tv_usec); + GF_FREE(qr_inode); - gf_proc_dump_write ("last-cache-validation-time", "%s", buf); - } + return 0; +} - ret = 0; +int32_t +qr_inodectx_dump(xlator_t *this, inode_t *inode) +{ + qr_inode_t *qr_inode = NULL; + int32_t ret = -1; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + char buf[GF_TIMESTR_SIZE] = { + 0, + }; + + qr_inode = qr_inode_ctx_get(this, inode); + if (!qr_inode) + goto out; + + gf_proc_dump_build_key(key_prefix, "xlator.performance.quick-read", + "inodectx"); + gf_proc_dump_add_section("%s", key_prefix); + + gf_proc_dump_write("entire-file-cached", "%s", + qr_inode->data ? "yes" : "no"); + + if (qr_inode->last_refresh) { + gf_time_fmt(buf, sizeof buf, qr_inode->last_refresh, gf_timefmt_FT); + gf_proc_dump_write("last-cache-validation-time", "%s", buf); + } + + ret = 0; out: - return ret; + return ret; } - int -qr_priv_dump (xlator_t *this) +qr_priv_dump(xlator_t *this) { - qr_conf_t *conf = NULL; - qr_private_t *priv = NULL; - qr_inode_table_t *table = NULL; - uint32_t file_count = 0; - uint32_t i = 0; - qr_inode_t *curr = NULL; - uint64_t total_size = 0; - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - - if (!this) { - return -1; - } + qr_conf_t *conf = NULL; + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + uint32_t file_count = 0; + uint32_t i = 0; + qr_inode_t *curr = NULL; + uint64_t total_size = 0; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; - priv = this->private; - conf = &priv->conf; + if (!this) { + return -1; + } - if (!conf) - return -1; + priv = this->private; + conf = &priv->conf; + if (!conf) + return -1; - table = &priv->table; + table = &priv->table; - gf_proc_dump_build_key (key_prefix, "xlator.performance.quick-read", - "priv"); + gf_proc_dump_build_key(key_prefix, "xlator.performance.quick-read", "priv"); - gf_proc_dump_add_section (key_prefix); + gf_proc_dump_add_section("%s", key_prefix); - gf_proc_dump_write ("max_file_size", "%d", conf->max_file_size); - gf_proc_dump_write ("cache_timeout", "%d", conf->cache_timeout); + gf_proc_dump_write("max_file_size", "%" PRIu64, conf->max_file_size); + gf_proc_dump_write("cache_timeout", "%d", conf->cache_timeout); - if (!table) { - goto out; - } else { - for (i = 0; i < conf->max_pri; i++) { - list_for_each_entry (curr, &table->lru[i], lru) { - file_count++; - total_size += curr->size; - } - } + if (!table) { + goto out; + } else { + for (i = 0; i < conf->max_pri; i++) { + list_for_each_entry(curr, &table->lru[i], lru) + { + file_count++; + total_size += curr->size; + } } + } - gf_proc_dump_write ("total_files_cached", "%d", file_count); - gf_proc_dump_write ("total_cache_used", "%d", total_size); + gf_proc_dump_write("total_files_cached", "%d", file_count); + gf_proc_dump_write("total_cache_used", "%" PRIu64, total_size); + gf_proc_dump_write("cache-hit", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(priv->qr_counter.cache_hit)); + gf_proc_dump_write("cache-miss", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(priv->qr_counter.cache_miss)); + gf_proc_dump_write("cache-invalidations", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(priv->qr_counter.file_data_invals)); out: - return 0; + return 0; } +static int32_t +qr_dump_metrics(xlator_t *this, int fd) +{ + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + + priv = 
this->private; + table = &priv->table; + + dprintf(fd, "%s.total_files_cached %" PRId64 "\n", this->name, + GF_ATOMIC_GET(priv->qr_counter.files_cached)); + dprintf(fd, "%s.total_cache_used %" PRId64 "\n", this->name, + table->cache_used); + dprintf(fd, "%s.cache-hit %" PRId64 "\n", this->name, + GF_ATOMIC_GET(priv->qr_counter.cache_hit)); + dprintf(fd, "%s.cache-miss %" PRId64 "\n", this->name, + GF_ATOMIC_GET(priv->qr_counter.cache_miss)); + dprintf(fd, "%s.cache-invalidations %" PRId64 "\n", this->name, + GF_ATOMIC_GET(priv->qr_counter.file_data_invals)); + + return 0; +} int32_t -mem_acct_init (xlator_t *this) +qr_mem_acct_init(xlator_t *this) { - int ret = -1; - - if (!this) - return ret; + int ret = -1; - ret = xlator_mem_acct_init (this, gf_qr_mt_end + 1); + if (!this) + return ret; - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } + ret = xlator_mem_acct_init(this, gf_qr_mt_end + 1); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, QUICK_READ_MSG_NO_MEMORY, + "Memory accounting init failed"); return ret; -} + } + return ret; +} static gf_boolean_t -check_cache_size_ok (xlator_t *this, int64_t cache_size) +check_cache_size_ok(xlator_t *this, int64_t cache_size) { - int ret = _gf_true; - uint64_t total_mem = 0; - uint64_t max_cache_size = 0; - volume_option_t *opt = NULL; - - GF_ASSERT (this); - opt = xlator_volume_option_get (this, "cache-size"); - if (!opt) { - ret = _gf_false; - gf_log (this->name, GF_LOG_ERROR, - "could not get cache-size option"); - goto out; - } - - total_mem = get_mem_size (); - if (-1 == total_mem) - max_cache_size = opt->max; - else - max_cache_size = total_mem; - - gf_log (this->name, GF_LOG_DEBUG, "Max cache size is %"PRIu64, - max_cache_size); - if (cache_size > max_cache_size) { - ret = _gf_false; - gf_log (this->name, GF_LOG_ERROR, "Cache size %"PRIu64 - " is greater than the max size of %"PRIu64, - cache_size, max_cache_size); - goto out; - } + int ret = _gf_true; + uint64_t total_mem = 0; + uint64_t max_cache_size = 0; + volume_option_t *opt = NULL; + + GF_ASSERT(this); + opt = xlator_volume_option_get(this, "cache-size"); + if (!opt) { + ret = _gf_false; + gf_msg(this->name, GF_LOG_ERROR, EINVAL, + QUICK_READ_MSG_INVALID_ARGUMENT, + "could not get cache-size option"); + goto out; + } + + total_mem = get_mem_size(); + if (-1 == total_mem) + max_cache_size = opt->max; + else + max_cache_size = total_mem; + + gf_msg_debug(this->name, 0, "Max cache size is %" PRIu64, max_cache_size); + if (cache_size > max_cache_size) { + ret = _gf_false; + gf_msg(this->name, GF_LOG_ERROR, 0, QUICK_READ_MSG_INVALID_ARGUMENT, + "Cache size %" PRIu64 + " is greater than the max size of %" PRIu64, + cache_size, max_cache_size); + goto out; + } out: - return ret; + return ret; } int -reconfigure (xlator_t *this, dict_t *options) +qr_reconfigure(xlator_t *this, dict_t *options) { - int32_t ret = -1; - qr_private_t *priv = NULL; - qr_conf_t *conf = NULL; - uint64_t cache_size_new = 0; + int32_t ret = -1; + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + uint64_t cache_size_new = 0; - GF_VALIDATE_OR_GOTO ("quick-read", this, out); - GF_VALIDATE_OR_GOTO (this->name, this->private, out); - GF_VALIDATE_OR_GOTO (this->name, options, out); + GF_VALIDATE_OR_GOTO("quick-read", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, options, out); - priv = this->private; + priv = this->private; - conf = &priv->conf; - if (!conf) { - goto out; - } + conf = 
&priv->conf; + if (!conf) { + goto out; + } - GF_OPTION_RECONF ("cache-timeout", conf->cache_timeout, options, int32, - out); + GF_OPTION_RECONF("cache-timeout", conf->cache_timeout, options, int32, out); - GF_OPTION_RECONF ("cache-size", cache_size_new, options, size, out); - if (!check_cache_size_ok (this, cache_size_new)) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "Not reconfiguring cache-size"); - goto out; - } - conf->cache_size = cache_size_new; + GF_OPTION_RECONF("quick-read-cache-invalidation", conf->qr_invalidation, + options, bool, out); + + GF_OPTION_RECONF("ctime-invalidation", conf->ctime_invalidation, options, + bool, out); + + GF_OPTION_RECONF("cache-size", cache_size_new, options, size_uint64, out); + if (!check_cache_size_ok(this, cache_size_new)) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, EINVAL, QUICK_READ_MSG_INVALID_CONFIG, + "Not reconfiguring cache-size"); + goto out; + } + conf->cache_size = cache_size_new; - ret = 0; + ret = 0; out: - return ret; + return ret; } - int32_t -qr_get_priority_list (const char *opt_str, struct list_head *first) +qr_get_priority_list(const char *opt_str, struct list_head *first) { - int32_t max_pri = 1; - char *tmp_str = NULL; - char *tmp_str1 = NULL; - char *tmp_str2 = NULL; - char *dup_str = NULL; - char *priority_str = NULL; - char *pattern = NULL; - char *priority = NULL; - char *string = NULL; - struct qr_priority *curr = NULL, *tmp = NULL; - - GF_VALIDATE_OR_GOTO ("quick-read", opt_str, out); - GF_VALIDATE_OR_GOTO ("quick-read", first, out); - - string = gf_strdup (opt_str); - if (string == NULL) { - max_pri = -1; - goto out; - } - - /* Get the pattern for cache priority. - * "option priority *.jpg:1,abc*:2" etc - */ - /* TODO: inode_lru in table is statically hard-coded to 5, - * should be changed to run-time configuration - */ - priority_str = strtok_r (string, ",", &tmp_str); - while (priority_str) { - curr = GF_CALLOC (1, sizeof (*curr), gf_qr_mt_qr_priority_t); - if (curr == NULL) { - max_pri = -1; - goto out; - } - - list_add_tail (&curr->list, first); - - dup_str = gf_strdup (priority_str); - if (dup_str == NULL) { - max_pri = -1; - goto out; - } - - pattern = strtok_r (dup_str, ":", &tmp_str1); - if (!pattern) { - max_pri = -1; - goto out; - } - - priority = strtok_r (NULL, ":", &tmp_str1); - if (!priority) { - max_pri = -1; - goto out; - } - - gf_log ("quick-read", GF_LOG_TRACE, - "quick-read priority : pattern %s : priority %s", - pattern, - priority); - - curr->pattern = gf_strdup (pattern); - if (curr->pattern == NULL) { - max_pri = -1; - goto out; - } - - curr->priority = strtol (priority, &tmp_str2, 0); - if (tmp_str2 && (*tmp_str2)) { - max_pri = -1; - goto out; - } else { - max_pri = max (max_pri, curr->priority); - } - - GF_FREE (dup_str); - dup_str = NULL; - - priority_str = strtok_r (NULL, ",", &tmp_str); + int32_t max_pri = 1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *tmp_str2 = NULL; + char *dup_str = NULL; + char *priority_str = NULL; + char *pattern = NULL; + char *priority = NULL; + char *string = NULL; + struct qr_priority *curr = NULL, *tmp = NULL; + + GF_VALIDATE_OR_GOTO("quick-read", opt_str, out); + GF_VALIDATE_OR_GOTO("quick-read", first, out); + + string = gf_strdup(opt_str); + if (string == NULL) { + max_pri = -1; + goto out; + } + + /* Get the pattern for cache priority. 
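+     * The value is a comma-separated list of pattern:priority pairs; each
+     * pair is split with strtok_r() below, the numeric priority is parsed
+     * with strtol(), and max_pri tracks the highest priority seen, e.g.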
+ * "option priority *.jpg:1,abc*:2" etc + */ + /* TODO: inode_lru in table is statically hard-coded to 5, + * should be changed to run-time configuration + */ + priority_str = strtok_r(string, ",", &tmp_str); + while (priority_str) { + curr = GF_CALLOC(1, sizeof(*curr), gf_qr_mt_qr_priority_t); + if (curr == NULL) { + max_pri = -1; + goto out; } -out: - GF_FREE (string); - GF_FREE (dup_str); + list_add_tail(&curr->list, first); - if (max_pri == -1) { - list_for_each_entry_safe (curr, tmp, first, list) { - list_del_init (&curr->list); - GF_FREE (curr->pattern); - GF_FREE (curr); - } + dup_str = gf_strdup(priority_str); + if (dup_str == NULL) { + max_pri = -1; + goto out; } - return max_pri; -} - + pattern = strtok_r(dup_str, ":", &tmp_str1); + if (!pattern) { + max_pri = -1; + goto out; + } -int32_t -init (xlator_t *this) -{ - int32_t ret = -1, i = 0; - qr_private_t *priv = NULL; - qr_conf_t *conf = NULL; - - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: volume (%s) not configured with exactly one " - "child", this->name); - return -1; + priority = strtok_r(NULL, ":", &tmp_str1); + if (!priority) { + max_pri = -1; + goto out; } - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); + gf_msg_trace("quick-read", 0, + "quick-read priority : pattern %s : priority %s", pattern, + priority); + + curr->pattern = gf_strdup(pattern); + if (curr->pattern == NULL) { + max_pri = -1; + goto out; } - priv = GF_CALLOC (1, sizeof (*priv), gf_qr_mt_qr_private_t); - if (priv == NULL) { - ret = -1; - goto out; + curr->priority = strtol(priority, &tmp_str2, 0); + if (tmp_str2 && (*tmp_str2)) { + max_pri = -1; + goto out; + } else { + max_pri = max(max_pri, curr->priority); } - LOCK_INIT (&priv->table.lock); - conf = &priv->conf; + GF_FREE(dup_str); + dup_str = NULL; - GF_OPTION_INIT ("max-file-size", conf->max_file_size, size, out); + priority_str = strtok_r(NULL, ",", &tmp_str); + } +out: + GF_FREE(string); - GF_OPTION_INIT ("cache-timeout", conf->cache_timeout, int32, out); + GF_FREE(dup_str); - GF_OPTION_INIT ("cache-size", conf->cache_size, size, out); - if (!check_cache_size_ok (this, conf->cache_size)) { - ret = -1; - goto out; + if (max_pri == -1) { + list_for_each_entry_safe(curr, tmp, first, list) + { + list_del_init(&curr->list); + GF_FREE(curr->pattern); + GF_FREE(curr); } + } - INIT_LIST_HEAD (&conf->priority_list); - conf->max_pri = 1; - if (dict_get (this->options, "priority")) { - char *option_list = data_to_str (dict_get (this->options, - "priority")); - gf_log (this->name, GF_LOG_TRACE, - "option path %s", option_list); - /* parse the list of pattern:priority */ - conf->max_pri = qr_get_priority_list (option_list, - &conf->priority_list); - - if (conf->max_pri == -1) { - goto out; - } - conf->max_pri ++; - } + return max_pri; +} - priv->table.lru = GF_CALLOC (conf->max_pri, sizeof (*priv->table.lru), - gf_common_mt_list_head); - if (priv->table.lru == NULL) { - ret = -1; - goto out; +int32_t +qr_init(xlator_t *this) +{ + int32_t ret = -1, i = 0; + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + + if (!this->children || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, 0, + QUICK_READ_MSG_XLATOR_CHILD_MISCONFIGURED, + "FATAL: volume (%s) not configured with exactly one " + "child", + this->name); + return -1; + } + + if (!this->parents) { + gf_msg(this->name, GF_LOG_WARNING, 0, QUICK_READ_MSG_VOL_MISCONFIGURED, + "dangling volume. 
check volfile "); + } + + priv = GF_CALLOC(1, sizeof(*priv), gf_qr_mt_qr_private_t); + if (priv == NULL) { + ret = -1; + goto out; + } + + LOCK_INIT(&priv->table.lock); + conf = &priv->conf; + + GF_OPTION_INIT("max-file-size", conf->max_file_size, size_uint64, out); + + GF_OPTION_INIT("cache-timeout", conf->cache_timeout, int32, out); + + GF_OPTION_INIT("quick-read-cache-invalidation", conf->qr_invalidation, bool, + out); + + GF_OPTION_INIT("cache-size", conf->cache_size, size_uint64, out); + if (!check_cache_size_ok(this, conf->cache_size)) { + ret = -1; + goto out; + } + + GF_OPTION_INIT("ctime-invalidation", conf->ctime_invalidation, bool, out); + + INIT_LIST_HEAD(&conf->priority_list); + conf->max_pri = 1; + if (dict_get(this->options, "priority")) { + char *option_list = data_to_str(dict_get(this->options, "priority")); + gf_msg_trace(this->name, 0, "option path %s", option_list); + /* parse the list of pattern:priority */ + conf->max_pri = qr_get_priority_list(option_list, &conf->priority_list); + + if (conf->max_pri == -1) { + goto out; } + conf->max_pri++; + } - for (i = 0; i < conf->max_pri; i++) { - INIT_LIST_HEAD (&priv->table.lru[i]); - } + priv->table.lru = GF_CALLOC(conf->max_pri, sizeof(*priv->table.lru), + gf_common_mt_list_head); + if (priv->table.lru == NULL) { + ret = -1; + goto out; + } + + for (i = 0; i < conf->max_pri; i++) { + INIT_LIST_HEAD(&priv->table.lru[i]); + } - ret = 0; + ret = 0; - this->private = priv; + priv->last_child_down = gf_time(); + GF_ATOMIC_INIT(priv->generation, 0); + this->private = priv; out: - if ((ret == -1) && priv) { - GF_FREE (priv); - } + if ((ret == -1) && priv) { + GF_FREE(priv); + } - return ret; + return ret; } - void -qr_inode_table_destroy (qr_private_t *priv) +qr_inode_table_destroy(qr_private_t *priv) { - int i = 0; - qr_conf_t *conf = NULL; - - conf = &priv->conf; - - for (i = 0; i < conf->max_pri; i++) { - GF_ASSERT (list_empty (&priv->table.lru[i])); + int i = 0; + qr_conf_t *conf = NULL; + + conf = &priv->conf; + + for (i = 0; i < conf->max_pri; i++) { + /* There is a known leak of inodes, hence until + * that is fixed, log the assert as warning. 
+ GF_ASSERT (list_empty (&priv->table.lru[i]));*/ + if (!list_empty(&priv->table.lru[i])) { + gf_msg("quick-read", GF_LOG_INFO, 0, QUICK_READ_MSG_LRU_NOT_EMPTY, + "quick read inode table lru not empty"); } + } - LOCK_DESTROY (&priv->table.lock); + LOCK_DESTROY(&priv->table.lock); - return; + return; } +void +qr_conf_destroy(qr_conf_t *conf) +{ + struct qr_priority *curr = NULL, *tmp = NULL; + + list_for_each_entry_safe(curr, tmp, &conf->priority_list, list) + { + list_del(&curr->list); + GF_FREE(curr->pattern); + GF_FREE(curr); + } + + return; +} void -qr_conf_destroy (qr_conf_t *conf) +qr_update_child_down_time(xlator_t *this, time_t now) { - struct qr_priority *curr = NULL, *tmp = NULL; + qr_private_t *priv = NULL; + + priv = this->private; - list_for_each_entry_safe (curr, tmp, &conf->priority_list, list) { - list_del (&curr->list); - GF_FREE (curr->pattern); - GF_FREE (curr); + LOCK(&priv->lock); + { + priv->last_child_down = now; + } + UNLOCK(&priv->lock); +} + +static int +qr_invalidate(xlator_t *this, void *data) +{ + struct gf_upcall *up_data = NULL; + struct gf_upcall_cache_invalidation *up_ci = NULL; + inode_t *inode = NULL; + int ret = 0; + inode_table_t *itable = NULL; + qr_private_t *priv = NULL; + + up_data = (struct gf_upcall *)data; + + if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION) + goto out; + + priv = this->private; + up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; + + if (up_ci && (up_ci->flags & UP_WRITE_FLAGS)) { + GF_ATOMIC_INC(priv->qr_counter.file_data_invals); + itable = ((xlator_t *)this->graph->top)->itable; + inode = inode_find(itable, up_data->gfid); + if (!inode) { + ret = -1; + goto out; } + qr_inode_prune(this, inode, qr_get_generation(this, inode)); + } - return; +out: + if (inode) + inode_unref(inode); + + return ret; } +int +qr_notify(xlator_t *this, int event, void *data, ...) 
+{ + int ret = 0; + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + + priv = this->private; + conf = &priv->conf; + + switch (event) { + case GF_EVENT_CHILD_DOWN: + case GF_EVENT_SOME_DESCENDENT_DOWN: + qr_update_child_down_time(this, gf_time()); + break; + case GF_EVENT_UPCALL: + if (conf->qr_invalidation) + ret = qr_invalidate(this, data); + break; + default: + break; + } + + if (default_notify(this, event, data) != 0) + ret = -1; + + return ret; +} void -fini (xlator_t *this) +qr_fini(xlator_t *this) { - qr_private_t *priv = NULL; + qr_private_t *priv = NULL; - if (this == NULL) { - goto out; - } + if (this == NULL) { + goto out; + } - priv = this->private; - if (priv == NULL) { - goto out; - } + priv = this->private; + if (priv == NULL) { + goto out; + } - qr_inode_table_destroy (priv); - qr_conf_destroy (&priv->conf); + qr_inode_table_destroy(priv); + qr_conf_destroy(&priv->conf); - this->private = NULL; + this->private = NULL; - GF_FREE (priv); + GF_FREE(priv); out: - return; + return; } -struct xlator_fops fops = { - .lookup = qr_lookup, - .readdirp = qr_readdirp, - .open = qr_open, - .readv = qr_readv, - .writev = qr_writev, - .truncate = qr_truncate, - .ftruncate = qr_ftruncate -}; - -struct xlator_cbks cbks = { - .forget = qr_forget, +struct xlator_fops qr_fops = {.lookup = qr_lookup, + .readdirp = qr_readdirp, + .open = qr_open, + .readv = qr_readv, + .writev = qr_writev, + .truncate = qr_truncate, + .ftruncate = qr_ftruncate, + .fallocate = qr_fallocate, + .discard = qr_discard, + .zerofill = qr_zerofill}; + +struct xlator_cbks qr_cbks = { + .forget = qr_forget, }; -struct xlator_dumpops dumpops = { - .priv = qr_priv_dump, - .inodectx = qr_inodectx_dump, +struct xlator_dumpops qr_dumpops = { + .priv = qr_priv_dump, + .inodectx = qr_inodectx_dump, }; -struct volume_options options[] = { - { .key = {"priority"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"cache-size"}, - .type = GF_OPTION_TYPE_SIZET, - .min = 0, - .max = 32 * GF_UNIT_GB, - .default_value = "128MB", - .description = "Size of the read cache." 
- }, - { .key = {"cache-timeout"}, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = 60, - .default_value = "1", - }, - { .key = {"max-file-size"}, - .type = GF_OPTION_TYPE_SIZET, - .min = 0, - .max = 1 * GF_UNIT_KB * 1000, - .default_value = "64KB", - }, - { .key = {NULL} } +struct volume_options qr_options[] = { + { + .key = {"quick-read"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable quick-read", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + {.key = {"priority"}, .type = GF_OPTION_TYPE_ANY}, + {.key = {"cache-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = INFINITY, + .default_value = "128MB", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .description = "Size of small file read cache."}, + { + .key = {"cache-timeout"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "1", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + }, + { + .key = {"max-file-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = 1 * GF_UNIT_KB * 1000, + .default_value = "64KB", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + }, + { + .key = {"quick-read-cache-invalidation"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .description = "When \"on\", invalidates/updates the metadata cache," + " on receiving the cache-invalidation notifications", + }, + { + .key = {"ctime-invalidation"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_5_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .description = "Quick-read by default uses mtime to identify changes " + "to file data. However there are applications like " + "rsync which explicitly set mtime making it unreliable " + "for the purpose of identifying change in file content " + ". Since ctime also changes when content of a file " + " changes and it cannot be set explicitly, it becomes " + " suitable for identifying staleness of cached data. " + "This option makes quick-read to prefer ctime over " + "mtime to validate its cache. However, using ctime " + "can result in false positives as ctime changes with " + "just attribute changes like permission without " + "changes to file data. 
So, use this only when mtime " + "is not reliable", + }, + {.key = {NULL}}}; + +xlator_api_t xlator_api = { + .init = qr_init, + .fini = qr_fini, + .notify = qr_notify, + .reconfigure = qr_reconfigure, + .mem_acct_init = qr_mem_acct_init, + .dump_metrics = qr_dump_metrics, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &qr_dumpops, + .fops = &qr_fops, + .cbks = &qr_cbks, + .options = qr_options, + .identifier = "quick-read", + .category = GF_MAINTAINED, }; diff --git a/xlators/performance/quick-read/src/quick-read.h b/xlators/performance/quick-read/src/quick-read.h index 6f0a0541731..20fcc70b3a7 100644 --- a/xlators/performance/quick-read/src/quick-read.h +++ b/xlators/performance/quick-read/src/quick-read.h @@ -11,21 +11,16 @@ #ifndef __QUICK_READ_H #define __QUICK_READ_H -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "list.h" -#include "compat.h" -#include "compat-errno.h" -#include "common-utils.h" -#include "call-stub.h" -#include "defaults.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/list.h> +#include <glusterfs/compat.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/defaults.h> #include <libgen.h> #include <sys/time.h> #include <sys/types.h> @@ -34,48 +29,63 @@ #include <fnmatch.h> #include "quick-read-mem-types.h" - struct qr_inode { - void *data; - size_t size; - int priority; - uint32_t ia_mtime; - uint32_t ia_mtime_nsec; - struct iatt buf; - struct timeval last_refresh; - struct list_head lru; + void *data; + size_t size; + int priority; + uint32_t ia_mtime; + uint32_t ia_mtime_nsec; + uint32_t ia_ctime; + uint32_t ia_ctime_nsec; + uint32_t gen_rollover; + struct iatt buf; + time_t last_refresh; + struct list_head lru; + uint64_t gen; + uint64_t invalidation_time; }; typedef struct qr_inode qr_inode_t; - struct qr_priority { - char *pattern; - int32_t priority; - struct list_head list; + char *pattern; + int32_t priority; + struct list_head list; }; typedef struct qr_priority qr_priority_t; struct qr_conf { - uint64_t max_file_size; - int32_t cache_timeout; - uint64_t cache_size; - int max_pri; - struct list_head priority_list; + uint64_t max_file_size; + int32_t cache_timeout; + uint64_t cache_size; + int max_pri; + gf_boolean_t qr_invalidation; + gf_boolean_t ctime_invalidation; + struct list_head priority_list; }; typedef struct qr_conf qr_conf_t; struct qr_inode_table { - uint64_t cache_used; - struct list_head *lru; - gf_lock_t lock; + uint64_t cache_used; + struct list_head *lru; + gf_lock_t lock; }; typedef struct qr_inode_table qr_inode_table_t; +struct qr_statistics { + gf_atomic_t cache_hit; + gf_atomic_t cache_miss; + gf_atomic_t file_data_invals; /* No. 
of invalidates received from upcall */ + gf_atomic_t files_cached; +}; + struct qr_private { - qr_conf_t conf; - qr_inode_table_t table; + qr_conf_t conf; + qr_inode_table_t table; + time_t last_child_down; + gf_lock_t lock; + struct qr_statistics qr_counter; + gf_atomic_int32_t generation; }; typedef struct qr_private qr_private_t; - #endif /* #ifndef __QUICK_READ_H */ diff --git a/xlators/performance/read-ahead/src/Makefile.am b/xlators/performance/read-ahead/src/Makefile.am index be80ae7ac68..99efca3660c 100644 --- a/xlators/performance/read-ahead/src/Makefile.am +++ b/xlators/performance/read-ahead/src/Makefile.am @@ -1,14 +1,15 @@ xlator_LTLIBRARIES = read-ahead.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -read_ahead_la_LDFLAGS = -module -avoid-version +read_ahead_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) read_ahead_la_SOURCES = read-ahead.c page.c read_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = read-ahead.h read-ahead-mem-types.h +noinst_HEADERS = read-ahead.h read-ahead-mem-types.h read-ahead-messages.h -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/performance/read-ahead/src/page.c b/xlators/performance/read-ahead/src/page.c index e79e7ae7880..8a58ad8bb7a 100644 --- a/xlators/performance/read-ahead/src/page.c +++ b/xlators/performance/read-ahead/src/page.c @@ -8,446 +8,448 @@ cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> #include "read-ahead.h" #include <assert.h> +#include "read-ahead-messages.h" ra_page_t * -ra_page_get (ra_file_t *file, off_t offset) +ra_page_get(ra_file_t *file, off_t offset) { - ra_page_t *page = NULL; - off_t rounded_offset = 0; + ra_page_t *page = NULL; + off_t rounded_offset = 0; - GF_VALIDATE_OR_GOTO ("read-ahead", file, out); + GF_VALIDATE_OR_GOTO("read-ahead", file, out); - page = file->pages.next; - rounded_offset = floor (offset, file->page_size); + page = file->pages.next; + rounded_offset = gf_floor(offset, file->page_size); - while (page != &file->pages && page->offset < rounded_offset) - page = page->next; + while (page != &file->pages && page->offset < rounded_offset) + page = page->next; - if (page == &file->pages || page->offset != rounded_offset) - page = NULL; + if (page == &file->pages || page->offset != rounded_offset) + page = NULL; out: - return page; + return page; } - ra_page_t * -ra_page_create (ra_file_t *file, off_t offset) +ra_page_create(ra_file_t *file, off_t offset) { - ra_page_t *page = NULL; - off_t rounded_offset = 0; - ra_page_t *newpage = NULL; + ra_page_t *page = NULL; + off_t rounded_offset = 0; + ra_page_t *newpage = NULL; - GF_VALIDATE_OR_GOTO ("read-ahead", file, out); + GF_VALIDATE_OR_GOTO("read-ahead", file, out); - page = file->pages.next; - rounded_offset = floor (offset, file->page_size); + page = file->pages.next; + rounded_offset = gf_floor(offset, file->page_size); - while (page != &file->pages && page->offset < rounded_offset) - page = page->next; + while (page != &file->pages && page->offset < rounded_offset) + page = page->next; - if (page == &file->pages || 
page->offset != rounded_offset) { - newpage = GF_CALLOC (1, sizeof (*newpage), gf_ra_mt_ra_page_t); - if (!newpage) { - goto out; - } + if (page == &file->pages || page->offset != rounded_offset) { + newpage = GF_CALLOC(1, sizeof(*newpage), gf_ra_mt_ra_page_t); + if (!newpage) { + goto out; + } - newpage->offset = rounded_offset; - newpage->prev = page->prev; - newpage->next = page; - newpage->file = file; - page->prev->next = newpage; - page->prev = newpage; + newpage->offset = rounded_offset; + newpage->prev = page->prev; + newpage->next = page; + newpage->file = file; + page->prev->next = newpage; + page->prev = newpage; - page = newpage; - } + page = newpage; + } out: - return page; + return page; } - void -ra_wait_on_page (ra_page_t *page, call_frame_t *frame) +ra_wait_on_page(ra_page_t *page, call_frame_t *frame) { - ra_waitq_t *waitq = NULL; - ra_local_t *local = NULL; + ra_waitq_t *waitq = NULL; + ra_local_t *local = NULL; - GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); - GF_VALIDATE_OR_GOTO (frame->this->name, page, out); + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, page, out); - local = frame->local; + local = frame->local; - waitq = GF_CALLOC (1, sizeof (*waitq), gf_ra_mt_ra_waitq_t); - if (!waitq) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto out; - } + waitq = GF_CALLOC(1, sizeof(*waitq), gf_ra_mt_ra_waitq_t); + if (!waitq) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } - waitq->data = frame; - waitq->next = page->waitq; - page->waitq = waitq; + waitq->data = frame; + waitq->next = page->waitq; + page->waitq = waitq; - ra_local_lock (local); - { - local->wait_count++; - } - ra_local_unlock (local); + ra_local_lock(local); + { + local->wait_count++; + } + ra_local_unlock(local); out: - return; + return; } - void -ra_waitq_return (ra_waitq_t *waitq) +ra_waitq_return(ra_waitq_t *waitq) { - ra_waitq_t *trav = NULL; - ra_waitq_t *next = NULL; - call_frame_t *frame = NULL; + ra_waitq_t *trav = NULL; + ra_waitq_t *next = NULL; + call_frame_t *frame = NULL; - for (trav = waitq; trav; trav = next) { - next = trav->next; + for (trav = waitq; trav; trav = next) { + next = trav->next; - frame = trav->data; - ra_frame_return (frame); - GF_FREE (trav); - } + frame = trav->data; + ra_frame_return(frame); + GF_FREE(trav); + } - return; + return; } - int -ra_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref, - dict_t *xdata) +ra_fault_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { - ra_local_t *local = NULL; - off_t pending_offset = 0; - ra_file_t *file = NULL; - ra_page_t *page = NULL; - ra_waitq_t *waitq = NULL; - fd_t *fd = NULL; - uint64_t tmp_file = 0; - - GF_ASSERT (frame); - - local = frame->local; - fd = local->fd; - - fd_ctx_get (fd, this, &tmp_file); - - file = (ra_file_t *)(long)tmp_file; - pending_offset = local->pending_offset; - - if (file == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "read-ahead context not set in fd (%p)", fd); - op_ret = -1; - op_errno = EBADF; - goto out; + ra_local_t *local = NULL; + off_t pending_offset = 0; + ra_file_t *file = NULL; + ra_page_t *page = NULL; + ra_waitq_t *waitq = NULL; + fd_t *fd = NULL; + uint64_t tmp_file = 0; + gf_boolean_t stale = _gf_false; + + GF_ASSERT(frame); + + local = 
frame->local; + fd = local->fd; + + fd_ctx_get(fd, this, &tmp_file); + + file = (ra_file_t *)(long)tmp_file; + pending_offset = local->pending_offset; + + if (file == NULL) { + gf_msg(this->name, GF_LOG_WARNING, EBADF, + READ_AHEAD_MSG_FD_CONTEXT_NOT_SET, + "read-ahead context not set in fd (%p)", fd); + op_ret = -1; + op_errno = EBADF; + goto out; + } + + ra_file_lock(file); + { + if (op_ret >= 0) + file->stbuf = *stbuf; + + page = ra_page_get(file, pending_offset); + + if (!page) { + gf_msg_trace(this->name, 0, + "wasted copy: " + "%" PRId64 "[+%" PRId64 "] file=%p", + pending_offset, file->page_size, file); + goto unlock; } - ra_file_lock (file); - { - if (op_ret >= 0) - file->stbuf = *stbuf; - - page = ra_page_get (file, pending_offset); - - if (!page) { - gf_log (this->name, GF_LOG_TRACE, - "wasted copy: %"PRId64"[+%"PRId64"] file=%p", - pending_offset, file->page_size, file); - goto unlock; - } - - /* - * "Dirty" means that the request was a pure read-ahead; it's - * set for requests we issue ourselves, and cleared when user - * requests are issued or put on the waitq. "Poisoned" means - * that we got a write while a read was still in flight, and we - * couldn't stop it so we marked it instead. If it's both - * dirty and poisoned by the time we get here, we cancel its - * effect so that a subsequent user read doesn't get data that - * we know is stale (because we made it stale ourselves). We - * can't use ESTALE because that has special significance. - * ECANCELED has no such special meaning, and is close to what - * we're trying to indicate. - */ - if (page->dirty && page->poisoned) { - op_ret = -1; - op_errno = ECANCELED; - } - - if (op_ret < 0) { - waitq = ra_page_error (page, op_ret, op_errno); - goto unlock; - } - - if (page->vector) { - iobref_unref (page->iobref); - GF_FREE (page->vector); - } - - page->vector = iov_dup (vector, count); - if (page->vector == NULL) { - waitq = ra_page_error (page, -1, ENOMEM); - goto unlock; - } - - page->count = count; - page->iobref = iobref_ref (iobref); - page->ready = 1; - - page->size = iov_length (vector, count); - - waitq = ra_page_wakeup (page); + if (page->stale) { + page->stale = 0; + page->ready = 0; + stale = 1; + goto unlock; } -unlock: - ra_file_unlock (file); - - ra_waitq_return (waitq); - - fd_unref (local->fd); - mem_put (frame->local); - frame->local = NULL; - -out: - STACK_DESTROY (frame->root); - return 0; -} + /* + * "Dirty" means that the request was a pure read-ahead; it's + * set for requests we issue ourselves, and cleared when user + * requests are issued or put on the waitq. "Poisoned" means + * that we got a write while a read was still in flight, and we + * couldn't stop it so we marked it instead. If it's both + * dirty and poisoned by the time we get here, we cancel its + * effect so that a subsequent user read doesn't get data that + * we know is stale (because we made it stale ourselves). We + * can't use ESTALE because that has special significance. + * ECANCELED has no such special meaning, and is close to what + * we're trying to indicate. 
+ */ + if (page->dirty && page->poisoned) { + op_ret = -1; + op_errno = ECANCELED; + } + if (op_ret < 0) { + waitq = ra_page_error(page, op_ret, op_errno); + goto unlock; + } -void -ra_page_fault (ra_file_t *file, call_frame_t *frame, off_t offset) -{ - call_frame_t *fault_frame = NULL; - ra_local_t *fault_local = NULL; - ra_page_t *page = NULL; - ra_waitq_t *waitq = NULL; - int32_t op_ret = -1, op_errno = -1; - - GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); - GF_VALIDATE_OR_GOTO (frame->this->name, file, out); - - fault_frame = copy_frame (frame); - if (fault_frame == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto err; + if (page->vector) { + iobref_unref(page->iobref); + GF_FREE(page->vector); } - fault_local = mem_get0 (THIS->local_pool); - if (fault_local == NULL) { - STACK_DESTROY (fault_frame->root); - op_ret = -1; - op_errno = ENOMEM; - goto err; + page->vector = iov_dup(vector, count); + if (page->vector == NULL) { + waitq = ra_page_error(page, -1, ENOMEM); + goto unlock; } - fault_frame->local = fault_local; - fault_local->pending_offset = offset; - fault_local->pending_size = file->page_size; + page->count = count; + page->iobref = iobref_ref(iobref); + page->ready = 1; - fault_local->fd = fd_ref (file->fd); + page->size = iov_length(vector, count); - STACK_WIND (fault_frame, ra_fault_cbk, - FIRST_CHILD (fault_frame->this), - FIRST_CHILD (fault_frame->this)->fops->readv, - file->fd, file->page_size, offset, 0, NULL); + waitq = ra_page_wakeup(page); + } +unlock: + ra_file_unlock(file); - return; + if (stale) { + STACK_WIND(frame, ra_fault_cbk, FIRST_CHILD(frame->this), + FIRST_CHILD(frame->this)->fops->readv, local->fd, + local->pending_size, local->pending_offset, 0, NULL); -err: - ra_file_lock (file); - { - page = ra_page_get (file, offset); - if (page) - waitq = ra_page_error (page, op_ret, - op_errno); - } - ra_file_unlock (file); + return 0; + } - if (waitq != NULL) { - ra_waitq_return (waitq); - } + ra_waitq_return(waitq); + + fd_unref(local->fd); + + mem_put(frame->local); + frame->local = NULL; out: - return; + STACK_DESTROY(frame->root); + return 0; } - void -ra_frame_fill (ra_page_t *page, call_frame_t *frame) +ra_page_fault(ra_file_t *file, call_frame_t *frame, off_t offset) { - ra_local_t *local = NULL; - ra_fill_t *fill = NULL; - off_t src_offset = 0; - off_t dst_offset = 0; - ssize_t copy_size = 0; - ra_fill_t *new = NULL; - - GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); - GF_VALIDATE_OR_GOTO (frame->this->name, page, out); + call_frame_t *fault_frame = NULL; + ra_local_t *fault_local = NULL; + ra_page_t *page = NULL; + ra_waitq_t *waitq = NULL; + int32_t op_ret = -1, op_errno = -1; + + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, file, out); + + fault_frame = copy_frame(frame); + if (fault_frame == NULL) { + op_ret = -1; + op_errno = ENOMEM; + goto err; + } + + fault_local = mem_get0(THIS->local_pool); + if (fault_local == NULL) { + STACK_DESTROY(fault_frame->root); + op_ret = -1; + op_errno = ENOMEM; + goto err; + } + + fault_frame->local = fault_local; + fault_local->pending_offset = offset; + fault_local->pending_size = file->page_size; + + fault_local->fd = fd_ref(file->fd); + + STACK_WIND(fault_frame, ra_fault_cbk, FIRST_CHILD(fault_frame->this), + FIRST_CHILD(fault_frame->this)->fops->readv, file->fd, + file->page_size, offset, 0, NULL); + + return; - local = frame->local; - fill = &local->fill; - - if (local->op_ret != -1 && page->size) { - if (local->offset > page->offset) - src_offset = local->offset - 
page->offset; - else - dst_offset = page->offset - local->offset; - - copy_size = min (page->size - src_offset, - local->size - dst_offset); - - if (copy_size < 0) { - /* if page contains fewer bytes and the required offset - is beyond the page size in the page */ - copy_size = src_offset = 0; - } - - fill = fill->next; - while (fill != &local->fill) { - if (fill->offset > page->offset) { - break; - } - fill = fill->next; - } - - new = GF_CALLOC (1, sizeof (*new), gf_ra_mt_ra_fill_t); - if (new == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto out; - } - - new->offset = page->offset; - new->size = copy_size; - new->iobref = iobref_ref (page->iobref); - new->count = iov_subset (page->vector, page->count, - src_offset, src_offset+copy_size, - NULL); - new->vector = GF_CALLOC (new->count, sizeof (struct iovec), - gf_ra_mt_iovec); - if (new->vector == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - GF_FREE (new); - goto out; - } - - new->count = iov_subset (page->vector, page->count, - src_offset, src_offset+copy_size, - new->vector); - - new->next = fill; - new->prev = new->next->prev; - new->next->prev = new; - new->prev->next = new; - - local->op_ret += copy_size; - } +err: + ra_file_lock(file); + { + page = ra_page_get(file, offset); + if (page) + waitq = ra_page_error(page, op_ret, op_errno); + } + ra_file_unlock(file); + + if (waitq != NULL) { + ra_waitq_return(waitq); + } out: - return; + return; } - void -ra_frame_unwind (call_frame_t *frame) +ra_frame_fill(ra_page_t *page, call_frame_t *frame) { - ra_local_t *local = NULL; - ra_fill_t *fill = NULL; - int32_t count = 0; - struct iovec *vector = NULL; - int32_t copied = 0; - struct iobref *iobref = NULL; - ra_fill_t *next = NULL; - fd_t *fd = NULL; - ra_file_t *file = NULL; - uint64_t tmp_file = 0; - - GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); - - local = frame->local; - fill = local->fill.next; - - iobref = iobref_new (); - if (iobref == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; + ra_local_t *local = NULL; + ra_fill_t *fill = NULL; + off_t src_offset = 0; + off_t dst_offset = 0; + ssize_t copy_size = 0; + ra_fill_t *new = NULL; + + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, page, out); + + local = frame->local; + fill = &local->fill; + + if (local->op_ret != -1 && page->size) { + if (local->offset > page->offset) + src_offset = local->offset - page->offset; + else + dst_offset = page->offset - local->offset; + + copy_size = min(page->size - src_offset, local->size - dst_offset); + + if (copy_size < 0) { + /* if page contains fewer bytes and the required offset + is beyond the page size in the page */ + copy_size = src_offset = 0; } - frame->local = NULL; - + fill = fill->next; while (fill != &local->fill) { - count += fill->count; - fill = fill->next; + if (fill->offset > page->offset) { + break; + } + fill = fill->next; } - vector = GF_CALLOC (count, sizeof (*vector), gf_ra_mt_iovec); - if (vector == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - iobref_unref (iobref); - iobref = NULL; + new = GF_CALLOC(1, sizeof(*new), gf_ra_mt_ra_fill_t); + if (new == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + new->offset = page->offset; + new->size = copy_size; + new->iobref = iobref_ref(page->iobref); + new->count = iov_subset(page->vector, page->count, src_offset, + copy_size, &new->vector, 0); + if (new->count < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; + iobref_unref(new->iobref); + 
GF_FREE(new); + goto out; } - fill = local->fill.next; + new->next = fill; + new->prev = new->next->prev; + new->next->prev = new; + new->prev->next = new; - while (fill != &local->fill) { - next = fill->next; + local->op_ret += copy_size; + } - if ((vector != NULL) && (iobref != NULL)) { - memcpy (((char *)vector) + copied, fill->vector, - fill->count * sizeof (*vector)); +out: + return; +} - copied += (fill->count * sizeof (*vector)); - iobref_merge (iobref, fill->iobref); - } +void +ra_frame_unwind(call_frame_t *frame) +{ + ra_local_t *local = NULL; + ra_fill_t *fill = NULL; + int32_t count = 0; + struct iovec *vector = NULL; + int32_t copied = 0; + struct iobref *iobref = NULL; + ra_fill_t *next = NULL; + fd_t *fd = NULL; + ra_file_t *file = NULL; + uint64_t tmp_file = 0; + + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + + local = frame->local; + fill = local->fill.next; + + iobref = iobref_new(); + if (iobref == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } + + frame->local = NULL; + + while (fill != &local->fill) { + count += fill->count; + fill = fill->next; + } + + vector = GF_CALLOC(count, sizeof(*vector), gf_ra_mt_iovec); + if (vector == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + iobref_unref(iobref); + iobref = NULL; + } + + fill = local->fill.next; + + while (fill != &local->fill) { + next = fill->next; + + if ((vector != NULL) && (iobref != NULL)) { + memcpy(((char *)vector) + copied, fill->vector, + fill->count * sizeof(*vector)); + + copied += (fill->count * sizeof(*vector)); + if (iobref_merge(iobref, fill->iobref)) { + local->op_ret = -1; + local->op_errno = ENOMEM; + iobref_unref(iobref); + iobref = NULL; + } + } - fill->next->prev = fill->prev; - fill->prev->next = fill->prev; + fill->next->prev = fill->prev; + fill->prev->next = fill->prev; - iobref_unref (fill->iobref); - GF_FREE (fill->vector); - GF_FREE (fill); + iobref_unref(fill->iobref); + GF_FREE(fill->vector); + GF_FREE(fill); - fill = next; - } + fill = next; + } - fd = local->fd; - fd_ctx_get (fd, frame->this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; + fd = local->fd; + fd_ctx_get(fd, frame->this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; - STACK_UNWIND_STRICT (readv, frame, local->op_ret, local->op_errno, - vector, count, &file->stbuf, iobref, NULL); + STACK_UNWIND_STRICT(readv, frame, local->op_ret, local->op_errno, vector, + count, &file->stbuf, iobref, NULL); - iobref_unref (iobref); - pthread_mutex_destroy (&local->local_lock); - mem_put (local); - GF_FREE (vector); + iobref_unref(iobref); + pthread_mutex_destroy(&local->local_lock); + mem_put(local); + GF_FREE(vector); out: - return; + return; } /* @@ -456,27 +458,27 @@ out: * */ void -ra_frame_return (call_frame_t *frame) +ra_frame_return(call_frame_t *frame) { - ra_local_t *local = NULL; - int32_t wait_count = 0; + ra_local_t *local = NULL; + int32_t wait_count = 0; - GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); - local = frame->local; - GF_ASSERT (local->wait_count > 0); + local = frame->local; + GF_ASSERT(local->wait_count > 0); - ra_local_lock (local); - { - wait_count = --local->wait_count; - } - ra_local_unlock (local); + ra_local_lock(local); + { + wait_count = --local->wait_count; + } + ra_local_unlock(local); - if (!wait_count) - ra_frame_unwind (frame); + if (!wait_count) + ra_frame_unwind(frame); out: - return; + return; } /* @@ -485,26 +487,26 @@ out: * */ ra_waitq_t * -ra_page_wakeup (ra_page_t *page) +ra_page_wakeup(ra_page_t *page) { - 
ra_waitq_t *waitq = NULL, *trav = NULL; - call_frame_t *frame = NULL; + ra_waitq_t *waitq = NULL, *trav = NULL; + call_frame_t *frame = NULL; - GF_VALIDATE_OR_GOTO ("read-ahead", page, out); + GF_VALIDATE_OR_GOTO("read-ahead", page, out); - waitq = page->waitq; - page->waitq = NULL; + waitq = page->waitq; + page->waitq = NULL; - for (trav = waitq; trav; trav = trav->next) { - frame = trav->data; - ra_frame_fill (page, frame); - } + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; + ra_frame_fill(page, frame); + } - if (page->stale) { - ra_page_purge (page); - } + if (page->stale) { + ra_page_purge(page); + } out: - return waitq; + return waitq; } /* @@ -513,22 +515,22 @@ out: * */ void -ra_page_purge (ra_page_t *page) +ra_page_purge(ra_page_t *page) { - GF_VALIDATE_OR_GOTO ("read-ahead", page, out); + GF_VALIDATE_OR_GOTO("read-ahead", page, out); - page->prev->next = page->next; - page->next->prev = page->prev; + page->prev->next = page->next; + page->next->prev = page->prev; - if (page->iobref) { - iobref_unref (page->iobref); - } + if (page->iobref) { + iobref_unref(page->iobref); + } - GF_FREE (page->vector); - GF_FREE (page); + GF_FREE(page->vector); + GF_FREE(page); out: - return; + return; } /* @@ -539,32 +541,32 @@ out: * */ ra_waitq_t * -ra_page_error (ra_page_t *page, int32_t op_ret, int32_t op_errno) +ra_page_error(ra_page_t *page, int32_t op_ret, int32_t op_errno) { - ra_waitq_t *waitq = NULL; - ra_waitq_t *trav = NULL; - call_frame_t *frame = NULL; - ra_local_t *local = NULL; + ra_waitq_t *waitq = NULL; + ra_waitq_t *trav = NULL; + call_frame_t *frame = NULL; + ra_local_t *local = NULL; - GF_VALIDATE_OR_GOTO ("read-ahead", page, out); + GF_VALIDATE_OR_GOTO("read-ahead", page, out); - waitq = page->waitq; - page->waitq = NULL; + waitq = page->waitq; + page->waitq = NULL; - for (trav = waitq; trav; trav = trav->next) { - frame = trav->data; + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; - local = frame->local; - if (local->op_ret != -1) { - local->op_ret = op_ret; - local->op_errno = op_errno; - } + local = frame->local; + if (local->op_ret != -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; } + } - ra_page_purge (page); + ra_page_purge(page); out: - return waitq; + return waitq; } /* @@ -573,31 +575,31 @@ out: * */ void -ra_file_destroy (ra_file_t *file) +ra_file_destroy(ra_file_t *file) { - ra_conf_t *conf = NULL; - ra_page_t *trav = NULL; + ra_conf_t *conf = NULL; + ra_page_t *trav = NULL; - GF_VALIDATE_OR_GOTO ("read-ahead", file, out); + GF_VALIDATE_OR_GOTO("read-ahead", file, out); - conf = file->conf; + conf = file->conf; - ra_conf_lock (conf); - { - file->prev->next = file->next; - file->next->prev = file->prev; - } - ra_conf_unlock (conf); + ra_conf_lock(conf); + { + file->prev->next = file->next; + file->next->prev = file->prev; + } + ra_conf_unlock(conf); + trav = file->pages.next; + while (trav != &file->pages) { + ra_page_error(trav, -1, EINVAL); trav = file->pages.next; - while (trav != &file->pages) { - ra_page_error (trav, -1, EINVAL); - trav = file->pages.next; - } + } - pthread_mutex_destroy (&file->file_lock); - GF_FREE (file); + pthread_mutex_destroy(&file->file_lock); + GF_FREE(file); out: - return; + return; } diff --git a/xlators/performance/read-ahead/src/read-ahead-mem-types.h b/xlators/performance/read-ahead/src/read-ahead-mem-types.h index 219e2928919..f07cfc5bba5 100644 --- a/xlators/performance/read-ahead/src/read-ahead-mem-types.h +++ b/xlators/performance/read-ahead/src/read-ahead-mem-types.h @@ 
-8,19 +8,18 @@ cases as published by the Free Software Foundation. */ - #ifndef __RA_MEM_TYPES_H__ #define __RA_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_ra_mem_types_ { - gf_ra_mt_ra_file_t = gf_common_mt_end + 1, - gf_ra_mt_ra_conf_t, - gf_ra_mt_ra_page_t, - gf_ra_mt_ra_waitq_t, - gf_ra_mt_ra_fill_t, - gf_ra_mt_iovec, - gf_ra_mt_end + gf_ra_mt_ra_file_t = gf_common_mt_end + 1, + gf_ra_mt_ra_conf_t, + gf_ra_mt_ra_page_t, + gf_ra_mt_ra_waitq_t, + gf_ra_mt_ra_fill_t, + gf_ra_mt_iovec, + gf_ra_mt_end }; #endif diff --git a/xlators/performance/read-ahead/src/read-ahead-messages.h b/xlators/performance/read-ahead/src/read-ahead-messages.h new file mode 100644 index 00000000000..0302b7a7122 --- /dev/null +++ b/xlators/performance/read-ahead/src/read-ahead-messages.h @@ -0,0 +1,31 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _READ_AHEAD_MESSAGES_H_ +#define _READ_AHEAD_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(READ_AHEAD, READ_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED, + READ_AHEAD_MSG_VOL_MISCONFIGURED, READ_AHEAD_MSG_NO_MEMORY, + READ_AHEAD_MSG_FD_CONTEXT_NOT_SET, + READ_AHEAD_MSG_UNDESTROYED_FILE_FOUND, + READ_AHEAD_MSG_XLATOR_CONF_NULL); + +#endif /* _READ_AHEAD_MESSAGES_H_ */ diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c index 241fa477fda..5246e1317d2 100644 --- a/xlators/performance/read-ahead/src/read-ahead.c +++ b/xlators/performance/read-ahead/src/read-ahead.c @@ -15,197 +15,187 @@ - ensure efficient memory management in case of random seek */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> #include "read-ahead.h" -#include "statedump.h" +#include <glusterfs/statedump.h> #include <assert.h> #include <sys/time.h> +#include "read-ahead-messages.h" static void -read_ahead (call_frame_t *frame, ra_file_t *file); - +read_ahead(call_frame_t *frame, ra_file_t *file); int -ra_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +ra_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) { - ra_conf_t *conf = NULL; - ra_file_t *file = NULL; - int ret = 0; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - - conf = this->private; - - if (op_ret == -1) { - goto unwind; - } - - file = GF_CALLOC (1, sizeof (*file), gf_ra_mt_ra_file_t); - if (!file) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - - /* If O_DIRECT open, we disable caching on it */ - - if ((fd->flags & O_DIRECT) || ((fd->flags & 
O_ACCMODE) == O_WRONLY)) - file->disabled = 1; - - file->offset = (unsigned long long) 0; - file->conf = conf; - file->pages.next = &file->pages; - file->pages.prev = &file->pages; - file->pages.offset = (unsigned long long) 0; - file->pages.file = file; - - ra_conf_lock (conf); - { - file->next = conf->files.next; - conf->files.next = file; - file->next->prev = file; - file->prev = &conf->files; - } - ra_conf_unlock (conf); - - file->fd = fd; - file->page_count = conf->page_count; - file->page_size = conf->page_size; - pthread_mutex_init (&file->file_lock, NULL); - - if (!file->disabled) { - file->page_count = 1; - } - - ret = fd_ctx_set (fd, this, (uint64_t)(long)file); - if (ret == -1) { - gf_log (frame->this->name, GF_LOG_WARNING, - "cannot set read-ahead context information in fd (%p)", - fd); - ra_file_destroy (file); - op_ret = -1; - op_errno = ENOMEM; - } + ra_conf_t *conf = NULL; + ra_file_t *file = NULL; + int ret = 0; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + + conf = this->private; + + if (op_ret == -1) { + goto unwind; + } + + file = GF_CALLOC(1, sizeof(*file), gf_ra_mt_ra_file_t); + if (!file) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + /* If O_DIRECT open, we disable caching on it */ + + if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY)) + file->disabled = 1; + + file->offset = (unsigned long long)0; + file->conf = conf; + file->pages.next = &file->pages; + file->pages.prev = &file->pages; + file->pages.offset = (unsigned long long)0; + file->pages.file = file; + + ra_conf_lock(conf); + { + file->next = conf->files.next; + conf->files.next = file; + file->next->prev = file; + file->prev = &conf->files; + } + ra_conf_unlock(conf); + + file->fd = fd; + file->page_count = conf->page_count; + file->page_size = conf->page_size; + pthread_mutex_init(&file->file_lock, NULL); + + if (!file->disabled) { + file->page_count = 1; + } + + ret = fd_ctx_set(fd, this, (uint64_t)(long)file); + if (ret == -1) { + gf_msg(frame->this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_NO_MEMORY, + "cannot set read-ahead context" + "information in fd (%p)", + fd); + ra_file_destroy(file); + op_ret = -1; + op_errno = ENOMEM; + } unwind: - frame->local = NULL; + frame->local = NULL; - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata); - return 0; + return 0; } - int -ra_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +ra_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - ra_conf_t *conf = NULL; - ra_file_t *file = NULL; - int ret = 0; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - - conf = this->private; - - if (op_ret == -1) { - goto unwind; - } - - file = GF_CALLOC (1, sizeof (*file), gf_ra_mt_ra_file_t); - if (!file) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - - /* If O_DIRECT open, we disable caching on it */ - - if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY)) - file->disabled = 1; - - file->offset = (unsigned long long) 0; - //file->size = fd->inode->buf.ia_size; - file->conf = conf; - file->pages.next = &file->pages; - file->pages.prev = &file->pages; - 
file->pages.offset = (unsigned long long) 0; - file->pages.file = file; - - ra_conf_lock (conf); - { - file->next = conf->files.next; - conf->files.next = file; - file->next->prev = file; - file->prev = &conf->files; - } - ra_conf_unlock (conf); - - file->fd = fd; - file->page_count = conf->page_count; - file->page_size = conf->page_size; - pthread_mutex_init (&file->file_lock, NULL); - - ret = fd_ctx_set (fd, this, (uint64_t)(long)file); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "cannot set read ahead context information in fd (%p)", - fd); - ra_file_destroy (file); - op_ret = -1; - op_errno = ENOMEM; - } + ra_conf_t *conf = NULL; + ra_file_t *file = NULL; + int ret = 0; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + + conf = this->private; + + if (op_ret == -1) { + goto unwind; + } + + file = GF_CALLOC(1, sizeof(*file), gf_ra_mt_ra_file_t); + if (!file) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + /* If O_DIRECT open, we disable caching on it */ + + if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY)) + file->disabled = 1; + + file->offset = (unsigned long long)0; + // file->size = fd->inode->buf.ia_size; + file->conf = conf; + file->pages.next = &file->pages; + file->pages.prev = &file->pages; + file->pages.offset = (unsigned long long)0; + file->pages.file = file; + + ra_conf_lock(conf); + { + file->next = conf->files.next; + conf->files.next = file; + file->next->prev = file; + file->prev = &conf->files; + } + ra_conf_unlock(conf); + + file->fd = fd; + file->page_count = conf->page_count; + file->page_size = conf->page_size; + pthread_mutex_init(&file->file_lock, NULL); + + ret = fd_ctx_set(fd, this, (uint64_t)(long)file); + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_NO_MEMORY, + "cannot set read ahead context" + "information in fd (%p)", + fd); + ra_file_destroy(file); + op_ret = -1; + op_errno = ENOMEM; + } unwind: - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent, xdata); + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); - return 0; + return 0; } - int -ra_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, dict_t *xdata) +ra_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { - GF_ASSERT (frame); - GF_ASSERT (this); + GF_ASSERT(frame); + GF_ASSERT(this); - STACK_WIND (frame, ra_open_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->open, - loc, flags, fd, xdata); + STACK_WIND(frame, ra_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); - return 0; + return 0; } - int -ra_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +ra_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - GF_ASSERT (frame); - GF_ASSERT (this); + GF_ASSERT(frame); + GF_ASSERT(this); - STACK_WIND (frame, ra_create_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - loc, flags, mode, umask, fd, xdata); + STACK_WIND(frame, ra_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); - return 0; + return 0; } /* free cache pages between offset and offset+size, @@ -213,733 +203,710 @@ ra_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, */ static void -flush_region 
(call_frame_t *frame, ra_file_t *file, off_t offset, off_t size, - int for_write) +flush_region(call_frame_t *frame, ra_file_t *file, off_t offset, off_t size, + int for_write) { - ra_page_t *trav = NULL; - ra_page_t *next = NULL; - - ra_file_lock (file); - { - trav = file->pages.next; - while (trav != &file->pages - && trav->offset < (offset + size)) { - - next = trav->next; - if (trav->offset >= offset) { - if (!trav->waitq) { - ra_page_purge (trav); - } - else { - trav->stale = 1; - - if (for_write) { - trav->poisoned = 1; - } - } - } - trav = next; + ra_page_t *trav = NULL; + ra_page_t *next = NULL; + + ra_file_lock(file); + { + trav = file->pages.next; + while (trav != &file->pages && trav->offset < (offset + size)) { + next = trav->next; + if (trav->offset >= offset) { + if (!trav->waitq) { + ra_page_purge(trav); + } else { + trav->stale = 1; + + if (for_write) { + trav->poisoned = 1; + } } + } + trav = next; } - ra_file_unlock (file); + } + ra_file_unlock(file); } - int -ra_release (xlator_t *this, fd_t *fd) +ra_release(xlator_t *this, fd_t *fd) { - uint64_t tmp_file = 0; - int ret = 0; + uint64_t tmp_file = 0; + int ret = 0; - GF_VALIDATE_OR_GOTO ("read-ahead", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO("read-ahead", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); - ret = fd_ctx_del (fd, this, &tmp_file); + ret = fd_ctx_del(fd, this, &tmp_file); - if (!ret) { - ra_file_destroy ((ra_file_t *)(long)tmp_file); - } + if (!ret) { + ra_file_destroy((ra_file_t *)(long)tmp_file); + } out: - return 0; + return 0; } - void -read_ahead (call_frame_t *frame, ra_file_t *file) +read_ahead(call_frame_t *frame, ra_file_t *file) { - off_t ra_offset = 0; - size_t ra_size = 0; - off_t trav_offset = 0; - ra_page_t *trav = NULL; - off_t cap = 0; - char fault = 0; - - GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); - GF_VALIDATE_OR_GOTO (frame->this->name, file, out); - - if (!file->page_count) { - goto out; + off_t ra_offset = 0; + size_t ra_size = 0; + off_t trav_offset = 0; + ra_page_t *trav = NULL; + off_t cap = 0; + char fault = 0; + + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, file, out); + + if (!file->page_count) { + goto out; + } + + ra_size = file->page_size * file->page_count; + ra_offset = gf_floor(file->offset, file->page_size); + cap = file->size ? file->size : file->offset + ra_size; + + while (ra_offset < min(file->offset + ra_size, cap)) { + ra_file_lock(file); + { + trav = ra_page_get(file, ra_offset); } + ra_file_unlock(file); - ra_size = file->page_size * file->page_count; - ra_offset = floor (file->offset, file->page_size); - cap = file->size ? file->size : file->offset + ra_size; + if (!trav) + break; - while (ra_offset < min (file->offset + ra_size, cap)) { + ra_offset += file->page_size; + } - ra_file_lock (file); - { - trav = ra_page_get (file, ra_offset); - } - ra_file_unlock (file); + if (trav) { + /* comfortable enough */ + goto out; + } - if (!trav) - break; + trav_offset = ra_offset; - ra_offset += file->page_size; - } + cap = file->size ? file->size : ra_offset + ra_size; - if (trav) { - /* comfortable enough */ - goto out; + while (trav_offset < min(ra_offset + ra_size, cap)) { + fault = 0; + ra_file_lock(file); + { + trav = ra_page_get(file, trav_offset); + if (!trav) { + fault = 1; + trav = ra_page_create(file, trav_offset); + if (trav) + trav->dirty = 1; + } } + ra_file_unlock(file); - trav_offset = ra_offset; - - cap = file->size ? 
file->size : ra_offset + ra_size; - - while (trav_offset < min(ra_offset + ra_size, cap)) { - fault = 0; - ra_file_lock (file); - { - trav = ra_page_get (file, trav_offset); - if (!trav) { - fault = 1; - trav = ra_page_create (file, trav_offset); - if (trav) - trav->dirty = 1; - } - } - ra_file_unlock (file); - - if (!trav) { - /* OUT OF MEMORY */ - break; - } + if (!trav) { + /* OUT OF MEMORY */ + break; + } - if (fault) { - gf_log (frame->this->name, GF_LOG_TRACE, - "RA at offset=%"PRId64, trav_offset); - ra_page_fault (file, frame, trav_offset); - } - trav_offset += file->page_size; + if (fault) { + gf_msg_trace(frame->this->name, 0, "RA at offset=%" PRId64, + trav_offset); + ra_page_fault(file, frame, trav_offset); } + trav_offset += file->page_size; + } out: - return; + return; } - int -ra_need_atime_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref, - dict_t *xdata) +ra_need_atime_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) { - GF_ASSERT (frame); - STACK_DESTROY (frame->root); - return 0; + GF_ASSERT(frame); + STACK_DESTROY(frame->root); + return 0; } - static void -dispatch_requests (call_frame_t *frame, ra_file_t *file) +dispatch_requests(call_frame_t *frame, ra_file_t *file) { - ra_local_t *local = NULL; - ra_conf_t *conf = NULL; - off_t rounded_offset = 0; - off_t rounded_end = 0; - off_t trav_offset = 0; - ra_page_t *trav = NULL; - call_frame_t *ra_frame = NULL; - char need_atime_update = 1; - char fault = 0; - - GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); - GF_VALIDATE_OR_GOTO (frame->this->name, file, out); - - local = frame->local; - conf = file->conf; - - rounded_offset = floor (local->offset, file->page_size); - rounded_end = roof (local->offset + local->size, file->page_size); - - trav_offset = rounded_offset; - - while (trav_offset < rounded_end) { - fault = 0; - - ra_file_lock (file); - { - trav = ra_page_get (file, trav_offset); - if (!trav) { - trav = ra_page_create (file, trav_offset); - if (!trav) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unlock; - } - fault = 1; - need_atime_update = 0; - } - trav->dirty = 0; - - if (trav->ready) { - gf_log (frame->this->name, GF_LOG_TRACE, - "HIT at offset=%"PRId64".", - trav_offset); - ra_frame_fill (trav, frame); - } else { - gf_log (frame->this->name, GF_LOG_TRACE, - "IN-TRANSIT at offset=%"PRId64".", - trav_offset); - ra_wait_on_page (trav, frame); - need_atime_update = 0; - } - } - unlock: - ra_file_unlock (file); + ra_local_t *local = NULL; + ra_conf_t *conf = NULL; + off_t rounded_offset = 0; + off_t rounded_end = 0; + off_t trav_offset = 0; + ra_page_t *trav = NULL; + call_frame_t *ra_frame = NULL; + char need_atime_update = 1; + char fault = 0; - if (local->op_ret == -1) { - goto out; - } + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, file, out); + + local = frame->local; + conf = file->conf; + + rounded_offset = gf_floor(local->offset, file->page_size); + rounded_end = gf_roof(local->offset + local->size, file->page_size); - if (fault) { - gf_log (frame->this->name, GF_LOG_TRACE, - "MISS at offset=%"PRId64".", - trav_offset); - ra_page_fault (file, frame, trav_offset); + trav_offset = rounded_offset; + + while (trav_offset < rounded_end) { + fault = 0; + + ra_file_lock(file); + { + trav = 
ra_page_get(file, trav_offset); + if (!trav) { + trav = ra_page_create(file, trav_offset); + if (!trav) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; } + fault = 1; + need_atime_update = 0; + } + trav->dirty = 0; + + if (trav->ready) { + gf_msg_trace(frame->this->name, 0, "HIT at offset=%" PRId64 ".", + trav_offset); + ra_frame_fill(trav, frame); + } else { + gf_msg_trace(frame->this->name, 0, + "IN-TRANSIT at " + "offset=%" PRId64 ".", + trav_offset); + ra_wait_on_page(trav, frame); + need_atime_update = 0; + } + } + unlock: + ra_file_unlock(file); - trav_offset += file->page_size; + if (local->op_ret == -1) { + goto out; } - if (need_atime_update && conf->force_atime_update) { - /* TODO: use untimens() since readv() can confuse underlying - io-cache and others */ - ra_frame = copy_frame (frame); - if (ra_frame == NULL) { - goto out; - } + if (fault) { + gf_msg_trace(frame->this->name, 0, "MISS at offset=%" PRId64 ".", + trav_offset); + ra_page_fault(file, frame, trav_offset); + } + + trav_offset += file->page_size; + } - STACK_WIND (ra_frame, ra_need_atime_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->readv, - file->fd, 1, 1, 0, NULL); + if (need_atime_update && conf->force_atime_update) { + /* TODO: use untimens() since readv() can confuse underlying + io-cache and others */ + ra_frame = copy_frame(frame); + if (ra_frame == NULL) { + goto out; } + STACK_WIND(ra_frame, ra_need_atime_cbk, FIRST_CHILD(frame->this), + FIRST_CHILD(frame->this)->fops->readv, file->fd, 1, 1, 0, + NULL); + } + out: - return ; + return; } - int -ra_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref, - dict_t *xdata) +ra_readv_disabled_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) { - GF_ASSERT (frame); + GF_ASSERT(frame); - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, - stbuf, iobref, xdata); + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); - return 0; + return 0; } - int -ra_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) +ra_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - ra_file_t *file = NULL; - ra_local_t *local = NULL; - ra_conf_t *conf = NULL; - int op_errno = EINVAL; - char expected_offset = 1; - uint64_t tmp_file = 0; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - conf = this->private; - - gf_log (this->name, GF_LOG_TRACE, - "NEW REQ at offset=%"PRId64" for size=%"GF_PRI_SIZET"", - offset, size); - - fd_ctx_get (fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - - if (!file || file->disabled) { - goto disabled; + ra_file_t *file = NULL; + ra_local_t *local = NULL; + ra_conf_t *conf = NULL; + int op_errno = EINVAL; + char expected_offset = 1; + uint64_t tmp_file = 0; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); + + conf = this->private; + + gf_msg_trace(this->name, 0, + "NEW REQ at offset=%" PRId64 " for size=%" GF_PRI_SIZET "", + offset, size); + + fd_ctx_get(fd, this, &tmp_file); + file = 
(ra_file_t *)(long)tmp_file; + + if (!file || file->disabled) { + goto disabled; + } + + if (file->offset != offset) { + gf_msg_trace(this->name, 0, + "unexpected offset (%" PRId64 " != %" PRId64 + ") " + "resetting", + file->offset, offset); + + expected_offset = file->expected = file->page_count = 0; + } else { + gf_msg_trace(this->name, 0, + "expected offset (%" PRId64 ") when page_count=%d", offset, + file->page_count); + + if (file->expected < (file->page_size * conf->page_count)) { + file->expected += size; + file->page_count = min((file->expected / file->page_size), + conf->page_count); } + } - if (file->offset != offset) { - gf_log (this->name, GF_LOG_TRACE, - "unexpected offset (%"PRId64" != %"PRId64") resetting", - file->offset, offset); - - expected_offset = file->expected = file->page_count = 0; - } else { - gf_log (this->name, GF_LOG_TRACE, - "expected offset (%"PRId64") when page_count=%d", - offset, file->page_count); - - if (file->expected < (file->page_size * conf->page_count)) { - file->expected += size; - file->page_count = min ((file->expected - / file->page_size), - conf->page_count); - } - } + if (!expected_offset) { + flush_region(frame, file, 0, file->pages.prev->offset + 1, 0); + } - if (!expected_offset) { - flush_region (frame, file, 0, file->pages.prev->offset + 1, 0); - } + local = mem_get0(this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto unwind; + } - local = mem_get0 (this->local_pool); - if (!local) { - op_errno = ENOMEM; - goto unwind; - } + local->fd = fd; + local->offset = offset; + local->size = size; + local->wait_count = 1; - local->fd = fd; - local->offset = offset; - local->size = size; - local->wait_count = 1; + local->fill.next = &local->fill; + local->fill.prev = &local->fill; - local->fill.next = &local->fill; - local->fill.prev = &local->fill; + pthread_mutex_init(&local->local_lock, NULL); - pthread_mutex_init (&local->local_lock, NULL); + frame->local = local; - frame->local = local; + dispatch_requests(frame, file); - dispatch_requests (frame, file); + flush_region(frame, file, 0, gf_floor(offset, file->page_size), 0); - flush_region (frame, file, 0, floor (offset, file->page_size), 0); + read_ahead(frame, file); - read_ahead (frame, file); + file->offset = offset + size; - ra_frame_return (frame); + ra_frame_return(frame); - file->offset = offset + size; - - return 0; + return 0; unwind: - STACK_UNWIND_STRICT (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, - NULL); + STACK_UNWIND_STRICT(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); - return 0; + return 0; disabled: - STACK_WIND (frame, ra_readv_disabled_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->readv, - fd, size, offset, flags, xdata); - return 0; + STACK_WIND(frame, ra_readv_disabled_cbk, FIRST_CHILD(frame->this), + FIRST_CHILD(frame->this)->fops->readv, fd, size, offset, flags, + xdata); + return 0; } - int -ra_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xdata) +ra_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) { - GF_ASSERT (frame); - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata); - return 0; + GF_ASSERT(frame); + STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, xdata); + return 0; } - - int -ra_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, - dict_t *xdata) +ra_fsync_cbk(call_frame_t *frame, void 
*cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - GF_ASSERT (frame); - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, - xdata); - return 0; + GF_ASSERT(frame); + STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; } - int -ra_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +ra_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - ra_file_t *file = NULL; - uint64_t tmp_file = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + int32_t op_errno = EINVAL; - fd_ctx_get (fd, this, &tmp_file); + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); - file = (ra_file_t *)(long)tmp_file; - if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1, 0); - } - - STACK_WIND (frame, ra_flush_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->flush, fd, xdata); - return 0; + STACK_WIND(frame, ra_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (flush, frame, -1, op_errno, NULL); - return 0; + STACK_UNWIND_STRICT(flush, frame, -1, op_errno, NULL); + return 0; } - int -ra_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, - dict_t *xdata) +ra_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - ra_file_t *file = NULL; - uint64_t tmp_file = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + int32_t op_errno = EINVAL; - fd_ctx_get (fd, this, &tmp_file); + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); - file = (ra_file_t *)(long)tmp_file; - if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1, 0); - } - - STACK_WIND (frame, ra_fsync_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsync, fd, datasync, xdata); - return 0; + STACK_WIND(frame, ra_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (fsync, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + STACK_UNWIND_STRICT(fsync, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - int -ra_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +ra_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - ra_file_t *file = NULL; + ra_file_t *file = NULL; - GF_ASSERT (frame); + GF_ASSERT(frame); - file = frame->local; + file = frame->local; - if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1, 1); - } + if (file) { + flush_region(frame, file, 0, file->pages.prev->offset + 1, 1); + } - frame->local = NULL; - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, - xdata); - return 0; + frame->local = NULL; + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } - int -ra_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, - int32_t count, off_t offset, uint32_t 
flags, struct iobref *iobref, - dict_t *xdata) +ra_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - ra_file_t *file = NULL; - uint64_t tmp_file = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - fd_ctx_get (fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1, 1); + ra_file_t *file = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); + + inode = fd->inode; + + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) + { + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + + if (iter_fd == fd) frame->local = file; - /* reset the read-ahead counters too */ - file->expected = file->page_count = 0; + + flush_region(frame, file, 0, file->pages.prev->offset + 1, 1); + + /* reset the read-ahead counters too */ + file->expected = file->page_count = 0; } + } + UNLOCK(&inode->lock); - STACK_WIND (frame, ra_writev_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, - fd, vector, count, offset, flags, iobref, xdata); + STACK_WIND(frame, ra_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); - return 0; + return 0; unwind: - STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + STACK_UNWIND_STRICT(writev, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - int -ra_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +ra_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - GF_ASSERT (frame); + GF_ASSERT(frame); - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, - postbuf, xdata); - return 0; + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } - int -ra_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) +ra_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) { - GF_ASSERT (frame); + GF_ASSERT(frame); - STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata); - return 0; + STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata); + return 0; } - int -ra_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, - dict_t *xdata) +ra_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - ra_file_t *file = NULL; - fd_t *iter_fd = NULL; - inode_t *inode = NULL; - uint64_t tmp_file = 0; - int32_t op_errno = EINVAL; + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind); + GF_ASSERT(frame); + 
GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, loc, unwind); - inode = loc->inode; + inode = loc->inode; - LOCK (&inode->lock); + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - fd_ctx_get (iter_fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - - if (!file) - continue; - /* - * Truncation invalidates reads just like writing does. - * TBD: this seems to flush more than it should. The - * only time we should flush at all is when we're - * shortening (not lengthening) the file, and then only - * from new EOF to old EOF. The same problem exists in - * ra_ftruncate. - */ - flush_region (frame, file, 0, - file->pages.prev->offset + 1, 1); - } + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + /* + * Truncation invalidates reads just like writing does. + * TBD: this seems to flush more than it should. The + * only time we should flush at all is when we're + * shortening (not lengthening) the file, and then only + * from new EOF to old EOF. The same problem exists in + * ra_ftruncate. + */ + flush_region(frame, file, 0, file->pages.prev->offset + 1, 1); } - UNLOCK (&inode->lock); + } + UNLOCK(&inode->lock); - STACK_WIND (frame, ra_truncate_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->truncate, - loc, offset, xdata); - return 0; + STACK_WIND(frame, ra_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (truncate, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + STACK_UNWIND_STRICT(truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - void -ra_page_dump (struct ra_page *page) +ra_page_dump(struct ra_page *page) { - int i = 0; - call_frame_t *frame = NULL; - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; - ra_waitq_t *trav = NULL; + int i = 0; + call_frame_t *frame = NULL; + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + ra_waitq_t *trav = NULL; - if (page == NULL) { - goto out; - } + if (page == NULL) { + goto out; + } - gf_proc_dump_write ("offset", "%"PRId64, page->offset); + gf_proc_dump_write("offset", "%" PRId64, page->offset); - gf_proc_dump_write ("size", "%"PRId64, page->size); + gf_proc_dump_write("size", "%" GF_PRI_SIZET, page->size); - gf_proc_dump_write ("dirty", "%s", page->dirty ? "yes" : "no"); + gf_proc_dump_write("dirty", "%s", page->dirty ? "yes" : "no"); - gf_proc_dump_write ("poisoned", "%s", page->poisoned ? "yes" : "no"); + gf_proc_dump_write("poisoned", "%s", page->poisoned ? "yes" : "no"); - gf_proc_dump_write ("ready", "%s", page->ready ? "yes" : "no"); + gf_proc_dump_write("ready", "%s", page->ready ? 
"yes" : "no"); - for (trav = page->waitq; trav; trav = trav->next) { - frame = trav->data; - sprintf (key, "waiting-frame[%d]", i++); - gf_proc_dump_write (key, "%"PRId64, frame->root->unique); - } + for (trav = page->waitq; trav; trav = trav->next) { + frame = trav->data; + sprintf(key, "waiting-frame[%d]", i++); + gf_proc_dump_write(key, "%" PRId64, frame->root->unique); + } out: - return; + return; } int32_t -ra_fdctx_dump (xlator_t *this, fd_t *fd) +ra_fdctx_dump(xlator_t *this, fd_t *fd) { - ra_file_t *file = NULL; - ra_page_t *page = NULL; - int32_t ret = 0, i = 0; - uint64_t tmp_file = 0; - char *path = NULL; - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - - fd_ctx_get (fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - - if (file == NULL) { - ret = 0; - goto out; - } + ra_file_t *file = NULL; + ra_page_t *page = NULL; + int32_t ret = 0, i = 0; + uint64_t tmp_file = 0; + char *path = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + + fd_ctx_get(fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (file == NULL) { + ret = 0; + goto out; + } - gf_proc_dump_build_key (key_prefix, - "xlator.performance.read-ahead", - "file"); + gf_proc_dump_build_key(key_prefix, "xlator.performance.read-ahead", "file"); - gf_proc_dump_add_section (key_prefix); + gf_proc_dump_add_section("%s", key_prefix); - ret = __inode_path (fd->inode, NULL, &path); - if (path != NULL) { - gf_proc_dump_write ("path", "%s", path); - GF_FREE (path); - } + ret = __inode_path(fd->inode, NULL, &path); + if (path != NULL) { + gf_proc_dump_write("path", "%s", path); + GF_FREE(path); + } - gf_proc_dump_write ("fd", "%p", fd); + gf_proc_dump_write("fd", "%p", fd); - gf_proc_dump_write ("disabled", "%s", file->disabled ? "yes" : "no"); + gf_proc_dump_write("disabled", "%s", file->disabled ? 
"yes" : "no"); - if (file->disabled) { - ret = 0; - goto out; - } + if (file->disabled) { + ret = 0; + goto out; + } - gf_proc_dump_write ("page-size", "%"PRId64, file->page_size); + gf_proc_dump_write("page-size", "%" PRId64, file->page_size); - gf_proc_dump_write ("page-count", "%u", file->page_count); + gf_proc_dump_write("page-count", "%u", file->page_count); - gf_proc_dump_write ("next-expected-offset-for-sequential-reads", - "%"PRId64, file->offset); + gf_proc_dump_write("next-expected-offset-for-sequential-reads", "%" PRId64, + file->offset); - for (page = file->pages.next; page != &file->pages; - page = page->next) { - sprintf (key, "page[%d]", i); - gf_proc_dump_write (key, "%p", page[i++]); - ra_page_dump (page); - } + for (page = file->pages.next; page != &file->pages; page = page->next) { + gf_proc_dump_write("page", "%d: %p", i++, (void *)page); + ra_page_dump(page); + } - ret = 0; + ret = 0; out: - return ret; + return ret; } int -ra_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +ra_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - ra_file_t *file = NULL; - fd_t *iter_fd = NULL; - inode_t *inode = NULL; - uint64_t tmp_file = 0; - int32_t op_errno = EINVAL; + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + ra_conf_t *conf = NULL; + + conf = this->private; - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); - inode = fd->inode; + inode = fd->inode; - LOCK (&inode->lock); + if (conf->force_atime_update) { + LOCK(&inode->lock); { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - fd_ctx_get (iter_fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - - if (!file) - continue; - flush_region (frame, file, 0, - file->pages.prev->offset + 1, 0); - } + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) + { + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + flush_region(frame, file, 0, file->pages.prev->offset + 1, 0); + } } - UNLOCK (&inode->lock); + UNLOCK(&inode->lock); + } - STACK_WIND (frame, ra_attr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fstat, fd, xdata); - return 0; + STACK_WIND(frame, ra_attr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (stat, frame, -1, op_errno, NULL, NULL); - return 0; + STACK_UNWIND_STRICT(stat, frame, -1, op_errno, NULL, NULL); + return 0; } - int -ra_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - dict_t *xdata) +ra_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - ra_file_t *file = NULL; - fd_t *iter_fd = NULL; - inode_t *inode = NULL; - uint64_t tmp_file = 0; - int32_t op_errno = EINVAL; + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); - inode = fd->inode; + inode = fd->inode; - LOCK (&inode->lock); + LOCK(&inode->lock); + { + 
list_for_each_entry(iter_fd, &inode->fd_list, inode_list) { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - fd_ctx_get (iter_fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - if (!file) - continue; - /* - * Truncation invalidates reads just like writing does. - * TBD: this seems to flush more than it should. The - * only time we should flush at all is when we're - * shortening (not lengthening) the file, and then only - * from new EOF to old EOF. The same problem exists in - * ra_truncate. - */ - flush_region (frame, file, 0, - file->pages.prev->offset + 1, 1); - } + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + /* + * Truncation invalidates reads just like writing does. + * TBD: this seems to flush more than it should. The + * only time we should flush at all is when we're + * shortening (not lengthening) the file, and then only + * from new EOF to old EOF. The same problem exists in + * ra_truncate. + */ + flush_region(frame, file, 0, file->pages.prev->offset + 1, 1); } - UNLOCK (&inode->lock); + } + UNLOCK(&inode->lock); - STACK_WIND (frame, ra_truncate_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->ftruncate, fd, offset, xdata); - return 0; + STACK_WIND(frame, ra_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (truncate, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + STACK_UNWIND_STRICT(truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } int @@ -947,261 +914,359 @@ ra_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - GF_ASSERT (frame); + GF_ASSERT(frame); - STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, prebuf, - postbuf, xdata); - return 0; + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } static int ra_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata) + size_t len, dict_t *xdata) { - ra_file_t *file = NULL; - fd_t *iter_fd = NULL; - inode_t *inode = NULL; - uint64_t tmp_file = 0; - int32_t op_errno = EINVAL; + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); - inode = fd->inode; + inode = fd->inode; - LOCK (&inode->lock); + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - fd_ctx_get (iter_fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - if (!file) - continue; + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; - flush_region(frame, file, offset, len, 1); - } + flush_region(frame, file, offset, len, 1); } - UNLOCK (&inode->lock); + } + UNLOCK(&inode->lock); - STACK_WIND (frame, ra_discard_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->discard, fd, offset, len, xdata); - return 0; + STACK_WIND(frame, ra_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (discard, 
frame, -1, op_errno, NULL, NULL, NULL); - return 0; + STACK_UNWIND_STRICT(discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } int -ra_priv_dump (xlator_t *this) +ra_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - ra_conf_t *conf = NULL; - int ret = -1; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - gf_boolean_t add_section = _gf_false; + GF_ASSERT(frame); - if (!this) { - goto out; - } + STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} - conf = this->private; - if (!conf) { - gf_log (this->name, GF_LOG_WARNING, "conf null in xlator"); - goto out; - } +static int +ra_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; - gf_proc_dump_build_key (key_prefix, "xlator.performance.read-ahead", - "priv"); + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); - gf_proc_dump_add_section (key_prefix); - add_section = _gf_true; + inode = fd->inode; - ret = pthread_mutex_trylock (&conf->conf_lock); - if (ret) - goto out; + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) { - gf_proc_dump_write ("page_size", "%d", conf->page_size); - gf_proc_dump_write ("page_count", "%d", conf->page_count); - gf_proc_dump_write ("force_atime_update", "%d", - conf->force_atime_update); + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + + flush_region(frame, file, offset, len, 1); } - pthread_mutex_unlock (&conf->conf_lock); + } + UNLOCK(&inode->lock); - ret = 0; -out: - if (ret && conf) { - if (add_section == _gf_false) - gf_proc_dump_add_section (key_prefix); + STACK_WIND(frame, ra_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; - gf_proc_dump_write ("Unable to dump priv", - "(Lock acquisition failed) %s", this->name); - } - return ret; +unwind: + STACK_UNWIND_STRICT(zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } +int +ra_priv_dump(xlator_t *this) +{ + ra_conf_t *conf = NULL; + int ret = -1; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + + if (!this) { + goto out; + } + + conf = this->private; + if (!conf) { + gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_XLATOR_CONF_NULL, + "conf null in xlator"); + goto out; + } + + gf_proc_dump_build_key(key_prefix, "xlator.performance.read-ahead", "priv"); + + gf_proc_dump_add_section("%s", key_prefix); + + ret = pthread_mutex_trylock(&conf->conf_lock); + if (ret) + goto out; + { + gf_proc_dump_write("page_size", "%" PRIu64, conf->page_size); + gf_proc_dump_write("page_count", "%d", conf->page_count); + gf_proc_dump_write("force_atime_update", "%d", + conf->force_atime_update); + } + pthread_mutex_unlock(&conf->conf_lock); + + ret = 0; +out: + if (ret && conf) { + gf_proc_dump_write("Unable to dump priv", + "(Lock acquisition failed) %s", this->name); + } + return ret; +} int32_t -mem_acct_init (xlator_t *this) +mem_acct_init(xlator_t *this) { - int ret = -1; + int ret = -1; - if (!this) { - goto out; - } + if (!this) { + goto out; + } - ret = xlator_mem_acct_init (this, gf_ra_mt_end + 1); + ret = xlator_mem_acct_init(this, gf_ra_mt_end + 1); - if (ret != 0) { - 
gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - } + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, READ_AHEAD_MSG_NO_MEMORY, + "Memory accounting init" + "failed"); + } out: - return ret; + return ret; } int -reconfigure (xlator_t *this, dict_t *options) +reconfigure(xlator_t *this, dict_t *options) { - ra_conf_t *conf = NULL; - int ret = -1; + ra_conf_t *conf = NULL; + int ret = -1; - GF_VALIDATE_OR_GOTO ("read-ahead", this, out); - GF_VALIDATE_OR_GOTO ("read-ahead", this->private, out); + GF_VALIDATE_OR_GOTO("read-ahead", this, out); + GF_VALIDATE_OR_GOTO("read-ahead", this->private, out); - conf = this->private; + conf = this->private; - GF_OPTION_RECONF ("page-count", conf->page_count, options, uint32, out); + GF_OPTION_RECONF("page-count", conf->page_count, options, uint32, out); - GF_OPTION_RECONF ("page-size", conf->page_size, options, size, out); + GF_OPTION_RECONF("page-size", conf->page_size, options, size_uint64, out); - ret = 0; - out: - return ret; + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out); + + ret = 0; +out: + return ret; } int -init (xlator_t *this) +init(xlator_t *this) { - ra_conf_t *conf = NULL; - int32_t ret = -1; + ra_conf_t *conf = NULL; + int32_t ret = -1; - GF_VALIDATE_OR_GOTO ("read-ahead", this, out); + GF_VALIDATE_OR_GOTO("read-ahead", this, out); - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: read-ahead not configured with exactly one" - " child"); - goto out; - } + if (!this->children || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, 0, + READ_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED, + "FATAL: read-ahead not configured with exactly one" + " child"); + goto out; + } - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } + if (!this->parents) { + gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_VOL_MISCONFIGURED, + "dangling volume. 
check volfile "); + } - conf = (void *) GF_CALLOC (1, sizeof (*conf), gf_ra_mt_ra_conf_t); - if (conf == NULL) { - goto out; - } + conf = (void *)GF_CALLOC(1, sizeof(*conf), gf_ra_mt_ra_conf_t); + if (conf == NULL) { + goto out; + } - conf->page_size = this->ctx->page_size; + conf->page_size = this->ctx->page_size; - GF_OPTION_INIT ("page-size", conf->page_size, size, out); + GF_OPTION_INIT("page-size", conf->page_size, size_uint64, out); - GF_OPTION_INIT ("page-count", conf->page_count, uint32, out); + GF_OPTION_INIT("page-count", conf->page_count, uint32, out); - GF_OPTION_INIT ("force-atime-update", conf->force_atime_update, bool, out); + GF_OPTION_INIT("force-atime-update", conf->force_atime_update, bool, out); - conf->files.next = &conf->files; - conf->files.prev = &conf->files; + GF_OPTION_INIT("pass-through", this->pass_through, bool, out); - pthread_mutex_init (&conf->conf_lock, NULL); + conf->files.next = &conf->files; + conf->files.prev = &conf->files; - this->local_pool = mem_pool_new (ra_local_t, 64); - if (!this->local_pool) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "failed to create local_t's memory pool"); - goto out; - } + pthread_mutex_init(&conf->conf_lock, NULL); - this->private = conf; - ret = 0; + this->local_pool = mem_pool_new(ra_local_t, 64); + if (!this->local_pool) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, READ_AHEAD_MSG_NO_MEMORY, + "failed to create local_t's memory pool"); + goto out; + } + + this->private = conf; + ret = 0; out: - if (ret == -1) { - GF_FREE (conf); - } + if (ret == -1) { + GF_FREE(conf); + } - return ret; + return ret; } - void -fini (xlator_t *this) +fini(xlator_t *this) { - ra_conf_t *conf = NULL; + ra_conf_t *conf = NULL; - GF_VALIDATE_OR_GOTO ("read-ahead", this, out); + GF_VALIDATE_OR_GOTO("read-ahead", this, out); - conf = this->private; - if (conf == NULL) { - goto out; - } + conf = this->private; + if (conf == NULL) { + goto out; + } - this->private = NULL; + this->private = NULL; - GF_ASSERT ((conf->files.next == &conf->files) - && (conf->files.prev == &conf->files)); + /* The files structures allocated in open and create are not deleted. + * until that is freed, marking the below assert as warning. 
+ GF_ASSERT ((conf->files.next == &conf->files) + && (conf->files.prev == &conf->files)); + */ + if (!((conf->files.next == &conf->files) && + (conf->files.prev == &conf->files))) { + gf_msg(this->name, GF_LOG_INFO, 0, + READ_AHEAD_MSG_UNDESTROYED_FILE_FOUND, + "undestroyed read ahead file structures found"); + } - pthread_mutex_destroy (&conf->conf_lock); - GF_FREE (conf); + pthread_mutex_destroy(&conf->conf_lock); + GF_FREE(conf); out: - return; + return; } struct xlator_fops fops = { - .open = ra_open, - .create = ra_create, - .readv = ra_readv, - .writev = ra_writev, - .flush = ra_flush, - .fsync = ra_fsync, - .truncate = ra_truncate, - .ftruncate = ra_ftruncate, - .fstat = ra_fstat, - .discard = ra_discard, + .open = ra_open, + .create = ra_create, + .readv = ra_readv, + .writev = ra_writev, + .flush = ra_flush, + .fsync = ra_fsync, + .truncate = ra_truncate, + .ftruncate = ra_ftruncate, + .fstat = ra_fstat, + .discard = ra_discard, + .zerofill = ra_zerofill, }; struct xlator_cbks cbks = { - .release = ra_release, + .release = ra_release, }; struct xlator_dumpops dumpops = { - .priv = ra_priv_dump, - .fdctx = ra_fdctx_dump, + .priv = ra_priv_dump, + .fdctx = ra_fdctx_dump, }; struct volume_options options[] = { - { .key = {"force-atime-update"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "false" - }, - { .key = {"page-count"}, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = 16, - .default_value = "4", - .description = "Number of pages that will be pre-fetched" - }, - { .key = {"page-size"}, - .type = GF_OPTION_TYPE_SIZET, - .min = 4096, - .max = 1048576 * 64, - .default_value = "131072", - .description = "Page size with which read-ahead performs server I/O" - }, - { .key = {NULL} }, + { + .key = {"read-ahead"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable read-ahead", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + {.key = {"force-atime-update"}, + .type = GF_OPTION_TYPE_BOOL, + .op_version = {1}, + .tags = {"read-ahead"}, + .default_value = "false"}, + {.key = {"page-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 16, + .default_value = "4", + .op_version = {1}, + .tags = {"read-ahead"}, + .description = "Number of pages that will be pre-fetched"}, + {.key = {"page-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 4096, + .max = 1048576 * 64, + .default_value = "131072", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"read-ahead"}, + .description = "Page size with which read-ahead performs server I/O"}, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"read-ahead"}, + .description = "Enable/Disable read ahead translator"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "read-ahead", + .category = GF_MAINTAINED, }; diff --git a/xlators/performance/read-ahead/src/read-ahead.h b/xlators/performance/read-ahead/src/read-ahead.h index d1d768c34fb..e9432fb47cc 100644 --- a/xlators/performance/read-ahead/src/read-ahead.h +++ b/xlators/performance/read-ahead/src/read-ahead.h @@ -11,17 +11,11 @@ #ifndef __READ_AHEAD_H #define __READ_AHEAD_H 
-#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "common-utils.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/common-utils.h> #include "read-ahead-mem-types.h" struct ra_conf; @@ -30,84 +24,77 @@ struct ra_page; struct ra_file; struct ra_waitq; - struct ra_waitq { - struct ra_waitq *next; - void *data; + struct ra_waitq *next; + void *data; }; - struct ra_fill { - struct ra_fill *next; - struct ra_fill *prev; - off_t offset; - size_t size; - struct iovec *vector; - int32_t count; - struct iobref *iobref; + struct ra_fill *next; + struct ra_fill *prev; + off_t offset; + size_t size; + struct iovec *vector; + int32_t count; + struct iobref *iobref; }; - struct ra_local { - mode_t mode; - struct ra_fill fill; - off_t offset; - size_t size; - int32_t op_ret; - int32_t op_errno; - off_t pending_offset; - size_t pending_size; - fd_t *fd; - int32_t wait_count; - pthread_mutex_t local_lock; + mode_t mode; + struct ra_fill fill; + off_t offset; + size_t size; + int32_t op_ret; + int32_t op_errno; + off_t pending_offset; + size_t pending_size; + fd_t *fd; + int32_t wait_count; + pthread_mutex_t local_lock; }; - struct ra_page { - struct ra_page *next; - struct ra_page *prev; - struct ra_file *file; - char dirty; /* Internal request, not from user. */ - char poisoned; /* Pending read invalidated by write. */ - char ready; - struct iovec *vector; - int32_t count; - off_t offset; - size_t size; - struct ra_waitq *waitq; - struct iobref *iobref; - char stale; + struct ra_page *next; + struct ra_page *prev; + struct ra_file *file; + char dirty; /* Internal request, not from user. */ + char poisoned; /* Pending read invalidated by write. 
*/ + char ready; + struct iovec *vector; + int32_t count; + off_t offset; + size_t size; + struct ra_waitq *waitq; + struct iobref *iobref; + char stale; }; - struct ra_file { - struct ra_file *next; - struct ra_file *prev; - struct ra_conf *conf; - fd_t *fd; - int disabled; - size_t expected; - struct ra_page pages; - off_t offset; - size_t size; - int32_t refcount; - pthread_mutex_t file_lock; - struct iatt stbuf; - uint64_t page_size; - uint32_t page_count; + struct ra_file *next; + struct ra_file *prev; + struct ra_conf *conf; + fd_t *fd; + int disabled; + size_t expected; + struct ra_page pages; + off_t offset; + size_t size; + int32_t refcount; + pthread_mutex_t file_lock; + struct iatt stbuf; + uint64_t page_size; + uint32_t page_count; }; - struct ra_conf { - uint64_t page_size; - uint32_t page_count; - void *cache_block; - struct ra_file files; - gf_boolean_t force_atime_update; - pthread_mutex_t conf_lock; + uint64_t page_size; + uint32_t page_count; + void *cache_block; + struct ra_file files; + gf_boolean_t force_atime_update; + pthread_mutex_t conf_lock; }; - typedef struct ra_conf ra_conf_t; typedef struct ra_local ra_local_t; typedef struct ra_page ra_page_t; @@ -116,77 +103,69 @@ typedef struct ra_waitq ra_waitq_t; typedef struct ra_fill ra_fill_t; ra_page_t * -ra_page_get (ra_file_t *file, - off_t offset); +ra_page_get(ra_file_t *file, off_t offset); ra_page_t * -ra_page_create (ra_file_t *file, - off_t offset); +ra_page_create(ra_file_t *file, off_t offset); void -ra_page_fault (ra_file_t *file, - call_frame_t *frame, - off_t offset); +ra_page_fault(ra_file_t *file, call_frame_t *frame, off_t offset); void -ra_wait_on_page (ra_page_t *page, - call_frame_t *frame); +ra_wait_on_page(ra_page_t *page, call_frame_t *frame); ra_waitq_t * -ra_page_wakeup (ra_page_t *page); +ra_page_wakeup(ra_page_t *page); void -ra_page_flush (ra_page_t *page); +ra_page_flush(ra_page_t *page); ra_waitq_t * -ra_page_error (ra_page_t *page, - int32_t op_ret, - int32_t op_errno); +ra_page_error(ra_page_t *page, int32_t op_ret, int32_t op_errno); void -ra_page_purge (ra_page_t *page); +ra_page_purge(ra_page_t *page); void -ra_frame_return (call_frame_t *frame); +ra_frame_return(call_frame_t *frame); void -ra_frame_fill (ra_page_t *page, - call_frame_t *frame); +ra_frame_fill(ra_page_t *page, call_frame_t *frame); void -ra_file_destroy (ra_file_t *file); +ra_file_destroy(ra_file_t *file); static inline void -ra_file_lock (ra_file_t *file) +ra_file_lock(ra_file_t *file) { - pthread_mutex_lock (&file->file_lock); + pthread_mutex_lock(&file->file_lock); } static inline void -ra_file_unlock (ra_file_t *file) +ra_file_unlock(ra_file_t *file) { - pthread_mutex_unlock (&file->file_lock); + pthread_mutex_unlock(&file->file_lock); } static inline void -ra_conf_lock (ra_conf_t *conf) +ra_conf_lock(ra_conf_t *conf) { - pthread_mutex_lock (&conf->conf_lock); + pthread_mutex_lock(&conf->conf_lock); } static inline void -ra_conf_unlock (ra_conf_t *conf) +ra_conf_unlock(ra_conf_t *conf) { - pthread_mutex_unlock (&conf->conf_lock); + pthread_mutex_unlock(&conf->conf_lock); } static inline void -ra_local_lock (ra_local_t *local) +ra_local_lock(ra_local_t *local) { - pthread_mutex_lock (&local->local_lock); + pthread_mutex_lock(&local->local_lock); } static inline void -ra_local_unlock (ra_local_t *local) +ra_local_unlock(ra_local_t *local) { - pthread_mutex_unlock (&local->local_lock); + pthread_mutex_unlock(&local->local_lock); } #endif /* __READ_AHEAD_H */ diff --git a/xlators/performance/readdir-ahead/Makefile.am 
b/xlators/performance/readdir-ahead/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/performance/readdir-ahead/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/performance/readdir-ahead/src/Makefile.am b/xlators/performance/readdir-ahead/src/Makefile.am new file mode 100644 index 00000000000..3d6b6ae951f --- /dev/null +++ b/xlators/performance/readdir-ahead/src/Makefile.am @@ -0,0 +1,18 @@ +xlator_LTLIBRARIES = readdir-ahead.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +readdir_ahead_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +readdir_ahead_la_SOURCES = readdir-ahead.c +readdir_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = readdir-ahead.h readdir-ahead-mem-types.h \ + readdir-ahead-messages.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h new file mode 100644 index 00000000000..498ffae7f64 --- /dev/null +++ b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __RDA_MEM_TYPES_H__ +#define __RDA_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> + +enum gf_rda_mem_types_ { + gf_rda_mt_rda_local = gf_common_mt_end + 1, + gf_rda_mt_rda_fd_ctx, + gf_rda_mt_rda_priv, + gf_rda_mt_inode_ctx_t, + gf_rda_mt_end +}; + +#endif diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead-messages.h b/xlators/performance/readdir-ahead/src/readdir-ahead-messages.h new file mode 100644 index 00000000000..28ec14dd845 --- /dev/null +++ b/xlators/performance/readdir-ahead/src/readdir-ahead-messages.h @@ -0,0 +1,30 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _READDIR_AHEAD_MESSAGES_H_ +#define _READDIR_AHEAD_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
+ */ + +GLFS_MSGID(READDIR_AHEAD, READDIR_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED, + READDIR_AHEAD_MSG_VOL_MISCONFIGURED, READDIR_AHEAD_MSG_NO_MEMORY, + READDIR_AHEAD_MSG_DIR_RELEASE_PENDING_STUB, + READDIR_AHEAD_MSG_OUT_OF_SEQUENCE, READDIR_AHEAD_MSG_DICT_OP_FAILED); + +#endif /* _READDIR_AHEAD_MESSAGES_H_ */ diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.c b/xlators/performance/readdir-ahead/src/readdir-ahead.c new file mode 100644 index 00000000000..4ba7ee7077a --- /dev/null +++ b/xlators/performance/readdir-ahead/src/readdir-ahead.c @@ -0,0 +1,1382 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +/* + * performance/readdir-ahead preloads a local buffer with directory entries + * on opendir. The optimization involves using maximum sized gluster rpc + * requests (128k) to minimize overhead of smaller client requests. + * + * For example, fuse currently supports a maximum readdir buffer of 4k + * (regardless of the filesystem client's buffer size). readdir-ahead should + * effectively convert these smaller requests into fewer, larger sized requests + * for simple, sequential workloads (i.e., ls). + * + * The translator is currently designed to handle the simple, sequential case + * only. If a non-sequential directory read occurs, readdir-ahead disables + * preloads on the directory. + */ + +#include <math.h> +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> +#include <glusterfs/call-stub.h> +#include "readdir-ahead.h" +#include "readdir-ahead-mem-types.h" +#include <glusterfs/defaults.h> +#include "readdir-ahead-messages.h" +static int +rda_fill_fd(call_frame_t *, xlator_t *, fd_t *); + +static void +rda_local_wipe(struct rda_local *local) +{ + if (local->fd) + fd_unref(local->fd); + if (local->xattrs) + dict_unref(local->xattrs); + if (local->inode) + inode_unref(local->inode); +} + +/* + * Get (or create) the fd context for storing prepopulated directory + * entries. 
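To put numbers on the request-size argument in the header comment above, here is a small standalone sketch (not part of the translator; the 64-byte average entry size and the 50,000-entry directory are illustrative assumptions) estimating child round trips with a 4 KiB reply buffer versus a 128 KiB preload request:

#include <stdio.h>

/* Rough round-trip estimate for listing a directory sequentially.
 * entry_size is an assumed average on-the-wire dirent footprint. */
static unsigned long
round_trips(unsigned long entries, unsigned long entry_size,
            unsigned long buf_size)
{
    unsigned long per_reply = buf_size / entry_size;
    return (entries + per_reply - 1) / per_reply; /* ceiling division */
}

int
main(void)
{
    unsigned long entries = 50000, entry_size = 64;

    printf("4 KiB replies  : %lu round trips\n",
           round_trips(entries, entry_size, 4096));
    printf("128 KiB preload: %lu round trips\n",
           round_trips(entries, entry_size, 131072));
    return 0;
}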
+ */ +static struct rda_fd_ctx * +get_rda_fd_ctx(fd_t *fd, xlator_t *this) +{ + uint64_t val; + struct rda_fd_ctx *ctx; + + LOCK(&fd->lock); + + if (__fd_ctx_get(fd, this, &val) < 0) { + ctx = GF_CALLOC(1, sizeof(struct rda_fd_ctx), gf_rda_mt_rda_fd_ctx); + if (!ctx) + goto out; + + LOCK_INIT(&ctx->lock); + INIT_LIST_HEAD(&ctx->entries.list); + ctx->state = RDA_FD_NEW; + /* ctx offset values initialized to 0 */ + ctx->xattrs = NULL; + + if (__fd_ctx_set(fd, this, (uint64_t)(uintptr_t)ctx) < 0) { + GF_FREE(ctx); + ctx = NULL; + goto out; + } + } else { + ctx = (struct rda_fd_ctx *)(uintptr_t)val; + } +out: + UNLOCK(&fd->lock); + return ctx; +} + +static rda_inode_ctx_t * +__rda_inode_ctx_get(inode_t *inode, xlator_t *this) +{ + int ret = -1; + uint64_t ctx_uint = 0; + rda_inode_ctx_t *ctx_p = NULL; + + ret = __inode_ctx_get1(inode, this, &ctx_uint); + if (ret == 0) + return (rda_inode_ctx_t *)(uintptr_t)ctx_uint; + + ctx_p = GF_CALLOC(1, sizeof(*ctx_p), gf_rda_mt_inode_ctx_t); + if (!ctx_p) + return NULL; + + GF_ATOMIC_INIT(ctx_p->generation, 0); + + ctx_uint = (uint64_t)(uintptr_t)ctx_p; + ret = __inode_ctx_set1(inode, this, &ctx_uint); + if (ret < 0) { + GF_FREE(ctx_p); + return NULL; + } + + return ctx_p; +} + +static int +__rda_inode_ctx_update_iatts(inode_t *inode, xlator_t *this, + struct iatt *stbuf_in, struct iatt *stbuf_out, + uint64_t generation) +{ + rda_inode_ctx_t *ctx_p = NULL; + struct iatt tmp_stat = { + 0, + }; + + ctx_p = __rda_inode_ctx_get(inode, this); + if (!ctx_p) + return -1; + + if ((!stbuf_in) || (stbuf_in->ia_ctime == 0)) { + /* A fop modified a file but valid stbuf is not provided. + * Can't update iatt to reflect results of fop and hence + * invalidate the iatt stored in dentry. + * + * An example of this case can be response of write request + * that is cached in write-behind. + */ + if (stbuf_in) + tmp_stat = *stbuf_in; + else + tmp_stat = ctx_p->statbuf; + memset(&ctx_p->statbuf, 0, sizeof(ctx_p->statbuf)); + gf_uuid_copy(ctx_p->statbuf.ia_gfid, tmp_stat.ia_gfid); + ctx_p->statbuf.ia_type = tmp_stat.ia_type; + GF_ATOMIC_INC(ctx_p->generation); + } else { + if (ctx_p->statbuf.ia_ctime) { + if (stbuf_in->ia_ctime < ctx_p->statbuf.ia_ctime) { + goto out; + } + + if ((stbuf_in->ia_ctime == ctx_p->statbuf.ia_ctime) && + (stbuf_in->ia_ctime_nsec < ctx_p->statbuf.ia_ctime_nsec)) { + goto out; + } + } else { + if ((generation != -1) && + (generation != GF_ATOMIC_GET(ctx_p->generation))) + goto out; + } + + ctx_p->statbuf = *stbuf_in; + } + +out: + if (stbuf_out) + *stbuf_out = ctx_p->statbuf; + + return 0; +} + +static int +rda_inode_ctx_update_iatts(inode_t *inode, xlator_t *this, + struct iatt *stbuf_in, struct iatt *stbuf_out, + uint64_t generation) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __rda_inode_ctx_update_iatts(inode, this, stbuf_in, stbuf_out, + generation); + } + UNLOCK(&inode->lock); + + return ret; +} + +/* + * Reset the tracking state of the context. 
+ */ +static void +rda_reset_ctx(xlator_t *this, struct rda_fd_ctx *ctx) +{ + struct rda_priv *priv = NULL; + + priv = this->private; + + ctx->state = RDA_FD_NEW; + ctx->cur_offset = 0; + ctx->next_offset = 0; + ctx->op_errno = 0; + + gf_dirent_free(&ctx->entries); + GF_ATOMIC_SUB(priv->rda_cache_size, ctx->cur_size); + ctx->cur_size = 0; + + if (ctx->xattrs) { + dict_unref(ctx->xattrs); + ctx->xattrs = NULL; + } +} + +static void +rda_mark_inode_dirty(xlator_t *this, inode_t *inode) +{ + inode_t *parent = NULL; + fd_t *fd = NULL; + uint64_t val = 0; + int32_t ret = 0; + struct rda_fd_ctx *fd_ctx = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + parent = inode_parent(inode, NULL, NULL); + if (parent) { + LOCK(&parent->lock); + { + list_for_each_entry(fd, &parent->fd_list, inode_list) + { + val = 0; + fd_ctx_get(fd, this, &val); + if (val == 0) + continue; + + fd_ctx = (void *)(uintptr_t)val; + uuid_utoa_r(inode->gfid, gfid); + if (!GF_ATOMIC_GET(fd_ctx->prefetching)) + continue; + + LOCK(&fd_ctx->lock); + { + if (GF_ATOMIC_GET(fd_ctx->prefetching)) { + if (fd_ctx->writes_during_prefetch == NULL) + fd_ctx->writes_during_prefetch = dict_new(); + + ret = dict_set_int8(fd_ctx->writes_during_prefetch, + gfid, 1); + if (ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "marking to invalidate stats of %s from an " + "in progress " + "prefetching has failed, might result in " + "stale stat to " + "application", + gfid); + } + } + } + UNLOCK(&fd_ctx->lock); + } + } + UNLOCK(&parent->lock); + inode_unref(parent); + } + + return; +} + +/* + * Check whether we can handle a request. Offset verification is done by the + * caller, so we only check whether the preload buffer has completion status + * (including an error) or has some data to return. + */ +static gf_boolean_t +rda_can_serve_readdirp(struct rda_fd_ctx *ctx, size_t request_size) +{ + if ((ctx->state & RDA_FD_EOD) || (ctx->state & RDA_FD_ERROR) || + (!(ctx->state & RDA_FD_PLUGGED) && (ctx->cur_size > 0)) || + (request_size && ctx->cur_size >= request_size)) + return _gf_true; + + return _gf_false; +} + +void +rda_inode_ctx_get_iatt(inode_t *inode, xlator_t *this, struct iatt *attr) +{ + rda_inode_ctx_t *ctx_p = NULL; + + if (!inode || !this || !attr) + goto out; + + LOCK(&inode->lock); + { + ctx_p = __rda_inode_ctx_get(inode, this); + if (ctx_p) { + *attr = ctx_p->statbuf; + } + } + UNLOCK(&inode->lock); + +out: + return; +} + +/* + * Serve a request from the fd dentry list based on the size of the request + * buffer. ctx must be locked. 
+ */ +static int32_t +__rda_fill_readdirp(xlator_t *this, gf_dirent_t *entries, size_t request_size, + struct rda_fd_ctx *ctx) +{ + gf_dirent_t *dirent, *tmp; + size_t dirent_size, size = 0; + int32_t count = 0; + struct rda_priv *priv = NULL; + struct iatt tmp_stat = { + 0, + }; + + priv = this->private; + + list_for_each_entry_safe(dirent, tmp, &ctx->entries.list, list) + { + dirent_size = gf_dirent_size(dirent->d_name); + if (size + dirent_size > request_size) + break; + + memset(&tmp_stat, 0, sizeof(tmp_stat)); + + if (dirent->inode && (!((strcmp(dirent->d_name, ".") == 0) || + (strcmp(dirent->d_name, "..") == 0)))) { + rda_inode_ctx_get_iatt(dirent->inode, this, &tmp_stat); + dirent->d_stat = tmp_stat; + } + + size += dirent_size; + list_del_init(&dirent->list); + ctx->cur_size -= dirent_size; + + GF_ATOMIC_SUB(priv->rda_cache_size, dirent_size); + + list_add_tail(&dirent->list, &entries->list); + ctx->cur_offset = dirent->d_off; + count++; + } + + if (ctx->cur_size <= priv->rda_low_wmark) + ctx->state |= RDA_FD_PLUGGED; + + return count; +} + +static int32_t +__rda_serve_readdirp(xlator_t *this, struct rda_fd_ctx *ctx, size_t size, + gf_dirent_t *entries, int *op_errno) +{ + int32_t ret = 0; + + ret = __rda_fill_readdirp(this, entries, size, ctx); + + if (!ret && (ctx->state & RDA_FD_ERROR)) { + ret = -1; + ctx->state &= ~RDA_FD_ERROR; + + /* + * the preload has stopped running in the event of an error, so + * pass all future requests along + */ + ctx->state |= RDA_FD_BYPASS; + } + /* + * Use the op_errno sent by lower layers as xlators above will check + * the op_errno for identifying whether readdir is completed or not. + */ + *op_errno = ctx->op_errno; + + return ret; +} + +static int32_t +rda_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + struct rda_fd_ctx *ctx = NULL; + int fill = 0; + gf_dirent_t entries; + int ret = 0; + int op_errno = 0; + gf_boolean_t serve = _gf_false; + + ctx = get_rda_fd_ctx(fd, this); + if (!ctx) + goto err; + + if (ctx->state & RDA_FD_BYPASS) + goto bypass; + + INIT_LIST_HEAD(&entries.list); + LOCK(&ctx->lock); + + /* recheck now that we have the lock */ + if (ctx->state & RDA_FD_BYPASS) { + UNLOCK(&ctx->lock); + goto bypass; + } + + /* + * If a new read comes in at offset 0 and the buffer has been + * completed, reset the context and kickstart the filler again. + */ + if (!off && (ctx->state & RDA_FD_EOD) && (ctx->cur_size == 0)) { + rda_reset_ctx(this, ctx); + /* + * Unref and discard the 'list of xattrs to be fetched' + * stored during opendir call. This is done above - inside + * rda_reset_ctx(). + * Now, ref the xdata passed by md-cache in actual readdirp() + * call and use that for all subsequent internal readdirp() + * requests issued by this xlator. + */ + ctx->xattrs = dict_ref(xdata); + fill = 1; + } + + /* + * If a readdir occurs at an unexpected offset or we already have a + * request pending, admit defeat and just get out of the way. + */ + if (off != ctx->cur_offset || ctx->stub) { + ctx->state |= RDA_FD_BYPASS; + UNLOCK(&ctx->lock); + goto bypass; + } + + /* + * If we haven't bypassed the preload, this means we can either serve + * the request out of the preload or the request that enables us to do + * so is in flight... 
+ */ + if (rda_can_serve_readdirp(ctx, size)) { + ret = __rda_serve_readdirp(this, ctx, size, &entries, &op_errno); + serve = _gf_true; + + if (op_errno == ENOENT && + !((ctx->state & RDA_FD_EOD) && (ctx->cur_size == 0))) + op_errno = 0; + } else { + ctx->stub = fop_readdirp_stub(frame, NULL, fd, size, off, xdata); + if (!ctx->stub) { + UNLOCK(&ctx->lock); + goto err; + } + + if (!(ctx->state & RDA_FD_RUNNING)) { + fill = 1; + if (!ctx->xattrs) + ctx->xattrs = dict_ref(xdata); + ctx->state |= RDA_FD_RUNNING; + } + } + + UNLOCK(&ctx->lock); + + if (serve) { + STACK_UNWIND_STRICT(readdirp, frame, ret, op_errno, &entries, xdata); + gf_dirent_free(&entries); + } + + if (fill) + rda_fill_fd(frame, this, fd); + + return 0; + +bypass: + STACK_WIND(frame, default_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL); + return 0; +} + +static int32_t +rda_fill_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + gf_dirent_t *dirent = NULL; + gf_dirent_t *tmp = NULL; + gf_dirent_t serve_entries; + struct rda_local *local = frame->local; + struct rda_fd_ctx *ctx = local->ctx; + struct rda_priv *priv = this->private; + int fill = 1; + size_t dirent_size = 0; + int ret = 0; + gf_boolean_t serve = _gf_false; + call_stub_t *stub = NULL; + char gfid[GF_UUID_BUF_SIZE] = { + 0, + }; + uint64_t generation = 0; + call_frame_t *fill_frame = NULL; + + INIT_LIST_HEAD(&serve_entries.list); + LOCK(&ctx->lock); + + /* Verify that the preload buffer is still pending on this data. */ + if (ctx->next_offset != local->offset) { + gf_msg(this->name, GF_LOG_ERROR, 0, READDIR_AHEAD_MSG_OUT_OF_SEQUENCE, + "Out of sequence directory preload."); + ctx->state |= (RDA_FD_BYPASS | RDA_FD_ERROR); + ctx->op_errno = EUCLEAN; + + goto out; + } + + if (entries) { + list_for_each_entry_safe(dirent, tmp, &entries->list, list) + { + list_del_init(&dirent->list); + + /* must preserve entry order */ + list_add_tail(&dirent->list, &ctx->entries.list); + if (dirent->inode) { + /* If ctxp->stat is invalidated, don't update it + * with dirent->d_stat as we don't have + * generation number of the inode when readdirp + * request was initiated. 
So, we pass 0 for + * generation number + */ + + generation = -1; + if (ctx->writes_during_prefetch) { + memset(gfid, 0, sizeof(gfid)); + uuid_utoa_r(dirent->inode->gfid, gfid); + if (dict_get(ctx->writes_during_prefetch, gfid)) + generation = 0; + } + + if (!((strcmp(dirent->d_name, ".") == 0) || + (strcmp(dirent->d_name, "..") == 0))) { + rda_inode_ctx_update_iatts(dirent->inode, this, + &dirent->d_stat, &dirent->d_stat, + generation); + } + } + + dirent_size = gf_dirent_size(dirent->d_name); + + ctx->cur_size += dirent_size; + + GF_ATOMIC_ADD(priv->rda_cache_size, dirent_size); + + ctx->next_offset = dirent->d_off; + } + } + + if (ctx->writes_during_prefetch) { + dict_unref(ctx->writes_during_prefetch); + ctx->writes_during_prefetch = NULL; + } + + GF_ATOMIC_DEC(ctx->prefetching); + + if (ctx->cur_size >= priv->rda_high_wmark) + ctx->state &= ~RDA_FD_PLUGGED; + + if (!op_ret || op_errno == ENOENT) { + /* we've hit eod */ + ctx->state &= ~RDA_FD_RUNNING; + ctx->state |= RDA_FD_EOD; + ctx->op_errno = op_errno; + } else if (op_ret == -1) { + /* kill the preload and pend the error */ + ctx->state &= ~RDA_FD_RUNNING; + ctx->state |= RDA_FD_ERROR; + ctx->op_errno = op_errno; + } + + /* + * NOTE: The strict bypass logic in readdirp() means a pending request + * is always based on ctx->cur_offset. + */ + if (ctx->stub && rda_can_serve_readdirp(ctx, ctx->stub->args.size)) { + ret = __rda_serve_readdirp(this, ctx, ctx->stub->args.size, + &serve_entries, &op_errno); + serve = _gf_true; + stub = ctx->stub; + ctx->stub = NULL; + } + +out: + /* + * If we have been marked for bypass and have no pending stub, clear the + * run state so we stop preloading the context with entries. + */ + if (!ctx->stub && + ((ctx->state & RDA_FD_BYPASS) || + GF_ATOMIC_GET(priv->rda_cache_size) > priv->rda_cache_limit)) + ctx->state &= ~RDA_FD_RUNNING; + + if (!(ctx->state & RDA_FD_RUNNING)) { + fill = 0; + if (ctx->xattrs) { + /* + * fill = 0 and hence rda_fill_fd() won't be invoked. + * unref for ref taken in rda_fill_fd() + */ + dict_unref(ctx->xattrs); + ctx->xattrs = NULL; + } + + fill_frame = ctx->fill_frame; + ctx->fill_frame = NULL; + } + + if (op_errno == ENOENT && + !((ctx->state & RDA_FD_EOD) && (ctx->cur_size == 0))) + op_errno = 0; + + UNLOCK(&ctx->lock); + if (fill_frame) { + rda_local_wipe(fill_frame->local); + STACK_DESTROY(fill_frame->root); + } + + if (serve) { + STACK_UNWIND_STRICT(readdirp, stub->frame, ret, op_errno, + &serve_entries, xdata); + gf_dirent_free(&serve_entries); + call_stub_destroy(stub); + } + + if (fill) + rda_fill_fd(frame, this, local->fd); + + return 0; +} + +/* + * Start prepopulating the fd context with directory entries. 
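When rda_readdirp cannot serve a request immediately it parks the request as a call stub, and rda_fill_fd_cbk resumes it once enough entries have been preloaded. The GlusterFS stub machinery (fop_readdirp_stub, call_resume) handles the real bookkeeping; the sketch below only mirrors the park-and-resume idea with a locally defined stand-in, so every name in it is illustrative.

#include <stdio.h>
#include <stdlib.h>

/* Minimal stand-in for a parked request: remember what was asked for and
 * how to answer it later. */
struct pending_req {
    size_t size;
    void (*resume)(struct pending_req *req, size_t available);
};

static void
answer(struct pending_req *req, size_t available)
{
    printf("serving parked request for %zu bytes (%zu buffered)\n",
           req->size, available);
}

int
main(void)
{
    struct pending_req *stub = NULL;
    size_t buffered = 0;

    /* A request arrives while the buffer is still filling: park it. */
    stub = malloc(sizeof(*stub));
    if (!stub)
        return 1;
    stub->size = 4096;
    stub->resume = answer;

    /* Later, the fill callback notices it can satisfy the parked request. */
    buffered = 8192;
    if (stub && buffered >= stub->size) {
        stub->resume(stub, buffered);
        free(stub);
        stub = NULL;
    }
    return 0;
}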
+ */ +static int +rda_fill_fd(call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + call_frame_t *nframe = NULL; + struct rda_local *local = NULL; + struct rda_local *orig_local = frame->local; + struct rda_fd_ctx *ctx; + off_t offset; + struct rda_priv *priv = this->private; + + ctx = get_rda_fd_ctx(fd, this); + if (!ctx) + goto err; + + LOCK(&ctx->lock); + + if (ctx->state & RDA_FD_NEW) { + ctx->state &= ~RDA_FD_NEW; + ctx->state |= RDA_FD_RUNNING; + if (priv->rda_low_wmark) + ctx->state |= RDA_FD_PLUGGED; + } + + offset = ctx->next_offset; + + if (!ctx->fill_frame) { + nframe = copy_frame(frame); + if (!nframe) { + UNLOCK(&ctx->lock); + goto err; + } + + local = mem_get0(this->local_pool); + if (!local) { + UNLOCK(&ctx->lock); + goto err; + } + + local->ctx = ctx; + local->fd = fd_ref(fd); + nframe->local = local; + + ctx->fill_frame = nframe; + + if (!ctx->xattrs && orig_local && orig_local->xattrs) { + /* when this function is invoked by rda_opendir_cbk */ + ctx->xattrs = dict_ref(orig_local->xattrs); + } + } else { + nframe = ctx->fill_frame; + local = nframe->local; + } + + local->offset = offset; + GF_ATOMIC_INC(ctx->prefetching); + + UNLOCK(&ctx->lock); + + STACK_WIND(nframe, rda_fill_fd_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, priv->rda_req_size, + offset, ctx->xattrs); + + return 0; + +err: + if (nframe) { + rda_local_wipe(nframe->local); + FRAME_DESTROY(nframe); + } + + return -1; +} + +static int32_t +rda_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + if (!op_ret) + rda_fill_fd(frame, this, fd); + + RDA_STACK_UNWIND(opendir, frame, op_ret, op_errno, fd, xdata); + return 0; +} + +static int32_t +rda_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + int op_errno = 0; + struct rda_local *local = NULL; + + if (xdata) { + local = mem_get0(this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto unwind; + } + + /* + * Retrieve list of keys set by md-cache xlator and store it + * in local to be consumed in rda_opendir_cbk + */ + local->xattrs = dict_copy_with_ref(xdata, NULL); + frame->local = local; + } + + STACK_WIND(frame, rda_opendir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, loc, fd, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(opendir, frame, -1, op_errno, fd, xdata); + return 0; +} + +static int32_t +rda_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + + rda_mark_inode_dirty(this, local->inode); + + rda_inode_ctx_update_iatts(local->inode, this, postbuf, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t off, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(writev, frame, this, fd->inode, xdata, fd, + vector, count, off, flags, iobref); + return 0; +} + +static int32_t +rda_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + 
+ if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, postbuf, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(fallocate, frame, op_ret, op_errno, prebuf, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size, + off_t offset, size_t len, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(fallocate, frame, this, fd->inode, xdata, fd, + keep_size, offset, len); + return 0; +} + +static int32_t +rda_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, postbuf, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(zerofill, frame, this, fd->inode, xdata, fd, + offset, len); + return 0; +} + +static int32_t +rda_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, postbuf, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(discard, frame, this, fd->inode, xdata, fd, + offset, len); + return 0; +} + +static int32_t +rda_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, postbuf, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(ftruncate, frame, this, fd->inode, xdata, fd, + offset); + return 0; +} + +static int32_t +rda_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, postbuf, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, &postbuf_out, + xdata); + return 0; +} + +static int32_t 
+rda_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(truncate, frame, this, loc->inode, xdata, loc, + offset); + return 0; +} + +static int32_t +rda_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + struct rda_local *local = NULL; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, NULL, NULL, + local->generation); +unwind: + RDA_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata); + return 0; +} + +static int32_t +rda_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(setxattr, frame, this, loc->inode, xdata, loc, + dict, flags); + return 0; +} + +static int32_t +rda_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + struct rda_local *local = NULL; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, NULL, NULL, + local->generation); +unwind: + RDA_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata); + return 0; +} + +static int32_t +rda_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(fsetxattr, frame, this, fd->inode, xdata, fd, + dict, flags); + return 0; +} + +static int32_t +rda_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, statpost, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(setattr, frame, op_ret, op_errno, statpre, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(setattr, frame, this, loc->inode, xdata, loc, + stbuf, valid); + return 0; +} + +static int32_t +rda_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, statpost, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, statpre, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(fsetattr, frame, this, fd->inode, xdata, fd, + stbuf, valid); + return 0; +} + +static int32_t +rda_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + struct rda_local *local = NULL; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, NULL, NULL, + 
local->generation); +unwind: + RDA_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +static int32_t +rda_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(removexattr, frame, this, loc->inode, xdata, + loc, name); + return 0; +} + +static int32_t +rda_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + struct rda_local *local = NULL; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, NULL, NULL, + local->generation); +unwind: + RDA_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +static int32_t +rda_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(fremovexattr, frame, this, fd->inode, xdata, fd, + name); + return 0; +} + +static int32_t +rda_releasedir(xlator_t *this, fd_t *fd) +{ + uint64_t val; + struct rda_fd_ctx *ctx; + + if (fd_ctx_del(fd, this, &val) < 0) + return -1; + + ctx = (struct rda_fd_ctx *)(uintptr_t)val; + if (!ctx) + return 0; + + rda_reset_ctx(this, ctx); + + if (ctx->fill_frame) + STACK_DESTROY(ctx->fill_frame->root); + + if (ctx->stub) + gf_msg(this->name, GF_LOG_ERROR, 0, + READDIR_AHEAD_MSG_DIR_RELEASE_PENDING_STUB, + "released a directory with a pending stub"); + + GF_FREE(ctx); + return 0; +} + +static int +rda_forget(xlator_t *this, inode_t *inode) +{ + uint64_t ctx_uint = 0; + rda_inode_ctx_t *ctx = NULL; + + inode_ctx_del1(inode, this, &ctx_uint); + if (!ctx_uint) + return 0; + + ctx = (rda_inode_ctx_t *)(uintptr_t)ctx_uint; + + GF_FREE(ctx); + + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + if (!this) + goto out; + + ret = xlator_mem_acct_init(this, gf_rda_mt_end + 1); + + if (ret != 0) + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, READDIR_AHEAD_MSG_NO_MEMORY, + "Memory accounting init" + "failed"); + +out: + return ret; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + struct rda_priv *priv = this->private; + + GF_OPTION_RECONF("rda-request-size", priv->rda_req_size, options, + size_uint64, err); + GF_OPTION_RECONF("rda-low-wmark", priv->rda_low_wmark, options, size_uint64, + err); + GF_OPTION_RECONF("rda-high-wmark", priv->rda_high_wmark, options, + size_uint64, err); + GF_OPTION_RECONF("rda-cache-limit", priv->rda_cache_limit, options, + size_uint64, err); + GF_OPTION_RECONF("parallel-readdir", priv->parallel_readdir, options, bool, + err); + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, err); + + return 0; +err: + return -1; +} + +int +init(xlator_t *this) +{ + struct rda_priv *priv = NULL; + + GF_VALIDATE_OR_GOTO("readdir-ahead", this, err); + + if (!this->children || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, 0, + READDIR_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED, + "FATAL: readdir-ahead not configured with exactly one" + " child"); + goto err; + } + + if (!this->parents) { + gf_msg(this->name, GF_LOG_WARNING, 0, + READDIR_AHEAD_MSG_VOL_MISCONFIGURED, + "dangling volume. 
check volfile "); + } + + priv = GF_CALLOC(1, sizeof(struct rda_priv), gf_rda_mt_rda_priv); + if (!priv) + goto err; + this->private = priv; + + GF_ATOMIC_INIT(priv->rda_cache_size, 0); + + this->local_pool = mem_pool_new(struct rda_local, 32); + if (!this->local_pool) + goto err; + + GF_OPTION_INIT("rda-request-size", priv->rda_req_size, size_uint64, err); + GF_OPTION_INIT("rda-low-wmark", priv->rda_low_wmark, size_uint64, err); + GF_OPTION_INIT("rda-high-wmark", priv->rda_high_wmark, size_uint64, err); + GF_OPTION_INIT("rda-cache-limit", priv->rda_cache_limit, size_uint64, err); + GF_OPTION_INIT("parallel-readdir", priv->parallel_readdir, bool, err); + GF_OPTION_INIT("pass-through", this->pass_through, bool, err); + + return 0; + +err: + if (this->local_pool) + mem_pool_destroy(this->local_pool); + if (priv) + GF_FREE(priv); + + return -1; +} + +void +fini(xlator_t *this) +{ + GF_VALIDATE_OR_GOTO("readdir-ahead", this, out); + + GF_FREE(this->private); + +out: + return; +} + +struct xlator_fops fops = { + .opendir = rda_opendir, + .readdirp = rda_readdirp, + /* inode write */ + /* TODO: invalidate a dentry's stats if its pointing to a directory + * when entry operations happen in that directory + */ + .writev = rda_writev, + .truncate = rda_truncate, + .ftruncate = rda_ftruncate, + .fallocate = rda_fallocate, + .discard = rda_discard, + .zerofill = rda_zerofill, + /* metadata write */ + .setxattr = rda_setxattr, + .fsetxattr = rda_fsetxattr, + .setattr = rda_setattr, + .fsetattr = rda_fsetattr, + .removexattr = rda_removexattr, + .fremovexattr = rda_fremovexattr, +}; + +struct xlator_cbks cbks = { + .releasedir = rda_releasedir, + .forget = rda_forget, +}; + +struct volume_options options[] = { + { + .key = {"readdir-ahead"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable readdir-ahead", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + { + .key = {"rda-request-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 4096, + .max = 131072, + .default_value = "131072", + .description = "size of buffer in readdirp calls initiated by " + "readdir-ahead ", + }, + { + .key = {"rda-low-wmark"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = 10 * GF_UNIT_MB, + .default_value = "4096", + .description = "the value under which readdir-ahead plugs", + }, + { + .key = {"rda-high-wmark"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = 100 * GF_UNIT_MB, + .default_value = "128KB", + .description = "the value over which readdir-ahead unplugs", + }, + { + .key = {"rda-cache-limit"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = INFINITY, + .default_value = "10MB", + .description = "maximum size of cache consumed by readdir-ahead " + "xlator. This value is global and total memory " + "consumption by readdir-ahead is capped by this " + "value, irrespective of the number/size of " + "directories cached", + }, + {.key = {"parallel-readdir"}, + .type = GF_OPTION_TYPE_BOOL, + .op_version = {GD_OP_VERSION_3_10_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .default_value = "off", + .description = "If this option is enabled, the readdir operation " + "is performed in parallel on all the bricks, thus " + "improving the performance of readdir. 
Note that " + "the performance improvement is higher in large " + "clusters"}, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"readdir-ahead"}, + .description = "Enable/Disable readdir ahead translator"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "readdir-ahead", + .category = GF_MAINTAINED, +}; diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.h b/xlators/performance/readdir-ahead/src/readdir-ahead.h new file mode 100644 index 00000000000..619c41059ff --- /dev/null +++ b/xlators/performance/readdir-ahead/src/readdir-ahead.h @@ -0,0 +1,98 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __READDIR_AHEAD_H +#define __READDIR_AHEAD_H + +/* state flags */ +#define RDA_FD_NEW (1 << 0) +#define RDA_FD_RUNNING (1 << 1) +#define RDA_FD_EOD (1 << 2) +#define RDA_FD_ERROR (1 << 3) +#define RDA_FD_BYPASS (1 << 4) +#define RDA_FD_PLUGGED (1 << 5) + +#define RDA_COMMON_MODIFICATION_FOP(name, frame, this, __inode, __xdata, \ + args...) \ + do { \ + struct rda_local *__local = NULL; \ + rda_inode_ctx_t *ctx_p = NULL; \ + \ + __local = mem_get0(this->local_pool); \ + __local->inode = inode_ref(__inode); \ + LOCK(&__inode->lock); \ + { \ + ctx_p = __rda_inode_ctx_get(__inode, this); \ + } \ + UNLOCK(&__inode->lock); \ + __local->generation = GF_ATOMIC_GET(ctx_p->generation); \ + \ + frame->local = __local; \ + if (__xdata) \ + __local->xattrs = dict_ref(__xdata); \ + \ + STACK_WIND(frame, rda_##name##_cbk, FIRST_CHILD(this), \ + FIRST_CHILD(this)->fops->name, args, __xdata); \ + } while (0) + +#define RDA_STACK_UNWIND(fop, frame, params...) 
\ + do { \ + struct rda_local *__local = NULL; \ + if (frame) { \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + if (__local) { \ + rda_local_wipe(__local); \ + mem_put(__local); \ + } \ + } while (0) + +struct rda_fd_ctx { + off_t cur_offset; /* current head of the ctx */ + size_t cur_size; /* current size of the preload */ + off_t next_offset; /* tail of the ctx */ + uint32_t state; + gf_lock_t lock; + gf_dirent_t entries; + call_frame_t *fill_frame; + call_stub_t *stub; + int op_errno; + dict_t *xattrs; /* md-cache keys to be sent in readdirp() */ + dict_t *writes_during_prefetch; + gf_atomic_t prefetching; +}; + +struct rda_local { + struct rda_fd_ctx *ctx; + fd_t *fd; + dict_t *xattrs; /* md-cache keys to be sent in readdirp() */ + inode_t *inode; + off_t offset; + uint64_t generation; + int32_t skip_dir; +}; + +struct rda_priv { + uint64_t rda_req_size; + uint64_t rda_low_wmark; + uint64_t rda_high_wmark; + uint64_t rda_cache_limit; + gf_atomic_t rda_cache_size; + gf_boolean_t parallel_readdir; +}; + +typedef struct rda_inode_ctx { + struct iatt statbuf; + gf_atomic_t generation; +} rda_inode_ctx_t; + +#endif /* __READDIR_AHEAD_H */ diff --git a/xlators/performance/symlink-cache/src/Makefile.am b/xlators/performance/symlink-cache/src/Makefile.am deleted file mode 100644 index 4091c329325..00000000000 --- a/xlators/performance/symlink-cache/src/Makefile.am +++ /dev/null @@ -1,13 +0,0 @@ -xlator_LTLIBRARIES = symlink-cache.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/performance - -symlink_cache_la_LDFLAGS = -module -avoid-version - -symlink_cache_la_SOURCES = symlink-cache.c -symlink_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src - -AM_CFLAGS = -Wall $(GF_CFLAGS) - -CLEANFILES = diff --git a/xlators/performance/symlink-cache/src/symlink-cache.c b/xlators/performance/symlink-cache/src/symlink-cache.c deleted file mode 100644 index 3b5fbc252ec..00000000000 --- a/xlators/performance/symlink-cache/src/symlink-cache.c +++ /dev/null @@ -1,399 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "list.h" -#include "compat.h" -#include "compat-errno.h" -#include "common-utils.h" - -struct symlink_cache { - time_t ctime; - char *readlink; -}; - - -static int -symlink_inode_ctx_get (inode_t *inode, xlator_t *this, void **ctx) -{ - int ret = 0; - uint64_t tmp_ctx = 0; - ret = inode_ctx_get (inode, this, &tmp_ctx); - if (-1 == ret) - gf_log (this->name, GF_LOG_ERROR, "dict get failed"); - else - *ctx = (void *)(long)tmp_ctx; - - return 0; -} - - -static int -symlink_inode_ctx_set (inode_t *inode, xlator_t *this, void *ctx) -{ - int ret = 0; - ret = inode_ctx_put (inode, this, (uint64_t)(long) ctx); - if (-1 == ret) - gf_log (this->name, GF_LOG_ERROR, "dict set failed"); - - return 0; -} - - -int -sc_cache_update (xlator_t *this, inode_t *inode, const char *link) -{ - struct symlink_cache *sc = NULL; - - symlink_inode_ctx_get (inode, this, VOID(&sc)); - if (!sc) - return 0; - - if (!sc->readlink) { - gf_log (this->name, GF_LOG_DEBUG, - "updating cache: %s", link); - - sc->readlink = strdup (link); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "not updating existing cache: %s with %s", - sc->readlink, link); - } - - return 0; -} - - -int -sc_cache_set (xlator_t *this, inode_t *inode, struct iatt *buf, - const char *link) -{ - struct symlink_cache *sc = NULL; - int ret = -1; - int need_set = 0; - - - symlink_inode_ctx_get (inode, this, VOID(&sc)); - if (!sc) { - need_set = 1; - sc = CALLOC (1, sizeof (*sc)); - if (!sc) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - goto err; - } - } - - if (sc->readlink) { - gf_log (this->name, GF_LOG_DEBUG, - "replacing old cache: %s with new cache: %s", - sc->readlink, link); - FREE (sc->readlink); - sc->readlink = NULL; - } - - if (link) { - sc->readlink = strdup (link); - if (!sc->readlink) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - goto err; - } - } - - sc->ctime = buf->ia_ctime; - - gf_log (this->name, GF_LOG_DEBUG, - "setting symlink cache: %s", link); - - if (need_set) { - ret = symlink_inode_ctx_set (inode, this, sc); - - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set inode context (%s)", - strerror (-ret)); - goto err; - } - } - - return 0; -err: - - if (sc) { - FREE (sc->readlink); - sc->readlink = NULL; - FREE (sc); - } - - return -1; -} - - -int -sc_cache_flush (xlator_t *this, inode_t *inode) -{ - struct symlink_cache *sc = NULL; - - symlink_inode_ctx_get (inode, this, VOID(&sc)); - if (!sc) - return 0; - - if (sc->readlink) { - gf_log (this->name, GF_LOG_DEBUG, - "flushing cache: %s", sc->readlink); - - FREE (sc->readlink); - sc->readlink = NULL; - } - - FREE (sc); - - return 0; -} - - -int -sc_cache_validate (xlator_t *this, inode_t *inode, struct iatt *buf) -{ - struct symlink_cache *sc = NULL; - uint64_t tmp_sc = 0; - - if (!IA_ISLNK (buf->ia_type)) { - sc_cache_flush (this, inode); - return 0; - } - - symlink_inode_ctx_get (inode, this, VOID(&sc)); - - if (!sc) { - sc_cache_set (this, inode, buf, NULL); - inode_ctx_get (inode, this, &tmp_sc); - - if (!sc) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - return 0; - } - sc = (struct symlink_cache *)(long)tmp_sc; - } - - if (sc->ctime == buf->ia_ctime) - return 0; - - /* STALE */ - if (sc->readlink) { - gf_log (this->name, GF_LOG_DEBUG, - "flushing cache: %s", sc->readlink); - - FREE (sc->readlink); - sc->readlink = NULL; - } - - sc->ctime = 
buf->ia_ctime; - - return 0; -} - - - -int -sc_cache_get (xlator_t *this, inode_t *inode, char **link) -{ - struct symlink_cache *sc = NULL; - - symlink_inode_ctx_get (inode, this, VOID(&sc)); - - if (!sc) - return 0; - - if (link && sc->readlink) - *link = strdup (sc->readlink); - return 0; -} - - -int -sc_readlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - const char *link, struct iatt *sbuf, dict_t *xdata) -{ - if (op_ret > 0) - sc_cache_update (this, frame->local, link); - - inode_unref (frame->local); - frame->local = NULL; - - STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, link, sbuf, - xdata); - return 0; -} - - -int -sc_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size, dict_t *xdata) -{ - char *link = NULL; - struct iatt buf = {0, }; - - sc_cache_get (this, loc->inode, &link); - - if (link) { - /* cache hit */ - gf_log (this->name, GF_LOG_DEBUG, - "cache hit %s -> %s", - loc->path, link); - - /* - libglusterfsclient, nfs or any other translators - using buf in readlink_cbk should be aware that @buf - is 0 filled - */ - STACK_UNWIND_STRICT (readlink, frame, strlen (link), 0, link, - &buf, NULL); - FREE (link); - return 0; - } - - frame->local = inode_ref (loc->inode); - - STACK_WIND (frame, sc_readlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readlink, - loc, size, xdata); - - return 0; -} - - -int -sc_symlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - if (op_ret == 0) { - if (frame->local) { - sc_cache_set (this, inode, buf, frame->local); - } - } - - STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); - return 0; -} - - -int -sc_symlink (call_frame_t *frame, xlator_t *this, - const char *dst, loc_t *src, mode_t umask, dict_t *xdata) -{ - frame->local = strdup (dst); - - STACK_WIND (frame, sc_symlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->symlink, - dst, src, umask, xdata); - - return 0; -} - - -int -sc_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct iatt *buf, dict_t *xdata, - struct iatt *postparent) -{ - if (op_ret == 0) - sc_cache_validate (this, inode, buf); - else - sc_cache_flush (this, inode); - - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, - xdata, postparent); - return 0; -} - - -int -sc_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xdata) -{ - STACK_WIND (frame, sc_lookup_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, - loc, xdata); - - return 0; -} - - -int -sc_forget (xlator_t *this, - inode_t *inode) -{ - sc_cache_flush (this, inode); - - return 0; -} - - -int32_t -init (xlator_t *this) -{ - if (!this->children || this->children->next) - { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: volume (%s) not configured with exactly one " - "child", this->name); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. 
check volfile "); - } - - return 0; -} - - -void -fini (xlator_t *this) -{ - return; -} - - -struct xlator_fops fops = { - .lookup = sc_lookup, - .symlink = sc_symlink, - .readlink = sc_readlink, -}; - - -struct xlator_cbks cbks = { - .forget = sc_forget, -}; - -struct volume_options options[] = { - { .key = {NULL} }, -}; diff --git a/xlators/performance/write-behind/src/Makefile.am b/xlators/performance/write-behind/src/Makefile.am index 6c829d8ee36..a6a16fcc080 100644 --- a/xlators/performance/write-behind/src/Makefile.am +++ b/xlators/performance/write-behind/src/Makefile.am @@ -1,14 +1,15 @@ xlator_LTLIBRARIES = write-behind.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -write_behind_la_LDFLAGS = -module -avoid-version +write_behind_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) write_behind_la_SOURCES = write-behind.c write_behind_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = write-behind-mem-types.h +noinst_HEADERS = write-behind-mem-types.h write-behind-messages.h -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/performance/write-behind/src/write-behind-mem-types.h b/xlators/performance/write-behind/src/write-behind-mem-types.h index f64f429ce22..a0647299150 100644 --- a/xlators/performance/write-behind/src/write-behind-mem-types.h +++ b/xlators/performance/write-behind/src/write-behind-mem-types.h @@ -8,19 +8,17 @@ cases as published by the Free Software Foundation. */ - #ifndef __WB_MEM_TYPES_H__ #define __WB_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_wb_mem_types_ { - gf_wb_mt_wb_file_t = gf_common_mt_end + 1, - gf_wb_mt_wb_request_t, - gf_wb_mt_iovec, - gf_wb_mt_wb_conf_t, - gf_wb_mt_wb_inode_t, - gf_wb_mt_end + gf_wb_mt_wb_file_t = gf_common_mt_end + 1, + gf_wb_mt_wb_request_t, + gf_wb_mt_iovec, + gf_wb_mt_wb_conf_t, + gf_wb_mt_wb_inode_t, + gf_wb_mt_end }; #endif - diff --git a/xlators/performance/write-behind/src/write-behind-messages.h b/xlators/performance/write-behind/src/write-behind-messages.h new file mode 100644 index 00000000000..e9ea474879b --- /dev/null +++ b/xlators/performance/write-behind/src/write-behind-messages.h @@ -0,0 +1,31 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _WRITE_BEHIND_MESSAGES_H_ +#define _WRITE_BEHIND_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
+ */ + +GLFS_MSGID(WRITE_BEHIND, WRITE_BEHIND_MSG_EXCEEDED_MAX_SIZE, + WRITE_BEHIND_MSG_INIT_FAILED, WRITE_BEHIND_MSG_INVALID_ARGUMENT, + WRITE_BEHIND_MSG_NO_MEMORY, WRITE_BEHIND_MSG_SIZE_NOT_SET, + WRITE_BEHIND_MSG_VOL_MISCONFIGURED, + WRITE_BEHIND_MSG_RES_UNAVAILABLE); + +#endif /* _WRITE_BEHIND_MESSAGES_H_ */ diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c index e01a227f30e..00cfca016e6 100644 --- a/xlators/performance/write-behind/src/write-behind.c +++ b/xlators/performance/write-behind/src/write-behind.c @@ -8,222 +8,258 @@ cases as published by the Free Software Foundation. */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "list.h" -#include "compat.h" -#include "compat-errno.h" -#include "common-utils.h" -#include "call-stub.h" -#include "statedump.h" -#include "defaults.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/list.h> +#include <glusterfs/compat.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/statedump.h> +#include <glusterfs/defaults.h> #include "write-behind-mem-types.h" +#include "write-behind-messages.h" -#define MAX_VECTOR_COUNT 8 -#define WB_AGGREGATE_SIZE 131072 /* 128 KB */ -#define WB_WINDOW_SIZE 1048576 /* 1MB */ +#define MAX_VECTOR_COUNT 8 +#define WB_AGGREGATE_SIZE 131072 /* 128 KB */ +#define WB_WINDOW_SIZE 1048576 /* 1MB */ typedef struct list_head list_head_t; struct wb_conf; struct wb_inode; typedef struct wb_inode { - ssize_t window_conf; - ssize_t window_current; - ssize_t transit; /* size of data stack_wound, and yet - to be fulfilled (wb_fulfill_cbk). - used for trickling_writes - */ - - list_head_t all; /* All requests, from enqueue() till destroy(). - Used only for resetting generation - number when empty. - */ - list_head_t todo; /* Work to do (i.e, STACK_WIND to server). - Once we STACK_WIND, the entry is taken - off the list. If it is non-sync write, - then we continue to track it via @liability - or @temptation depending on the status - of its writeback. - */ - list_head_t liability; /* Non-sync writes which are lied - (STACK_UNWIND'ed to caller) but ack - from server not yet complete. This - is the "liability" which we hold, and - must guarantee that dependent operations - which arrive later (which overlap, etc.) - are issued only after their dependencies - in this list are "fulfilled". - - Server acks for entries in this list - shrinks the window. - - The sum total of all req->write_size - of entries in this list must be kept less - than the permitted window size. - */ - list_head_t temptation; /* Operations for which we are tempted - to 'lie' (write-behind), but temporarily - holding off (because of insufficient - window capacity, etc.) - - This is the list to look at to grow - the window (in __wb_pick_unwinds()). - - Entries typically get chosen from - write-behind from this list, and therefore - get "upgraded" to the "liability" list. - */ - list_head_t wip; /* List of write calls in progress, SYNC or non-SYNC - which are currently STACK_WIND'ed towards the server. - This is for guaranteeing that no two overlapping - writes are in progress at the same time. Modules - like eager-lock in AFR depend on this behavior. - */ - uint64_t gen; /* Liability generation number. 
Represents - the current 'state' of liability. Every - new addition to the liability list bumps - the generation number. - - a newly arrived request is only required - to perform causal checks against the entries - in the liability list which were present - at the time of its addition. the generation - number at the time of its addition is stored - in the request and used during checks. - - the liability list can grow while the request - waits in the todo list waiting for its - dependent operations to complete. however - it is not of the request's concern to depend - itself on those new entries which arrived - after it arrived (i.e, those that have a - liability generation higher than itself) - */ - gf_lock_t lock; - xlator_t *this; -} wb_inode_t; + ssize_t window_conf; + ssize_t window_current; + ssize_t transit; /* size of data stack_wound, and yet + to be fulfilled (wb_fulfill_cbk). + used for trickling_writes + */ + + list_head_t all; /* All requests, from enqueue() till destroy(). + Used only for resetting generation + number when empty. + */ + list_head_t todo; /* Work to do (i.e, STACK_WIND to server). + Once we STACK_WIND, the entry is taken + off the list. If it is non-sync write, + then we continue to track it via @liability + or @temptation depending on the status + of its writeback. + */ + list_head_t liability; /* Non-sync writes which are lied + (STACK_UNWIND'ed to caller) but ack + from server not yet complete. This + is the "liability" which we hold, and + must guarantee that dependent operations + which arrive later (which overlap, etc.) + are issued only after their dependencies + in this list are "fulfilled". + + Server acks for entries in this list + shrinks the window. + + The sum total of all req->write_size + of entries in this list must be kept less + than the permitted window size. + */ + list_head_t temptation; /* Operations for which we are tempted + to 'lie' (write-behind), but temporarily + holding off (because of insufficient + window capacity, etc.) + + This is the list to look at to grow + the window (in __wb_pick_unwinds()). + + Entries typically get chosen from + write-behind from this list, and therefore + get "upgraded" to the "liability" list. + */ + list_head_t wip; /* List of write calls in progress, SYNC or non-SYNC + which are currently STACK_WIND'ed towards the server. + This is for guaranteeing that no two overlapping + writes are in progress at the same time. Modules + like eager-lock in AFR depend on this behavior. + */ + list_head_t invalidate_list; /* list of wb_inodes that were marked for + * iatt invalidation due to requests in + * liability queue fulfilled while there + * was a readdirp session on parent + * directory. For a directory inode, this + * list points to list of children. + */ + uint64_t gen; /* Liability generation number. Represents + the current 'state' of liability. Every + new addition to the liability list bumps + the generation number. + + a newly arrived request is only required + to perform causal checks against the entries + in the liability list which were present + at the time of its addition. the generation + number at the time of its addition is stored + in the request and used during checks. + + the liability list can grow while the request + waits in the todo list waiting for its + dependent operations to complete. 
however + it is not of the request's concern to depend + itself on those new entries which arrived + after it arrived (i.e, those that have a + liability generation higher than itself) + */ + size_t size; /* Size of the file to catch write after EOF. */ + gf_lock_t lock; + xlator_t *this; + inode_t *inode; + int dontsync; /* If positive, don't pick lies for + * winding. This is needed to break infinite + * recursion during invocation of + * wb_process_queue from + * wb_fulfill_cbk in case of an + * error during fulfill. + */ + gf_atomic_int32_t readdirps; + gf_atomic_int8_t invalidate; +} wb_inode_t; typedef struct wb_request { - list_head_t all; - list_head_t todo; - list_head_t lie; /* either in @liability or @temptation */ - list_head_t winds; - list_head_t unwinds; - list_head_t wip; - - call_stub_t *stub; - - ssize_t write_size; /* currently held size - (after collapsing) */ - size_t orig_size; /* size which arrived with the request. - This is the size by which we grow - the window when unwinding the frame. - */ - size_t total_size; /* valid only in @head in wb_fulfill(). - This is the size with which we perform - STACK_WIND to server and therefore the - amount by which we shrink the window. - */ - - int op_ret; - int op_errno; - - int32_t refcount; - wb_inode_t *wb_inode; - glusterfs_fop_t fop; - gf_lkowner_t lk_owner; - struct iobref *iobref; - uint64_t gen; /* inode liability state at the time of - request arrival */ - - fd_t *fd; - struct { - size_t size; /* 0 size == till infinity */ - off_t off; - int append:1; /* offset is invalid. only one - outstanding append at a time */ - int tempted:1; /* true only for non-sync writes */ - int lied:1; /* sin committed */ - int fulfilled:1; /* got server acknowledgement */ - int go:1; /* enough aggregating, good to go */ - } ordering; + list_head_t all; + list_head_t todo; + list_head_t lie; /* either in @liability or @temptation */ + list_head_t winds; + list_head_t unwinds; + list_head_t wip; + + call_stub_t *stub; + + ssize_t write_size; /* currently held size + (after collapsing) */ + size_t orig_size; /* size which arrived with the request. + This is the size by which we grow + the window when unwinding the frame. + */ + size_t total_size; /* valid only in @head in wb_fulfill(). + This is the size with which we perform + STACK_WIND to server and therefore the + amount by which we shrink the window. + */ + + int op_ret; + int op_errno; + + int32_t refcount; + wb_inode_t *wb_inode; + glusterfs_fop_t fop; + gf_lkowner_t lk_owner; + pid_t client_pid; + struct iobref *iobref; + uint64_t gen; /* inode liability state at the time of + request arrival */ + + fd_t *fd; + int wind_count; /* number of sync-attempts. Only + for debug purposes */ + struct { + size_t size; /* 0 size == till infinity */ + off_t off; + int append : 1; /* offset is invalid. only one + outstanding append at a time */ + int tempted : 1; /* true only for non-sync writes */ + int lied : 1; /* sin committed */ + int fulfilled : 1; /* got server acknowledgement */ + int go : 1; /* enough aggregating, good to go */ + } ordering; + + /* for debug purposes. A request might outlive the fop it is + * representing. So, preserve essential info for logging. 
+ */ + uint64_t unique; + uuid_t gfid; } wb_request_t; - typedef struct wb_conf { - uint64_t aggregate_size; - uint64_t window_size; - gf_boolean_t flush_behind; - gf_boolean_t trickling_writes; - gf_boolean_t strict_write_ordering; - gf_boolean_t strict_O_DIRECT; + uint64_t aggregate_size; + uint64_t page_size; + uint64_t window_size; + gf_boolean_t flush_behind; + gf_boolean_t trickling_writes; + gf_boolean_t strict_write_ordering; + gf_boolean_t strict_O_DIRECT; + gf_boolean_t resync_after_fsync; } wb_conf_t; - -void -wb_process_queue (wb_inode_t *wb_inode); - - wb_inode_t * -__wb_inode_ctx_get (xlator_t *this, inode_t *inode) +__wb_inode_ctx_get(xlator_t *this, inode_t *inode) { - uint64_t value = 0; - wb_inode_t *wb_inode = NULL; + uint64_t value = 0; + wb_inode_t *wb_inode = NULL; + int ret = 0; - __inode_ctx_get (inode, this, &value); - wb_inode = (wb_inode_t *)(unsigned long) value; + ret = __inode_ctx_get(inode, this, &value); + if (ret) + return NULL; - return wb_inode; -} + wb_inode = (wb_inode_t *)(unsigned long)value; + return wb_inode; +} wb_inode_t * -wb_inode_ctx_get (xlator_t *this, inode_t *inode) +wb_inode_ctx_get(xlator_t *this, inode_t *inode) { - wb_inode_t *wb_inode = NULL; + wb_inode_t *wb_inode = NULL; - GF_VALIDATE_OR_GOTO ("write-behind", this, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO("write-behind", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - LOCK (&inode->lock); - { - wb_inode = __wb_inode_ctx_get (this, inode); - } - UNLOCK (&inode->lock); + LOCK(&inode->lock); + { + wb_inode = __wb_inode_ctx_get(this, inode); + } + UNLOCK(&inode->lock); out: - return wb_inode; + return wb_inode; } - -gf_boolean_t -wb_fd_err (fd_t *fd, xlator_t *this, int32_t *op_errno) +static void +wb_set_invalidate(wb_inode_t *wb_inode) { - gf_boolean_t err = _gf_false; - uint64_t value = 0; - int32_t tmp = 0; + int readdirps = 0; + inode_t *parent_inode = NULL; + wb_inode_t *wb_parent_inode = NULL; - if (fd_ctx_get (fd, this, &value) == 0) { - if (op_errno) { - tmp = value; - *op_errno = tmp; - } + parent_inode = inode_parent(wb_inode->inode, NULL, NULL); + if (parent_inode) + wb_parent_inode = wb_inode_ctx_get(wb_inode->this, parent_inode); - err = _gf_true; + if (wb_parent_inode) { + LOCK(&wb_parent_inode->lock); + { + readdirps = GF_ATOMIC_GET(wb_parent_inode->readdirps); + if (readdirps && list_empty(&wb_inode->invalidate_list)) { + inode_ref(wb_inode->inode); + GF_ATOMIC_INIT(wb_inode->invalidate, 1); + list_add(&wb_inode->invalidate_list, + &wb_parent_inode->invalidate_list); + } } + UNLOCK(&wb_parent_inode->lock); + } else { + GF_ATOMIC_INIT(wb_inode->invalidate, 0); + } + + if (parent_inode) + inode_unref(parent_inode); - return err; + return; } +void +wb_process_queue(wb_inode_t *wb_inode); /* Below is a succinct explanation of the code deciding whether two regions @@ -250,1868 +286,2993 @@ wb_fd_err (fd_t *fd, xlator_t *this, int32_t *op_errno) */ gf_boolean_t -wb_requests_overlap (wb_request_t *req1, wb_request_t *req2) +wb_requests_overlap(wb_request_t *req1, wb_request_t *req2) { - uint64_t r1_start = 0; - uint64_t r1_end = 0; - uint64_t r2_start = 0; - uint64_t r2_end = 0; - enum _gf_boolean do_overlap = 0; - - r1_start = req1->ordering.off; - if (req1->ordering.size) - r1_end = r1_start + req1->ordering.size - 1; - else - r1_end = ULLONG_MAX; - - r2_start = req2->ordering.off; - if (req2->ordering.size) - r2_end = r2_start + req2->ordering.size - 1; - else - r2_end = ULLONG_MAX; - - do_overlap = ((r1_end >= r2_start) && 
(r2_end >= r1_start)); - - return do_overlap; + uint64_t r1_start = 0; + uint64_t r1_end = 0; + uint64_t r2_start = 0; + uint64_t r2_end = 0; + gf_boolean_t do_overlap = _gf_false; + + r1_start = req1->ordering.off; + if (req1->ordering.size) + r1_end = r1_start + req1->ordering.size - 1; + else + r1_end = ULLONG_MAX; + + r2_start = req2->ordering.off; + if (req2->ordering.size) + r2_end = r2_start + req2->ordering.size - 1; + else + r2_end = ULLONG_MAX; + + do_overlap = ((r1_end >= r2_start) && (r2_end >= r1_start)); + + return do_overlap; } - gf_boolean_t -wb_requests_conflict (wb_request_t *lie, wb_request_t *req) +wb_requests_conflict(wb_request_t *lie, wb_request_t *req) { - wb_conf_t *conf = NULL; + wb_conf_t *conf = NULL; - conf = req->wb_inode->this->private; + conf = req->wb_inode->this->private; - if (lie == req) - /* request cannot conflict with itself */ - return _gf_false; + if (lie == req) + /* request cannot conflict with itself */ + return _gf_false; - if (lie->gen >= req->gen) - /* this liability entry was behind - us in the todo list */ - return _gf_false; + if (lie->gen >= req->gen) + /* this liability entry was behind + us in the todo list */ + return _gf_false; - if (lie->ordering.append) - /* all modifications wait for the completion - of outstanding append */ - return _gf_true; + if (lie->ordering.append) + /* all modifications wait for the completion + of outstanding append */ + return _gf_true; - if (conf->strict_write_ordering) - /* We are sure (lie->gen < req->gen) by now. So - skip overlap check if strict write ordering is - requested and always return "conflict" against a - lower generation lie. */ - return _gf_true; + if (conf->strict_write_ordering) + /* We are sure (lie->gen < req->gen) by now. So + skip overlap check if strict write ordering is + requested and always return "conflict" against a + lower generation lie. */ + return _gf_true; - return wb_requests_overlap (lie, req); + return wb_requests_overlap(lie, req); } - -gf_boolean_t -wb_liability_has_conflict (wb_inode_t *wb_inode, wb_request_t *req) +wb_request_t * +wb_liability_has_conflict(wb_inode_t *wb_inode, wb_request_t *req) { - wb_request_t *each = NULL; - - list_for_each_entry (each, &wb_inode->liability, lie) { - if (wb_requests_conflict (each, req)) - return _gf_true; - } - - return _gf_false; + wb_request_t *each = NULL; + + list_for_each_entry(each, &wb_inode->liability, lie) + { + if (wb_requests_conflict(each, req) && (!each->ordering.fulfilled)) + /* A fulfilled request shouldn't block another + * request (even a dependent one) from winding. + */ + return each; + } + + return NULL; } - -gf_boolean_t -wb_wip_has_conflict (wb_inode_t *wb_inode, wb_request_t *req) +wb_request_t * +wb_wip_has_conflict(wb_inode_t *wb_inode, wb_request_t *req) { - wb_request_t *each = NULL; + wb_request_t *each = NULL; - if (req->stub->fop != GF_FOP_WRITE) - /* non-writes fundamentally never conflict with WIP requests */ - return _gf_false; + if (req->stub->fop != GF_FOP_WRITE) + /* non-writes fundamentally never conflict with WIP requests */ + return NULL; - list_for_each_entry (each, &wb_inode->wip, wip) { - if (each == req) - /* request never conflicts with itself, - though this condition should never occur. - */ - continue; + list_for_each_entry(each, &wb_inode->wip, wip) + { + if (each == req) + /* request never conflicts with itself, + though this condition should never occur. 
+ */ + continue; - if (wb_requests_overlap (each, req)) - return _gf_true; - } + if (wb_requests_overlap(each, req)) + return each; + } - return _gf_false; + return NULL; } - static int -__wb_request_unref (wb_request_t *req) +__wb_request_unref(wb_request_t *req) { - int ret = -1; - wb_inode_t *wb_inode = NULL; - - wb_inode = req->wb_inode; - - if (req->refcount <= 0) { - gf_log ("wb-request", GF_LOG_WARNING, - "refcount(%d) is <= 0", req->refcount); - goto out; + int ret = -1; + wb_inode_t *wb_inode = NULL; + char gfid[64] = { + 0, + }; + + wb_inode = req->wb_inode; + + if (req->refcount <= 0) { + uuid_utoa_r(req->gfid, gfid); + + gf_msg( + "wb-request", GF_LOG_WARNING, 0, WRITE_BEHIND_MSG_RES_UNAVAILABLE, + "(unique=%" PRIu64 ", fop=%s, gfid=%s, gen=%" PRIu64 + "): " + "refcount(%d) is <= 0 ", + req->unique, gf_fop_list[req->fop], gfid, req->gen, req->refcount); + goto out; + } + + ret = --req->refcount; + if (req->refcount == 0) { + uuid_utoa_r(req->gfid, gfid); + + gf_log_callingfn(wb_inode->this->name, GF_LOG_DEBUG, + "(unique = %" PRIu64 + ", fop=%s, gfid=%s, " + "gen=%" PRIu64 + "): destroying request, " + "removing from all queues", + req->unique, gf_fop_list[req->fop], gfid, req->gen); + + list_del_init(&req->todo); + list_del_init(&req->lie); + list_del_init(&req->wip); + + list_del_init(&req->all); + if (list_empty(&wb_inode->all)) { + wb_inode->gen = 0; + /* in case of accounting errors? */ + wb_inode->window_current = 0; } - ret = --req->refcount; - if (req->refcount == 0) { - list_del_init (&req->todo); - list_del_init (&req->lie); - list_del_init (&req->wip); - - list_del_init (&req->all); - if (list_empty (&wb_inode->all)) { - wb_inode->gen = 0; - /* in case of accounting errors? */ - wb_inode->window_current = 0; - } - - list_del_init (&req->winds); - list_del_init (&req->unwinds); + list_del_init(&req->winds); + list_del_init(&req->unwinds); - if (req->stub && req->ordering.tempted) { - call_stub_destroy (req->stub); - req->stub = NULL; - } /* else we would have call_resume()'ed */ + if (req->stub) { + call_stub_destroy(req->stub); + req->stub = NULL; + } - if (req->iobref) - iobref_unref (req->iobref); + if (req->iobref) + iobref_unref(req->iobref); - if (req->fd) - fd_unref (req->fd); + if (req->fd) + fd_unref(req->fd); - GF_FREE (req); - } + GF_FREE(req); + } out: - return ret; + return ret; } - static int -wb_request_unref (wb_request_t *req) +wb_request_unref(wb_request_t *req) { - wb_inode_t *wb_inode = NULL; - int ret = -1; + wb_inode_t *wb_inode = NULL; + int ret = -1; - GF_VALIDATE_OR_GOTO ("write-behind", req, out); + GF_VALIDATE_OR_GOTO("write-behind", req, out); - wb_inode = req->wb_inode; + wb_inode = req->wb_inode; - LOCK (&wb_inode->lock); - { - ret = __wb_request_unref (req); - } - UNLOCK (&wb_inode->lock); + LOCK(&wb_inode->lock); + { + ret = __wb_request_unref(req); + } + UNLOCK(&wb_inode->lock); out: - return ret; + return ret; } - static wb_request_t * -__wb_request_ref (wb_request_t *req) +__wb_request_ref(wb_request_t *req) { - GF_VALIDATE_OR_GOTO ("write-behind", req, out); + GF_VALIDATE_OR_GOTO("write-behind", req, out); - if (req->refcount < 0) { - gf_log ("wb-request", GF_LOG_WARNING, - "refcount(%d) is < 0", req->refcount); - req = NULL; - goto out; - } + if (req->refcount < 0) { + gf_msg("wb-request", GF_LOG_WARNING, 0, + WRITE_BEHIND_MSG_RES_UNAVAILABLE, "refcount(%d) is < 0", + req->refcount); + req = NULL; + goto out; + } - req->refcount++; + req->refcount++; out: - return req; + return req; } - wb_request_t * -wb_request_ref (wb_request_t 
*req) +wb_request_ref(wb_request_t *req) { - wb_inode_t *wb_inode = NULL; + wb_inode_t *wb_inode = NULL; - GF_VALIDATE_OR_GOTO ("write-behind", req, out); + GF_VALIDATE_OR_GOTO("write-behind", req, out); - wb_inode = req->wb_inode; - LOCK (&wb_inode->lock); - { - req = __wb_request_ref (req); - } - UNLOCK (&wb_inode->lock); + wb_inode = req->wb_inode; + LOCK(&wb_inode->lock); + { + req = __wb_request_ref(req); + } + UNLOCK(&wb_inode->lock); out: - return req; + return req; } - gf_boolean_t -wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted) -{ - wb_request_t *req = NULL; - - GF_VALIDATE_OR_GOTO ("write-behind", wb_inode, out); - GF_VALIDATE_OR_GOTO (wb_inode->this->name, stub, out); - - req = GF_CALLOC (1, sizeof (*req), gf_wb_mt_wb_request_t); - if (!req) - goto out; - - INIT_LIST_HEAD (&req->all); - INIT_LIST_HEAD (&req->todo); - INIT_LIST_HEAD (&req->lie); - INIT_LIST_HEAD (&req->winds); - INIT_LIST_HEAD (&req->unwinds); - INIT_LIST_HEAD (&req->wip); - - req->stub = stub; - req->wb_inode = wb_inode; - req->fop = stub->fop; - req->ordering.tempted = tempted; - - if (stub->fop == GF_FOP_WRITE) { - req->write_size = iov_length (stub->args.vector, - stub->args.count); - - /* req->write_size can change as we collapse - small writes. But the window needs to grow - only by how much we acknowledge the app. so - copy the original size in orig_size for the - purpose of accounting. - */ - req->orig_size = req->write_size; - - /* Let's be optimistic that we can - lie about it - */ - req->op_ret = req->write_size; - req->op_errno = 0; - - if (stub->args.fd->flags & O_APPEND) - req->ordering.append = 1; - } - - req->lk_owner = stub->frame->root->lk_owner; - - switch (stub->fop) { - case GF_FOP_WRITE: - req->ordering.off = stub->args.offset; - req->ordering.size = req->write_size; - - req->fd = fd_ref (stub->args.fd); - - break; - case GF_FOP_READ: - req->ordering.off = stub->args.offset; - req->ordering.size = stub->args.size; - - req->fd = fd_ref (stub->args.fd); - - break; - case GF_FOP_TRUNCATE: - req->ordering.off = stub->args.offset; - req->ordering.size = 0; /* till infinity */ - break; - case GF_FOP_FTRUNCATE: - req->ordering.off = stub->args.offset; - req->ordering.size = 0; /* till infinity */ - - req->fd = fd_ref (stub->args.fd); - - break; - default: - break; - } - - LOCK (&wb_inode->lock); - { - list_add_tail (&req->all, &wb_inode->all); - - req->gen = wb_inode->gen; - - list_add_tail (&req->todo, &wb_inode->todo); - __wb_request_ref (req); /* for wind */ +wb_enqueue_common(wb_inode_t *wb_inode, call_stub_t *stub, int tempted) +{ + wb_request_t *req = NULL; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO("write-behind", wb_inode, out); + GF_VALIDATE_OR_GOTO(wb_inode->this->name, stub, out); + + req = GF_CALLOC(1, sizeof(*req), gf_wb_mt_wb_request_t); + if (!req) + goto out; + + INIT_LIST_HEAD(&req->all); + INIT_LIST_HEAD(&req->todo); + INIT_LIST_HEAD(&req->lie); + INIT_LIST_HEAD(&req->winds); + INIT_LIST_HEAD(&req->unwinds); + INIT_LIST_HEAD(&req->wip); + + req->stub = stub; + req->wb_inode = wb_inode; + req->fop = stub->fop; + req->ordering.tempted = tempted; + req->unique = stub->frame->root->unique; + + inode = ((stub->args.fd != NULL) ? stub->args.fd->inode + : stub->args.loc.inode); + + if (inode) + gf_uuid_copy(req->gfid, inode->gfid); + + if (stub->fop == GF_FOP_WRITE) { + req->write_size = iov_length(stub->args.vector, stub->args.count); + + /* req->write_size can change as we collapse + small writes. 
But the window needs to grow + only by how much we acknowledge the app. so + copy the original size in orig_size for the + purpose of accounting. + */ + req->orig_size = req->write_size; + + /* Let's be optimistic that we can + lie about it + */ + req->op_ret = req->write_size; + req->op_errno = 0; + + if (stub->args.fd && (stub->args.fd->flags & O_APPEND)) + req->ordering.append = 1; + } + + req->lk_owner = stub->frame->root->lk_owner; + req->client_pid = stub->frame->root->pid; + + switch (stub->fop) { + case GF_FOP_WRITE: + LOCK(&wb_inode->lock); + { + if (wb_inode->size < stub->args.offset) { + req->ordering.off = wb_inode->size; + req->ordering.size = stub->args.offset + req->write_size - + wb_inode->size; + } else { + req->ordering.off = stub->args.offset; + req->ordering.size = req->write_size; + } - if (req->ordering.tempted) { - list_add_tail (&req->lie, &wb_inode->temptation); - __wb_request_ref (req); /* for unwind */ - } + if (wb_inode->size < stub->args.offset + req->write_size) + wb_inode->size = stub->args.offset + req->write_size; + } + UNLOCK(&wb_inode->lock); + + req->fd = fd_ref(stub->args.fd); + + break; + case GF_FOP_READ: + req->ordering.off = stub->args.offset; + req->ordering.size = stub->args.size; + + req->fd = fd_ref(stub->args.fd); + + break; + case GF_FOP_TRUNCATE: + req->ordering.off = stub->args.offset; + req->ordering.size = 0; /* till infinity */ + LOCK(&wb_inode->lock); + { + wb_inode->size = req->ordering.off; + } + UNLOCK(&wb_inode->lock); + break; + case GF_FOP_FTRUNCATE: + req->ordering.off = stub->args.offset; + req->ordering.size = 0; /* till infinity */ + LOCK(&wb_inode->lock); + { + wb_inode->size = req->ordering.off; + } + UNLOCK(&wb_inode->lock); + + req->fd = fd_ref(stub->args.fd); + + break; + default: + if (stub && stub->args.fd) + req->fd = fd_ref(stub->args.fd); + + break; + } + + LOCK(&wb_inode->lock); + { + list_add_tail(&req->all, &wb_inode->all); + + req->gen = wb_inode->gen; + + list_add_tail(&req->todo, &wb_inode->todo); + __wb_request_ref(req); /* for wind */ + + if (req->ordering.tempted) { + list_add_tail(&req->lie, &wb_inode->temptation); + __wb_request_ref(req); /* for unwind */ } - UNLOCK (&wb_inode->lock); + } + UNLOCK(&wb_inode->lock); out: - if (!req) - return _gf_false; + if (!req) + return _gf_false; - return _gf_true; + return _gf_true; } - gf_boolean_t -wb_enqueue (wb_inode_t *wb_inode, call_stub_t *stub) +wb_enqueue(wb_inode_t *wb_inode, call_stub_t *stub) { - return wb_enqueue_common (wb_inode, stub, 0); + return wb_enqueue_common(wb_inode, stub, 0); } - gf_boolean_t -wb_enqueue_tempted (wb_inode_t *wb_inode, call_stub_t *stub) +wb_enqueue_tempted(wb_inode_t *wb_inode, call_stub_t *stub) { - return wb_enqueue_common (wb_inode, stub, 1); + return wb_enqueue_common(wb_inode, stub, 1); } - wb_inode_t * -__wb_inode_create (xlator_t *this, inode_t *inode) +__wb_inode_create(xlator_t *this, inode_t *inode) { - wb_inode_t *wb_inode = NULL; - wb_conf_t *conf = NULL; + wb_inode_t *wb_inode = NULL; + wb_conf_t *conf = NULL; + int ret = 0; - GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - conf = this->private; + conf = this->private; - wb_inode = GF_CALLOC (1, sizeof (*wb_inode), gf_wb_mt_wb_inode_t); - if (!wb_inode) - goto out; + wb_inode = GF_CALLOC(1, sizeof(*wb_inode), gf_wb_mt_wb_inode_t); + if (!wb_inode) + goto out; - INIT_LIST_HEAD (&wb_inode->all); - INIT_LIST_HEAD (&wb_inode->todo); - INIT_LIST_HEAD (&wb_inode->liability); - INIT_LIST_HEAD (&wb_inode->temptation); - 
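A minimal standalone sketch of the conflict rule described in the struct comments above and applied by wb_requests_conflict()/wb_requests_overlap(): a liability can only block requests of a newer generation, an outstanding append blocks everything, and otherwise only overlapping byte ranges conflict. The demo_* names and the simplified struct below are illustrative stand-ins, not the xlator's real types, and the strict-write-ordering shortcut is omitted for brevity.

/* Simplified model of the liability conflict check; demo_req stands in
 * for wb_request_t and carries only the fields the check needs. */
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct demo_req {
    uint64_t gen;  /* inode liability generation recorded at enqueue time */
    uint64_t off;  /* start of the affected region */
    uint64_t size; /* 0 means "till infinity" (e.g. a truncate) */
    int append;    /* O_APPEND write: offset is not meaningful */
};

static bool
demo_overlap(const struct demo_req *a, const struct demo_req *b)
{
    uint64_t a_end = a->size ? a->off + a->size - 1 : ULLONG_MAX;
    uint64_t b_end = b->size ? b->off + b->size - 1 : ULLONG_MAX;

    return (a_end >= b->off) && (b_end >= a->off);
}

/* A liability only blocks requests that arrived after it (i.e. that carry a
 * higher generation); an outstanding append blocks every later request. */
static bool
demo_conflict(const struct demo_req *lie, const struct demo_req *req)
{
    if (lie->gen >= req->gen)
        return false;
    if (lie->append)
        return true;
    return demo_overlap(lie, req);
}

int
main(void)
{
    struct demo_req lie = {.gen = 1, .off = 0, .size = 4096, .append = 0};
    struct demo_req req = {.gen = 2, .off = 2048, .size = 4096, .append = 0};

    printf("conflicts: %s\n", demo_conflict(&lie, &req) ? "yes" : "no");
    return 0;
}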
INIT_LIST_HEAD (&wb_inode->wip); + INIT_LIST_HEAD(&wb_inode->all); + INIT_LIST_HEAD(&wb_inode->todo); + INIT_LIST_HEAD(&wb_inode->liability); + INIT_LIST_HEAD(&wb_inode->temptation); + INIT_LIST_HEAD(&wb_inode->wip); + INIT_LIST_HEAD(&wb_inode->invalidate_list); - wb_inode->this = this; + wb_inode->this = this; - wb_inode->window_conf = conf->window_size; + wb_inode->window_conf = conf->window_size; + wb_inode->inode = inode; - LOCK_INIT (&wb_inode->lock); + LOCK_INIT(&wb_inode->lock); + GF_ATOMIC_INIT(wb_inode->invalidate, 0); + GF_ATOMIC_INIT(wb_inode->readdirps, 0); - __inode_ctx_put (inode, this, (uint64_t)(unsigned long)wb_inode); + ret = __inode_ctx_put(inode, this, (uint64_t)(unsigned long)wb_inode); + if (ret) { + GF_FREE(wb_inode); + wb_inode = NULL; + } out: - return wb_inode; + return wb_inode; } - wb_inode_t * -wb_inode_create (xlator_t *this, inode_t *inode) +wb_inode_create(xlator_t *this, inode_t *inode) { - wb_inode_t *wb_inode = NULL; + wb_inode_t *wb_inode = NULL; - GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - LOCK (&inode->lock); - { - wb_inode = __wb_inode_ctx_get (this, inode); - if (!wb_inode) - wb_inode = __wb_inode_create (this, inode); - } - UNLOCK (&inode->lock); + LOCK(&inode->lock); + { + wb_inode = __wb_inode_ctx_get(this, inode); + if (!wb_inode) + wb_inode = __wb_inode_create(this, inode); + } + UNLOCK(&inode->lock); out: - return wb_inode; + return wb_inode; } - void -wb_inode_destroy (wb_inode_t *wb_inode) +wb_inode_destroy(wb_inode_t *wb_inode) { - GF_VALIDATE_OR_GOTO ("write-behind", wb_inode, out); + GF_VALIDATE_OR_GOTO("write-behind", wb_inode, out); + + GF_ASSERT(list_empty(&wb_inode->todo)); + GF_ASSERT(list_empty(&wb_inode->liability)); + GF_ASSERT(list_empty(&wb_inode->temptation)); - LOCK_DESTROY (&wb_inode->lock); - GF_FREE (wb_inode); + LOCK_DESTROY(&wb_inode->lock); + GF_FREE(wb_inode); out: - return; + return; } - void -__wb_fulfill_request (wb_request_t *req) +__wb_fulfill_request(wb_request_t *req) { - wb_inode_t *wb_inode = NULL; + wb_inode_t *wb_inode = NULL; + char gfid[64] = { + 0, + }; + + wb_inode = req->wb_inode; + + req->ordering.fulfilled = 1; + wb_inode->window_current -= req->total_size; + wb_inode->transit -= req->total_size; + + uuid_utoa_r(req->gfid, gfid); + + gf_log_callingfn(wb_inode->this->name, GF_LOG_DEBUG, + "(unique=%" PRIu64 + ", fop=%s, gfid=%s, " + "gen=%" PRIu64 + "): request fulfilled. " + "removing the request from liability queue? = %s", + req->unique, gf_fop_list[req->fop], gfid, req->gen, + req->ordering.lied ? "yes" : "no"); + + if (req->ordering.lied) { + /* 1. If yes, request is in liability queue and hence can be + safely removed from list. + 2. 
If no, request is in temptation queue and hence should be + left in the queue so that wb_pick_unwinds picks it up + */ + list_del_init(&req->lie); + } else { + /* TODO: fail the req->frame with error if + necessary + */ + } + + list_del_init(&req->wip); + __wb_request_unref(req); +} - wb_inode = req->wb_inode; +/* get a flush/fsync waiting on req */ +wb_request_t * +__wb_request_waiting_on(wb_request_t *req) +{ + wb_inode_t *wb_inode = NULL; + wb_request_t *trav = NULL; - req->ordering.fulfilled = 1; - wb_inode->window_current -= req->total_size; - wb_inode->transit -= req->total_size; + wb_inode = req->wb_inode; - if (!req->ordering.lied) { - /* TODO: fail the req->frame with error if - necessary - */ - } + list_for_each_entry(trav, &wb_inode->todo, todo) + { + if (((trav->stub->fop == GF_FOP_FLUSH) || + (trav->stub->fop == GF_FOP_FSYNC)) && + (trav->gen >= req->gen)) + return trav; + } - __wb_request_unref (req); + return NULL; } - void -wb_head_done (wb_request_t *head) +__wb_add_request_for_retry(wb_request_t *req) { - wb_request_t *req = NULL; - wb_request_t *tmp = NULL; - wb_inode_t *wb_inode = NULL; + wb_inode_t *wb_inode = NULL; - wb_inode = head->wb_inode; + if (!req) + goto out; - LOCK (&wb_inode->lock); - { - list_for_each_entry_safe (req, tmp, &head->winds, winds) { - __wb_fulfill_request (req); - } - __wb_fulfill_request (head); - } - UNLOCK (&wb_inode->lock); -} + wb_inode = req->wb_inode; + /* response was unwound and no waiter waiting on this request, retry + till a flush or fsync (subject to conf->resync_after_fsync). + */ + wb_inode->transit -= req->total_size; -void -wb_fulfill_err (wb_request_t *head, int op_errno) -{ - wb_inode_t *wb_inode; - wb_request_t *req; + req->total_size = 0; - wb_inode = head->wb_inode; + list_del_init(&req->winds); + list_del_init(&req->todo); + list_del_init(&req->wip); - /* for all future requests yet to arrive */ - fd_ctx_set (head->fd, THIS, op_errno); + /* sanitize ordering flags to retry */ + req->ordering.go = 0; - LOCK (&wb_inode->lock); - { - /* for all requests already arrived */ - list_for_each_entry (req, &wb_inode->all, all) { - if (req->fd != head->fd) - continue; - req->op_ret = -1; - req->op_errno = op_errno; - } - } - UNLOCK (&wb_inode->lock); -} + /* Add back to todo list to retry */ + list_add(&req->todo, &wb_inode->todo); +out: + return; +} -int -wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +void +__wb_add_head_for_retry(wb_request_t *head) { - wb_inode_t *wb_inode = NULL; - wb_request_t *head = NULL; - - head = frame->local; - frame->local = NULL; - - wb_inode = head->wb_inode; + wb_request_t *req = NULL, *tmp = NULL; - if (op_ret == -1) { - wb_fulfill_err (head, op_errno); - } else if (op_ret < head->total_size) { - /* - * We've encountered a short write, for whatever reason. - * Set an EIO error for the next fop. This should be - * valid for writev or flush (close). - * - * TODO: Retry the write so we can potentially capture - * a real error condition (i.e., ENOSPC). 
- */ - wb_fulfill_err (head, EIO); - } + if (!head) + goto out; - wb_head_done (head); + list_for_each_entry_safe_reverse(req, tmp, &head->winds, winds) + { + __wb_add_request_for_retry(req); + } - wb_process_queue (wb_inode); + __wb_add_request_for_retry(head); - STACK_DESTROY (frame->root); - - return 0; +out: + return; } +void +wb_add_head_for_retry(wb_request_t *head) +{ + if (!head) + goto out; -#define WB_IOV_LOAD(vec, cnt, req, head) do { \ - memcpy (&vec[cnt], req->stub->args.vector, \ - (req->stub->args.count * sizeof(vec[0]))); \ - cnt += req->stub->args.count; \ - head->total_size += req->write_size; \ - } while (0) + LOCK(&head->wb_inode->lock); + { + __wb_add_head_for_retry(head); + } + UNLOCK(&head->wb_inode->lock); +out: + return; +} void -wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head) +__wb_fulfill_request_err(wb_request_t *req, int32_t op_errno) { - struct iovec vector[MAX_VECTOR_COUNT]; - int count = 0; - wb_request_t *req = NULL; - call_frame_t *frame = NULL; - gf_boolean_t fderr = _gf_false; - xlator_t *this = NULL; + wb_inode_t *wb_inode = NULL; + wb_request_t *waiter = NULL; + wb_conf_t *conf = NULL; + + wb_inode = req->wb_inode; + + conf = wb_inode->this->private; + + req->op_ret = -1; + req->op_errno = op_errno; + + if (req->ordering.lied) + waiter = __wb_request_waiting_on(req); + + if (!req->ordering.lied || waiter) { + if (!req->ordering.lied) { + /* response to app is still pending, send failure in + * response. + */ + } else { + /* response was sent, store the error in a + * waiter (either an fsync or flush). + */ + waiter->op_ret = -1; + waiter->op_errno = op_errno; + } - this = THIS; + if (!req->ordering.lied || (waiter->stub->fop == GF_FOP_FLUSH) || + ((waiter->stub->fop == GF_FOP_FSYNC) && + !conf->resync_after_fsync)) { + /* No retry needed, forget the request */ + __wb_fulfill_request(req); + return; + } + } - /* make sure head->total_size is updated before we run into any - * errors - */ + __wb_add_request_for_retry(req); - WB_IOV_LOAD (vector, count, head, head); + return; +} - list_for_each_entry (req, &head->winds, winds) { - WB_IOV_LOAD (vector, count, req, head); +void +wb_head_done(wb_request_t *head) +{ + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + wb_inode_t *wb_inode = NULL; - iobref_merge (head->stub->args.iobref, - req->stub->args.iobref); - } + wb_inode = head->wb_inode; - if (wb_fd_err (head->fd, this, NULL)) { - fderr = _gf_true; - goto err; + LOCK(&wb_inode->lock); + { + list_for_each_entry_safe(req, tmp, &head->winds, winds) + { + __wb_fulfill_request(req); } - frame = create_frame (wb_inode->this, wb_inode->this->ctx->pool); - if (!frame) - goto err; + __wb_fulfill_request(head); + } + UNLOCK(&wb_inode->lock); +} - frame->root->lk_owner = head->lk_owner; - frame->local = head; +void +__wb_fulfill_err(wb_request_t *head, int op_errno) +{ + wb_request_t *req = NULL, *tmp = NULL; - LOCK (&wb_inode->lock); - { - wb_inode->transit += head->total_size; - } - UNLOCK (&wb_inode->lock); + if (!head) + goto out; - STACK_WIND (frame, wb_fulfill_cbk, FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->writev, - head->fd, vector, count, - head->stub->args.offset, - head->stub->args.flags, - head->stub->args.iobref, NULL); + head->wb_inode->dontsync++; - return; -err: - if (!fderr) { - /* frame creation failure */ - wb_fulfill_err (head, ENOMEM); - } + list_for_each_entry_safe_reverse(req, tmp, &head->winds, winds) + { + __wb_fulfill_request_err(req, op_errno); + } - wb_head_done (head); + 
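The error path in __wb_fulfill_request_err() above boils down to a small decision: if the response is still pending, fail it now; if the write was already acknowledged but a FLUSH (or an FSYNC without resync-after-fsync) is waiting behind it, deliver the error there and forget the request; otherwise keep the cached data and retry the sync later. A compilable sketch of that decision, using the hypothetical demo_* names below (they are not part of the xlator):

#include <stdbool.h>
#include <stdio.h>

struct demo_failed_write {
    bool lied;               /* already unwound (acked) to the application? */
    bool waiter_is_flush;    /* a FLUSH is queued behind this write */
    bool waiter_is_fsync;    /* an FSYNC is queued behind this write */
    bool resync_after_fsync; /* volume option: keep retrying past an fsync */
};

/* Returns true if the failed write should be re-queued and retried, false if
 * the error is reported (to the pending response or to the waiting
 * FLUSH/FSYNC) and the request forgotten. */
static bool
demo_should_retry(const struct demo_failed_write *w)
{
    if (!w->lied)
        return false; /* fail the still-pending response */
    if (w->waiter_is_flush)
        return false; /* error is delivered via close/flush */
    if (w->waiter_is_fsync && !w->resync_after_fsync)
        return false; /* error is delivered via fsync */
    return true;      /* keep the cached data, retry the sync later */
}

int
main(void)
{
    struct demo_failed_write w = {
        .lied = true, .waiter_is_fsync = true, .resync_after_fsync = true};

    printf("retry after failed sync: %s\n",
           demo_should_retry(&w) ? "yes" : "no");
    return 0;
}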
__wb_fulfill_request_err(head, op_errno); - return; +out: + return; } +void +wb_fulfill_err(wb_request_t *head, int op_errno) +{ + wb_inode_t *wb_inode = NULL; -#define NEXT_HEAD(head, req) do { \ - if (head) \ - wb_fulfill_head (wb_inode, head); \ - head = req; \ - expected_offset = req->stub->args.offset + \ - req->write_size; \ - curr_aggregate = 0; \ - vector_count = 0; \ - } while (0) + wb_inode = head->wb_inode; + LOCK(&wb_inode->lock); + { + __wb_fulfill_err(head, op_errno); + } + UNLOCK(&wb_inode->lock); +} void -wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities) +__wb_modify_write_request(wb_request_t *req, int synced_size) { - wb_request_t *req = NULL; - wb_request_t *head = NULL; - wb_request_t *tmp = NULL; - wb_conf_t *conf = NULL; - off_t expected_offset = 0; - size_t curr_aggregate = 0; - size_t vector_count = 0; + struct iovec *vector = NULL; + int count = 0; - conf = wb_inode->this->private; + if (!req || synced_size == 0) + goto out; - list_for_each_entry_safe (req, tmp, liabilities, winds) { - list_del_init (&req->winds); + req->write_size -= synced_size; + req->stub->args.offset += synced_size; - if (!head) { - NEXT_HEAD (head, req); - continue; - } + vector = req->stub->args.vector; + count = req->stub->args.count; - if (req->fd != head->fd) { - NEXT_HEAD (head, req); - continue; - } + req->stub->args.count = iov_skip(vector, count, synced_size); - if (!is_same_lkowner (&req->lk_owner, &head->lk_owner)) { - NEXT_HEAD (head, req); - continue; - } - - if (expected_offset != req->stub->args.offset) { - NEXT_HEAD (head, req); - continue; - } +out: + return; +} - if ((curr_aggregate + req->write_size) > conf->aggregate_size) { - NEXT_HEAD (head, req); - continue; - } +int +__wb_fulfill_short_write(wb_request_t *req, int size, gf_boolean_t *fulfilled) +{ + int accounted_size = 0; - if (vector_count + req->stub->args.count > - MAX_VECTOR_COUNT) { - NEXT_HEAD (head, req); - continue; - } + if (req == NULL) + goto out; - list_add_tail (&req->winds, &head->winds); - curr_aggregate += req->write_size; - vector_count += req->stub->args.count; - } + if (req->write_size <= size) { + accounted_size = req->write_size; + __wb_fulfill_request(req); + *fulfilled = 1; + } else { + accounted_size = size; + __wb_modify_write_request(req, size); + *fulfilled = 0; + } - if (head) - wb_fulfill_head (wb_inode, head); - return; +out: + return accounted_size; } - void -wb_do_unwinds (wb_inode_t *wb_inode, list_head_t *lies) +wb_fulfill_short_write(wb_request_t *head, int size) { - wb_request_t *req = NULL; - wb_request_t *tmp = NULL; - call_frame_t *frame = NULL; - struct iatt buf = {0, }; + wb_inode_t *wb_inode = NULL; + wb_request_t *req = NULL, *next = NULL; + int accounted_size = 0; + gf_boolean_t fulfilled = _gf_false; + + if (!head) + goto out; + + wb_inode = head->wb_inode; + + req = head; + + LOCK(&wb_inode->lock); + { + /* hold a reference to head so that __wb_fulfill_short_write + * won't free it. We need head for a cleaner list traversal as + * list_for_each_entry_safe doesn't iterate over "head" member. + * So, if we pass "next->winds" as head to list_for_each_entry, + * "next" is skipped. For a simpler logic we need to traverse + * the list in the order. So, we start traversal from + * "head->winds" and hence we want head to be alive. 
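__wb_fulfill_short_write()/wb_fulfill_short_write() in this hunk walk the aggregated chain in wind order and charge the bytes the server actually wrote against each request: fully covered requests are fulfilled, the first partially covered one keeps its remainder (its stub offset and vector are trimmed) and is retried. A small sketch of that accounting, using hypothetical demo_* names and plain byte counters instead of stubs and iovecs:

#include <stdio.h>

struct demo_chunk {
    int write_size; /* bytes of this request not yet on the server */
    int fulfilled;  /* fully covered by the short write? */
};

/* Consume 'synced' bytes along the chain (head first, then the chained
 * requests); a partially covered chunk keeps its remainder for a retry. */
static void
demo_account_short_write(struct demo_chunk *chain, int count, int synced)
{
    int i;

    for (i = 0; i < count && synced > 0; i++) {
        if (chain[i].write_size <= synced) {
            synced -= chain[i].write_size;
            chain[i].write_size = 0;
            chain[i].fulfilled = 1;
        } else {
            chain[i].write_size -= synced;
            synced = 0;
        }
    }
}

int
main(void)
{
    struct demo_chunk chain[3] = {{4096, 0}, {4096, 0}, {4096, 0}};
    int i;

    demo_account_short_write(chain, 3, 6000); /* server wrote only 6000 bytes */

    for (i = 0; i < 3; i++)
        printf("chunk %d: remaining=%d fulfilled=%d\n", i,
               chain[i].write_size, chain[i].fulfilled);
    return 0;
}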
+ */ + __wb_request_ref(head); + + next = list_entry(head->winds.next, wb_request_t, winds); + + accounted_size = __wb_fulfill_short_write(head, size, &fulfilled); - list_for_each_entry_safe (req, tmp, lies, unwinds) { - frame = req->stub->frame; + size -= accounted_size; - STACK_UNWIND_STRICT (writev, frame, req->op_ret, req->op_errno, - &buf, &buf, NULL); /* :O */ - req->stub->frame = NULL; + if (size == 0) { + if (fulfilled && (next != head)) + req = next; - list_del_init (&req->unwinds); - wb_request_unref (req); + goto done; } - return; + list_for_each_entry_safe(req, next, &head->winds, winds) + { + accounted_size = __wb_fulfill_short_write(req, size, &fulfilled); + size -= accounted_size; + + if (size == 0) { + if (fulfilled && (next != head)) + req = next; + break; + } + } + done: + __wb_request_unref(head); + } + UNLOCK(&wb_inode->lock); + + wb_add_head_for_retry(req); +out: + return; } +int +wb_fulfill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + wb_request_t *head = NULL; + + head = frame->local; + frame->local = NULL; + + wb_inode = head->wb_inode; + + /* There could be a readdirp session in progress. Since wb_fulfill_cbk + * can potentially remove a request from liability queue, + * wb_readdirp_cbk will miss writes on this inode (as it invalidates + * stats only if liability queue is not empty) and hence mark inode + * for invalidation of stats in readdirp response. Specifically this + * code fixes the following race mentioned in wb_readdirp_cbk: + */ + + /* <removed comment from wb_readdirp_cbk> + * We cannot guarantee integrity of entry->d_stat as there are cached + * writes. The stat is most likely stale as it doesn't account the + * cached writes. However, checking for non-empty liability list here is + * not a fool-proof solution as there can be races like, + * 1. readdirp is successful on posix + * 2. sync of cached write is successful on posix + * 3. write-behind received sync response and removed the request from + * liability queue + * 4. readdirp response is processed at write-behind + * + * In the above scenario, stat for the file is sent back in readdirp + * response but it is stale. 
+ * </comment> */ + wb_set_invalidate(wb_inode); + + if (op_ret == -1) { + wb_fulfill_err(head, op_errno); + } else if (op_ret < head->total_size) { + wb_fulfill_short_write(head, op_ret); + } else { + wb_head_done(head); + } + + wb_process_queue(wb_inode); + + STACK_DESTROY(frame->root); + + return 0; +} -void -__wb_pick_unwinds (wb_inode_t *wb_inode, list_head_t *lies) +#define WB_IOV_LOAD(vec, cnt, req, head) \ + do { \ + memcpy(&vec[cnt], req->stub->args.vector, \ + (req->stub->args.count * sizeof(vec[0]))); \ + cnt += req->stub->args.count; \ + head->total_size += req->write_size; \ + } while (0) + +int +wb_fulfill_head(wb_inode_t *wb_inode, wb_request_t *head) { - wb_request_t *req = NULL; - wb_request_t *tmp = NULL; + struct iovec vector[MAX_VECTOR_COUNT]; + int count = 0; + wb_request_t *req = NULL; + call_frame_t *frame = NULL; - list_for_each_entry_safe (req, tmp, &wb_inode->temptation, lie) { - if (!req->ordering.fulfilled && - wb_inode->window_current > wb_inode->window_conf) - continue; + /* make sure head->total_size is updated before we run into any + * errors + */ - list_del_init (&req->lie); - list_move_tail (&req->unwinds, lies); + WB_IOV_LOAD(vector, count, head, head); - wb_inode->window_current += req->orig_size; + list_for_each_entry(req, &head->winds, winds) + { + WB_IOV_LOAD(vector, count, req, head); - if (!req->ordering.fulfilled) { - /* burden increased */ - list_add_tail (&req->lie, &wb_inode->liability); + if (iobref_merge(head->stub->args.iobref, req->stub->args.iobref)) + goto err; + } - req->ordering.lied = 1; + frame = create_frame(wb_inode->this, wb_inode->this->ctx->pool); + if (!frame) + goto err; - wb_inode->gen++; - } - } + frame->root->lk_owner = head->lk_owner; + frame->root->pid = head->client_pid; + frame->local = head; - return; + LOCK(&wb_inode->lock); + { + wb_inode->transit += head->total_size; + } + UNLOCK(&wb_inode->lock); + + STACK_WIND(frame, wb_fulfill_cbk, FIRST_CHILD(frame->this), + FIRST_CHILD(frame->this)->fops->writev, head->fd, vector, count, + head->stub->args.offset, head->stub->args.flags, + head->stub->args.iobref, NULL); + + return 0; +err: + /* frame creation failure */ + wb_fulfill_err(head, ENOMEM); + + return ENOMEM; } +#define NEXT_HEAD(head, req) \ + do { \ + if (head) \ + ret |= wb_fulfill_head(wb_inode, head); \ + head = req; \ + expected_offset = req->stub->args.offset + req->write_size; \ + curr_aggregate = 0; \ + vector_count = 0; \ + } while (0) int -__wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req) -{ - char *ptr = NULL; - struct iobuf *iobuf = NULL; - struct iobref *iobref = NULL; - int ret = -1; - ssize_t required_size = 0; - size_t holder_len = 0; - size_t req_len = 0; - - if (!holder->iobref) { - holder_len = iov_length (holder->stub->args.vector, - holder->stub->args.count); - req_len = iov_length (req->stub->args.vector, - req->stub->args.count); - - required_size = max ((THIS->ctx->page_size), - (holder_len + req_len)); - iobuf = iobuf_get2 (req->wb_inode->this->ctx->iobuf_pool, - required_size); - if (iobuf == NULL) { - goto out; - } +wb_fulfill(wb_inode_t *wb_inode, list_head_t *liabilities) +{ + wb_request_t *req = NULL; + wb_request_t *head = NULL; + wb_request_t *tmp = NULL; + wb_conf_t *conf = NULL; + off_t expected_offset = 0; + size_t curr_aggregate = 0; + size_t vector_count = 0; + int ret = 0; + + conf = wb_inode->this->private; + + list_for_each_entry_safe(req, tmp, liabilities, winds) + { + list_del_init(&req->winds); + + if (!head) { + NEXT_HEAD(head, req); + continue; + } - 
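The NEXT_HEAD()/wb_fulfill() hunk around this point batches adjacent liabilities into one writev: a request joins the current head only if it targets the same fd and lock owner, continues at the expected offset, and neither the configured aggregate size nor MAX_VECTOR_COUNT would be exceeded. A standalone sketch of that test, with hypothetical demo_* names and made-up limits standing in for conf->aggregate_size and MAX_VECTOR_COUNT:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/types.h>

/* Illustrative limits only. */
#define DEMO_AGGREGATE_MAX (128 * 1024)
#define DEMO_VECTOR_MAX 64

struct demo_write {
    int fd;       /* stands in for the fd_t pointer comparison */
    int lk_owner; /* stands in for is_same_lkowner() */
    off_t offset;
    size_t size;
    size_t iov_count;
};

/* Can 'next' be chained into the current head instead of starting a new
 * aggregated writev? */
static bool
demo_can_aggregate(const struct demo_write *head, off_t expected_offset,
                   size_t curr_aggregate, size_t vector_count,
                   const struct demo_write *next)
{
    if (next->fd != head->fd)
        return false;
    if (next->lk_owner != head->lk_owner)
        return false;
    if (next->offset != expected_offset) /* writes must stay contiguous */
        return false;
    if (curr_aggregate + next->size > DEMO_AGGREGATE_MAX)
        return false;
    if (vector_count + next->iov_count > DEMO_VECTOR_MAX)
        return false;
    return true;
}

int
main(void)
{
    struct demo_write head = {3, 7, 0, 4096, 1};
    struct demo_write next = {3, 7, 4096, 4096, 1};

    printf("chain into head: %s\n",
           demo_can_aggregate(&head, 4096, 4096, 1, &next) ? "yes" : "no");
    return 0;
}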
iobref = iobref_new (); - if (iobref == NULL) { - iobuf_unref (iobuf); - goto out; - } + if (req->fd != head->fd) { + NEXT_HEAD(head, req); + continue; + } - ret = iobref_add (iobref, iobuf); - if (ret != 0) { - iobuf_unref (iobuf); - iobref_unref (iobref); - gf_log (req->wb_inode->this->name, GF_LOG_WARNING, - "cannot add iobuf (%p) into iobref (%p)", - iobuf, iobref); - goto out; - } + if (!is_same_lkowner(&req->lk_owner, &head->lk_owner)) { + NEXT_HEAD(head, req); + continue; + } + + if (expected_offset != req->stub->args.offset) { + NEXT_HEAD(head, req); + continue; + } + + if ((curr_aggregate + req->write_size) > conf->aggregate_size) { + NEXT_HEAD(head, req); + continue; + } + + if (vector_count + req->stub->args.count > MAX_VECTOR_COUNT) { + NEXT_HEAD(head, req); + continue; + } + + list_add_tail(&req->winds, &head->winds); + curr_aggregate += req->write_size; + vector_count += req->stub->args.count; + } + + if (head) + ret |= wb_fulfill_head(wb_inode, head); + + return ret; +} + +void +wb_do_unwinds(wb_inode_t *wb_inode, list_head_t *lies) +{ + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + call_frame_t *frame = NULL; + struct iatt buf = { + 0, + }; + + list_for_each_entry_safe(req, tmp, lies, unwinds) + { + frame = req->stub->frame; + + STACK_UNWIND_STRICT(writev, frame, req->op_ret, req->op_errno, &buf, + &buf, NULL); /* :O */ + req->stub->frame = NULL; + + list_del_init(&req->unwinds); + wb_request_unref(req); + } + + return; +} - iov_unload (iobuf->ptr, holder->stub->args.vector, - holder->stub->args.count); - holder->stub->args.vector[0].iov_base = iobuf->ptr; - holder->stub->args.count = 1; +void +__wb_pick_unwinds(wb_inode_t *wb_inode, list_head_t *lies) +{ + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + char gfid[64] = { + 0, + }; + + list_for_each_entry_safe(req, tmp, &wb_inode->temptation, lie) + { + if (!req->ordering.fulfilled && + wb_inode->window_current > wb_inode->window_conf) + continue; + + list_del_init(&req->lie); + list_move_tail(&req->unwinds, lies); + + wb_inode->window_current += req->orig_size; + + wb_inode->gen++; + + if (!req->ordering.fulfilled) { + /* burden increased */ + list_add_tail(&req->lie, &wb_inode->liability); + + req->ordering.lied = 1; + + uuid_utoa_r(req->gfid, gfid); + gf_msg_debug(wb_inode->this->name, 0, + "(unique=%" PRIu64 + ", fop=%s, gfid=%s, " + "gen=%" PRIu64 + "): added req to liability " + "queue. 
inode-generation-number=%" PRIu64, + req->stub->frame->root->unique, gf_fop_list[req->fop], + gfid, req->gen, wb_inode->gen); + } + } - iobref_unref (holder->stub->args.iobref); - holder->stub->args.iobref = iobref; + return; +} - iobuf_unref (iobuf); +int +__wb_collapse_small_writes(wb_conf_t *conf, wb_request_t *holder, + wb_request_t *req) +{ + char *ptr = NULL; + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + int ret = -1; + ssize_t required_size = 0; + size_t holder_len = 0; + size_t req_len = 0; + + if (!holder->iobref) { + holder_len = iov_length(holder->stub->args.vector, + holder->stub->args.count); + req_len = iov_length(req->stub->args.vector, req->stub->args.count); + + required_size = max((conf->page_size), (holder_len + req_len)); + iobuf = iobuf_get2(req->wb_inode->this->ctx->iobuf_pool, required_size); + if (iobuf == NULL) { + goto out; + } - holder->iobref = iobref_ref (iobref); + iobref = iobref_new(); + if (iobref == NULL) { + iobuf_unref(iobuf); + goto out; } - ptr = holder->stub->args.vector[0].iov_base + holder->write_size; + ret = iobref_add(iobref, iobuf); + if (ret != 0) { + gf_msg(req->wb_inode->this->name, GF_LOG_WARNING, -ret, + WRITE_BEHIND_MSG_INVALID_ARGUMENT, + "cannot add iobuf (%p) into iobref (%p)", iobuf, iobref); + iobuf_unref(iobuf); + iobref_unref(iobref); + goto out; + } - iov_unload (ptr, req->stub->args.vector, - req->stub->args.count); + iov_unload(iobuf->ptr, holder->stub->args.vector, + holder->stub->args.count); + holder->stub->args.vector[0].iov_base = iobuf->ptr; + holder->stub->args.count = 1; - holder->stub->args.vector[0].iov_len += req->write_size; - holder->write_size += req->write_size; - holder->ordering.size += req->write_size; + iobref_unref(holder->stub->args.iobref); + holder->stub->args.iobref = iobref; - ret = 0; + iobuf_unref(iobuf); + + holder->iobref = iobref_ref(iobref); + } + + ptr = holder->stub->args.vector[0].iov_base + holder->write_size; + + iov_unload(ptr, req->stub->args.vector, req->stub->args.count); + + holder->stub->args.vector[0].iov_len += req->write_size; + holder->write_size += req->write_size; + holder->ordering.size += req->write_size; + + ret = 0; out: - return ret; + return ret; } - void -__wb_preprocess_winds (wb_inode_t *wb_inode) -{ - off_t offset_expected = 0; - ssize_t space_left = 0; - wb_request_t *req = NULL; - wb_request_t *tmp = NULL; - wb_request_t *holder = NULL; - wb_conf_t *conf = NULL; - int ret = 0; - ssize_t page_size = 0; - - /* With asynchronous IO from a VM guest (as a file), there - can be two sequential writes happening in two regions - of the file. But individual (broken down) IO requests - can arrive interleaved. 
- - TODO: cycle for each such sequence sifting - through the interleaved ops - */ - - page_size = wb_inode->this->ctx->page_size; - conf = wb_inode->this->private; - - list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) { - if (!req->ordering.tempted) { - if (holder) { - if (wb_requests_conflict (holder, req)) - /* do not hold on write if a - dependent write is in queue */ - holder->ordering.go = 1; - } - /* collapse only non-sync writes */ - continue; - } else if (!holder) { - /* holder is always a non-sync write */ - holder = req; - continue; - } - - offset_expected = holder->stub->args.offset - + holder->write_size; - - if (req->stub->args.offset != offset_expected) { - holder->ordering.go = 1; - holder = req; - continue; - } - - if (!is_same_lkowner (&req->lk_owner, &holder->lk_owner)) { - holder->ordering.go = 1; - holder = req; - continue; - } - - if (req->fd != holder->fd) { - holder->ordering.go = 1; - holder = req; - continue; - } +__wb_preprocess_winds(wb_inode_t *wb_inode) +{ + off_t offset_expected = 0; + ssize_t space_left = 0; + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + wb_request_t *holder = NULL; + wb_conf_t *conf = NULL; + int ret = 0; + ssize_t page_size = 0; + char gfid[64] = { + 0, + }; + + /* With asynchronous IO from a VM guest (as a file), there + can be two sequential writes happening in two regions + of the file. But individual (broken down) IO requests + can arrive interleaved. + + TODO: cycle for each such sequence sifting + through the interleaved ops + */ + + conf = wb_inode->this->private; + page_size = conf->page_size; + + list_for_each_entry_safe(req, tmp, &wb_inode->todo, todo) + { + if (wb_inode->dontsync && req->ordering.lied) { + /* sync has failed. Don't pick lies _again_ for winding + * as winding these lies again will trigger an infinite + * recursion of wb_process_queue being called from a + * failed fulfill. However, pick non-lied requests for + * winding so that application won't block indefinitely + * waiting for write result. 
+ */ + + uuid_utoa_r(req->gfid, gfid); + gf_msg_debug(wb_inode->this->name, 0, + "(unique=%" PRIu64 + ", fop=%s, gfid=%s, " + "gen=%" PRIu64 + "): not setting ordering.go" + "as dontsync is set", + req->unique, gf_fop_list[req->fop], gfid, req->gen); + + continue; + } - space_left = page_size - holder->write_size; + if (!req->ordering.tempted) { + if (holder) { + if (wb_requests_conflict(holder, req)) + /* do not hold on write if a + dependent write is in queue */ + holder->ordering.go = 1; + } + /* collapse only non-sync writes */ + continue; + } else if (!holder) { + /* holder is always a non-sync write */ + holder = req; + continue; + } - if (space_left < req->write_size) { - holder->ordering.go = 1; - holder = req; - continue; - } + offset_expected = holder->stub->args.offset + holder->write_size; - ret = __wb_collapse_small_writes (holder, req); - if (ret) - continue; + if (req->stub->args.offset != offset_expected) { + holder->ordering.go = 1; + holder = req; + continue; + } - /* collapsed request is as good as wound - (from its p.o.v) - */ - list_del_init (&req->todo); - __wb_fulfill_request (req); + if (!is_same_lkowner(&req->lk_owner, &holder->lk_owner)) { + holder->ordering.go = 1; + holder = req; + continue; + } - /* Only the last @holder in queue which + if (req->fd != holder->fd) { + holder->ordering.go = 1; + holder = req; + continue; + } - - does not have any non-buffered-writes following it - - has not yet filled its capacity + space_left = page_size - holder->write_size; - does not get its 'go' set, in anticipation of the arrival - of consecutive smaller writes. - */ + if (space_left < req->write_size) { + holder->ordering.go = 1; + holder = req; + continue; } - /* but if trickling writes are enabled, then do not hold back - writes if there are no outstanding requests - */ + ret = __wb_collapse_small_writes(conf, holder, req); + if (ret) + continue; - if (conf->trickling_writes && !wb_inode->transit && holder) - holder->ordering.go = 1; + /* collapsed request is as good as wound + (from its p.o.v) + */ + list_del_init(&req->todo); + __wb_fulfill_request(req); - return; -} + /* Only the last @holder in queue which + - does not have any non-buffered-writes following it + - has not yet filled its capacity -void -__wb_pick_winds (wb_inode_t *wb_inode, list_head_t *tasks, - list_head_t *liabilities) + does not get its 'go' set, in anticipation of the arrival + of consecutive smaller writes. + */ + } + + /* but if trickling writes are enabled, then do not hold back + writes if there are no outstanding requests + */ + + if (conf->trickling_writes && !wb_inode->transit && holder) + holder->ordering.go = 1; + + if (wb_inode->dontsync > 0) + wb_inode->dontsync--; + + return; +} + +int +__wb_handle_failed_conflict(wb_request_t *req, wb_request_t *conflict, + list_head_t *tasks) { - wb_request_t *req = NULL; - wb_request_t *tmp = NULL; + wb_conf_t *conf = NULL; + char gfid[64] = { + 0, + }; + + conf = req->wb_inode->this->private; + + uuid_utoa_r(req->gfid, gfid); + + if ((req->stub->fop != GF_FOP_FLUSH) && + ((req->stub->fop != GF_FOP_FSYNC) || conf->resync_after_fsync)) { + if (!req->ordering.lied && list_empty(&conflict->wip)) { + /* If request itself is in liability queue, + * 1. We cannot unwind as the response has already been + * sent. + * 2. We cannot wind till conflict clears up. + * 3. So, skip the request for now. + * 4. Otherwise, resume (unwind) it with error. 
+ */ + req->op_ret = -1; + req->op_errno = conflict->op_errno; + if ((req->stub->fop == GF_FOP_TRUNCATE) || + (req->stub->fop == GF_FOP_FTRUNCATE)) { + req->stub->frame->local = NULL; + } + + list_del_init(&req->todo); + list_add_tail(&req->winds, tasks); + + gf_msg_debug(req->wb_inode->this->name, 0, + "(unique=%" PRIu64 + ", fop=%s, gfid=%s, " + "gen=%" PRIu64 + "): A conflicting write " + "request in liability queue has failed " + "to sync (error = \"%s\"), " + "unwinding this request as a failure", + req->unique, gf_fop_list[req->fop], gfid, req->gen, + strerror(req->op_errno)); + + if (req->ordering.tempted) { + /* make sure that it won't be unwound in + * wb_do_unwinds too. Otherwise there'll be + * a double wind. + */ + list_del_init(&req->lie); + + gf_msg_debug(req->wb_inode->this->name, 0, + "(unique=%" PRIu64 + ", fop=%s, " + "gfid=%s, gen=%" PRIu64 + "): " + "removed from liability queue", + req->unique, gf_fop_list[req->fop], gfid, + req->gen); + + __wb_fulfill_request(req); + } + } + } else { + gf_msg_debug(req->wb_inode->this->name, 0, + "(unique=%" PRIu64 + ", fop=%s, gfid=%s, " + "gen=%" PRIu64 + "): A conflicting write request " + "in liability queue has failed to sync " + "(error = \"%s\"). This is an " + "FSYNC/FLUSH and we need to maintain ordering " + "guarantees with other writes in TODO queue. " + "Hence doing nothing now", + req->unique, gf_fop_list[req->fop], gfid, req->gen, + strerror(conflict->op_errno)); + + /* flush and fsync (without conf->resync_after_fsync) act as + barriers. We cannot unwind them out of + order, when there are earlier generation writes just because + there is a conflicting liability with an error. So, wait for + our turn till there are no conflicting liabilities. + + This situation can arise when there liabilities spread across + multiple generations. For eg., consider two writes with + following characterstics: + + 1. they belong to different generations gen1, gen2 and + (gen1 > gen2). + 2. they overlap. + 3. both are liabilities. + 4. gen1 write was attempted to sync, but the attempt failed. + 5. there was no attempt to sync gen2 write yet. + 6. A flush (as part of close) is issued and gets a gen no + gen3. + + In the above scenario, if flush is unwound without waiting + for gen1 and gen2 writes either to be successfully synced or + purged, we end up with these two writes in wb_inode->todo + list forever as there will be no attempt to process the queue + as flush is the last operation. + */ + } + + return 0; +} - list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) { - if (wb_liability_has_conflict (wb_inode, req)) - continue; +int +__wb_pick_winds(wb_inode_t *wb_inode, list_head_t *tasks, + list_head_t *liabilities) +{ + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + wb_request_t *conflict = NULL; + char req_gfid[64] = + { + 0, + }, + conflict_gfid[64] = { + 0, + }; + + list_for_each_entry_safe(req, tmp, &wb_inode->todo, todo) + { + uuid_utoa_r(req->gfid, req_gfid); + + conflict = wb_liability_has_conflict(wb_inode, req); + if (conflict) { + uuid_utoa_r(conflict->gfid, conflict_gfid); + + gf_msg_debug(wb_inode->this->name, 0, + "Not winding request due to a " + "conflicting write in liability queue. " + "REQ: unique=%" PRIu64 + ", fop=%s, " + "gen=%" PRIu64 + ", gfid=%s. 
" + "CONFLICT: unique=%" PRIu64 + ", fop=%s, " + "gen=%" PRIu64 + ", gfid=%s, " + "conflicts-sync-failed?=%s, " + "conflicts-error=%s", + req->unique, gf_fop_list[req->fop], req->gen, req_gfid, + conflict->unique, gf_fop_list[conflict->fop], + conflict->gen, conflict_gfid, + (conflict->op_ret == 1) ? "yes" : "no", + strerror(conflict->op_errno)); + + if (conflict->op_ret == -1) { + /* There is a conflicting liability which failed + * to sync in previous attempts, resume the req + * and fail, unless its an fsync/flush. + */ + + __wb_handle_failed_conflict(req, conflict, tasks); + } else { + /* There is a conflicting liability which was + * not attempted to sync even once. Wait till + * at least one attempt to sync is made. + */ + } + + continue; + } - if (req->ordering.tempted && !req->ordering.go) - /* wait some more */ - continue; + if (req->ordering.tempted && !req->ordering.go) { + /* wait some more */ + gf_msg_debug(wb_inode->this->name, 0, + "(unique=%" PRIu64 ", fop=%s, gen=%" PRIu64 + ", gfid=%s): ordering.go is not set, " + "hence not winding", + req->unique, gf_fop_list[req->fop], req->gen, + req_gfid); + continue; + } - if (req->stub->fop == GF_FOP_WRITE) { - if (wb_wip_has_conflict (wb_inode, req)) - continue; + if (req->stub->fop == GF_FOP_WRITE) { + conflict = wb_wip_has_conflict(wb_inode, req); + + if (conflict) { + uuid_utoa_r(conflict->gfid, conflict_gfid); + + gf_msg_debug(wb_inode->this->name, 0, + "Not winding write request as " + "a conflicting write is being " + "synced to backend. " + "REQ: unique=%" PRIu64 + " fop=%s," + " gen=%" PRIu64 + ", gfid=%s. " + "CONFLICT: unique=%" PRIu64 + " " + "fop=%s, gen=%" PRIu64 + ", " + "gfid=%s", + req->unique, gf_fop_list[req->fop], req->gen, + req_gfid, conflict->unique, + gf_fop_list[conflict->fop], conflict->gen, + conflict_gfid); + continue; + } + + list_add_tail(&req->wip, &wb_inode->wip); + req->wind_count++; + + if (!req->ordering.tempted) + /* unrefed in wb_writev_cbk */ + req->stub->frame->local = __wb_request_ref(req); + } - list_add_tail (&req->wip, &wb_inode->wip); + gf_msg_debug(wb_inode->this->name, 0, + "(unique=%" PRIu64 + ", fop=%s, gfid=%s, " + "gen=%" PRIu64 + "): picking the request for " + "winding", + req->unique, gf_fop_list[req->fop], req_gfid, req->gen); - if (!req->ordering.tempted) - /* unrefed in wb_writev_cbk */ - req->stub->frame->local = - __wb_request_ref (req); - } + list_del_init(&req->todo); - list_del_init (&req->todo); + if (req->ordering.tempted) { + list_add_tail(&req->winds, liabilities); + } else { + list_add_tail(&req->winds, tasks); + } + } - if (req->ordering.tempted) - list_add_tail (&req->winds, liabilities); - else - list_add_tail (&req->winds, tasks); - } + return 0; } - void -wb_do_winds (wb_inode_t *wb_inode, list_head_t *tasks) +wb_do_winds(wb_inode_t *wb_inode, list_head_t *tasks) { - wb_request_t *req = NULL; - wb_request_t *tmp = NULL; + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; - list_for_each_entry_safe (req, tmp, tasks, winds) { - list_del_init (&req->winds); + list_for_each_entry_safe(req, tmp, tasks, winds) + { + list_del_init(&req->winds); - call_resume (req->stub); + if (req->op_ret == -1) { + call_unwind_error_keep_stub(req->stub, req->op_ret, req->op_errno); + } else { + call_resume_keep_stub(req->stub); + } - wb_request_unref (req); - } + wb_request_unref(req); + } } - void -wb_process_queue (wb_inode_t *wb_inode) +wb_process_queue(wb_inode_t *wb_inode) { - list_head_t tasks = {0, }; - list_head_t lies = {0, }; - list_head_t liabilities = {0, }; + 
list_head_t tasks; + list_head_t lies; + list_head_t liabilities; + int wind_failure = 0; - INIT_LIST_HEAD (&tasks); - INIT_LIST_HEAD (&lies); - INIT_LIST_HEAD (&liabilities); + INIT_LIST_HEAD(&tasks); + INIT_LIST_HEAD(&lies); + INIT_LIST_HEAD(&liabilities); - LOCK (&wb_inode->lock); - { - __wb_preprocess_winds (wb_inode); + do { + gf_log_callingfn(wb_inode->this->name, GF_LOG_DEBUG, + "processing queues"); - __wb_pick_winds (wb_inode, &tasks, &liabilities); + LOCK(&wb_inode->lock); + { + __wb_preprocess_winds(wb_inode); - __wb_pick_unwinds (wb_inode, &lies); + __wb_pick_winds(wb_inode, &tasks, &liabilities); + __wb_pick_unwinds(wb_inode, &lies); } - UNLOCK (&wb_inode->lock); + UNLOCK(&wb_inode->lock); - wb_do_unwinds (wb_inode, &lies); + if (!list_empty(&lies)) + wb_do_unwinds(wb_inode, &lies); - wb_do_winds (wb_inode, &tasks); + if (!list_empty(&tasks)) + wb_do_winds(wb_inode, &tasks); - wb_fulfill (wb_inode, &liabilities); + /* If there is an error in wb_fulfill before winding write + * requests, we would miss invocation of wb_process_queue + * from wb_fulfill_cbk. So, retry processing again. + */ + if (!list_empty(&liabilities)) + wind_failure = wb_fulfill(wb_inode, &liabilities); + } while (wind_failure); - return; + return; } +void +wb_set_inode_size(wb_inode_t *wb_inode, struct iatt *postbuf) +{ + GF_ASSERT(wb_inode); + GF_ASSERT(postbuf); + + LOCK(&wb_inode->lock); + { + wb_inode->size = postbuf->ia_size; + } + UNLOCK(&wb_inode->lock); +} int -wb_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +wb_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - wb_request_t *req = NULL; - wb_inode_t *wb_inode; + wb_request_t *req = NULL; + wb_inode_t *wb_inode; - req = frame->local; - frame->local = NULL; - wb_inode = req->wb_inode; + req = frame->local; + frame->local = NULL; + wb_inode = req->wb_inode; - wb_request_unref (req); + LOCK(&req->wb_inode->lock); + { + list_del_init(&req->wip); + } + UNLOCK(&req->wb_inode->lock); - /* requests could be pending while this was in progress */ - wb_process_queue(wb_inode); + wb_request_unref(req); - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, - xdata); - return 0; + /* requests could be pending while this was in progress */ + wb_process_queue(wb_inode); + + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } +int +wb_writev_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + STACK_WIND(frame, wb_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; +} int -wb_writev_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - uint32_t flags, struct iobref *iobref, dict_t *xdata) +wb_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - STACK_WIND (frame, wb_writev_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev, - fd, vector, count, offset, flags, iobref, xdata); - return 0; + wb_inode_t *wb_inode = NULL; + wb_conf_t *conf = NULL; + gf_boolean_t wb_disabled = 0; + call_stub_t *stub = NULL; + int ret = -1; + 
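In the wb_writev() hunk here, write-behind is bypassed (the write is queued as a plain, non-tempted stub) when the fd or the write itself carries O_SYNC/O_DSYNC, or O_DIRECT while strict O_DIRECT handling is enabled; otherwise the write is enqueued as a tempted request and may be acknowledged before it reaches the server. A sketch of that flag check, assuming the hypothetical demo_write_behind_disabled() helper (not part of the xlator), with 'strict_o_direct' standing in for conf->strict_O_DIRECT:

#define _GNU_SOURCE /* for O_DIRECT on glibc */
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>

static bool
demo_write_behind_disabled(int fd_flags, int write_flags, bool strict_o_direct)
{
    int o_direct = strict_o_direct ? O_DIRECT : 0;
    int sync_mask = O_SYNC | O_DSYNC | o_direct;

    return (fd_flags & sync_mask) || (write_flags & sync_mask);
}

int
main(void)
{
    /* An O_DIRECT descriptor bypasses the cache only when the volume is
     * configured with strict O_DIRECT handling. */
    printf("O_DIRECT, strict:  %s\n",
           demo_write_behind_disabled(O_DIRECT, 0, true) ? "bypass" : "cache");
    printf("O_DIRECT, relaxed: %s\n",
           demo_write_behind_disabled(O_DIRECT, 0, false) ? "bypass" : "cache");
    printf("O_SYNC:            %s\n",
           demo_write_behind_disabled(O_SYNC, 0, false) ? "bypass" : "cache");
    return 0;
}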
int32_t op_errno = EINVAL; + int o_direct = O_DIRECT; + + conf = this->private; + + wb_inode = wb_inode_create(this, fd->inode); + if (!wb_inode) { + op_errno = ENOMEM; + goto unwind; + } + + if (!conf->strict_O_DIRECT) + o_direct = 0; + + if (fd->flags & (O_SYNC | O_DSYNC | o_direct)) + wb_disabled = 1; + + if (flags & (O_SYNC | O_DSYNC | o_direct)) + wb_disabled = 1; + + if (wb_disabled) + stub = fop_writev_stub(frame, wb_writev_helper, fd, vector, count, + offset, flags, iobref, xdata); + else + stub = fop_writev_stub(frame, NULL, fd, vector, count, offset, flags, + iobref, xdata); + if (!stub) { + op_errno = ENOMEM; + goto unwind; + } + + if (wb_disabled) + ret = wb_enqueue(wb_inode, stub); + else + ret = wb_enqueue_tempted(wb_inode, stub); + + if (!ret) { + op_errno = ENOMEM; + goto unwind; + } + + wb_process_queue(wb_inode); + + return 0; + +unwind: + STACK_UNWIND_STRICT(writev, frame, -1, op_errno, NULL, NULL, NULL); + + if (stub) + call_stub_destroy(stub); + + return 0; } +int +wb_readv_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + return 0; +} int -wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, - int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, - dict_t *xdata) -{ - wb_inode_t *wb_inode = NULL; - wb_conf_t *conf = NULL; - gf_boolean_t wb_disabled = 0; - call_stub_t *stub = NULL; - int ret = -1; - int32_t op_errno = EINVAL; - int o_direct = O_DIRECT; - - conf = this->private; - - if (wb_fd_err (fd, this, &op_errno)) { - goto unwind; - } +wb_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - wb_inode = wb_inode_create (this, fd->inode); - if (!wb_inode) { - op_errno = ENOMEM; - goto unwind; - } - - if (!conf->strict_O_DIRECT) - o_direct = 0; - - if (fd->flags & (O_SYNC|O_DSYNC|o_direct)) - wb_disabled = 1; - - if (flags & (O_SYNC|O_DSYNC|o_direct)) - wb_disabled = 1; - - if (wb_disabled) - stub = fop_writev_stub (frame, wb_writev_helper, fd, vector, - count, offset, flags, iobref, xdata); - else - stub = fop_writev_stub (frame, NULL, fd, vector, count, offset, - flags, iobref, xdata); - if (!stub) { - op_errno = ENOMEM; - goto unwind; - } + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; - if (wb_disabled) - ret = wb_enqueue (wb_inode, stub); - else - ret = wb_enqueue_tempted (wb_inode, stub); + stub = fop_readv_stub(frame, wb_readv_helper, fd, size, offset, flags, + xdata); + if (!stub) + goto unwind; - if (!ret) { - op_errno = ENOMEM; - goto unwind; - } + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - wb_process_queue (wb_inode); + wb_process_queue(wb_inode); - return 0; + return 0; unwind: - STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL, NULL); + STACK_UNWIND_STRICT(readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL, NULL); - if (stub) - call_stub_destroy (stub); + if (stub) + call_stub_destroy(stub); + return 0; - return 0; +noqueue: + STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + return 0; } - int -wb_readv_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) +wb_flush_bg_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + 
int32_t op_ret, int32_t op_errno, dict_t *xdata) { - STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, - xdata); - return 0; + STACK_DESTROY(frame->root); + return 0; } +int +wb_flush_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + wb_conf_t *conf = NULL; + wb_inode_t *wb_inode = NULL; + call_frame_t *bg_frame = NULL; + int32_t op_errno = 0; + int op_ret = 0; + + conf = this->private; + + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) { + op_ret = -1; + op_errno = EINVAL; + goto unwind; + } + + if (conf->flush_behind) + goto flushbehind; + + STACK_WIND(frame, default_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; + +flushbehind: + bg_frame = copy_frame(frame); + if (!bg_frame) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + STACK_WIND(bg_frame, wb_flush_bg_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + /* fall through */ +unwind: + STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, NULL); + + return 0; +} int -wb_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) +wb_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - wb_inode_t *wb_inode = NULL; - call_stub_t *stub = NULL; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - wb_inode = wb_inode_ctx_get (this, fd->inode); - if (!wb_inode) - goto noqueue; + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; - stub = fop_readv_stub (frame, wb_readv_helper, fd, size, - offset, flags, xdata); - if (!stub) - goto unwind; + stub = fop_flush_stub(frame, wb_flush_helper, fd, xdata); + if (!stub) + goto unwind; - if (!wb_enqueue (wb_inode, stub)) - goto unwind; + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - wb_process_queue (wb_inode); + wb_process_queue(wb_inode); - return 0; + return 0; unwind: - STACK_UNWIND_STRICT (readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL, - NULL); - return 0; + STACK_UNWIND_STRICT(flush, frame, -1, ENOMEM, NULL); + + if (stub) + call_stub_destroy(stub); + + return 0; noqueue: - STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, - xdata); - return 0; + STACK_WIND(frame, default_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; } - int -wb_flush_bg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +wb_fsync_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - STACK_DESTROY (frame->root); - return 0; + STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; } - int -wb_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +wb_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - wb_conf_t *conf = NULL; - wb_inode_t *wb_inode = NULL; - call_frame_t *bg_frame = NULL; - int32_t op_errno = 0; - int op_ret = 0; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; + int32_t op_errno = EINVAL; - conf = this->private; + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; - wb_inode = wb_inode_ctx_get (this, fd->inode); - if (!wb_inode) { - op_ret = -1; - op_errno = EINVAL; - goto unwind; - } + stub = fop_fsync_stub(frame, wb_fsync_helper, fd, datasync, xdata); + if (!stub) 
+ goto unwind; - if (wb_fd_err (fd, this, &op_errno)) { - op_ret = -1; - goto unwind; - } + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - if (conf->flush_behind) - goto flushbehind; + wb_process_queue(wb_inode); - STACK_WIND (frame, default_flush_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, fd, xdata); - return 0; + return 0; -flushbehind: - bg_frame = copy_frame (frame); - if (!bg_frame) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - - STACK_WIND (bg_frame, wb_flush_bg_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, fd, xdata); - /* fall through */ unwind: - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, NULL); + STACK_UNWIND_STRICT(fsync, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + if (stub) + call_stub_destroy(stub); + return 0; + +noqueue: + STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; } +int +wb_stat_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; +} int -wb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +wb_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - wb_inode_t *wb_inode = NULL; - call_stub_t *stub = NULL; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - wb_inode = wb_inode_ctx_get (this, fd->inode); - if (!wb_inode) - goto noqueue; + wb_inode = wb_inode_ctx_get(this, loc->inode); + if (!wb_inode) + goto noqueue; - stub = fop_flush_stub (frame, wb_flush_helper, fd, xdata); - if (!stub) - goto unwind; + stub = fop_stat_stub(frame, wb_stat_helper, loc, xdata); + if (!stub) + goto unwind; - if (!wb_enqueue (wb_inode, stub)) - goto unwind; + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - wb_process_queue (wb_inode); + wb_process_queue(wb_inode); - return 0; + return 0; unwind: - STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM, NULL); + STACK_UNWIND_STRICT(stat, frame, -1, ENOMEM, NULL, NULL); - return 0; + if (stub) + call_stub_destroy(stub); + return 0; noqueue: - STACK_WIND (frame, default_flush_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, fd, xdata); - return 0; + STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; } - - int -wb_fsync_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync, dict_t *xdata) +wb_fstat_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); - return 0; + STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; } - int -wb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, - dict_t *xdata) +wb_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - wb_inode_t *wb_inode = NULL; - call_stub_t *stub = NULL; - int32_t op_errno = EINVAL; - - if (wb_fd_err (fd, this, &op_errno)) - goto unwind; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - wb_inode = wb_inode_ctx_get (this, fd->inode); - if (!wb_inode) - goto noqueue; + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; - stub = fop_fsync_stub (frame, wb_fsync_helper, fd, datasync, xdata); - if (!stub) - goto unwind; + stub = fop_fstat_stub(frame, wb_fstat_helper, fd, xdata); + if (!stub) + goto unwind; - if (!wb_enqueue 
(wb_inode, stub)) - goto unwind; + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - wb_process_queue (wb_inode); + wb_process_queue(wb_inode); - return 0; + return 0; unwind: - STACK_UNWIND_STRICT (fsync, frame, -1, op_errno, NULL, NULL, NULL); + STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, NULL, NULL); - return 0; + if (stub) + call_stub_destroy(stub); + return 0; noqueue: - STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); - return 0; + STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; } +int32_t +wb_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + GF_ASSERT(frame->local); + + if (op_ret == 0) + wb_set_inode_size(frame->local, postbuf); + + frame->local = NULL; + + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} int -wb_stat_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +wb_truncate_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) { - STACK_WIND (frame, default_stat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, loc, xdata); - return 0; + STACK_WIND(frame, wb_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; } - int -wb_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +wb_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - wb_inode_t *wb_inode = NULL; - call_stub_t *stub = NULL; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; + wb_inode = wb_inode_create(this, loc->inode); + if (!wb_inode) + goto unwind; - wb_inode = wb_inode_ctx_get (this, loc->inode); - if (!wb_inode) - goto noqueue; + frame->local = wb_inode; - stub = fop_stat_stub (frame, wb_stat_helper, loc, xdata); - if (!stub) - goto unwind; + stub = fop_truncate_stub(frame, wb_truncate_helper, loc, offset, xdata); + if (!stub) + goto unwind; - if (!wb_enqueue (wb_inode, stub)) - goto unwind; + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - wb_process_queue (wb_inode); + wb_process_queue(wb_inode); - return 0; + return 0; unwind: - STACK_UNWIND_STRICT (stat, frame, -1, ENOMEM, NULL, NULL); + STACK_UNWIND_STRICT(truncate, frame, -1, ENOMEM, NULL, NULL, NULL); - if (stub) - call_stub_destroy (stub); - return 0; + if (stub) + call_stub_destroy(stub); -noqueue: - STACK_WIND (frame, default_stat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, loc, xdata); - return 0; + return 0; } +int32_t +wb_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + GF_ASSERT(frame->local); + + if (op_ret == 0) + wb_set_inode_size(frame->local, postbuf); + + frame->local = NULL; + + STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} int -wb_fstat_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +wb_ftruncate_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - STACK_WIND (frame, default_fstat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fstat, fd, xdata); - return 0; + STACK_WIND(frame, wb_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; } - int -wb_fstat (call_frame_t *frame, xlator_t 
*this, fd_t *fd, dict_t *xdata) +wb_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - wb_inode_t *wb_inode = NULL; - call_stub_t *stub = NULL; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; + int32_t op_errno = 0; + wb_inode = wb_inode_create(this, fd->inode); + if (!wb_inode) { + op_errno = ENOMEM; + goto unwind; + } - wb_inode = wb_inode_ctx_get (this, fd->inode); - if (!wb_inode) - goto noqueue; + frame->local = wb_inode; - stub = fop_fstat_stub (frame, wb_fstat_helper, fd, xdata); - if (!stub) - goto unwind; + stub = fop_ftruncate_stub(frame, wb_ftruncate_helper, fd, offset, xdata); + if (!stub) { + op_errno = ENOMEM; + goto unwind; + } - if (!wb_enqueue (wb_inode, stub)) - goto unwind; + if (!wb_enqueue(wb_inode, stub)) { + op_errno = ENOMEM; + goto unwind; + } - wb_process_queue (wb_inode); + wb_process_queue(wb_inode); - return 0; + return 0; unwind: - STACK_UNWIND_STRICT (fstat, frame, -1, ENOMEM, NULL, NULL); + frame->local = NULL; - if (stub) - call_stub_destroy (stub); - return 0; + STACK_UNWIND_STRICT(ftruncate, frame, -1, op_errno, NULL, NULL, NULL); -noqueue: - STACK_WIND (frame, default_fstat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fstat, fd, xdata); - return 0; + if (stub) + call_stub_destroy(stub); + return 0; } +int +wb_setattr_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; +} int -wb_truncate_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset, dict_t *xdata) +wb_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) { - STACK_WIND (frame, default_truncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); - return 0; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; + + wb_inode = wb_inode_ctx_get(this, loc->inode); + if (!wb_inode) + goto noqueue; + + stub = fop_setattr_stub(frame, wb_setattr_helper, loc, stbuf, valid, xdata); + if (!stub) + goto unwind; + + if (!wb_enqueue(wb_inode, stub)) + goto unwind; + + wb_process_queue(wb_inode); + + return 0; +unwind: + STACK_UNWIND_STRICT(setattr, frame, -1, ENOMEM, NULL, NULL, NULL); + + if (stub) + call_stub_destroy(stub); + return 0; + +noqueue: + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; } +int +wb_fsetattr_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; +} int -wb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, - dict_t *xdata) +wb_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) { - wb_inode_t *wb_inode = NULL; - call_stub_t *stub = NULL; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - wb_inode = wb_inode_create (this, loc->inode); - if (!wb_inode) - goto unwind; + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; - stub = fop_truncate_stub (frame, wb_truncate_helper, loc, - offset, xdata); - if (!stub) - goto unwind; + stub = fop_fsetattr_stub(frame, wb_fsetattr_helper, fd, stbuf, valid, + xdata); + if (!stub) + goto unwind; - if (!wb_enqueue 
(wb_inode, stub)) - goto unwind; + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - wb_process_queue (wb_inode); + wb_process_queue(wb_inode); - return 0; + return 0; +unwind: + STACK_UNWIND_STRICT(fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL); + + if (stub) + call_stub_destroy(stub); + return 0; + +noqueue: + STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; +} + +int32_t +wb_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + + wb_inode = wb_inode_create(this, fd->inode); + if (!wb_inode) + goto unwind; + + if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC)) + wb_inode->size = 0; + + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, + loc, flags, mode, umask, fd, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (truncate, frame, -1, ENOMEM, NULL, NULL, NULL); + STACK_UNWIND_STRICT(create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; +} - if (stub) - call_stub_destroy (stub); +int32_t +wb_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; - return 0; + wb_inode = wb_inode_create(this, fd->inode); + if (!wb_inode) + goto unwind; + + if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC)) + wb_inode->size = 0; + + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->open, + loc, flags, fd, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(open, frame, -1, ENOMEM, NULL, NULL); + return 0; } +int32_t +wb_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + if (op_ret == 0) { + wb_inode_t *wb_inode = wb_inode_ctx_get(this, inode); + if (wb_inode) + wb_set_inode_size(wb_inode, buf); + } + + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + return 0; +} int -wb_ftruncate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset, dict_t *xdata) +wb_lookup_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - STACK_WIND (frame, default_ftruncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); - return 0; + STACK_WIND(frame, wb_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + return 0; } +int32_t +wb_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; -int -wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - dict_t *xdata) + wb_inode = wb_inode_ctx_get(this, loc->inode); + if (!wb_inode) + goto noqueue; + + stub = fop_lookup_stub(frame, wb_lookup_helper, loc, xdata); + if (!stub) + goto unwind; + + if (!wb_enqueue(wb_inode, stub)) + goto unwind; + + wb_process_queue(wb_inode); + + return 0; + +unwind: + if (stub) + call_stub_destroy(stub); + + STACK_UNWIND_STRICT(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL); + return 0; + +noqueue: + STACK_WIND(frame, wb_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + return 0; +} + +static void +wb_mark_readdirp_start(xlator_t *this, inode_t *directory) { - wb_inode_t *wb_inode = NULL; - call_stub_t *stub = NULL; - int32_t op_errno = 0; + wb_inode_t *wb_directory_inode = NULL; - wb_inode = 
wb_inode_create (this, fd->inode); - if (!wb_inode) { - op_errno = ENOMEM; - goto unwind; - } + wb_directory_inode = wb_inode_create(this, directory); - if (wb_fd_err (fd, this, &op_errno)) - goto unwind; + if (!wb_directory_inode) + return; - stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd, - offset, xdata); - if (!stub) { - op_errno = ENOMEM; - goto unwind; - } + LOCK(&wb_directory_inode->lock); + { + GF_ATOMIC_INC(wb_directory_inode->readdirps); + } + UNLOCK(&wb_directory_inode->lock); - if (!wb_enqueue (wb_inode, stub)) { - op_errno = ENOMEM; - goto unwind; + return; +} + +static void +wb_mark_readdirp_end(xlator_t *this, inode_t *directory) +{ + wb_inode_t *wb_directory_inode = NULL, *wb_inode = NULL, *tmp = NULL; + int readdirps = 0; + + wb_directory_inode = wb_inode_ctx_get(this, directory); + + if (!wb_directory_inode) + return; + + LOCK(&wb_directory_inode->lock); + { + readdirps = GF_ATOMIC_DEC(wb_directory_inode->readdirps); + if (readdirps) + goto unlock; + + list_for_each_entry_safe(wb_inode, tmp, + &wb_directory_inode->invalidate_list, + invalidate_list) + { + list_del_init(&wb_inode->invalidate_list); + GF_ATOMIC_INIT(wb_inode->invalidate, 0); + inode_unref(wb_inode->inode); } + } +unlock: + UNLOCK(&wb_directory_inode->lock); - wb_process_queue (wb_inode); + return; +} - return 0; +int32_t +wb_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + gf_dirent_t *entry = NULL; + inode_t *inode = NULL; + fd_t *fd = NULL; + + fd = frame->local; + frame->local = NULL; + + if (op_ret <= 0) + goto unwind; + + list_for_each_entry(entry, &entries->list, list) + { + if (!entry->inode || !IA_ISREG(entry->d_stat.ia_type)) + continue; + + wb_inode = wb_inode_ctx_get(this, entry->inode); + if (!wb_inode) + continue; + + LOCK(&wb_inode->lock); + { + if (!list_empty(&wb_inode->liability) || + GF_ATOMIC_GET(wb_inode->invalidate)) { + inode = entry->inode; + + entry->inode = NULL; + memset(&entry->d_stat, 0, sizeof(entry->d_stat)); + } + } + UNLOCK(&wb_inode->lock); + + if (inode) { + inode_unref(inode); + inode = NULL; + } + } unwind: - STACK_UNWIND_STRICT (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + wb_mark_readdirp_end(this, fd->inode); - if (stub) - call_stub_destroy (stub); - return 0; + frame->local = NULL; + STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; } - -int -wb_setattr_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid, dict_t *xdata) +int32_t +wb_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) { - STACK_WIND (frame, default_setattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); - return 0; + wb_mark_readdirp_start(this, fd->inode); + + frame->local = fd; + + STACK_WIND(frame, wb_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata); + + return 0; } +int32_t +wb_link_helper(call_frame_t *frame, xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + return 0; +} -int -wb_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid, dict_t *xdata) +int32_t +wb_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - wb_inode_t *wb_inode = NULL; - call_stub_t *stub 
= NULL; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - wb_inode = wb_inode_ctx_get (this, loc->inode); - if (!wb_inode) - goto noqueue; + wb_inode = wb_inode_ctx_get(this, oldloc->inode); + if (!wb_inode) + goto noqueue; - stub = fop_setattr_stub (frame, wb_setattr_helper, loc, stbuf, - valid, xdata); - if (!stub) - goto unwind; + stub = fop_link_stub(frame, wb_link_helper, oldloc, newloc, xdata); + if (!stub) + goto unwind; - if (!wb_enqueue (wb_inode, stub)) - goto unwind; + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - wb_process_queue (wb_inode); + wb_process_queue(wb_inode); + + return 0; - return 0; unwind: - STACK_UNWIND_STRICT (setattr, frame, -1, ENOMEM, NULL, NULL, NULL); + STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); + + if (stub) + call_stub_destroy(stub); - if (stub) - call_stub_destroy (stub); - return 0; + return 0; noqueue: - STACK_WIND (frame, default_setattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); - return 0; + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + return 0; } +int32_t +wb_fallocate_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t keep_size, off_t offset, size_t len, dict_t *xdata) +{ + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset, + len, xdata); + return 0; +} -int -wb_fsetattr_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid, dict_t *xdata) +int32_t +wb_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size, + off_t offset, size_t len, dict_t *xdata) { - STACK_WIND (frame, default_fsetattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); - return 0; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; + + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; + + stub = fop_fallocate_stub(frame, wb_fallocate_helper, fd, keep_size, offset, + len, xdata); + if (!stub) + goto unwind; + + if (!wb_enqueue(wb_inode, stub)) + goto unwind; + + wb_process_queue(wb_inode); + + return 0; + +unwind: + STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL); + + if (stub) + call_stub_destroy(stub); + + return 0; + +noqueue: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset, + len, xdata); + return 0; } +int32_t +wb_discard_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard, + fd, offset, len, xdata); + return 0; +} -int -wb_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid, dict_t *xdata) +int32_t +wb_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { - wb_inode_t *wb_inode = NULL; - call_stub_t *stub = NULL; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - wb_inode = wb_inode_ctx_get (this, fd->inode); - if (!wb_inode) - goto noqueue; + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; - stub = fop_fsetattr_stub (frame, wb_fsetattr_helper, fd, stbuf, - valid, xdata); - if (!stub) - goto unwind; + stub = fop_discard_stub(frame, wb_discard_helper, fd, offset, len, xdata); + if (!stub) + goto unwind; - if (!wb_enqueue (wb_inode, stub)) - goto unwind; + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - 
wb_process_queue (wb_inode); + wb_process_queue(wb_inode); + + return 0; - return 0; unwind: - STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL); + STACK_UNWIND_STRICT(discard, frame, -1, ENOMEM, NULL, NULL, NULL); - if (stub) - call_stub_destroy (stub); - return 0; + if (stub) + call_stub_destroy(stub); + return 0; noqueue: - STACK_WIND (frame, default_fsetattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); - return 0; + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard, + fd, offset, len, xdata); + + return 0; } +int32_t +wb_zerofill_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->zerofill, + fd, offset, len, xdata); + return 0; +} -int -wb_forget (xlator_t *this, inode_t *inode) +int32_t +wb_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) { - uint64_t tmp = 0; - wb_inode_t *wb_inode = NULL; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - inode_ctx_del (inode, this, &tmp); + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; - wb_inode = (wb_inode_t *)(long)tmp; + stub = fop_zerofill_stub(frame, wb_zerofill_helper, fd, offset, len, xdata); + if (!stub) + goto unwind; - if (!wb_inode) - return 0; + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - GF_ASSERT (list_empty (&wb_inode->todo)); - GF_ASSERT (list_empty (&wb_inode->liability)); - GF_ASSERT (list_empty (&wb_inode->temptation)); + wb_process_queue(wb_inode); - GF_FREE (wb_inode); + return 0; - return 0; +unwind: + STACK_UNWIND_STRICT(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL); + + if (stub) + call_stub_destroy(stub); + +noqueue: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->zerofill, + fd, offset, len, xdata); + return 0; } +int32_t +wb_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; + + wb_inode = wb_inode_ctx_get(this, oldloc->inode); + if (!wb_inode) + goto noqueue; + + stub = fop_rename_stub(frame, default_rename_resume, oldloc, newloc, xdata); + if (!stub) + goto unwind; + + if (!wb_enqueue(wb_inode, stub)) + goto unwind; + + wb_process_queue(wb_inode); + + return 0; + +unwind: + if (stub) + call_stub_destroy(stub); + + STACK_UNWIND_STRICT(rename, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL, + NULL); + + return 0; + +noqueue: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, + oldloc, newloc, xdata); + return 0; +} int -wb_release (xlator_t *this, fd_t *fd) +wb_forget(xlator_t *this, inode_t *inode) { - uint64_t tmp = 0; + uint64_t tmp = 0; + wb_inode_t *wb_inode = NULL; + + inode_ctx_del(inode, this, &tmp); - fd_ctx_del (fd, this, &tmp); + wb_inode = (wb_inode_t *)(long)tmp; + if (!wb_inode) return 0; + + wb_inode_destroy(wb_inode); + + return 0; } +int +wb_release(xlator_t *this, fd_t *fd) +{ + uint64_t tmp = 0; + + (void)fd_ctx_del(fd, this, &tmp); + + return 0; +} int -wb_priv_dump (xlator_t *this) +wb_priv_dump(xlator_t *this) { - wb_conf_t *conf = NULL; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - int ret = -1; + wb_conf_t *conf = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + int ret = -1; - GF_VALIDATE_OR_GOTO ("write-behind", this, out); + GF_VALIDATE_OR_GOTO("write-behind", this, out); - conf = this->private; - GF_VALIDATE_OR_GOTO (this->name, conf, 
out); + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, out); - gf_proc_dump_build_key (key_prefix, "xlator.performance.write-behind", - "priv"); + gf_proc_dump_build_key(key_prefix, "xlator.performance.write-behind", + "priv"); - gf_proc_dump_add_section (key_prefix); + gf_proc_dump_add_section("%s", key_prefix); - gf_proc_dump_write ("aggregate_size", "%d", conf->aggregate_size); - gf_proc_dump_write ("window_size", "%d", conf->window_size); - gf_proc_dump_write ("flush_behind", "%d", conf->flush_behind); - gf_proc_dump_write ("trickling_writes", "%d", conf->trickling_writes); + gf_proc_dump_write("aggregate_size", "%" PRIu64, conf->aggregate_size); + gf_proc_dump_write("window_size", "%" PRIu64, conf->window_size); + gf_proc_dump_write("flush_behind", "%d", conf->flush_behind); + gf_proc_dump_write("trickling_writes", "%d", conf->trickling_writes); - ret = 0; + ret = 0; out: - return ret; + return ret; } - void -__wb_dump_requests (struct list_head *head, char *prefix) +__wb_dump_requests(struct list_head *head, char *prefix) { - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }, flag = 0; - wb_request_t *req = NULL; + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = + { + 0, + }, + flag = 0; + wb_request_t *req = NULL; - list_for_each_entry (req, head, all) { - gf_proc_dump_build_key (key_prefix, key, - (char *)gf_fop_list[req->fop]); + list_for_each_entry(req, head, all) + { + gf_proc_dump_build_key(key_prefix, key, "%s", + (char *)gf_fop_list[req->fop]); - gf_proc_dump_add_section(key_prefix); + gf_proc_dump_add_section("%s", key_prefix); - gf_proc_dump_write ("request-ptr", "%p", req); + gf_proc_dump_write("unique", "%" PRIu64, req->unique); - gf_proc_dump_write ("refcount", "%d", req->refcount); + gf_proc_dump_write("refcount", "%d", req->refcount); - if (list_empty (&req->todo)) - gf_proc_dump_write ("wound", "yes"); - else - gf_proc_dump_write ("wound", "no"); + if (list_empty(&req->todo)) + gf_proc_dump_write("wound", "yes"); + else + gf_proc_dump_write("wound", "no"); - if (req->fop == GF_FOP_WRITE) { - gf_proc_dump_write ("size", "%"GF_PRI_SIZET, - req->write_size); + gf_proc_dump_write("generation-number", "%" PRIu64, req->gen); - gf_proc_dump_write ("offset", "%"PRId64, - req->stub->args.offset); + gf_proc_dump_write("req->op_ret", "%d", req->op_ret); + gf_proc_dump_write("req->op_errno", "%d", req->op_errno); + gf_proc_dump_write("sync-attempts", "%d", req->wind_count); - flag = req->ordering.lied; - gf_proc_dump_write ("lied", "%d", flag); + if (req->fop == GF_FOP_WRITE) { + if (list_empty(&req->wip)) + gf_proc_dump_write("sync-in-progress", "no"); + else + gf_proc_dump_write("sync-in-progress", "yes"); - flag = req->ordering.append; - gf_proc_dump_write ("append", "%d", flag); + gf_proc_dump_write("size", "%" GF_PRI_SIZET, req->write_size); - flag = req->ordering.fulfilled; - gf_proc_dump_write ("fulfilled", "%d", flag); + if (req->stub) + gf_proc_dump_write("offset", "%" PRId64, + req->stub->args.offset); - flag = req->ordering.go; - gf_proc_dump_write ("go", "%d", flag); - } + flag = req->ordering.lied; + gf_proc_dump_write("lied", "%d", flag); + + flag = req->ordering.append; + gf_proc_dump_write("append", "%d", flag); + + flag = req->ordering.fulfilled; + gf_proc_dump_write("fulfilled", "%d", flag); + + flag = req->ordering.go; + gf_proc_dump_write("go", "%d", flag); } + } } - int -wb_inode_dump (xlator_t *this, inode_t *inode) +wb_inode_dump(xlator_t *this, inode_t *inode) { - 
wb_inode_t *wb_inode = NULL; - int32_t ret = -1; - char *path = NULL; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - char uuid_str[64] = {0,}; + wb_inode_t *wb_inode = NULL; + int32_t ret = -1; + char *path = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + char uuid_str[64] = { + 0, + }; + + if ((inode == NULL) || (this == NULL)) { + ret = 0; + goto out; + } - if ((inode == NULL) || (this == NULL)) { - ret = 0; - goto out; - } + wb_inode = wb_inode_ctx_get(this, inode); + if (wb_inode == NULL) { + ret = 0; + goto out; + } - wb_inode = wb_inode_ctx_get (this, inode); - if (wb_inode == NULL) { - ret = 0; - goto out; - } + uuid_utoa_r(inode->gfid, uuid_str); - gf_proc_dump_build_key (key_prefix, "xlator.performance.write-behind", - "wb_inode"); + gf_proc_dump_build_key(key_prefix, "xlator.performance.write-behind", + "wb_inode"); - gf_proc_dump_add_section (key_prefix); + gf_proc_dump_add_section("%s", key_prefix); - __inode_path (inode, NULL, &path); - if (path != NULL) { - gf_proc_dump_write ("path", "%s", path); - GF_FREE (path); - } + __inode_path(inode, NULL, &path); + if (path != NULL) { + gf_proc_dump_write("path", "%s", path); + GF_FREE(path); + } - gf_proc_dump_write ("inode", "%p", inode); + gf_proc_dump_write("inode", "%p", inode); - gf_proc_dump_write ("window_conf", "%"GF_PRI_SIZET, - wb_inode->window_conf); + gf_proc_dump_write("gfid", "%s", uuid_str); - gf_proc_dump_write ("window_current", "%"GF_PRI_SIZET, - wb_inode->window_current); + gf_proc_dump_write("window_conf", "%" GF_PRI_SIZET, wb_inode->window_conf); + gf_proc_dump_write("window_current", "%" GF_PRI_SIZET, + wb_inode->window_current); - ret = TRY_LOCK (&wb_inode->lock); - if (!ret) - { - if (!list_empty (&wb_inode->all)) { - __wb_dump_requests (&wb_inode->all, key_prefix); - } - UNLOCK (&wb_inode->lock); + gf_proc_dump_write("transit-size", "%" GF_PRI_SIZET, wb_inode->transit); + + gf_proc_dump_write("dontsync", "%d", wb_inode->dontsync); + + ret = TRY_LOCK(&wb_inode->lock); + if (!ret) { + if (!list_empty(&wb_inode->all)) { + __wb_dump_requests(&wb_inode->all, key_prefix); } + UNLOCK(&wb_inode->lock); + } - if (ret && wb_inode) - gf_proc_dump_write ("Unable to dump the inode information", - "(Lock acquisition failed) %p (gfid: %s)", - wb_inode, - uuid_utoa_r (inode->gfid, uuid_str)); - ret = 0; + if (ret && wb_inode) + gf_proc_dump_write("Unable to dump the inode information", + "(Lock acquisition failed) %p (gfid: %s)", wb_inode, + uuid_str); + + ret = 0; out: - return ret; + return ret; } - int -mem_acct_init (xlator_t *this) +mem_acct_init(xlator_t *this) { - int ret = -1; + int ret = -1; - if (!this) { - goto out; - } + if (!this) { + goto out; + } - ret = xlator_mem_acct_init (this, gf_wb_mt_end + 1); + ret = xlator_mem_acct_init(this, gf_wb_mt_end + 1); - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - } + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, WRITE_BEHIND_MSG_NO_MEMORY, + "Memory accounting init" + "failed"); + } out: - return ret; + return ret; } - int -reconfigure (xlator_t *this, dict_t *options) +reconfigure(xlator_t *this, dict_t *options) { - wb_conf_t *conf = NULL; - int ret = -1; + wb_conf_t *conf = NULL; + int ret = -1; - conf = this->private; + conf = this->private; - GF_OPTION_RECONF ("cache-size", conf->window_size, options, size, out); + GF_OPTION_RECONF("cache-size", conf->window_size, options, size_uint64, + out); - GF_OPTION_RECONF ("flush-behind", conf->flush_behind, options, bool, - out); + 
GF_OPTION_RECONF("flush-behind", conf->flush_behind, options, bool, out); - GF_OPTION_RECONF ("trickling-writes", conf->trickling_writes, options, - bool, out); + GF_OPTION_RECONF("trickling-writes", conf->trickling_writes, options, bool, + out); - GF_OPTION_RECONF ("strict-O_DIRECT", conf->strict_O_DIRECT, options, - bool, out); + GF_OPTION_RECONF("strict-O_DIRECT", conf->strict_O_DIRECT, options, bool, + out); - GF_OPTION_RECONF ("strict-write-ordering", conf->strict_write_ordering, - options, bool, out); - ret = 0; + GF_OPTION_RECONF("strict-write-ordering", conf->strict_write_ordering, + options, bool, out); + GF_OPTION_RECONF("resync-failed-syncs-after-fsync", + conf->resync_after_fsync, options, bool, out); + + ret = 0; out: - return ret; + return ret; } - int32_t -init (xlator_t *this) +init(xlator_t *this) { - wb_conf_t *conf = NULL; - int32_t ret = -1; - - if ((this->children == NULL) - || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: write-behind (%s) not configured with exactly " - "one child", this->name); - goto out; - } - - if (this->parents == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfilex"); - } - - conf = GF_CALLOC (1, sizeof (*conf), gf_wb_mt_wb_conf_t); - if (conf == NULL) { - goto out; - } - - /* configure 'options aggregate-size <size>' */ - conf->aggregate_size = WB_AGGREGATE_SIZE; - - /* configure 'option window-size <size>' */ - GF_OPTION_INIT ("cache-size", conf->window_size, size, out); - - if (!conf->window_size && conf->aggregate_size) { - gf_log (this->name, GF_LOG_WARNING, - "setting window-size to be equal to " - "aggregate-size(%"PRIu64")", - conf->aggregate_size); - conf->window_size = conf->aggregate_size; - } - - if (conf->window_size < conf->aggregate_size) { - gf_log (this->name, GF_LOG_ERROR, - "aggregate-size(%"PRIu64") cannot be more than " - "window-size(%"PRIu64")", conf->aggregate_size, - conf->window_size); - goto out; - } - - /* configure 'option flush-behind <on/off>' */ - GF_OPTION_INIT ("flush-behind", conf->flush_behind, bool, out); - - GF_OPTION_INIT ("trickling-writes", conf->trickling_writes, bool, out); - - GF_OPTION_INIT ("strict-O_DIRECT", conf->strict_O_DIRECT, bool, out); - - GF_OPTION_INIT ("strict-write-ordering", conf->strict_write_ordering, - bool, out); - - this->private = conf; - ret = 0; + wb_conf_t *conf = NULL; + int32_t ret = -1; + + if ((this->children == NULL) || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, 0, WRITE_BEHIND_MSG_INIT_FAILED, + "FATAL: write-behind (%s) not configured with exactly " + "one child", + this->name); + goto out; + } + + if (this->parents == NULL) { + gf_msg(this->name, GF_LOG_WARNING, 0, + WRITE_BEHIND_MSG_VOL_MISCONFIGURED, + "dangling volume. 
check volfilex"); + } + + conf = GF_CALLOC(1, sizeof(*conf), gf_wb_mt_wb_conf_t); + if (conf == NULL) { + goto out; + } + + /* configure 'options aggregate-size <size>' */ + GF_OPTION_INIT("aggregate-size", conf->aggregate_size, size_uint64, out); + conf->page_size = conf->aggregate_size; + + /* configure 'option window-size <size>' */ + GF_OPTION_INIT("cache-size", conf->window_size, size_uint64, out); + + if (!conf->window_size && conf->aggregate_size) { + gf_msg(this->name, GF_LOG_WARNING, 0, WRITE_BEHIND_MSG_SIZE_NOT_SET, + "setting window-size to be equal to " + "aggregate-size(%" PRIu64 ")", + conf->aggregate_size); + conf->window_size = conf->aggregate_size; + } + + if (conf->window_size < conf->aggregate_size) { + gf_msg(this->name, GF_LOG_ERROR, 0, WRITE_BEHIND_MSG_EXCEEDED_MAX_SIZE, + "aggregate-size(%" PRIu64 + ") cannot be more than " + "window-size(%" PRIu64 ")", + conf->aggregate_size, conf->window_size); + goto out; + } + + /* configure 'option flush-behind <on/off>' */ + GF_OPTION_INIT("flush-behind", conf->flush_behind, bool, out); + + GF_OPTION_INIT("trickling-writes", conf->trickling_writes, bool, out); + + GF_OPTION_INIT("strict-O_DIRECT", conf->strict_O_DIRECT, bool, out); + + GF_OPTION_INIT("strict-write-ordering", conf->strict_write_ordering, bool, + out); + + GF_OPTION_INIT("resync-failed-syncs-after-fsync", conf->resync_after_fsync, + bool, out); + + this->private = conf; + ret = 0; out: - if (ret) { - GF_FREE (conf); - } - return ret; + if (ret) { + GF_FREE(conf); + } + return ret; } - void -fini (xlator_t *this) +fini(xlator_t *this) { - wb_conf_t *conf = NULL; + wb_conf_t *conf = NULL; - GF_VALIDATE_OR_GOTO ("write-behind", this, out); + GF_VALIDATE_OR_GOTO("write-behind", this, out); - conf = this->private; - if (!conf) { - goto out; - } + conf = this->private; + if (!conf) { + goto out; + } - this->private = NULL; - GF_FREE (conf); + this->private = NULL; + GF_FREE(conf); out: - return; + return; } - struct xlator_fops fops = { - .writev = wb_writev, - .readv = wb_readv, - .flush = wb_flush, - .fsync = wb_fsync, - .stat = wb_stat, - .fstat = wb_fstat, - .truncate = wb_truncate, - .ftruncate = wb_ftruncate, - .setattr = wb_setattr, - .fsetattr = wb_fsetattr, -}; - - -struct xlator_cbks cbks = { - .forget = wb_forget, - .release = wb_release + .writev = wb_writev, + .readv = wb_readv, + .flush = wb_flush, + .fsync = wb_fsync, + .stat = wb_stat, + .fstat = wb_fstat, + .truncate = wb_truncate, + .ftruncate = wb_ftruncate, + .setattr = wb_setattr, + .fsetattr = wb_fsetattr, + .lookup = wb_lookup, + .readdirp = wb_readdirp, + .link = wb_link, + .fallocate = wb_fallocate, + .discard = wb_discard, + .zerofill = wb_zerofill, + .rename = wb_rename, }; +struct xlator_cbks cbks = {.forget = wb_forget, .release = wb_release}; struct xlator_dumpops dumpops = { - .priv = wb_priv_dump, - .inodectx = wb_inode_dump, + .priv = wb_priv_dump, + .inodectx = wb_inode_dump, }; - struct volume_options options[] = { - { .key = {"flush-behind"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - .description = "If this option is set ON, instructs write-behind " - "translator to perform flush in background, by " - "returning success (or any errors, if any of " - "previous writes were failed) to application even " - "before flush FOP is sent to backend filesystem. 
" - }, - { .key = {"cache-size", "window-size"}, - .type = GF_OPTION_TYPE_SIZET, - .min = 512 * GF_UNIT_KB, - .max = 1 * GF_UNIT_GB, - .default_value = "1MB", - .description = "Size of the write-behind buffer for a single file " - "(inode)." - }, - { .key = {"trickling-writes"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - }, - { .key = {"strict-O_DIRECT"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - .description = "This option when set to off, ignores the " - "O_DIRECT flag." - }, - { .key = {"strict-write-ordering"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - .description = "Do not let later writes overtake earlier writes even " - "if they do not overlap", - }, - { .key = {NULL} }, + { + .key = {"write-behind"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable write-behind", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + {.key = {"flush-behind"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"write-behind"}, + .description = "If this option is set ON, instructs write-behind " + "translator to perform flush in background, by " + "returning success (or any errors, if any of " + "previous writes were failed) to application even " + "before flush FOP is sent to backend filesystem. "}, + {.key = {"cache-size", "window-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 512 * GF_UNIT_KB, + .max = 1 * GF_UNIT_GB, + .default_value = "1MB", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"write-behind"}, + .description = "Size of the write-behind buffer for a single file " + "(inode)."}, + { + .key = {"trickling-writes"}, + .type = GF_OPTION_TYPE_BOOL, + .op_version = {GD_OP_VERSION_3_13_1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"write-behind"}, + .default_value = "on", + }, + {.key = {"strict-O_DIRECT"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"write-behind"}, + .description = "This option when set to off, ignores the " + "O_DIRECT flag."}, + { + .key = {"strict-write-ordering"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"write-behind"}, + .description = "Do not let later writes overtake earlier writes even " + "if they do not overlap", + }, + { + .key = {"resync-failed-syncs-after-fsync"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {GD_OP_VERSION_3_7_7}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"write-behind"}, + .description = "If sync of \"cached-writes issued before fsync\" " + "(to backend) fails, this option configures whether " + "to retry syncing them after fsync or forget them. " + "If set to on, cached-writes are retried " + "till a \"flush\" fop (or a successful sync) on sync " + "failures. " + "fsync itself is failed irrespective of the value of " + "this option. 
", + }, + { + .key = {"aggregate-size"}, + .type = GF_OPTION_TYPE_SIZET, + .default_value = "128KB", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .description = "Will aggregate writes until data of specified " + "size is fully filled for a single file provided " + "there are no dependent fops on cached writes. This " + "option just sets the aggregate size. Note that " + "aggregation won't happen if " + "performance.write-behind-trickling-writes" + " is turned on. Hence turn off " + "performance.write-behind.trickling-writes" + " so that writes are aggregated till a max of " + "\"aggregate-size\" bytes", + }, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "write-behind", + .category = GF_MAINTAINED, }; |

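Nearly every metadata fop touched in the write-behind changes above (flush, fsync, stat, fstat, setattr, fsetattr, lookup, link, fallocate, discard, zerofill, rename) follows the same ordering skeleton: if the inode has a write-behind context, the fop is wrapped in a call stub and queued behind any cached writes; otherwise it is wound straight to the child translator. The condensed sketch below restates that skeleton using flush as the example. It is illustrative only and not part of the patch: the wrapper name wb_flush_pattern_sketch is hypothetical, every other helper (wb_inode_ctx_get, fop_flush_stub, wb_flush_helper, wb_enqueue, wb_process_queue) is the one used in the patch itself, and the code assumes it is compiled inside write-behind.c with its usual headers and types.

/* Condensed restatement of the queueing pattern applied to most fops in
 * this patch; illustration only, assumes the write-behind.c context. */
int
wb_flush_pattern_sketch(call_frame_t *frame, xlator_t *this, fd_t *fd,
                        dict_t *xdata)
{
    wb_inode_t *wb_inode = NULL;
    call_stub_t *stub = NULL;

    wb_inode = wb_inode_ctx_get(this, fd->inode);
    if (!wb_inode)
        goto noqueue;               /* no cached writes: plain passthrough */

    stub = fop_flush_stub(frame, wb_flush_helper, fd, xdata);
    if (!stub)
        goto unwind;

    if (!wb_enqueue(wb_inode, stub))
        goto unwind;                /* park the fop behind cached writes */

    wb_process_queue(wb_inode);     /* wind whatever is now safe to wind */
    return 0;

unwind:
    STACK_UNWIND_STRICT(flush, frame, -1, ENOMEM, NULL);
    if (stub)
        call_stub_destroy(stub);
    return 0;

noqueue:
    STACK_WIND(frame, default_flush_cbk, FIRST_CHILD(this),
               FIRST_CHILD(this)->fops->flush, fd, xdata);
    return 0;
}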
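The flush-behind path is the one place where the patch deliberately answers the application before the backend has acted: when conf->flush_behind is on, wb_flush_helper copies the frame, winds the real flush on the copy with wb_flush_bg_cbk (whose only job is to destroy that background frame), and unwinds the original frame at once. A minimal sketch of that shape, again assuming the write-behind.c context and not part of the patch (the wrapper name wb_flush_behind_sketch is hypothetical):

/* Flush-behind shape: acknowledge the caller immediately, send the real
 * flush to the child translator on a copied frame in the background. */
static int
wb_flush_behind_sketch(call_frame_t *frame, xlator_t *this, fd_t *fd,
                       dict_t *xdata)
{
    call_frame_t *bg_frame = copy_frame(frame);

    if (!bg_frame) {
        /* could not set up the background wind: fail the fop */
        STACK_UNWIND_STRICT(flush, frame, -1, ENOMEM, NULL);
        return 0;
    }

    /* real flush goes to the child translator on the copied frame;
     * wb_flush_bg_cbk (defined in the patch) only destroys bg_frame */
    STACK_WIND(bg_frame, wb_flush_bg_cbk, FIRST_CHILD(this),
               FIRST_CHILD(this)->fops->flush, fd, xdata);

    /* answer the application without waiting for the background flush */
    STACK_UNWIND_STRICT(flush, frame, 0, 0, NULL);
    return 0;
}

When flush-behind is off, wb_flush_helper instead winds flush synchronously through default_flush_cbk; the behaviour is governed by the flush-behind option in the options table above, which defaults to on.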