Diffstat (limited to 'xlators/performance')
-rw-r--r--  xlators/performance/Makefile.am                                  |    2
-rw-r--r--  xlators/performance/io-cache/src/Makefile.am                     |    2
-rw-r--r--  xlators/performance/io-cache/src/io-cache.c                      |   88
-rw-r--r--  xlators/performance/io-cache/src/page.c                          |    5
-rw-r--r--  xlators/performance/io-threads/src/Makefile.am                   |    2
-rw-r--r--  xlators/performance/io-threads/src/io-threads.c                  |  160
-rw-r--r--  xlators/performance/md-cache/src/Makefile.am                     |    2
-rw-r--r--  xlators/performance/md-cache/src/md-cache.c                      |  344
-rw-r--r--  xlators/performance/open-behind/Makefile.am                      |    1
-rw-r--r--  xlators/performance/open-behind/src/Makefile.am                  |   15
-rw-r--r--  xlators/performance/open-behind/src/open-behind-mem-types.h     |   21
-rw-r--r--  xlators/performance/open-behind/src/open-behind.c                | 1001
-rw-r--r--  xlators/performance/quick-read/src/Makefile.am                   |    2
-rw-r--r--  xlators/performance/quick-read/src/quick-read.c                  |   44
-rw-r--r--  xlators/performance/read-ahead/src/Makefile.am                   |    2
-rw-r--r--  xlators/performance/read-ahead/src/read-ahead.c                  |  115
-rw-r--r--  xlators/performance/readdir-ahead/Makefile.am                    |    3
-rw-r--r--  xlators/performance/readdir-ahead/src/Makefile.am                |   15
-rw-r--r--  xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h |   24
-rw-r--r--  xlators/performance/readdir-ahead/src/readdir-ahead.c            |  560
-rw-r--r--  xlators/performance/readdir-ahead/src/readdir-ahead.h            |   46
-rw-r--r--  xlators/performance/symlink-cache/src/Makefile.am                |    2
-rw-r--r--  xlators/performance/write-behind/src/Makefile.am                 |    2
-rw-r--r--  xlators/performance/write-behind/src/write-behind.c              |  368
24 files changed, 2669 insertions, 157 deletions
diff --git a/xlators/performance/Makefile.am b/xlators/performance/Makefile.am
index eb94d8d6a..a494190ba 100644
--- a/xlators/performance/Makefile.am
+++ b/xlators/performance/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = write-behind read-ahead io-threads io-cache symlink-cache quick-read md-cache
+SUBDIRS = write-behind read-ahead readdir-ahead io-threads io-cache symlink-cache quick-read md-cache open-behind
CLEANFILES =
diff --git a/xlators/performance/io-cache/src/Makefile.am b/xlators/performance/io-cache/src/Makefile.am
index 838e5f597..155be9988 100644
--- a/xlators/performance/io-cache/src/Makefile.am
+++ b/xlators/performance/io-cache/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = io-cache.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-io_cache_la_LDFLAGS = -module -avoidversion
+io_cache_la_LDFLAGS = -module -avoid-version
io_cache_la_SOURCES = io-cache.c page.c ioc-inode.c
io_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/performance/io-cache/src/io-cache.c b/xlators/performance/io-cache/src/io-cache.c
index a7459a9a3..201777b38 100644
--- a/xlators/performance/io-cache/src/io-cache.c
+++ b/xlators/performance/io-cache/src/io-cache.c
@@ -316,9 +316,11 @@ ioc_forget (xlator_t *this, inode_t *inode)
static int32_t
ioc_invalidate(xlator_t *this, inode_t *inode)
{
+ uint64_t ioc_addr = 0;
ioc_inode_t *ioc_inode = NULL;
- inode_ctx_get(inode, this, (uint64_t *) &ioc_inode);
+ inode_ctx_get(inode, this, (uint64_t *) &ioc_addr);
+ ioc_inode = (void *) ioc_addr;
if (ioc_inode)
ioc_inode_flush(ioc_inode);
@@ -551,6 +553,13 @@ ioc_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
inode_ctx_get (fd->inode, this, &tmp_ioc_inode);
ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode;
+ //TODO: see why inode context is NULL and handle it.
+ if (!ioc_inode) {
+ gf_log (this->name, GF_LOG_ERROR, "inode context is "
+ "NULL (%s)", uuid_utoa (fd->inode->gfid));
+ goto out;
+ }
+
ioc_table_lock (ioc_inode->table);
{
list_move_tail (&ioc_inode->inode_lru,
@@ -1415,6 +1424,58 @@ ioc_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
return 0;
}
+static int32_t
+ioc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, pre, post, xdata);
+ return 0;
+}
+
+static int32_t
+ioc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ uint64_t ioc_inode = 0;
+
+ inode_ctx_get (fd->inode, this, &ioc_inode);
+
+ if (ioc_inode)
+ ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode);
+
+ STACK_WIND(frame, ioc_discard_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+ return 0;
+}
+
+static int32_t
+ioc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ STACK_UNWIND_STRICT(zerofill, frame, op_ret,
+ op_errno, pre, post, xdata);
+ return 0;
+}
+
+static int32_t
+ioc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ uint64_t ioc_inode = 0;
+
+ inode_ctx_get (fd->inode, this, &ioc_inode);
+
+ if (ioc_inode)
+ ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode);
+
+ STACK_WIND(frame, ioc_zerofill_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+ return 0;
+}
+
+
int32_t
ioc_get_priority_list (const char *opt_str, struct list_head *first)
{
@@ -1878,11 +1939,11 @@ int
ioc_inode_dump (xlator_t *this, inode_t *inode)
{
- char *path = NULL;
+ char *path = NULL;
int ret = -1;
char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, };
uint64_t tmp_ioc_inode = 0;
- ioc_inode_t *ioc_inode = NULL;
+ ioc_inode_t *ioc_inode = NULL;
gf_boolean_t section_added = _gf_false;
char uuid_str[64] = {0,};
@@ -1896,9 +1957,6 @@ ioc_inode_dump (xlator_t *this, inode_t *inode)
if (ioc_inode == NULL)
goto out;
- gf_proc_dump_add_section (key_prefix);
- section_added = _gf_true;
-
/* Similar to ioc_page_dump function its better to use
* pthread_mutex_trylock and not to use gf_log in statedump
* to avoid deadlocks.
@@ -1906,24 +1964,30 @@ ioc_inode_dump (xlator_t *this, inode_t *inode)
ret = pthread_mutex_trylock (&ioc_inode->inode_lock);
if (ret)
goto out;
- else
+
{
- gf_proc_dump_write ("inode.weight", "%d", ioc_inode->weight);
+ if (uuid_is_null (ioc_inode->inode->gfid))
+ goto unlock;
+
+ gf_proc_dump_add_section (key_prefix);
+ section_added = _gf_true;
- //inode_path takes blocking lock on the itable.
__inode_path (ioc_inode->inode, NULL, &path);
+ gf_proc_dump_write ("inode.weight", "%d", ioc_inode->weight);
+
if (path) {
gf_proc_dump_write ("path", "%s", path);
GF_FREE (path);
}
+
gf_proc_dump_write ("uuid", "%s", uuid_utoa_r
(ioc_inode->inode->gfid, uuid_str));
__ioc_cache_dump (ioc_inode, key_prefix);
__ioc_inode_waitq_dump (ioc_inode, key_prefix);
-
- pthread_mutex_unlock (&ioc_inode->inode_lock);
}
+unlock:
+ pthread_mutex_unlock (&ioc_inode->inode_lock);
out:
if (ret && ioc_inode) {
@@ -2037,6 +2101,8 @@ struct xlator_fops fops = {
.mknod = ioc_mknod,
.readdirp = ioc_readdirp,
+ .discard = ioc_discard,
+ .zerofill = ioc_zerofill,
};
diff --git a/xlators/performance/io-cache/src/page.c b/xlators/performance/io-cache/src/page.c
index 54c6f9b50..b2e20ba65 100644
--- a/xlators/performance/io-cache/src/page.c
+++ b/xlators/performance/io-cache/src/page.c
@@ -824,7 +824,6 @@ ioc_frame_unwind (call_frame_t *frame)
}
// ioc_local_lock (local);
- frame->local = NULL;
iobref = iobref_new ();
if (iobref == NULL) {
op_ret = -1;
@@ -875,6 +874,7 @@ unwind:
// ioc_local_unlock (local);
+ frame->local = NULL;
STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector,
count, &stbuf, iobref, NULL);
@@ -888,7 +888,8 @@ unwind:
}
pthread_mutex_destroy (&local->local_lock);
- mem_put (local);
+ if (local)
+ mem_put (local);
return;
}
diff --git a/xlators/performance/io-threads/src/Makefile.am b/xlators/performance/io-threads/src/Makefile.am
index 0f5a3b181..d63042e7c 100644
--- a/xlators/performance/io-threads/src/Makefile.am
+++ b/xlators/performance/io-threads/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = io-threads.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-io_threads_la_LDFLAGS = -module -avoidversion
+io_threads_la_LDFLAGS = -module -avoid-version
io_threads_la_SOURCES = io-threads.c
io_threads_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c
index ccbd41194..bbcf4ed26 100644
--- a/xlators/performance/io-threads/src/io-threads.c
+++ b/xlators/performance/io-threads/src/io-threads.c
@@ -307,6 +307,9 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub)
case GF_FOP_XATTROP:
case GF_FOP_FXATTROP:
case GF_FOP_RCHECKSUM:
+ case GF_FOP_FALLOCATE:
+ case GF_FOP_DISCARD:
+ case GF_FOP_ZEROFILL:
pri = IOT_PRI_LO;
break;
@@ -321,9 +324,9 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub)
break;
}
out:
- ret = do_iot_schedule (this->private, stub, pri);
gf_log (this->name, GF_LOG_DEBUG, "%s scheduled as %s fop",
gf_fop_list[stub->fop], iot_get_pri_meaning (pri));
+ ret = do_iot_schedule (this->private, stub, pri);
return ret;
}
@@ -2406,6 +2409,155 @@ out:
return 0;
}
+int
+iot_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preop, struct iatt *postop, dict_t *xdata)
+{
+ STACK_UNWIND_STRICT (fallocate, frame, op_ret, op_errno, preop, postop,
+ xdata);
+ return 0;
+}
+
+
+int
+iot_fallocate_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ STACK_WIND (frame, iot_fallocate_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fallocate, fd, mode, offset, len,
+ xdata);
+ return 0;
+}
+
+
+int
+iot_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+ int ret = -1;
+
+ stub = fop_fallocate_stub(frame, iot_fallocate_wrapper, fd, mode, offset,
+ len, xdata);
+ if (!stub) {
+ gf_log (this->name, GF_LOG_ERROR, "cannot create fallocate stub"
+ "(out of memory)");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = iot_schedule (frame, this, stub);
+
+out:
+ if (ret < 0) {
+ STACK_UNWIND_STRICT (fallocate, frame, -1, -ret, NULL, NULL,
+ NULL);
+ if (stub != NULL) {
+ call_stub_destroy (stub);
+ }
+ }
+ return 0;
+}
+
+int
+iot_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preop, struct iatt *postop, dict_t *xdata)
+{
+ STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, preop, postop,
+ xdata);
+ return 0;
+}
+
+
+int
+iot_discard_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ STACK_WIND (frame, iot_discard_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->discard, fd, offset, len, xdata);
+ return 0;
+}
+
+
+int
+iot_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+ int ret = -1;
+
+ stub = fop_discard_stub(frame, iot_discard_wrapper, fd, offset, len,
+ xdata);
+ if (!stub) {
+ gf_log (this->name, GF_LOG_ERROR, "cannot create discard stub"
+ "(out of memory)");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = iot_schedule (frame, this, stub);
+
+out:
+ if (ret < 0) {
+ STACK_UNWIND_STRICT (discard, frame, -1, -ret, NULL, NULL,
+ NULL);
+ if (stub != NULL) {
+ call_stub_destroy (stub);
+ }
+ }
+ return 0;
+}
+
+int
+iot_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preop, struct iatt *postop, dict_t *xdata)
+{
+ STACK_UNWIND_STRICT (zerofill, frame, op_ret, op_errno, preop, postop,
+ xdata);
+ return 0;
+}
+
+int
+iot_zerofill_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ STACK_WIND (frame, iot_zerofill_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->zerofill, fd, offset, len, xdata);
+ return 0;
+}
+
+int
+iot_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+ int ret = -1;
+
+ stub = fop_zerofill_stub(frame, iot_zerofill_wrapper, fd,
+ offset, len, xdata);
+ if (!stub) {
+ gf_log (this->name, GF_LOG_ERROR, "cannot create zerofill stub"
+ "(out of memory)");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = iot_schedule (frame, this, stub);
+
+out:
+ if (ret < 0) {
+ STACK_UNWIND_STRICT (zerofill, frame, -1, -ret, NULL, NULL,
+ NULL);
+ if (stub != NULL) {
+ call_stub_destroy (stub);
+ }
+ }
+ return 0;
+}
+
int
__iot_workers_scale (iot_conf_t *conf)
@@ -2432,7 +2584,7 @@ __iot_workers_scale (iot_conf_t *conf)
while (diff) {
diff --;
- ret = pthread_create (&thread, &conf->w_attr, iot_worker, conf);
+ ret = gf_thread_create (&thread, &conf->w_attr, iot_worker, conf);
if (ret == 0) {
conf->curr_count++;
gf_log (conf->this->name, GF_LOG_DEBUG,
@@ -2736,6 +2888,9 @@ struct xlator_fops fops = {
.xattrop = iot_xattrop,
.fxattrop = iot_fxattrop,
.rchecksum = iot_rchecksum,
+ .fallocate = iot_fallocate,
+ .discard = iot_discard,
+ .zerofill = iot_zerofill,
};
struct xlator_cbks cbks;
@@ -2799,6 +2954,7 @@ struct volume_options options[] = {
{.key = {"least-rate-limit"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
+ .max = INT_MAX,
.default_value = "0",
.description = "Max number of least priority operations to handle "
"per-second"
diff --git a/xlators/performance/md-cache/src/Makefile.am b/xlators/performance/md-cache/src/Makefile.am
index bd09c15c2..8c9f5a858 100644
--- a/xlators/performance/md-cache/src/Makefile.am
+++ b/xlators/performance/md-cache/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = md-cache.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-md_cache_la_LDFLAGS = -module -avoidversion
+md_cache_la_LDFLAGS = -module -avoid-version
md_cache_la_SOURCES = md-cache.c
md_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c
index 0c5ca87d2..84c363ad9 100644
--- a/xlators/performance/md-cache/src/md-cache.c
+++ b/xlators/performance/md-cache/src/md-cache.c
@@ -18,6 +18,7 @@
#include "dict.h"
#include "xlator.h"
#include "md-cache-mem-types.h"
+#include "glusterfs-acl.h"
#include <assert.h>
#include <sys/time.h>
@@ -42,17 +43,17 @@ static struct mdc_key {
int check;
} mdc_keys[] = {
{
- .name = "system.posix_acl_access",
+ .name = POSIX_ACL_ACCESS_XATTR,
.load = 0,
.check = 1,
},
{
- .name = "system.posix_acl_default",
+ .name = POSIX_ACL_DEFAULT_XATTR,
.load = 0,
.check = 1,
},
{
- .name = "security.selinux",
+ .name = GF_SELINUX_XATTR_KEY,
.load = 0,
.check = 1,
},
@@ -132,6 +133,7 @@ struct mdc_local {
loc_t loc2;
fd_t *fd;
char *linkname;
+ char *key;
dict_t *xattr;
};
@@ -174,7 +176,7 @@ __mdc_inode_ctx_set (xlator_t *this, inode_t *inode, struct md_cache *mdc)
uint64_t mdc_int = 0;
mdc_int = (long) mdc;
- ret = __inode_ctx_set2 (inode, this, &mdc_int, 0);
+ ret = __inode_ctx_set (inode, this, &mdc_int);
return ret;
}
@@ -229,6 +231,8 @@ mdc_local_wipe (xlator_t *this, mdc_local_t *local)
GF_FREE (local->linkname);
+ GF_FREE (local->key);
+
if (local->xattr)
dict_unref (local->xattr);
@@ -585,6 +589,31 @@ out:
int
+mdc_inode_xatt_unset (xlator_t *this, inode_t *inode, char *name)
+{
+ int ret = -1;
+ struct md_cache *mdc = NULL;
+
+ mdc = mdc_inode_prep (this, inode);
+ if (!mdc)
+ goto out;
+
+ if (!name)
+ goto out;
+
+ LOCK (&mdc->lock);
+ {
+ dict_del (mdc->xattr, name);
+ }
+ UNLOCK (&mdc->lock);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+
+int
mdc_inode_xatt_get (xlator_t *this, inode_t *inode, dict_t **dict)
{
int ret = -1;
@@ -598,13 +627,15 @@ mdc_inode_xatt_get (xlator_t *this, inode_t *inode, dict_t **dict)
LOCK (&mdc->lock);
{
+ ret = 0;
+ /* Missing xattr only means no keys were there, i.e
+ a negative cache for the "loaded" keys
+ */
if (!mdc->xattr)
goto unlock;
if (dict)
*dict = dict_ref (mdc->xattr);
-
- ret = 0;
}
unlock:
UNLOCK (&mdc->lock);
@@ -614,6 +645,46 @@ out:
}
+int
+mdc_inode_iatt_invalidate (xlator_t *this, inode_t *inode)
+{
+ int ret = -1;
+ struct md_cache *mdc = NULL;
+
+ if (mdc_inode_ctx_get (this, inode, &mdc) != 0)
+ goto out;
+
+ LOCK (&mdc->lock);
+ {
+ mdc->ia_time = 0;
+ }
+ UNLOCK (&mdc->lock);
+
+out:
+ return ret;
+}
+
+
+int
+mdc_inode_xatt_invalidate (xlator_t *this, inode_t *inode)
+{
+ int ret = -1;
+ struct md_cache *mdc = NULL;
+
+ if (mdc_inode_ctx_get (this, inode, &mdc) != 0)
+ goto out;
+
+ LOCK (&mdc->lock);
+ {
+ mdc->xa_time = 0;
+ }
+ UNLOCK (&mdc->lock);
+
+out:
+ return ret;
+}
+
+
void
mdc_load_reqs (xlator_t *this, dict_t *dict)
{
@@ -647,7 +718,7 @@ is_mdc_key_satisfied (const char *key)
return 0;
for (mdc_key = mdc_keys[i].name; (mdc_key = mdc_keys[i].name); i++) {
- if (!mdc_keys[i].check)
+ if (!mdc_keys[i].load)
continue;
if (strcmp (mdc_key, key) == 0)
return 1;
@@ -721,6 +792,7 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
struct iatt stbuf = {0, };
struct iatt postparent = {0, };
dict_t *xattr_rsp = NULL;
+ dict_t *xattr_alloc = NULL;
mdc_local_t *local = NULL;
@@ -728,6 +800,13 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
if (!local)
goto uncached;
+ if (!loc->name)
+ /* A nameless discovery is dangerous to cache. We
+ perform nameless lookup with the intention of
+ re-establishing an inode "properly"
+ */
+ goto uncached;
+
loc_copy (&local->loc, loc);
ret = mdc_inode_iatt_get (this, loc->inode, &stbuf);
@@ -752,6 +831,8 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
return 0;
uncached:
+ if (!xdata)
+ xdata = xattr_alloc = dict_new ();
if (xdata)
mdc_load_reqs (this, xdata);
@@ -760,7 +841,8 @@ uncached:
if (xattr_rsp)
dict_unref (xattr_rsp);
-
+ if (xattr_alloc)
+ dict_unref (xattr_alloc);
return 0;
}
@@ -1573,6 +1655,8 @@ mdc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
mdc_inode_xatt_update (this, local->loc.inode, local->xattr);
+ mdc_inode_iatt_invalidate (this, local->loc.inode);
+
out:
MDC_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
@@ -1614,6 +1698,7 @@ mdc_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
mdc_inode_xatt_update (this, local->fd->inode, local->xattr);
+ mdc_inode_iatt_invalidate (this, local->fd->inode);
out:
MDC_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata);
@@ -1666,6 +1751,7 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
dict_t *xdata)
{
int ret;
+ int op_errno = ENODATA;
mdc_local_t *local = NULL;
dict_t *xattr = NULL;
@@ -1682,10 +1768,12 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
if (ret != 0)
goto uncached;
- if (!dict_get (xattr, (char *)key))
- goto uncached;
+ if (!xattr || !dict_get (xattr, (char *)key)) {
+ ret = -1;
+ op_errno = ENODATA;
+ }
- MDC_STACK_UNWIND (getxattr, frame, 0, 0, xattr, xdata);
+ MDC_STACK_UNWIND (getxattr, frame, ret, op_errno, xattr, xdata);
return 0;
@@ -1727,6 +1815,7 @@ mdc_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
int ret;
mdc_local_t *local = NULL;
dict_t *xattr = NULL;
+ int op_errno = ENODATA;
local = mdc_local_get (frame);
if (!local)
@@ -1741,10 +1830,12 @@ mdc_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
if (ret != 0)
goto uncached;
- if (!dict_get (xattr, (char *)key))
- goto uncached;
+ if (!xattr || !dict_get (xattr, (char *)key)) {
+ ret = -1;
+ op_errno = ENODATA;
+ }
- MDC_STACK_UNWIND (fgetxattr, frame, 0, 0, xattr, xdata);
+ MDC_STACK_UNWIND (fgetxattr, frame, ret, op_errno, xattr, xdata);
return 0;
@@ -1755,6 +1846,97 @@ uncached:
return 0;
}
+int
+mdc_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ mdc_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret != 0)
+ goto out;
+
+ if (!local)
+ goto out;
+
+ if (local->key)
+ mdc_inode_xatt_unset (this, local->loc.inode, local->key);
+ else
+ mdc_inode_xatt_invalidate (this, local->loc.inode);
+
+ mdc_inode_iatt_invalidate (this, local->loc.inode);
+out:
+ MDC_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+
+int
+mdc_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ mdc_local_t *local = NULL;
+
+ local = mdc_local_get (frame);
+
+ loc_copy (&local->loc, loc);
+
+ local->key = gf_strdup (name);
+
+ STACK_WIND (frame, mdc_removexattr_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr,
+ loc, name, xdata);
+ return 0;
+}
+
+
+int
+mdc_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ mdc_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret != 0)
+ goto out;
+
+ if (!local)
+ goto out;
+
+ if (local->key)
+ mdc_inode_xatt_unset (this, local->fd->inode, local->key);
+ else
+ mdc_inode_xatt_invalidate (this, local->fd->inode);
+
+ mdc_inode_iatt_invalidate (this, local->fd->inode);
+out:
+ MDC_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+
+int
+mdc_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ mdc_local_t *local = NULL;
+
+ local = mdc_local_get (frame);
+
+ local->fd = fd_ref (fd);
+
+ local->key = gf_strdup (name);
+
+ STACK_WIND (frame, mdc_fremovexattr_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fremovexattr,
+ fd, name, xdata);
+ return 0;
+}
+
int
mdc_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -1782,9 +1964,18 @@ int
mdc_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd,
size_t size, off_t offset, dict_t *xdata)
{
+ dict_t *xattr_alloc = NULL;
+
+ if (!xdata)
+ xdata = xattr_alloc = dict_new ();
+ if (xdata)
+ mdc_load_reqs (this, xdata);
+
STACK_WIND (frame, mdc_readdirp_cbk,
FIRST_CHILD (this), FIRST_CHILD (this)->fops->readdirp,
fd, size, offset, xdata);
+ if (xattr_alloc)
+ dict_unref (xattr_alloc);
return 0;
}
@@ -1828,6 +2019,123 @@ mdc_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
return 0;
}
+int
+mdc_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+ mdc_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret != 0)
+ goto out;
+
+ if (!local)
+ goto out;
+
+ mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf);
+
+out:
+ MDC_STACK_UNWIND (fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+
+ return 0;
+}
+
+int mdc_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ mdc_local_t *local;
+
+ local = mdc_local_get(frame);
+ local->fd = fd_ref(fd);
+
+ STACK_WIND(frame, mdc_fallocate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+ xdata);
+
+ return 0;
+}
+
+int
+mdc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+ mdc_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret != 0)
+ goto out;
+
+ if (!local)
+ goto out;
+
+ mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf);
+
+out:
+ MDC_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+
+ return 0;
+}
+
+int mdc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ mdc_local_t *local;
+
+ local = mdc_local_get(frame);
+ local->fd = fd_ref(fd);
+
+ STACK_WIND(frame, mdc_discard_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->discard, fd, offset, len,
+ xdata);
+
+ return 0;
+}
+
+int
+mdc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+ mdc_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret != 0)
+ goto out;
+
+ if (!local)
+ goto out;
+
+ mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf);
+
+out:
+ MDC_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+
+ return 0;
+}
+
+int mdc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ mdc_local_t *local;
+
+ local = mdc_local_get(frame);
+ local->fd = fd_ref(fd);
+
+ STACK_WIND(frame, mdc_zerofill_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->zerofill, fd, offset, len,
+ xdata);
+
+ return 0;
+}
+
int
mdc_forget (xlator_t *this, inode_t *inode)
@@ -1955,8 +2263,13 @@ struct xlator_fops fops = {
.fsetxattr = mdc_fsetxattr,
.getxattr = mdc_getxattr,
.fgetxattr = mdc_fgetxattr,
+ .removexattr = mdc_removexattr,
+ .fremovexattr= mdc_fremovexattr,
.readdirp = mdc_readdirp,
- .readdir = mdc_readdir
+ .readdir = mdc_readdir,
+ .fallocate = mdc_fallocate,
+ .discard = mdc_discard,
+ .zerofill = mdc_zerofill,
};
@@ -1986,4 +2299,5 @@ struct volume_options options[] = {
.description = "Convert all readdir requests to readdirplus to "
"collect stat info on each entry.",
},
+ { .key = {NULL} },
};
diff --git a/xlators/performance/open-behind/Makefile.am b/xlators/performance/open-behind/Makefile.am
new file mode 100644
index 000000000..af437a64d
--- /dev/null
+++ b/xlators/performance/open-behind/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src
diff --git a/xlators/performance/open-behind/src/Makefile.am b/xlators/performance/open-behind/src/Makefile.am
new file mode 100644
index 000000000..125285707
--- /dev/null
+++ b/xlators/performance/open-behind/src/Makefile.am
@@ -0,0 +1,15 @@
+xlator_LTLIBRARIES = open-behind.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
+
+open_behind_la_LDFLAGS = -module -avoid-version
+
+open_behind_la_SOURCES = open-behind.c
+open_behind_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = open-behind-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/performance/open-behind/src/open-behind-mem-types.h b/xlators/performance/open-behind/src/open-behind-mem-types.h
new file mode 100644
index 000000000..1e94296f4
--- /dev/null
+++ b/xlators/performance/open-behind/src/open-behind-mem-types.h
@@ -0,0 +1,21 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __OB_MEM_TYPES_H__
+#define __OB_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+enum gf_ob_mem_types_ {
+ gf_ob_mt_fd_t = gf_common_mt_end + 1,
+ gf_ob_mt_conf_t,
+ gf_ob_mt_end
+};
+#endif
diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c
new file mode 100644
index 000000000..7e5b57278
--- /dev/null
+++ b/xlators/performance/open-behind/src/open-behind.c
@@ -0,0 +1,1001 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "open-behind-mem-types.h"
+#include "xlator.h"
+#include "statedump.h"
+#include "call-stub.h"
+#include "defaults.h"
+
+typedef struct ob_conf {
+ gf_boolean_t use_anonymous_fd; /* use anonymous FDs wherever safe
+ e.g - fstat() readv()
+
+ whereas for fops like writev(), lk(),
+ the fd is important for side effects
+ like mandatory locks
+ */
+ gf_boolean_t lazy_open; /* delay backend open as much as possible */
+} ob_conf_t;
+
+
+typedef struct ob_fd {
+ call_frame_t *open_frame;
+ loc_t loc;
+ dict_t *xdata;
+ int flags;
+ int op_errno;
+ struct list_head list;
+} ob_fd_t;
+
+
+ob_fd_t *
+__ob_fd_ctx_get (xlator_t *this, fd_t *fd)
+{
+ uint64_t value = 0;
+ int ret = -1;
+ ob_fd_t *ob_fd = NULL;
+
+ ret = __fd_ctx_get (fd, this, &value);
+ if (ret)
+ return NULL;
+
+ ob_fd = (void *) ((long) value);
+
+ return ob_fd;
+}
+
+
+ob_fd_t *
+ob_fd_ctx_get (xlator_t *this, fd_t *fd)
+{
+ ob_fd_t *ob_fd = NULL;
+
+ LOCK (&fd->lock);
+ {
+ ob_fd = __ob_fd_ctx_get (this, fd);
+ }
+ UNLOCK (&fd->lock);
+
+ return ob_fd;
+}
+
+
+int
+__ob_fd_ctx_set (xlator_t *this, fd_t *fd, ob_fd_t *ob_fd)
+{
+ uint64_t value = 0;
+ int ret = -1;
+
+ value = (long) ((void *) ob_fd);
+
+ ret = __fd_ctx_set (fd, this, value);
+
+ return ret;
+}
+
+
+int
+ob_fd_ctx_set (xlator_t *this, fd_t *fd, ob_fd_t *ob_fd)
+{
+ int ret = -1;
+
+ LOCK (&fd->lock);
+ {
+ ret = __ob_fd_ctx_set (this, fd, ob_fd);
+ }
+ UNLOCK (&fd->lock);
+
+ return ret;
+}
+
+
+ob_fd_t *
+ob_fd_new (void)
+{
+ ob_fd_t *ob_fd = NULL;
+
+ ob_fd = GF_CALLOC (1, sizeof (*ob_fd), gf_ob_mt_fd_t);
+
+ INIT_LIST_HEAD (&ob_fd->list);
+
+ return ob_fd;
+}
+
+
+void
+ob_fd_free (ob_fd_t *ob_fd)
+{
+ loc_wipe (&ob_fd->loc);
+
+ if (ob_fd->xdata)
+ dict_unref (ob_fd->xdata);
+
+ if (ob_fd->open_frame)
+ STACK_DESTROY (ob_fd->open_frame->root);
+
+ GF_FREE (ob_fd);
+}
+
+
+int
+ob_wake_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, fd_t *fd_ret, dict_t *xdata)
+{
+ fd_t *fd = NULL;
+ struct list_head list;
+ ob_fd_t *ob_fd = NULL;
+ call_stub_t *stub = NULL, *tmp = NULL;
+
+ fd = frame->local;
+ frame->local = NULL;
+
+ INIT_LIST_HEAD (&list);
+
+ LOCK (&fd->lock);
+ {
+ ob_fd = __ob_fd_ctx_get (this, fd);
+
+ list_splice_init (&ob_fd->list, &list);
+
+ if (op_ret < 0) {
+ /* mark fd BAD for ever */
+ ob_fd->op_errno = op_errno;
+ } else {
+ __fd_ctx_del (fd, this, NULL);
+ ob_fd_free (ob_fd);
+ }
+ }
+ UNLOCK (&fd->lock);
+
+ list_for_each_entry_safe (stub, tmp, &list, list) {
+ list_del_init (&stub->list);
+
+ if (op_ret < 0)
+ call_unwind_error (stub, -1, op_errno);
+ else
+ call_resume (stub);
+ }
+
+ fd_unref (fd);
+
+ STACK_DESTROY (frame->root);
+
+ return 0;
+}
+
+
+int
+ob_fd_wake (xlator_t *this, fd_t *fd)
+{
+ call_frame_t *frame = NULL;
+ ob_fd_t *ob_fd = NULL;
+
+ LOCK (&fd->lock);
+ {
+ ob_fd = __ob_fd_ctx_get (this, fd);
+ if (!ob_fd)
+ goto unlock;
+
+ frame = ob_fd->open_frame;
+ ob_fd->open_frame = NULL;
+ }
+unlock:
+ UNLOCK (&fd->lock);
+
+ if (frame) {
+ frame->local = fd_ref (fd);
+
+ STACK_WIND (frame, ob_wake_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->open,
+ &ob_fd->loc, ob_fd->flags, fd, ob_fd->xdata);
+ }
+
+ return 0;
+}
+
+
+int
+open_and_resume (xlator_t *this, fd_t *fd, call_stub_t *stub)
+{
+ ob_fd_t *ob_fd = NULL;
+ int op_errno = 0;
+
+ if (!fd)
+ goto nofd;
+
+ LOCK (&fd->lock);
+ {
+ ob_fd = __ob_fd_ctx_get (this, fd);
+ if (!ob_fd)
+ goto unlock;
+
+ if (ob_fd->op_errno) {
+ op_errno = ob_fd->op_errno;
+ goto unlock;
+ }
+
+ list_add_tail (&stub->list, &ob_fd->list);
+ }
+unlock:
+ UNLOCK (&fd->lock);
+
+nofd:
+ if (op_errno)
+ call_unwind_error (stub, -1, op_errno);
+ else if (ob_fd)
+ ob_fd_wake (this, fd);
+ else
+ call_resume (stub);
+
+ return 0;
+}
+
+
+int
+ob_open_behind (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ fd_t *fd, dict_t *xdata)
+{
+ ob_fd_t *ob_fd = NULL;
+ int ret = -1;
+ ob_conf_t *conf = NULL;
+
+
+ conf = this->private;
+
+ if (flags & O_TRUNC) {
+ STACK_WIND (frame, default_open_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->open,
+ loc, flags, fd, xdata);
+ return 0;
+ }
+
+ ob_fd = ob_fd_new ();
+ if (!ob_fd)
+ goto enomem;
+
+ ob_fd->open_frame = copy_frame (frame);
+ if (!ob_fd->open_frame)
+ goto enomem;
+ ret = loc_copy (&ob_fd->loc, loc);
+ if (ret)
+ goto enomem;
+
+ ob_fd->flags = flags;
+ if (xdata)
+ ob_fd->xdata = dict_ref (xdata);
+
+ ret = ob_fd_ctx_set (this, fd, ob_fd);
+ if (ret)
+ goto enomem;
+
+ fd_ref (fd);
+
+ STACK_UNWIND_STRICT (open, frame, 0, 0, fd, xdata);
+
+ if (!conf->lazy_open)
+ ob_fd_wake (this, fd);
+
+ fd_unref (fd);
+
+ return 0;
+enomem:
+ if (ob_fd) {
+ if (ob_fd->open_frame)
+ STACK_DESTROY (ob_fd->open_frame->root);
+ loc_wipe (&ob_fd->loc);
+ if (ob_fd->xdata)
+ dict_unref (ob_fd->xdata);
+ GF_FREE (ob_fd);
+ }
+
+ return -1;
+}
+
+
+int
+ob_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ fd_t *fd, dict_t *xdata)
+{
+ fd_t *old_fd = NULL;
+ int ret = -1;
+ int op_errno = 0;
+ call_stub_t *stub = NULL;
+
+ old_fd = fd_lookup (fd->inode, 0);
+ if (old_fd) {
+ /* open-behind only when this is the first FD */
+ stub = fop_open_stub (frame, default_open_resume,
+ loc, flags, fd, xdata);
+ if (!stub) {
+ op_errno = ENOMEM;
+ fd_unref (old_fd);
+ goto err;
+ }
+
+ open_and_resume (this, old_fd, stub);
+
+ fd_unref (old_fd);
+
+ return 0;
+ }
+
+ ret = ob_open_behind (frame, this, loc, flags, fd, xdata);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ return 0;
+err:
+ gf_log (this->name, GF_LOG_ERROR, "%s: %s", loc->path,
+ strerror (op_errno));
+
+ STACK_UNWIND_STRICT (open, frame, -1, op_errno, 0, 0);
+
+ return 0;
+}
+
+
+fd_t *
+ob_get_wind_fd (xlator_t *this, fd_t *fd)
+{
+ ob_conf_t *conf = NULL;
+ ob_fd_t *ob_fd = NULL;
+
+ conf = this->private;
+
+ ob_fd = ob_fd_ctx_get (this, fd);
+
+ if (ob_fd && conf->use_anonymous_fd)
+ return fd_anonymous (fd->inode);
+
+ return fd_ref (fd);
+}
+
+
+int
+ob_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+ fd_t *wind_fd = NULL;
+
+ wind_fd = ob_get_wind_fd (this, fd);
+
+ stub = fop_readv_stub (frame, default_readv_resume, wind_fd,
+ size, offset, flags, xdata);
+ fd_unref (wind_fd);
+
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, wind_fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (readv, frame, -1, ENOMEM, 0, 0, 0, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov,
+ int count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_writev_stub (frame, default_writev_resume, fd, iov, count,
+ offset, flags, iobref, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (writev, frame, -1, ENOMEM, 0, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+ fd_t *wind_fd = NULL;
+
+ wind_fd = ob_get_wind_fd (this, fd);
+
+ stub = fop_fstat_stub (frame, default_fstat_resume, wind_fd, xdata);
+
+ fd_unref (wind_fd);
+
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, wind_fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fstat, frame, -1, ENOMEM, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+ ob_fd_t *ob_fd = NULL;
+ gf_boolean_t unwind = _gf_false;
+
+ LOCK (&fd->lock);
+ {
+ ob_fd = __ob_fd_ctx_get (this, fd);
+ if (ob_fd && ob_fd->open_frame)
+ /* if open() was never wound to backend,
+ no need to wind flush() either.
+ */
+ unwind = _gf_true;
+ }
+ UNLOCK (&fd->lock);
+
+ if (unwind)
+ goto unwind;
+
+ stub = fop_flush_stub (frame, default_flush_resume, fd, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM, 0);
+
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (flush, frame, 0, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int flag,
+ dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_fsync_stub (frame, default_fsync_resume, fd, flag, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd,
+ struct gf_flock *flock, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_lk_stub (frame, default_lk_resume, fd, cmd, flock, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (lk, frame, -1, ENOMEM, 0, 0);
+
+ return 0;
+}
+
+int
+ob_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_ftruncate_stub (frame, default_ftruncate_resume, fd, offset,
+ xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOMEM, 0, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr,
+ int flags, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_fsetxattr_stub (frame, default_fsetxattr_resume, fd, xattr,
+ flags, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fsetxattr, frame, -1, ENOMEM, 0);
+
+ return 0;
+}
+
+
+int
+ob_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+ dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_fgetxattr_stub (frame, default_fgetxattr_resume, fd, name,
+ xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOMEM, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_fremovexattr_stub (frame, default_fremovexattr_resume, fd,
+ name, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fremovexattr, frame, -1, ENOMEM, 0);
+
+ return 0;
+}
+
+
+int
+ob_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ int cmd, struct gf_flock *flock, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_finodelk_stub (frame, default_finodelk_resume, volume, fd,
+ cmd, flock, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (finodelk, frame, -1, ENOMEM, 0);
+
+ return 0;
+}
+
+
+int
+ob_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_fentrylk_stub (frame, default_fentrylk_resume, volume, fd,
+ basename, cmd, type, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOMEM, 0);
+
+ return 0;
+}
+
+
+int
+ob_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_fxattrop_stub (frame, default_fxattrop_resume, fd, optype,
+ xattr, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fxattrop, frame, -1, ENOMEM, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iatt *iatt, int valid, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_fsetattr_stub (frame, default_fsetattr_resume, fd,
+ iatt, valid, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOMEM, 0, 0, 0);
+
+ return 0;
+}
+
+int
+ob_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ call_stub_t *stub;
+
+ stub = fop_fallocate_stub(frame, default_fallocate_resume, fd, mode,
+ offset, len, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume(this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+ob_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ call_stub_t *stub;
+
+ stub = fop_discard_stub(frame, default_discard_resume, fd, offset, len,
+ xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume(this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT(discard, frame, -1, ENOMEM, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+ob_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ call_stub_t *stub;
+
+ stub = fop_zerofill_stub(frame, default_zerofill_resume, fd,
+ offset, len, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume(this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL);
+ return 0;
+}
+
+
+int
+ob_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+ dict_t *xdata)
+{
+ fd_t *fd = NULL;
+ call_stub_t *stub = NULL;
+
+ stub = fop_unlink_stub (frame, default_unlink_resume, loc,
+ xflags, xdata);
+ if (!stub)
+ goto err;
+
+ fd = fd_lookup (loc->inode, 0);
+
+ open_and_resume (this, fd, stub);
+ if (fd)
+ fd_unref (fd);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (unlink, frame, -1, ENOMEM, 0, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_rename (call_frame_t *frame, xlator_t *this, loc_t *src, loc_t *dst,
+ dict_t *xdata)
+{
+ fd_t *fd = NULL;
+ call_stub_t *stub = NULL;
+
+ stub = fop_rename_stub (frame, default_rename_resume, src, dst, xdata);
+ if (!stub)
+ goto err;
+
+ if (dst->inode)
+ fd = fd_lookup (dst->inode, 0);
+
+ open_and_resume (this, fd, stub);
+ if (fd)
+ fd_unref (fd);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (rename, frame, -1, ENOMEM, 0, 0, 0, 0, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_release (xlator_t *this, fd_t *fd)
+{
+ ob_fd_t *ob_fd = NULL;
+
+ ob_fd = ob_fd_ctx_get (this, fd);
+
+ ob_fd_free (ob_fd);
+
+ return 0;
+}
+
+
+int
+ob_priv_dump (xlator_t *this)
+{
+ ob_conf_t *conf = NULL;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN];
+
+ conf = this->private;
+
+ if (!conf)
+ return -1;
+
+ gf_proc_dump_build_key (key_prefix, "xlator.performance.open-behind",
+ "priv");
+
+ gf_proc_dump_add_section (key_prefix);
+
+ gf_proc_dump_write ("use_anonymous_fd", "%d", conf->use_anonymous_fd);
+
+ gf_proc_dump_write ("lazy_open", "%d", conf->lazy_open);
+
+ return 0;
+}
+
+
+int
+ob_fdctx_dump (xlator_t *this, fd_t *fd)
+{
+ ob_fd_t *ob_fd = NULL;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, };
+ int ret = 0;
+
+ ret = TRY_LOCK (&fd->lock);
+ if (ret)
+ return 0;
+
+ ob_fd = __ob_fd_ctx_get (this, fd);
+ if (!ob_fd) {
+ UNLOCK (&fd->lock);
+ return 0;
+ }
+
+ gf_proc_dump_build_key (key_prefix, "xlator.performance.open-behind",
+ "file");
+ gf_proc_dump_add_section (key_prefix);
+
+ gf_proc_dump_write ("fd", "%p", fd);
+
+ gf_proc_dump_write ("open_frame", "%p", ob_fd->open_frame);
+
+ gf_proc_dump_write ("open_frame.root.unique", "%p",
+ ob_fd->open_frame->root->unique);
+
+ gf_proc_dump_write ("loc.path", "%s", ob_fd->loc.path);
+
+ gf_proc_dump_write ("loc.ino", "%s", uuid_utoa (ob_fd->loc.gfid));
+
+ gf_proc_dump_write ("flags", "%p", ob_fd->open_frame);
+
+ UNLOCK (&fd->lock);
+
+ return 0;
+}
+
+
+int
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ ret = xlator_mem_acct_init (this, gf_ob_mt_end + 1);
+
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Memory accounting failed");
+
+ return ret;
+}
+
+
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+ ob_conf_t *conf = NULL;
+ int ret = -1;
+
+ conf = this->private;
+
+ GF_OPTION_RECONF ("use-anonymous-fd", conf->use_anonymous_fd, options,
+ bool, out);
+
+ GF_OPTION_RECONF ("lazy-open", conf->lazy_open, options, bool, out);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+
+int
+init (xlator_t *this)
+{
+ ob_conf_t *conf = NULL;
+
+ if (!this->children || this->children->next) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "FATAL: volume (%s) not configured with exactly one "
+ "child", this->name);
+ return -1;
+ }
+
+ if (!this->parents)
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
+
+ conf = GF_CALLOC (1, sizeof (*conf), gf_ob_mt_conf_t);
+ if (!conf)
+ goto err;
+
+ GF_OPTION_INIT ("use-anonymous-fd", conf->use_anonymous_fd, bool, err);
+
+ GF_OPTION_INIT ("lazy-open", conf->lazy_open, bool, err);
+
+ this->private = conf;
+
+ return 0;
+err:
+ if (conf)
+ GF_FREE (conf);
+
+ return -1;
+}
+
+
+void
+fini (xlator_t *this)
+{
+ ob_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ GF_FREE (conf);
+
+ return;
+}
+
+
+struct xlator_fops fops = {
+ .open = ob_open,
+ .readv = ob_readv,
+ .writev = ob_writev,
+ .flush = ob_flush,
+ .fsync = ob_fsync,
+ .fstat = ob_fstat,
+ .ftruncate = ob_ftruncate,
+ .fsetxattr = ob_fsetxattr,
+ .fgetxattr = ob_fgetxattr,
+ .fremovexattr = ob_fremovexattr,
+ .finodelk = ob_finodelk,
+ .fentrylk = ob_fentrylk,
+ .fxattrop = ob_fxattrop,
+ .fsetattr = ob_fsetattr,
+ .fallocate = ob_fallocate,
+ .discard = ob_discard,
+ .zerofill = ob_zerofill,
+ .unlink = ob_unlink,
+ .rename = ob_rename,
+ .lk = ob_lk,
+};
+
+struct xlator_cbks cbks = {
+ .release = ob_release,
+};
+
+struct xlator_dumpops dumpops = {
+ .priv = ob_priv_dump,
+ .fdctx = ob_fdctx_dump,
+};
+
+
+struct volume_options options[] = {
+ { .key = {"use-anonymous-fd"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "yes",
+ .description = "For read operations, use anonymous FD when "
+ "original FD is open-behind and not yet opened in the backend.",
+ },
+ { .key = {"lazy-open"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "yes",
+ .description = "Perform open in the backend only when a necessary "
+ "FOP arrives (e.g writev on the FD, unlink of the file). When option "
+ "is disabled, perform backend open right after unwinding open().",
+ },
+ { .key = {NULL} }
+
+};
diff --git a/xlators/performance/quick-read/src/Makefile.am b/xlators/performance/quick-read/src/Makefile.am
index 790f1e943..4906f408a 100644
--- a/xlators/performance/quick-read/src/Makefile.am
+++ b/xlators/performance/quick-read/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = quick-read.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-quick_read_la_LDFLAGS = -module -avoidversion
+quick_read_la_LDFLAGS = -module -avoid-version
quick_read_la_SOURCES = quick-read.c
quick_read_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/performance/quick-read/src/quick-read.c b/xlators/performance/quick-read/src/quick-read.c
index 1a0f8675e..445ea8658 100644
--- a/xlators/performance/quick-read/src/quick-read.c
+++ b/xlators/performance/quick-read/src/quick-read.c
@@ -622,6 +622,46 @@ wind:
int
+qr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov,
+ int count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
+{
+ qr_inode_prune (this, fd->inode);
+
+ STACK_WIND (frame, default_writev_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev,
+ fd, iov, count, offset, flags, iobref, xdata);
+ return 0;
+}
+
+
+int
+qr_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
+{
+ qr_inode_prune (this, loc->inode);
+
+ STACK_WIND (frame, default_truncate_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->truncate,
+ loc, offset, xdata);
+ return 0;
+}
+
+
+int
+qr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
+{
+ qr_inode_prune (this, fd->inode);
+
+ STACK_WIND (frame, default_ftruncate_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->ftruncate,
+ fd, offset, xdata);
+ return 0;
+}
+
+
+int
qr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
fd_t *fd, dict_t *xdata)
{
@@ -1066,6 +1106,9 @@ struct xlator_fops fops = {
.readdirp = qr_readdirp,
.open = qr_open,
.readv = qr_readv,
+ .writev = qr_writev,
+ .truncate = qr_truncate,
+ .ftruncate = qr_ftruncate
};
struct xlator_cbks cbks = {
@@ -1100,4 +1143,5 @@ struct volume_options options[] = {
.max = 1 * GF_UNIT_KB * 1000,
.default_value = "64KB",
},
+ { .key = {NULL} }
};
diff --git a/xlators/performance/read-ahead/src/Makefile.am b/xlators/performance/read-ahead/src/Makefile.am
index ae2b1ace9..be80ae7ac 100644
--- a/xlators/performance/read-ahead/src/Makefile.am
+++ b/xlators/performance/read-ahead/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = read-ahead.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-read_ahead_la_LDFLAGS = -module -avoidversion
+read_ahead_la_LDFLAGS = -module -avoid-version
read_ahead_la_SOURCES = read-ahead.c page.c
read_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c
index 549496755..069ab1f1a 100644
--- a/xlators/performance/read-ahead/src/read-ahead.c
+++ b/xlators/performance/read-ahead/src/read-ahead.c
@@ -498,7 +498,7 @@ ra_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
"expected offset (%"PRId64") when page_count=%d",
offset, file->page_count);
- if (file->expected < (conf->page_size * conf->page_count)) {
+ if (file->expected < (file->page_size * conf->page_count)) {
file->expected += size;
file->page_count = min ((file->expected
/ file->page_size),
@@ -942,6 +942,106 @@ unwind:
return 0;
}
+int
+ra_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ GF_ASSERT (frame);
+
+ STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ return 0;
+}
+
+static int
+ra_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ ra_file_t *file = NULL;
+ fd_t *iter_fd = NULL;
+ inode_t *inode = NULL;
+ uint64_t tmp_file = 0;
+ int32_t op_errno = EINVAL;
+
+ GF_ASSERT (frame);
+ GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
+ GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
+
+ inode = fd->inode;
+
+ LOCK (&inode->lock);
+ {
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ fd_ctx_get (iter_fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+ if (!file)
+ continue;
+
+ flush_region(frame, file, offset, len, 1);
+ }
+ }
+ UNLOCK (&inode->lock);
+
+ STACK_WIND (frame, ra_discard_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->discard, fd, offset, len, xdata);
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (discard, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+ra_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ GF_ASSERT (frame);
+
+ STACK_UNWIND_STRICT (zerofill, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ return 0;
+}
+
+static int
+ra_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ ra_file_t *file = NULL;
+ fd_t *iter_fd = NULL;
+ inode_t *inode = NULL;
+ uint64_t tmp_file = 0;
+ int32_t op_errno = EINVAL;
+
+ GF_ASSERT (frame);
+ GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
+ GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
+
+ inode = fd->inode;
+
+ LOCK (&inode->lock);
+ {
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ fd_ctx_get (iter_fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+ if (!file)
+ continue;
+
+ flush_region(frame, file, offset, len, 1);
+ }
+ }
+ UNLOCK (&inode->lock);
+
+ STACK_WIND (frame, ra_zerofill_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->zerofill, fd,
+ offset, len, xdata);
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
int
ra_priv_dump (xlator_t *this)
@@ -1024,6 +1124,8 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("page-count", conf->page_count, options, uint32, out);
+ GF_OPTION_RECONF ("page-size", conf->page_size, options, size, out);
+
ret = 0;
out:
return ret;
@@ -1056,6 +1158,8 @@ init (xlator_t *this)
conf->page_size = this->ctx->page_size;
+ GF_OPTION_INIT ("page-size", conf->page_size, size, out);
+
GF_OPTION_INIT ("page-count", conf->page_count, uint32, out);
GF_OPTION_INIT ("force-atime-update", conf->force_atime_update, bool, out);
@@ -1119,6 +1223,8 @@ struct xlator_fops fops = {
.truncate = ra_truncate,
.ftruncate = ra_ftruncate,
.fstat = ra_fstat,
+ .discard = ra_discard,
+ .zerofill = ra_zerofill,
};
struct xlator_cbks cbks = {
@@ -1142,5 +1248,12 @@ struct volume_options options[] = {
.default_value = "4",
.description = "Number of pages that will be pre-fetched"
},
+ { .key = {"page-size"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .min = 4096,
+ .max = 1048576 * 64,
+ .default_value = "131072",
+ .description = "Page size with which read-ahead performs server I/O"
+ },
{ .key = {NULL} },
};
diff --git a/xlators/performance/readdir-ahead/Makefile.am b/xlators/performance/readdir-ahead/Makefile.am
new file mode 100644
index 000000000..a985f42a8
--- /dev/null
+++ b/xlators/performance/readdir-ahead/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/performance/readdir-ahead/src/Makefile.am b/xlators/performance/readdir-ahead/src/Makefile.am
new file mode 100644
index 000000000..cdabd1428
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/Makefile.am
@@ -0,0 +1,15 @@
+xlator_LTLIBRARIES = readdir-ahead.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
+
+readdir_ahead_la_LDFLAGS = -module -avoidversion
+
+readdir_ahead_la_SOURCES = readdir-ahead.c
+readdir_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = readdir-ahead.h readdir-ahead-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h
new file mode 100644
index 000000000..39e2c5369
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h
@@ -0,0 +1,24 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef __RDA_MEM_TYPES_H__
+#define __RDA_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+enum gf_rda_mem_types_ {
+ gf_rda_mt_rda_local = gf_common_mt_end + 1,
+ gf_rda_mt_rda_fd_ctx,
+ gf_rda_mt_rda_priv,
+ gf_rda_mt_end
+};
+
+#endif
diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.c b/xlators/performance/readdir-ahead/src/readdir-ahead.c
new file mode 100644
index 000000000..53e6756f0
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/readdir-ahead.c
@@ -0,0 +1,560 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+/*
+ * performance/readdir-ahead preloads a local buffer with directory entries
+ * on opendir. The optimization involves using maximum sized gluster rpc
+ * requests (128k) to minimize overhead of smaller client requests.
+ *
+ * For example, fuse currently supports a maximum readdir buffer of 4k
+ * (regardless of the filesystem client's buffer size). readdir-ahead should
+ * effectively convert these smaller requests into fewer, larger sized requests
+ * for simple, sequential workloads (i.e., ls).
+ *
+ * The translator is currently designed to handle the simple, sequential case
+ * only. If a non-sequential directory read occurs, readdir-ahead disables
+ * preloads on the directory.
+ */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "call-stub.h"
+#include "readdir-ahead.h"
+#include "readdir-ahead-mem-types.h"
+#include "defaults.h"
+
+static int rda_fill_fd(call_frame_t *, xlator_t *, fd_t *);
+
+/*
+ * Get (or create) the fd context for storing prepopulated directory
+ * entries.
+ */
+static struct
+rda_fd_ctx *get_rda_fd_ctx(fd_t *fd, xlator_t *this)
+{
+ uint64_t val;
+ struct rda_fd_ctx *ctx;
+
+ LOCK(&fd->lock);
+
+ if (__fd_ctx_get(fd, this, &val) < 0) {
+ ctx = GF_CALLOC(1, sizeof(struct rda_fd_ctx),
+ gf_rda_mt_rda_fd_ctx);
+ if (!ctx)
+ goto out;
+
+ LOCK_INIT(&ctx->lock);
+ INIT_LIST_HEAD(&ctx->entries.list);
+ ctx->state = RDA_FD_NEW;
+ /* ctx offset values initialized to 0 */
+
+ if (__fd_ctx_set(fd, this, (uint64_t) ctx) < 0) {
+ GF_FREE(ctx);
+ ctx = NULL;
+ goto out;
+ }
+ } else {
+ ctx = (struct rda_fd_ctx *) val;
+ }
+out:
+ UNLOCK(&fd->lock);
+ return ctx;
+}
+
+/*
+ * Reset the tracking state of the context.
+ */
+static void
+rda_reset_ctx(struct rda_fd_ctx *ctx)
+{
+ ctx->state = RDA_FD_NEW;
+ ctx->cur_offset = 0;
+ ctx->cur_size = 0;
+ ctx->next_offset = 0;
+ gf_dirent_free(&ctx->entries);
+}
+
+/*
+ * Check whether we can handle a request. Offset verification is done by the
+ * caller, so we only check whether the preload buffer has completion status
+ * (including an error) or has some data to return.
+ */
+static gf_boolean_t
+rda_can_serve_readdirp(struct rda_fd_ctx *ctx, size_t request_size)
+{
+ if ((ctx->state & RDA_FD_EOD) ||
+ (ctx->state & RDA_FD_ERROR) ||
+ (!(ctx->state & RDA_FD_PLUGGED) && (ctx->cur_size > 0)))
+ return _gf_true;
+
+ return _gf_false;
+}
+
+/*
+ * Serve a request from the fd dentry list based on the size of the request
+ * buffer. ctx must be locked.
+ */
+static int32_t
+__rda_serve_readdirp(xlator_t *this, gf_dirent_t *entries, size_t request_size,
+ struct rda_fd_ctx *ctx)
+{
+ gf_dirent_t *dirent, *tmp;
+ size_t dirent_size, size = 0;
+ int32_t count = 0;
+ struct rda_priv *priv = this->private;
+
+ list_for_each_entry_safe(dirent, tmp, &ctx->entries.list, list) {
+ dirent_size = gf_dirent_size(dirent->d_name);
+ if (size + dirent_size > request_size)
+ break;
+
+ size += dirent_size;
+ list_del_init(&dirent->list);
+ ctx->cur_size -= dirent_size;
+
+ list_add_tail(&dirent->list, &entries->list);
+ ctx->cur_offset = dirent->d_off;
+ count++;
+ }
+
+ if (ctx->cur_size <= priv->rda_low_wmark)
+ ctx->state |= RDA_FD_PLUGGED;
+
+ return count;
+}
+
+static int32_t
+rda_readdirp_stub(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata)
+{
+ gf_dirent_t entries;
+ int32_t ret;
+ struct rda_fd_ctx *ctx;
+ int op_errno = 0;
+
+ ctx = get_rda_fd_ctx(fd, this);
+ INIT_LIST_HEAD(&entries.list);
+ ret = __rda_serve_readdirp(this, &entries, size, ctx);
+
+ if (!ret && (ctx->state & RDA_FD_ERROR)) {
+ ret = -1;
+ op_errno = ctx->op_errno;
+ ctx->state &= ~RDA_FD_ERROR;
+
+ /*
+ * the preload has stopped running in the event of an error, so
+ * pass all future requests along
+ */
+ ctx->state |= RDA_FD_BYPASS;
+ }
+
+ STACK_UNWIND_STRICT(readdirp, frame, ret, op_errno, &entries, xdata);
+ gf_dirent_free(&entries);
+
+ return 0;
+}
+
+static int32_t
+rda_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *xdata)
+{
+ struct rda_fd_ctx *ctx;
+ call_stub_t *stub;
+ int fill = 0;
+
+ ctx = get_rda_fd_ctx(fd, this);
+ if (!ctx)
+ goto err;
+
+ if (ctx->state & RDA_FD_BYPASS)
+ goto bypass;
+
+ LOCK(&ctx->lock);
+
+ /* recheck now that we have the lock */
+ if (ctx->state & RDA_FD_BYPASS) {
+ UNLOCK(&ctx->lock);
+ goto bypass;
+ }
+
+ /*
+ * If a new read comes in at offset 0 and the buffer has been
+ * completed, reset the context and kickstart the filler again.
+ */
+ if (!off && (ctx->state & RDA_FD_EOD) && (ctx->cur_size == 0)) {
+ rda_reset_ctx(ctx);
+ fill = 1;
+ }
+
+ /*
+ * If a readdir occurs at an unexpected offset or we already have a
+ * request pending, admit defeat and just get out of the way.
+ */
+ if (off != ctx->cur_offset || ctx->stub) {
+ ctx->state |= RDA_FD_BYPASS;
+ UNLOCK(&ctx->lock);
+ goto bypass;
+ }
+
+ stub = fop_readdirp_stub(frame, rda_readdirp_stub, fd, size, off, xdata);
+ if (!stub) {
+ UNLOCK(&ctx->lock);
+ goto err;
+ }
+
+ /*
+ * If we haven't bypassed the preload, this means we can either serve
+ * the request out of the preload or the request that enables us to do
+ * so is in flight...
+ */
+ if (rda_can_serve_readdirp(ctx, size))
+ call_resume(stub);
+ else
+ ctx->stub = stub;
+
+ UNLOCK(&ctx->lock);
+
+ if (fill)
+ rda_fill_fd(frame, this, fd);
+
+ return 0;
+
+bypass:
+ STACK_WIND(frame, default_readdirp_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata);
+ return 0;
+
+err:
+ STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL);
+ return 0;
+}
+
+static int32_t
+rda_fill_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ gf_dirent_t *dirent, *tmp;
+ struct rda_local *local = frame->local;
+ struct rda_fd_ctx *ctx = local->ctx;
+ struct rda_priv *priv = this->private;
+ int fill = 1;
+
+ LOCK(&ctx->lock);
+
+ /* Verify that the preload buffer is still pending on this data. */
+ if (ctx->next_offset != local->offset) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Out of sequence directory preload.");
+ ctx->state |= (RDA_FD_BYPASS|RDA_FD_ERROR);
+ ctx->op_errno = EUCLEAN;
+
+ goto out;
+ }
+
+ if (entries) {
+ list_for_each_entry_safe(dirent, tmp, &entries->list, list) {
+ list_del_init(&dirent->list);
+ /* must preserve entry order */
+ list_add_tail(&dirent->list, &ctx->entries.list);
+
+ ctx->cur_size += gf_dirent_size(dirent->d_name);
+ ctx->next_offset = dirent->d_off;
+ }
+ }
+
+ if (ctx->cur_size >= priv->rda_high_wmark)
+ ctx->state &= ~RDA_FD_PLUGGED;
+
+ if (!op_ret) {
+ /* we've hit eod */
+ ctx->state &= ~RDA_FD_RUNNING;
+ ctx->state |= RDA_FD_EOD;
+ } else if (op_ret == -1) {
+ /* kill the preload and pend the error */
+ ctx->state &= ~RDA_FD_RUNNING;
+ ctx->state |= RDA_FD_ERROR;
+ ctx->op_errno = op_errno;
+ }
+
+ /*
+ * NOTE: The strict bypass logic in readdirp() means a pending request
+ * is always based on ctx->cur_offset.
+ */
+ if (ctx->stub &&
+ rda_can_serve_readdirp(ctx, ctx->stub->args.size)) {
+ call_resume(ctx->stub);
+ ctx->stub = NULL;
+ }
+
+out:
+ /*
+ * If we have been marked for bypass and have no pending stub, clear the
+ * run state so we stop preloading the context with entries.
+ */
+ if ((ctx->state & RDA_FD_BYPASS) && !ctx->stub)
+ ctx->state &= ~RDA_FD_RUNNING;
+
+ if (!(ctx->state & RDA_FD_RUNNING)) {
+ fill = 0;
+ STACK_DESTROY(ctx->fill_frame->root);
+ ctx->fill_frame = NULL;
+ }
+
+ UNLOCK(&ctx->lock);
+
+ if (fill)
+ rda_fill_fd(frame, this, local->fd);
+
+ return 0;
+}
+
+/*
+ * Start prepopulating the fd context with directory entries.
+ */
+static int
+rda_fill_fd(call_frame_t *frame, xlator_t *this, fd_t *fd)
+{
+ call_frame_t *nframe = NULL;
+ struct rda_local *local = NULL;
+ struct rda_fd_ctx *ctx;
+ off_t offset;
+ struct rda_priv *priv = this->private;
+
+ ctx = get_rda_fd_ctx(fd, this);
+ if (!ctx)
+ goto err;
+
+ LOCK(&ctx->lock);
+
+ if (ctx->state & RDA_FD_NEW) {
+ ctx->state &= ~RDA_FD_NEW;
+ ctx->state |= RDA_FD_RUNNING;
+ if (priv->rda_low_wmark)
+ ctx->state |= RDA_FD_PLUGGED;
+ }
+
+ offset = ctx->next_offset;
+
+ if (!ctx->fill_frame) {
+ nframe = copy_frame(frame);
+ if (!nframe) {
+ UNLOCK(&ctx->lock);
+ goto err;
+ }
+
+ local = mem_get0(this->local_pool);
+ if (!local) {
+ UNLOCK(&ctx->lock);
+ goto err;
+ }
+
+ local->ctx = ctx;
+ local->fd = fd;
+ nframe->local = local;
+
+ ctx->fill_frame = nframe;
+ } else {
+ nframe = ctx->fill_frame;
+ local = nframe->local;
+ }
+
+ local->offset = offset;
+
+ UNLOCK(&ctx->lock);
+
+ STACK_WIND(nframe, rda_fill_fd_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd, priv->rda_req_size,
+ offset, NULL);
+
+ return 0;
+
+err:
+ if (nframe)
+ FRAME_DESTROY(nframe);
+
+ return -1;
+}
+
+static int32_t
+rda_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+ if (!op_ret)
+ rda_fill_fd(frame, this, fd);
+
+ STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, xdata);
+ return 0;
+}
+
+static int32_t
+rda_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
+{
+ STACK_WIND(frame, rda_opendir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+ return 0;
+}
+
+static int32_t
+rda_releasedir(xlator_t *this, fd_t *fd)
+{
+ uint64_t val;
+ struct rda_fd_ctx *ctx;
+
+ if (fd_ctx_del(fd, this, &val) < 0)
+ return -1;
+
+ ctx = (struct rda_fd_ctx *) val;
+ if (!ctx)
+ return 0;
+
+ rda_reset_ctx(ctx);
+
+ if (ctx->fill_frame)
+ STACK_DESTROY(ctx->fill_frame->root);
+
+ if (ctx->stub)
+ gf_log(this->name, GF_LOG_ERROR,
+ "released a directory with a pending stub");
+
+ GF_FREE(ctx);
+ return 0;
+}
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ goto out;
+
+ ret = xlator_mem_acct_init(this, gf_rda_mt_end + 1);
+
+ if (ret != 0)
+		gf_log(this->name, GF_LOG_ERROR, "Memory accounting init "
+		       "failed");
+
+out:
+ return ret;
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+ struct rda_priv *priv = this->private;
+
+ GF_OPTION_RECONF("rda-request-size", priv->rda_req_size, options,
+ uint32, err);
+ GF_OPTION_RECONF("rda-low-wmark", priv->rda_low_wmark, options, size,
+ err);
+ GF_OPTION_RECONF("rda-high-wmark", priv->rda_high_wmark, options, size,
+ err);
+
+ return 0;
+err:
+ return -1;
+}
+
+int
+init(xlator_t *this)
+{
+ struct rda_priv *priv = NULL;
+
+ GF_VALIDATE_OR_GOTO("readdir-ahead", this, err);
+
+ if (!this->children || this->children->next) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "FATAL: readdir-ahead not configured with exactly one"
+ " child");
+ goto err;
+ }
+
+ if (!this->parents) {
+ gf_log(this->name, GF_LOG_WARNING,
+		       "dangling volume. check volfile");
+ }
+
+ priv = GF_CALLOC(1, sizeof(struct rda_priv), gf_rda_mt_rda_priv);
+ if (!priv)
+ goto err;
+ this->private = priv;
+
+ this->local_pool = mem_pool_new(struct rda_local, 32);
+ if (!this->local_pool)
+ goto err;
+
+ GF_OPTION_INIT("rda-request-size", priv->rda_req_size, uint32, err);
+ GF_OPTION_INIT("rda-low-wmark", priv->rda_low_wmark, size, err);
+ GF_OPTION_INIT("rda-high-wmark", priv->rda_high_wmark, size, err);
+
+ return 0;
+
+err:
+	if (this->local_pool) {
+		mem_pool_destroy(this->local_pool);
+		this->local_pool = NULL;
+	}
+	if (priv) {
+		GF_FREE(priv);
+		this->private = NULL;
+	}
+
+ return -1;
+}
+
+
+void
+fini(xlator_t *this)
+{
+	GF_VALIDATE_OR_GOTO("readdir-ahead", this, out);
+
+ GF_FREE(this->private);
+
+out:
+ return;
+}
+
+struct xlator_fops fops = {
+ .opendir = rda_opendir,
+ .readdirp = rda_readdirp,
+};
+
+struct xlator_cbks cbks = {
+ .releasedir = rda_releasedir,
+};
+
+struct volume_options options[] = {
+ { .key = {"rda-request-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 4096,
+ .max = 131072,
+ .default_value = "131072",
+	  .description = "size of readdirp requests used to preload "
+			 "directory entries",
+ },
+ { .key = {"rda-low-wmark"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .min = 0,
+ .max = 10 * GF_UNIT_MB,
+ .default_value = "4096",
+	  .description = "preload size below which the buffer is plugged "
+			 "(readdirp requests wait for it to refill)",
+ },
+ { .key = {"rda-high-wmark"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .min = 0,
+ .max = 100 * GF_UNIT_MB,
+ .default_value = "131072",
+	  .description = "preload size above which the buffer is unplugged "
+			 "(pending readdirp requests are served from it)",
+ },
+ { .key = {NULL} },
+};
+
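
Like the other performance translators, readdir-ahead is loaded by stacking it in a client volfile. A minimal fragment with the three options from the options[] table at their defaults; the volume and subvolume names (test-readdir-ahead, test-client-0) are illustrative:

    volume test-readdir-ahead
        type performance/readdir-ahead
        option rda-request-size 131072
        option rda-low-wmark 4096
        option rda-high-wmark 131072
        subvolumes test-client-0
    end-volume
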
diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.h b/xlators/performance/readdir-ahead/src/readdir-ahead.h
new file mode 100644
index 000000000..e48786dae
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/readdir-ahead.h
@@ -0,0 +1,46 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __READDIR_AHEAD_H
+#define __READDIR_AHEAD_H
+
+/* state flags */
+#define RDA_FD_NEW (1 << 0)
+#define RDA_FD_RUNNING (1 << 1)
+#define RDA_FD_EOD (1 << 2)
+#define RDA_FD_ERROR (1 << 3)
+#define RDA_FD_BYPASS (1 << 4)
+#define RDA_FD_PLUGGED (1 << 5)
+
+struct rda_fd_ctx {
+ off_t cur_offset; /* current head of the ctx */
+ size_t cur_size; /* current size of the preload */
+ off_t next_offset; /* tail of the ctx */
+ uint32_t state;
+ gf_lock_t lock;
+ gf_dirent_t entries;
+ call_frame_t *fill_frame;
+ call_stub_t *stub;
+ int op_errno;
+};
+
+struct rda_local {
+ struct rda_fd_ctx *ctx;
+ fd_t *fd;
+ off_t offset;
+};
+
+struct rda_priv {
+ uint32_t rda_req_size;
+ uint64_t rda_low_wmark;
+ uint64_t rda_high_wmark;
+};
+
+#endif /* __READDIR_AHEAD_H */
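
RDA_FD_PLUGGED implements a hysteresis between rda-low-wmark and rda-high-wmark: __rda_serve_readdirp() plugs the buffer once the preload drains to the low watermark, and rda_fill_fd_cbk() unplugs it once the preload grows past the high watermark. A distilled, self-contained sketch of that logic (rda_update_plug and struct rda_wmarks are illustrative, not part of the patch):

    #include <stddef.h>
    #include <stdint.h>

    /* mirrors RDA_FD_PLUGGED from the header above */
    #define RDA_FD_PLUGGED (1 << 5)

    struct rda_wmarks {
            uint64_t low;   /* rda-low-wmark  */
            uint64_t high;  /* rda-high-wmark */
    };

    static void
    rda_update_plug(uint32_t *state, size_t cur_size, struct rda_wmarks *w)
    {
            /* __rda_serve_readdirp(): preload drained, hold further reads */
            if (cur_size <= w->low)
                    *state |= RDA_FD_PLUGGED;

            /* rda_fill_fd_cbk(): enough preloaded again, resume serving */
            if (cur_size >= w->high)
                    *state &= ~RDA_FD_PLUGGED;
    }
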
diff --git a/xlators/performance/symlink-cache/src/Makefile.am b/xlators/performance/symlink-cache/src/Makefile.am
index c37d93e86..4091c3293 100644
--- a/xlators/performance/symlink-cache/src/Makefile.am
+++ b/xlators/performance/symlink-cache/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = symlink-cache.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/performance
-symlink_cache_la_LDFLAGS = -module -avoidversion
+symlink_cache_la_LDFLAGS = -module -avoid-version
symlink_cache_la_SOURCES = symlink-cache.c
symlink_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/performance/write-behind/src/Makefile.am b/xlators/performance/write-behind/src/Makefile.am
index 5ca0462ae..6c829d8ee 100644
--- a/xlators/performance/write-behind/src/Makefile.am
+++ b/xlators/performance/write-behind/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = write-behind.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-write_behind_la_LDFLAGS = -module -avoidversion
+write_behind_la_LDFLAGS = -module -avoid-version
write_behind_la_SOURCES = write-behind.c
write_behind_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
index b94b18a4a..95c5921c6 100644
--- a/xlators/performance/write-behind/src/write-behind.c
+++ b/xlators/performance/write-behind/src/write-behind.c
@@ -43,13 +43,6 @@ typedef struct wb_inode {
used for trickling_writes
*/
- int32_t op_ret; /* Last found op_ret and op_errno
- while completing a liability
- operation. Will be picked by
- the next arriving writev/flush/fsync
- */
- int32_t op_errno;
-
list_head_t all; /* All requests, from enqueue() till destroy().
Used only for resetting generation
number when empty.
@@ -89,6 +82,12 @@ typedef struct wb_inode {
write-behind from this list, and therefore
get "upgraded" to the "liability" list.
*/
+ list_head_t wip; /* List of write calls in progress, SYNC or non-SYNC
+ which are currently STACK_WIND'ed towards the server.
+ This is for guaranteeing that no two overlapping
+ writes are in progress at the same time. Modules
+ like eager-lock in AFR depend on this behavior.
+ */
uint64_t gen; /* Liability generation number. Represents
the current 'state' of liability. Every
new addition to the liability list bumps
@@ -120,6 +119,7 @@ typedef struct wb_request {
list_head_t lie; /* either in @liability or @temptation */
list_head_t winds;
list_head_t unwinds;
+ list_head_t wip;
call_stub_t *stub;
@@ -205,6 +205,26 @@ out:
}
+gf_boolean_t
+wb_fd_err (fd_t *fd, xlator_t *this, int32_t *op_errno)
+{
+ gf_boolean_t err = _gf_false;
+ uint64_t value = 0;
+ int32_t tmp = 0;
+
+ if (fd_ctx_get (fd, this, &value) == 0) {
+ if (op_errno) {
+ tmp = value;
+ *op_errno = tmp;
+ }
+
+ err = _gf_true;
+ }
+
+ return err;
+}
+
+
/*
Below is a succinct explanation of the code deciding whether two regions
overlap, from Pavan <tcp@gluster.com>.
@@ -302,6 +322,30 @@ wb_liability_has_conflict (wb_inode_t *wb_inode, wb_request_t *req)
}
+gf_boolean_t
+wb_wip_has_conflict (wb_inode_t *wb_inode, wb_request_t *req)
+{
+ wb_request_t *each = NULL;
+
+ if (req->stub->fop != GF_FOP_WRITE)
+ /* non-writes fundamentally never conflict with WIP requests */
+ return _gf_false;
+
+ list_for_each_entry (each, &wb_inode->wip, wip) {
+ if (each == req)
+ /* request never conflicts with itself,
+ though this condition should never occur.
+ */
+ continue;
+
+ if (wb_requests_overlap (each, req))
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
+
static int
__wb_request_unref (wb_request_t *req)
{
@@ -320,6 +364,7 @@ __wb_request_unref (wb_request_t *req)
if (req->refcount == 0) {
list_del_init (&req->todo);
list_del_init (&req->lie);
+ list_del_init (&req->wip);
list_del_init (&req->all);
if (list_empty (&wb_inode->all)) {
@@ -425,6 +470,7 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
INIT_LIST_HEAD (&req->lie);
INIT_LIST_HEAD (&req->winds);
INIT_LIST_HEAD (&req->unwinds);
+ INIT_LIST_HEAD (&req->wip);
req->stub = stub;
req->wb_inode = wb_inode;
@@ -432,8 +478,8 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
req->ordering.tempted = tempted;
if (stub->fop == GF_FOP_WRITE) {
- req->write_size = iov_length (stub->args.writev.vector,
- stub->args.writev.count);
+ req->write_size = iov_length (stub->args.vector,
+ stub->args.count);
/* req->write_size can change as we collapse
small writes. But the window needs to grow
@@ -449,7 +495,7 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
req->op_ret = req->write_size;
req->op_errno = 0;
- if (stub->args.writev.fd->flags & O_APPEND)
+ if (stub->args.fd->flags & O_APPEND)
req->ordering.append = 1;
}
@@ -457,28 +503,28 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
switch (stub->fop) {
case GF_FOP_WRITE:
- req->ordering.off = stub->args.writev.off;
+ req->ordering.off = stub->args.offset;
req->ordering.size = req->write_size;
- req->fd = fd_ref (stub->args.writev.fd);
+ req->fd = fd_ref (stub->args.fd);
break;
case GF_FOP_READ:
- req->ordering.off = stub->args.readv.off;
- req->ordering.size = stub->args.readv.size;
+ req->ordering.off = stub->args.offset;
+ req->ordering.size = stub->args.size;
- req->fd = fd_ref (stub->args.readv.fd);
+ req->fd = fd_ref (stub->args.fd);
break;
case GF_FOP_TRUNCATE:
- req->ordering.off = stub->args.truncate.off;
+ req->ordering.off = stub->args.offset;
req->ordering.size = 0; /* till infinity */
break;
case GF_FOP_FTRUNCATE:
- req->ordering.off = stub->args.ftruncate.off;
+ req->ordering.off = stub->args.offset;
req->ordering.size = 0; /* till infinity */
- req->fd = fd_ref (stub->args.ftruncate.fd);
+ req->fd = fd_ref (stub->args.fd);
break;
default:
@@ -541,6 +587,7 @@ __wb_inode_create (xlator_t *this, inode_t *inode)
INIT_LIST_HEAD (&wb_inode->todo);
INIT_LIST_HEAD (&wb_inode->liability);
INIT_LIST_HEAD (&wb_inode->temptation);
+ INIT_LIST_HEAD (&wb_inode->wip);
wb_inode->this = this;
@@ -629,12 +676,25 @@ wb_head_done (wb_request_t *head)
void
-wb_inode_err (wb_inode_t *wb_inode, int op_errno)
+wb_fulfill_err (wb_request_t *head, int op_errno)
{
+ wb_inode_t *wb_inode;
+ wb_request_t *req;
+
+ wb_inode = head->wb_inode;
+
+ /* for all future requests yet to arrive */
+ fd_ctx_set (head->fd, THIS, op_errno);
+
LOCK (&wb_inode->lock);
{
- wb_inode->op_ret = -1;
- wb_inode->op_errno = op_errno;
+ /* for all requests already arrived */
+ list_for_each_entry (req, &wb_inode->all, all) {
+ if (req->fd != head->fd)
+ continue;
+ req->op_ret = -1;
+ req->op_errno = op_errno;
+ }
}
UNLOCK (&wb_inode->lock);
}
@@ -654,7 +714,7 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
wb_inode = head->wb_inode;
if (op_ret == -1) {
- wb_inode_err (wb_inode, op_errno);
+ wb_fulfill_err (head, op_errno);
} else if (op_ret < head->total_size) {
/*
* We've encountered a short write, for whatever reason.
@@ -664,7 +724,7 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
* TODO: Retry the write so we can potentially capture
* a real error condition (i.e., ENOSPC).
*/
- wb_inode_err (wb_inode, EIO);
+ wb_fulfill_err (head, EIO);
}
wb_head_done (head);
@@ -678,34 +738,47 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
#define WB_IOV_LOAD(vec, cnt, req, head) do { \
- memcpy (&vec[cnt], req->stub->args.writev.vector, \
- (req->stub->args.writev.count * sizeof(vec[0]))); \
- cnt += req->stub->args.writev.count; \
+ memcpy (&vec[cnt], req->stub->args.vector, \
+ (req->stub->args.count * sizeof(vec[0]))); \
+ cnt += req->stub->args.count; \
head->total_size += req->write_size; \
} while (0)
-void
+int
wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)
{
- struct iovec vector[MAX_VECTOR_COUNT];
- int count = 0;
- wb_request_t *req = NULL;
- call_frame_t *frame = NULL;
+ struct iovec vector[MAX_VECTOR_COUNT];
+ int count = 0;
+ wb_request_t *req = NULL;
+ call_frame_t *frame = NULL;
+ gf_boolean_t fderr = _gf_false;
+ xlator_t *this = NULL;
- frame = create_frame (wb_inode->this, wb_inode->this->ctx->pool);
- if (!frame)
- goto enomem;
+ this = THIS;
+
+ /* make sure head->total_size is updated before we run into any
+ * errors
+ */
WB_IOV_LOAD (vector, count, head, head);
list_for_each_entry (req, &head->winds, winds) {
WB_IOV_LOAD (vector, count, req, head);
- iobref_merge (head->stub->args.writev.iobref,
- req->stub->args.writev.iobref);
+ iobref_merge (head->stub->args.iobref,
+ req->stub->args.iobref);
}
+ if (wb_fd_err (head->fd, this, NULL)) {
+ fderr = _gf_true;
+ goto err;
+ }
+
+ frame = create_frame (wb_inode->this, wb_inode->this->ctx->pool);
+ if (!frame)
+ goto err;
+
frame->root->lk_owner = head->lk_owner;
frame->local = head;
@@ -718,32 +791,36 @@ wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)
STACK_WIND (frame, wb_fulfill_cbk, FIRST_CHILD (frame->this),
FIRST_CHILD (frame->this)->fops->writev,
head->fd, vector, count,
- head->stub->args.writev.off,
- head->stub->args.writev.flags,
- head->stub->args.writev.iobref, NULL);
+ head->stub->args.offset,
+ head->stub->args.flags,
+ head->stub->args.iobref, NULL);
- return;
-enomem:
- wb_inode_err (wb_inode, ENOMEM);
+ return 0;
+err:
+ if (!fderr) {
+ /* frame creation failure */
+ fderr = ENOMEM;
+ wb_fulfill_err (head, fderr);
+ }
wb_head_done (head);
- return;
+ return fderr;
}
#define NEXT_HEAD(head, req) do { \
if (head) \
- wb_fulfill_head (wb_inode, head); \
+ ret |= wb_fulfill_head (wb_inode, head); \
head = req; \
- expected_offset = req->stub->args.writev.off + \
+ expected_offset = req->stub->args.offset + \
req->write_size; \
curr_aggregate = 0; \
vector_count = 0; \
} while (0)
-void
+int
wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
{
wb_request_t *req = NULL;
@@ -753,6 +830,7 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
off_t expected_offset = 0;
size_t curr_aggregate = 0;
size_t vector_count = 0;
+ int ret = 0;
conf = wb_inode->this->private;
@@ -774,7 +852,7 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
continue;
}
- if (expected_offset != req->stub->args.writev.off) {
+ if (expected_offset != req->stub->args.offset) {
NEXT_HEAD (head, req);
continue;
}
@@ -784,7 +862,7 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
continue;
}
- if (vector_count + req->stub->args.writev.count >
+ if (vector_count + req->stub->args.count >
MAX_VECTOR_COUNT) {
NEXT_HEAD (head, req);
continue;
@@ -792,12 +870,13 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
list_add_tail (&req->winds, &head->winds);
curr_aggregate += req->write_size;
- vector_count += req->stub->args.writev.count;
+ vector_count += req->stub->args.count;
}
if (head)
- wb_fulfill_head (wb_inode, head);
- return;
+ ret |= wb_fulfill_head (wb_inode, head);
+
+ return ret;
}
@@ -866,10 +945,10 @@ __wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req)
size_t req_len = 0;
if (!holder->iobref) {
- holder_len = iov_length (holder->stub->args.writev.vector,
- holder->stub->args.writev.count);
- req_len = iov_length (req->stub->args.writev.vector,
- req->stub->args.writev.count);
+ holder_len = iov_length (holder->stub->args.vector,
+ holder->stub->args.count);
+ req_len = iov_length (req->stub->args.vector,
+ req->stub->args.count);
required_size = max ((THIS->ctx->page_size),
(holder_len + req_len));
@@ -895,25 +974,25 @@ __wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req)
goto out;
}
- iov_unload (iobuf->ptr, holder->stub->args.writev.vector,
- holder->stub->args.writev.count);
- holder->stub->args.writev.vector[0].iov_base = iobuf->ptr;
- holder->stub->args.writev.count = 1;
+ iov_unload (iobuf->ptr, holder->stub->args.vector,
+ holder->stub->args.count);
+ holder->stub->args.vector[0].iov_base = iobuf->ptr;
+ holder->stub->args.count = 1;
- iobref_unref (holder->stub->args.writev.iobref);
- holder->stub->args.writev.iobref = iobref;
+ iobref_unref (holder->stub->args.iobref);
+ holder->stub->args.iobref = iobref;
iobuf_unref (iobuf);
holder->iobref = iobref_ref (iobref);
}
- ptr = holder->stub->args.writev.vector[0].iov_base + holder->write_size;
+ ptr = holder->stub->args.vector[0].iov_base + holder->write_size;
- iov_unload (ptr, req->stub->args.writev.vector,
- req->stub->args.writev.count);
+ iov_unload (ptr, req->stub->args.vector,
+ req->stub->args.count);
- holder->stub->args.writev.vector[0].iov_len += req->write_size;
+ holder->stub->args.vector[0].iov_len += req->write_size;
holder->write_size += req->write_size;
holder->ordering.size += req->write_size;
@@ -963,10 +1042,10 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)
continue;
}
- offset_expected = holder->stub->args.writev.off
+ offset_expected = holder->stub->args.offset
+ holder->write_size;
- if (req->stub->args.writev.off != offset_expected) {
+ if (req->stub->args.offset != offset_expected) {
holder->ordering.go = 1;
holder = req;
continue;
@@ -978,6 +1057,12 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)
continue;
}
+ if (req->fd != holder->fd) {
+ holder->ordering.go = 1;
+ holder = req;
+ continue;
+ }
+
space_left = page_size - holder->write_size;
if (space_left < req->write_size) {
@@ -1032,6 +1117,18 @@ __wb_pick_winds (wb_inode_t *wb_inode, list_head_t *tasks,
/* wait some more */
continue;
+ if (req->stub->fop == GF_FOP_WRITE) {
+ if (wb_wip_has_conflict (wb_inode, req))
+ continue;
+
+ list_add_tail (&req->wip, &wb_inode->wip);
+
+ if (!req->ordering.tempted)
+ /* unrefed in wb_writev_cbk */
+ req->stub->frame->local =
+ __wb_request_ref (req);
+ }
+
list_del_init (&req->todo);
if (req->ordering.tempted)
@@ -1064,38 +1161,69 @@ wb_process_queue (wb_inode_t *wb_inode)
list_head_t tasks = {0, };
list_head_t lies = {0, };
list_head_t liabilities = {0, };
+ int retry = 0;
INIT_LIST_HEAD (&tasks);
INIT_LIST_HEAD (&lies);
INIT_LIST_HEAD (&liabilities);
- LOCK (&wb_inode->lock);
- {
- __wb_preprocess_winds (wb_inode);
+ do {
+ LOCK (&wb_inode->lock);
+ {
+ __wb_preprocess_winds (wb_inode);
- __wb_pick_winds (wb_inode, &tasks, &liabilities);
+ __wb_pick_winds (wb_inode, &tasks, &liabilities);
- __wb_pick_unwinds (wb_inode, &lies);
+ __wb_pick_unwinds (wb_inode, &lies);
- }
- UNLOCK (&wb_inode->lock);
+ }
+ UNLOCK (&wb_inode->lock);
- wb_do_unwinds (wb_inode, &lies);
+ wb_do_unwinds (wb_inode, &lies);
- wb_do_winds (wb_inode, &tasks);
+ wb_do_winds (wb_inode, &tasks);
- wb_fulfill (wb_inode, &liabilities);
+		/* The fd might have been marked bad due to previous errors.
+		 * Since the caller of wb_process_queue might be the last fop
+		 * on the inode, keep processing the request queue until no
+		 * requests are left.
+		 */
+ retry = wb_fulfill (wb_inode, &liabilities);
+ } while (retry);
return;
}
int
+wb_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+ wb_request_t *req = NULL;
+ wb_inode_t *wb_inode;
+
+ req = frame->local;
+ frame->local = NULL;
+ wb_inode = req->wb_inode;
+
+ wb_request_unref (req);
+
+ /* requests could be pending while this was in progress */
+	wb_process_queue (wb_inode);
+
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+
+int
wb_writev_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t offset,
uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
- STACK_WIND (frame, default_writev_cbk,
+ STACK_WIND (frame, wb_writev_cbk,
FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev,
fd, vector, count, offset, flags, iobref, xdata);
return 0;
@@ -1112,10 +1240,15 @@ wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
gf_boolean_t wb_disabled = 0;
call_stub_t *stub = NULL;
int ret = -1;
- int op_errno = EINVAL;
+ int32_t op_errno = EINVAL;
int o_direct = O_DIRECT;
conf = this->private;
+
+ if (wb_fd_err (fd, this, &op_errno)) {
+ goto unwind;
+ }
+
wb_inode = wb_inode_create (this, fd->inode);
if (!wb_inode) {
op_errno = ENOMEM;
@@ -1128,24 +1261,9 @@ wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
if (fd->flags & (O_SYNC|O_DSYNC|o_direct))
wb_disabled = 1;
- if (flags & (O_SYNC|O_DSYNC|O_DIRECT))
- /* O_DIRECT flag in params of writev must _always_ be honored */
+ if (flags & (O_SYNC|O_DSYNC|o_direct))
wb_disabled = 1;
- op_errno = 0;
- LOCK (&wb_inode->lock);
- {
- /* pick up a previous error in fulfillment */
- if (wb_inode->op_ret < 0)
- op_errno = wb_inode->op_errno;
-
- wb_inode->op_ret = 0;
- }
- UNLOCK (&wb_inode->lock);
-
- if (op_errno)
- goto unwind;
-
if (wb_disabled)
stub = fop_writev_stub (frame, wb_writev_helper, fd, vector,
count, offset, flags, iobref, xdata);
@@ -1243,7 +1361,7 @@ wb_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
wb_conf_t *conf = NULL;
wb_inode_t *wb_inode = NULL;
call_frame_t *bg_frame = NULL;
- int op_errno = 0;
+ int32_t op_errno = 0;
int op_ret = 0;
conf = this->private;
@@ -1255,19 +1373,10 @@ wb_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
goto unwind;
}
- LOCK (&wb_inode->lock);
- {
- if (wb_inode->op_ret < 0) {
- op_ret = -1;
- op_errno = wb_inode->op_errno;
- }
-
- wb_inode->op_ret = 0;
- }
- UNLOCK (&wb_inode->lock);
-
- if (op_errno)
+ if (wb_fd_err (fd, this, &op_errno)) {
+ op_ret = -1;
goto unwind;
+ }
if (conf->flush_behind)
goto flushbehind;
@@ -1311,7 +1420,7 @@ wb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
if (!wb_enqueue (wb_inode, stub))
goto unwind;
- wb_process_queue (wb_inode);
+ wb_process_queue (wb_inode);
return 0;
@@ -1344,6 +1453,10 @@ wb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
{
wb_inode_t *wb_inode = NULL;
call_stub_t *stub = NULL;
+ int32_t op_errno = EINVAL;
+
+ if (wb_fd_err (fd, this, &op_errno))
+ goto unwind;
wb_inode = wb_inode_ctx_get (this, fd->inode);
if (!wb_inode)
@@ -1361,7 +1474,7 @@ wb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
return 0;
unwind:
- STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, NULL, NULL, NULL);
+ STACK_UNWIND_STRICT (fsync, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
@@ -1521,25 +1634,35 @@ wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
{
wb_inode_t *wb_inode = NULL;
call_stub_t *stub = NULL;
+ int32_t op_errno = 0;
wb_inode = wb_inode_create (this, fd->inode);
- if (!wb_inode)
+ if (!wb_inode) {
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ if (wb_fd_err (fd, this, &op_errno))
goto unwind;
stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd,
offset, xdata);
- if (!stub)
+ if (!stub) {
+ op_errno = ENOMEM;
goto unwind;
+ }
- if (!wb_enqueue (wb_inode, stub))
+ if (!wb_enqueue (wb_inode, stub)) {
+ op_errno = ENOMEM;
goto unwind;
+ }
wb_process_queue (wb_inode);
return 0;
unwind:
- STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+ STACK_UNWIND_STRICT (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
if (stub)
call_stub_destroy (stub);
@@ -1663,6 +1786,17 @@ wb_forget (xlator_t *this, inode_t *inode)
int
+wb_release (xlator_t *this, fd_t *fd)
+{
+ uint64_t tmp = 0;
+
+ fd_ctx_del (fd, this, &tmp);
+
+ return 0;
+}
+
+
+int
wb_priv_dump (xlator_t *this)
{
wb_conf_t *conf = NULL;
@@ -1717,7 +1851,7 @@ __wb_dump_requests (struct list_head *head, char *prefix)
req->write_size);
gf_proc_dump_write ("offset", "%"PRId64,
- req->stub->args.writev.off);
+ req->stub->args.offset);
flag = req->ordering.lied;
gf_proc_dump_write ("lied", "%d", flag);
@@ -1774,9 +1908,6 @@ wb_inode_dump (xlator_t *this, inode_t *inode)
gf_proc_dump_write ("window_current", "%"GF_PRI_SIZET,
wb_inode->window_current);
- gf_proc_dump_write ("op_ret", "%d", wb_inode->op_ret);
-
- gf_proc_dump_write ("op_errno", "%d", wb_inode->op_errno);
ret = TRY_LOCK (&wb_inode->lock);
if (!ret)
@@ -1949,6 +2080,7 @@ struct xlator_fops fops = {
struct xlator_cbks cbks = {
.forget = wb_forget,
+ .release = wb_release
};
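
From an application's point of view, the per-fd error handling above (wb_fulfill_err()/wb_fd_err(), cleared in wb_release()) means a write absorbed by write-behind can fail later in the background, and the pended errno is then returned by the next write, flush, or fsync on the same fd instead of being lost. A sketch of that interaction against a FUSE mount; the mount path and file name are illustrative:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
            char buf[4096];
            int  fd = open("/mnt/gv0/testfile", O_CREAT | O_WRONLY, 0644);

            if (fd < 0)
                    return 1;

            memset(buf, 'a', sizeof(buf));

            /* usually succeeds immediately: write-behind only caches it */
            if (write(fd, buf, sizeof(buf)) < 0)
                    perror("write");

            /* if the background flush failed (e.g. ENOSPC on the brick),
             * the errno pended on the fd is reported here instead of
             * being silently dropped */
            if (fsync(fd) < 0)
                    perror("fsync");

            close(fd);
            return 0;
    }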