summaryrefslogtreecommitdiffstats
path: root/libglusterfs
diff options
context:
space:
mode:
authorAmar Tumballi <amarts@redhat.com>2018-10-16 16:31:49 +0530
committerAmar Tumballi <amarts@redhat.com>2018-12-14 17:34:28 +0000
commitd49b41e817d592c1904b6f01716df6546dad3ebe (patch)
tree12546ad6da0534683cb7bed5b829500ecdfafe93 /libglusterfs
parentfc74ef85e0780e0a265275df00e4d0d4a2d05eab (diff)
fuse: add --lru-limit option
The inode LRU mechanism is moot in fuse xlator (ie. there is no limit for the LRU list), as fuse inodes are referenced from kernel context, and thus they can only be dropped on request of the kernel. This might results in a high number of passive inodes which are useless for the glusterfs client, causing a significant memory overhead. This change tries to remedy this by extending the LRU semantics and allowing to set a finite limit on the fuse inode LRU. A brief history of problem: When gluster's inode table was designed, fuse didn't have any 'invalidate' method, which means, userspace application could never ask kernel to send a 'forget()' fop, instead had to wait for kernel to send it based on kernel's parameters. Inode table remembers the number of times kernel has cached the inode based on the 'nlookup' parameter. And 'nlookup' field is not used by no other entry points (like server-protocol, gfapi etc). Hence the inode_table of fuse module always has to have lru-limit as '0', which means no limit. GlusterFS always had to keep all inodes in memory as kernel would have had a reference to it. Again, the reason for this is, kernel's glusterfs inode reference was pointer of 'inode_t' structure in glusterfs. As it is a pointer, we could never free it (to prevent segfault, or memory corruption). Solution: In the inode table, handle the prune case of inodes with 'nlookup' differently, and call a 'invalidator' method, which in this case is fuse_invalidate(), and it sends the request to kernel for getting the forget request. When the kernel sends the forget, it means, it has dropped all the reference to the inode, and it will send the forget with the 'nlookup' parameter too. We just need to make sure to reduce the 'nlookup' value we have when we get forget. That automatically cause the relevant prune to happen. Credits: Csaba Henk, Xavier Hernandez, Raghavendra Gowdappa, Nithya B fixes: bz#1560969 Change-Id: Ifee0737b23b12b1426c224ec5b8f591f487d83a2 Signed-off-by: Amar Tumballi <amarts@redhat.com>
Diffstat (limited to 'libglusterfs')
-rw-r--r--libglusterfs/src/glusterfs/glusterfs.h1
-rw-r--r--libglusterfs/src/glusterfs/inode.h17
-rw-r--r--libglusterfs/src/inode.c254
-rw-r--r--libglusterfs/src/libglusterfs.sym2
4 files changed, 238 insertions, 36 deletions
diff --git a/libglusterfs/src/glusterfs/glusterfs.h b/libglusterfs/src/glusterfs/glusterfs.h
index 908a0ce774f..9f14f2f5440 100644
--- a/libglusterfs/src/glusterfs/glusterfs.h
+++ b/libglusterfs/src/glusterfs/glusterfs.h
@@ -524,6 +524,7 @@ struct _cmd_args {
pid_t client_pid;
int client_pid_set;
unsigned uid_map_root;
+ int32_t lru_limit;
int background_qlen;
int congestion_threshold;
char *fuse_mountopts;
diff --git a/libglusterfs/src/glusterfs/inode.h b/libglusterfs/src/glusterfs/inode.h
index 5934373ec5b..52efdd85ccc 100644
--- a/libglusterfs/src/glusterfs/inode.h
+++ b/libglusterfs/src/glusterfs/inode.h
@@ -54,6 +54,13 @@ struct _inode_table {
struct mem_pool *dentry_pool; /* memory pool for dentrys */
struct mem_pool *fd_mem_pool; /* memory pool for fd_t */
int ctxcount; /* number of slots in inode->ctx */
+
+ /* This is required for 'invalidation' when 'nlookup' would be used,
+ specially in case of fuse-bridge */
+ int32_t (*invalidator_fn)(xlator_t *, inode_t *);
+ xlator_t *invalidator_xl;
+ struct list_head invalidate; /* inodes which are in invalidation queue */
+ uint32_t invalidate_size; /* count of inodes in invalidation list */
};
struct _dentry {
@@ -100,6 +107,7 @@ struct _inode {
struct list_head list; /* active/lru/purge */
struct _inode_ctx *_ctx; /* replacement for dict_t *(inode->ctx) */
+ bool invalidate_sent; /* Set it if invalidator_fn is called for inode */
};
#define UUID0_STR "00000000-0000-0000-0000-000000000000"
@@ -107,7 +115,12 @@ struct _inode {
#define GFID_STR_PFX_LEN (sizeof(GFID_STR_PFX) - 1)
inode_table_t *
-inode_table_new(size_t lru_limit, xlator_t *xl);
+inode_table_new(uint32_t lru_limit, xlator_t *xl);
+
+inode_table_t *
+inode_table_with_invalidator(uint32_t lru_limit, xlator_t *xl,
+ int32_t (*invalidator_fn)(xlator_t *, inode_t *),
+ xlator_t *invalidator_xl);
void
inode_table_destroy_all(glusterfs_ctx_t *ctx);
@@ -139,6 +152,8 @@ inode_lookup(inode_t *inode);
int
inode_forget(inode_t *inode, uint64_t nlookup);
+int
+inode_forget_with_unref(inode_t *inode, uint64_t nlookup);
int
inode_ref_reduce_by_n(inode_t *inode, uint64_t nref);
diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c
index 3bf32cfe442..b4a62897498 100644
--- a/libglusterfs/src/inode.c
+++ b/libglusterfs/src/inode.c
@@ -23,6 +23,100 @@
move latest accessed dentry to list_head of inode
*/
+// clang-format off
+/*
+
+Details as per Xavi:
+
+ I think we should have 3 lists: active, lru and invalidate.
+
+We'll need 3 things: refs, nlookups and invalidate_sent flag. Any change of
+refs, invalidate_sent flag and moving from one list to another must be done
+atomically.
+
+With this information, these are the states that cause a transition:
+
+ refs nlookups inv_sent op
+ 1 0 0 unref -> refs = 0, active--->destroy
+ 1 1 0 unref -> refs = 0, active--->lru
+ 1 1 0 forget -> nlookups = 0, active--->active
+ *0 1 0 forget -> nlookups = 0, lru--->destroy
+ *0 1 1 forget -> nlookups = 0, invalidate--->destroy
+ 0 1 0 ref -> refs = 1, lru--->active
+ 0 1 1 ref -> refs = 1, inv_sent = 0, invalidate--->active
+ 0 1 0 overflow -> refs = 1, inv_sent = 1, lru--->invalidate
+ 1 1 1 unref -> refs = 0, invalidate--->invalidate
+ 1 1 1 forget -> nlookups = 0, inv_sent = 0, invalidate--->active
+
+(*) technically these combinations cannot happen because a forget sent by the
+kernel first calls ref() and then unref(). However it's equivalent.
+
+overflow means that lru list has grown beyond the limit and the inode needs to
+be invalidated. All other combinations do not cause a change in state or are not
+possible.
+
+Based on this, the code could be similar to this:
+
+ ref(inode, inv)
+ {
+ if (refs == 0) {
+ if (inv_sent) {
+ invalidate_count--;
+ inv_sent = 0;
+ } else {
+ lru_count--;
+ }
+ if (inv) {
+ inv_sent = 1;
+ invalidate_count++;
+ list_move(inode, invalidate);
+ } else {
+ active_count++;
+ list_move(inode, active);
+ }
+ }
+ refs++;
+ }
+
+ unref(inode, clear)
+ {
+ if (clear && inv_sent) {
+ // there is a case of fuse itself sending forget, without
+ // invalidate, after entry delete, like unlink(), rmdir().
+ inv_sent = 0;
+ invalidate_count--;
+ active_count++;
+ list_move(inode, active);
+ }
+ refs--;
+ if ((refs == 0) && !inv_sent) {
+ active_count--;
+ if (nlookups == 0) {
+ destroy(inode);
+ } else {
+ lru_count++;
+ list_move(inode, lru);
+ }
+ }
+ }
+
+ forget(inode)
+ {
+ ref(inode, false);
+ nlookups--;
+ unref(inode, true);
+ }
+
+ overflow(inode)
+ {
+ ref(inode, true);
+ invalidator(inode);
+ unref(inode, false);
+ }
+
+*/
+// clang-format on
+
#define INODE_DUMP_LIST(head, key_buf, key_prefix, list_type) \
{ \
int i = 1; \
@@ -37,7 +131,7 @@
}
static inode_t *
-__inode_unref(inode_t *inode);
+__inode_unref(inode_t *inode, bool clear);
static int
inode_table_prune(inode_table_t *table);
@@ -132,7 +226,7 @@ __dentry_unset(dentry_t *dentry)
dentry->name = NULL;
if (dentry->parent) {
- __inode_unref(dentry->parent);
+ __inode_unref(dentry->parent, false);
dentry->parent = NULL;
}
@@ -446,7 +540,7 @@ out:
}
static inode_t *
-__inode_unref(inode_t *inode)
+__inode_unref(inode_t *inode, bool clear)
{
int index = 0;
xlator_t *this = NULL;
@@ -455,8 +549,6 @@ __inode_unref(inode_t *inode)
if (!inode)
return NULL;
- this = THIS;
-
/*
* Root inode should always be in active list of inode table. So unrefs
* on root inode are no-ops.
@@ -464,6 +556,13 @@ __inode_unref(inode_t *inode)
if (__is_root_gfid(inode->gfid))
return inode;
+ this = THIS;
+
+ if (clear && inode->invalidate_sent) {
+ inode->invalidate_sent = false;
+ inode->table->invalidate_size--;
+ __inode_activate(inode);
+ }
GF_ASSERT(inode->ref);
--inode->ref;
@@ -474,7 +573,7 @@ __inode_unref(inode_t *inode)
inode->_ctx[index].ref--;
}
- if (!inode->ref) {
+ if (!inode->ref && !inode->invalidate_sent) {
inode->table->active_size--;
nlookup = GF_ATOMIC_GET(inode->nlookup);
@@ -488,7 +587,7 @@ __inode_unref(inode_t *inode)
}
static inode_t *
-__inode_ref(inode_t *inode)
+__inode_ref(inode_t *inode, bool is_invalidate)
{
int index = 0;
xlator_t *this = NULL;
@@ -498,11 +597,6 @@ __inode_ref(inode_t *inode)
this = THIS;
- if (!inode->ref) {
- inode->table->lru_size--;
- __inode_activate(inode);
- }
-
/*
* Root inode should always be in active list of inode table. So unrefs
* on root inode are no-ops. If we do not allow unrefs but allow refs,
@@ -514,6 +608,22 @@ __inode_ref(inode_t *inode)
if (__is_root_gfid(inode->gfid) && inode->ref)
return inode;
+ if (!inode->ref) {
+ if (inode->invalidate_sent) {
+ inode->invalidate_sent = false;
+ inode->table->invalidate_size--;
+ } else {
+ inode->table->lru_size--;
+ }
+ if (is_invalidate) {
+ inode->invalidate_sent = true;
+ inode->table->invalidate_size++;
+ list_move_tail(&inode->list, &inode->table->invalidate);
+ } else {
+ __inode_activate(inode);
+ }
+ }
+
inode->ref++;
index = __inode_get_xl_index(inode, this);
@@ -537,7 +647,7 @@ inode_unref(inode_t *inode)
pthread_mutex_lock(&table->lock);
{
- inode = __inode_unref(inode);
+ inode = __inode_unref(inode, false);
}
pthread_mutex_unlock(&table->lock);
@@ -558,7 +668,7 @@ inode_ref(inode_t *inode)
pthread_mutex_lock(&table->lock);
{
- inode = __inode_ref(inode);
+ inode = __inode_ref(inode, false);
}
pthread_mutex_unlock(&table->lock);
@@ -592,7 +702,7 @@ __dentry_create(inode_t *inode, inode_t *parent, const char *name)
}
if (parent)
- newd->parent = __inode_ref(parent);
+ newd->parent = __inode_ref(parent, false);
list_add(&newd->inode_list, &inode->dentry_list);
newd->inode = inode;
@@ -662,7 +772,7 @@ inode_new(inode_table_t *table)
{
inode = __inode_create(table);
if (inode != NULL) {
- __inode_ref(inode);
+ __inode_ref(inode, false);
}
}
pthread_mutex_unlock(&table->lock);
@@ -769,7 +879,7 @@ inode_grep(inode_table_t *table, inode_t *parent, const char *name)
inode = dentry->inode;
if (inode)
- __inode_ref(inode);
+ __inode_ref(inode, false);
}
pthread_mutex_unlock(&table->lock);
@@ -912,7 +1022,7 @@ inode_find(inode_table_t *table, uuid_t gfid)
{
inode = __inode_find(table, gfid);
if (inode)
- __inode_ref(inode);
+ __inode_ref(inode, false);
}
pthread_mutex_unlock(&table->lock);
@@ -1057,7 +1167,7 @@ inode_link(inode_t *inode, inode_t *parent, const char *name, struct iatt *iatt)
linked_inode = __inode_link(inode, parent, name, iatt);
if (linked_inode)
- __inode_ref(linked_inode);
+ __inode_ref(linked_inode, false);
}
pthread_mutex_unlock(&table->lock);
@@ -1124,6 +1234,31 @@ inode_forget(inode_t *inode, uint64_t nlookup)
return 0;
}
+int
+inode_forget_with_unref(inode_t *inode, uint64_t nlookup)
+{
+ inode_table_t *table = NULL;
+
+ if (!inode) {
+ gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND,
+ "inode not found");
+ return -1;
+ }
+
+ table = inode->table;
+
+ pthread_mutex_lock(&table->lock);
+ {
+ inode_forget_atomic(inode, nlookup);
+ __inode_unref(inode, true);
+ }
+ pthread_mutex_unlock(&table->lock);
+
+ inode_table_prune(table);
+
+ return 0;
+}
+
/*
* Invalidate an inode. This is invoked when a translator decides that an
* inode's cache is no longer valid. Any translator interested in taking action
@@ -1298,7 +1433,7 @@ inode_parent(inode_t *inode, uuid_t pargfid, const char *name)
parent = dentry->parent;
if (parent)
- __inode_ref(parent);
+ __inode_ref(parent, false);
}
pthread_mutex_unlock(&table->lock);
@@ -1480,6 +1615,8 @@ inode_table_prune(inode_table_t *table)
inode_t *del = NULL;
inode_t *tmp = NULL;
inode_t *entry = NULL;
+ uint64_t nlookup = 0;
+ int64_t lru_size = 0;
if (!table)
return -1;
@@ -1488,7 +1625,11 @@ inode_table_prune(inode_table_t *table)
pthread_mutex_lock(&table->lock);
{
- while (table->lru_limit && table->lru_size > (table->lru_limit)) {
+ if (!table->lru_limit)
+ goto purge_list;
+
+ lru_size = table->lru_size;
+ while (lru_size > (table->lru_limit)) {
if (list_empty(&table->lru)) {
gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0,
LG_MSG_INVALID_INODE_LIST,
@@ -1498,26 +1639,46 @@ inode_table_prune(inode_table_t *table)
break;
}
+ lru_size--;
entry = list_entry(table->lru.next, inode_t, list);
+ /* The logic of invalidation is required only if invalidator_fn
+ is present */
+ if (table->invalidator_fn) {
+ /* check for valid inode with 'nlookup' */
+ nlookup = GF_ATOMIC_GET(entry->nlookup);
+ if (nlookup) {
+ __inode_ref(entry, true);
+ tmp = entry;
+ break;
+ }
+ }
table->lru_size--;
__inode_retire(entry);
-
ret++;
}
+ purge_list:
list_splice_init(&table->purge, &purge);
table->purge_size = 0;
}
pthread_mutex_unlock(&table->lock);
+ /* Pick 1 inode for invalidation */
+ if (tmp) {
+ xlator_t *old_THIS = THIS;
+ THIS = table->invalidator_xl;
+ table->invalidator_fn(table->invalidator_xl, tmp);
+ THIS = old_THIS;
+ inode_unref(tmp);
+ }
+
+ /* Just so that if purge list is handled too, then clear it off */
+ list_for_each_entry_safe(del, tmp, &purge, list)
{
- list_for_each_entry_safe(del, tmp, &purge, list)
- {
- list_del_init(&del->list);
- inode_forget_atomic(del, 0);
- __inode_destroy(del);
- }
+ list_del_init(&del->list);
+ inode_forget_atomic(del, 0);
+ __inode_destroy(del);
}
return ret;
@@ -1545,9 +1706,12 @@ __inode_table_init_root(inode_table_t *table)
}
inode_table_t *
-inode_table_new(size_t lru_limit, xlator_t *xl)
+inode_table_with_invalidator(uint32_t lru_limit, xlator_t *xl,
+ int32_t (*invalidator_fn)(xlator_t *, inode_t *),
+ xlator_t *invalidator_xl)
{
inode_table_t *new = NULL;
+ uint32_t mem_pool_size = lru_limit;
int ret = -1;
int i = 0;
@@ -1559,20 +1723,20 @@ inode_table_new(size_t lru_limit, xlator_t *xl)
new->ctxcount = xl->graph->xl_count + 1;
new->lru_limit = lru_limit;
+ new->invalidator_fn = invalidator_fn;
+ new->invalidator_xl = invalidator_xl;
new->hashsize = 14057; /* TODO: Random Number?? */
/* In case FUSE is initing the inode table. */
- if (lru_limit == 0)
- lru_limit = DEFAULT_INODE_MEMPOOL_ENTRIES;
-
- new->inode_pool = mem_pool_new(inode_t, lru_limit);
+ if (!mem_pool_size || (mem_pool_size > DEFAULT_INODE_MEMPOOL_ENTRIES))
+ mem_pool_size = DEFAULT_INODE_MEMPOOL_ENTRIES;
+ new->inode_pool = mem_pool_new(inode_t, mem_pool_size);
if (!new->inode_pool)
goto out;
- new->dentry_pool = mem_pool_new(dentry_t, lru_limit);
-
+ new->dentry_pool = mem_pool_new(dentry_t, mem_pool_size);
if (!new->dentry_pool)
goto out;
@@ -1604,6 +1768,7 @@ inode_table_new(size_t lru_limit, xlator_t *xl)
INIT_LIST_HEAD(&new->active);
INIT_LIST_HEAD(&new->lru);
INIT_LIST_HEAD(&new->purge);
+ INIT_LIST_HEAD(&new->invalidate);
ret = gf_asprintf(&new->name, "%s/inode", xl->name);
if (-1 == ret) {
@@ -1633,6 +1798,13 @@ out:
return new;
}
+inode_table_t *
+inode_table_new(uint32_t lru_limit, xlator_t *xl)
+{
+ /* Only fuse for now requires the inode table with invalidator */
+ return inode_table_with_invalidator(lru_limit, xl, NULL, NULL);
+}
+
int
inode_table_ctx_free(inode_table_t *table)
{
@@ -1771,6 +1943,14 @@ inode_table_destroy(inode_table_t *inode_table)
inode_table->lru_size--;
}
+ /* Same logic for invalidate list */
+ while (!list_empty(&inode_table->invalidate)) {
+ trav = list_first_entry(&inode_table->invalidate, inode_t, list);
+ inode_forget_atomic(trav, 0);
+ __inode_retire(trav);
+ inode_table->invalidate_size--;
+ }
+
while (!list_empty(&inode_table->active)) {
trav = list_first_entry(&inode_table->active, inode_t, list);
/* forget and unref the inode to retire and add it to
@@ -2280,6 +2460,7 @@ inode_dump(inode_t *inode, char *prefix)
gf_proc_dump_write("fd-count", "%u", inode->fd_count);
gf_proc_dump_write("active-fd-count", "%u", inode->active_fd_count);
gf_proc_dump_write("ref", "%u", inode->ref);
+ gf_proc_dump_write("invalidate-sent", "%d", inode->invalidate_sent);
gf_proc_dump_write("ia_type", "%d", inode->ia_type);
if (inode->_ctx) {
inode_ctx = GF_CALLOC(inode->table->ctxcount, sizeof(*inode_ctx),
@@ -2353,10 +2534,13 @@ inode_table_dump(inode_table_t *itable, char *prefix)
gf_proc_dump_write(key, "%d", itable->lru_size);
gf_proc_dump_build_key(key, prefix, "purge_size");
gf_proc_dump_write(key, "%d", itable->purge_size);
+ gf_proc_dump_build_key(key, prefix, "invalidate_size");
+ gf_proc_dump_write(key, "%d", itable->invalidate_size);
INODE_DUMP_LIST(&itable->active, key, prefix, "active");
INODE_DUMP_LIST(&itable->lru, key, prefix, "lru");
INODE_DUMP_LIST(&itable->purge, key, prefix, "purge");
+ INODE_DUMP_LIST(&itable->invalidate, key, prefix, "invalidate");
pthread_mutex_unlock(&itable->lock);
}
diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym
index 6ca6a639456..464493d6cfc 100644
--- a/libglusterfs/src/libglusterfs.sym
+++ b/libglusterfs/src/libglusterfs.sym
@@ -791,6 +791,7 @@ __inode_find
inode_find
inode_find_directory_name
inode_forget
+inode_forget_with_unref
inode_from_path
inode_grep
inode_grep_for_gfid
@@ -815,6 +816,7 @@ inode_table_destroy_all
inode_table_dump
inode_table_dump_to_dict
inode_table_new
+inode_table_with_invalidator
__inode_table_set_lru_limit
inode_table_set_lru_limit
inode_unlink