From 7e1f8e3bac201f88e2d9ef62fc69a044716dfced Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Fri, 13 Jan 2012 13:27:15 +0530 Subject: core: GFID filehandle based backend and anonymous FDs 1. What -------- This change introduces an infrastructure change in the filesystem which lets filesystem operations address objects (inodes) just by their GFID. Thus far GFID has been a unique identifier of a user-visible inode. But in terms of addressability the only mechanism thus far has been the backend filesystem path, which could be derived from the GFID only if it was cached in the inode table along with the entire set of dentry ancestry leading up to the root. This change essentially decouples addressability from the namespace. It is no longer necessary to be aware of the parent directory to address a file or directory. 2. Why ------- The biggest use case for such a feature is NFS for generating persistent filehandles. So far the technique for generating filehandles in NFS has been to encode path components so that the appropriate inode_t can be repopulated into the inode table by means of a recursive lookup of each component top-down. Another use case is the ability to perform more intelligent self-healing and rebalancing of inodes with hardlinks and also to detect renames. A derived feature from GFID filehandles is anonymous FDs. An anonymous FD is an internal USABLE "fd_t" which does not map to a user-opened file descriptor or to an internal ->open()'d fd. The ability to address a file by the GFID eliminates the need to have a persistent ->open()'d fd for the purpose of avoiding the namespace. This improves NFS read/write performance significantly by eliminating open/close calls and also fixes some of today's limitations (like keeping an FD open longer than necessary, resulting in disk space leakage). 3. 
How ------- At each storage/posix translator level, every file is hardlinked inside a hidden .glusterfs directory (under the top-level export) with the name as the ASCII-encoded standard UUID format string. For reasons of performance and scalability there is a two-tier classification of those hardlinks under directories with the initial parts of the UUID string as the directory names. For directories (which cannot be hardlinked), the approach is to use a symlink which dereferences the parent GFID path along with the basename of the directory. The parent GFID dereference will in turn be a dereference of the grandparent with the parent's basename, and so on recursively up to the root export. 4. Development --------------- 4a. To leverage the ability to address an inode by its GFID, the technique is to perform a "nameless lookup". This means populating a loc_t structure as: loc_t { pargfid: NULL parent: NULL name: NULL path: NULL gfid: GFID to be looked up [out parameter] inode: inode_new () result [in parameter] } Performing such a lookup will return in its callback an inode_t populated with the right contexts and a struct iatt which can be used to perform an inode_link () on the inode (without a parent and basename). The inode will now be hashed and linked in the inode table and findable via inode_find(). A fundamental change moving forward is that the primary fields in a loc_t structure are now going to be (pargfid, name) and (gfid) depending on the kind of FOP. So far path had been the primary field for operations. The remaining fields only serve as hints/helpers. 4b. If read/write is to be performed on an inode_t, the approach so far has been to: fd_create(), STACK_WIND(open, fd), fd_bind (in callback) and then perform STACK_WIND(read, fd) etc. With anonymous fds you can now do fd_anonymous (inode), STACK_WIND (read, fd). This results in a great boost in performance in the inbuilt NFS server. 5. 
Misc ------- The inode_ctx_put[2] has been renamed to inode_ctx_set[2] to be consistent with the rest of the codebase. Change-Id: Ie4629edf6bd32a595f4d7f01e90c0a01f16fb12f BUG: 781318 Reviewed-on: http://review.gluster.com/669 Tested-by: Gluster Build System Reviewed-by: Anand Avati --- libglusterfs/src/fd.c | 139 ++++++++++++++++++++++++++++++++++------------ libglusterfs/src/fd.h | 10 +++- libglusterfs/src/inode.c | 128 ++++++++++++++++++++++-------------------- libglusterfs/src/inode.h | 52 +++++++++++------ libglusterfs/src/xlator.c | 4 +- 5 files changed, 220 insertions(+), 113 deletions(-) (limited to 'libglusterfs') diff --git a/libglusterfs/src/fd.c b/libglusterfs/src/fd.c index 47b42aef4d2..50a564ee6df 100644 --- a/libglusterfs/src/fd.c +++ b/libglusterfs/src/fd.c @@ -35,7 +35,7 @@ gf_fd_fdtable_expand (fdtable_t *fdtable, uint32_t nr); fd_t * -_fd_ref (fd_t *fd); +__fd_ref (fd_t *fd); static int gf_fd_chain_fd_entries (fdentry_t *entries, uint32_t startidx, @@ -269,6 +269,10 @@ gf_fd_put (fdtable_t *fdtable, int32_t fd) fd_t *fdptr = NULL; fdentry_t *fde = NULL; + if (fd == -2) + /* anonymous fd */ + return; + if (fdtable == NULL || fd < 0) { gf_log_callingfn ("fd", GF_LOG_ERROR, "invalid argument"); return; @@ -336,7 +340,7 @@ gf_fd_fdptr_get (fdtable_t *fdtable, int64_t fd) fd_t * -_fd_ref (fd_t *fd) +__fd_ref (fd_t *fd) { ++fd->refcount; @@ -355,7 +359,7 @@ fd_ref (fd_t *fd) } LOCK (&fd->inode->lock); - refed_fd = _fd_ref (fd); + refed_fd = __fd_ref (fd); UNLOCK (&fd->inode->lock); return refed_fd; @@ -363,7 +367,7 @@ fd_ref (fd_t *fd) fd_t * -_fd_unref (fd_t *fd) +__fd_unref (fd_t *fd) { GF_ASSERT (fd->refcount); @@ -443,7 +447,7 @@ fd_unref (fd_t *fd) LOCK (&fd->inode->lock); { - _fd_unref (fd); + __fd_unref (fd); refcount = fd->refcount; } UNLOCK (&fd->inode->lock); @@ -457,28 +461,34 @@ fd_unref (fd_t *fd) fd_t * -fd_bind (fd_t *fd) +__fd_bind (fd_t *fd) { - inode_t *inode = NULL; + list_add (&fd->inode_list, &fd->inode->fd_list); + + return fd; +} 
+ +fd_t * +fd_bind (fd_t *fd) +{ if (!fd || !fd->inode) { gf_log_callingfn ("fd", GF_LOG_ERROR, "!fd || !fd->inode"); return NULL; } - inode = fd->inode; - LOCK (&inode->lock); + LOCK (&fd->inode->lock); { - list_add (&fd->inode_list, &inode->fd_list); + fd = __fd_bind (fd); } - UNLOCK (&inode->lock); + UNLOCK (&fd->inode->lock); return fd; } -fd_t * -fd_create (inode_t *inode, pid_t pid) +static fd_t * +__fd_create (inode_t *inode, pid_t pid) { fd_t *fd = NULL; @@ -506,22 +516,52 @@ fd_create (inode_t *inode, pid_t pid) INIT_LIST_HEAD (&fd->inode_list); LOCK_INIT (&fd->lock); +out: + return fd; +} + + +fd_t * +fd_create (inode_t *inode, pid_t pid) +{ + fd_t *fd = NULL; + + fd = __fd_create (inode, pid); + if (!fd) + goto out; + + fd = fd_ref (fd); - LOCK (&inode->lock); - { - fd = _fd_ref (fd); - } - UNLOCK (&inode->lock); out: return fd; } +static fd_t * +__fd_lookup (inode_t *inode, pid_t pid) +{ + fd_t *iter_fd = NULL; + fd_t *fd = NULL; + + if (list_empty (&inode->fd_list)) + return NULL; + + + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (!pid || iter_fd->pid == pid) { + fd = __fd_ref (iter_fd); + break; + } + } + + return fd; +} + + fd_t * fd_lookup (inode_t *inode, pid_t pid) { fd_t *fd = NULL; - fd_t *iter_fd = NULL; if (!inode) { gf_log_callingfn ("fd", GF_LOG_WARNING, "!inode"); @@ -530,21 +570,45 @@ fd_lookup (inode_t *inode, pid_t pid) LOCK (&inode->lock); { - if (list_empty (&inode->fd_list)) { - fd = NULL; - } else { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - if (pid) { - if (iter_fd->pid == pid) { - fd = _fd_ref (iter_fd); - break; - } - } else { - fd = _fd_ref (iter_fd); - break; - } - } - } + fd = __fd_lookup (inode, pid); + } + UNLOCK (&inode->lock); + + return fd; +} + + + +fd_t * +__fd_anonymous (inode_t *inode) +{ + fd_t *fd = NULL; + + fd = __fd_lookup (inode, -1); + + if (!fd) { + fd = __fd_create (inode, -1); + + if (!fd) + return NULL; + + __fd_bind (fd); + } + + __fd_ref (fd); + + return fd; +} + 
+ +fd_t * +fd_anonymous (inode_t *inode) +{ + fd_t *fd = NULL; + + LOCK (&inode->lock); + { + fd = __fd_anonymous (inode); } UNLOCK (&inode->lock); @@ -552,6 +616,13 @@ fd_lookup (inode_t *inode, pid_t pid) } +gf_boolean_t +fd_is_anonymous (fd_t *fd) +{ + return (fd && fd->pid == -1); +} + + uint8_t fd_list_empty (inode_t *inode) { diff --git a/libglusterfs/src/fd.h b/libglusterfs/src/fd.h index 3c2be972ad9..d4cd9bd0662 100644 --- a/libglusterfs/src/fd.h +++ b/libglusterfs/src/fd.h @@ -132,6 +132,14 @@ fd_t * fd_lookup (struct _inode *inode, pid_t pid); +fd_t * +fd_anonymous (inode_t *inode); + + +gf_boolean_t +fd_is_anonymous (fd_t *fd); + + uint8_t fd_list_empty (struct _inode *inode); @@ -164,7 +172,7 @@ int __fd_ctx_del (fd_t *fd, xlator_t *xlator, uint64_t *value); fd_t * -_fd_ref (fd_t *fd); +__fd_ref (fd_t *fd); void fd_ctx_dump (fd_t *fd, char *prefix); diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c index 3513691c492..c23f0f0e545 100644 --- a/libglusterfs/src/inode.c +++ b/libglusterfs/src/inode.c @@ -660,6 +660,39 @@ inode_grep (inode_table_t *table, inode_t *parent, const char *name) return inode; } +int +inode_grep_for_gfid (inode_table_t *table, inode_t *parent, const char *name, + uuid_t gfid, ia_type_t *type) +{ + inode_t *inode = NULL; + dentry_t *dentry = NULL; + int ret = -1; + + if (!table || !parent || !name) { + gf_log_callingfn (THIS->name, GF_LOG_WARNING, + "table || parent || name not found"); + return ret; + } + + pthread_mutex_lock (&table->lock); + { + dentry = __dentry_grep (table, parent, name); + + if (dentry) + inode = dentry->inode; + + if (inode) { + uuid_copy (gfid, inode->gfid); + *type = inode->ia_type; + ret = 0; + } + } + pthread_mutex_unlock (&table->lock); + + return ret; +} + + /* return 1 if gfid is of root, 0 if not */ gf_boolean_t __is_root_gfid (uuid_t gfid) @@ -998,6 +1031,7 @@ int __inode_path (inode_t *inode, const char *name, char **bufp) { inode_table_t *table = NULL; + inode_t *itrav = NULL; 
dentry_t *trav = NULL; size_t i = 0, size = 0; int64_t ret = 0; @@ -1011,8 +1045,10 @@ __inode_path (inode_t *inode, const char *name, char **bufp) table = inode->table; - for (trav = __dentry_search_arbit (inode); trav; - trav = __dentry_search_arbit (trav->parent)) { + itrav = inode; + for (trav = __dentry_search_arbit (itrav); trav; + trav = __dentry_search_arbit (itrav)) { + itrav = trav->parent; i ++; /* "/" */ i += strlen (trav->name); if (i > PATH_MAX) { @@ -1024,13 +1060,9 @@ __inode_path (inode_t *inode, const char *name, char **bufp) } } - if (!__is_root_gfid (inode->gfid) && - (i == 0)) { - gf_log (table->name, GF_LOG_WARNING, - "no dentry for non-root inode : %s", - uuid_utoa (inode->gfid)); - ret = -ENOENT; - goto out; + if (!__is_root_gfid (itrav->gfid)) { + /* "<gfid:00000000-0000-0000-0000-000000000000>"/path */ + i += GFID_STR_PFX_LEN; } if (name) { @@ -1052,13 +1084,22 @@ __inode_path (inode_t *inode, const char *name, char **bufp) i -= (len + 1); } - for (trav = __dentry_search_arbit (inode); trav; - trav = __dentry_search_arbit (trav->parent)) { + itrav = inode; + for (trav = __dentry_search_arbit (itrav); trav; + trav = __dentry_search_arbit (itrav)) { + itrav = trav->parent; len = strlen (trav->name); strncpy (buf + (i - len), trav->name, len); buf[i-len-1] = '/'; i -= (len + 1); } + + if (!__is_root_gfid (itrav->gfid)) { + snprintf (&buf[i-GFID_STR_PFX_LEN], GFID_STR_PFX_LEN, + "<gfid:%s>", uuid_utoa (itrav->gfid)); + buf[i-1] = '>'; + } + *bufp = buf; } else { ret = -ENOMEM; @@ -1323,45 +1364,47 @@ out: int -__inode_ctx_put2 (inode_t *inode, xlator_t *xlator, uint64_t value1, - uint64_t value2) +__inode_ctx_set2 (inode_t *inode, xlator_t *xlator, uint64_t *value1_p, + uint64_t *value2_p) { int ret = 0; int index = 0; - int put_idx = -1; + int set_idx = -1; if (!inode || !xlator) return -1; for (index = 0; index < xlator->graph->xl_count; index++) { if (!inode->_ctx[index].xl_key) { - if (put_idx == -1) - put_idx = index; + if (set_idx == -1) + set_idx = index; /* dont break, to check if key 
already exists further on */ } if (inode->_ctx[index].xl_key == xlator) { - put_idx = index; + set_idx = index; break; } } - if (put_idx == -1) { + if (set_idx == -1) { ret = -1; goto out;; } - inode->_ctx[put_idx].xl_key = xlator; - inode->_ctx[put_idx].value1 = value1; - inode->_ctx[put_idx].value2 = value2; + inode->_ctx[set_idx].xl_key = xlator; + if (value1_p) + inode->_ctx[set_idx].value1 = *value1_p; + if (value2_p) + inode->_ctx[set_idx].value2 = *value2_p; out: return ret; } int -inode_ctx_put2 (inode_t *inode, xlator_t *xlator, uint64_t value1, - uint64_t value2) +inode_ctx_set2 (inode_t *inode, xlator_t *xlator, uint64_t *value1_p, + uint64_t *value2_p) { int ret = 0; @@ -1370,7 +1413,7 @@ inode_ctx_put2 (inode_t *inode, xlator_t *xlator, uint64_t value1, LOCK (&inode->lock); { - ret = __inode_ctx_put2 (inode, xlator, value1, value2); + ret = __inode_ctx_set2 (inode, xlator, value1_p, value2_p); } UNLOCK (&inode->lock); @@ -1466,41 +1509,6 @@ unlock: } -int -__inode_ctx_put (inode_t *inode, xlator_t *key, uint64_t value) -{ - return __inode_ctx_put2 (inode, key, value, 0); -} - - -int -inode_ctx_put (inode_t *inode, xlator_t *key, uint64_t value) -{ - return inode_ctx_put2 (inode, key, value, 0); -} - - -int -__inode_ctx_get (inode_t *inode, xlator_t *key, uint64_t *value) -{ - return __inode_ctx_get2 (inode, key, value, 0); -} - - -int -inode_ctx_get (inode_t *inode, xlator_t *key, uint64_t *value) -{ - return inode_ctx_get2 (inode, key, value, 0); -} - - -int -inode_ctx_del (inode_t *inode, xlator_t *key, uint64_t *value) -{ - return inode_ctx_del2 (inode, key, value, 0); -} - - void inode_dump (inode_t *inode, char *prefix) { @@ -1557,7 +1565,7 @@ inode_dump (inode_t *inode, char *prefix) INIT_LIST_HEAD (&fd_wrapper->next); list_add_tail (&fd_wrapper->next, &fd_list); - fd_wrapper->fd = _fd_ref (fd); + fd_wrapper->fd = __fd_ref (fd); } } unlock: diff --git a/libglusterfs/src/inode.h b/libglusterfs/src/inode.h index df415286b20..7dda0401dcb 100644 --- 
a/libglusterfs/src/inode.h +++ b/libglusterfs/src/inode.h @@ -106,6 +106,10 @@ struct _inode { }; +#define UUID0_STR "00000000-0000-0000-0000-000000000000" +#define GFID_STR_PFX "<gfid:" UUID0_STR ">" +#define GFID_STR_PFX_LEN (sizeof (GFID_STR_PFX) - 1) + inode_table_t * inode_table_new (size_t lru_limit, xlator_t *xl); @@ -142,6 +146,10 @@ inode_rename (inode_table_t *table, inode_t *olddir, const char *oldname, inode_t * inode_grep (inode_table_t *table, inode_t *parent, const char *name); +int +inode_grep_for_gfid (inode_table_t *table, inode_t *parent, const char *name, + uuid_t gfid, ia_type_t *type); + inode_t * inode_find (inode_table_t *table, uuid_t gfid); @@ -155,32 +163,44 @@ inode_t * inode_from_path (inode_table_t *table, const char *path); int -__inode_ctx_put (inode_t *inode, xlator_t *xlator, uint64_t value); - -int -inode_ctx_put (inode_t *inode, xlator_t *xlator, uint64_t value); - -int -__inode_ctx_get (inode_t *inode, xlator_t *xlator, uint64_t *value); - -int -inode_ctx_get (inode_t *inode, xlator_t *xlator, uint64_t *value); - -int -inode_ctx_del (inode_t *inode, xlator_t *xlator, uint64_t *value); - +inode_ctx_set2 (inode_t *inode, xlator_t *xlator, uint64_t *value1, + uint64_t *value2); int -inode_ctx_put2 (inode_t *inode, xlator_t *xlator, uint64_t value1, - uint64_t value2); +__inode_ctx_set2 (inode_t *inode, xlator_t *xlator, uint64_t *value1, + uint64_t *value2); int inode_ctx_get2 (inode_t *inode, xlator_t *xlator, uint64_t *value1, uint64_t *value2); +int +__inode_ctx_get2 (inode_t *inode, xlator_t *xlator, uint64_t *value1, + uint64_t *value2); int inode_ctx_del2 (inode_t *inode, xlator_t *xlator, uint64_t *value1, uint64_t *value2); +#define __inode_ctx_set(i,x,v_p) __inode_ctx_set2(i,x,v_p,0) +#define inode_ctx_set(i,x,v_p) inode_ctx_set2(i,x,v_p,0) + +static inline int +__inode_ctx_put(inode_t *inode, xlator_t *this, uint64_t v) +{ + return __inode_ctx_set2 (inode, this, &v, 0); +} + +static inline int +inode_ctx_put(inode_t *inode, xlator_t *this, 
uint64_t v) +{ + return inode_ctx_set2(inode, this, &v, 0); +} + +#define __inode_ctx_get(i,x,v) __inode_ctx_get2(i,x,v,0) +#define inode_ctx_get(i,x,v) inode_ctx_get2(i,x,v,0) + +#define inode_ctx_del(i,x,v) inode_ctx_del2(i,x,v,0) + + gf_boolean_t __is_root_gfid (uuid_t gfid); diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c index 023cbc94030..160ac2d6322 100644 --- a/libglusterfs/src/xlator.c +++ b/libglusterfs/src/xlator.c @@ -544,8 +544,8 @@ loc_wipe (loc_t *loc) inode_unref (loc->parent); loc->parent = NULL; } - uuid_clear (loc->gfid); - uuid_clear (loc->pargfid); + + memset (loc, 0, sizeof (*loc)); } -- cgit