Diffstat (limited to 'xlators/features/locks')
-rw-r--r--  xlators/features/locks/src/Makefile.am        |   14
-rw-r--r--  xlators/features/locks/src/clear.c            |  689
-rw-r--r--  xlators/features/locks/src/clear.h            |   71
-rw-r--r--  xlators/features/locks/src/common.c           | 1966
-rw-r--r--  xlators/features/locks/src/common.h           |  258
-rw-r--r--  xlators/features/locks/src/entrylk.c          | 1458
-rw-r--r--  xlators/features/locks/src/inodelk.c          | 1548
-rw-r--r--  xlators/features/locks/src/locks-mem-types.h  |   23
-rw-r--r--  xlators/features/locks/src/locks.h            |  328
-rw-r--r--  xlators/features/locks/src/pl-messages.h      |   29
-rw-r--r--  xlators/features/locks/src/posix.c            | 6322
-rw-r--r--  xlators/features/locks/src/reservelk.c        |  587
-rw-r--r--  xlators/features/locks/tests/unit-test.c      |   96
13 files changed, 8538 insertions(+), 4851 deletions(-)
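Much of the clear.c change below is a reformatting of the clear-locks ("clrlk") command parser, which splits a request of the form <prefix>.t<type>.k<kind>[.<options>] and, for posix locks, an optional "whence,start-len" byte range. As a quick reference for that format, the following is a minimal standalone C sketch of the same parsing steps; the sample command string and the printed output are illustrative assumptions and are not taken from the patch itself:

/* Standalone sketch: mimics how clrlk_parse_args() and
 * clrlk_get_lock_range() (see clear.c below) split a clear-locks
 * request. The sample string is hypothetical. */
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
    /* hypothetical request: clear all posix locks in range 0-1024 */
    char cmd[] = "tposix.kall.0,0-1024";
    char *sptr = NULL;
    char *type = strtok_r(cmd, ".", &sptr);   /* "tposix" -> type keyword */
    char *kind = strtok_r(NULL, ".", &sptr);  /* "kall"   -> kind keyword */
    char *range = strtok_r(NULL, "/", &sptr); /* remainder is the range   */

    short whence = 0;
    int64_t start = 0, len = 0;

    if (!type || type[0] != 't' || !kind || kind[0] != 'k')
        return 1;

    printf("type=%s kind=%s\n", type + 1, kind + 1);

    /* same "whence,start-len" layout that clrlk_get_lock_range() scans */
    if (range && sscanf(range, "%hd,%" SCNd64 "-%" SCNd64, &whence, &start,
                        &len) == 3)
        printf("whence=%hd start=%" PRId64 " len=%" PRId64 "\n", whence,
               start, len);

    return 0;
}

Run against "tposix.kall.0,0-1024" this prints type=posix, kind=all, and the decoded range, mirroring what clrlk_parse_args() and clrlk_get_lock_range() establish before clrlk_clear_posixlk() walks the lock lists.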
diff --git a/xlators/features/locks/src/Makefile.am b/xlators/features/locks/src/Makefile.am index 0f79731b415..0b174c19d2d 100644 --- a/xlators/features/locks/src/Makefile.am +++ b/xlators/features/locks/src/Makefile.am @@ -1,23 +1,29 @@ +if WITH_SERVER xlator_LTLIBRARIES = locks.la +endif xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features -locks_la_LDFLAGS = -module -avoid-version +locks_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) locks_la_SOURCES = common.c posix.c entrylk.c inodelk.c reservelk.c \ - clear.c + clear.c + locks_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = locks.h common.h locks-mem-types.h clear.h +noinst_HEADERS = locks.h common.h locks-mem-types.h clear.h pl-messages.h -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS) CLEANFILES = +if WITH_SERVER uninstall-local: rm -f $(DESTDIR)$(xlatordir)/posix-locks.so install-data-hook: ln -sf locks.so $(DESTDIR)$(xlatordir)/posix-locks.so +endif diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c index 75593b8988c..ab1eac68a53 100644 --- a/xlators/features/locks/src/clear.c +++ b/xlators/features/locks/src/clear.c @@ -12,412 +12,449 @@ #include <limits.h> #include <pthread.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "compat.h" -#include "xlator.h" -#include "inode.h" -#include "logging.h" -#include "common-utils.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/xlator.h> +#include <glusterfs/logging.h> +#include <glusterfs/common-utils.h> #include "locks.h" #include "common.h" -#include "statedump.h" +#include <glusterfs/statedump.h> #include "clear.h" +const char *clrlk_type_names[CLRLK_TYPE_MAX] = { + [CLRLK_INODE] = "inode", + [CLRLK_ENTRY] = "entry", + [CLRLK_POSIX] = "posix", +}; + int -clrlk_get_kind (char *kind) +clrlk_get_kind(char *kind) { - char *clrlk_kinds[CLRLK_KIND_MAX] = {"dummy", "blocked", "granted", - "all"}; - int ret_kind = CLRLK_KIND_MAX; - int i = 0; - - for (i = CLRLK_BLOCKED; i < CLRLK_KIND_MAX; i++) { - if (!strcmp (clrlk_kinds[i], kind)) { - ret_kind = i; - break; - } + char *clrlk_kinds[CLRLK_KIND_MAX] = {"dummy", "blocked", "granted", "all"}; + int ret_kind = CLRLK_KIND_MAX; + int i = 0; + + for (i = CLRLK_BLOCKED; i < CLRLK_KIND_MAX; i++) { + if (!strcmp(clrlk_kinds[i], kind)) { + ret_kind = i; + break; } + } - return ret_kind; + return ret_kind; } int -clrlk_get_type (char *type) +clrlk_get_type(char *type) { - char *clrlk_types[CLRLK_TYPE_MAX] = {"inode", "entry", "posix"}; - int ret_type = CLRLK_TYPE_MAX; - int i = 0; - - for (i = CLRLK_INODE; i < CLRLK_TYPE_MAX; i++) { - if (!strcmp (clrlk_types[i], type)) { - ret_type = i; - break; - } + char *clrlk_types[CLRLK_TYPE_MAX] = {"inode", "entry", "posix"}; + int ret_type = CLRLK_TYPE_MAX; + int i = 0; + + for (i = CLRLK_INODE; i < CLRLK_TYPE_MAX; i++) { + if (!strcmp(clrlk_types[i], type)) { + ret_type = i; + break; } + } - return ret_type; + return ret_type; } int -clrlk_get_lock_range (char *range_str, struct gf_flock *ulock, - gf_boolean_t *chk_range) +clrlk_get_lock_range(char *range_str, struct gf_flock *ulock, + gf_boolean_t *chk_range) { - int ret = -1; - - if (!chk_range) - goto out; + int ret = -1; - if (!range_str) { - ret = 0; - *chk_range = _gf_false; - goto out; - } 
- - if (sscanf (range_str, "%hd,%"PRId64"-""%"PRId64, &ulock->l_whence, - &ulock->l_start, &ulock->l_len) != 3) { - goto out; - } + if (!chk_range) + goto out; + if (!range_str) { ret = 0; - *chk_range = _gf_true; + *chk_range = _gf_false; + goto out; + } + + if (sscanf(range_str, + "%hd,%" PRId64 "-" + "%" PRId64, + &ulock->l_whence, &ulock->l_start, &ulock->l_len) != 3) { + goto out; + } + + ret = 0; + *chk_range = _gf_true; out: - return ret; + return ret; } int -clrlk_parse_args (const char* cmd, clrlk_args *args) +clrlk_parse_args(const char *cmd, clrlk_args *args) { - char *opts = NULL; - char *cur = NULL; - char *tok = NULL; - char *sptr = NULL; - char *free_ptr = NULL; - char kw[KW_MAX] = {[KW_TYPE] = 't', - [KW_KIND] = 'k', - }; - int ret = -1; - int i = 0; - - GF_ASSERT (cmd); - free_ptr = opts = GF_CALLOC (1, strlen (cmd), gf_common_mt_char); - if (!opts) - goto out; - - if (sscanf (cmd, GF_XATTR_CLRLK_CMD".%s", opts) < 1) { - ret = -1; - goto out; + char *opts = NULL; + char *cur = NULL; + char *tok = NULL; + char *sptr = NULL; + char *free_ptr = NULL; + char kw[KW_MAX] = { + [KW_TYPE] = 't', + [KW_KIND] = 'k', + }; + int ret = -1; + int i = 0; + + GF_ASSERT(cmd); + free_ptr = opts = GF_CALLOC(1, strlen(cmd), gf_common_mt_char); + if (!opts) + goto out; + + if (sscanf(cmd, GF_XATTR_CLRLK_CMD ".%s", opts) < 1) { + ret = -1; + goto out; + } + + /*clr_lk_prefix.ttype.kkind.args, args - type specific*/ + cur = opts; + for (i = 0; i < KW_MAX && (tok = strtok_r(cur, ".", &sptr)); + cur = NULL, i++) { + if (tok[0] != kw[i]) { + ret = -1; + goto out; } - - /*clr_lk_prefix.ttype.kkind.args, args - type specific*/ - cur = opts; - for (i = 0; i < KW_MAX && (tok = strtok_r (cur, ".", &sptr)); - cur = NULL, i++) { - if (tok[0] != kw[i]) { - ret = -1; - goto out; - } - if (i == KW_TYPE) - args->type = clrlk_get_type (tok+1); - if (i == KW_KIND) - args->kind = clrlk_get_kind (tok+1); - } - - if ((args->type == CLRLK_TYPE_MAX) || (args->kind == CLRLK_KIND_MAX)) - goto out; - - /*optional args, neither range nor basename can 'legally' contain - * "/" in them*/ - tok = strtok_r (NULL, "/", &sptr); - if (tok) - args->opts = gf_strdup (tok); - - ret = 0; + if (i == KW_TYPE) + args->type = clrlk_get_type(tok + 1); + if (i == KW_KIND) + args->kind = clrlk_get_kind(tok + 1); + } + + if ((args->type == CLRLK_TYPE_MAX) || (args->kind == CLRLK_KIND_MAX)) + goto out; + + /*optional args, neither range nor basename can 'legally' contain + * "/" in them*/ + tok = strtok_r(NULL, "/", &sptr); + if (tok) + args->opts = gf_strdup(tok); + + ret = 0; out: - GF_FREE (free_ptr); - return ret; + GF_FREE(free_ptr); + return ret; } int -clrlk_clear_posixlk (xlator_t *this, pl_inode_t *pl_inode, clrlk_args *args, - int *blkd, int *granted, int *op_errno) +clrlk_clear_posixlk(xlator_t *this, pl_inode_t *pl_inode, clrlk_args *args, + int *blkd, int *granted, int *op_errno) { - posix_lock_t *plock = NULL; - posix_lock_t *tmp = NULL; - struct gf_flock ulock = {0, }; - int ret = -1; - int bcount = 0; - int gcount = 0; - gf_boolean_t chk_range = _gf_false; - - if (clrlk_get_lock_range (args->opts, &ulock, &chk_range)) { - *op_errno = EINVAL; - goto out; - } - - pthread_mutex_lock (&pl_inode->mutex); + posix_lock_t *plock = NULL; + posix_lock_t *tmp = NULL; + struct gf_flock ulock = { + 0, + }; + int ret = -1; + int bcount = 0; + int gcount = 0; + gf_boolean_t chk_range = _gf_false; + + if (clrlk_get_lock_range(args->opts, &ulock, &chk_range)) { + *op_errno = EINVAL; + goto out; + } + + pthread_mutex_lock(&pl_inode->mutex); 
+ { + list_for_each_entry_safe(plock, tmp, &pl_inode->ext_list, list) { - list_for_each_entry_safe (plock, tmp, &pl_inode->ext_list, - list) { - if ((plock->blocked && - !(args->kind & CLRLK_BLOCKED)) || - (!plock->blocked && - !(args->kind & CLRLK_GRANTED))) - continue; - - if (chk_range && - (plock->user_flock.l_whence != ulock.l_whence - || plock->user_flock.l_start != ulock.l_start - || plock->user_flock.l_len != ulock.l_len)) - continue; - - list_del_init (&plock->list); - if (plock->blocked) { - bcount++; - pl_trace_out (this, plock->frame, NULL, NULL, - F_SETLKW, &plock->user_flock, - -1, EAGAIN, NULL); - - STACK_UNWIND_STRICT (lk, plock->frame, -1, EAGAIN, - &plock->user_flock, NULL); - - } else { - gcount++; - } - GF_FREE (plock); - } + if ((plock->blocked && !(args->kind & CLRLK_BLOCKED)) || + (!plock->blocked && !(args->kind & CLRLK_GRANTED))) + continue; + + if (chk_range && (plock->user_flock.l_whence != ulock.l_whence || + plock->user_flock.l_start != ulock.l_start || + plock->user_flock.l_len != ulock.l_len)) + continue; + + list_del_init(&plock->list); + if (plock->blocked) { + bcount++; + pl_trace_out(this, plock->frame, NULL, NULL, F_SETLKW, + &plock->user_flock, -1, EINTR, NULL); + + STACK_UNWIND_STRICT(lk, plock->frame, -1, EINTR, + &plock->user_flock, NULL); + + } else { + gcount++; + } + __destroy_lock(plock); } - pthread_mutex_unlock (&pl_inode->mutex); - grant_blocked_locks (this, pl_inode); - ret = 0; + } + pthread_mutex_unlock(&pl_inode->mutex); + grant_blocked_locks(this, pl_inode); + ret = 0; out: - *blkd = bcount; - *granted = gcount; - return ret; + *blkd = bcount; + *granted = gcount; + return ret; } /* Returns 0 on success and -1 on failure */ int -clrlk_clear_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, - clrlk_args *args, int *blkd, int *granted, int *op_errno) +clrlk_clear_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, + clrlk_args *args, int *blkd, int *granted, int *op_errno) { - pl_inode_lock_t *ilock = NULL; - pl_inode_lock_t *tmp = NULL; - struct gf_flock ulock = {0, }; - int ret = -1; - int bcount = 0; - int gcount = 0; - gf_boolean_t chk_range = _gf_false; - struct list_head released; - - INIT_LIST_HEAD (&released); - if (clrlk_get_lock_range (args->opts, &ulock, &chk_range)) { - *op_errno = EINVAL; - goto out; - } - - if (args->kind & CLRLK_BLOCKED) - goto blkd; - - if (args->kind & CLRLK_GRANTED) - goto granted; + posix_locks_private_t *priv; + pl_inode_lock_t *ilock = NULL; + pl_inode_lock_t *tmp = NULL; + struct gf_flock ulock = { + 0, + }; + int ret = -1; + int bcount = 0; + int gcount = 0; + gf_boolean_t chk_range = _gf_false; + struct list_head *pcontend = NULL; + struct list_head released; + struct list_head contend; + struct timespec now = {}; + + INIT_LIST_HEAD(&released); + + priv = this->private; + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD(pcontend); + timespec_now(&now); + } + + if (clrlk_get_lock_range(args->opts, &ulock, &chk_range)) { + *op_errno = EINVAL; + goto out; + } + + if (args->kind & CLRLK_BLOCKED) + goto blkd; + + if (args->kind & CLRLK_GRANTED) + goto granted; blkd: - pthread_mutex_lock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(ilock, tmp, &dom->blocked_inodelks, + blocked_locks) { - list_for_each_entry_safe (ilock, tmp, &dom->blocked_inodelks, - blocked_locks) { - if (chk_range && - (ilock->user_flock.l_whence != ulock.l_whence - || ilock->user_flock.l_start != ulock.l_start - || ilock->user_flock.l_len 
!= ulock.l_len)) - continue; - - bcount++; - list_del_init (&ilock->blocked_locks); - list_add (&ilock->blocked_locks, &released); - } - } - pthread_mutex_unlock (&pl_inode->mutex); - - list_for_each_entry_safe (ilock, tmp, &released, blocked_locks) { - list_del_init (&ilock->blocked_locks); - pl_trace_out (this, ilock->frame, NULL, NULL, F_SETLKW, - &ilock->user_flock, -1, EAGAIN, - ilock->volume); - STACK_UNWIND_STRICT (inodelk, ilock->frame, -1, - EAGAIN, NULL); - //No need to take lock as the locks are only in one list - __pl_inodelk_unref (ilock); + if (chk_range && (ilock->user_flock.l_whence != ulock.l_whence || + ilock->user_flock.l_start != ulock.l_start || + ilock->user_flock.l_len != ulock.l_len)) + continue; + + bcount++; + list_del_init(&ilock->client_list); + list_del_init(&ilock->blocked_locks); + list_add(&ilock->blocked_locks, &released); } + } + pthread_mutex_unlock(&pl_inode->mutex); - if (!(args->kind & CLRLK_GRANTED)) { - ret = 0; - goto out; + if (!list_empty(&released)) { + list_for_each_entry_safe(ilock, tmp, &released, blocked_locks) + { + list_del_init(&ilock->blocked_locks); + pl_trace_out(this, ilock->frame, NULL, NULL, F_SETLKW, + &ilock->user_flock, -1, EAGAIN, ilock->volume); + STACK_UNWIND_STRICT(inodelk, ilock->frame, -1, EAGAIN, NULL); + // No need to take lock as the locks are only in one list + __pl_inodelk_unref(ilock); } + } + + if (!(args->kind & CLRLK_GRANTED)) { + ret = 0; + goto out; + } granted: - pthread_mutex_lock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(ilock, tmp, &dom->inodelk_list, list) { - list_for_each_entry_safe (ilock, tmp, &dom->inodelk_list, - list) { - if (chk_range && - (ilock->user_flock.l_whence != ulock.l_whence - || ilock->user_flock.l_start != ulock.l_start - || ilock->user_flock.l_len != ulock.l_len)) - continue; - - gcount++; - list_del_init (&ilock->list); - list_add (&ilock->list, &released); - } + if (chk_range && (ilock->user_flock.l_whence != ulock.l_whence || + ilock->user_flock.l_start != ulock.l_start || + ilock->user_flock.l_len != ulock.l_len)) + continue; + + gcount++; + list_del_init(&ilock->client_list); + list_del_init(&ilock->list); + list_add(&ilock->list, &released); } - pthread_mutex_unlock (&pl_inode->mutex); + } + pthread_mutex_unlock(&pl_inode->mutex); - list_for_each_entry_safe (ilock, tmp, &released, list) { - list_del_init (&ilock->list); - //No need to take lock as the locks are only in one list - __pl_inodelk_unref (ilock); - } + list_for_each_entry_safe(ilock, tmp, &released, list) + { + list_del_init(&ilock->list); + // No need to take lock as the locks are only in one list + __pl_inodelk_unref(ilock); + } - ret = 0; + ret = 0; out: - grant_blocked_inode_locks (this, pl_inode, dom); - *blkd = bcount; - *granted = gcount; - return ret; + grant_blocked_inode_locks(this, pl_inode, dom, &now, pcontend); + if (pcontend != NULL) { + inodelk_contention_notify(this, pcontend); + } + *blkd = bcount; + *granted = gcount; + return ret; } /* Returns 0 on success and -1 on failure */ int -clrlk_clear_entrylk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, - clrlk_args *args, int *blkd, int *granted, int *op_errno) +clrlk_clear_entrylk(xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, + clrlk_args *args, int *blkd, int *granted, int *op_errno) { - pl_entry_lock_t *elock = NULL; - pl_entry_lock_t *tmp = NULL; - int bcount = 0; - int gcount = 0; - int ret = -1; - struct list_head removed; - struct list_head released; - - INIT_LIST_HEAD 
(&released); - if (args->kind & CLRLK_BLOCKED) - goto blkd; - - if (args->kind & CLRLK_GRANTED) - goto granted; + posix_locks_private_t *priv; + pl_entry_lock_t *elock = NULL; + pl_entry_lock_t *tmp = NULL; + int bcount = 0; + int gcount = 0; + int ret = -1; + struct list_head *pcontend = NULL; + struct list_head removed; + struct list_head released; + struct list_head contend; + struct timespec now; + + INIT_LIST_HEAD(&released); + + priv = this->private; + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD(pcontend); + timespec_now(&now); + } + + if (args->kind & CLRLK_BLOCKED) + goto blkd; + + if (args->kind & CLRLK_GRANTED) + goto granted; blkd: - pthread_mutex_lock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(elock, tmp, &dom->blocked_entrylks, + blocked_locks) { - list_for_each_entry_safe (elock, tmp, &dom->blocked_entrylks, - blocked_locks) { - if (args->opts) { - if (!elock->basename || - strcmp (elock->basename, args->opts)) - continue; - } - - bcount++; - - list_del_init (&elock->blocked_locks); - list_add_tail (&elock->blocked_locks, &released); - } - } - pthread_mutex_unlock (&pl_inode->mutex); + if (args->opts) { + if (!elock->basename || strcmp(elock->basename, args->opts)) + continue; + } - list_for_each_entry_safe (elock, tmp, &released, blocked_locks) { - list_del_init (&elock->blocked_locks); - entrylk_trace_out (this, elock->frame, elock->volume, NULL, NULL, - elock->basename, ENTRYLK_LOCK, elock->type, - -1, EAGAIN); - STACK_UNWIND_STRICT (entrylk, elock->frame, -1, EAGAIN, NULL); + bcount++; - __pl_entrylk_unref (elock); + list_del_init(&elock->client_list); + list_del_init(&elock->blocked_locks); + list_add_tail(&elock->blocked_locks, &released); } + } + pthread_mutex_unlock(&pl_inode->mutex); + + if (!list_empty(&released)) { + list_for_each_entry_safe(elock, tmp, &released, blocked_locks) + { + list_del_init(&elock->blocked_locks); + entrylk_trace_out(this, elock->frame, elock->volume, NULL, NULL, + elock->basename, ENTRYLK_LOCK, elock->type, -1, + EAGAIN); + STACK_UNWIND_STRICT(entrylk, elock->frame, -1, EAGAIN, NULL); - if (!(args->kind & CLRLK_GRANTED)) { - ret = 0; - goto out; + __pl_entrylk_unref(elock); } + } + + if (!(args->kind & CLRLK_GRANTED)) { + ret = 0; + goto out; + } granted: - INIT_LIST_HEAD (&removed); - pthread_mutex_lock (&pl_inode->mutex); + INIT_LIST_HEAD(&removed); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(elock, tmp, &dom->entrylk_list, domain_list) { - list_for_each_entry_safe (elock, tmp, &dom->entrylk_list, - domain_list) { - if (args->opts) { - if (!elock->basename || - strcmp (elock->basename, args->opts)) - continue; - } - - gcount++; - list_del_init (&elock->domain_list); - list_add_tail (&elock->domain_list, &removed); - - __pl_entrylk_unref (elock); - } + if (args->opts) { + if (!elock->basename || strcmp(elock->basename, args->opts)) + continue; + } + + gcount++; + list_del_init(&elock->client_list); + list_del_init(&elock->domain_list); + list_add_tail(&elock->domain_list, &removed); + + __pl_entrylk_unref(elock); } - pthread_mutex_unlock (&pl_inode->mutex); + } + pthread_mutex_unlock(&pl_inode->mutex); - grant_blocked_entry_locks (this, pl_inode, dom); + grant_blocked_entry_locks(this, pl_inode, dom, &now, pcontend); + if (pcontend != NULL) { + entrylk_contention_notify(this, pcontend); + } - ret = 0; + ret = 0; out: - *blkd = bcount; - *granted = gcount; - return ret; + *blkd = bcount; + *granted = gcount; + return ret; } int 
-clrlk_clear_lks_in_all_domains (xlator_t *this, pl_inode_t *pl_inode, - clrlk_args *args, int *blkd, int *granted, - int *op_errno) +clrlk_clear_lks_in_all_domains(xlator_t *this, pl_inode_t *pl_inode, + clrlk_args *args, int *blkd, int *granted, + int *op_errno) { - pl_dom_list_t *dom = NULL; - int ret = -1; - int tmp_bcount = 0; - int tmp_gcount = 0; - - if (list_empty (&pl_inode->dom_list)) { - ret = 0; - goto out; - } + pl_dom_list_t *dom = NULL; + int ret = -1; + int tmp_bcount = 0; + int tmp_gcount = 0; - list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { - tmp_bcount = tmp_gcount = 0; - - switch (args->type) - { - case CLRLK_INODE: - ret = clrlk_clear_inodelk (this, pl_inode, dom, args, - &tmp_bcount, &tmp_gcount, - op_errno); - if (ret) - goto out; - break; - case CLRLK_ENTRY: - ret = clrlk_clear_entrylk (this, pl_inode, dom, args, - &tmp_bcount, &tmp_gcount, - op_errno); - if (ret) - goto out; - break; - } - - *blkd += tmp_bcount; - *granted += tmp_gcount; + if (list_empty(&pl_inode->dom_list)) { + ret = 0; + goto out; + } + + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + tmp_bcount = tmp_gcount = 0; + + switch (args->type) { + case CLRLK_INODE: + ret = clrlk_clear_inodelk(this, pl_inode, dom, args, + &tmp_bcount, &tmp_gcount, op_errno); + if (ret) + goto out; + break; + case CLRLK_ENTRY: + ret = clrlk_clear_entrylk(this, pl_inode, dom, args, + &tmp_bcount, &tmp_gcount, op_errno); + if (ret) + goto out; + break; } - ret = 0; + *blkd += tmp_bcount; + *granted += tmp_gcount; + } + + ret = 0; out: - return ret; + return ret; } diff --git a/xlators/features/locks/src/clear.h b/xlators/features/locks/src/clear.h index 511f3f74ae5..bc118cb1b81 100644 --- a/xlators/features/locks/src/clear.h +++ b/xlators/features/locks/src/clear.h @@ -10,67 +10,64 @@ #ifndef __CLEAR_H__ #define __CLEAR_H__ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "compat-errno.h" -#include "stack.h" -#include "call-stub.h" +#include <glusterfs/compat-errno.h> +#include <glusterfs/stack.h> +#include <glusterfs/call-stub.h> #include "locks.h" typedef enum { - CLRLK_INODE, - CLRLK_ENTRY, - CLRLK_POSIX, - CLRLK_TYPE_MAX + CLRLK_INODE, + CLRLK_ENTRY, + CLRLK_POSIX, + CLRLK_TYPE_MAX } clrlk_type; +extern const char *clrlk_type_names[]; + typedef enum { - CLRLK_BLOCKED = 1, - CLRLK_GRANTED, - CLRLK_ALL, - CLRLK_KIND_MAX + CLRLK_BLOCKED = 1, + CLRLK_GRANTED, + CLRLK_ALL, + CLRLK_KIND_MAX } clrlk_kind; typedef enum { - KW_TYPE, - KW_KIND, - /*add new keywords here*/ - KW_MAX + KW_TYPE, + KW_KIND, + /*add new keywords here*/ + KW_MAX } clrlk_opts; struct _clrlk_args; typedef struct _clrlk_args clrlk_args; struct _clrlk_args { - int type; - int kind; - char *opts; + int type; + int kind; + char *opts; }; int -clrlk_get__kind (char *kind); +clrlk_get__kind(char *kind); int -clrlk_get_type (char *type); +clrlk_get_type(char *type); int -clrlk_get_lock_range (char *range_str, struct gf_flock *ulock, - gf_boolean_t *chk_range); +clrlk_get_lock_range(char *range_str, struct gf_flock *ulock, + gf_boolean_t *chk_range); int -clrlk_parse_args (const char* cmd, clrlk_args *args); +clrlk_parse_args(const char *cmd, clrlk_args *args); int -clrlk_clear_posixlk (xlator_t *this, pl_inode_t *pl_inode, clrlk_args *args, - int *blkd, int *granted, int *op_errno); +clrlk_clear_posixlk(xlator_t *this, pl_inode_t *pl_inode, clrlk_args *args, + int *blkd, int *granted, int *op_errno); int -clrlk_clear_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, - clrlk_args 
*args, int *blkd, int *granted, int *op_errno); +clrlk_clear_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, + clrlk_args *args, int *blkd, int *granted, int *op_errno); int -clrlk_clear_entrylk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, - clrlk_args *args, int *blkd, int *granted, int *op_errno); +clrlk_clear_entrylk(xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, + clrlk_args *args, int *blkd, int *granted, int *op_errno); int -clrlk_clear_lks_in_all_domains (xlator_t *this, pl_inode_t *pl_inode, - clrlk_args *args, int *blkd, int *granted, - int *op_errno); +clrlk_clear_lks_in_all_domains(xlator_t *this, pl_inode_t *pl_inode, + clrlk_args *args, int *blkd, int *granted, + int *op_errno); #endif /* __CLEAR_H__ */ diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c index f6c71c1cf86..a2c6be93e03 100644 --- a/xlators/features/locks/src/common.c +++ b/xlators/features/locks/src/common.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2006-2012, 2015-2016 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -12,737 +12,775 @@ #include <limits.h> #include <pthread.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "compat.h" -#include "xlator.h" -#include "inode.h" -#include "logging.h" -#include "common-utils.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/logging.h> +#include <glusterfs/syncop.h> #include "locks.h" #include "common.h" - static int -__is_lock_grantable (pl_inode_t *pl_inode, posix_lock_t *lock); +__is_lock_grantable(pl_inode_t *pl_inode, posix_lock_t *lock); static void -__insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock); +__insert_and_merge(pl_inode_t *pl_inode, posix_lock_t *lock); static int -pl_send_prelock_unlock (xlator_t *this, pl_inode_t *pl_inode, - posix_lock_t *old_lock); +pl_send_prelock_unlock(xlator_t *this, pl_inode_t *pl_inode, + posix_lock_t *old_lock); static pl_dom_list_t * -__allocate_domain (const char *volume) +__allocate_domain(const char *volume) { - pl_dom_list_t *dom = NULL; + pl_dom_list_t *dom = NULL; - dom = GF_CALLOC (1, sizeof (*dom), - gf_locks_mt_pl_dom_list_t); - if (!dom) - goto out; + dom = GF_CALLOC(1, sizeof(*dom), gf_locks_mt_pl_dom_list_t); + if (!dom) + goto out; - dom->domain = gf_strdup(volume); - if (!dom->domain) - goto out; + dom->domain = gf_strdup(volume); + if (!dom->domain) + goto out; - gf_log ("posix-locks", GF_LOG_TRACE, - "New domain allocated: %s", dom->domain); + gf_log("posix-locks", GF_LOG_TRACE, "New domain allocated: %s", + dom->domain); - INIT_LIST_HEAD (&dom->inode_list); - INIT_LIST_HEAD (&dom->entrylk_list); - INIT_LIST_HEAD (&dom->blocked_entrylks); - INIT_LIST_HEAD (&dom->inodelk_list); - INIT_LIST_HEAD (&dom->blocked_inodelks); + INIT_LIST_HEAD(&dom->inode_list); + INIT_LIST_HEAD(&dom->entrylk_list); + INIT_LIST_HEAD(&dom->blocked_entrylks); + INIT_LIST_HEAD(&dom->inodelk_list); + INIT_LIST_HEAD(&dom->blocked_inodelks); out: - if (dom && (NULL == dom->domain)) { - GF_FREE (dom); - dom = NULL; - } + if (dom && (NULL == dom->domain)) { + GF_FREE(dom); + dom = NULL; + } - return dom; + return dom; } /* Returns domain for the lock. 
If domain is not present, * allocates a domain and returns it */ pl_dom_list_t * -get_domain (pl_inode_t *pl_inode, const char *volume) +get_domain(pl_inode_t *pl_inode, const char *volume) { - pl_dom_list_t *dom = NULL; + pl_dom_list_t *dom = NULL; - GF_VALIDATE_OR_GOTO ("posix-locks", pl_inode, out); - GF_VALIDATE_OR_GOTO ("posix-locks", volume, out); + GF_VALIDATE_OR_GOTO("posix-locks", pl_inode, out); + GF_VALIDATE_OR_GOTO("posix-locks", volume, out); - pthread_mutex_lock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) { - list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { - if (strcmp (dom->domain, volume) == 0) - goto unlock; - } - - dom = __allocate_domain (volume); - if (dom) - list_add (&dom->inode_list, &pl_inode->dom_list); + if (strcmp(dom->domain, volume) == 0) + goto unlock; } + + dom = __allocate_domain(volume); + if (dom) + list_add(&dom->inode_list, &pl_inode->dom_list); + } unlock: - pthread_mutex_unlock (&pl_inode->mutex); - if (dom) { - gf_log ("posix-locks", GF_LOG_TRACE, "Domain %s found", volume); - } else { - gf_log ("posix-locks", GF_LOG_TRACE, "Domain %s not found", volume); - } + pthread_mutex_unlock(&pl_inode->mutex); + if (dom) { + gf_log("posix-locks", GF_LOG_TRACE, "Domain %s found", volume); + } else { + gf_log("posix-locks", GF_LOG_TRACE, "Domain %s not found", volume); + } out: - return dom; + return dom; } unsigned long -fd_to_fdnum (fd_t *fd) +fd_to_fdnum(fd_t *fd) { - return ((unsigned long) fd); + return ((unsigned long)fd); } fd_t * -fd_from_fdnum (posix_lock_t *lock) +fd_from_fdnum(posix_lock_t *lock) { - return ((fd_t *) lock->fd_num); + return ((fd_t *)lock->fd_num); } int -__pl_inode_is_empty (pl_inode_t *pl_inode) +__pl_inode_is_empty(pl_inode_t *pl_inode) { - pl_dom_list_t *dom = NULL; - int is_empty = 1; - - if (!list_empty (&pl_inode->ext_list)) - is_empty = 0; - - list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { - if (!list_empty (&dom->entrylk_list)) - is_empty = 0; - - if (!list_empty (&dom->inodelk_list)) - is_empty = 0; - } - - return is_empty; + return (list_empty(&pl_inode->ext_list)); } void -pl_print_locker (char *str, int size, xlator_t *this, call_frame_t *frame) +pl_print_locker(char *str, int size, xlator_t *this, call_frame_t *frame) { - snprintf (str, size, "Pid=%llu, lk-owner=%s, Client=%p, Frame=%llu", - (unsigned long long) frame->root->pid, - lkowner_utoa (&frame->root->lk_owner), - frame->root->client, - (unsigned long long) frame->root->unique); + snprintf(str, size, "Pid=%llu, lk-owner=%s, Client=%p, Frame=%llu", + (unsigned long long)frame->root->pid, + lkowner_utoa(&frame->root->lk_owner), frame->root->client, + (unsigned long long)frame->root->unique); } - void -pl_print_lockee (char *str, int size, fd_t *fd, loc_t *loc) +pl_print_lockee(char *str, int size, fd_t *fd, loc_t *loc) { - inode_t *inode = NULL; - char *ipath = NULL; - int ret = 0; + inode_t *inode = NULL; + char *ipath = NULL; + int ret = 0; - if (fd) - inode = fd->inode; - if (loc) - inode = loc->inode; + if (fd) + inode = fd->inode; + if (loc) + inode = loc->inode; - if (!inode) { - snprintf (str, size, "<nul>"); - return; - } + if (!inode) { + snprintf(str, size, "<nul>"); + return; + } - if (loc && loc->path) { - ipath = gf_strdup (loc->path); - } else { - ret = inode_path (inode, NULL, &ipath); - if (ret <= 0) - ipath = NULL; - } + if (loc && loc->path) { + ipath = gf_strdup(loc->path); + } else { + ret = inode_path(inode, NULL, &ipath); + if (ret <= 0) + 
ipath = NULL; + } - snprintf (str, size, "gfid=%s, fd=%p, path=%s", - uuid_utoa (inode->gfid), fd, - ipath ? ipath : "<nul>"); + snprintf(str, size, "gfid=%s, fd=%p, path=%s", uuid_utoa(inode->gfid), fd, + ipath ? ipath : "<nul>"); - GF_FREE (ipath); + GF_FREE(ipath); } - void -pl_print_lock (char *str, int size, int cmd, - struct gf_flock *flock, gf_lkowner_t *owner) +pl_print_lock(char *str, int size, int cmd, struct gf_flock *flock, + gf_lkowner_t *owner) { - char *cmd_str = NULL; - char *type_str = NULL; + char *cmd_str = NULL; + char *type_str = NULL; - switch (cmd) { + switch (cmd) { #if F_GETLK != F_GETLK64 case F_GETLK64: #endif case F_GETLK: - cmd_str = "GETLK"; - break; + cmd_str = "GETLK"; + break; #if F_SETLK != F_SETLK64 case F_SETLK64: #endif case F_SETLK: - cmd_str = "SETLK"; - break; + cmd_str = "SETLK"; + break; #if F_SETLKW != F_SETLKW64 case F_SETLKW64: #endif case F_SETLKW: - cmd_str = "SETLKW"; - break; + cmd_str = "SETLKW"; + break; default: - cmd_str = "UNKNOWN"; - break; - } + cmd_str = "UNKNOWN"; + break; + } - switch (flock->l_type) { + switch (flock->l_type) { case F_RDLCK: - type_str = "READ"; - break; + type_str = "READ"; + break; case F_WRLCK: - type_str = "WRITE"; - break; + type_str = "WRITE"; + break; case F_UNLCK: - type_str = "UNLOCK"; - break; + type_str = "UNLOCK"; + break; default: - type_str = "UNKNOWN"; - break; - } - - snprintf (str, size, "lock=FCNTL, cmd=%s, type=%s, " - "start=%llu, len=%llu, pid=%llu, lk-owner=%s", - cmd_str, type_str, (unsigned long long) flock->l_start, - (unsigned long long) flock->l_len, - (unsigned long long) flock->l_pid, - lkowner_utoa (owner)); + type_str = "UNKNOWN"; + break; + } + + snprintf(str, size, + "lock=FCNTL, cmd=%s, type=%s, " + "start=%llu, len=%llu, pid=%llu, lk-owner=%s", + cmd_str, type_str, (unsigned long long)flock->l_start, + (unsigned long long)flock->l_len, (unsigned long long)flock->l_pid, + lkowner_utoa(owner)); } - void -pl_trace_in (xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, - int cmd, struct gf_flock *flock, const char *domain) +pl_trace_in(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, int cmd, + struct gf_flock *flock, const char *domain) { - posix_locks_private_t *priv = NULL; - char pl_locker[256]; - char pl_lockee[256]; - char pl_lock[256]; - - priv = this->private; + posix_locks_private_t *priv = this->private; + char pl_locker[256]; + char pl_lockee[256]; + char pl_lock[256]; - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_print_locker (pl_locker, 256, this, frame); - pl_print_lockee (pl_lockee, 256, fd, loc); - if (domain) - pl_print_inodelk (pl_lock, 256, cmd, flock, domain); - else - pl_print_lock (pl_lock, 256, cmd, flock, &frame->root->lk_owner); + pl_print_locker(pl_locker, 256, this, frame); + pl_print_lockee(pl_lockee, 256, fd, loc); + if (domain) + pl_print_inodelk(pl_lock, 256, cmd, flock, domain); + else + pl_print_lock(pl_lock, 256, cmd, flock, &frame->root->lk_owner); - gf_log (this->name, GF_LOG_INFO, - "[REQUEST] Locker = {%s} Lockee = {%s} Lock = {%s}", - pl_locker, pl_lockee, pl_lock); + gf_log(this->name, GF_LOG_INFO, + "[REQUEST] Locker = {%s} Lockee = {%s} Lock = {%s}", pl_locker, + pl_lockee, pl_lock); } - void -pl_print_verdict (char *str, int size, int op_ret, int op_errno) +pl_print_verdict(char *str, int size, int op_ret, int op_errno) { - char *verdict = NULL; - - if (op_ret == 0) { - verdict = "GRANTED"; - } else { - switch (op_errno) { - case EAGAIN: - verdict = "TRYAGAIN"; - break; - default: - verdict = strerror 
(op_errno); - } + char *verdict = NULL; + + if (op_ret == 0) { + verdict = "GRANTED"; + } else { + switch (op_errno) { + case EAGAIN: + verdict = "TRYAGAIN"; + break; + default: + verdict = strerror(op_errno); } + } - snprintf (str, size, "%s", verdict); + snprintf(str, size, "%s", verdict); } - void -pl_trace_out (xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, - int cmd, struct gf_flock *flock, int op_ret, int op_errno, const char *domain) +pl_trace_out(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, int cmd, + struct gf_flock *flock, int op_ret, int op_errno, + const char *domain) { - posix_locks_private_t *priv = NULL; - char pl_locker[256]; - char pl_lockee[256]; - char pl_lock[256]; - char verdict[32]; + posix_locks_private_t *priv = NULL; + char pl_locker[256]; + char pl_lockee[256]; + char pl_lock[256]; + char verdict[32]; - priv = this->private; + priv = this->private; - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_print_locker (pl_locker, 256, this, frame); - pl_print_lockee (pl_lockee, 256, fd, loc); - if (domain) - pl_print_inodelk (pl_lock, 256, cmd, flock, domain); - else - pl_print_lock (pl_lock, 256, cmd, flock, &frame->root->lk_owner); + pl_print_locker(pl_locker, 256, this, frame); + pl_print_lockee(pl_lockee, 256, fd, loc); + if (domain) + pl_print_inodelk(pl_lock, 256, cmd, flock, domain); + else + pl_print_lock(pl_lock, 256, cmd, flock, &frame->root->lk_owner); - pl_print_verdict (verdict, 32, op_ret, op_errno); + pl_print_verdict(verdict, 32, op_ret, op_errno); - gf_log (this->name, GF_LOG_INFO, - "[%s] Locker = {%s} Lockee = {%s} Lock = {%s}", - verdict, pl_locker, pl_lockee, pl_lock); + gf_log(this->name, GF_LOG_INFO, + "[%s] Locker = {%s} Lockee = {%s} Lock = {%s}", verdict, pl_locker, + pl_lockee, pl_lock); } - void -pl_trace_block (xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, - int cmd, struct gf_flock *flock, const char *domain) +pl_trace_block(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, + int cmd, struct gf_flock *flock, const char *domain) { - posix_locks_private_t *priv = NULL; - char pl_locker[256]; - char pl_lockee[256]; - char pl_lock[256]; - - priv = this->private; + posix_locks_private_t *priv = this->private; + char pl_locker[256]; + char pl_lockee[256]; + char pl_lock[256]; - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_print_locker (pl_locker, 256, this, frame); - pl_print_lockee (pl_lockee, 256, fd, loc); - if (domain) - pl_print_inodelk (pl_lock, 256, cmd, flock, domain); - else - pl_print_lock (pl_lock, 256, cmd, flock, &frame->root->lk_owner); + pl_print_locker(pl_locker, 256, this, frame); + pl_print_lockee(pl_lockee, 256, fd, loc); + if (domain) + pl_print_inodelk(pl_lock, 256, cmd, flock, domain); + else + pl_print_lock(pl_lock, 256, cmd, flock, &frame->root->lk_owner); - gf_log (this->name, GF_LOG_INFO, - "[BLOCKED] Locker = {%s} Lockee = {%s} Lock = {%s}", - pl_locker, pl_lockee, pl_lock); + gf_log(this->name, GF_LOG_INFO, + "[BLOCKED] Locker = {%s} Lockee = {%s} Lock = {%s}", pl_locker, + pl_lockee, pl_lock); } - void -pl_trace_flush (xlator_t *this, call_frame_t *frame, fd_t *fd) +pl_trace_flush(xlator_t *this, call_frame_t *frame, fd_t *fd) { - posix_locks_private_t *priv = NULL; - char pl_locker[256]; - char pl_lockee[256]; - pl_inode_t *pl_inode = NULL; + posix_locks_private_t *priv = NULL; + char pl_locker[256]; + char pl_lockee[256]; + pl_inode_t *pl_inode = NULL; - priv = this->private; + priv = this->private; - if (!priv->trace) - return; + if 
(!priv->trace) + return; - pl_inode = pl_inode_get (this, fd->inode); + pl_inode = pl_inode_get(this, fd->inode, NULL); - if (pl_inode && __pl_inode_is_empty (pl_inode)) - return; + if (pl_inode && __pl_inode_is_empty(pl_inode)) + return; - pl_print_locker (pl_locker, 256, this, frame); - pl_print_lockee (pl_lockee, 256, fd, NULL); + pl_print_locker(pl_locker, 256, this, frame); + pl_print_lockee(pl_lockee, 256, fd, NULL); - gf_log (this->name, GF_LOG_INFO, - "[FLUSH] Locker = {%s} Lockee = {%s}", - pl_locker, pl_lockee); + gf_log(this->name, GF_LOG_INFO, "[FLUSH] Locker = {%s} Lockee = {%s}", + pl_locker, pl_lockee); } void -pl_trace_release (xlator_t *this, fd_t *fd) +pl_trace_release(xlator_t *this, fd_t *fd) { - posix_locks_private_t *priv = NULL; - char pl_lockee[256]; + posix_locks_private_t *priv = NULL; + char pl_lockee[256]; - priv = this->private; + priv = this->private; - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_print_lockee (pl_lockee, 256, fd, NULL); + pl_print_lockee(pl_lockee, 256, fd, NULL); - gf_log (this->name, GF_LOG_INFO, - "[RELEASE] Lockee = {%s}", pl_lockee); + gf_log(this->name, GF_LOG_INFO, "[RELEASE] Lockee = {%s}", pl_lockee); } - void -pl_update_refkeeper (xlator_t *this, inode_t *inode) +pl_update_refkeeper(xlator_t *this, inode_t *inode) { - pl_inode_t *pl_inode = NULL; - int is_empty = 0; - int need_unref = 0; - int need_ref = 0; + pl_inode_t *pl_inode = NULL; + int is_empty = 0; + int need_unref = 0; + int need_ref = 0; - pl_inode = pl_inode_get (this, inode); + pl_inode = pl_inode_get(this, inode, NULL); + if (!pl_inode) + return; - pthread_mutex_lock (&pl_inode->mutex); - { - is_empty = __pl_inode_is_empty (pl_inode); + pthread_mutex_lock(&pl_inode->mutex); + { + is_empty = __pl_inode_is_empty(pl_inode); - if (is_empty && pl_inode->refkeeper) { - need_unref = 1; - pl_inode->refkeeper = NULL; - } + if (is_empty && pl_inode->refkeeper) { + need_unref = 1; + pl_inode->refkeeper = NULL; + } - if (!is_empty && !pl_inode->refkeeper) { - need_ref = 1; - pl_inode->refkeeper = inode; - } + if (!is_empty && !pl_inode->refkeeper) { + need_ref = 1; + pl_inode->refkeeper = inode; } - pthread_mutex_unlock (&pl_inode->mutex); + } + pthread_mutex_unlock(&pl_inode->mutex); - if (need_unref) - inode_unref (inode); + if (need_unref) + inode_unref(inode); - if (need_ref) - inode_ref (inode); + if (need_ref) + inode_ref(inode); } - -pl_inode_t * -pl_inode_get (xlator_t *this, inode_t *inode) +/* Get lock enforcement info from disk */ +int +pl_fetch_mlock_info_from_disk(xlator_t *this, pl_inode_t *pl_inode, + pl_local_t *local) { - uint64_t tmp_pl_inode = 0; - pl_inode_t *pl_inode = NULL; - int ret = 0; - - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &tmp_pl_inode); - if (ret == 0) { - pl_inode = (pl_inode_t *)(long)tmp_pl_inode; - goto unlock; - } - pl_inode = GF_CALLOC (1, sizeof (*pl_inode), - gf_locks_mt_pl_inode_t); - if (!pl_inode) { - goto unlock; - } + dict_t *xdata_rsp = NULL; + int ret = 0; + int op_ret = 0; + + if (!local) { + return -1; + } + + if (local->fd) { + op_ret = syncop_fgetxattr(this, local->fd, &xdata_rsp, + GF_ENFORCE_MANDATORY_LOCK, NULL, NULL); + } else { + op_ret = syncop_getxattr(this, &local->loc[0], &xdata_rsp, + GF_ENFORCE_MANDATORY_LOCK, NULL, NULL); + } + + pthread_mutex_lock(&pl_inode->mutex); + { + if (op_ret >= 0) { + pl_inode->mlock_enforced = _gf_true; + pl_inode->check_mlock_info = _gf_false; + } else { + gf_msg(this->name, GF_LOG_WARNING, -op_ret, 0, + "getxattr failed with %d", op_ret); + 
pl_inode->mlock_enforced = _gf_false; + + if (-op_ret == ENODATA) { + pl_inode->check_mlock_info = _gf_false; + } else { + pl_inode->check_mlock_info = _gf_true; + } + } + } + pthread_mutex_unlock(&pl_inode->mutex); - gf_log (this->name, GF_LOG_TRACE, - "Allocating new pl inode"); + return ret; +} - pthread_mutex_init (&pl_inode->mutex, NULL); +pl_inode_t * +pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local) +{ + uint64_t tmp_pl_inode = 0; + pl_inode_t *pl_inode = NULL; + int ret = 0; + + LOCK(&inode->lock); + { + ret = __inode_ctx_get(inode, this, &tmp_pl_inode); + if (ret == 0) { + pl_inode = (pl_inode_t *)(long)tmp_pl_inode; + goto unlock; + } - INIT_LIST_HEAD (&pl_inode->dom_list); - INIT_LIST_HEAD (&pl_inode->ext_list); - INIT_LIST_HEAD (&pl_inode->rw_list); - INIT_LIST_HEAD (&pl_inode->reservelk_list); - INIT_LIST_HEAD (&pl_inode->blocked_reservelks); - INIT_LIST_HEAD (&pl_inode->blocked_calls); + pl_inode = GF_CALLOC(1, sizeof(*pl_inode), gf_locks_mt_pl_inode_t); + if (!pl_inode) { + goto unlock; + } - __inode_ctx_put (inode, this, (uint64_t)(long)(pl_inode)); + gf_log(this->name, GF_LOG_TRACE, "Allocating new pl inode"); + + pthread_mutex_init(&pl_inode->mutex, NULL); + pthread_cond_init(&pl_inode->check_fop_wind_count, 0); + + INIT_LIST_HEAD(&pl_inode->dom_list); + INIT_LIST_HEAD(&pl_inode->ext_list); + INIT_LIST_HEAD(&pl_inode->rw_list); + INIT_LIST_HEAD(&pl_inode->reservelk_list); + INIT_LIST_HEAD(&pl_inode->blocked_reservelks); + INIT_LIST_HEAD(&pl_inode->blocked_calls); + INIT_LIST_HEAD(&pl_inode->metalk_list); + INIT_LIST_HEAD(&pl_inode->queued_locks); + INIT_LIST_HEAD(&pl_inode->waiting); + gf_uuid_copy(pl_inode->gfid, inode->gfid); + + pl_inode->check_mlock_info = _gf_true; + pl_inode->mlock_enforced = _gf_false; + + /* -2 means never looked up. -1 means something went wrong and link + * tracking is disabled. */ + pl_inode->links = -2; + + ret = __inode_ctx_put(inode, this, (uint64_t)(long)(pl_inode)); + if (ret) { + pthread_mutex_destroy(&pl_inode->mutex); + GF_FREE(pl_inode); + pl_inode = NULL; + goto unlock; } + } unlock: - UNLOCK (&inode->lock); + UNLOCK(&inode->lock); - return pl_inode; -} + if ((pl_inode != NULL) && pl_is_mandatory_locking_enabled(pl_inode) && + pl_inode->check_mlock_info && local) { + /* Note: The lock enforcement information per file can be stored in the + attribute flag of stat(x) in posix. 
With that there won't be a need + for doing getxattr post a reboot + */ + pl_fetch_mlock_info_from_disk(this, pl_inode, local); + } + return pl_inode; +} /* Create a new posix_lock_t */ posix_lock_t * -new_posix_lock (struct gf_flock *flock, client_t *client, pid_t client_pid, - gf_lkowner_t *owner, fd_t *fd) +new_posix_lock(struct gf_flock *flock, client_t *client, pid_t client_pid, + gf_lkowner_t *owner, fd_t *fd, uint32_t lk_flags, int blocking, + int32_t *op_errno) { - posix_lock_t *lock = NULL; + posix_lock_t *lock = NULL; - GF_VALIDATE_OR_GOTO ("posix-locks", flock, out); - GF_VALIDATE_OR_GOTO ("posix-locks", client, out); - GF_VALIDATE_OR_GOTO ("posix-locks", fd, out); + GF_VALIDATE_OR_GOTO("posix-locks", flock, out); + GF_VALIDATE_OR_GOTO("posix-locks", client, out); + GF_VALIDATE_OR_GOTO("posix-locks", fd, out); - lock = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!lock) { - goto out; - } + if (!pl_is_lk_owner_valid(owner, client)) { + *op_errno = EINVAL; + goto out; + } + + lock = GF_CALLOC(1, sizeof(posix_lock_t), gf_locks_mt_posix_lock_t); + if (!lock) { + *op_errno = ENOMEM; + goto out; + } - lock->fl_start = flock->l_start; - lock->fl_type = flock->l_type; + lock->fl_start = flock->l_start; + lock->fl_type = flock->l_type; - if (flock->l_len == 0) - lock->fl_end = LLONG_MAX; - else - lock->fl_end = flock->l_start + flock->l_len - 1; + if (flock->l_len == 0) + lock->fl_end = LLONG_MAX; + else + lock->fl_end = flock->l_start + flock->l_len - 1; - lock->client = client; - lock->fd_num = fd_to_fdnum (fd); - lock->fd = fd; - lock->client_pid = client_pid; - lock->owner = *owner; + lock->client = client; - INIT_LIST_HEAD (&lock->list); + lock->client_uid = gf_strdup(client->client_uid); + if (lock->client_uid == NULL) { + GF_FREE(lock); + lock = NULL; + *op_errno = ENOMEM; + goto out; + } + + lock->fd_num = fd_to_fdnum(fd); + lock->fd = fd; + lock->client_pid = client_pid; + lock->owner = *owner; + lock->lk_flags = lk_flags; + + lock->blocking = blocking; + memcpy(&lock->user_flock, flock, sizeof(lock->user_flock)); + + INIT_LIST_HEAD(&lock->list); out: - return lock; + return lock; } - /* Delete a lock from the inode's lock list */ void -__delete_lock (pl_inode_t *pl_inode, posix_lock_t *lock) +__delete_lock(posix_lock_t *lock) { - list_del_init (&lock->list); + list_del_init(&lock->list); } - /* Destroy a posix_lock */ void -__destroy_lock (posix_lock_t *lock) +__destroy_lock(posix_lock_t *lock) { - GF_FREE (lock); + GF_FREE(lock->client_uid); + GF_FREE(lock); } +static posix_lock_t * +__copy_lock(posix_lock_t *src) +{ + posix_lock_t *dst; + + dst = GF_MALLOC(sizeof(posix_lock_t), gf_locks_mt_posix_lock_t); + if (dst != NULL) { + memcpy(dst, src, sizeof(posix_lock_t)); + dst->client_uid = gf_strdup(src->client_uid); + if (dst->client_uid == NULL) { + GF_FREE(dst); + dst = NULL; + } + + if (dst != NULL) + INIT_LIST_HEAD(&dst->list); + } + + return dst; +} /* Convert a posix_lock to a struct gf_flock */ void -posix_lock_to_flock (posix_lock_t *lock, struct gf_flock *flock) +posix_lock_to_flock(posix_lock_t *lock, struct gf_flock *flock) { - flock->l_pid = lock->client_pid; - flock->l_type = lock->fl_type; - flock->l_start = lock->fl_start; - flock->l_owner = lock->owner; - - if (lock->fl_end == LLONG_MAX) - flock->l_len = 0; - else - flock->l_len = lock->fl_end - lock->fl_start + 1; + flock->l_pid = lock->user_flock.l_pid; + flock->l_type = lock->fl_type; + flock->l_start = lock->fl_start; + flock->l_owner = lock->owner; + + if (lock->fl_end == 
LLONG_MAX) + flock->l_len = 0; + else + flock->l_len = lock->fl_end - lock->fl_start + 1; } /* Insert the lock into the inode's lock list */ static void -__insert_lock (pl_inode_t *pl_inode, posix_lock_t *lock) +__insert_lock(pl_inode_t *pl_inode, posix_lock_t *lock) { - if (lock->blocked) - gettimeofday (&lock->blkd_time, NULL); - else - gettimeofday (&lock->granted_time, NULL); + if (lock->blocked) + lock->blkd_time = gf_time(); + else + lock->granted_time = gf_time(); - list_add_tail (&lock->list, &pl_inode->ext_list); - - return; + list_add_tail(&lock->list, &pl_inode->ext_list); } - /* Return true if the locks overlap, false otherwise */ int -locks_overlap (posix_lock_t *l1, posix_lock_t *l2) +locks_overlap(posix_lock_t *l1, posix_lock_t *l2) { - /* - Note: - FUSE always gives us absolute offsets, so no need to worry - about SEEK_CUR or SEEK_END - */ + /* + Note: + FUSE always gives us absolute offsets, so no need to worry + about SEEK_CUR or SEEK_END + */ - return ((l1->fl_end >= l2->fl_start) && - (l2->fl_end >= l1->fl_start)); + return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start)); } - /* Return true if the locks have the same owner */ int -same_owner (posix_lock_t *l1, posix_lock_t *l2) +same_owner(posix_lock_t *l1, posix_lock_t *l2) { - - return (is_same_lkowner (&l1->owner, &l2->owner) && - (l1->client == l2->client)); - + return (is_same_lkowner(&l1->owner, &l2->owner) && + (l1->client == l2->client)); } - /* Delete all F_UNLCK locks */ void -__delete_unlck_locks (pl_inode_t *pl_inode) +__delete_unlck_locks(pl_inode_t *pl_inode) { - posix_lock_t *l = NULL; - posix_lock_t *tmp = NULL; - - list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) { - if (l->fl_type == F_UNLCK) { - __delete_lock (pl_inode, l); - __destroy_lock (l); - } + posix_lock_t *l = NULL; + posix_lock_t *tmp = NULL; + + list_for_each_entry_safe(l, tmp, &pl_inode->ext_list, list) + { + if (l->fl_type == F_UNLCK) { + __delete_lock(l); + __destroy_lock(l); } + } } - /* Add two locks */ static posix_lock_t * -add_locks (posix_lock_t *l1, posix_lock_t *l2) +add_locks(posix_lock_t *l1, posix_lock_t *l2, posix_lock_t *dst) { - posix_lock_t *sum = NULL; + posix_lock_t *sum = NULL; - sum = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!sum) - return NULL; + sum = __copy_lock(dst); + if (!sum) + return NULL; - sum->fl_start = min (l1->fl_start, l2->fl_start); - sum->fl_end = max (l1->fl_end, l2->fl_end); + sum->fl_start = min(l1->fl_start, l2->fl_start); + sum->fl_end = max(l1->fl_end, l2->fl_end); - return sum; + posix_lock_to_flock(sum, &sum->user_flock); + + return sum; } /* Subtract two locks */ struct _values { - posix_lock_t *locks[3]; + posix_lock_t *locks[3]; }; /* {big} must always be contained inside {small} */ static struct _values -subtract_locks (posix_lock_t *big, posix_lock_t *small) +subtract_locks(posix_lock_t *big, posix_lock_t *small) { + struct _values v = {.locks = {0, 0, 0}}; - struct _values v = { .locks = {0, 0, 0} }; - - if ((big->fl_start == small->fl_start) && - (big->fl_end == small->fl_end)) { - /* both edges coincide with big */ - v.locks[0] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[0]) - goto out; - memcpy (v.locks[0], big, sizeof (posix_lock_t)); - v.locks[0]->fl_type = small->fl_type; - goto done; + if ((big->fl_start == small->fl_start) && (big->fl_end == small->fl_end)) { + /* both edges coincide with big */ + v.locks[0] = __copy_lock(big); + if (!v.locks[0]) { + goto out; } - if ((small->fl_start 
> big->fl_start) && - (small->fl_end < big->fl_end)) { - /* both edges lie inside big */ - v.locks[0] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[0]) - goto out; - - v.locks[1] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[1]) - goto out; - - v.locks[2] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[1]) - goto out; - - memcpy (v.locks[0], big, sizeof (posix_lock_t)); - v.locks[0]->fl_end = small->fl_start - 1; - - memcpy (v.locks[1], small, sizeof (posix_lock_t)); - - memcpy (v.locks[2], big, sizeof (posix_lock_t)); - v.locks[2]->fl_start = small->fl_end + 1; - goto done; - + v.locks[0]->fl_type = small->fl_type; + v.locks[0]->user_flock.l_type = small->fl_type; + goto done; + } + + if ((small->fl_start > big->fl_start) && (small->fl_end < big->fl_end)) { + /* both edges lie inside big */ + v.locks[0] = __copy_lock(big); + v.locks[1] = __copy_lock(small); + v.locks[2] = __copy_lock(big); + if ((v.locks[0] == NULL) || (v.locks[1] == NULL) || + (v.locks[2] == NULL)) { + goto out; } - /* one edge coincides with big */ - if (small->fl_start == big->fl_start) { - v.locks[0] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[0]) - goto out; - - v.locks[1] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[1]) - goto out; - - memcpy (v.locks[0], big, sizeof (posix_lock_t)); - v.locks[0]->fl_start = small->fl_end + 1; - - memcpy (v.locks[1], small, sizeof (posix_lock_t)); - goto done; + v.locks[0]->fl_end = small->fl_start - 1; + v.locks[2]->fl_start = small->fl_end + 1; + posix_lock_to_flock(v.locks[0], &v.locks[0]->user_flock); + posix_lock_to_flock(v.locks[2], &v.locks[2]->user_flock); + goto done; + } + + /* one edge coincides with big */ + if (small->fl_start == big->fl_start) { + v.locks[0] = __copy_lock(big); + v.locks[1] = __copy_lock(small); + if ((v.locks[0] == NULL) || (v.locks[1] == NULL)) { + goto out; } - if (small->fl_end == big->fl_end) { - v.locks[0] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[0]) - goto out; - - v.locks[1] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[1]) - goto out; - - memcpy (v.locks[0], big, sizeof (posix_lock_t)); - v.locks[0]->fl_end = small->fl_start - 1; + v.locks[0]->fl_start = small->fl_end + 1; + posix_lock_to_flock(v.locks[0], &v.locks[0]->user_flock); + goto done; + } - memcpy (v.locks[1], small, sizeof (posix_lock_t)); - goto done; + if (small->fl_end == big->fl_end) { + v.locks[0] = __copy_lock(big); + v.locks[1] = __copy_lock(small); + if ((v.locks[0] == NULL) || (v.locks[1] == NULL)) { + goto out; } - GF_ASSERT (0); - gf_log ("posix-locks", GF_LOG_ERROR, "Unexpected case in subtract_locks"); + v.locks[0]->fl_end = small->fl_start - 1; + posix_lock_to_flock(v.locks[0], &v.locks[0]->user_flock); + goto done; + } + + GF_ASSERT(0); + gf_log("posix-locks", GF_LOG_ERROR, "Unexpected case in subtract_locks"); out: - if (v.locks[0]) { - GF_FREE (v.locks[0]); - v.locks[0] = NULL; - } - if (v.locks[1]) { - GF_FREE (v.locks[1]); - v.locks[1] = NULL; - } - if (v.locks[2]) { - GF_FREE (v.locks[2]); - v.locks[2] = NULL; - } + if (v.locks[0]) { + __destroy_lock(v.locks[0]); + v.locks[0] = NULL; + } + if (v.locks[1]) { + __destroy_lock(v.locks[1]); + v.locks[1] = NULL; + } + if (v.locks[2]) { + __destroy_lock(v.locks[2]); + v.locks[2] = NULL; + } done: - return v; + return v; } static posix_lock_t * 
-first_conflicting_overlap (pl_inode_t *pl_inode, posix_lock_t *lock) +first_conflicting_overlap(pl_inode_t *pl_inode, posix_lock_t *lock) { - posix_lock_t *l = NULL; - posix_lock_t *conf = NULL; + posix_lock_t *l = NULL; + posix_lock_t *conf = NULL; - pthread_mutex_lock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry(l, &pl_inode->ext_list, list) { - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (l->blocked) - continue; - - if (locks_overlap (l, lock)) { - if (same_owner (l, lock)) - continue; - - if ((l->fl_type == F_WRLCK) || - (lock->fl_type == F_WRLCK)) { - conf = l; - goto unlock; - } - } + if (l->blocked) + continue; + + if (locks_overlap(l, lock)) { + if (same_owner(l, lock)) + continue; + + if ((l->fl_type == F_WRLCK) || (lock->fl_type == F_WRLCK)) { + conf = l; + goto unlock; } + } } + } unlock: - pthread_mutex_unlock (&pl_inode->mutex); + pthread_mutex_unlock(&pl_inode->mutex); - return conf; + return conf; } /* @@ -751,351 +789,803 @@ unlock: If {begin} is NULL, then start from the beginning of the list */ static posix_lock_t * -first_overlap (pl_inode_t *pl_inode, posix_lock_t *lock) +first_overlap(pl_inode_t *pl_inode, posix_lock_t *lock) { - posix_lock_t *l = NULL; + posix_lock_t *l = NULL; - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (l->blocked) - continue; + list_for_each_entry(l, &pl_inode->ext_list, list) + { + if (l->blocked) + continue; - if (locks_overlap (l, lock)) - return l; - } + if (locks_overlap(l, lock)) + return l; + } - return NULL; + return NULL; } - - /* Return true if lock is grantable */ static int -__is_lock_grantable (pl_inode_t *pl_inode, posix_lock_t *lock) +__is_lock_grantable(pl_inode_t *pl_inode, posix_lock_t *lock) { - posix_lock_t *l = NULL; - int ret = 1; - - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (!l->blocked && locks_overlap (lock, l)) { - if (((l->fl_type == F_WRLCK) - || (lock->fl_type == F_WRLCK)) - && (lock->fl_type != F_UNLCK) - && !same_owner (l, lock)) { - ret = 0; - break; - } - } + posix_lock_t *l = NULL; + int ret = 1; + + list_for_each_entry(l, &pl_inode->ext_list, list) + { + if (!l->blocked && locks_overlap(lock, l)) { + if (((l->fl_type == F_WRLCK) || (lock->fl_type == F_WRLCK)) && + (lock->fl_type != F_UNLCK) && !same_owner(l, lock)) { + ret = 0; + break; + } } - return ret; + } + return ret; } - -extern void do_blocked_rw (pl_inode_t *); - +extern void +do_blocked_rw(pl_inode_t *); static void -__insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock) +__insert_and_merge(pl_inode_t *pl_inode, posix_lock_t *lock) { - posix_lock_t *conf = NULL; - posix_lock_t *t = NULL; - posix_lock_t *sum = NULL; - int i = 0; - struct _values v = { .locks = {0, 0, 0} }; - - list_for_each_entry_safe (conf, t, &pl_inode->ext_list, list) { - if (conf->blocked) - continue; - if (!locks_overlap (conf, lock)) - continue; + posix_lock_t *conf = NULL; + posix_lock_t *t = NULL; + posix_lock_t *sum = NULL; + int i = 0; + struct _values v = {.locks = {0, 0, 0}}; + + list_for_each_entry_safe(conf, t, &pl_inode->ext_list, list) + { + if (conf->blocked) + continue; + if (!locks_overlap(conf, lock)) + continue; + + if (same_owner(conf, lock)) { + if (conf->fl_type == lock->fl_type && + conf->lk_flags == lock->lk_flags) { + sum = add_locks(lock, conf, lock); + + __delete_lock(conf); + __destroy_lock(conf); + + __destroy_lock(lock); + INIT_LIST_HEAD(&sum->list); + posix_lock_to_flock(sum, &sum->user_flock); + __insert_and_merge(pl_inode, sum); - if (same_owner (conf, lock)) 
{ - if (conf->fl_type == lock->fl_type) { - sum = add_locks (lock, conf); + return; + } else { + sum = add_locks(lock, conf, conf); - sum->fl_type = lock->fl_type; - sum->client = lock->client; - sum->fd_num = lock->fd_num; - sum->client_pid = lock->client_pid; - sum->owner = lock->owner; + v = subtract_locks(sum, lock); - __delete_lock (pl_inode, conf); - __destroy_lock (conf); + __delete_lock(conf); + __destroy_lock(conf); - __destroy_lock (lock); - INIT_LIST_HEAD (&sum->list); - posix_lock_to_flock (sum, &sum->user_flock); - __insert_and_merge (pl_inode, sum); + __delete_lock(lock); + __destroy_lock(lock); - return; - } else { - sum = add_locks (lock, conf); + __destroy_lock(sum); - sum->fl_type = conf->fl_type; - sum->client = conf->client; - sum->fd_num = conf->fd_num; - sum->client_pid = conf->client_pid; - sum->owner = conf->owner; + for (i = 0; i < 3; i++) { + if (!v.locks[i]) + continue; - v = subtract_locks (sum, lock); + __insert_and_merge(pl_inode, v.locks[i]); + } - __delete_lock (pl_inode, conf); - __destroy_lock (conf); + __delete_unlck_locks(pl_inode); + return; + } + } - __delete_lock (pl_inode, lock); - __destroy_lock (lock); + if (lock->fl_type == F_UNLCK) { + continue; + } - __destroy_lock (sum); + if ((conf->fl_type == F_RDLCK) && (lock->fl_type == F_RDLCK)) { + __insert_lock(pl_inode, lock); + return; + } + } + + /* no conflicts, so just insert */ + if (lock->fl_type != F_UNLCK) { + __insert_lock(pl_inode, lock); + } else { + __destroy_lock(lock); + } +} - for (i = 0; i < 3; i++) { - if (!v.locks[i]) - continue; +void +__grant_blocked_locks(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted) +{ + struct list_head tmp_list; + posix_lock_t *l = NULL; + posix_lock_t *tmp = NULL; + posix_lock_t *conf = NULL; + + INIT_LIST_HEAD(&tmp_list); + + list_for_each_entry_safe(l, tmp, &pl_inode->ext_list, list) + { + if (l->blocked) { + conf = first_overlap(pl_inode, l); + if (conf) + continue; + + l->blocked = 0; + list_move_tail(&l->list, &tmp_list); + } + } - INIT_LIST_HEAD (&v.locks[i]->list); - posix_lock_to_flock (v.locks[i], - &v.locks[i]->user_flock); - __insert_and_merge (pl_inode, - v.locks[i]); - } + list_for_each_entry_safe(l, tmp, &tmp_list, list) + { + list_del_init(&l->list); - __delete_unlck_locks (pl_inode); - return; - } - } + if (__is_lock_grantable(pl_inode, l)) { + conf = GF_CALLOC(1, sizeof(*conf), gf_locks_mt_posix_lock_t); - if (lock->fl_type == F_UNLCK) { - continue; - } + if (!conf) { + l->blocked = 1; + __insert_lock(pl_inode, l); + continue; + } - if ((conf->fl_type == F_RDLCK) && (lock->fl_type == F_RDLCK)) { - __insert_lock (pl_inode, lock); - return; - } - } + conf->frame = l->frame; + l->frame = NULL; - /* no conflicts, so just insert */ - if (lock->fl_type != F_UNLCK) { - __insert_lock (pl_inode, lock); + posix_lock_to_flock(l, &conf->user_flock); + + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) lk-owner:%s %" PRId64 " - %" PRId64 + " => Granted", + l->fl_type == F_UNLCK ? 
"Unlock" : "Lock", l->client_pid, + lkowner_utoa(&l->owner), l->user_flock.l_start, + l->user_flock.l_len); + + __insert_and_merge(pl_inode, l); + + list_add(&conf->list, granted); } else { - __destroy_lock (lock); + l->blocked = 1; + __insert_lock(pl_inode, l); } + } } - void -__grant_blocked_locks (xlator_t *this, pl_inode_t *pl_inode, struct list_head *granted) +grant_blocked_locks(xlator_t *this, pl_inode_t *pl_inode) { - struct list_head tmp_list; - posix_lock_t *l = NULL; - posix_lock_t *tmp = NULL; - posix_lock_t *conf = NULL; + struct list_head granted_list; + posix_lock_t *tmp = NULL; + posix_lock_t *lock = NULL; + pl_local_t *local = NULL; + INIT_LIST_HEAD(&granted_list); + + pthread_mutex_lock(&pl_inode->mutex); + { + __grant_blocked_locks(this, pl_inode, &granted_list); + } + pthread_mutex_unlock(&pl_inode->mutex); + + list_for_each_entry_safe(lock, tmp, &granted_list, list) + { + list_del_init(&lock->list); + + pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock, + 0, 0, NULL); + local = lock->frame->local; + PL_STACK_UNWIND_AND_FREE(local, lk, lock->frame, 0, 0, + &lock->user_flock, NULL); + __destroy_lock(lock); + } + + return; +} - INIT_LIST_HEAD (&tmp_list); +static int +pl_send_prelock_unlock(xlator_t *this, pl_inode_t *pl_inode, + posix_lock_t *old_lock) +{ + struct gf_flock flock = { + 0, + }; + posix_lock_t *unlock_lock = NULL; + int32_t op_errno = 0; - list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) { - if (l->blocked) { - conf = first_overlap (pl_inode, l); - if (conf) - continue; + struct list_head granted_list; + posix_lock_t *tmp = NULL; + posix_lock_t *lock = NULL; + pl_local_t *local = NULL; - l->blocked = 0; - list_move_tail (&l->list, &tmp_list); - } - } + int ret = -1; - list_for_each_entry_safe (l, tmp, &tmp_list, list) { - list_del_init (&l->list); + INIT_LIST_HEAD(&granted_list); - if (__is_lock_grantable (pl_inode, l)) { - conf = GF_CALLOC (1, sizeof (*conf), - gf_locks_mt_posix_lock_t); + flock.l_type = F_UNLCK; + flock.l_whence = old_lock->user_flock.l_whence; + flock.l_start = old_lock->user_flock.l_start; + flock.l_len = old_lock->user_flock.l_len; + flock.l_pid = old_lock->user_flock.l_pid; - if (!conf) { - l->blocked = 1; - __insert_lock (pl_inode, l); - continue; - } + unlock_lock = new_posix_lock(&flock, old_lock->client, old_lock->client_pid, + &old_lock->owner, old_lock->fd, + old_lock->lk_flags, 0, &op_errno); + GF_VALIDATE_OR_GOTO(this->name, unlock_lock, out); + ret = 0; - conf->frame = l->frame; - l->frame = NULL; + __insert_and_merge(pl_inode, unlock_lock); - posix_lock_to_flock (l, &conf->user_flock); + __grant_blocked_locks(this, pl_inode, &granted_list); - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => Granted", - l->fl_type == F_UNLCK ? 
"Unlock" : "Lock", - l->client_pid, lkowner_utoa (&l->owner), - l->user_flock.l_start, - l->user_flock.l_len); + list_for_each_entry_safe(lock, tmp, &granted_list, list) + { + list_del_init(&lock->list); - __insert_and_merge (pl_inode, l); + pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock, + 0, 0, NULL); + local = lock->frame->local; + PL_STACK_UNWIND_AND_FREE(local, lk, lock->frame, 0, 0, + &lock->user_flock, NULL); + __destroy_lock(lock); + } - list_add (&conf->list, granted); - } else { - l->blocked = 1; - __insert_lock (pl_inode, l); - } - } +out: + return ret; } +int +pl_setlk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, + int can_block) +{ + int ret = 0; -void -grant_blocked_locks (xlator_t *this, pl_inode_t *pl_inode) + errno = 0; + + pthread_mutex_lock(&pl_inode->mutex); + { + /* Send unlock before the actual lock to + prevent lock upgrade / downgrade + problems only if: + - it is a blocking call + - it has other conflicting locks + */ + + if (can_block && !(__is_lock_grantable(pl_inode, lock))) { + ret = pl_send_prelock_unlock(this, pl_inode, lock); + if (ret) + gf_log(this->name, GF_LOG_DEBUG, + "Could not send pre-lock " + "unlock"); + } + + if (__is_lock_grantable(pl_inode, lock)) { + if (pl_metalock_is_active(pl_inode)) { + __pl_queue_lock(pl_inode, lock); + pthread_mutex_unlock(&pl_inode->mutex); + ret = -2; + goto out; + } + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) lk-owner:%s %" PRId64 " - %" PRId64 " => OK", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", + lock->client_pid, lkowner_utoa(&lock->owner), + lock->user_flock.l_start, lock->user_flock.l_len); + __insert_and_merge(pl_inode, lock); + } else if (can_block) { + if (pl_metalock_is_active(pl_inode)) { + __pl_queue_lock(pl_inode, lock); + pthread_mutex_unlock(&pl_inode->mutex); + ret = -2; + goto out; + } + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) lk-owner:%s %" PRId64 " - %" PRId64 + " => Blocked", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", + lock->client_pid, lkowner_utoa(&lock->owner), + lock->user_flock.l_start, lock->user_flock.l_len); + + pl_trace_block(this, lock->frame, NULL, NULL, F_SETLKW, + &lock->user_flock, NULL); + + lock->blocked = 1; + __insert_lock(pl_inode, lock); + ret = -1; + } else { + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) lk-owner:%s %" PRId64 " - %" PRId64 " => NOK", + lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", + lock->client_pid, lkowner_utoa(&lock->owner), + lock->user_flock.l_start, lock->user_flock.l_len); + errno = EAGAIN; + ret = -1; + } + } + pthread_mutex_unlock(&pl_inode->mutex); + + grant_blocked_locks(this, pl_inode); + + do_blocked_rw(pl_inode); + +out: + return ret; +} + +posix_lock_t * +pl_getlk(pl_inode_t *pl_inode, posix_lock_t *lock) { - struct list_head granted_list; - posix_lock_t *tmp = NULL; - posix_lock_t *lock = NULL; + posix_lock_t *conf = first_conflicting_overlap(pl_inode, lock); + if (conf == NULL) { + lock->fl_type = F_UNLCK; + return lock; + } + + return conf; +} - INIT_LIST_HEAD (&granted_list); +gf_boolean_t +pl_does_monkey_want_stuck_lock() +{ + long int monkey_unlock_rand = 0; + long int monkey_unlock_rand_rem = 0; + + /* coverity[DC.WEAK_CRYPTO] */ + monkey_unlock_rand = random(); + monkey_unlock_rand_rem = monkey_unlock_rand % 100; + if (monkey_unlock_rand_rem == 0) + return _gf_true; + return _gf_false; +} - pthread_mutex_lock (&pl_inode->mutex); +int +pl_lock_preempt(pl_inode_t *pl_inode, posix_lock_t *reqlock) +{ + posix_lock_t *lock = NULL; + posix_lock_t *i = NULL; + pl_rw_req_t *rw = NULL; + pl_rw_req_t *itr = NULL; + struct list_head unwind_blist = { + 0, + }; + struct list_head unwind_rw_list = { + 0, + }; + int ret = 0; + + INIT_LIST_HEAD(&unwind_blist); + INIT_LIST_HEAD(&unwind_rw_list); + + pthread_mutex_lock(&pl_inode->mutex); + { + /* + - go through the lock list + - remove all locks from different owners + - same owner locks will be added or substracted based on + the new request + - add the new lock + */ + list_for_each_entry_safe(lock, i, &pl_inode->ext_list, list) { - __grant_blocked_locks (this, pl_inode, &granted_list); + if (lock->blocked) { + list_del_init(&lock->list); + list_add(&lock->list, &unwind_blist); + continue; + } + + if (locks_overlap(lock, reqlock)) { + if (same_owner(lock, reqlock)) + continue; + + /* remove conflicting locks */ + list_del_init(&lock->list); + __delete_lock(lock); + __destroy_lock(lock); + } } - pthread_mutex_unlock (&pl_inode->mutex); - list_for_each_entry_safe (lock, tmp, &granted_list, list) { - list_del_init (&lock->list); + __insert_and_merge(pl_inode, reqlock); - pl_trace_out (this, lock->frame, NULL, NULL, F_SETLKW, - &lock->user_flock, 0, 0, NULL); + list_for_each_entry_safe(rw, itr, &pl_inode->rw_list, list) + { + list_del_init(&rw->list); + list_add(&rw->list, &unwind_rw_list); + } + } + pthread_mutex_unlock(&pl_inode->mutex); + + /* unwind blocked locks */ + list_for_each_entry_safe(lock, i, &unwind_blist, list) + { + PL_STACK_UNWIND_AND_FREE(((pl_local_t *)lock->frame->local), lk, + lock->frame, -1, EBUSY, &lock->user_flock, + NULL); + __destroy_lock(lock); + } + + /* unwind blocked IOs */ + list_for_each_entry_safe(rw, itr, &unwind_rw_list, list) + { + pl_clean_local(rw->stub->frame->local); + call_unwind_error(rw->stub, -1, EBUSY); + } + + return ret; +} - STACK_UNWIND_STRICT (lk, lock->frame, 0, 0, - &lock->user_flock, NULL); +/* Return true in case we need to ensure mandatory-locking + * semantics under different modes. 
+ */ +gf_boolean_t +pl_is_mandatory_locking_enabled(pl_inode_t *pl_inode) +{ + posix_locks_private_t *priv = THIS->private; - GF_FREE (lock); - } + if (priv->mandatory_mode == MLK_FILE_BASED && pl_inode->mandatory) + return _gf_true; + else if (priv->mandatory_mode == MLK_FORCED || + priv->mandatory_mode == MLK_OPTIMAL) + return _gf_true; - return; + return _gf_false; } -static int -pl_send_prelock_unlock (xlator_t *this, pl_inode_t *pl_inode, - posix_lock_t *old_lock) +void +pl_clean_local(pl_local_t *local) { - struct gf_flock flock = {0,}; - posix_lock_t *unlock_lock = NULL; + if (!local) + return; - struct list_head granted_list; - posix_lock_t *tmp = NULL; - posix_lock_t *lock = NULL; + if (local->inodelk_dom_count_req) + data_unref(local->inodelk_dom_count_req); + loc_wipe(&local->loc[0]); + loc_wipe(&local->loc[1]); + if (local->fd) + fd_unref(local->fd); + if (local->inode) + inode_unref(local->inode); + mem_put(local); +} - int ret = -1; +/* +TODO: detach local initialization from PL_LOCAL_GET_REQUESTS and add it here +*/ +int +pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +{ + pl_local_t *local = NULL; + + if (!loc && !fd) { + return -1; + } + + if (!frame->local) { + local = mem_get0(this->local_pool); + if (!local) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, + "mem allocation failed"); + return -1; + } - INIT_LIST_HEAD (&granted_list); + local->inode = (loc ? inode_ref(loc->inode) : inode_ref(fd->inode)); - flock.l_type = F_UNLCK; - flock.l_whence = old_lock->user_flock.l_whence; - flock.l_start = old_lock->user_flock.l_start; - flock.l_len = old_lock->user_flock.l_len; + frame->local = local; + } + return 0; +} - unlock_lock = new_posix_lock (&flock, old_lock->client, - old_lock->client_pid, &old_lock->owner, - old_lock->fd); - GF_VALIDATE_OR_GOTO (this->name, unlock_lock, out); - ret = 0; +gf_boolean_t +pl_is_lk_owner_valid(gf_lkowner_t *owner, client_t *client) +{ + if (client && (client->opversion < GD_OP_VERSION_7_0)) { + return _gf_true; + } + + if (is_lk_owner_null(owner)) { + return _gf_false; + } + return _gf_true; +} - __insert_and_merge (pl_inode, unlock_lock); +static int32_t +pl_inode_from_loc(loc_t *loc, inode_t **pinode) +{ + inode_t *inode = NULL; + int32_t error = 0; + + if (loc->inode != NULL) { + inode = inode_ref(loc->inode); + goto done; + } + + if (loc->parent == NULL) { + error = EINVAL; + goto done; + } + + if (!gf_uuid_is_null(loc->gfid)) { + inode = inode_find(loc->parent->table, loc->gfid); + if (inode != NULL) { + goto done; + } + } - __grant_blocked_locks (this, pl_inode, &granted_list); + if (loc->name == NULL) { + error = EINVAL; + goto done; + } - list_for_each_entry_safe (lock, tmp, &granted_list, list) { - list_del_init (&lock->list); + inode = inode_grep(loc->parent->table, loc->parent, loc->name); + if (inode == NULL) { + /* We haven't found any inode. This means that the file doesn't exist + * or that even if it exists, we don't have any knowledge about it, so + * we don't have locks on it either, which is fine for our purposes. 
*/ + goto done; + } - pl_trace_out (this, lock->frame, NULL, NULL, F_SETLKW, - &lock->user_flock, 0, 0, NULL); +done: + *pinode = inode; - STACK_UNWIND_STRICT (lk, lock->frame, 0, 0, - &lock->user_flock, NULL); + return error; +} + +static gf_boolean_t +pl_inode_has_owners(xlator_t *xl, client_t *client, pl_inode_t *pl_inode, + struct timespec *now, struct list_head *contend) +{ + pl_dom_list_t *dom; + pl_inode_lock_t *lock; + gf_boolean_t has_owners = _gf_false; - GF_FREE (lock); + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + list_for_each_entry(lock, &dom->inodelk_list, list) + { + /* If the lock belongs to the same client, we assume it's related + * to the same operation, so we allow the removal to continue. */ + if (lock->client == client) { + continue; + } + /* If the lock belongs to an internal process, we don't block the + * removal. */ + if (lock->client_pid < 0) { + continue; + } + if (contend == NULL) { + return _gf_true; + } + has_owners = _gf_true; + inodelk_contention_notify_check(xl, lock, now, contend); } + } -out: - return ret; + return has_owners; } -int -pl_setlk (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, - int can_block) +int32_t +pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc, + pl_inode_t **ppl_inode, struct list_head *contend) { - int ret = 0; + struct timespec now; + inode_t *inode; + pl_inode_t *pl_inode; + int32_t error; + + pl_inode = NULL; + + error = pl_inode_from_loc(loc, &inode); + if ((error != 0) || (inode == NULL)) { + goto done; + } + + pl_inode = pl_inode_get(xl, inode, NULL); + if (pl_inode == NULL) { + inode_unref(inode); + error = ENOMEM; + goto done; + } + + /* pl_inode_from_loc() already increments ref count for inode, so + * we only assign here our reference. */ + pl_inode->inode = inode; + + timespec_now(&now); + + pthread_mutex_lock(&pl_inode->mutex); + + if (pl_inode->removed) { + error = ESTALE; + goto unlock; + } + + if (pl_inode_has_owners(xl, frame->root->client, pl_inode, &now, contend)) { + error = -1; + /* We skip the unlock here because the caller must create a stub when + * we return -1 and do a call to pl_inode_remove_complete(), which + * assumes the lock is still acquired and will release it once + * everything else is prepared. */ + goto done; + } + + pl_inode->is_locked = _gf_true; + pl_inode->remove_running++; - errno = 0; +unlock: + pthread_mutex_unlock(&pl_inode->mutex); - pthread_mutex_lock (&pl_inode->mutex); - { - /* Send unlock before the actual lock to - prevent lock upgrade / downgrade - problems only if: - - it is a blocking call - - it has other conflicting locks - */ - - if (can_block && - !(__is_lock_grantable (pl_inode, lock))) { - ret = pl_send_prelock_unlock (this, pl_inode, - lock); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not send pre-lock " - "unlock"); - } +done: + *ppl_inode = pl_inode; - if (__is_lock_grantable (pl_inode, lock)) { - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => OK", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - __insert_and_merge (pl_inode, lock); - } else if (can_block) { - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => Blocked", - lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - lock->blocked = 1; - __insert_lock (pl_inode, lock); - ret = -1; - } else { - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => NOK", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - errno = EAGAIN; - ret = -1; - } + return error; +} + +int32_t +pl_inode_remove_complete(xlator_t *xl, pl_inode_t *pl_inode, call_stub_t *stub, + struct list_head *contend) +{ + pl_inode_lock_t *lock; + int32_t error = -1; + + if (stub != NULL) { + list_add_tail(&stub->list, &pl_inode->waiting); + pl_inode->is_locked = _gf_true; + } else { + error = ENOMEM; + + while (!list_empty(contend)) { + lock = list_first_entry(contend, pl_inode_lock_t, list); + list_del_init(&lock->list); + __pl_inodelk_unref(lock); } - pthread_mutex_unlock (&pl_inode->mutex); + } - grant_blocked_locks (this, pl_inode); + pthread_mutex_unlock(&pl_inode->mutex); - do_blocked_rw (pl_inode); + if (error < 0) { + inodelk_contention_notify(xl, contend); + } - return ret; + inode_unref(pl_inode->inode); + + return error; } +void +pl_inode_remove_wake(struct list_head *list) +{ + call_stub_t *stub; -posix_lock_t * -pl_getlk (pl_inode_t *pl_inode, posix_lock_t *lock) + while (!list_empty(list)) { + stub = list_first_entry(list, call_stub_t, list); + list_del_init(&stub->list); + + call_resume(stub); + } +} + +void +pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error) { - posix_lock_t *conf = NULL; + struct list_head contend, granted; + struct timespec now; + pl_dom_list_t *dom; + + if (pl_inode == NULL) { + return; + } - conf = first_conflicting_overlap (pl_inode, lock); + INIT_LIST_HEAD(&contend); + INIT_LIST_HEAD(&granted); + timespec_now(&now); - if (conf == NULL) { - lock->fl_type = F_UNLCK; - return lock; + pthread_mutex_lock(&pl_inode->mutex); + + if (error == 0) { + if (pl_inode->links >= 0) { + pl_inode->links--; + } + if (pl_inode->links == 0) { + pl_inode->removed = _gf_true; + } + } + + pl_inode->remove_running--; + + if ((pl_inode->remove_running == 0) && list_empty(&pl_inode->waiting)) { + pl_inode->is_locked = _gf_false; + + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + __grant_blocked_inode_locks(xl, pl_inode, &granted, dom, &now, + &contend); } + } + + pthread_mutex_unlock(&pl_inode->mutex); - return conf; + unwind_granted_inodes(xl, pl_inode, &granted); + + inodelk_contention_notify(xl, &contend); + + inode_unref(pl_inode->inode); +} + +void +pl_inode_remove_unlocked(xlator_t *xl, pl_inode_t *pl_inode, + struct list_head *list) +{ + call_stub_t *stub, *tmp; + + if (!pl_inode->is_locked) { + return; + } + + list_for_each_entry_safe(stub, tmp, &pl_inode->waiting, list) + { + if (!pl_inode_has_owners(xl, stub->frame->root->client, pl_inode, NULL, + NULL)) { + list_move_tail(&stub->list, list); + } + } } +/* This function determines if an inodelk attempt can be done now or it needs + * to wait. + * + * Possible return values: + * < 0: An error occurred. Currently only -ESTALE can be returned if the + * inode has been deleted previously by unlink/rmdir/rename + * = 0: The lock can be attempted. + * > 0: The lock needs to wait because a conflicting remove operation is + * ongoing. 
+ */ +int32_t +pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock) +{ + pl_dom_list_t *dom; + pl_inode_lock_t *ilock; + + /* If the inode has been deleted, we won't allow any lock. */ + if (pl_inode->removed) { + return -ESTALE; + } + + /* We only synchronize with locks made for regular operations coming from + * the user. Locks done for internal purposes are hard to control and could + * lead to long delays or deadlocks quite easily. */ + if (lock->client_pid < 0) { + return 0; + } + if (!pl_inode->is_locked) { + return 0; + } + if (pl_inode->remove_running > 0) { + return 1; + } + + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + list_for_each_entry(ilock, &dom->inodelk_list, list) + { + /* If a lock from the same client is already granted, we allow this + * one to continue. This is necessary to prevent deadlocks when + * multiple locks are taken for the same operation. + * + * On the other side it's unlikely that the same client sends + * completely unrelated locks for the same inode. + */ + if (ilock->client == lock->client) { + return 0; + } + } + } + + return 1; +} diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h index 5ec630ee857..281223bf3b8 100644 --- a/xlators/features/locks/src/common.h +++ b/xlators/features/locks/src/common.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2006-2012, 2015-2016 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -10,149 +10,253 @@ #ifndef __COMMON_H__ #define __COMMON_H__ -#include "lkowner.h" /*dump locks format strings */ -#define RANGE_FMT "type=%s, whence=%hd, start=%llu, len=%llu" -#define ENTRY_FMT "type=%s on basename=%s" -#define DUMP_GEN_FMT "pid = %llu, owner=%s, client=%p" -#define GRNTD_AT "granted at %s" -#define BLKD_AT "blocked at %s" -#define CONN_ID "connection-id=%s" -#define DUMP_BLKD_FMT DUMP_GEN_FMT", "CONN_ID", "BLKD_AT -#define DUMP_GRNTD_FMT DUMP_GEN_FMT", "CONN_ID", "GRNTD_AT -#define DUMP_BLKD_GRNTD_FMT DUMP_GEN_FMT", "CONN_ID", "BLKD_AT", "GRNTD_AT - -#define ENTRY_BLKD_FMT ENTRY_FMT", "DUMP_BLKD_FMT -#define ENTRY_GRNTD_FMT ENTRY_FMT", "DUMP_GRNTD_FMT -#define ENTRY_BLKD_GRNTD_FMT ENTRY_FMT", "DUMP_BLKD_GRNTD_FMT - -#define RANGE_BLKD_FMT RANGE_FMT", "DUMP_BLKD_FMT -#define RANGE_GRNTD_FMT RANGE_FMT", "DUMP_GRNTD_FMT -#define RANGE_BLKD_GRNTD_FMT RANGE_FMT", "DUMP_BLKD_GRNTD_FMT +#define RANGE_FMT "type=%s, whence=%hd, start=%llu, len=%llu" +#define ENTRY_FMT "type=%s on basename=%s" +#define DUMP_GEN_FMT "pid = %llu, owner=%s, client=%p" +#define GRNTD_AT "granted at %s" +#define BLKD_AT "blocked at %s" +#define CONN_ID "connection-id=%s" +#define DUMP_BLKD_FMT DUMP_GEN_FMT ", " CONN_ID ", " BLKD_AT +#define DUMP_GRNTD_FMT DUMP_GEN_FMT ", " CONN_ID ", " GRNTD_AT +#define DUMP_BLKD_GRNTD_FMT DUMP_GEN_FMT ", " CONN_ID ", " BLKD_AT ", " GRNTD_AT + +#define ENTRY_BLKD_FMT ENTRY_FMT ", " DUMP_BLKD_FMT +#define ENTRY_GRNTD_FMT ENTRY_FMT ", " DUMP_GRNTD_FMT +#define ENTRY_BLKD_GRNTD_FMT ENTRY_FMT ", " DUMP_BLKD_GRNTD_FMT + +#define RANGE_BLKD_FMT RANGE_FMT ", " DUMP_BLKD_FMT +#define RANGE_GRNTD_FMT RANGE_FMT ", " DUMP_GRNTD_FMT +#define RANGE_BLKD_GRNTD_FMT RANGE_FMT ", " DUMP_BLKD_GRNTD_FMT #define SET_FLOCK_PID(flock, lock) ((flock)->l_pid = lock->client_pid) +#define PL_STACK_UNWIND_AND_FREE(__local, fop, frame, op_ret, params...) 
\ + do { \ + frame->local = NULL; \ + STACK_UNWIND_STRICT(fop, frame, op_ret, params); \ + if (__local) { \ + if (__local->inodelk_dom_count_req) \ + data_unref(__local->inodelk_dom_count_req); \ + loc_wipe(&__local->loc[0]); \ + loc_wipe(&__local->loc[1]); \ + if (__local->fd) \ + fd_unref(__local->fd); \ + if (__local->inode) \ + inode_unref(__local->inode); \ + if (__local->xdata) { \ + dict_unref(__local->xdata); \ + __local->xdata = NULL; \ + } \ + mem_put(__local); \ + } \ + } while (0) posix_lock_t * -new_posix_lock (struct gf_flock *flock, client_t *client, pid_t client_pid, - gf_lkowner_t *owner, fd_t *fd); +new_posix_lock(struct gf_flock *flock, client_t *client, pid_t client_pid, + gf_lkowner_t *owner, fd_t *fd, uint32_t lk_flags, int blocking, + int32_t *op_errno); pl_inode_t * -pl_inode_get (xlator_t *this, inode_t *inode); +pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local); posix_lock_t * -pl_getlk (pl_inode_t *inode, posix_lock_t *lock); +pl_getlk(pl_inode_t *inode, posix_lock_t *lock); int -pl_setlk (xlator_t *this, pl_inode_t *inode, posix_lock_t *lock, - int can_block); +pl_setlk(xlator_t *this, pl_inode_t *inode, posix_lock_t *lock, int can_block); + +int +pl_lock_preempt(pl_inode_t *pl_inode, posix_lock_t *reqlock); void -grant_blocked_locks (xlator_t *this, pl_inode_t *inode); +grant_blocked_locks(xlator_t *this, pl_inode_t *inode); void -posix_lock_to_flock (posix_lock_t *lock, struct gf_flock *flock); +posix_lock_to_flock(posix_lock_t *lock, struct gf_flock *flock); int -locks_overlap (posix_lock_t *l1, posix_lock_t *l2); +locks_overlap(posix_lock_t *l1, posix_lock_t *l2); int -same_owner (posix_lock_t *l1, posix_lock_t *l2); +same_owner(posix_lock_t *l1, posix_lock_t *l2); -void __delete_lock (pl_inode_t *, posix_lock_t *); +void +__delete_lock(posix_lock_t *); -void __destroy_lock (posix_lock_t *); +void +__destroy_lock(posix_lock_t *); pl_dom_list_t * -get_domain (pl_inode_t *pl_inode, const char *volume); +get_domain(pl_inode_t *pl_inode, const char *volume); void -grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_dom_list_t *dom); +grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend); void -__delete_inode_lock (pl_inode_lock_t *lock); +inodelk_contention_notify(xlator_t *this, struct list_head *contend); void -__pl_inodelk_unref (pl_inode_lock_t *lock); +__delete_inode_lock(pl_inode_lock_t *lock); void -grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_dom_list_t *dom); +__pl_inodelk_unref(pl_inode_lock_t *lock); -void pl_update_refkeeper (xlator_t *this, inode_t *inode); +void +__grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted, pl_dom_list_t *dom, + struct timespec *now, struct list_head *contend); + +void +unwind_granted_inodes(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted); + +void +grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend); + +void +entrylk_contention_notify(xlator_t *this, struct list_head *contend); + +void +pl_update_refkeeper(xlator_t *this, inode_t *inode); int32_t -__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode, char *domname); +__get_inodelk_count(xlator_t *this, pl_inode_t *pl_inode, char *domname); int32_t -get_inodelk_count (xlator_t *this, inode_t *inode, char *domname); +get_inodelk_count(xlator_t *this, inode_t *inode, char *domname); int32_t 
-__get_entrylk_count (xlator_t *this, pl_inode_t *pl_inode); +__get_entrylk_count(xlator_t *this, pl_inode_t *pl_inode); int32_t -get_entrylk_count (xlator_t *this, inode_t *inode); +get_entrylk_count(xlator_t *this, inode_t *inode); -void pl_trace_in (xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, - int cmd, struct gf_flock *flock, const char *domain); +void +pl_trace_in(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, int cmd, + struct gf_flock *flock, const char *domain); -void pl_trace_out (xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, - int cmd, struct gf_flock *flock, int op_ret, int op_errno, const char *domain); +void +pl_trace_out(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, int cmd, + struct gf_flock *flock, int op_ret, int op_errno, + const char *domain); -void pl_trace_block (xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, - int cmd, struct gf_flock *flock, const char *domain); +void +pl_trace_block(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, + int cmd, struct gf_flock *flock, const char *domain); -void pl_trace_flush (xlator_t *this, call_frame_t *frame, fd_t *fd); +void +pl_trace_flush(xlator_t *this, call_frame_t *frame, fd_t *fd); -void entrylk_trace_in (xlator_t *this, call_frame_t *frame, const char *volume, - fd_t *fd, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type); +void +entrylk_trace_in(xlator_t *this, call_frame_t *frame, const char *volume, + fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type); -void entrylk_trace_out (xlator_t *this, call_frame_t *frame, const char *volume, - fd_t *fd, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, - int op_ret, int op_errno); +void +entrylk_trace_out(xlator_t *this, call_frame_t *frame, const char *volume, + fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, int op_ret, int op_errno); -void entrylk_trace_block (xlator_t *this, call_frame_t *frame, const char *volume, - fd_t *fd, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type); +void +entrylk_trace_block(xlator_t *this, call_frame_t *frame, const char *volume, + fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type); void -pl_print_verdict (char *str, int size, int op_ret, int op_errno); +pl_print_verdict(char *str, int size, int op_ret, int op_errno); void -pl_print_lockee (char *str, int size, fd_t *fd, loc_t *loc); +pl_print_lockee(char *str, int size, fd_t *fd, loc_t *loc); void -pl_print_locker (char *str, int size, xlator_t *this, call_frame_t *frame); +pl_print_locker(char *str, int size, xlator_t *this, call_frame_t *frame); void -pl_print_inodelk (char *str, int size, int cmd, struct gf_flock *flock, const char *domain); +pl_print_inodelk(char *str, int size, int cmd, struct gf_flock *flock, + const char *domain); void -pl_trace_release (xlator_t *this, fd_t *fd); +pl_trace_release(xlator_t *this, fd_t *fd); unsigned long -fd_to_fdnum (fd_t *fd); +fd_to_fdnum(fd_t *fd); fd_t * -fd_from_fdnum (posix_lock_t *lock); +fd_from_fdnum(posix_lock_t *lock); + +int +pl_reserve_setlk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, + int can_block); +int +reservelks_equal(posix_lock_t *l1, posix_lock_t *l2); int -pl_reserve_setlk (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, - int can_block); +pl_verify_reservelk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, + int can_block); int -reservelks_equal (posix_lock_t *l1, 
posix_lock_t *l2); +pl_reserve_unlock(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *reqlock); + +int32_t +check_entrylk_on_basename(xlator_t *this, inode_t *parent, char *basename); + +void +__pl_inodelk_unref(pl_inode_lock_t *lock); +void +__pl_entrylk_unref(pl_entry_lock_t *lock); int -pl_verify_reservelk (xlator_t *this, pl_inode_t *pl_inode, - posix_lock_t *lock, int can_block); +pl_metalock_is_active(pl_inode_t *pl_inode); + +void +__pl_queue_lock(pl_inode_t *pl_inode, posix_lock_t *reqlock); + +void +inodelk_contention_notify_check(xlator_t *xl, pl_inode_lock_t *lock, + struct timespec *now, + struct list_head *contend); + +void +entrylk_contention_notify_check(xlator_t *xl, pl_entry_lock_t *lock, + struct timespec *now, + struct list_head *contend); + +gf_boolean_t +pl_does_monkey_want_stuck_lock(); + +gf_boolean_t +pl_is_mandatory_locking_enabled(pl_inode_t *pl_inode); + +void +pl_clean_local(pl_local_t *local); + int -pl_reserve_unlock (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *reqlock); +pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd); + +gf_boolean_t +pl_is_lk_owner_valid(gf_lkowner_t *owner, client_t *client); + +int32_t +pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc, + pl_inode_t **ppl_inode, struct list_head *contend); + +int32_t +pl_inode_remove_complete(xlator_t *xl, pl_inode_t *pl_inode, call_stub_t *stub, + struct list_head *contend); -uint32_t -check_entrylk_on_basename (xlator_t *this, inode_t *parent, char *basename); +void +pl_inode_remove_wake(struct list_head *list); + +void +pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error); -void __pl_inodelk_unref (pl_inode_lock_t *lock); -void __pl_entrylk_unref (pl_entry_lock_t *lock); +void +pl_inode_remove_unlocked(xlator_t *xl, pl_inode_t *pl_inode, + struct list_head *list); + +int32_t +pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock); #endif /* __COMMON_H__ */ diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c index 8496d9d8dce..fd772c850dd 100644 --- a/xlators/features/locks/src/entrylk.c +++ b/xlators/features/locks/src/entrylk.c @@ -7,77 +7,77 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "compat.h" -#include "xlator.h" -#include "inode.h" -#include "logging.h" -#include "common-utils.h" -#include "list.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/xlator.h> +#include <glusterfs/logging.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/list.h> +#include <glusterfs/upcall-utils.h> #include "locks.h" +#include "clear.h" #include "common.h" - +#include "pl-messages.h" void -__pl_entrylk_unref (pl_entry_lock_t *lock) +__pl_entrylk_unref(pl_entry_lock_t *lock) { - lock->ref--; - if (!lock->ref) { - GF_FREE ((char *)lock->basename); - GF_FREE (lock->connection_id); - GF_FREE (lock); - } + lock->ref--; + if (!lock->ref) { + GF_FREE((char *)lock->basename); + GF_FREE(lock->connection_id); + GF_FREE(lock); + } } - static void -__pl_entrylk_ref (pl_entry_lock_t *lock) +__pl_entrylk_ref(pl_entry_lock_t *lock) { - lock->ref++; + lock->ref++; } - static pl_entry_lock_t * -new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type, - const char *domain, call_frame_t *frame, char *conn_id) +new_entrylk_lock(pl_inode_t *pinode, const char *basename, entrylk_type type, + const char *domain, call_frame_t *frame, char *conn_id, + int32_t *op_errno) { - pl_entry_lock_t *newlock = NULL; - - newlock = GF_CALLOC (1, sizeof (pl_entry_lock_t), - gf_locks_mt_pl_entry_lock_t); - if (!newlock) { - goto out; - } - - newlock->basename = basename ? gf_strdup (basename) : NULL; - newlock->type = type; - newlock->client = frame->root->client; - newlock->client_pid = frame->root->pid; - newlock->volume = domain; - newlock->owner = frame->root->lk_owner; - newlock->frame = frame; - newlock->this = frame->this; - - if (conn_id) { - newlock->connection_id = gf_strdup (conn_id); - } - - INIT_LIST_HEAD (&newlock->domain_list); - INIT_LIST_HEAD (&newlock->blocked_locks); - INIT_LIST_HEAD (&newlock->client_list); - - __pl_entrylk_ref (newlock); + pl_entry_lock_t *newlock = NULL; + + if (!pl_is_lk_owner_valid(&frame->root->lk_owner, frame->root->client)) { + *op_errno = EINVAL; + goto out; + } + + newlock = GF_CALLOC(1, sizeof(pl_entry_lock_t), + gf_locks_mt_pl_entry_lock_t); + if (!newlock) { + *op_errno = ENOMEM; + goto out; + } + + newlock->basename = basename ? gf_strdup(basename) : NULL; + newlock->type = type; + newlock->client = frame->root->client; + newlock->client_pid = frame->root->pid; + newlock->volume = domain; + newlock->owner = frame->root->lk_owner; + newlock->frame = frame; + newlock->this = frame->this; + + if (conn_id) { + newlock->connection_id = gf_strdup(conn_id); + } + + INIT_LIST_HEAD(&newlock->domain_list); + INIT_LIST_HEAD(&newlock->blocked_locks); + INIT_LIST_HEAD(&newlock->client_list); + + __pl_entrylk_ref(newlock); out: - return newlock; + return newlock; } - /** * all_names - does a basename represent all names? 
* @basename: name to check @@ -92,20 +92,221 @@ out: */ static int -names_conflict (const char *n1, const char *n2) +names_conflict(const char *n1, const char *n2) +{ + return all_names(n1) || all_names(n2) || !strcmp(n1, n2); +} + +static int +__same_entrylk_owner(pl_entry_lock_t *l1, pl_entry_lock_t *l2) +{ + return (is_same_lkowner(&l1->owner, &l2->owner) && + (l1->client == l2->client)); +} + +/* Just as in inodelk, allow conflicting name locks from same (lk_owner, conn)*/ +static int +__conflicting_entrylks(pl_entry_lock_t *l1, pl_entry_lock_t *l2) +{ + if (names_conflict(l1->basename, l2->basename) && + !__same_entrylk_owner(l1, l2)) + return 1; + + return 0; +} + +/* See comments in inodelk.c for details */ +static inline gf_boolean_t +__stale_entrylk(xlator_t *this, pl_entry_lock_t *candidate_lock, + pl_entry_lock_t *requested_lock, time_t *lock_age_sec) { - return all_names (n1) || all_names (n2) || !strcmp (n1, n2); + posix_locks_private_t *priv = NULL; + + priv = this->private; + + /* Question: Should we just prune them all given the + * chance? Or just the locks we are attempting to acquire? + */ + if (names_conflict(candidate_lock->basename, requested_lock->basename)) { + *lock_age_sec = gf_time() - candidate_lock->granted_time; + if (*lock_age_sec > priv->revocation_secs) + return _gf_true; + } + return _gf_false; } +/* See comments in inodelk.c for details */ +static gf_boolean_t +__entrylk_prune_stale(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, + pl_entry_lock_t *lock) +{ + posix_locks_private_t *priv = NULL; + pl_entry_lock_t *tmp = NULL; + pl_entry_lock_t *lk = NULL; + gf_boolean_t revoke_lock = _gf_false; + int bcount = 0; + int gcount = 0; + int op_errno = 0; + clrlk_args args; + args.opts = NULL; + time_t lk_age_sec = 0; + uint32_t max_blocked = 0; + char *reason_str = NULL; + + priv = this->private; + args.type = CLRLK_ENTRY; + if (priv->revocation_clear_all == _gf_true) + args.kind = CLRLK_ALL; + else + args.kind = CLRLK_GRANTED; + + if (list_empty(&dom->entrylk_list)) + goto out; + + pthread_mutex_lock(&pinode->mutex); + lock->pinode = pinode; + list_for_each_entry_safe(lk, tmp, &dom->entrylk_list, domain_list) + { + if (__stale_entrylk(this, lk, lock, &lk_age_sec) == _gf_true) { + revoke_lock = _gf_true; + reason_str = "age"; + break; + } + } + max_blocked = priv->revocation_max_blocked; + if (max_blocked != 0 && revoke_lock == _gf_false) { + list_for_each_entry_safe(lk, tmp, &dom->blocked_entrylks, blocked_locks) + { + max_blocked--; + if (max_blocked == 0) { + revoke_lock = _gf_true; + reason_str = "max blocked"; + break; + } + } + } + pthread_mutex_unlock(&pinode->mutex); + +out: + if (revoke_lock == _gf_true) { + clrlk_clear_entrylk(this, pinode, dom, &args, &bcount, &gcount, + &op_errno); + gf_log(this->name, GF_LOG_WARNING, + "Lock revocation [reason: %s; gfid: %s; domain: %s; " + "age: %ld sec] - Entry lock revoked: %d granted & %d " + "blocked locks cleared", + reason_str, uuid_utoa(pinode->gfid), dom->domain, lk_age_sec, + gcount, bcount); + } + + return revoke_lock; +} -static inline int -__same_entrylk_owner (pl_entry_lock_t *l1, pl_entry_lock_t *l2) +void +entrylk_contention_notify_check(xlator_t *this, pl_entry_lock_t *lock, + struct timespec *now, struct list_head *contend) { + posix_locks_private_t *priv; + int64_t elapsed; + + priv = this->private; + + /* If this lock is in a list, it means that we are about to send a + * notification for it, so no need to do anything else. 
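Entry locks conflict by name rather than by byte range: a NULL basename stands for "all names" and therefore collides with every other entrylk on the directory, two non-NULL basenames collide only when they are identical, and, as the comment on __conflicting_entrylks() notes, locks from the same (lk-owner, connection) never conflict with each other. The name check alone, as a standalone sketch (illustrative helper, not the xlator's names_conflict()):

#include <stddef.h>
#include <string.h>

/* NULL means "all names", i.e. a lock covering the whole directory */
static int entry_names_collide(const char *a, const char *b)
{
    return (a == NULL) || (b == NULL) || (strcmp(a, b) == 0);
}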
*/ + if (!list_empty(&lock->contend)) { + return; + } + + elapsed = now->tv_sec; + elapsed -= lock->contention_time.tv_sec; + if (now->tv_nsec < lock->contention_time.tv_nsec) { + elapsed--; + } + if (elapsed < priv->notify_contention_delay) { + return; + } - return (is_same_lkowner (&l1->owner, &l2->owner) && - (l1->client == l2->client)); + /* All contention notifications will be sent outside of the locked + * region. This means that currently granted locks might have already + * been unlocked by that time. To avoid the lock or the inode to be + * destroyed before we process them, we take an additional reference + * on both. */ + inode_ref(lock->pinode->inode); + __pl_entrylk_ref(lock); + + lock->contention_time = *now; + + list_add_tail(&lock->contend, contend); } +void +entrylk_contention_notify(xlator_t *this, struct list_head *contend) +{ + struct gf_upcall up; + struct gf_upcall_entrylk_contention lc; + pl_entry_lock_t *lock; + pl_inode_t *pl_inode; + client_t *client; + gf_boolean_t notify; + + while (!list_empty(contend)) { + lock = list_first_entry(contend, pl_entry_lock_t, contend); + + pl_inode = lock->pinode; + + pthread_mutex_lock(&pl_inode->mutex); + + /* If the lock has already been released, no notification is + * sent. We clear the notification time in this case. */ + notify = !list_empty(&lock->domain_list); + if (!notify) { + lock->contention_time.tv_sec = 0; + lock->contention_time.tv_nsec = 0; + } else { + lc.type = lock->type; + lc.name = lock->basename; + lc.pid = lock->client_pid; + lc.domain = lock->volume; + lc.xdata = NULL; + + gf_uuid_copy(up.gfid, lock->pinode->gfid); + client = (client_t *)lock->client; + if (client == NULL) { + /* A NULL client can be found if the entrylk + * was issued by a server side xlator. */ + up.client_uid = NULL; + } else { + up.client_uid = client->client_uid; + } + } + + pthread_mutex_unlock(&pl_inode->mutex); + + if (notify) { + up.event_type = GF_UPCALL_ENTRYLK_CONTENTION; + up.data = &lc; + + if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) { + gf_msg_debug(this->name, 0, + "Entrylk contention notification " + "failed"); + } else { + gf_msg_debug(this->name, 0, + "Entrylk contention notification " + "sent"); + } + } + + pthread_mutex_lock(&pl_inode->mutex); + + list_del_init(&lock->contend); + __pl_entrylk_unref(lock); + + pthread_mutex_unlock(&pl_inode->mutex); + + inode_unref(pl_inode->inode); + } +} /** * entrylk_grantable - is this lock grantable? 
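The contention check above throttles upcall notifications per lock: a new GF_UPCALL_ENTRYLK_CONTENTION event is queued only when at least priv->notify_contention_delay seconds have passed since the previous one, using whole-second arithmetic that borrows one second when the nanosecond field has not yet caught up. The elapsed-time computation in isolation:

#include <stdint.h>
#include <time.h>

/* whole seconds elapsed between 'then' and 'now' */
static int64_t elapsed_secs(const struct timespec *then,
                            const struct timespec *now)
{
    int64_t elapsed = now->tv_sec - then->tv_sec;

    if (now->tv_nsec < then->tv_nsec)
        elapsed--;   /* the last second is not complete yet */

    return elapsed;
}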
@@ -114,184 +315,188 @@ __same_entrylk_owner (pl_entry_lock_t *l1, pl_entry_lock_t *l2) * @type: type of lock */ static pl_entry_lock_t * -__entrylk_grantable (pl_dom_list_t *dom, pl_entry_lock_t *lock) +__entrylk_grantable(xlator_t *this, pl_dom_list_t *dom, pl_entry_lock_t *lock, + struct timespec *now, struct list_head *contend) { - pl_entry_lock_t *tmp = NULL; - - if (list_empty (&dom->entrylk_list)) - return NULL; - - list_for_each_entry (tmp, &dom->entrylk_list, domain_list) { - if (names_conflict (tmp->basename, lock->basename)) - return tmp; + pl_entry_lock_t *tmp = NULL; + pl_entry_lock_t *ret = NULL; + + list_for_each_entry(tmp, &dom->entrylk_list, domain_list) + { + if (__conflicting_entrylks(tmp, lock)) { + if (ret == NULL) { + ret = tmp; + if (contend == NULL) { + break; + } + } + entrylk_contention_notify_check(this, tmp, now, contend); } + } - return NULL; + return ret; } static pl_entry_lock_t * -__blocked_entrylk_conflict (pl_dom_list_t *dom, pl_entry_lock_t *lock) +__blocked_entrylk_conflict(pl_dom_list_t *dom, pl_entry_lock_t *lock) { - pl_entry_lock_t *tmp = NULL; - - if (list_empty (&dom->blocked_entrylks)) - return NULL; + pl_entry_lock_t *tmp = NULL; - list_for_each_entry (tmp, &dom->blocked_entrylks, blocked_locks) { - if (names_conflict (tmp->basename, lock->basename)) - return lock; - } + list_for_each_entry(tmp, &dom->blocked_entrylks, blocked_locks) + { + if (names_conflict(tmp->basename, lock->basename)) + return lock; + } - return NULL; + return NULL; } static int -__owner_has_lock (pl_dom_list_t *dom, pl_entry_lock_t *newlock) +__owner_has_lock(pl_dom_list_t *dom, pl_entry_lock_t *newlock) { - pl_entry_lock_t *lock = NULL; + pl_entry_lock_t *lock = NULL; - list_for_each_entry (lock, &dom->entrylk_list, domain_list) { - if (__same_entrylk_owner (lock, newlock)) - return 1; - } + list_for_each_entry(lock, &dom->entrylk_list, domain_list) + { + if (__same_entrylk_owner(lock, newlock)) + return 1; + } - list_for_each_entry (lock, &dom->blocked_entrylks, blocked_locks) { - if (__same_entrylk_owner (lock, newlock)) - return 1; - } + list_for_each_entry(lock, &dom->blocked_entrylks, blocked_locks) + { + if (__same_entrylk_owner(lock, newlock)) + return 1; + } - return 0; + return 0; } static int -names_equal (const char *n1, const char *n2) +names_equal(const char *n1, const char *n2) { - return (n1 == NULL && n2 == NULL) || (n1 && n2 && !strcmp (n1, n2)); + return (n1 == NULL && n2 == NULL) || (n1 && n2 && !strcmp(n1, n2)); } void -pl_print_entrylk (char *str, int size, entrylk_cmd cmd, entrylk_type type, - const char *basename, const char *domain) +pl_print_entrylk(char *str, int size, entrylk_cmd cmd, entrylk_type type, + const char *basename, const char *domain) { - char *cmd_str = NULL; - char *type_str = NULL; + char *cmd_str = NULL; + char *type_str = NULL; - switch (cmd) { + switch (cmd) { case ENTRYLK_LOCK: - cmd_str = "LOCK"; - break; + cmd_str = "LOCK"; + break; case ENTRYLK_LOCK_NB: - cmd_str = "LOCK_NB"; - break; + cmd_str = "LOCK_NB"; + break; case ENTRYLK_UNLOCK: - cmd_str = "UNLOCK"; - break; + cmd_str = "UNLOCK"; + break; default: - cmd_str = "UNKNOWN"; - break; - } + cmd_str = "UNKNOWN"; + break; + } - switch (type) { + switch (type) { case ENTRYLK_RDLCK: - type_str = "READ"; - break; + type_str = "READ"; + break; case ENTRYLK_WRLCK: - type_str = "WRITE"; - break; + type_str = "WRITE"; + break; default: - type_str = "UNKNOWN"; - break; - } + type_str = "UNKNOWN"; + break; + } - snprintf (str, size, "lock=ENTRYLK, cmd=%s, type=%s, basename=%s, 
domain: %s", - cmd_str, type_str, basename, domain); + snprintf(str, size, + "lock=ENTRYLK, cmd=%s, type=%s, basename=%s, domain: %s", cmd_str, + type_str, basename, domain); } - void -entrylk_trace_in (xlator_t *this, call_frame_t *frame, const char *domain, - fd_t *fd, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) +entrylk_trace_in(xlator_t *this, call_frame_t *frame, const char *domain, + fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type) { - posix_locks_private_t *priv = NULL; - char pl_locker[256]; - char pl_lockee[256]; - char pl_entrylk[256]; + posix_locks_private_t *priv = NULL; + char pl_locker[256]; + char pl_lockee[256]; + char pl_entrylk[256]; - priv = this->private; + priv = this->private; - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_print_locker (pl_locker, 256, this, frame); - pl_print_lockee (pl_lockee, 256, fd, loc); - pl_print_entrylk (pl_entrylk, 256, cmd, type, basename, domain); + pl_print_locker(pl_locker, 256, this, frame); + pl_print_lockee(pl_lockee, 256, fd, loc); + pl_print_entrylk(pl_entrylk, 256, cmd, type, basename, domain); - gf_log (this->name, GF_LOG_INFO, - "[REQUEST] Locker = {%s} Lockee = {%s} Lock = {%s}", - pl_locker, pl_lockee, pl_entrylk); + gf_log(this->name, GF_LOG_INFO, + "[REQUEST] Locker = {%s} Lockee = {%s} Lock = {%s}", pl_locker, + pl_lockee, pl_entrylk); } - void -entrylk_trace_out (xlator_t *this, call_frame_t *frame, const char *domain, - fd_t *fd, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, int op_ret, int op_errno) +entrylk_trace_out(xlator_t *this, call_frame_t *frame, const char *domain, + fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, int op_ret, int op_errno) { - posix_locks_private_t *priv = NULL; - char pl_locker[256]; - char pl_lockee[256]; - char pl_entrylk[256]; - char verdict[32]; + posix_locks_private_t *priv = NULL; + char pl_locker[256]; + char pl_lockee[256]; + char pl_entrylk[256]; + char verdict[32]; - priv = this->private; + priv = this->private; - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_print_locker (pl_locker, 256, this, frame); - pl_print_lockee (pl_lockee, 256, fd, loc); - pl_print_entrylk (pl_entrylk, 256, cmd, type, basename, domain); - pl_print_verdict (verdict, 32, op_ret, op_errno); + pl_print_locker(pl_locker, 256, this, frame); + pl_print_lockee(pl_lockee, 256, fd, loc); + pl_print_entrylk(pl_entrylk, 256, cmd, type, basename, domain); + pl_print_verdict(verdict, 32, op_ret, op_errno); - gf_log (this->name, GF_LOG_INFO, - "[%s] Locker = {%s} Lockee = {%s} Lock = {%s}", - verdict, pl_locker, pl_lockee, pl_entrylk); + gf_log(this->name, GF_LOG_INFO, + "[%s] Locker = {%s} Lockee = {%s} Lock = {%s}", verdict, pl_locker, + pl_lockee, pl_entrylk); } - void -entrylk_trace_block (xlator_t *this, call_frame_t *frame, const char *volume, - fd_t *fd, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) +entrylk_trace_block(xlator_t *this, call_frame_t *frame, const char *volume, + fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type) { - posix_locks_private_t *priv = NULL; - char pl_locker[256]; - char pl_lockee[256]; - char pl_entrylk[256]; + posix_locks_private_t *priv = NULL; + char pl_locker[256]; + char pl_lockee[256]; + char pl_entrylk[256]; - priv = this->private; + priv = this->private; - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_print_locker (pl_locker, 256, this, frame); - 
pl_print_lockee (pl_lockee, 256, fd, loc); - pl_print_entrylk (pl_entrylk, 256, cmd, type, basename, volume); + pl_print_locker(pl_locker, 256, this, frame); + pl_print_lockee(pl_lockee, 256, fd, loc); + pl_print_entrylk(pl_entrylk, 256, cmd, type, basename, volume); - gf_log (this->name, GF_LOG_INFO, - "[BLOCKED] Locker = {%s} Lockee = {%s} Lock = {%s}", - pl_locker, pl_lockee, pl_entrylk); + gf_log(this->name, GF_LOG_INFO, + "[BLOCKED] Locker = {%s} Lockee = {%s} Lock = {%s}", pl_locker, + pl_lockee, pl_entrylk); } /** - * __find_most_matching_lock - find the lock struct which most matches in order of: - * lock on the exact basename || - * an all_names lock + * __find_most_matching_lock - find the lock struct which most matches in order + * of: lock on the exact basename || an all_names lock * * * @inode: inode in which to look @@ -299,23 +504,57 @@ entrylk_trace_block (xlator_t *this, call_frame_t *frame, const char *volume, */ static pl_entry_lock_t * -__find_most_matching_lock (pl_dom_list_t *dom, const char *basename) +__find_most_matching_lock(pl_dom_list_t *dom, const char *basename) { - pl_entry_lock_t *lock; - pl_entry_lock_t *all = NULL; - pl_entry_lock_t *exact = NULL; + pl_entry_lock_t *lock; + pl_entry_lock_t *all = NULL; + pl_entry_lock_t *exact = NULL; - if (list_empty (&dom->entrylk_list)) - return NULL; + if (list_empty(&dom->entrylk_list)) + return NULL; - list_for_each_entry (lock, &dom->entrylk_list, domain_list) { - if (all_names (lock->basename)) - all = lock; - else if (names_equal (lock->basename, basename)) - exact = lock; - } + list_for_each_entry(lock, &dom->entrylk_list, domain_list) + { + if (all_names(lock->basename)) + all = lock; + else if (names_equal(lock->basename, basename)) + exact = lock; + } + + return (exact ? exact : all); +} + +static pl_entry_lock_t * +__find_matching_lock(pl_dom_list_t *dom, pl_entry_lock_t *lock) +{ + pl_entry_lock_t *tmp = NULL; + + list_for_each_entry(tmp, &dom->entrylk_list, domain_list) + { + if (names_equal(lock->basename, tmp->basename) && + __same_entrylk_owner(lock, tmp) && (lock->type == tmp->type)) + return tmp; + } + return NULL; +} + +static int +__lock_blocked_add(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, + pl_entry_lock_t *lock, int nonblock) +{ + if (nonblock) + goto out; + + lock->blkd_time = gf_time(); + list_add_tail(&lock->blocked_locks, &dom->blocked_entrylks); + + gf_msg_trace(this->name, 0, "Blocking lock: {pinode=%p, basename=%s}", + pinode, lock->basename); - return (exact ? 
exact : all); + entrylk_trace_block(this, lock->frame, NULL, NULL, NULL, lock->basename, + ENTRYLK_LOCK, lock->type); +out: + return -EAGAIN; } /** @@ -330,52 +569,49 @@ __find_most_matching_lock (pl_dom_list_t *dom, const char *basename) */ int -__lock_entrylk (xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock, - int nonblock, pl_dom_list_t *dom) +__lock_entrylk(xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock, + int nonblock, pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend) { - pl_entry_lock_t *conf = NULL; - int ret = -EAGAIN; - - conf = __entrylk_grantable (dom, lock); - if (conf) { - ret = -EAGAIN; - if (nonblock) - goto out; - - gettimeofday (&lock->blkd_time, NULL); - list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks); - - gf_log (this->name, GF_LOG_TRACE, - "Blocking lock: {pinode=%p, basename=%s}", - pinode, lock->basename); - - goto out; + pl_entry_lock_t *conf = NULL; + int ret = -EAGAIN; + + conf = __entrylk_grantable(this, dom, lock, now, contend); + if (conf) { + ret = __lock_blocked_add(this, pinode, dom, lock, nonblock); + goto out; + } + + /* To prevent blocked locks starvation, check if there are any blocked + * locks thay may conflict with this lock. If there is then don't grant + * the lock. BUT grant the lock if the owner already has lock to allow + * nested locks. + * Example: SHD from Machine1 takes (gfid, basename=257-length-name) + * and is granted. + * SHD from machine2 takes (gfid, basename=NULL) and is blocked. + * When SHD from Machine1 takes (gfid, basename=NULL) it needs to be + * granted, without which self-heal can't progress. + * TODO: Find why 'owner_has_lock' is checked even for blocked locks. + */ + if (__blocked_entrylk_conflict(dom, lock) && + !(__owner_has_lock(dom, lock))) { + if (nonblock == 0) { + gf_log(this->name, GF_LOG_DEBUG, + "Lock is grantable, but blocking to prevent " + "starvation"); } - if (__blocked_entrylk_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) { - ret = -EAGAIN; - if (nonblock) - goto out; + ret = __lock_blocked_add(this, pinode, dom, lock, nonblock); + goto out; + } - gettimeofday (&lock->blkd_time, NULL); - list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks); + __pl_entrylk_ref(lock); + lock->granted_time = gf_time(); + list_add(&lock->domain_list, &dom->entrylk_list); - gf_log (this->name, GF_LOG_DEBUG, - "Lock is grantable, but blocking to prevent starvation"); - gf_log (this->name, GF_LOG_TRACE, - "Blocking lock: {pinode=%p, basename=%s}", - pinode, lock->basename); - - goto out; - } - - __pl_entrylk_ref (lock); - gettimeofday (&lock->granted_time, NULL); - list_add (&lock->domain_list, &dom->entrylk_list); - - ret = 0; + ret = 0; out: - return ret; + return ret; } /** @@ -386,262 +622,322 @@ out: */ pl_entry_lock_t * -__unlock_entrylk (pl_dom_list_t *dom, pl_entry_lock_t *lock) +__unlock_entrylk(pl_dom_list_t *dom, pl_entry_lock_t *lock) { - pl_entry_lock_t *tmp = NULL; - pl_entry_lock_t *ret_lock = NULL; - - tmp = __find_most_matching_lock (dom, lock->basename); - - if (!tmp) { - gf_log ("locks", GF_LOG_ERROR, - "unlock on %s (type=ENTRYLK_WRLCK) attempted but no matching lock found", - lock->basename); - goto out; - } - - if (names_equal (tmp->basename, lock->basename) - && tmp->type == lock->type) { + pl_entry_lock_t *ret_lock = NULL; - list_del_init (&tmp->domain_list); - ret_lock = tmp; + ret_lock = __find_matching_lock(dom, lock); - } else { - gf_log ("locks", GF_LOG_ERROR, - "Unlock on %s for a non-existing lock!", lock->basename); - goto out; - 
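__lock_entrylk() grants or blocks in a fixed order: a conflict with an already granted lock always blocks (or fails with EAGAIN for non-blocking calls); otherwise a conflict with an already blocked lock also blocks, unless the requesting owner already holds a lock in the domain, which keeps nested locks (such as the self-heal daemon re-locking the same gfid with a NULL basename) from deadlocking; only then is the lock granted. The same decision order, condensed into one illustrative function built on this xlator's own helpers (error handling and tracing omitted):

static int entrylk_try_lock(xlator_t *this, pl_inode_t *pinode,
                            pl_dom_list_t *dom, pl_entry_lock_t *lock,
                            int nonblock, struct timespec *now,
                            struct list_head *contend)
{
    /* 1. conflicting granted lock: block (or -EAGAIN if non-blocking) */
    if (__entrylk_grantable(this, dom, lock, now, contend))
        return __lock_blocked_add(this, pinode, dom, lock, nonblock);

    /* 2. starvation prevention: queue behind older blocked locks unless the
       owner already holds a lock (nested locking must still succeed) */
    if (__blocked_entrylk_conflict(dom, lock) && !__owner_has_lock(dom, lock))
        return __lock_blocked_add(this, pinode, dom, lock, nonblock);

    /* 3. grant */
    __pl_entrylk_ref(lock);
    lock->granted_time = gf_time();
    list_add(&lock->domain_list, &dom->entrylk_list);

    return 0;
}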
} + if (ret_lock) { + list_del_init(&ret_lock->domain_list); + } else { + gf_log("locks", GF_LOG_ERROR, + "unlock on %s " + "(type=ENTRYLK_WRLCK) attempted but no matching lock " + "found", + lock->basename); + } -out: - return ret_lock; + return ret_lock; } -uint32_t -check_entrylk_on_basename (xlator_t *this, inode_t *parent, char *basename) +int32_t +check_entrylk_on_basename(xlator_t *this, inode_t *parent, char *basename) { - uint32_t entrylk = 0; - pl_inode_t *pinode = 0; - pl_dom_list_t *dom = NULL; - pl_entry_lock_t *conf = NULL; - - pinode = pl_inode_get (this, parent); - if (!pinode) - goto out; - pthread_mutex_lock (&pinode->mutex); + int32_t entrylk = 0; + pl_dom_list_t *dom = NULL; + pl_entry_lock_t *conf = NULL; + + pl_inode_t *pinode = pl_inode_get(this, parent, NULL); + if (!pinode) + goto out; + pthread_mutex_lock(&pinode->mutex); + { + list_for_each_entry(dom, &pinode->dom_list, inode_list) { - list_for_each_entry (dom, &pinode->dom_list, inode_list) { - conf = __find_most_matching_lock (dom, basename); - if (conf && conf->basename) { - entrylk = 1; - break; - } - } + conf = __find_most_matching_lock(dom, basename); + if (conf && conf->basename) { + entrylk = 1; + break; + } } - pthread_mutex_unlock (&pinode->mutex); + } + pthread_mutex_unlock(&pinode->mutex); out: - return entrylk; + return entrylk; } void -__grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_dom_list_t *dom, struct list_head *granted) +__grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom, struct list_head *granted, + struct timespec *now, struct list_head *contend) { - int bl_ret = 0; - pl_entry_lock_t *bl = NULL; - pl_entry_lock_t *tmp = NULL; - - struct list_head blocked_list; + int bl_ret = 0; + pl_entry_lock_t *bl = NULL; + pl_entry_lock_t *tmp = NULL; - INIT_LIST_HEAD (&blocked_list); - list_splice_init (&dom->blocked_entrylks, &blocked_list); + struct list_head blocked_list; - list_for_each_entry_safe (bl, tmp, &blocked_list, blocked_locks) { + INIT_LIST_HEAD(&blocked_list); + list_splice_init(&dom->blocked_entrylks, &blocked_list); - list_del_init (&bl->blocked_locks); + list_for_each_entry_safe(bl, tmp, &blocked_list, blocked_locks) + { + list_del_init(&bl->blocked_locks); - bl_ret = __lock_entrylk (bl->this, pl_inode, bl, 0, dom); + bl_ret = __lock_entrylk(bl->this, pl_inode, bl, 0, dom, now, contend); - if (bl_ret == 0) { - list_add (&bl->blocked_locks, granted); - } + if (bl_ret == 0) { + list_add_tail(&bl->blocked_locks, granted); } - return; + } } /* Grants locks if possible which are blocked on a lock */ void -grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_dom_list_t *dom) +grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend) { - struct list_head granted_list; - pl_entry_lock_t *tmp = NULL; - pl_entry_lock_t *lock = NULL; - - INIT_LIST_HEAD (&granted_list); - - pthread_mutex_lock (&pl_inode->mutex); + struct list_head granted_list; + pl_entry_lock_t *tmp = NULL; + pl_entry_lock_t *lock = NULL; + + INIT_LIST_HEAD(&granted_list); + + pthread_mutex_lock(&pl_inode->mutex); + { + __grant_blocked_entry_locks(this, pl_inode, dom, &granted_list, now, + contend); + } + pthread_mutex_unlock(&pl_inode->mutex); + + list_for_each_entry_safe(lock, tmp, &granted_list, blocked_locks) + { + entrylk_trace_out(this, lock->frame, NULL, NULL, NULL, lock->basename, + ENTRYLK_LOCK, lock->type, 0, 0); + + STACK_UNWIND_STRICT(entrylk, lock->frame, 0, 0, 
NULL); + lock->frame = NULL; + } + + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(lock, tmp, &granted_list, blocked_locks) { - __grant_blocked_entry_locks (this, pl_inode, dom, - &granted_list); + list_del_init(&lock->blocked_locks); + __pl_entrylk_unref(lock); } - pthread_mutex_unlock (&pl_inode->mutex); - - list_for_each_entry_safe (lock, tmp, &granted_list, blocked_locks) { - entrylk_trace_out (this, lock->frame, NULL, NULL, NULL, - lock->basename, ENTRYLK_LOCK, lock->type, - 0, 0); - - STACK_UNWIND_STRICT (entrylk, lock->frame, 0, 0, NULL); - lock->frame = NULL; - } - - pthread_mutex_lock (&pl_inode->mutex); - { - list_for_each_entry_safe (lock, tmp, &granted_list, blocked_locks) { - list_del_init (&lock->blocked_locks); - __pl_entrylk_unref (lock); - } - } - pthread_mutex_unlock (&pl_inode->mutex); - - return; + } + pthread_mutex_unlock(&pl_inode->mutex); } - /* Common entrylk code called by pl_entrylk and pl_fentrylk */ int -pl_common_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, inode_t *inode, const char *basename, - entrylk_cmd cmd, entrylk_type type, loc_t *loc, fd_t *fd, - dict_t *xdata) - -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int ret = -1; - char unwind = 1; - GF_UNUSED int dict_ret = -1; - pl_inode_t *pinode = NULL; - pl_entry_lock_t *reqlock = NULL; - pl_entry_lock_t *unlocked = NULL; - pl_dom_list_t *dom = NULL; - char *conn_id = NULL; - pl_ctx_t *ctx = NULL; - int nonblock = 0; - - if (xdata) - dict_ret = dict_get_str (xdata, "connection-id", &conn_id); - - pinode = pl_inode_get (this, inode); - if (!pinode) { - op_errno = ENOMEM; - goto out; - } +pl_common_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, + inode_t *inode, const char *basename, entrylk_cmd cmd, + entrylk_type type, loc_t *loc, fd_t *fd, dict_t *xdata) - if (frame->root->client) { - ctx = pl_ctx_get (frame->root->client, this); - if (!ctx) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed"); - goto unwind; - } - } - - dom = get_domain (pinode, volume); - if (!dom){ - op_errno = ENOMEM; +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int ret = -1; + char unwind = 1; + GF_UNUSED int dict_ret = -1; + pl_inode_t *pinode = NULL; + pl_entry_lock_t *reqlock = NULL; + pl_entry_lock_t *unlocked = NULL; + pl_dom_list_t *dom = NULL; + char *conn_id = NULL; + pl_ctx_t *ctx = NULL; + int nonblock = 0; + gf_boolean_t need_inode_unref = _gf_false; + posix_locks_private_t *priv = NULL; + struct list_head *pcontend = NULL; + struct list_head contend; + struct timespec now = {}; + + priv = this->private; + + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD(pcontend); + timespec_now(&now); + } + + if (xdata) + dict_ret = dict_get_str(xdata, "connection-id", &conn_id); + + pinode = pl_inode_get(this, inode, NULL); + if (!pinode) { + op_errno = ENOMEM; + goto out; + } + + if (frame->root->client) { + ctx = pl_ctx_get(frame->root->client, this); + if (!ctx) { + op_errno = ENOMEM; + gf_log(this->name, GF_LOG_INFO, "pl_ctx_get() failed"); + goto unwind; + } + } + + dom = get_domain(pinode, volume); + if (!dom) { + op_errno = ENOMEM; + goto out; + } + + entrylk_trace_in(this, frame, volume, fd, loc, basename, cmd, type); + + reqlock = new_entrylk_lock(pinode, basename, type, dom->domain, frame, + conn_id, &op_errno); + if (!reqlock) { + op_ret = -1; + goto unwind; + } + + /* Ideally, AFTER a successful lock (both blocking and non-blocking) or + * an unsuccessful blocking lock operation, the inode needs 
to be ref'd. + * + * But doing so might give room to a race where the lock-requesting + * client could send a DISCONNECT just before this thread refs the inode + * after the locking is done, and the epoll thread could unref the inode + * in cleanup which means the inode's refcount would come down to 0, and + * the call to pl_forget() at this point destroys @pinode. Now when + * the io-thread executing this function tries to access pinode, + * it could crash on account of illegal memory access. + * + * To get around this problem, the inode is ref'd once even before + * adding the lock into client_list as a precautionary measure. + * This way even if there are DISCONNECTs, there will always be 1 extra + * ref on the inode, so @pinode is still alive until after the + * current stack unwinds. + */ + pinode->inode = inode_ref(inode); + if (priv->revocation_secs != 0) { + if (cmd != ENTRYLK_UNLOCK) { + __entrylk_prune_stale(this, pinode, dom, reqlock); + } else if (priv->monkey_unlocking == _gf_true) { + if (pl_does_monkey_want_stuck_lock()) { + gf_log(this->name, GF_LOG_WARNING, + "MONKEY LOCKING (forcing stuck lock)!"); + op_ret = 0; + need_inode_unref = _gf_true; + pthread_mutex_lock(&pinode->mutex); + { + __pl_entrylk_unref(reqlock); + } + pthread_mutex_unlock(&pinode->mutex); goto out; + } } + } - entrylk_trace_in (this, frame, volume, fd, loc, basename, cmd, type); + switch (cmd) { + case ENTRYLK_LOCK_NB: + nonblock = 1; + /* fall through */ + case ENTRYLK_LOCK: + if (ctx) + pthread_mutex_lock(&ctx->lock); + pthread_mutex_lock(&pinode->mutex); + { + reqlock->pinode = pinode; + + ret = __lock_entrylk(this, pinode, reqlock, nonblock, dom, &now, + pcontend); + if (ret == 0) { + reqlock->frame = NULL; + op_ret = 0; + } else { + op_errno = -ret; + } - reqlock = new_entrylk_lock (pinode, basename, type, dom->domain, frame, - conn_id); - if (!reqlock) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } + if (ctx && (!ret || !nonblock)) + list_add(&reqlock->client_list, &ctx->entrylk_lockers); - switch (cmd) { - case ENTRYLK_LOCK_NB: - nonblock = 1; - /* fall through */ - case ENTRYLK_LOCK: - if (ctx) - pthread_mutex_lock (&ctx->lock); - pthread_mutex_lock (&pinode->mutex); - { - reqlock->pinode = pinode; - - ret = __lock_entrylk (this, pinode, reqlock, nonblock, dom); - if (ret == 0) { - reqlock->frame = NULL; - op_ret = 0; - } else { - op_errno = -ret; - } - - if (ctx && (!ret || !nonblock)) - list_add (&reqlock->client_list, - &ctx->entrylk_lockers); - - if (ret == -EAGAIN && !nonblock) { - /* blocked */ - unwind = 0; - } else { - __pl_entrylk_unref (reqlock); - } + if (ret == -EAGAIN && !nonblock) { + /* blocked */ + unwind = 0; + } else { + __pl_entrylk_unref(reqlock); } - pthread_mutex_unlock (&pinode->mutex); - if (ctx) - pthread_mutex_unlock (&ctx->lock); - break; + + /* For all but the case where a non-blocking lock + * attempt fails, the extra ref taken before the switch + * block must be negated. 
+ */ + if ((ret == -EAGAIN) && (nonblock)) + need_inode_unref = _gf_true; + } + pthread_mutex_unlock(&pinode->mutex); + if (ctx) + pthread_mutex_unlock(&ctx->lock); + break; case ENTRYLK_UNLOCK: - if (ctx) - pthread_mutex_lock (&ctx->lock); - pthread_mutex_lock (&pinode->mutex); - { - unlocked = __unlock_entrylk (dom, reqlock); - if (unlocked) { - list_del_init (&unlocked->client_list); - __pl_entrylk_unref (unlocked); - op_ret = 0; - } else { - op_errno = EINVAL; - } - __pl_entrylk_unref (reqlock); + if (ctx) + pthread_mutex_lock(&ctx->lock); + pthread_mutex_lock(&pinode->mutex); + { + /* Irrespective of whether unlock succeeds or not, + * the extra inode ref that was done before the switch + * block must be negated. Towards this, + * @need_inode_unref flag is set unconditionally here. + */ + need_inode_unref = _gf_true; + unlocked = __unlock_entrylk(dom, reqlock); + if (unlocked) { + list_del_init(&unlocked->client_list); + __pl_entrylk_unref(unlocked); + op_ret = 0; + } else { + op_errno = EINVAL; } - pthread_mutex_unlock (&pinode->mutex); - if (ctx) - pthread_mutex_unlock (&ctx->lock); + __pl_entrylk_unref(reqlock); + } + pthread_mutex_unlock(&pinode->mutex); + if (ctx) + pthread_mutex_unlock(&ctx->lock); - grant_blocked_entry_locks (this, pinode, dom); + grant_blocked_entry_locks(this, pinode, dom, &now, pcontend); - break; + break; default: - gf_log (this->name, GF_LOG_ERROR, - "Unexpected case in entrylk (cmd=%d). Please file" - "a bug report at http://bugs.gluster.com", cmd); - goto out; - } + need_inode_unref = _gf_true; + gf_log(this->name, GF_LOG_ERROR, + "Unexpected case in entrylk (cmd=%d). Please file" + "a bug report at http://bugs.gluster.com", + cmd); + goto out; + } + /* The following (extra) unref corresponds to the ref that + * was done at the time the lock was granted. 
+ */ + if ((cmd == ENTRYLK_UNLOCK) && (op_ret == 0)) + inode_unref(pinode->inode); + out: - pl_update_refkeeper (this, inode); - if (unwind) { - entrylk_trace_out (this, frame, volume, fd, loc, basename, - cmd, type, op_ret, op_errno); -unwind: - STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, NULL); - } else { - entrylk_trace_block (this, frame, volume, fd, loc, basename, - cmd, type); - } + if (need_inode_unref) + inode_unref(pinode->inode); + + if (unwind) { + entrylk_trace_out(this, frame, volume, fd, loc, basename, cmd, type, + op_ret, op_errno); + unwind: + STACK_UNWIND_STRICT(entrylk, frame, op_ret, op_errno, NULL); + } - return 0; + if (pcontend != NULL) { + entrylk_contention_notify(this, pcontend); + } + + return 0; } /** @@ -651,17 +947,16 @@ unwind: */ int -pl_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +pl_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { - pl_common_entrylk (frame, this, volume, loc->inode, basename, cmd, - type, loc, NULL, xdata); + pl_common_entrylk(frame, this, volume, loc->inode, basename, cmd, type, loc, + NULL, xdata); - return 0; + return 0; } - /** * pl_fentrylk: * @@ -669,185 +964,190 @@ pl_entrylk (call_frame_t *frame, xlator_t *this, */ int -pl_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +pl_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { - pl_common_entrylk (frame, this, volume, fd->inode, basename, cmd, - type, NULL, fd, xdata); + pl_common_entrylk(frame, this, volume, fd->inode, basename, cmd, type, NULL, + fd, xdata); - return 0; + return 0; } - static void -pl_entrylk_log_cleanup (pl_entry_lock_t *lock) +pl_entrylk_log_cleanup(pl_entry_lock_t *lock) { - pl_inode_t *pinode = NULL; - char *path = NULL; - char *file = NULL; - - pinode = lock->pinode; - - inode_path (pinode->refkeeper, NULL, &path); + pl_inode_t *pinode = NULL; - if (path) - file = path; - else - file = uuid_utoa (pinode->refkeeper->gfid); + pinode = lock->pinode; - gf_log (THIS->name, GF_LOG_WARNING, - "releasing lock on %s held by " - "{client=%p, pid=%"PRId64" lk-owner=%s}", - file, lock->client, (uint64_t) lock->client_pid, - lkowner_utoa (&lock->owner)); - GF_FREE (path); + gf_log(THIS->name, GF_LOG_WARNING, + "releasing lock on %s held by " + "{client=%p, pid=%" PRId64 " lk-owner=%s}", + uuid_utoa(pinode->gfid), lock->client, (uint64_t)lock->client_pid, + lkowner_utoa(&lock->owner)); } - /* Release all entrylks from this client */ int -pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx) +pl_entrylk_client_cleanup(xlator_t *this, pl_ctx_t *ctx) { - pl_entry_lock_t *tmp = NULL; - pl_entry_lock_t *l = NULL; - pl_dom_list_t *dom = NULL; - pl_inode_t *pinode = NULL; - - struct list_head released; - struct list_head unwind; - - INIT_LIST_HEAD (&released); - INIT_LIST_HEAD (&unwind); - - pthread_mutex_lock (&ctx->lock); + posix_locks_private_t *priv; + pl_entry_lock_t *tmp = NULL; + pl_entry_lock_t *l = NULL; + pl_dom_list_t *dom = NULL; + pl_inode_t *pinode = NULL; + struct list_head *pcontend = NULL; + struct list_head released; + struct list_head unwind; + struct list_head contend; + struct timespec now = {}; + + INIT_LIST_HEAD(&released); 
+ INIT_LIST_HEAD(&unwind); + + priv = this->private; + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD(pcontend); + timespec_now(&now); + } + + pthread_mutex_lock(&ctx->lock); + { + list_for_each_entry_safe(l, tmp, &ctx->entrylk_lockers, client_list) { - list_for_each_entry_safe (l, tmp, &ctx->entrylk_lockers, - client_list) { - list_del_init (&l->client_list); - - pl_entrylk_log_cleanup (l); - - pinode = l->pinode; - - pthread_mutex_lock (&pinode->mutex); - { - /* If the entrylk object is part of granted list but not - * blocked list, then perform the following actions: - * i. delete the object from granted list; - * ii. grant other locks (from other clients) that may - * have been blocked on this entrylk; and - * iii. unref the object. - * - * If the entrylk object (L1) is part of both granted - * and blocked lists, then this means that a parallel - * unlock on another entrylk (L2 say) may have 'granted' - * L1 and added it to 'granted' list in - * __grant_blocked_entry_locks() (although using the - * 'blocked_locks' member). In that case, the cleanup - * codepath must try and grant other overlapping - * blocked entrylks from other clients, now that L1 is - * out of their way and then unref L1 in the end, and - * leave it to the other thread (the one executing - * unlock codepath) to unwind L1's frame, delete it from - * blocked_locks list, and perform the last unref on L1. - * - * If the entrylk object (L1) is part of blocked list - * only, the cleanup code path must: - * i. delete it from the blocked_locks list inside - * this critical section, - * ii. unwind its frame with EAGAIN, - * iii. try and grant blocked entry locks from other - * clients that were otherwise grantable, but were - * blocked to avoid leaving L1 to starve forever. - * iv. unref the object. - */ - if (!list_empty (&l->domain_list)) { - list_del_init (&l->domain_list); - list_add_tail (&l->client_list, - &released); - } else { - list_del_init (&l->blocked_locks); - list_add_tail (&l->client_list, - &unwind); - } - } - pthread_mutex_unlock (&pinode->mutex); + pl_entrylk_log_cleanup(l); + + pinode = l->pinode; + + pthread_mutex_lock(&pinode->mutex); + { + /* If the entrylk object is part of granted list but not + * blocked list, then perform the following actions: + * i. delete the object from granted list; + * ii. grant other locks (from other clients) that may + * have been blocked on this entrylk; and + * iii. unref the object. + * + * If the entrylk object (L1) is part of both granted + * and blocked lists, then this means that a parallel + * unlock on another entrylk (L2 say) may have 'granted' + * L1 and added it to 'granted' list in + * __grant_blocked_entry_locks() (although using the + * 'blocked_locks' member). In that case, the cleanup + * codepath must try and grant other overlapping + * blocked entrylks from other clients, now that L1 is + * out of their way and then unref L1 in the end, and + * leave it to the other thread (the one executing + * unlock codepath) to unwind L1's frame, delete it from + * blocked_locks list, and perform the last unref on L1. + * + * If the entrylk object (L1) is part of blocked list + * only, the cleanup code path must: + * i. delete it from the blocked_locks list inside + * this critical section, + * ii. unwind its frame with EAGAIN, + * iii. try and grant blocked entry locks from other + * clients that were otherwise grantable, but were + * blocked to avoid leaving L1 to starve forever. + * iv. unref the object. 
+ */ + list_del_init(&l->client_list); + + if (!list_empty(&l->domain_list)) { + list_del_init(&l->domain_list); + list_add_tail(&l->client_list, &released); + } else { + list_del_init(&l->blocked_locks); + list_add_tail(&l->client_list, &unwind); } - } - pthread_mutex_unlock (&ctx->lock); + } + pthread_mutex_unlock(&pinode->mutex); + } + } + pthread_mutex_unlock(&ctx->lock); - list_for_each_entry_safe (l, tmp, &unwind, client_list) { - list_del_init (&l->client_list); + if (!list_empty(&unwind)) { + list_for_each_entry_safe(l, tmp, &unwind, client_list) + { + list_del_init(&l->client_list); - if (l->frame) - STACK_UNWIND_STRICT (entrylk, l->frame, -1, EAGAIN, - NULL); - list_add_tail (&l->client_list, &released); + if (l->frame) + STACK_UNWIND_STRICT(entrylk, l->frame, -1, EAGAIN, NULL); + list_add_tail(&l->client_list, &released); } + } + + if (!list_empty(&released)) { + list_for_each_entry_safe(l, tmp, &released, client_list) + { + list_del_init(&l->client_list); - list_for_each_entry_safe (l, tmp, &released, client_list) { - list_del_init (&l->client_list); + pinode = l->pinode; - pinode = l->pinode; + dom = get_domain(pinode, l->volume); - dom = get_domain (pinode, l->volume); + grant_blocked_entry_locks(this, pinode, dom, &now, pcontend); - grant_blocked_entry_locks (this, pinode, dom); + pthread_mutex_lock(&pinode->mutex); + { + __pl_entrylk_unref(l); + } + pthread_mutex_unlock(&pinode->mutex); - pthread_mutex_lock (&pinode->mutex); - { - __pl_entrylk_unref (l); - } - pthread_mutex_unlock (&pinode->mutex); + inode_unref(pinode->inode); } + } - return 0; -} + if (pcontend != NULL) { + entrylk_contention_notify(this, pcontend); + } + return 0; +} int32_t -__get_entrylk_count (xlator_t *this, pl_inode_t *pl_inode) +__get_entrylk_count(xlator_t *this, pl_inode_t *pl_inode) { - int32_t count = 0; - pl_entry_lock_t *lock = NULL; - pl_dom_list_t *dom = NULL; + int32_t count = 0; + pl_entry_lock_t *lock = NULL; + pl_dom_list_t *dom = NULL; - list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { - list_for_each_entry (lock, &dom->entrylk_list, domain_list) { - count++; - } - - list_for_each_entry (lock, &dom->blocked_entrylks, blocked_locks) { - count++; - } + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + list_for_each_entry(lock, &dom->entrylk_list, domain_list) { count++; } + list_for_each_entry(lock, &dom->blocked_entrylks, blocked_locks) + { + count++; } + } - return count; + return count; } int32_t -get_entrylk_count (xlator_t *this, inode_t *inode) +get_entrylk_count(xlator_t *this, inode_t *inode) { - pl_inode_t *pl_inode = NULL; - uint64_t tmp_pl_inode = 0; - int ret = 0; - int32_t count = 0; + pl_inode_t *pl_inode = NULL; + uint64_t tmp_pl_inode = 0; + int ret = 0; + int32_t count = 0; - ret = inode_ctx_get (inode, this, &tmp_pl_inode); - if (ret != 0) { - goto out; - } + ret = inode_ctx_get(inode, this, &tmp_pl_inode); + if (ret != 0) { + goto out; + } - pl_inode = (pl_inode_t *)(long) tmp_pl_inode; + pl_inode = (pl_inode_t *)(long)tmp_pl_inode; - pthread_mutex_lock (&pl_inode->mutex); - { - count = __get_entrylk_count (this, pl_inode); - } - pthread_mutex_unlock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + count = __get_entrylk_count(this, pl_inode); + } + pthread_mutex_unlock(&pl_inode->mutex); out: - return count; + return count; } diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c index c76cb7f9199..d4e51d6e0a1 100644 --- a/xlators/features/locks/src/inodelk.c +++ 
b/xlators/features/locks/src/inodelk.c @@ -7,848 +7,1168 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "compat.h" -#include "xlator.h" -#include "inode.h" -#include "logging.h" -#include "common-utils.h" -#include "list.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include <glusterfs/list.h> +#include <glusterfs/upcall-utils.h> #include "locks.h" +#include "clear.h" #include "common.h" -inline void -__delete_inode_lock (pl_inode_lock_t *lock) +void +__delete_inode_lock(pl_inode_lock_t *lock) { - list_del_init (&lock->list); + list_del_init(&lock->list); } -static inline void -__pl_inodelk_ref (pl_inode_lock_t *lock) +static void +__pl_inodelk_ref(pl_inode_lock_t *lock) { - lock->ref++; + lock->ref++; } -inline void -__pl_inodelk_unref (pl_inode_lock_t *lock) +void +__pl_inodelk_unref(pl_inode_lock_t *lock) { - lock->ref--; - if (!lock->ref) { - GF_FREE (lock->connection_id); - GF_FREE (lock); - } + lock->ref--; + if (!lock->ref) { + GF_FREE(lock->connection_id); + GF_FREE(lock); + } } -/* Check if 2 inodelks are conflicting on type. Only 2 shared locks don't conflict */ -static inline int -inodelk_type_conflict (pl_inode_lock_t *l1, pl_inode_lock_t *l2) +/* Check if 2 inodelks are conflicting on type. Only 2 shared locks don't + * conflict */ +static int +inodelk_type_conflict(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { - if (l2->fl_type == F_WRLCK || l1->fl_type == F_WRLCK) - return 1; + if (l2->fl_type == F_WRLCK || l1->fl_type == F_WRLCK) + return 1; - return 0; + return 0; } void -pl_print_inodelk (char *str, int size, int cmd, struct gf_flock *flock, const char *domain) +pl_print_inodelk(char *str, int size, int cmd, struct gf_flock *flock, + const char *domain) { - char *cmd_str = NULL; - char *type_str = NULL; + char *cmd_str = NULL; + char *type_str = NULL; - switch (cmd) { + switch (cmd) { #if F_GETLK != F_GETLK64 case F_GETLK64: #endif case F_GETLK: - cmd_str = "GETLK"; - break; + cmd_str = "GETLK"; + break; #if F_SETLK != F_SETLK64 case F_SETLK64: #endif case F_SETLK: - cmd_str = "SETLK"; - break; + cmd_str = "SETLK"; + break; #if F_SETLKW != F_SETLKW64 case F_SETLKW64: #endif case F_SETLKW: - cmd_str = "SETLKW"; - break; + cmd_str = "SETLKW"; + break; default: - cmd_str = "UNKNOWN"; - break; - } + cmd_str = "UNKNOWN"; + break; + } - switch (flock->l_type) { + switch (flock->l_type) { case F_RDLCK: - type_str = "READ"; - break; + type_str = "READ"; + break; case F_WRLCK: - type_str = "WRITE"; - break; + type_str = "WRITE"; + break; case F_UNLCK: - type_str = "UNLOCK"; - break; + type_str = "UNLOCK"; + break; default: - type_str = "UNKNOWN"; - break; - } - - snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, " - "domain: %s, start=%llu, len=%llu, pid=%llu", - cmd_str, type_str, domain, - (unsigned long long) flock->l_start, - (unsigned long long) flock->l_len, - (unsigned long long) flock->l_pid); + type_str = "UNKNOWN"; + break; + } + + snprintf(str, size, + "lock=INODELK, cmd=%s, type=%s, " + "domain: %s, start=%llu, len=%llu, pid=%llu", + cmd_str, type_str, domain, (unsigned long long)flock->l_start, + (unsigned long long)flock->l_len, + (unsigned long long)flock->l_pid); } /* Determine if the two inodelks overlap reach other's lock regions */ static int -inodelk_overlap (pl_inode_lock_t *l1, pl_inode_lock_t 
*l2) +inodelk_overlap(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { - return ((l1->fl_end >= l2->fl_start) && - (l2->fl_end >= l1->fl_start)); + return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start)); } /* Returns true if the 2 inodelks have the same owner */ -static inline int -same_inodelk_owner (pl_inode_lock_t *l1, pl_inode_lock_t *l2) +static int +same_inodelk_owner(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { - return (is_same_lkowner (&l1->owner, &l2->owner) && - (l1->client == l2->client)); + return (is_same_lkowner(&l1->owner, &l2->owner) && + (l1->client == l2->client)); } /* Returns true if the 2 inodelks conflict with each other */ static int -inodelk_conflict (pl_inode_lock_t *l1, pl_inode_lock_t *l2) +inodelk_conflict(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { - return (inodelk_overlap (l1, l2) && - inodelk_type_conflict (l1, l2)); + return (inodelk_overlap(l1, l2) && inodelk_type_conflict(l1, l2)); } -/* Determine if lock is grantable or not */ -static pl_inode_lock_t * -__inodelk_grantable (pl_dom_list_t *dom, pl_inode_lock_t *lock) +/* + * Check to see if the candidate lock overlaps/conflicts with the + * requested lock. If so, determine how old the lock is and return + * true if it exceeds the configured threshold, false otherwise. + */ +static inline gf_boolean_t +__stale_inodelk(xlator_t *this, pl_inode_lock_t *candidate_lock, + pl_inode_lock_t *requested_lock, time_t *lock_age_sec) { - pl_inode_lock_t *l = NULL; - pl_inode_lock_t *ret = NULL; - if (list_empty (&dom->inodelk_list)) - goto out; - list_for_each_entry (l, &dom->inodelk_list, list){ - if (inodelk_conflict (lock, l) && - !same_inodelk_owner (lock, l)) { - ret = l; - goto out; - } - } -out: - return ret; + posix_locks_private_t *priv = NULL; + + priv = this->private; + /* Question: Should we just prune them all given the + * chance? Or just the locks we are attempting to acquire? + */ + if (inodelk_conflict(candidate_lock, requested_lock)) { + *lock_age_sec = gf_time() - candidate_lock->granted_time; + if (*lock_age_sec > priv->revocation_secs) + return _gf_true; + } + return _gf_false; } -static pl_inode_lock_t * -__blocked_lock_conflict (pl_dom_list_t *dom, pl_inode_lock_t *lock) +/* Examine any locks held on this inode and potentially revoke the lock + * if the age exceeds revocation_secs. We will clear _only_ those locks + * which are granted, and then grant those locks which are blocked. + * + * Depending on how this patch works in the wild, we may expand this and + * introduce a heuristic which clears blocked locks as well if they + * are beyond a threshold. 
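As a reading aid only (this sketch is not part of the patch), the decision that __inodelk_prune_stale() makes from the two revocation tunables can be summarised as a standalone predicate. Here 'age' stands for the age of a conflicting granted lock and 'blocked' for the number of waiters queued in the domain, values the real function derives by walking the granted and blocked lists; the option fields are the posix_locks_private_t members this patch reads.

#include "locks.h"

/* Illustrative sketch, not part of the patch: the two triggers that
 * __inodelk_prune_stale() evaluates before clearing locks. */
static gf_boolean_t
revocation_wanted(posix_locks_private_t *priv, time_t age, uint32_t blocked)
{
    if (priv->revocation_secs == 0)
        return _gf_false; /* callers skip pruning when the feature is off */

    if (age > priv->revocation_secs)
        return _gf_true; /* a conflicting granted lock has been held too long */

    if ((priv->revocation_max_blocked != 0) &&
        (blocked >= priv->revocation_max_blocked))
        return _gf_true; /* too many blocked locks have piled up behind it */

    return _gf_false;
}

When the predicate is true, the real function hands the work to clrlk_clear_inodelk() with kind CLRLK_GRANTED (or CLRLK_ALL when revocation-clear-all is enabled) and logs the counts of cleared locks.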
+ */ +static gf_boolean_t +__inodelk_prune_stale(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, + pl_inode_lock_t *lock) { - pl_inode_lock_t *l = NULL; - pl_inode_lock_t *ret = NULL; - - if (list_empty (&dom->blocked_inodelks)) - return NULL; + posix_locks_private_t *priv = NULL; + pl_inode_lock_t *tmp = NULL; + pl_inode_lock_t *lk = NULL; + gf_boolean_t revoke_lock = _gf_false; + int bcount = 0; + int gcount = 0; + int op_errno = 0; + clrlk_args args; + args.opts = NULL; + time_t lk_age_sec = 0; + uint32_t max_blocked = 0; + char *reason_str = NULL; + + priv = this->private; + + args.type = CLRLK_INODE; + if (priv->revocation_clear_all == _gf_true) + args.kind = CLRLK_ALL; + else + args.kind = CLRLK_GRANTED; + + if (list_empty(&dom->inodelk_list)) + goto out; + + pthread_mutex_lock(&pinode->mutex); + list_for_each_entry_safe(lk, tmp, &dom->inodelk_list, list) + { + if (__stale_inodelk(this, lk, lock, &lk_age_sec) == _gf_true) { + revoke_lock = _gf_true; + reason_str = "age"; + break; + } + } - list_for_each_entry (l, &dom->blocked_inodelks, blocked_locks) { - if (inodelk_conflict (lock, l)) { - ret = l; - goto out; - } + max_blocked = priv->revocation_max_blocked; + if (max_blocked != 0 && revoke_lock == _gf_false) { + list_for_each_entry_safe(lk, tmp, &dom->blocked_inodelks, blocked_locks) + { + max_blocked--; + if (max_blocked == 0) { + revoke_lock = _gf_true; + reason_str = "max blocked"; + break; + } } + } + pthread_mutex_unlock(&pinode->mutex); out: - return ret; + if (revoke_lock == _gf_true) { + clrlk_clear_inodelk(this, pinode, dom, &args, &bcount, &gcount, + &op_errno); + gf_log(this->name, GF_LOG_WARNING, + "Lock revocation [reason: %s; gfid: %s; domain: %s; " + "age: %ld sec] - Inode lock revoked: %d granted & %d " + "blocked locks cleared", + reason_str, uuid_utoa(pinode->gfid), dom->domain, lk_age_sec, + gcount, bcount); + } + return revoke_lock; } -static int -__owner_has_lock (pl_dom_list_t *dom, pl_inode_lock_t *newlock) +void +inodelk_contention_notify_check(xlator_t *this, pl_inode_lock_t *lock, + struct timespec *now, struct list_head *contend) { - pl_inode_lock_t *lock = NULL; + posix_locks_private_t *priv; + int64_t elapsed; - list_for_each_entry (lock, &dom->inodelk_list, list) { - if (same_inodelk_owner (lock, newlock)) - return 1; - } + priv = this->private; - list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) { - if (same_inodelk_owner (lock, newlock)) - return 1; - } + /* If this lock is in a list, it means that we are about to send a + * notification for it, so no need to do anything else. */ + if (!list_empty(&lock->contend)) { + return; + } + + elapsed = now->tv_sec; + elapsed -= lock->contention_time.tv_sec; + if (now->tv_nsec < lock->contention_time.tv_nsec) { + elapsed--; + } + if (elapsed < priv->notify_contention_delay) { + return; + } - return 0; -} + /* All contention notifications will be sent outside of the locked + * region. This means that currently granted locks might have already + * been unlocked by that time. To avoid the lock or the inode to be + * destroyed before we process them, we take an additional reference + * on both. */ + inode_ref(lock->pl_inode->inode); + __pl_inodelk_ref(lock); + lock->contention_time = *now; -/* Determines if lock can be granted and adds the lock. If the lock - * is blocking, adds it to the blocked_inodelks list of the domain. 
- */ -static int -__lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock, - int can_block, pl_dom_list_t *dom) + list_add_tail(&lock->contend, contend); +} + +void +inodelk_contention_notify(xlator_t *this, struct list_head *contend) { - pl_inode_lock_t *conf = NULL; - int ret = -EINVAL; + struct gf_upcall up; + struct gf_upcall_inodelk_contention lc; + pl_inode_lock_t *lock; + pl_inode_t *pl_inode; + client_t *client; + gf_boolean_t notify; + + while (!list_empty(contend)) { + lock = list_first_entry(contend, pl_inode_lock_t, contend); + + pl_inode = lock->pl_inode; + + pthread_mutex_lock(&pl_inode->mutex); + + /* If the lock has already been released, no notification is + * sent. We clear the notification time in this case. */ + notify = !list_empty(&lock->list); + if (!notify) { + lock->contention_time.tv_sec = 0; + lock->contention_time.tv_nsec = 0; + } else { + memcpy(&lc.flock, &lock->user_flock, sizeof(lc.flock)); + lc.pid = lock->client_pid; + lc.domain = lock->volume; + lc.xdata = NULL; + + gf_uuid_copy(up.gfid, lock->pl_inode->gfid); + client = (client_t *)lock->client; + if (client == NULL) { + /* A NULL client can be found if the inodelk + * was issued by a server side xlator. */ + up.client_uid = NULL; + } else { + up.client_uid = client->client_uid; + } + } - conf = __inodelk_grantable (dom, lock); - if (conf) { - ret = -EAGAIN; - if (can_block == 0) - goto out; + pthread_mutex_unlock(&pl_inode->mutex); + + if (notify) { + up.event_type = GF_UPCALL_INODELK_CONTENTION; + up.data = &lc; + + if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) { + gf_msg_debug(this->name, 0, + "Inodelk contention notification " + "failed"); + } else { + gf_msg_debug(this->name, 0, + "Inodelk contention notification " + "sent"); + } + } - gettimeofday (&lock->blkd_time, NULL); - list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks); + pthread_mutex_lock(&pl_inode->mutex); - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => Blocked", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); + list_del_init(&lock->contend); + __pl_inodelk_unref(lock); + pthread_mutex_unlock(&pl_inode->mutex); - goto out; + inode_unref(pl_inode->inode); + } +} + +/* Determine if lock is grantable or not */ +static pl_inode_lock_t * +__inodelk_grantable(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock, + struct timespec *now, struct list_head *contend) +{ + pl_inode_lock_t *l = NULL; + pl_inode_lock_t *ret = NULL; + + list_for_each_entry(l, &dom->inodelk_list, list) + { + if (inodelk_conflict(lock, l) && !same_inodelk_owner(lock, l)) { + if (ret == NULL) { + ret = l; + if (contend == NULL) { + break; + } + } + inodelk_contention_notify_check(this, l, now, contend); } + } - if (__blocked_lock_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) { - ret = -EAGAIN; - if (can_block == 0) - goto out; + return ret; +} - gettimeofday (&lock->blkd_time, NULL); - list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks); +static pl_inode_lock_t * +__blocked_lock_conflict(pl_dom_list_t *dom, pl_inode_lock_t *lock) +{ + pl_inode_lock_t *l = NULL; - gf_log (this->name, GF_LOG_DEBUG, - "Lock is grantable, but blocking to prevent starvation"); - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => Blocked", - lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); + list_for_each_entry(l, &dom->blocked_inodelks, blocked_locks) + { + if (inodelk_conflict(lock, l)) { + return l; + } + } + return NULL; +} - goto out; - } - __pl_inodelk_ref (lock); - gettimeofday (&lock->granted_time, NULL); - list_add (&lock->list, &dom->inodelk_list); +static int +__owner_has_lock(pl_dom_list_t *dom, pl_inode_lock_t *newlock) +{ + pl_inode_lock_t *lock = NULL; + + list_for_each_entry(lock, &dom->inodelk_list, list) + { + if (same_inodelk_owner(lock, newlock)) + return 1; + } - ret = 0; + list_for_each_entry(lock, &dom->blocked_inodelks, blocked_locks) + { + if (same_inodelk_owner(lock, newlock)) + return 1; + } + return 0; +} + +static int +__lock_blocked_add(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock, + int can_block) +{ + if (can_block == 0) { + goto out; + } + + lock->blkd_time = gf_time(); + list_add_tail(&lock->blocked_locks, &dom->blocked_inodelks); + + gf_msg_trace(this->name, 0, + "%s (pid=%d) (lk-owner=%s) %" PRId64 + " - " + "%" PRId64 " => Blocked", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid, + lkowner_utoa(&lock->owner), lock->user_flock.l_start, + lock->user_flock.l_len); + + pl_trace_block(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock, + lock->volume); out: + return -EAGAIN; +} + +/* Determines if lock can be granted and adds the lock. If the lock + * is blocking, adds it to the blocked_inodelks list of the domain. + */ +static int +__lock_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock, + int can_block, pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend) +{ + pl_inode_lock_t *conf = NULL; + int ret; + + ret = pl_inode_remove_inodelk(pl_inode, lock); + if (ret < 0) { return ret; + } + if (ret == 0) { + conf = __inodelk_grantable(this, dom, lock, now, contend); + } + if ((ret > 0) || (conf != NULL)) { + return __lock_blocked_add(this, dom, lock, can_block); + } + + /* To prevent blocked locks starvation, check if there are any blocked + * locks thay may conflict with this lock. If there is then don't grant + * the lock. BUT grant the lock if the owner already has lock to allow + * nested locks. + * Example: + * SHD from Machine1 takes (gfid, 0-infinity) and is granted. + * SHD from machine2 takes (gfid, 0-infinity) and is blocked. + * When SHD from Machine1 takes (gfid, 0-128KB) it + * needs to be granted, without which the earlier lock on 0-infinity + * will not be unlocked by SHD from Machine1. + * TODO: Find why 'owner_has_lock' is checked even for blocked locks. 
+ */ + if (__blocked_lock_conflict(dom, lock) && !(__owner_has_lock(dom, lock))) { + if (can_block != 0) { + gf_log(this->name, GF_LOG_DEBUG, + "Lock is grantable, but blocking to prevent " + "starvation"); + } + + return __lock_blocked_add(this, dom, lock, can_block); + } + __pl_inodelk_ref(lock); + lock->granted_time = gf_time(); + list_add(&lock->list, &dom->inodelk_list); + + return 0; } /* Return true if the two inodelks have exactly same lock boundaries */ static int -inodelks_equal (pl_inode_lock_t *l1, pl_inode_lock_t *l2) +inodelks_equal(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { - if ((l1->fl_start == l2->fl_start) && - (l1->fl_end == l2->fl_end)) - return 1; + if ((l1->fl_start == l2->fl_start) && (l1->fl_end == l2->fl_end)) + return 1; - return 0; + return 0; } - static pl_inode_lock_t * -find_matching_inodelk (pl_inode_lock_t *lock, pl_dom_list_t *dom) +find_matching_inodelk(pl_inode_lock_t *lock, pl_dom_list_t *dom) { - pl_inode_lock_t *l = NULL; - list_for_each_entry (l, &dom->inodelk_list, list) { - if (inodelks_equal (l, lock) && - same_inodelk_owner (l, lock)) - return l; - } - return NULL; + pl_inode_lock_t *l = NULL; + list_for_each_entry(l, &dom->inodelk_list, list) + { + if (inodelks_equal(l, lock) && same_inodelk_owner(l, lock)) + return l; + } + return NULL; } /* Set F_UNLCK removes a lock which has the exact same lock boundaries * as the UNLCK lock specifies. If such a lock is not found, returns invalid */ static pl_inode_lock_t * -__inode_unlock_lock (xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom) +__inode_unlock_lock(xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom) { - - pl_inode_lock_t *conf = NULL; - - conf = find_matching_inodelk (lock, dom); - if (!conf) { - gf_log (this->name, GF_LOG_ERROR, - " Matching lock not found for unlock %llu-%llu, by %s " - "on %p", (unsigned long long)lock->fl_start, - (unsigned long long)lock->fl_end, - lkowner_utoa (&lock->owner), lock->client); - goto out; - } - __delete_inode_lock (conf); - gf_log (this->name, GF_LOG_DEBUG, - " Matching lock found for unlock %llu-%llu, by %s on %p", - (unsigned long long)lock->fl_start, - (unsigned long long)lock->fl_end, lkowner_utoa (&lock->owner), - lock->client); + pl_inode_lock_t *conf = NULL; + inode_t *inode = NULL; + + inode = lock->pl_inode->inode; + + conf = find_matching_inodelk(lock, dom); + if (!conf) { + gf_log(this->name, GF_LOG_ERROR, + " Matching lock not found for unlock %llu-%llu, by %s " + "on %p for gfid:%s", + (unsigned long long)lock->fl_start, + (unsigned long long)lock->fl_end, lkowner_utoa(&lock->owner), + lock->client, inode ? uuid_utoa(inode->gfid) : "UNKNOWN"); + goto out; + } + __delete_inode_lock(conf); + gf_log(this->name, GF_LOG_DEBUG, + " Matching lock found for unlock %llu-%llu, by %s on %p for gfid:%s", + (unsigned long long)lock->fl_start, (unsigned long long)lock->fl_end, + lkowner_utoa(&lock->owner), lock->client, + inode ? 
uuid_utoa(inode->gfid) : "UNKNOWN"); out: - return conf; + return conf; } - -static void -__grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, - struct list_head *granted, pl_dom_list_t *dom) +void +__grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted, pl_dom_list_t *dom, + struct timespec *now, struct list_head *contend) { - int bl_ret = 0; - pl_inode_lock_t *bl = NULL; - pl_inode_lock_t *tmp = NULL; - - struct list_head blocked_list; + pl_inode_lock_t *bl = NULL; + pl_inode_lock_t *tmp = NULL; - INIT_LIST_HEAD (&blocked_list); - list_splice_init (&dom->blocked_inodelks, &blocked_list); + struct list_head blocked_list; - list_for_each_entry_safe (bl, tmp, &blocked_list, blocked_locks) { + INIT_LIST_HEAD(&blocked_list); + list_splice_init(&dom->blocked_inodelks, &blocked_list); - list_del_init (&bl->blocked_locks); + list_for_each_entry_safe(bl, tmp, &blocked_list, blocked_locks) + { + list_del_init(&bl->blocked_locks); - bl_ret = __lock_inodelk (this, pl_inode, bl, 1, dom); + bl->status = __lock_inodelk(this, pl_inode, bl, 1, dom, now, contend); - if (bl_ret == 0) { - list_add (&bl->blocked_locks, granted); - } + if (bl->status != -EAGAIN) { + list_add_tail(&bl->blocked_locks, granted); } - return; + } } -/* Grant all inodelks blocked on a lock */ void -grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_dom_list_t *dom) +unwind_granted_inodes(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted) { - struct list_head granted; - pl_inode_lock_t *lock; - pl_inode_lock_t *tmp; - - INIT_LIST_HEAD (&granted); - - pthread_mutex_lock (&pl_inode->mutex); - { - __grant_blocked_inode_locks (this, pl_inode, &granted, dom); - } - pthread_mutex_unlock (&pl_inode->mutex); - - list_for_each_entry_safe (lock, tmp, &granted, blocked_locks) { - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => Granted", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - - pl_trace_out (this, lock->frame, NULL, NULL, F_SETLKW, - &lock->user_flock, 0, 0, lock->volume); - - STACK_UNWIND_STRICT (inodelk, lock->frame, 0, 0, NULL); - lock->frame = NULL; + pl_inode_lock_t *lock; + pl_inode_lock_t *tmp; + int32_t op_ret; + int32_t op_errno; + + list_for_each_entry_safe(lock, tmp, granted, blocked_locks) + { + if (lock->status == 0) { + op_ret = 0; + op_errno = 0; + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 + " => Granted", + lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", + lock->client_pid, lkowner_utoa(&lock->owner), + lock->user_flock.l_start, lock->user_flock.l_len); + } else { + op_ret = -1; + op_errno = -lock->status; } + pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock, + op_ret, op_errno, lock->volume); + + STACK_UNWIND_STRICT(inodelk, lock->frame, op_ret, op_errno, NULL); + lock->frame = NULL; + } - pthread_mutex_lock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(lock, tmp, granted, blocked_locks) { - list_for_each_entry_safe (lock, tmp, &granted, blocked_locks) { - list_del_init (&lock->blocked_locks); - __pl_inodelk_unref (lock); - } + list_del_init(&lock->blocked_locks); + __pl_inodelk_unref(lock); } - pthread_mutex_unlock (&pl_inode->mutex); + } + pthread_mutex_unlock(&pl_inode->mutex); } - -static void -pl_inodelk_log_cleanup (pl_inode_lock_t *lock) +/* Grant all inodelks blocked on a lock */ +void +grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend) { - pl_inode_t *pl_inode = NULL; - char *path = NULL; - char *file = NULL; - - pl_inode = lock->pl_inode; + struct list_head granted; - inode_path (pl_inode->refkeeper, NULL, &path); + INIT_LIST_HEAD(&granted); - if (path) - file = path; - else - file = uuid_utoa (pl_inode->refkeeper->gfid); + pthread_mutex_lock(&pl_inode->mutex); + { + __grant_blocked_inode_locks(this, pl_inode, &granted, dom, now, + contend); + } + pthread_mutex_unlock(&pl_inode->mutex); - gf_log (THIS->name, GF_LOG_WARNING, - "releasing lock on %s held by " - "{client=%p, pid=%"PRId64" lk-owner=%s}", - file, lock->client, (uint64_t) lock->client_pid, - lkowner_utoa (&lock->owner)); - GF_FREE (path); + unwind_granted_inodes(this, pl_inode, &granted); } - -/* Release all inodelks from this client */ -int -pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx) +static void +pl_inodelk_log_cleanup(pl_inode_lock_t *lock) { - pl_inode_lock_t *tmp = NULL; - pl_inode_lock_t *l = NULL; - pl_dom_list_t *dom = NULL; - pl_inode_t *pl_inode = NULL; + pl_inode_t *pl_inode = NULL; - struct list_head released; - struct list_head unwind; + pl_inode = lock->pl_inode; - INIT_LIST_HEAD (&released); - INIT_LIST_HEAD (&unwind); + gf_log(THIS->name, GF_LOG_WARNING, + "releasing lock on %s held by " + "{client=%p, pid=%" PRId64 " lk-owner=%s}", + uuid_utoa(pl_inode->gfid), lock->client, (uint64_t)lock->client_pid, + lkowner_utoa(&lock->owner)); +} - pthread_mutex_lock (&ctx->lock); +/* Release all inodelks from this client */ +int +pl_inodelk_client_cleanup(xlator_t *this, pl_ctx_t *ctx) +{ + posix_locks_private_t *priv; + pl_inode_lock_t *tmp = NULL; + pl_inode_lock_t *l = NULL; + pl_dom_list_t *dom = NULL; + pl_inode_t *pl_inode = NULL; + struct list_head *pcontend = NULL; + struct list_head released; + struct list_head unwind; + struct list_head contend; + struct timespec now = {}; + + priv = this->private; + + INIT_LIST_HEAD(&released); + INIT_LIST_HEAD(&unwind); + + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD(pcontend); + timespec_now(&now); + } + + pthread_mutex_lock(&ctx->lock); + { + list_for_each_entry_safe(l, tmp, &ctx->inodelk_lockers, client_list) { - list_for_each_entry_safe (l, tmp, &ctx->inodelk_lockers, - client_list) { - list_del_init (&l->client_list); - - pl_inodelk_log_cleanup (l); - - pl_inode = l->pl_inode; - - pthread_mutex_lock (&pl_inode->mutex); - { - /* If the inodelk object is part of granted list but not - * blocked list, 
then perform the following actions: - * i. delete the object from granted list; - * ii. grant other locks (from other clients) that may - * have been blocked on this inodelk; and - * iii. unref the object. - * - * If the inodelk object (L1) is part of both granted - * and blocked lists, then this means that a parallel - * unlock on another inodelk (L2 say) may have 'granted' - * L1 and added it to 'granted' list in - * __grant_blocked_node_locks() (although using the - * 'blocked_locks' member). In that case, the cleanup - * codepath must try and grant other overlapping - * blocked inodelks from other clients, now that L1 is - * out of their way and then unref L1 in the end, and - * leave it to the other thread (the one executing - * unlock codepath) to unwind L1's frame, delete it from - * blocked_locks list, and perform the last unref on L1. - * - * If the inodelk object (L1) is part of blocked list - * only, the cleanup code path must: - * i. delete it from the blocked_locks list inside - * this critical section, - * ii. unwind its frame with EAGAIN, - * iii. try and grant blocked inode locks from other - * clients that were otherwise grantable, but just - * got blocked to avoid leaving L1 to starve - * forever. - * iv. unref the object. - */ - if (!list_empty (&l->list)) { - __delete_inode_lock (l); - list_add_tail (&l->client_list, - &released); - } else { - list_del_init(&l->blocked_locks); - list_add_tail (&l->client_list, - &unwind); - } - } - pthread_mutex_unlock (&pl_inode->mutex); + pl_inodelk_log_cleanup(l); + + pl_inode = l->pl_inode; + + pthread_mutex_lock(&pl_inode->mutex); + { + /* If the inodelk object is part of granted list but not + * blocked list, then perform the following actions: + * i. delete the object from granted list; + * ii. grant other locks (from other clients) that may + * have been blocked on this inodelk; and + * iii. unref the object. + * + * If the inodelk object (L1) is part of both granted + * and blocked lists, then this means that a parallel + * unlock on another inodelk (L2 say) may have 'granted' + * L1 and added it to 'granted' list in + * __grant_blocked_inode_locks() (although using the + * 'blocked_locks' member). In that case, the cleanup + * codepath must try and grant other overlapping + * blocked inodelks from other clients, now that L1 is + * out of their way and then unref L1 in the end, and + * leave it to the other thread (the one executing + * unlock codepath) to unwind L1's frame, delete it from + * blocked_locks list, and perform the last unref on L1. + * + * If the inodelk object (L1) is part of blocked list + * only, the cleanup code path must: + * i. delete it from the blocked_locks list inside + * this critical section, + * ii. unwind its frame with EAGAIN, + * iii. try and grant blocked inode locks from other + * clients that were otherwise grantable, but just + * got blocked to avoid leaving L1 to starve + * forever. + * iv. unref the object. 
+ */ + list_del_init(&l->client_list); + + if (!list_empty(&l->list)) { + __delete_inode_lock(l); + list_add_tail(&l->client_list, &released); + } else { + list_del_init(&l->blocked_locks); + list_add_tail(&l->client_list, &unwind); } - } - pthread_mutex_unlock (&ctx->lock); - - list_for_each_entry_safe (l, tmp, &unwind, client_list) { - list_del_init (&l->client_list); + } + pthread_mutex_unlock(&pl_inode->mutex); + } + } + pthread_mutex_unlock(&ctx->lock); - if (l->frame) - STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN, - NULL); - list_add_tail (&l->client_list, &released); + if (!list_empty(&unwind)) { + list_for_each_entry_safe(l, tmp, &unwind, client_list) + { + list_del_init(&l->client_list); + if (l->frame) + STACK_UNWIND_STRICT(inodelk, l->frame, -1, EAGAIN, NULL); + list_add_tail(&l->client_list, &released); } + } - list_for_each_entry_safe (l, tmp, &released, client_list) { - list_del_init (&l->client_list); + if (!list_empty(&released)) { + list_for_each_entry_safe(l, tmp, &released, client_list) + { + list_del_init(&l->client_list); - pl_inode = l->pl_inode; + pl_inode = l->pl_inode; - dom = get_domain (pl_inode, l->volume); + dom = get_domain(pl_inode, l->volume); - grant_blocked_inode_locks (this, pl_inode, dom); + grant_blocked_inode_locks(this, pl_inode, dom, &now, pcontend); - pthread_mutex_lock (&pl_inode->mutex); - { - __pl_inodelk_unref (l); - } - pthread_mutex_unlock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + __pl_inodelk_unref(l); + } + pthread_mutex_unlock(&pl_inode->mutex); + inode_unref(pl_inode->inode); } + } - return 0; -} + if (pcontend != NULL) { + inodelk_contention_notify(this, pcontend); + } + return 0; +} static int -pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, - pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom) +pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, + pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom, + inode_t *inode) { - int ret = -EINVAL; - pl_inode_lock_t *retlock = NULL; - gf_boolean_t unref = _gf_true; - - lock->pl_inode = pl_inode; - - if (ctx) - pthread_mutex_lock (&ctx->lock); - pthread_mutex_lock (&pl_inode->mutex); - { - if (lock->fl_type != F_UNLCK) { - ret = __lock_inodelk (this, pl_inode, lock, can_block, dom); - if (ret == 0) { - lock->frame = NULL; - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => OK", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->fl_start, - lock->fl_end); - } else if (ret == -EAGAIN) { - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => NOK", - lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - if (can_block) - unref = _gf_false; - } - - if (ctx && (!ret || can_block)) - list_add_tail (&lock->client_list, - &ctx->inodelk_lockers); - } else { - retlock = __inode_unlock_lock (this, lock, dom); - if (!retlock) { - gf_log (this->name, GF_LOG_DEBUG, - "Bad Unlock issued on Inode lock"); - ret = -EINVAL; - goto out; - } - list_del_init (&retlock->client_list); - __pl_inodelk_unref (retlock); - - ret = 0; + posix_locks_private_t *priv = NULL; + int ret = -EINVAL; + pl_inode_lock_t *retlock = NULL; + gf_boolean_t unref = _gf_true; + gf_boolean_t need_inode_unref = _gf_false; + struct list_head *pcontend = NULL; + struct list_head contend; + struct list_head wake; + struct timespec now = {}; + short fl_type; + + lock->pl_inode = pl_inode; + fl_type = lock->fl_type; + + priv = this->private; + + /* Ideally, AFTER a successful lock (both blocking and non-blocking) or + * an unsuccessful blocking lock operation, the inode needs to be ref'd. + * + * But doing so might give room to a race where the lock-requesting + * client could send a DISCONNECT just before this thread refs the inode + * after the locking is done, and the epoll thread could unref the inode + * in cleanup which means the inode's refcount would come down to 0, and + * the call to pl_forget() at this point destroys @pl_inode. Now when + * the io-thread executing this function tries to access pl_inode, + * it could crash on account of illegal memory access. + * + * To get around this problem, the inode is ref'd once even before + * adding the lock into client_list as a precautionary measure. + * This way even if there are DISCONNECTs, there will always be 1 extra + * ref on the inode, so @pl_inode is still alive until after the + * current stack unwinds. + */ + pl_inode->inode = inode_ref(inode); + + if (priv->revocation_secs != 0) { + if (lock->fl_type != F_UNLCK) { + __inodelk_prune_stale(this, pl_inode, dom, lock); + } else if (priv->monkey_unlocking == _gf_true) { + if (pl_does_monkey_want_stuck_lock()) { + pthread_mutex_lock(&pl_inode->mutex); + { + __pl_inodelk_unref(lock); } -out: - if (unref) - __pl_inodelk_unref (lock); + pthread_mutex_unlock(&pl_inode->mutex); + inode_unref(pl_inode->inode); + gf_log(this->name, GF_LOG_WARNING, + "MONKEY LOCKING (forcing stuck lock)!"); + return 0; + } } - pthread_mutex_unlock (&pl_inode->mutex); - if (ctx) - pthread_mutex_unlock (&ctx->lock); + } + + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD(pcontend); + timespec_now(&now); + } + + INIT_LIST_HEAD(&wake); + + if (ctx) + pthread_mutex_lock(&ctx->lock); + pthread_mutex_lock(&pl_inode->mutex); + { + if (lock->fl_type != F_UNLCK) { + ret = __lock_inodelk(this, pl_inode, lock, can_block, dom, &now, + pcontend); + if (ret == 0) { + lock->frame = NULL; + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 + " => OK", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", + lock->client_pid, lkowner_utoa(&lock->owner), + lock->fl_start, lock->fl_end); + } else if (ret == -EAGAIN) { + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 + " => NOK", + lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", + lock->client_pid, lkowner_utoa(&lock->owner), + lock->user_flock.l_start, lock->user_flock.l_len); + if (can_block) { + unref = _gf_false; + } + } + /* For all but the case where a non-blocking lock attempt fails + * with -EAGAIN, the extra ref taken at the start of this function + * must be negated. */ + need_inode_unref = (ret != 0) && ((ret != -EAGAIN) || !can_block); + if (ctx && !need_inode_unref) { + list_add_tail(&lock->client_list, &ctx->inodelk_lockers); + } + } else { + /* Irrespective of whether unlock succeeds or not, + * the extra inode ref that was done at the start of + * this function must be negated. Towards this, + * @need_inode_unref flag is set unconditionally here. + */ + need_inode_unref = _gf_true; + retlock = __inode_unlock_lock(this, lock, dom); + if (!retlock) { + gf_log(this->name, GF_LOG_DEBUG, + "Bad Unlock issued on Inode lock"); + ret = -EINVAL; + goto out; + } + list_del_init(&retlock->client_list); + __pl_inodelk_unref(retlock); - grant_blocked_inode_locks (this, pl_inode, dom); + pl_inode_remove_unlocked(this, pl_inode, &wake); - return ret; + ret = 0; + } + out: + if (unref) + __pl_inodelk_unref(lock); + } + pthread_mutex_unlock(&pl_inode->mutex); + if (ctx) + pthread_mutex_unlock(&ctx->lock); + + pl_inode_remove_wake(&wake); + + /* The following (extra) unref corresponds to the ref that + * was done at the time the lock was granted. + */ + if ((fl_type == F_UNLCK) && (ret == 0)) { + inode_unref(pl_inode->inode); + grant_blocked_inode_locks(this, pl_inode, dom, &now, pcontend); + } + + if (need_inode_unref) { + inode_unref(pl_inode->inode); + } + + if (pcontend != NULL) { + inodelk_contention_notify(this, pcontend); + } + + return ret; } /* Create a new inode_lock_t */ -pl_inode_lock_t * -new_inode_lock (struct gf_flock *flock, client_t *client, pid_t client_pid, - call_frame_t *frame, xlator_t *this, const char *volume, - char *conn_id) +static pl_inode_lock_t * +new_inode_lock(struct gf_flock *flock, client_t *client, pid_t client_pid, + call_frame_t *frame, xlator_t *this, const char *volume, + char *conn_id, int32_t *op_errno) { - pl_inode_lock_t *lock = NULL; + pl_inode_lock_t *lock = NULL; + + if (!pl_is_lk_owner_valid(&frame->root->lk_owner, frame->root->client)) { + *op_errno = EINVAL; + goto out; + } + + lock = GF_CALLOC(1, sizeof(*lock), gf_locks_mt_pl_inode_lock_t); + if (!lock) { + *op_errno = ENOMEM; + goto out; + } + + lock->fl_start = flock->l_start; + lock->fl_type = flock->l_type; + + if (flock->l_len == 0) + lock->fl_end = LLONG_MAX; + else + lock->fl_end = flock->l_start + flock->l_len - 1; + + lock->client = client; + lock->client_pid = client_pid; + lock->volume = volume; + lock->owner = frame->root->lk_owner; + lock->frame = frame; + lock->this = this; + + if (conn_id) { + lock->connection_id = gf_strdup(conn_id); + } + + INIT_LIST_HEAD(&lock->list); + INIT_LIST_HEAD(&lock->blocked_locks); + INIT_LIST_HEAD(&lock->client_list); + INIT_LIST_HEAD(&lock->contend); + __pl_inodelk_ref(lock); - lock = GF_CALLOC (1, sizeof (*lock), - gf_locks_mt_pl_inode_lock_t); - if (!lock) { - return NULL; - } - - lock->fl_start = flock->l_start; - lock->fl_type = flock->l_type; - - if (flock->l_len == 0) - lock->fl_end = LLONG_MAX; - else - lock->fl_end = flock->l_start + flock->l_len - 1; - - lock->client = client; - lock->client_pid = client_pid; - lock->volume = volume; - lock->owner = frame->root->lk_owner; - lock->frame = frame; - lock->this = this; - - if (conn_id) { - lock->connection_id = gf_strdup (conn_id); - } - - 
INIT_LIST_HEAD (&lock->list); - INIT_LIST_HEAD (&lock->blocked_locks); - INIT_LIST_HEAD (&lock->client_list); - __pl_inodelk_ref (lock); - - return lock; +out: + return lock; } int32_t -_pl_convert_volume (const char *volume, char **res) +_pl_convert_volume(const char *volume, char **res) { - char *mdata_vol = NULL; - int ret = 0; + char *mdata_vol = NULL; + int ret = 0; - mdata_vol = strrchr (volume, ':'); - //if the volume already ends with :metadata don't bother - if (mdata_vol && (strcmp (mdata_vol, ":metadata") == 0)) - return 0; - - ret = gf_asprintf (res, "%s:metadata", volume); - if (ret <= 0) - return ENOMEM; + mdata_vol = strrchr(volume, ':'); + // if the volume already ends with :metadata don't bother + if (mdata_vol && (strcmp(mdata_vol, ":metadata") == 0)) return 0; + + ret = gf_asprintf(res, "%s:metadata", volume); + if (ret <= 0) + return ENOMEM; + return 0; } int32_t -_pl_convert_volume_for_special_range (struct gf_flock *flock, - const char *volume, char **res) +_pl_convert_volume_for_special_range(struct gf_flock *flock, const char *volume, + char **res) { - int32_t ret = 0; + int32_t ret = 0; - if ((flock->l_start == LLONG_MAX -1) && - (flock->l_len == 0)) { - ret = _pl_convert_volume (volume, res); - } + if ((flock->l_start == LLONG_MAX - 1) && (flock->l_len == 0)) { + ret = _pl_convert_volume(volume, res); + } - return ret; + return ret; } /* Common inodelk code called from pl_inodelk and pl_finodelk */ int -pl_common_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, inode_t *inode, int32_t cmd, - struct gf_flock *flock, loc_t *loc, fd_t *fd, dict_t *xdata) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int ret = -1; - GF_UNUSED int dict_ret = -1; - int can_block = 0; - pl_inode_t * pinode = NULL; - pl_inode_lock_t * reqlock = NULL; - pl_dom_list_t * dom = NULL; - char *res = NULL; - char *res1 = NULL; - char *conn_id = NULL; - pl_ctx_t *ctx = NULL; - - if (xdata) - dict_ret = dict_get_str (xdata, "connection-id", &conn_id); - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (inode, unwind); - VALIDATE_OR_GOTO (flock, unwind); - - if ((flock->l_start < 0) || (flock->l_len < 0)) { - op_errno = EINVAL; - goto unwind; - } - - op_errno = _pl_convert_volume_for_special_range (flock, volume, &res); - if (op_errno) - goto unwind; - if (res) - volume = res; - - pl_trace_in (this, frame, fd, loc, cmd, flock, volume); - - if (frame->root->client) { - ctx = pl_ctx_get (frame->root->client, this); - if (!ctx) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed"); - goto unwind; - } - } - - pinode = pl_inode_get (this, inode); - if (!pinode) { - op_errno = ENOMEM; - goto unwind; +pl_common_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, + inode_t *inode, int32_t cmd, struct gf_flock *flock, + loc_t *loc, fd_t *fd, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int ret = -1; + GF_UNUSED int dict_ret = -1; + int can_block = 0; + short lock_type = 0; + pl_inode_t *pinode = NULL; + pl_inode_lock_t *reqlock = NULL; + pl_dom_list_t *dom = NULL; + char *res = NULL; + char *res1 = NULL; + char *conn_id = NULL; + pl_ctx_t *ctx = NULL; + + if (xdata) + dict_ret = dict_get_str(xdata, "connection-id", &conn_id); + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(inode, unwind); + VALIDATE_OR_GOTO(flock, unwind); + + if ((flock->l_start < 0) || (flock->l_len < 0)) { + op_errno = EINVAL; + goto unwind; + } + + op_errno = _pl_convert_volume_for_special_range(flock, volume, &res); + if (op_errno) + goto 
unwind; + if (res) + volume = res; + + pl_trace_in(this, frame, fd, loc, cmd, flock, volume); + + if (frame->root->client) { + ctx = pl_ctx_get(frame->root->client, this); + if (!ctx) { + op_errno = ENOMEM; + gf_log(this->name, GF_LOG_INFO, "pl_ctx_get() failed"); + goto unwind; } + } - dom = get_domain (pinode, volume); - if (!dom) { - op_errno = ENOMEM; - goto unwind; - } + pinode = pl_inode_get(this, inode, NULL); + if (!pinode) { + op_errno = ENOMEM; + goto unwind; + } - reqlock = new_inode_lock (flock, frame->root->client, frame->root->pid, - frame, this, volume, conn_id); + dom = get_domain(pinode, volume); + if (!dom) { + op_errno = ENOMEM; + goto unwind; + } - if (!reqlock) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } + reqlock = new_inode_lock(flock, frame->root->client, frame->root->pid, + frame, this, dom->domain, conn_id, &op_errno); + if (!reqlock) { + op_ret = -1; + goto unwind; + } - switch (cmd) { + switch (cmd) { case F_SETLKW: - can_block = 1; + can_block = 1; - /* fall through */ + /* fall through */ case F_SETLK: - memcpy (&reqlock->user_flock, flock, sizeof (struct gf_flock)); - ret = pl_inode_setlk (this, ctx, pinode, reqlock, can_block, - dom); - - if (ret < 0) { - if ((can_block) && (F_UNLCK != flock->l_type)) { - pl_trace_block (this, frame, fd, loc, - cmd, flock, volume); - goto out; - } - gf_log (this->name, GF_LOG_TRACE, "returning EAGAIN"); - op_errno = -ret; - goto unwind; + lock_type = flock->l_type; + memcpy(&reqlock->user_flock, flock, sizeof(struct gf_flock)); + ret = pl_inode_setlk(this, ctx, pinode, reqlock, can_block, dom, + inode); + + if (ret < 0) { + if (ret == -EAGAIN) { + if (can_block && (F_UNLCK != lock_type)) { + goto out; + } + gf_log(this->name, GF_LOG_TRACE, "returning EAGAIN"); + } else { + gf_log(this->name, GF_LOG_TRACE, "returning %d", ret); } - break; + op_errno = -ret; + goto unwind; + } + break; default: - op_errno = ENOTSUP; - gf_log (this->name, GF_LOG_DEBUG, - "Lock command F_GETLK not supported for [f]inodelk " - "(cmd=%d)", - cmd); - goto unwind; - } + op_errno = ENOTSUP; + gf_log(this->name, GF_LOG_DEBUG, + "Lock command F_GETLK not supported for [f]inodelk " + "(cmd=%d)", + cmd); + goto unwind; + } - op_ret = 0; + op_ret = 0; unwind: - if ((inode != NULL) && (flock !=NULL)) { - pl_update_refkeeper (this, inode); - pl_trace_out (this, frame, fd, loc, cmd, flock, op_ret, op_errno, volume); - } + if (flock != NULL) + pl_trace_out(this, frame, fd, loc, cmd, flock, op_ret, op_errno, + volume); - STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, NULL); + STACK_UNWIND_STRICT(inodelk, frame, op_ret, op_errno, NULL); out: - GF_FREE (res); - GF_FREE (res1); - return 0; + GF_FREE(res); + GF_FREE(res1); + return 0; } int -pl_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, - dict_t *xdata) +pl_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - pl_common_inodelk (frame, this, volume, loc->inode, cmd, flock, - loc, NULL, xdata); + pl_common_inodelk(frame, this, volume, loc->inode, cmd, flock, loc, NULL, + xdata); - return 0; + return 0; } int -pl_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, - dict_t *xdata) +pl_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - pl_common_inodelk (frame, this, volume, fd->inode, cmd, flock, - 
NULL, fd, xdata); - - return 0; + pl_common_inodelk(frame, this, volume, fd->inode, cmd, flock, NULL, fd, + xdata); + return 0; } -static inline int32_t -__get_inodelk_dom_count (pl_dom_list_t *dom) +static int32_t +__get_inodelk_dom_count(pl_dom_list_t *dom) { - pl_inode_lock_t *lock = NULL; - int32_t count = 0; - - list_for_each_entry (lock, &dom->inodelk_list, list) { - count++; - } - list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) { - count++; - } - return count; + pl_inode_lock_t *lock = NULL; + int32_t count = 0; + + list_for_each_entry(lock, &dom->inodelk_list, list) { count++; } + list_for_each_entry(lock, &dom->blocked_inodelks, blocked_locks) + { + count++; + } + return count; } /* Returns the no. of locks (blocked/granted) held on a given domain name * If @domname is NULL, returns the no. of locks in all the domains present. * If @domname is non-NULL and non-existent, returns 0 */ int32_t -__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode, char *domname) +__get_inodelk_count(xlator_t *this, pl_inode_t *pl_inode, char *domname) { - int32_t count = 0; - pl_dom_list_t *dom = NULL; - - list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { - if (domname) { - if (strcmp (domname, dom->domain) == 0) { - count = __get_inodelk_dom_count (dom); - goto out; - } - - } else { - /* Counting locks from all domains */ - count += __get_inodelk_dom_count (dom); + int32_t count = 0; + pl_dom_list_t *dom = NULL; + + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + if (domname) { + if (strcmp(domname, dom->domain) == 0) { + count = __get_inodelk_dom_count(dom); + goto out; + } - } + } else { + /* Counting locks from all domains */ + count += __get_inodelk_dom_count(dom); } + } out: - return count; + return count; } int32_t -get_inodelk_count (xlator_t *this, inode_t *inode, char *domname) +get_inodelk_count(xlator_t *this, inode_t *inode, char *domname) { - pl_inode_t *pl_inode = NULL; - uint64_t tmp_pl_inode = 0; - int ret = 0; - int32_t count = 0; + pl_inode_t *pl_inode = NULL; + uint64_t tmp_pl_inode = 0; + int ret = 0; + int32_t count = 0; - ret = inode_ctx_get (inode, this, &tmp_pl_inode); - if (ret != 0) { - goto out; - } + ret = inode_ctx_get(inode, this, &tmp_pl_inode); + if (ret != 0) { + goto out; + } - pl_inode = (pl_inode_t *)(long) tmp_pl_inode; + pl_inode = (pl_inode_t *)(long)tmp_pl_inode; - pthread_mutex_lock (&pl_inode->mutex); - { - count = __get_inodelk_count (this, pl_inode, domname); - } - pthread_mutex_unlock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + count = __get_inodelk_count(this, pl_inode, domname); + } + pthread_mutex_unlock(&pl_inode->mutex); out: - return count; + return count; } diff --git a/xlators/features/locks/src/locks-mem-types.h b/xlators/features/locks/src/locks-mem-types.h index 08aeb0a7925..a76605027b3 100644 --- a/xlators/features/locks/src/locks-mem-types.h +++ b/xlators/features/locks/src/locks-mem-types.h @@ -11,19 +11,18 @@ #ifndef __LOCKS_MEM_TYPES_H__ #define __LOCKS_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_locks_mem_types_ { - gf_locks_mt_pl_dom_list_t = gf_common_mt_end + 1, - gf_locks_mt_pl_inode_t, - gf_locks_mt_posix_lock_t, - gf_locks_mt_pl_entry_lock_t, - gf_locks_mt_pl_inode_lock_t, - gf_locks_mt_truncate_ops, - gf_locks_mt_pl_rw_req_t, - gf_locks_mt_posix_locks_private_t, - gf_locks_mt_pl_fdctx_t, - gf_locks_mt_end + gf_locks_mt_pl_dom_list_t = gf_common_mt_end + 1, + gf_locks_mt_pl_inode_t, + gf_locks_mt_posix_lock_t, + 
gf_locks_mt_pl_entry_lock_t, + gf_locks_mt_pl_inode_lock_t, + gf_locks_mt_pl_rw_req_t, + gf_locks_mt_posix_locks_private_t, + gf_locks_mt_pl_fdctx_t, + gf_locks_mt_pl_meta_lock_t, + gf_locks_mt_end }; #endif - diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h index 8c2a6f867ee..c868eb494a2 100644 --- a/xlators/features/locks/src/locks.h +++ b/xlators/features/locks/src/locks.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2006-2012, 2015-2016 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -10,195 +10,283 @@ #ifndef __POSIX_LOCKS_H__ #define __POSIX_LOCKS_H__ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "compat-errno.h" -#include "stack.h" -#include "call-stub.h" +#include <glusterfs/compat-errno.h> +#include <glusterfs/stack.h> +#include <glusterfs/call-stub.h> #include "locks-mem-types.h" -#include "client_t.h" +#include <glusterfs/client_t.h> + +#include <glusterfs/lkowner.h> -#include "lkowner.h" +typedef enum { + MLK_NONE, + MLK_FILE_BASED, + MLK_FORCED, + MLK_OPTIMAL +} mlk_mode_t; /* defines different mandatory locking modes*/ struct __pl_fd; struct __posix_lock { - struct list_head list; + struct list_head list; + + off_t fl_start; + off_t fl_end; + uint32_t lk_flags; + + short fl_type; + short blocked; /* waiting to acquire */ + struct gf_flock user_flock; /* the flock supplied by the user */ + xlator_t *this; /* required for blocked locks */ + unsigned long fd_num; - short fl_type; - off_t fl_start; - off_t fl_end; + fd_t *fd; + call_frame_t *frame; - short blocked; /* waiting to acquire */ - struct gf_flock user_flock; /* the flock supplied by the user */ - xlator_t *this; /* required for blocked locks */ - unsigned long fd_num; + time_t blkd_time; /* time at which lock was queued into blkd list */ + time_t granted_time; /* time at which lock was queued into active list */ - fd_t *fd; - call_frame_t *frame; + /* These two together serve to uniquely identify each process + across nodes */ - struct timeval blkd_time; /*time at which lock was queued into blkd list*/ - struct timeval granted_time; /*time at which lock was queued into active list*/ + void *client; /* to identify client node */ - /* These two together serve to uniquely identify each process - across nodes */ + /* This field uniquely identifies the client the lock belongs to. As + * lock migration is handled by rebalance, the client_t object will be + * overwritten by rebalance and can't be deemed as the owner of the + * lock on destination. Hence, the below field is migrated from + * source to destination by lock_migration_info_t and updated on the + * destination. So that on client-server disconnection, server can + * cleanup the locks proper;y. 
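 *
 * Editor's note (illustrative only, not part of this change): with the
 * uid stored per lock, a disconnect-time sweep can match locks by string
 * comparison instead of by the possibly-stale client_t pointer. A minimal
 * sketch, assuming client_t's client_uid member and a hypothetical sweep
 * over pl_inode->ext_list:
 *
 *     posix_lock_t *l, *tmp;
 *     list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) {
 *         if (l->client_uid &&
 *             strcmp (l->client_uid, client->client_uid) == 0) {
 *             __delete_lock (l);
 *             __destroy_lock (l);
 *         }
 *     }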
*/ - void *client; /* to identify client node */ - gf_lkowner_t owner; - pid_t client_pid; /* pid of client process */ + char *client_uid; + gf_lkowner_t owner; + pid_t client_pid; /* pid of client process */ + + int blocking; }; typedef struct __posix_lock posix_lock_t; struct __pl_inode_lock { - struct list_head list; - struct list_head blocked_locks; /* list_head pointing to blocked_inodelks */ - int ref; + struct list_head list; + struct list_head blocked_locks; /* list_head pointing to blocked_inodelks */ + struct list_head contend; /* list of contending locks */ + int ref; + + off_t fl_start; + off_t fl_end; - short fl_type; - off_t fl_start; - off_t fl_end; + const char *volume; - const char *volume; + struct gf_flock user_flock; /* the flock supplied by the user */ + xlator_t *this; /* required for blocked locks */ + struct __pl_inode *pl_inode; - struct gf_flock user_flock; /* the flock supplied by the user */ - xlator_t *this; /* required for blocked locks */ - struct __pl_inode *pl_inode; + call_frame_t *frame; - call_frame_t *frame; + time_t blkd_time; /* time at which lock was queued into blkd list */ + time_t granted_time; /* time at which lock was queued into active list */ - struct timeval blkd_time; /*time at which lock was queued into blkd list*/ - struct timeval granted_time; /*time at which lock was queued into active list*/ + /*last time at which lock contention was detected and notified*/ + struct timespec contention_time; - /* These two together serve to uniquely identify each process - across nodes */ + /* These two together serve to uniquely identify each process + across nodes */ - void *client; /* to identify client node */ - gf_lkowner_t owner; - pid_t client_pid; /* pid of client process */ + void *client; /* to identify client node */ + gf_lkowner_t owner; + pid_t client_pid; /* pid of client process */ - char *connection_id; /* stores the client connection id */ + char *connection_id; /* stores the client connection id */ - struct list_head client_list; /* list of all locks from a client */ + struct list_head client_list; /* list of all locks from a client */ + short fl_type; + + int32_t status; /* Error code when we try to grant a lock in blocked + state */ }; typedef struct __pl_inode_lock pl_inode_lock_t; -struct __pl_rw_req_t { - struct list_head list; - call_stub_t *stub; - posix_lock_t region; +struct _pl_rw_req { + struct list_head list; + call_stub_t *stub; + posix_lock_t region; }; -typedef struct __pl_rw_req_t pl_rw_req_t; - -struct __pl_dom_list_t { - struct list_head inode_list; /* list_head back to pl_inode_t */ - const char *domain; - struct list_head entrylk_list; /* List of entry locks */ - struct list_head blocked_entrylks; /* List of all blocked entrylks */ - struct list_head inodelk_list; /* List of inode locks */ - struct list_head blocked_inodelks; /* List of all blocked inodelks */ +typedef struct _pl_rw_req pl_rw_req_t; + +struct _pl_dom_list { + struct list_head inode_list; /* list_head back to pl_inode_t */ + const char *domain; + struct list_head entrylk_list; /* List of entry locks */ + struct list_head blocked_entrylks; /* List of all blocked entrylks */ + struct list_head inodelk_list; /* List of inode locks */ + struct list_head blocked_inodelks; /* List of all blocked inodelks */ }; -typedef struct __pl_dom_list_t pl_dom_list_t; +typedef struct _pl_dom_list pl_dom_list_t; struct __entry_lock { - struct list_head domain_list; /* list_head back to pl_dom_list_t */ - struct list_head blocked_locks; /* list_head back to 
blocked_entrylks */ - int ref; + struct list_head domain_list; /* list_head back to pl_dom_list_t */ + struct list_head blocked_locks; /* list_head back to blocked_entrylks */ + struct list_head contend; /* list of contending locks */ + int ref; + + call_frame_t *frame; + xlator_t *this; + struct __pl_inode *pinode; - call_frame_t *frame; - xlator_t *this; - struct __pl_inode *pinode; + const char *volume; - const char *volume; + const char *basename; - const char *basename; - entrylk_type type; + time_t blkd_time; /* time at which lock was queued into blkd list */ + time_t granted_time; /* time at which lock was queued into active list */ - struct timeval blkd_time; /*time at which lock was queued into blkd list*/ - struct timeval granted_time; /*time at which lock was queued into active list*/ + /*last time at which lock contention was detected and notified*/ + struct timespec contention_time; - void *client; - gf_lkowner_t owner; - pid_t client_pid; /* pid of client process */ + void *client; + gf_lkowner_t owner; + pid_t client_pid; /* pid of client process */ - char *connection_id; /* stores the client connection id */ + char *connection_id; /* stores the client connection id */ - struct list_head client_list; /* list of all locks from a client */ + struct list_head client_list; /* list of all locks from a client */ + entrylk_type type; }; typedef struct __entry_lock pl_entry_lock_t; - /* The "simulated" inode. This contains a list of all the locks associated with this file */ struct __pl_inode { - pthread_mutex_t mutex; - - struct list_head dom_list; /* list of domains */ - struct list_head ext_list; /* list of fcntl locks */ - struct list_head rw_list; /* list of waiting r/w requests */ - struct list_head reservelk_list; /* list of reservelks */ - struct list_head blocked_reservelks; /* list of blocked reservelks */ - struct list_head blocked_calls; /* List of blocked lock calls while a reserve is held*/ - int mandatory; /* if mandatory locking is enabled */ - - inode_t *refkeeper; /* hold refs on an inode while locks are - held to prevent pruning */ + pthread_mutex_t mutex; + + struct list_head dom_list; /* list of domains */ + struct list_head ext_list; /* list of fcntl locks */ + struct list_head rw_list; /* list of waiting r/w requests */ + struct list_head reservelk_list; /* list of reservelks */ + struct list_head blocked_reservelks; /* list of blocked reservelks */ + struct list_head blocked_calls; /* List of blocked lock calls while a + reserve is held*/ + struct list_head metalk_list; /* Meta lock list */ + struct list_head queued_locks; /* This is to store the incoming lock + requests while meta lock is enabled */ + struct list_head waiting; /* List of pending fops waiting to unlink/rmdir + the inode. */ + int mandatory; /* if mandatory locking is enabled */ + + inode_t *refkeeper; /* hold refs on an inode while locks are + held to prevent pruning */ + uuid_t gfid; /* placeholder for gfid of the inode */ + inode_t *inode; /* pointer to be used for ref and unref + of inode_t as long as there are + locks on it */ + gf_boolean_t migrated; + + /* Flag to indicate whether to read mlock-enforce xattr from disk */ + gf_boolean_t check_mlock_info; + + /* Mandatory_lock enforce: IO will be allowed if and only if the lkowner has + held the lock. + + Note: An xattr is set on the file to recover this information post + reboot. 
If client does not want mandatory lock to be enforced, then it + should remove this xattr explicitly + */ + gf_boolean_t mlock_enforced; + /* There are scenarios where mandatory lock is granted but there are IOs + pending at posix level. To avoid this before preempting the previous lock + owner, we wait for all the fops to be unwound. + */ + int fop_wind_count; + pthread_cond_t check_fop_wind_count; + + gf_boolean_t track_fop_wind_count; + + int32_t links; /* Number of hard links the inode has. */ + uint32_t remove_running; /* Number of remove operations running. */ + gf_boolean_t is_locked; /* Regular locks will be blocked. */ + gf_boolean_t removed; /* The inode has been deleted. */ }; typedef struct __pl_inode pl_inode_t; +struct __pl_metalk { + pthread_mutex_t mutex; + /* For pl_inode meta lock list */ + struct list_head list; + /* For pl_ctx_t list */ + struct list_head client_list; + char *client_uid; + + pl_inode_t *pl_inode; + int ref; +}; +typedef struct __pl_metalk pl_meta_lock_t; typedef struct { - gf_boolean_t mandatory; /* if mandatory locking is enabled */ - gf_boolean_t trace; /* trace lock requests in and out */ - char *brickname; + char *brickname; + uint32_t revocation_secs; + uint32_t revocation_max_blocked; + uint32_t notify_contention_delay; + mlk_mode_t mandatory_mode; /* holds current mandatory locking mode */ + gf_boolean_t trace; /* trace lock requests in and out */ + gf_boolean_t monkey_unlocking; + gf_boolean_t revocation_clear_all; + gf_boolean_t notify_contention; + gf_boolean_t mlock_enforced; } posix_locks_private_t; - typedef struct { - gf_boolean_t entrylk_count_req; - gf_boolean_t inodelk_count_req; - gf_boolean_t inodelk_dom_count_req; - gf_boolean_t posixlk_count_req; - gf_boolean_t parent_entrylk_req; - - /* used by {f,}truncate */ - loc_t loc; - fd_t *fd; - off_t offset; - dict_t *xdata; - enum {TRUNCATE, FTRUNCATE} op; + data_t *inodelk_dom_count_req; + + dict_t *xdata; + loc_t loc[2]; + fd_t *fd; + inode_t *inode; + off_t offset; + glusterfs_fop_t op; + gf_boolean_t entrylk_count_req; + gf_boolean_t inodelk_count_req; + gf_boolean_t posixlk_count_req; + gf_boolean_t parent_entrylk_req; + gf_boolean_t multiple_dom_lk_requests; + int update_mlock_enforced_flag; } pl_local_t; - typedef struct { - struct list_head locks_list; + struct list_head locks_list; } pl_fdctx_t; - struct _locker { - struct list_head lockers; - char *volume; - inode_t *inode; - gf_lkowner_t owner; + struct list_head lockers; + char *volume; + inode_t *inode; + gf_lkowner_t owner; }; typedef struct _locks_ctx { - pthread_mutex_t lock; - struct list_head inodelk_lockers; - struct list_head entrylk_lockers; + pthread_mutex_t lock; + struct list_head inodelk_lockers; + struct list_head entrylk_lockers; + struct list_head metalk_list; } pl_ctx_t; +typedef struct _multi_dom_lk_data { + xlator_t *this; + inode_t *inode; + dict_t *xdata_rsp; + gf_boolean_t keep_max; +} multi_dom_lk_data; + +typedef enum { DECREMENT, INCREMENT } pl_count_op_t; pl_ctx_t * -pl_ctx_get (client_t *client, xlator_t *xlator); +pl_ctx_get(client_t *client, xlator_t *xlator); int -pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx); +pl_inodelk_client_cleanup(xlator_t *this, pl_ctx_t *ctx); int -pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx); +pl_entrylk_client_cleanup(xlator_t *this, pl_ctx_t *ctx); #endif /* __POSIX_LOCKS_H__ */ diff --git a/xlators/features/locks/src/pl-messages.h b/xlators/features/locks/src/pl-messages.h new file mode 100644 index 00000000000..e2d3d7ca974 --- /dev/null +++ 
b/xlators/features/locks/src/pl-messages.h @@ -0,0 +1,29 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _PL_MESSAGES_H_ +#define _PL_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(PL, PL_MSG_LOCK_NUMBER, PL_MSG_INODELK_CONTENTION_FAILED, + PL_MSG_ENTRYLK_CONTENTION_FAILED); + +#endif /* !_PL_MESSAGES_H_ */ diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c index 337623d6577..cf0ae4c57dd 100644 --- a/xlators/features/locks/src/posix.c +++ b/xlators/features/locks/src/posix.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2006-2012, 2016 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -12,2718 +12,5084 @@ #include <limits.h> #include <pthread.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "compat.h" -#include "xlator.h" -#include "inode.h" -#include "logging.h" -#include "common-utils.h" +#include <glusterfs/compat.h> +#include <glusterfs/logging.h> #include "locks.h" #include "common.h" -#include "statedump.h" +#include <glusterfs/statedump.h> #include "clear.h" -#include "defaults.h" -#include "syncop.h" +#include <glusterfs/defaults.h> +#include <glusterfs/syncop.h> #ifndef LLONG_MAX #define LLONG_MAX LONG_LONG_MAX /* compat with old gcc */ -#endif /* LLONG_MAX */ +#endif /* LLONG_MAX */ /* Forward declarations */ +void +do_blocked_rw(pl_inode_t *); +static int +__rw_allowable(pl_inode_t *, posix_lock_t *, glusterfs_fop_t); +static int +format_brickname(char *); +int +pl_lockinfo_get_brickname(xlator_t *, inode_t *, int32_t *); +static int +fetch_pathinfo(xlator_t *, inode_t *, int32_t *, char **); -void do_blocked_rw (pl_inode_t *); -static int __rw_allowable (pl_inode_t *, posix_lock_t *, glusterfs_fop_t); -static int format_brickname(char *); -int pl_lockinfo_get_brickname (xlator_t *, inode_t *, int32_t *); -static int fetch_pathinfo(xlator_t *, inode_t *, int32_t *, char **); +/* + * The client is always requesting data, but older + * servers were not returning it. Newer ones are, so + * the client is receiving a mix of NULL and non-NULL + * xdata in the answers when bricks are of different + * versions. This triggers a bug in older clients. + * To prevent that, we avoid returning extra xdata to + * older clients (making the newer brick to behave as + * an old brick). + */ +#define PL_STACK_UNWIND_FOR_CLIENT(fop, xdata, frame, op_ret, params...) 
\ + do { \ + pl_local_t *__local = NULL; \ + if (frame->root->client && \ + (frame->root->client->opversion < GD_OP_VERSION_3_10_0)) { \ + __local = frame->local; \ + PL_STACK_UNWIND_AND_FREE(__local, fop, frame, op_ret, params); \ + } else { \ + PL_STACK_UNWIND(fop, xdata, frame, op_ret, params); \ + } \ + } while (0) + +#define PL_STACK_UNWIND(fop, xdata, frame, op_ret, params...) \ + do { \ + pl_local_t *__local = NULL; \ + inode_t *__parent = NULL; \ + inode_t *__inode = NULL; \ + char *__name = NULL; \ + dict_t *__unref = NULL; \ + int __i = 0; \ + __local = frame->local; \ + if (op_ret >= 0 && pl_needs_xdata_response(frame->local)) { \ + if (xdata) \ + dict_ref(xdata); \ + else \ + xdata = dict_new(); \ + if (xdata) { \ + __unref = xdata; \ + while (__local->fd || __local->loc[__i].inode) { \ + pl_get_xdata_rsp_args(__local, #fop, &__parent, &__inode, \ + &__name, __i); \ + pl_set_xdata_response(frame->this, __local, __parent, \ + __inode, __name, xdata, __i > 0); \ + if (__local->fd || __i == 1) \ + break; \ + __i++; \ + } \ + } \ + } \ + PL_STACK_UNWIND_AND_FREE(__local, fop, frame, op_ret, params); \ + if (__unref) \ + dict_unref(__unref); \ + } while (0) + +#define PL_LOCAL_GET_REQUESTS(frame, this, xdata, __fd, __loc, __newloc) \ + do { \ + if (pl_has_xdata_requests(xdata)) { \ + if (!frame->local) \ + frame->local = mem_get0(this->local_pool); \ + pl_local_t *__local = frame->local; \ + if (__local) { \ + if (__fd) { \ + __local->fd = fd_ref(__fd); \ + __local->inode = inode_ref(__fd->inode); \ + } else { \ + if (__loc) \ + loc_copy(&__local->loc[0], __loc); \ + if (__newloc) \ + loc_copy(&__local->loc[1], __newloc); \ + __local->inode = inode_ref(__local->loc[0].inode); \ + } \ + pl_get_xdata_requests(__local, xdata); \ + } \ + } \ + } while (0) + +#define PL_CHECK_LOCK_ENFORCE_KEY(frame, dict, name, this, loc, fd, priv) \ + do { \ + if ((dict && (dict_get(dict, GF_ENFORCE_MANDATORY_LOCK))) || \ + (name && (strcmp(name, GF_ENFORCE_MANDATORY_LOCK) == 0))) { \ + inode_t *__inode = (loc ? loc->inode : fd->inode); \ + pl_inode_t *__pl_inode = pl_inode_get(this, __inode, NULL); \ + if (__pl_inode == NULL) { \ + op_ret = -1; \ + op_errno = ENOMEM; \ + goto unwind; \ + } \ + if (!pl_is_mandatory_locking_enabled(__pl_inode) || \ + !priv->mlock_enforced) { \ + op_ret = -1; \ + gf_msg(this->name, GF_LOG_DEBUG, EINVAL, 0, \ + "option %s would need mandatory lock to be enabled " \ + "and feature.enforce-mandatory-lock option to be set " \ + "to on", \ + GF_ENFORCE_MANDATORY_LOCK); \ + op_errno = EINVAL; \ + goto unwind; \ + } \ + \ + op_ret = pl_local_init(frame, this, loc, fd); \ + if (op_ret) { \ + op_errno = ENOMEM; \ + goto unwind; \ + } \ + \ + ((pl_local_t *)(frame->local))->update_mlock_enforced_flag = 1; \ + } \ + } while (0) + +#define PL_INODE_REMOVE(_fop, _frame, _xl, _loc1, _loc2, _cont, _cbk, \ + _args...) \ + ({ \ + struct list_head contend; \ + pl_inode_t *__pl_inode; \ + call_stub_t *__stub; \ + int32_t __error; \ + INIT_LIST_HEAD(&contend); \ + __error = pl_inode_remove_prepare(_xl, _frame, _loc2 ? 
_loc2 : _loc1, \ + &__pl_inode, &contend); \ + if (__error < 0) { \ + __stub = fop_##_fop##_stub(_frame, _cont, ##_args); \ + __error = pl_inode_remove_complete(_xl, __pl_inode, __stub, \ + &contend); \ + } else if (__error == 0) { \ + PL_LOCAL_GET_REQUESTS(_frame, _xl, xdata, ((fd_t *)NULL), _loc1, \ + _loc2); \ + STACK_WIND_COOKIE(_frame, _cbk, __pl_inode, FIRST_CHILD(_xl), \ + FIRST_CHILD(_xl)->fops->_fop, ##_args); \ + } \ + __error; \ + }) + +gf_boolean_t +pl_has_xdata_requests(dict_t *xdata) +{ + static char *reqs[] = {GLUSTERFS_ENTRYLK_COUNT, + GLUSTERFS_INODELK_COUNT, + GLUSTERFS_INODELK_DOM_COUNT, + GLUSTERFS_POSIXLK_COUNT, + GLUSTERFS_PARENT_ENTRYLK, + GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS, + NULL}; + static int reqs_size[] = {SLEN(GLUSTERFS_ENTRYLK_COUNT), + SLEN(GLUSTERFS_INODELK_COUNT), + SLEN(GLUSTERFS_INODELK_DOM_COUNT), + SLEN(GLUSTERFS_POSIXLK_COUNT), + SLEN(GLUSTERFS_PARENT_ENTRYLK), + SLEN(GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS), + 0}; + int i = 0; + + if (!xdata) + return _gf_false; + + for (i = 0; reqs[i]; i++) + if (dict_getn(xdata, reqs[i], reqs_size[i])) + return _gf_true; + + return _gf_false; +} -static pl_fdctx_t * -pl_new_fdctx () +static int +dict_delete_domain_key(dict_t *dict, char *key, data_t *value, void *data) { - pl_fdctx_t *fdctx = NULL; + dict_del(dict, key); + return 0; +} - fdctx = GF_CALLOC (1, sizeof (*fdctx), - gf_locks_mt_pl_fdctx_t); - GF_VALIDATE_OR_GOTO ("posix-locks", fdctx, out); +void +pl_get_xdata_requests(pl_local_t *local, dict_t *xdata) +{ + if (!local || !xdata) + return; - INIT_LIST_HEAD (&fdctx->locks_list); + GF_ASSERT(local->xdata == NULL); + local->xdata = dict_copy_with_ref(xdata, NULL); + + if (dict_get_sizen(xdata, GLUSTERFS_ENTRYLK_COUNT)) { + local->entrylk_count_req = 1; + dict_del_sizen(xdata, GLUSTERFS_ENTRYLK_COUNT); + } + if (dict_get_sizen(xdata, GLUSTERFS_INODELK_COUNT)) { + local->inodelk_count_req = 1; + dict_del_sizen(xdata, GLUSTERFS_INODELK_COUNT); + } + if (dict_get_sizen(xdata, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS)) { + local->multiple_dom_lk_requests = 1; + dict_del_sizen(xdata, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS); + dict_foreach_fnmatch(xdata, GLUSTERFS_INODELK_DOM_PREFIX "*", + dict_delete_domain_key, NULL); + } + + local->inodelk_dom_count_req = dict_get_sizen(xdata, + GLUSTERFS_INODELK_DOM_COUNT); + if (local->inodelk_dom_count_req) { + data_ref(local->inodelk_dom_count_req); + dict_del_sizen(xdata, GLUSTERFS_INODELK_DOM_COUNT); + } + + if (dict_get_sizen(xdata, GLUSTERFS_POSIXLK_COUNT)) { + local->posixlk_count_req = 1; + dict_del_sizen(xdata, GLUSTERFS_POSIXLK_COUNT); + } + + if (dict_get_sizen(xdata, GLUSTERFS_PARENT_ENTRYLK)) { + local->parent_entrylk_req = 1; + dict_del_sizen(xdata, GLUSTERFS_PARENT_ENTRYLK); + } +} -out: - return fdctx; +gf_boolean_t +pl_needs_xdata_response(pl_local_t *local) +{ + if (!local) + return _gf_false; + + if (local->parent_entrylk_req || local->entrylk_count_req || + local->inodelk_dom_count_req || local->inodelk_count_req || + local->posixlk_count_req || local->multiple_dom_lk_requests) + return _gf_true; + + return _gf_false; } -static pl_fdctx_t * -pl_check_n_create_fdctx (xlator_t *this, fd_t *fd) +void +pl_get_xdata_rsp_args(pl_local_t *local, char *fop, inode_t **parent, + inode_t **inode, char **name, int i) +{ + if (strcmp(fop, "lookup") == 0) { + *parent = local->loc[0].parent; + *inode = local->loc[0].inode; + *name = (char *)local->loc[0].name; + } else { + if (local->fd) { + *inode = local->fd->inode; + } else { + *inode = local->loc[i].parent; + } + } 
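    /* Editor's note (illustrative sketch, not part of this change): the
     * request/response keys handled above let a client-side xlator ask for
     * lock counts to be piggy-backed on an ordinary fop. Assuming the
     * public dict API and the GLUSTERFS_INODELK_COUNT key consumed by
     * pl_get_xdata_requests(), a caller could look like this
     * (my_lookup_cbk and rsp_xdata are hypothetical names):
     *
     *     dict_t *xdata = dict_new ();
     *     if (xdata &&
     *         dict_set_int32 (xdata, GLUSTERFS_INODELK_COUNT, 0) == 0)
     *         STACK_WIND (frame, my_lookup_cbk, FIRST_CHILD (this),
     *                     FIRST_CHILD (this)->fops->lookup, loc, xdata);
     *
     *     // in my_lookup_cbk(): read the count filled in by this xlator
     *     int32_t count = 0;
     *     if (rsp_xdata &&
     *         dict_get_int32 (rsp_xdata, GLUSTERFS_INODELK_COUNT,
     *                         &count) == 0)
     *         gf_msg_debug (this->name, 0, "inodelk count: %d", count);
     */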
+} + +static inline int +pl_track_io_fop_count(pl_local_t *local, xlator_t *this, pl_count_op_t op) { - int ret = 0; - uint64_t tmp = 0; - pl_fdctx_t *fdctx = NULL; + pl_inode_t *pl_inode = NULL; - GF_VALIDATE_OR_GOTO ("posix-locks", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); + if (!local) + return -1; - LOCK (&fd->lock); - { - ret = __fd_ctx_get (fd, this, &tmp); - if ((ret != 0) || (tmp == 0)) { - fdctx = pl_new_fdctx (); - if (fdctx == NULL) { - goto unlock; - } - } + pl_inode = pl_inode_get(this, local->inode, NULL); + if (!pl_inode) + return -1; - ret = __fd_ctx_set (fd, this, (uint64_t)(long)fdctx); - if (ret != 0) { - GF_FREE (fdctx); - fdctx = NULL; - gf_log (this->name, GF_LOG_DEBUG, - "failed to set fd ctx"); + if (pl_inode->mlock_enforced && pl_inode->track_fop_wind_count) { + pthread_mutex_lock(&pl_inode->mutex); + { + if (op == DECREMENT) { + pl_inode->fop_wind_count--; + /* fop_wind_count can go negative when lock enforcement is + * enabled on unwind path of an IO. Hence the "<" comparision. + */ + if (pl_inode->fop_wind_count <= 0) { + pthread_cond_broadcast(&pl_inode->check_fop_wind_count); + pl_inode->track_fop_wind_count = _gf_false; + pl_inode->fop_wind_count = 0; } + } else { + pl_inode->fop_wind_count++; + } } -unlock: - UNLOCK (&fd->lock); + pthread_mutex_unlock(&pl_inode->mutex); + } -out: - return fdctx; + return 0; } -int -pl_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +static int32_t +__get_posixlk_count(pl_inode_t *pl_inode) { - pl_local_t *local = NULL; + posix_lock_t *lock = NULL; + int32_t count = 0; - local = frame->local; + list_for_each_entry(lock, &pl_inode->ext_list, list) { count++; } - if (local->op == TRUNCATE) - loc_wipe (&local->loc); + return count; +} - if (local->xdata) - dict_unref (local->xdata); - if (local->fd) - fd_unref (local->fd); +int32_t +get_posixlk_count(xlator_t *this, inode_t *inode) +{ + pl_inode_t *pl_inode = NULL; + uint64_t tmp_pl_inode = 0; + int32_t count = 0; - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, - prebuf, postbuf, xdata); - return 0; + int ret = inode_ctx_get(inode, this, &tmp_pl_inode); + if (ret != 0) { + goto out; + } + + pl_inode = (pl_inode_t *)(long)tmp_pl_inode; + + pthread_mutex_lock(&pl_inode->mutex); + { + count = __get_posixlk_count(pl_inode); + } + pthread_mutex_unlock(&pl_inode->mutex); + +out: + return count; } +void +pl_parent_entrylk_xattr_fill(xlator_t *this, inode_t *parent, char *basename, + dict_t *dict, gf_boolean_t keep_max) +{ + int32_t entrylk = 0; + int32_t maxcount = -1; + int ret = -1; + + if (!parent || !basename) + goto out; + if (keep_max) { + ret = dict_get_int32_sizen(dict, GLUSTERFS_PARENT_ENTRYLK, &maxcount); + if (ret < 0) + gf_msg_debug(this->name, 0, " Failed to fetch the value for key %s", + GLUSTERFS_PARENT_ENTRYLK); + } + entrylk = check_entrylk_on_basename(this, parent, basename); + if (maxcount >= entrylk) + return; +out: + ret = dict_set_int32_sizen(dict, GLUSTERFS_PARENT_ENTRYLK, entrylk); + if (ret < 0) { + gf_msg_debug(this->name, 0, " dict_set failed on key %s", + GLUSTERFS_PARENT_ENTRYLK); + } +} -static int -truncate_allowed (pl_inode_t *pl_inode, - client_t *client, pid_t client_pid, - gf_lkowner_t *owner, off_t offset) +void +pl_entrylk_xattr_fill(xlator_t *this, inode_t *inode, dict_t *dict, + gf_boolean_t keep_max) +{ + int32_t count = 0; + int32_t maxcount = -1; + int ret = -1; + + if (keep_max) { + ret = 
dict_get_int32_sizen(dict, GLUSTERFS_ENTRYLK_COUNT, &maxcount); + if (ret < 0) + gf_msg_debug(this->name, 0, " Failed to fetch the value for key %s", + GLUSTERFS_ENTRYLK_COUNT); + } + count = get_entrylk_count(this, inode); + if (maxcount >= count) + return; + + ret = dict_set_int32_sizen(dict, GLUSTERFS_ENTRYLK_COUNT, count); + if (ret < 0) { + gf_msg_debug(this->name, 0, " dict_set failed on key %s", + GLUSTERFS_ENTRYLK_COUNT); + } +} + +void +pl_inodelk_xattr_fill(xlator_t *this, inode_t *inode, dict_t *dict, + char *domname, gf_boolean_t keep_max) { - posix_lock_t *l = NULL; - posix_lock_t region = {.list = {0, }, }; - int ret = 1; + int32_t count = 0; + int32_t maxcount = -1; + int ret = -1; + + if (keep_max) { + ret = dict_get_int32_sizen(dict, GLUSTERFS_INODELK_COUNT, &maxcount); + if (ret < 0) + gf_msg_debug(this->name, 0, " Failed to fetch the value for key %s", + GLUSTERFS_INODELK_COUNT); + } + count = get_inodelk_count(this, inode, domname); + if (maxcount >= count) + return; - region.fl_start = offset; - region.fl_end = LLONG_MAX; - region.client = client; - region.client_pid = client_pid; - region.owner = *owner; + ret = dict_set_int32_sizen(dict, GLUSTERFS_INODELK_COUNT, count); + if (ret < 0) { + gf_msg_debug(this->name, 0, + "Failed to set count for " + "key %s", + GLUSTERFS_INODELK_COUNT); + } - pthread_mutex_lock (&pl_inode->mutex); - { - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (!l->blocked - && locks_overlap (®ion, l) - && !same_owner (®ion, l)) { - ret = 0; - gf_log ("posix-locks", GF_LOG_TRACE, "Truncate " - "allowed"); - break; - } - } - } - pthread_mutex_unlock (&pl_inode->mutex); + return; +} - return ret; +void +pl_posixlk_xattr_fill(xlator_t *this, inode_t *inode, dict_t *dict, + gf_boolean_t keep_max) +{ + int32_t count = 0; + int32_t maxcount = -1; + int ret = -1; + + if (keep_max) { + ret = dict_get_int32_sizen(dict, GLUSTERFS_POSIXLK_COUNT, &maxcount); + if (ret < 0) + gf_msg_debug(this->name, 0, " Failed to fetch the value for key %s", + GLUSTERFS_POSIXLK_COUNT); + } + count = get_posixlk_count(this, inode); + if (maxcount >= count) + return; + + ret = dict_set_int32_sizen(dict, GLUSTERFS_POSIXLK_COUNT, count); + if (ret < 0) { + gf_msg_debug(this->name, 0, " dict_set failed on key %s", + GLUSTERFS_POSIXLK_COUNT); + } } +void +pl_inodelk_xattr_fill_each(xlator_t *this, inode_t *inode, dict_t *dict, + char *domname, gf_boolean_t keep_max, char *key) +{ + int32_t count = 0; + int32_t maxcount = -1; + int ret = -1; + + if (keep_max) { + ret = dict_get_int32(dict, key, &maxcount); + if (ret < 0) + gf_msg_debug(this->name, 0, " Failed to fetch the value for key %s", + GLUSTERFS_INODELK_COUNT); + } + count = get_inodelk_count(this, inode, domname); + if (maxcount >= count) + return; + + ret = dict_set_int32(dict, key, count); + if (ret < 0) { + gf_msg_debug(this->name, 0, + "Failed to set count for " + "key %s", + key); + } + + return; +} static int -truncate_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - dict_t *xdata) +pl_inodelk_xattr_fill_multiple(dict_t *this, char *key, data_t *value, + void *data) { - posix_locks_private_t *priv = NULL; - pl_local_t *local = NULL; - inode_t *inode = NULL; - pl_inode_t *pl_inode = NULL; + multi_dom_lk_data *d = data; + char *tmp_key = NULL; + char *save_ptr = NULL; + + tmp_key = gf_strdup(key); + if (!tmp_key) + return -1; + + strtok_r(tmp_key, ":", &save_ptr); + if (!*save_ptr) { + if (tmp_key) + GF_FREE(tmp_key); + gf_msg(THIS->name, 
GF_LOG_ERROR, 0, EINVAL, + "Could not tokenize domain string from key %s", key); + return -1; + } + + pl_inodelk_xattr_fill_each(d->this, d->inode, d->xdata_rsp, save_ptr, + d->keep_max, key); + if (tmp_key) + GF_FREE(tmp_key); + + return 0; +} +void +pl_fill_multiple_dom_lk_requests(xlator_t *this, pl_local_t *local, + inode_t *inode, dict_t *dict, + gf_boolean_t keep_max) +{ + multi_dom_lk_data data; - priv = this->private; - local = frame->local; + data.this = this; + data.inode = inode; + data.xdata_rsp = dict; + data.keep_max = keep_max; - if (op_ret != 0) { - gf_log (this->name, GF_LOG_ERROR, - "got error (errno=%d, stderror=%s) from child", - op_errno, strerror (op_errno)); - goto unwind; - } + dict_foreach_fnmatch(local->xdata, GLUSTERFS_INODELK_DOM_PREFIX "*", + pl_inodelk_xattr_fill_multiple, &data); +} - if (local->op == TRUNCATE) - inode = local->loc.inode; - else - inode = local->fd->inode; +void +pl_set_xdata_response(xlator_t *this, pl_local_t *local, inode_t *parent, + inode_t *inode, char *name, dict_t *xdata, + gf_boolean_t max_lock) +{ + if (!xdata || !local) + return; - pl_inode = pl_inode_get (this, inode); - if (!pl_inode) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } + if (local->parent_entrylk_req && parent && name && name[0] != '\0') + pl_parent_entrylk_xattr_fill(this, parent, name, xdata, max_lock); - if (priv->mandatory - && pl_inode->mandatory - && !truncate_allowed (pl_inode, frame->root->client, - frame->root->pid, &frame->root->lk_owner, - local->offset)) { - op_ret = -1; - op_errno = EAGAIN; - goto unwind; - } + if (!inode) + return; - switch (local->op) { - case TRUNCATE: - STACK_WIND (frame, pl_truncate_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->truncate, - &local->loc, local->offset, local->xdata); - break; - case FTRUNCATE: - STACK_WIND (frame, pl_truncate_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->ftruncate, - local->fd, local->offset, local->xdata); - break; - } + if (local->entrylk_count_req) + pl_entrylk_xattr_fill(this, inode, xdata, max_lock); - return 0; + if (local->inodelk_dom_count_req) + pl_inodelk_xattr_fill(this, inode, xdata, + data_to_str(local->inodelk_dom_count_req), + max_lock); -unwind: - gf_log (this->name, GF_LOG_ERROR, "truncate failed with ret: %d, " - "error: %s", op_ret, strerror (op_errno)); - if (local->op == TRUNCATE) - loc_wipe (&local->loc); - if (local->xdata) - dict_unref (local->xdata); - if (local->fd) - fd_unref (local->fd); - - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, buf, NULL, xdata); - return 0; -} + if (local->inodelk_count_req) + pl_inodelk_xattr_fill(this, inode, xdata, NULL, max_lock); + if (local->posixlk_count_req) + pl_posixlk_xattr_fill(this, inode, xdata, max_lock); + + if (local->multiple_dom_lk_requests) + pl_fill_multiple_dom_lk_requests(this, local, inode, xdata, max_lock); +} +/* Checks whether the region where fop is acting upon conflicts + * with existing locks. If there is no conflict function returns + * 1 else returns 0 with can_block boolean set accordingly to + * indicate block/fail the fop. 
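 *
 * Editor's note (illustrative, mirroring the pattern used by the fops
 * below such as pl_discard and pl_zerofill): the caller builds a
 * posix_lock_t describing the byte range and consults this helper under
 * pl_inode->mutex before deciding what to do with the fop:
 *
 *     pthread_mutex_lock (&pl_inode->mutex);
 *     {
 *         allowed = pl_is_fop_allowed (pl_inode, &region, fd, op,
 *                                      &can_block);
 *         if (!allowed && can_block) {
 *             // queue a call stub on pl_inode->rw_list; it is resumed
 *             // later by do_blocked_rw() once the conflict clears
 *         }
 *     }
 *     pthread_mutex_unlock (&pl_inode->mutex);
 *     // allowed == 1: wind the fop; !allowed && !can_block: fail EAGAIN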
+ */ int -pl_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset, dict_t *xdata) +pl_is_fop_allowed(pl_inode_t *pl_inode, posix_lock_t *region, fd_t *fd, + glusterfs_fop_t op, gf_boolean_t *can_block) { - pl_local_t *local = NULL; + int ret = 0; + + if (!__rw_allowable(pl_inode, region, op)) { + if (pl_inode->mlock_enforced) { + *can_block = _gf_false; + } else if ((!fd) || (fd && (fd->flags & O_NONBLOCK))) { + gf_log("locks", GF_LOG_TRACE, + "returning EAGAIN" + " because fd is O_NONBLOCK"); + *can_block = _gf_false; + } else { + *can_block = _gf_true; + } + } else { + ret = 1; + } - local = mem_get0 (this->local_pool); - GF_VALIDATE_OR_GOTO (this->name, local, unwind); + return ret; +} - local->op = TRUNCATE; - local->offset = offset; - loc_copy (&local->loc, loc); - if (xdata) - local->xdata = dict_ref (xdata); +static pl_fdctx_t * +pl_new_fdctx() +{ + pl_fdctx_t *fdctx = GF_MALLOC(sizeof(*fdctx), gf_locks_mt_pl_fdctx_t); + GF_VALIDATE_OR_GOTO("posix-locks", fdctx, out); - frame->local = local; + INIT_LIST_HEAD(&fdctx->locks_list); - STACK_WIND (frame, truncate_stat_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->stat, loc, NULL); +out: + return fdctx; +} - return 0; +static pl_fdctx_t * +pl_check_n_create_fdctx(xlator_t *this, fd_t *fd) +{ + int ret = 0; + uint64_t tmp = 0; + pl_fdctx_t *fdctx = NULL; + + GF_VALIDATE_OR_GOTO("posix-locks", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + LOCK(&fd->lock); + { + ret = __fd_ctx_get(fd, this, &tmp); + if ((ret != 0) || (tmp == 0)) { + fdctx = pl_new_fdctx(); + if (fdctx == NULL) { + goto unlock; + } + } -unwind: - gf_log (this->name, GF_LOG_ERROR, "truncate for %s failed with ret: %d, " - "error: %s", loc->path, -1, strerror (ENOMEM)); - STACK_UNWIND_STRICT (truncate, frame, -1, ENOMEM, NULL, NULL, NULL); + ret = __fd_ctx_set(fd, this, (uint64_t)(long)fdctx); + if (ret != 0) { + GF_FREE(fdctx); + fdctx = NULL; + UNLOCK(&fd->lock); + gf_log(this->name, GF_LOG_DEBUG, "failed to set fd ctx"); + goto out; + } + } +unlock: + UNLOCK(&fd->lock); - return 0; +out: + return fdctx; } +int32_t +pl_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + pl_track_io_fop_count(frame->local, this, DECREMENT); + + PL_STACK_UNWIND(discard, xdata, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} int -pl_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset, dict_t *xdata) +pl_discard_cont(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { - pl_local_t *local = NULL; + pl_track_io_fop_count(frame->local, this, INCREMENT); + + STACK_WIND(frame, pl_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; +} + +int32_t +pl_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; + pl_rw_req_t *rw = NULL; + posix_lock_t region = { + .list = + { + 0, + }, + }; + gf_boolean_t enabled = _gf_false; + gf_boolean_t can_block = _gf_true; + int op_ret = 0; + int op_errno = 0; + int allowed = 1; + + GF_VALIDATE_OR_GOTO("locks", this, unwind); + + local = mem_get0(this->local_pool); + if (!local) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + frame->local = local; + local->inode = inode_ref(fd->inode); + local->fd = fd_ref(fd); + + pl_inode = pl_inode_get(this, fd->inode, local); + if 
(!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + if (frame->root->pid < 0) + enabled = _gf_false; + else + enabled = pl_is_mandatory_locking_enabled(pl_inode); + + if (enabled) { + region.fl_start = offset; + region.fl_end = offset + len - 1; + region.client = frame->root->client; + region.fd_num = fd_to_fdnum(fd); + region.client_pid = frame->root->pid; + region.owner = frame->root->lk_owner; + + pthread_mutex_lock(&pl_inode->mutex); + { + allowed = pl_is_fop_allowed(pl_inode, ®ion, fd, GF_FOP_DISCARD, + &can_block); + if (allowed == 1) { + if (pl_inode->mlock_enforced && + pl_inode->track_fop_wind_count) { + pl_inode->fop_wind_count++; + } + goto unlock; + } else if (!can_block) { + op_errno = EAGAIN; + op_ret = -1; + goto unlock; + } - local = mem_get0 (this->local_pool); - GF_VALIDATE_OR_GOTO (this->name, local, unwind); + rw = GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t); + if (!rw) { + op_errno = ENOMEM; + op_ret = -1; + goto unlock; + } - local->op = FTRUNCATE; - local->offset = offset; - local->fd = fd_ref (fd); - if (xdata) - local->xdata = dict_ref (xdata); + rw->stub = fop_discard_stub(frame, pl_discard_cont, fd, offset, len, + xdata); + if (!rw->stub) { + op_errno = ENOMEM; + op_ret = -1; + GF_FREE(rw); + goto unlock; + } - frame->local = local; + rw->region = region; - STACK_WIND (frame, truncate_stat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fstat, fd, xdata); - return 0; + list_add_tail(&rw->list, &pl_inode->rw_list); + } + unlock: + pthread_mutex_unlock(&pl_inode->mutex); + } + if (allowed == 1) + STACK_WIND(frame, pl_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); unwind: - gf_log (this->name, GF_LOG_ERROR, "ftruncate failed with ret: %d, " - "error: %s", -1, strerror (ENOMEM)); - STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL); + if (op_ret == -1) + PL_STACK_UNWIND(discard, xdata, frame, op_ret, op_errno, NULL, NULL, + NULL); - return 0; + return 0; } -int -pl_locks_by_fd (pl_inode_t *pl_inode, fd_t *fd) +int32_t +pl_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - posix_lock_t *l = NULL; - int found = 0; + pl_track_io_fop_count(frame->local, this, DECREMENT); - pthread_mutex_lock (&pl_inode->mutex); - { + PL_STACK_UNWIND(zerofill, xdata, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (l->fd_num == fd_to_fdnum(fd)) { - found = 1; - break; - } - } +int +pl_zerofill_cont(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + pl_track_io_fop_count(frame->local, this, INCREMENT); - } - pthread_mutex_unlock (&pl_inode->mutex); - return found; + STACK_WIND(frame, pl_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; } -static void -delete_locks_of_fd (xlator_t *this, pl_inode_t *pl_inode, fd_t *fd) +int32_t +pl_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) { - posix_lock_t *tmp = NULL; - posix_lock_t *l = NULL; + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; + pl_rw_req_t *rw = NULL; + posix_lock_t region = { + .list = + { + 0, + }, + }; + gf_boolean_t enabled = _gf_false; + gf_boolean_t can_block = _gf_true; + int op_ret = 0; + int op_errno = 0; + int allowed = 1; + + GF_VALIDATE_OR_GOTO("locks", this, unwind); + + local = 
mem_get0(this->local_pool); + if (!local) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + frame->local = local; + local->inode = inode_ref(fd->inode); + local->fd = fd_ref(fd); + + pl_inode = pl_inode_get(this, fd->inode, local); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + if (frame->root->pid < 0) + enabled = _gf_false; + else + enabled = pl_is_mandatory_locking_enabled(pl_inode); + + if (enabled) { + region.fl_start = offset; + region.fl_end = offset + len - 1; + region.client = frame->root->client; + region.fd_num = fd_to_fdnum(fd); + region.client_pid = frame->root->pid; + region.owner = frame->root->lk_owner; + + pthread_mutex_lock(&pl_inode->mutex); + { + allowed = pl_is_fop_allowed(pl_inode, ®ion, fd, GF_FOP_ZEROFILL, + &can_block); + if (allowed == 1) { + if (pl_inode->mlock_enforced && + pl_inode->track_fop_wind_count) { + pl_inode->fop_wind_count++; + } + goto unlock; + } else if (!can_block) { + op_errno = EAGAIN; + op_ret = -1; + goto unlock; + } - struct list_head blocked_list; + rw = GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t); + if (!rw) { + op_errno = ENOMEM; + op_ret = -1; + goto unlock; + } - INIT_LIST_HEAD (&blocked_list); + rw->stub = fop_zerofill_stub(frame, pl_zerofill_cont, fd, offset, + len, xdata); + if (!rw->stub) { + op_errno = ENOMEM; + op_ret = -1; + GF_FREE(rw); + goto unlock; + } - pthread_mutex_lock (&pl_inode->mutex); - { + rw->region = region; - list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) { - if (l->fd_num == fd_to_fdnum(fd)) { - if (l->blocked) { - list_move_tail (&l->list, &blocked_list); - continue; - } - __delete_lock (pl_inode, l); - __destroy_lock (l); - } - } + list_add_tail(&rw->list, &pl_inode->rw_list); + } + unlock: + pthread_mutex_unlock(&pl_inode->mutex); + } - } - pthread_mutex_unlock (&pl_inode->mutex); + if (allowed == 1) + STACK_WIND(frame, pl_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); +unwind: + if (op_ret == -1) + PL_STACK_UNWIND(zerofill, xdata, frame, op_ret, op_errno, NULL, NULL, + NULL); - list_for_each_entry_safe (l, tmp, &blocked_list, list) { - list_del_init(&l->list); - STACK_UNWIND_STRICT (lk, l->frame, -1, EAGAIN, &l->user_flock, - NULL); - __destroy_lock (l); - } + return 0; +} - grant_blocked_locks (this, pl_inode); +int +pl_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + pl_local_t *local = frame->local; - do_blocked_rw (pl_inode); + pl_track_io_fop_count(local, this, DECREMENT); + if (local->op == GF_FOP_TRUNCATE) + PL_STACK_UNWIND(truncate, xdata, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + else + PL_STACK_UNWIND(ftruncate, xdata, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; } -static void -__delete_locks_of_owner (pl_inode_t *pl_inode, - client_t *client, gf_lkowner_t *owner) -{ - posix_lock_t *tmp = NULL; - posix_lock_t *l = NULL; - - /* TODO: what if it is a blocked lock with pending l->frame */ - - list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) { - if (l->blocked) - continue; - if ((l->client == client) && - is_same_lkowner (&l->owner, owner)) { - gf_log ("posix-locks", GF_LOG_TRACE, - " Flushing lock" - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" state: %s", - l->fl_type == F_UNLCK ? "Unlock" : "Lock", - l->client_pid, - lkowner_utoa (&l->owner), - l->user_flock.l_start, - l->user_flock.l_len, - l->blocked == 1 ? 
"Blocked" : "Active"); - - __delete_lock (pl_inode, l); - __destroy_lock (l); - } - } +int +pl_ftruncate_cont(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + pl_track_io_fop_count(frame->local, this, INCREMENT); - return; + STACK_WIND(frame, pl_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; } - -int32_t -pl_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +int +pl_truncate_cont(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + pl_track_io_fop_count(frame->local, this, INCREMENT); + STACK_WIND(frame, pl_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; } -int32_t -pl_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) -{ - int32_t op_errno = EINVAL; - int op_ret = -1; - int32_t bcount = 0; - int32_t gcount = 0; - char key[PATH_MAX] = {0, }; - char *lk_summary = NULL; - pl_inode_t *pl_inode = NULL; - dict_t *dict = NULL; - clrlk_args args = {0,}; - char *brickname = NULL; - - if (!name) - goto usual; - - if (strncmp (name, GF_XATTR_CLRLK_CMD, strlen (GF_XATTR_CLRLK_CMD))) - goto usual; - - if (clrlk_parse_args (name, &args)) { - op_errno = EINVAL; - goto out; - } +static int +truncate_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + pl_local_t *local = frame->local; + inode_t *inode = NULL; + pl_inode_t *pl_inode = NULL; + pl_rw_req_t *rw = NULL; + posix_lock_t region = { + .list = + { + 0, + }, + }; + gf_boolean_t enabled = _gf_false; + gf_boolean_t can_block = _gf_true; + int allowed = 1; + + GF_VALIDATE_OR_GOTO("locks", this, unwind); + + if (op_ret != 0) { + gf_log(this->name, GF_LOG_ERROR, + "got error (errno=%d, stderror=%s) from child", op_errno, + strerror(op_errno)); + goto unwind; + } + + if (local->op == GF_FOP_TRUNCATE) + inode = local->loc[0].inode; + else + inode = local->fd->inode; + + local->inode = inode_ref(inode); + + pl_inode = pl_inode_get(this, inode, local); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + if (frame->root->pid < 0) + enabled = _gf_false; + else + enabled = pl_is_mandatory_locking_enabled(pl_inode); + + if (enabled) { + region.fl_start = local->offset; + region.fl_end = LLONG_MAX; + region.client = frame->root->client; + region.fd_num = fd_to_fdnum(local->fd); + region.client_pid = frame->root->pid; + region.owner = frame->root->lk_owner; + pthread_mutex_lock(&pl_inode->mutex); + { + allowed = pl_is_fop_allowed(pl_inode, ®ion, local->fd, local->op, + &can_block); - dict = dict_new (); - if (!dict) { - op_errno = ENOMEM; - goto out; - } + if (allowed == 1) { + if (pl_inode->mlock_enforced && + pl_inode->track_fop_wind_count) { + pl_inode->fop_wind_count++; + } + goto unlock; + } else if (!can_block) { + op_errno = EAGAIN; + op_ret = -1; + goto unlock; + } - pl_inode = pl_inode_get (this, loc->inode); - if (!pl_inode) { + rw = GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t); + if (!rw) { op_errno = ENOMEM; - goto out; - } + op_ret = -1; + goto unlock; + } + + if (local->op == GF_FOP_TRUNCATE) + rw->stub = fop_truncate_stub(frame, pl_truncate_cont, + &local->loc[0], local->offset, + local->xdata); + else + rw->stub = fop_ftruncate_stub(frame, pl_ftruncate_cont, + 
local->fd, local->offset, + local->xdata); + if (!rw->stub) { + op_errno = ENOMEM; + op_ret = -1; + GF_FREE(rw); + goto unlock; + } - switch (args.type) { - case CLRLK_INODE: - case CLRLK_ENTRY: - op_ret = clrlk_clear_lks_in_all_domains (this, pl_inode, - &args, &bcount, - &gcount, - &op_errno); - if (op_ret) - goto out; - break; - case CLRLK_POSIX: - op_ret = clrlk_clear_posixlk (this, pl_inode, &args, - &bcount, &gcount, - &op_errno); - if (op_ret) - goto out; - break; - case CLRLK_TYPE_MAX: - op_errno = EINVAL; - goto out; - } + rw->region = region; - op_ret = fetch_pathinfo (this, loc->inode, &op_errno, &brickname); - if (op_ret) { - gf_log (this->name, GF_LOG_WARNING, - "Couldn't get brickname"); - } else { - op_ret = format_brickname(brickname); - if (op_ret) { - gf_log (this->name, GF_LOG_WARNING, - "Couldn't format brickname"); - GF_FREE(brickname); - brickname = NULL; - } + list_add_tail(&rw->list, &pl_inode->rw_list); } + unlock: + pthread_mutex_unlock(&pl_inode->mutex); + } - if (!gcount && !bcount) { - if (gf_asprintf (&lk_summary, "No locks cleared.") == -1) { - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - } else if (gf_asprintf (&lk_summary, "%s: %s blocked locks=%d " - "granted locks=%d", - (brickname == NULL)? this->name : brickname, - (args.type == CLRLK_INODE)? "inode": - (args.type == CLRLK_ENTRY)? "entry": - (args.type == CLRLK_POSIX)? "posix": " ", - bcount, gcount) == -1) { - op_ret = -1; - op_errno = ENOMEM; - goto out; + if (allowed == 1) { + switch (local->op) { + case GF_FOP_TRUNCATE: + STACK_WIND(frame, pl_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, &local->loc[0], + local->offset, local->xdata); + break; + case GF_FOP_FTRUNCATE: + STACK_WIND(frame, pl_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, local->fd, + local->offset, local->xdata); + break; + default: + break; } + } +unwind: + if (op_ret == -1) { + gf_log(this ? this->name : "locks", GF_LOG_ERROR, + "truncate failed with " + "ret: %d, error: %s", + op_ret, strerror(op_errno)); - strncpy (key, name, strlen (name)); - if (dict_set_dynstr (dict, key, lk_summary)) { - op_ret = -1; - op_errno = ENOMEM; - goto out; + switch (local->op) { + case GF_FOP_TRUNCATE: + PL_STACK_UNWIND(truncate, xdata, frame, op_ret, op_errno, buf, + NULL, xdata); + break; + case GF_FOP_FTRUNCATE: + PL_STACK_UNWIND(ftruncate, xdata, frame, op_ret, op_errno, buf, + NULL, xdata); + break; + default: + break; } + } + return 0; +} - op_ret = 0; -out: - GF_FREE(brickname); - STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata); +int +pl_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + pl_local_t *local = NULL; + int ret = -1; - GF_FREE (args.opts); - if (op_ret && lk_summary) - GF_FREE (lk_summary); - if (dict) - dict_unref (dict); - return 0; + GF_VALIDATE_OR_GOTO("locks", this, unwind); -usual: - STACK_WIND (frame, pl_getxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); - return 0; + local = mem_get0(this->local_pool); + GF_VALIDATE_OR_GOTO(this->name, local, unwind); + + local->op = GF_FOP_TRUNCATE; + local->offset = offset; + loc_copy(&local->loc[0], loc); + if (xdata) + local->xdata = dict_ref(xdata); + + frame->local = local; + + STACK_WIND(frame, truncate_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, NULL); + ret = 0; + +unwind: + if (ret == -1) { + gf_log(this ? 
this->name : "locks", GF_LOG_ERROR, + "truncate on %s failed with" + " ret: %d, error: %s", + loc->path, -1, strerror(ENOMEM)); + STACK_UNWIND_STRICT(truncate, frame, -1, ENOMEM, NULL, NULL, NULL); + } + return 0; } -static int -format_brickname(char *brickname) +int +pl_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - int ret = -1; - char *hostname = NULL; - char *volume = NULL; - char *saveptr = NULL; + pl_local_t *local = NULL; + int ret = -1; - if (!brickname) - goto out; + GF_VALIDATE_OR_GOTO("locks", this, unwind); + local = mem_get0(this->local_pool); + GF_VALIDATE_OR_GOTO(this->name, local, unwind); - strtok_r(brickname, ":", &saveptr); - hostname = gf_strdup(strtok_r(NULL, ":", &saveptr)); - if (hostname == NULL) - goto out; - volume = gf_strdup(strtok_r(NULL, ".", &saveptr)); - if (volume == NULL) - goto out; + local->op = GF_FOP_FTRUNCATE; + local->offset = offset; + local->fd = fd_ref(fd); + if (xdata) + local->xdata = dict_ref(xdata); - sprintf(brickname, "%s:%s", hostname, volume); + frame->local = local; - ret = 0; -out: - GF_FREE(hostname); - GF_FREE(volume); - return ret; + STACK_WIND(frame, truncate_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + ret = 0; +unwind: + if (ret == -1) { + gf_log(this ? this->name : "locks", GF_LOG_ERROR, + "ftruncate failed with" + " ret: %d, error: %s", + -1, strerror(ENOMEM)); + STACK_UNWIND_STRICT(ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL); + } + return 0; } -static int -fetch_pathinfo (xlator_t *this, inode_t *inode, int32_t *op_errno, - char **brickname) +int +pl_locks_by_fd(pl_inode_t *pl_inode, fd_t *fd) { - int ret = -1; - loc_t loc = {0, }; - dict_t *dict = NULL; + posix_lock_t *l = NULL; + int found = 0; - if (!brickname) - goto out; + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry(l, &pl_inode->ext_list, list) + { + if (l->fd_num == fd_to_fdnum(fd)) { + found = 1; + break; + } + } + } + pthread_mutex_unlock(&pl_inode->mutex); + return found; +} - if (!op_errno) - goto out; +static void +delete_locks_of_fd(xlator_t *this, pl_inode_t *pl_inode, fd_t *fd) +{ + posix_lock_t *tmp = NULL; + posix_lock_t *l = NULL; - uuid_copy (loc.gfid, inode->gfid); - loc.inode = inode_ref (inode); + struct list_head blocked_list; - ret = syncop_getxattr (FIRST_CHILD(this), &loc, &dict, - GF_XATTR_PATHINFO_KEY); - if (ret < 0) { - *op_errno = -ret; - ret = -1; - goto out; + INIT_LIST_HEAD(&blocked_list); + + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(l, tmp, &pl_inode->ext_list, list) + { + if (l->fd_num == fd_to_fdnum(fd)) { + if (l->blocked) { + list_move_tail(&l->list, &blocked_list); + continue; + } + __delete_lock(l); + __destroy_lock(l); + } } + } + pthread_mutex_unlock(&pl_inode->mutex); - ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, brickname); - if (ret) - goto out; + list_for_each_entry_safe(l, tmp, &blocked_list, list) + { + list_del_init(&l->list); + STACK_UNWIND_STRICT(lk, l->frame, -1, EAGAIN, &l->user_flock, NULL); + __destroy_lock(l); + } - *brickname = gf_strdup(*brickname); - if (*brickname == NULL) { - ret = -1; - goto out; - } + grant_blocked_locks(this, pl_inode); - ret = 0; -out: - if (dict != NULL) { - dict_unref (dict); + do_blocked_rw(pl_inode); +} + +static void +__delete_locks_of_owner(pl_inode_t *pl_inode, client_t *client, + gf_lkowner_t *owner) +{ + posix_lock_t *tmp = NULL; + posix_lock_t *l = NULL; + + /* TODO: what if it is a blocked lock with pending l->frame */ + + list_for_each_entry_safe(l, tmp, 
&pl_inode->ext_list, list) + { + if (l->blocked) + continue; + if ((l->client == client) && is_same_lkowner(&l->owner, owner)) { + gf_log("posix-locks", GF_LOG_TRACE, + " Flushing lock" + "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 + " state: %s", + l->fl_type == F_UNLCK ? "Unlock" : "Lock", l->client_pid, + lkowner_utoa(&l->owner), l->user_flock.l_start, + l->user_flock.l_len, l->blocked == 1 ? "Blocked" : "Active"); + + __delete_lock(l); + __destroy_lock(l); } - loc_wipe(&loc); + } - return ret; + return; } +int32_t +pl_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} -int -pl_lockinfo_get_brickname (xlator_t *this, inode_t *inode, int32_t *op_errno) +static int32_t +pl_getxattr_clrlk(xlator_t *this, const char *name, inode_t *inode, + dict_t **dict, int32_t *op_errno) { - int ret = -1; - posix_locks_private_t *priv = NULL; - char *brickname = NULL; - char *end = NULL; - char *tmp = NULL; + int32_t bcount = 0; + int32_t gcount = 0; + char *key = NULL; + char *lk_summary = NULL; + pl_inode_t *pl_inode = NULL; + clrlk_args args = { + 0, + }; + char *brickname = NULL; + int32_t op_ret = -1; + + *op_errno = EINVAL; + + if (clrlk_parse_args(name, &args)) { + *op_errno = EINVAL; + goto out; + } + + *dict = dict_new(); + if (!*dict) { + *op_errno = ENOMEM; + goto out; + } + + pl_inode = pl_inode_get(this, inode, NULL); + if (!pl_inode) { + *op_errno = ENOMEM; + goto out; + } + + switch (args.type) { + case CLRLK_INODE: + case CLRLK_ENTRY: + op_ret = clrlk_clear_lks_in_all_domains(this, pl_inode, &args, + &bcount, &gcount, op_errno); + break; + case CLRLK_POSIX: + op_ret = clrlk_clear_posixlk(this, pl_inode, &args, &bcount, + &gcount, op_errno); + break; + default: + op_ret = -1; + *op_errno = EINVAL; + } + if (op_ret) { + if (args.type >= CLRLK_TYPE_MAX) { + gf_log(this->name, GF_LOG_ERROR, + "clear locks: invalid lock type %d", args.type); + } else { + gf_log(this->name, GF_LOG_ERROR, + "clear locks of type %s failed: %s", + clrlk_type_names[args.type], strerror(*op_errno)); + } - priv = this->private; + goto out; + } - ret = fetch_pathinfo (this, inode, op_errno, &brickname); - if (ret) - goto out; - - end = strrchr (brickname, ':'); - if (!end) { - GF_FREE(brickname); - ret = -1; - goto out; + op_ret = fetch_pathinfo(this, inode, op_errno, &brickname); + if (op_ret) { + gf_log(this->name, GF_LOG_WARNING, "Couldn't get brickname"); + } else { + op_ret = format_brickname(brickname); + if (op_ret) { + gf_log(this->name, GF_LOG_WARNING, "Couldn't format brickname"); + GF_FREE(brickname); + brickname = NULL; } + } - tmp = brickname; - brickname = gf_strndup (brickname, (end - brickname)); - if (brickname == NULL) { - ret = -1; - goto out; + if (!gcount && !bcount) { + if (gf_asprintf(&lk_summary, "No locks cleared.") == -1) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; } + } else if (gf_asprintf(&lk_summary, + "%s: %s blocked locks=%d " + "granted locks=%d", + (brickname == NULL) ? 
this->name : brickname, + clrlk_type_names[args.type], bcount, gcount) == -1) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; + } + gf_log(this->name, GF_LOG_DEBUG, "%s", lk_summary); + + key = gf_strdup(name); + if (!key) { + op_ret = -1; + goto out; + } + if (dict_set_dynstr(*dict, key, lk_summary)) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; + } + + op_ret = 0; - priv->brickname = brickname; - ret = 0; out: - GF_FREE(tmp); - return ret; + GF_FREE(brickname); + GF_FREE(args.opts); + GF_FREE(key); + if (op_ret) { + GF_FREE(lk_summary); + } + + return op_ret; } -char * -pl_lockinfo_key (xlator_t *this, inode_t *inode, int32_t *op_errno) +int32_t +pl_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata) { - posix_locks_private_t *priv = NULL; - char *key = NULL; - int ret = 0; + int32_t op_errno = EINVAL; + int32_t op_ret = -1; + dict_t *dict = NULL; - priv = this->private; + if (!name) + goto usual; - if (priv->brickname == NULL) { - ret = pl_lockinfo_get_brickname (this, inode, op_errno); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "cannot get brickname"); - goto out; - } - } + if (strncmp(name, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD))) + goto usual; - key = priv->brickname; -out: - return key; + op_ret = pl_getxattr_clrlk(this, name, loc->inode, &dict, &op_errno); + + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xdata); + + if (dict) + dict_unref(dict); + return 0; + +usual: + STACK_WIND(frame, pl_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + return 0; } -int32_t -pl_fgetxattr_handle_lockinfo (xlator_t *this, fd_t *fd, - dict_t *dict, int32_t *op_errno) +static int +format_brickname(char *brickname) { - pl_inode_t *pl_inode = NULL; - char *key = NULL, *buf = NULL; - int32_t op_ret = 0; - unsigned long fdnum = 0; - int32_t len = 0; - dict_t *tmp = NULL; + int ret = -1; + char *hostname = NULL; + char *volume = NULL; + char *saveptr = NULL; - pl_inode = pl_inode_get (this, fd->inode); + if (!brickname) + goto out; - if (!pl_inode) { - gf_log (this->name, GF_LOG_DEBUG, "Could not get inode."); - *op_errno = EBADFD; - op_ret = -1; - goto out; - } + strtok_r(brickname, ":", &saveptr); + hostname = gf_strdup(strtok_r(NULL, ":", &saveptr)); + if (hostname == NULL) + goto out; + volume = gf_strdup(strtok_r(NULL, ".", &saveptr)); + if (volume == NULL) + goto out; - if (!pl_locks_by_fd (pl_inode, fd)) { - op_ret = 0; - goto out; - } + sprintf(brickname, "%s:%s", hostname, volume); - fdnum = fd_to_fdnum (fd); + ret = 0; +out: + GF_FREE(hostname); + GF_FREE(volume); + return ret; +} - key = pl_lockinfo_key (this, fd->inode, op_errno); - if (key == NULL) { - op_ret = -1; - goto out; - } +static int +fetch_pathinfo(xlator_t *this, inode_t *inode, int32_t *op_errno, + char **brickname) +{ + int ret = -1; + loc_t loc = { + 0, + }; + dict_t *dict = NULL; + + if (!brickname) + goto out; + + if (!op_errno) + goto out; + + gf_uuid_copy(loc.gfid, inode->gfid); + loc.inode = inode_ref(inode); + + ret = syncop_getxattr(FIRST_CHILD(this), &loc, &dict, GF_XATTR_PATHINFO_KEY, + NULL, NULL); + if (ret < 0) { + *op_errno = -ret; + ret = -1; + goto out; + } + + ret = dict_get_str_sizen(dict, GF_XATTR_PATHINFO_KEY, brickname); + if (ret) + goto out; + + *brickname = gf_strdup(*brickname); + if (*brickname == NULL) { + ret = -1; + goto out; + } + + ret = 0; +out: + if (dict != NULL) { + dict_unref(dict); + } + loc_wipe(&loc); - tmp = dict_new (); - if (tmp == NULL) { - op_ret = -1; - 
*op_errno = ENOMEM; - goto out; - } + return ret; +} - op_ret = dict_set_uint64 (tmp, key, fdnum); - if (op_ret < 0) { - *op_errno = -op_ret; - op_ret = -1; - gf_log (this->name, GF_LOG_WARNING, "setting lockinfo value " - "(%lu) for fd (ptr:%p inode-gfid:%s) failed (%s)", - fdnum, fd, uuid_utoa (fd->inode->gfid), - strerror (*op_errno)); - goto out; - } +int +pl_lockinfo_get_brickname(xlator_t *this, inode_t *inode, int32_t *op_errno) +{ + posix_locks_private_t *priv = this->private; + char *brickname = NULL; + char *end = NULL; + char *tmp = NULL; - len = dict_serialized_length (tmp); - if (len < 0) { - *op_errno = -op_ret; - op_ret = -1; - gf_log (this->name, GF_LOG_WARNING, - "dict_serialized_length failed (%s) while handling " - "lockinfo for fd (ptr:%p inode-gfid:%s)", - strerror (*op_errno), fd, uuid_utoa (fd->inode->gfid)); - goto out; - } + int ret = fetch_pathinfo(this, inode, op_errno, &brickname); + if (ret) + goto out; - buf = GF_CALLOC (1, len, gf_common_mt_char); - if (buf == NULL) { - op_ret = -1; - *op_errno = ENOMEM; - goto out; - } + end = strrchr(brickname, ':'); + if (!end) { + GF_FREE(brickname); + ret = -1; + goto out; + } + + tmp = brickname; + brickname = gf_strndup(brickname, (end - brickname)); + if (brickname == NULL) { + ret = -1; + goto out; + } + + priv->brickname = brickname; + ret = 0; +out: + GF_FREE(tmp); + return ret; +} - op_ret = dict_serialize (tmp, buf); - if (op_ret < 0) { - *op_errno = -op_ret; - op_ret = -1; - gf_log (this->name, GF_LOG_WARNING, - "dict_serialize failed (%s) while handling lockinfo " - "for fd (ptr: %p inode-gfid:%s)", strerror (*op_errno), - fd, uuid_utoa (fd->inode->gfid)); - goto out; - } +char * +pl_lockinfo_key(xlator_t *this, inode_t *inode, int32_t *op_errno) +{ + posix_locks_private_t *priv = this->private; + char *key = NULL; + int ret = 0; - op_ret = dict_set_dynptr (dict, GF_XATTR_LOCKINFO_KEY, buf, len); - if (op_ret < 0) { - *op_errno = -op_ret; - op_ret = -1; - gf_log (this->name, GF_LOG_WARNING, "setting lockinfo value " - "(%lu) for fd (ptr:%p inode-gfid:%s) failed (%s)", - fdnum, fd, uuid_utoa (fd->inode->gfid), - strerror (*op_errno)); - goto out; + if (priv->brickname == NULL) { + ret = pl_lockinfo_get_brickname(this, inode, op_errno); + if (ret < 0) { + gf_log(this->name, GF_LOG_WARNING, "cannot get brickname"); + goto out; } + } - buf = NULL; + key = priv->brickname; out: - if (tmp != NULL) { - dict_unref (tmp); - } + return key; +} - if (buf != NULL) { - GF_FREE (buf); - } +int32_t +pl_fgetxattr_handle_lockinfo(xlator_t *this, fd_t *fd, dict_t *dict, + int32_t *op_errno) +{ + char *key = NULL, *buf = NULL; + int32_t op_ret = 0; + unsigned long fdnum = 0; + int32_t len = 0; + dict_t *tmp = NULL; + + pl_inode_t *pl_inode = pl_inode_get(this, fd->inode, NULL); + + if (!pl_inode) { + gf_log(this->name, GF_LOG_DEBUG, "Could not get inode."); + *op_errno = EBADFD; + op_ret = -1; + goto out; + } + + if (!pl_locks_by_fd(pl_inode, fd)) { + op_ret = 0; + goto out; + } + + fdnum = fd_to_fdnum(fd); + + key = pl_lockinfo_key(this, fd->inode, op_errno); + if (key == NULL) { + op_ret = -1; + goto out; + } + + tmp = dict_new(); + if (tmp == NULL) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; + } + + op_ret = dict_set_uint64(tmp, key, fdnum); + if (op_ret < 0) { + *op_errno = -op_ret; + op_ret = -1; + gf_log(this->name, GF_LOG_WARNING, + "setting lockinfo value " + "(%lu) for fd (ptr:%p inode-gfid:%s) failed (%s)", + fdnum, fd, uuid_utoa(fd->inode->gfid), strerror(*op_errno)); + goto out; + } + + op_ret = 
dict_allocate_and_serialize(tmp, (char **)&buf, + (unsigned int *)&len); + if (op_ret != 0) { + *op_errno = -op_ret; + op_ret = -1; + gf_log(this->name, GF_LOG_WARNING, + "dict_serialized_length failed (%s) while handling " + "lockinfo for fd (ptr:%p inode-gfid:%s)", + strerror(*op_errno), fd, uuid_utoa(fd->inode->gfid)); + goto out; + } + + op_ret = dict_set_dynptr(dict, GF_XATTR_LOCKINFO_KEY, buf, len); + if (op_ret < 0) { + *op_errno = -op_ret; + op_ret = -1; + gf_log(this->name, GF_LOG_WARNING, + "setting lockinfo value " + "(%lu) for fd (ptr:%p inode-gfid:%s) failed (%s)", + fdnum, fd, uuid_utoa(fd->inode->gfid), strerror(*op_errno)); + goto out; + } + + buf = NULL; +out: + if (tmp != NULL) { + dict_unref(tmp); + } - return op_ret; -} + if (buf != NULL) { + GF_FREE(buf); + } + return op_ret; +} int32_t -pl_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name, dict_t *xdata) +pl_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) { - int32_t op_ret = 0, op_errno = 0; - dict_t *dict = NULL; - - if (!name) { - goto usual; + int32_t op_ret = 0, op_errno = 0; + dict_t *dict = NULL; + + if (!name) { + goto usual; + } + + if (strcmp(name, GF_XATTR_LOCKINFO_KEY) == 0) { + dict = dict_new(); + if (dict == NULL) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; } - if (strcmp (name, GF_XATTR_LOCKINFO_KEY) == 0) { - dict = dict_new (); - if (dict == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } + op_ret = pl_fgetxattr_handle_lockinfo(this, fd, dict, &op_errno); + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "getting lockinfo on fd (ptr:%p inode-gfid:%s) " + "failed (%s)", + fd, uuid_utoa(fd->inode->gfid), strerror(op_errno)); + } - op_ret = pl_fgetxattr_handle_lockinfo (this, fd, dict, - &op_errno); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "getting lockinfo on fd (ptr:%p inode-gfid:%s) " - "failed (%s)", fd, uuid_utoa (fd->inode->gfid), - strerror (op_errno)); - } + goto unwind; + } else if (strncmp(name, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)) == + 0) { + op_ret = pl_getxattr_clrlk(this, name, fd->inode, &dict, &op_errno); - goto unwind; - } else { - goto usual; - } + goto unwind; + } else { + goto usual; + } unwind: - STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, NULL); - if (dict != NULL) { - dict_unref (dict); - } + STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, dict, NULL); + if (dict != NULL) { + dict_unref(dict); + } - return 0; + return 0; usual: - STACK_WIND (frame, default_fgetxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); - return 0; + STACK_WIND(frame, default_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); + return 0; } int32_t -pl_migrate_locks (call_frame_t *frame, fd_t *newfd, uint64_t oldfd_num, - int32_t *op_errno) +pl_migrate_locks(call_frame_t *frame, fd_t *newfd, uint64_t oldfd_num, + int32_t *op_errno) { - pl_inode_t *pl_inode = NULL; - uint64_t newfd_num = 0; - posix_lock_t *l = NULL; - int32_t op_ret = 0; - - newfd_num = fd_to_fdnum (newfd); - - pl_inode = pl_inode_get (frame->this, newfd->inode); - if (pl_inode == NULL) { - op_ret = -1; - *op_errno = EBADFD; - goto out; - } - - pthread_mutex_lock (&pl_inode->mutex); + posix_lock_t *l = NULL; + int32_t op_ret = 0; + uint64_t newfd_num = fd_to_fdnum(newfd); + + pl_inode_t *pl_inode = pl_inode_get(frame->this, newfd->inode, NULL); + if (pl_inode == NULL) { + op_ret = -1; + *op_errno = EBADFD; + goto 
out; + } + + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry(l, &pl_inode->ext_list, list) { - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (l->fd_num == oldfd_num) { - l->fd_num = newfd_num; - l->client = frame->root->client; - } - } + if (l->fd_num == oldfd_num) { + l->fd_num = newfd_num; + l->client = frame->root->client; + } } - pthread_mutex_unlock (&pl_inode->mutex); + } + pthread_mutex_unlock(&pl_inode->mutex); - op_ret = 0; + op_ret = 0; out: - return op_ret; + return op_ret; } int32_t -pl_fsetxattr_handle_lockinfo (call_frame_t *frame, fd_t *fd, char *lockinfo_buf, - int len, int32_t *op_errno) +pl_fsetxattr_handle_lockinfo(call_frame_t *frame, fd_t *fd, char *lockinfo_buf, + int len, int32_t *op_errno) { - int32_t op_ret = -1; - dict_t *lockinfo = NULL; - uint64_t oldfd_num = 0; - char *key = NULL; - - lockinfo = dict_new (); - if (lockinfo == NULL) { - op_ret = -1; - *op_errno = ENOMEM; - goto out; - } + int32_t op_ret = -1; + uint64_t oldfd_num = 0; + char *key = NULL; + + dict_t *lockinfo = dict_new(); + if (lockinfo == NULL) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; + } + + op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo); + if (op_ret < 0) { + *op_errno = -op_ret; + op_ret = -1; + goto out; + } + + key = pl_lockinfo_key(frame->this, fd->inode, op_errno); + if (key == NULL) { + op_ret = -1; + goto out; + } + + op_ret = dict_get_uint64(lockinfo, key, &oldfd_num); + + if (oldfd_num == 0) { + op_ret = 0; + goto out; + } + + op_ret = pl_migrate_locks(frame, fd, oldfd_num, op_errno); + if (op_ret < 0) { + gf_log(frame->this->name, GF_LOG_WARNING, + "migration of locks from oldfd (ptr:%p) to newfd " + "(ptr:%p) (inode-gfid:%s)", + (void *)(uintptr_t)oldfd_num, fd, uuid_utoa(fd->inode->gfid)); + goto out; + } - op_ret = dict_unserialize (lockinfo_buf, len, &lockinfo); - if (op_ret < 0) { - *op_errno = -op_ret; - op_ret = -1; - goto out; - } +out: + dict_unref(lockinfo); - key = pl_lockinfo_key (frame->this, fd->inode, op_errno); - if (key == NULL) { - op_ret = -1; - goto out; - } + return op_ret; +} - op_ret = dict_get_uint64 (lockinfo, key, &oldfd_num); +int32_t +pl_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; - if (oldfd_num == 0) { - op_ret = 0; - goto out; + local = frame->local; + if (local && local->update_mlock_enforced_flag && op_ret != -1) { + pl_inode = pl_inode_get(this, local->inode, NULL); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; } - op_ret = pl_migrate_locks (frame, fd, oldfd_num, op_errno); - if (op_ret < 0) { - gf_log (frame->this->name, GF_LOG_WARNING, - "migration of locks from oldfd (ptr:%p) to newfd " - "(ptr:%p) (inode-gfid:%s)", (void *)oldfd_num, fd, - uuid_utoa (fd->inode->gfid)); - goto out; + pthread_mutex_lock(&pl_inode->mutex); + { + pl_inode->mlock_enforced = _gf_true; + pl_inode->check_mlock_info = _gf_false; } + pthread_mutex_unlock(&pl_inode->mutex); + } -out: - dict_unref (lockinfo); - - return op_ret; +unwind: + PL_STACK_UNWIND_FOR_CLIENT(fsetxattr, xdata, frame, op_ret, op_errno, + xdata); + return 0; } int32_t -pl_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, - int32_t flags, dict_t *xdata) +pl_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) { - int32_t op_ret = 0, op_errno = 0; - void *lockinfo_buf = NULL; - int len = 0; + int32_t op_errno = 0; + void 
*lockinfo_buf = NULL; + int len = 0; + char *name = NULL; + posix_locks_private_t *priv = this->private; + + int32_t op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY, + &lockinfo_buf, &len); + if (lockinfo_buf == NULL) { + goto usual; + } + + op_ret = pl_fsetxattr_handle_lockinfo(frame, fd, lockinfo_buf, len, + &op_errno); + if (op_ret < 0) { + goto unwind; + } - op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, - &lockinfo_buf, &len); - if (lockinfo_buf == NULL) { - goto usual; - } +usual: + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); - op_ret = pl_fsetxattr_handle_lockinfo (frame, fd, lockinfo_buf, len, - &op_errno); - if (op_ret < 0) { - goto unwind; - } + PL_CHECK_LOCK_ENFORCE_KEY(frame, dict, name, this, ((loc_t *)NULL), fd, + priv); -usual: - STACK_WIND (frame, default_fsetxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); - return 0; + STACK_WIND(frame, pl_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, NULL); - return 0; + PL_STACK_UNWIND_FOR_CLIENT(fsetxattr, xdata, frame, op_ret, op_errno, NULL); + + return 0; } int32_t -pl_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, dict_t *xdata) +pl_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - pl_fdctx_t *fdctx = NULL; + pl_fdctx_t *fdctx = NULL; - if (op_ret < 0) - goto unwind; + if (op_ret < 0) + goto unwind; - fdctx = pl_check_n_create_fdctx (this, fd); - if (!fdctx) { - op_errno = ENOMEM; - op_ret = -1; - goto unwind; - } + fdctx = pl_check_n_create_fdctx(this, fd); + if (!fdctx) { + op_errno = ENOMEM; + op_ret = -1; + goto unwind; + } unwind: - STACK_UNWIND_STRICT (opendir, - frame, - op_ret, - op_errno, - fd, xdata); - return 0; + PL_STACK_UNWIND(opendir, xdata, frame, op_ret, op_errno, fd, xdata); + + return 0; } int32_t -pl_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd, dict_t *xdata) -{ - STACK_WIND (frame, - pl_opendir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->opendir, - loc, fd, xdata); - return 0; - +pl_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_opendir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, loc, fd, xdata); + return 0; } int -pl_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +pl_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) { - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata); + PL_STACK_UNWIND_FOR_CLIENT(flush, xdata, frame, op_ret, op_errno, xdata); - return 0; + return 0; } - int -pl_flush (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *xdata) +pl_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - pl_inode_t *pl_inode = NULL; - - pl_inode = pl_inode_get (this, fd->inode); - - if (!pl_inode) { - gf_log (this->name, GF_LOG_DEBUG, "Could not get inode."); - STACK_UNWIND_STRICT (flush, frame, -1, EBADFD, NULL); - return 0; - } - - pl_trace_flush (this, frame, fd); - - if (frame->root->lk_owner.len == 0) { - /* Handle special case when protocol/server sets lk-owner to zero. - * This usually happens due to a client disconnection. 
Hence, free - * all locks opened with this fd. - */ - gf_log (this->name, GF_LOG_TRACE, - "Releasing all locks with fd %p", fd); - delete_locks_of_fd (this, pl_inode, fd); - goto wind; - - } - pthread_mutex_lock (&pl_inode->mutex); - { - __delete_locks_of_owner (pl_inode, frame->root->client, - &frame->root->lk_owner); + pl_inode_t *pl_inode = pl_inode_get(this, fd->inode, NULL); + if (!pl_inode) { + gf_log(this->name, GF_LOG_DEBUG, "Could not get inode."); + STACK_UNWIND_STRICT(flush, frame, -1, EBADFD, NULL); + return 0; + } + + pthread_mutex_lock(&pl_inode->mutex); + { + if (pl_inode->migrated) { + pthread_mutex_unlock(&pl_inode->mutex); + STACK_UNWIND_STRICT(flush, frame, -1, EREMOTE, NULL); + return 0; } - pthread_mutex_unlock (&pl_inode->mutex); - - grant_blocked_locks (this, pl_inode); - - do_blocked_rw (pl_inode); + } + pthread_mutex_unlock(&pl_inode->mutex); + + pl_trace_flush(this, frame, fd); + + if (frame->root->lk_owner.len == 0) { + /* Handle special case when protocol/server sets lk-owner to zero. + * This usually happens due to a client disconnection. Hence, free + * all locks opened with this fd. + */ + gf_log(this->name, GF_LOG_TRACE, "Releasing all locks with fd %p", fd); + delete_locks_of_fd(this, pl_inode, fd); + goto wind; + } + pthread_mutex_lock(&pl_inode->mutex); + { + __delete_locks_of_owner(pl_inode, frame->root->client, + &frame->root->lk_owner); + } + pthread_mutex_unlock(&pl_inode->mutex); + + grant_blocked_locks(this, pl_inode); + + do_blocked_rw(pl_inode); wind: - STACK_WIND (frame, pl_flush_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, fd, xdata); - return 0; + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; } - int -pl_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +pl_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) { - pl_fdctx_t *fdctx = NULL; + pl_fdctx_t *fdctx = NULL; - if (op_ret < 0) - goto unwind; + if (op_ret < 0) + goto unwind; - fdctx = pl_check_n_create_fdctx (this, fd); - if (!fdctx) { - op_errno = ENOMEM; - op_ret = -1; - goto unwind; - } + fdctx = pl_check_n_create_fdctx(this, fd); + if (!fdctx) { + op_errno = ENOMEM; + op_ret = -1; + goto unwind; + } unwind: - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata); - return 0; + return 0; } - int -pl_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, dict_t *xdata) +pl_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { - STACK_WIND (frame, pl_open_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->open, - loc, flags, fd, xdata); + int op_ret = -1; + int op_errno = EINVAL; + pl_inode_t *pl_inode = NULL; + posix_lock_t *l = NULL; + posix_locks_private_t *priv = this->private; + + GF_VALIDATE_OR_GOTO("locks", this, unwind); + + op_ret = 0, op_errno = 0; + pl_inode = pl_inode_get(this, fd->inode, NULL); + if (!pl_inode) { + gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, "Could not get inode"); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + /* As per design, under forced and file-based mandatory locking modes + * it doesn't matter whether inodes's lock list contain advisory or + * mandatory type locks. 
So we just check whether inode's lock list is + * empty or not to make sure that no locks are being held for the file. + * Whereas under optimal mandatory locking mode, we strictly fail open + * if and only if lock list contain mandatory locks. + */ + if (((priv->mandatory_mode == MLK_FILE_BASED) && pl_inode->mandatory) || + priv->mandatory_mode == MLK_FORCED) { + if (fd->flags & O_TRUNC) { + pthread_mutex_lock(&pl_inode->mutex); + { + if (!list_empty(&pl_inode->ext_list)) { + op_ret = -1; + op_errno = EAGAIN; + } + } + pthread_mutex_unlock(&pl_inode->mutex); + } + } else if (priv->mandatory_mode == MLK_OPTIMAL) { + if (fd->flags & O_TRUNC) { + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry(l, &pl_inode->ext_list, list) + { + if ((l->lk_flags & GF_LK_MANDATORY)) { + op_ret = -1; + op_errno = EAGAIN; + break; + } + } + } + pthread_mutex_unlock(&pl_inode->mutex); + } + } - return 0; +unwind: + if (op_ret == -1) + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, NULL, NULL); + else + STACK_WIND(frame, pl_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; } - int -pl_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd, inode_t *inode, struct iatt *buf, - struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +pl_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - pl_fdctx_t *fdctx = NULL; + pl_fdctx_t *fdctx = NULL; - if (op_ret < 0) - goto unwind; + if (op_ret < 0) + goto unwind; - fdctx = pl_check_n_create_fdctx (this, fd); - if (!fdctx) { - op_errno = ENOMEM; - op_ret = -1; - goto unwind; - } + fdctx = pl_check_n_create_fdctx(this, fd); + if (!fdctx) { + op_errno = ENOMEM; + op_ret = -1; + goto unwind; + } unwind: - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent, xdata); + PL_STACK_UNWIND(create, xdata, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); - return 0; + return 0; } - int -pl_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, - dict_t *xdata) +pl_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - STACK_WIND (frame, pl_create_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->create, - loc, flags, mode, umask, fd, xdata); - return 0; -} + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; +} int -pl_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *stbuf, - struct iobref *iobref, dict_t *xdata) +pl_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, - vector, count, stbuf, iobref, xdata); + pl_track_io_fop_count(frame->local, this, DECREMENT); - return 0; + PL_STACK_UNWIND(readv, xdata, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); + + return 0; } int -pl_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t 
op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +pl_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + pl_track_io_fop_count(frame->local, this, DECREMENT); - return 0; -} + PL_STACK_UNWIND(writev, xdata, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} void -do_blocked_rw (pl_inode_t *pl_inode) +do_blocked_rw(pl_inode_t *pl_inode) { - struct list_head wind_list; - pl_rw_req_t *rw = NULL; - pl_rw_req_t *tmp = NULL; + struct list_head wind_list; + pl_rw_req_t *rw = NULL; + pl_rw_req_t *tmp = NULL; - INIT_LIST_HEAD (&wind_list); + INIT_LIST_HEAD(&wind_list); - pthread_mutex_lock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(rw, tmp, &pl_inode->rw_list, list) { - list_for_each_entry_safe (rw, tmp, &pl_inode->rw_list, list) { - if (__rw_allowable (pl_inode, &rw->region, - rw->stub->fop)) { - list_del_init (&rw->list); - list_add_tail (&rw->list, &wind_list); - } + if (__rw_allowable(pl_inode, &rw->region, rw->stub->fop)) { + list_del_init(&rw->list); + list_add_tail(&rw->list, &wind_list); + if (pl_inode->mlock_enforced && + pl_inode->track_fop_wind_count) { + pl_inode->fop_wind_count++; } + } } - pthread_mutex_unlock (&pl_inode->mutex); + } + pthread_mutex_unlock(&pl_inode->mutex); - list_for_each_entry_safe (rw, tmp, &wind_list, list) { - list_del_init (&rw->list); - call_resume (rw->stub); - GF_FREE (rw); - } + list_for_each_entry_safe(rw, tmp, &wind_list, list) + { + list_del_init(&rw->list); + call_resume(rw->stub); + GF_FREE(rw); + } - return; + return; } +/* when mandatory lock is enforced: + If an IO request comes on a region which is out of the boundary of the + granted mandatory lock, it will be rejected. + + Note: There is no IO blocking with mandatory lock enforced as it may be + a stale data from an old client. + */ +gf_boolean_t static within_range(posix_lock_t *existing, posix_lock_t *new) +{ + if (existing->fl_start <= new->fl_start && existing->fl_end >= new->fl_end) + return _gf_true; + + return _gf_false; +} static int -__rw_allowable (pl_inode_t *pl_inode, posix_lock_t *region, - glusterfs_fop_t op) +__rw_allowable(pl_inode_t *pl_inode, posix_lock_t *region, glusterfs_fop_t op) { - posix_lock_t *l = NULL; - int ret = 1; + posix_lock_t *l = NULL; + posix_locks_private_t *priv = THIS->private; + int ret = 1; - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (locks_overlap (l, region) && !same_owner (l, region)) { - if ((op == GF_FOP_READ) && (l->fl_type != F_WRLCK)) - continue; - ret = 0; - break; + if (pl_inode->mlock_enforced) { + list_for_each_entry(l, &pl_inode->ext_list, list) + { + /* + with lock enforced (fencing) there should not be any blocking + lock coexisting. + */ + if (same_owner(l, region)) { + /* Should range check be strict for same owner with fencing? */ + if (locks_overlap(l, region)) { + if (within_range(l, region)) { + return 1; + } else { + /* + Should we allow read fop if it does not fit it in the + range? + if (op == GF_FOP_READ && l->fl_type != F_WRLCK) { + return 1; + } + */ + return 0; + } + } + } else { + if (locks_overlap(l, region)) { + /* + with fencing should a read from a different owner be + allowed if the mandatory lock taken is F_RDLCK? 
+ if (op == GF_FOP_READ && l->fl_type != F_WRLCK) { + return 1; + } + */ + return 0; } + } } - return ret; -} + /* No lock has been taken by this owner */ + return 0; + } + + list_for_each_entry(l, &pl_inode->ext_list, list) + { + if (!l->blocked && locks_overlap(l, region) && !same_owner(l, region)) { + if ((op == GF_FOP_READ) && (l->fl_type != F_WRLCK)) + continue; + /* Check for mandatory lock under optimal + * mandatory-locking mode */ + if (priv->mandatory_mode == MLK_OPTIMAL && + !(l->lk_flags & GF_LK_MANDATORY)) + continue; + ret = 0; + break; + } + } + return ret; +} int -pl_readv_cont (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) +pl_readv_cont(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - STACK_WIND (frame, pl_readv_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv, - fd, size, offset, flags, xdata); + pl_track_io_fop_count(frame->local, this, INCREMENT); - return 0; -} + STACK_WIND(frame, pl_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + return 0; +} int -pl_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) -{ - posix_locks_private_t *priv = NULL; - pl_inode_t *pl_inode = NULL; - pl_rw_req_t *rw = NULL; - posix_lock_t region = {.list = {0, }, }; - int op_ret = 0; - int op_errno = 0; - char wind_needed = 1; - - - priv = this->private; - pl_inode = pl_inode_get (this, fd->inode); - - if (priv->mandatory && pl_inode->mandatory) { - region.fl_start = offset; - region.fl_end = offset + size - 1; - region.client = frame->root->client; - region.fd_num = fd_to_fdnum(fd); - region.client_pid = frame->root->pid; - region.owner = frame->root->lk_owner; - - pthread_mutex_lock (&pl_inode->mutex); - { - wind_needed = __rw_allowable (pl_inode, ®ion, - GF_FOP_READ); - if (wind_needed) { - goto unlock; - } - - if (fd->flags & O_NONBLOCK) { - gf_log (this->name, GF_LOG_TRACE, - "returning EAGAIN as fd is O_NONBLOCK"); - op_errno = EAGAIN; - op_ret = -1; - goto unlock; - } - - rw = GF_CALLOC (1, sizeof (*rw), - gf_locks_mt_pl_rw_req_t); - if (!rw) { - op_errno = ENOMEM; - op_ret = -1; - goto unlock; - } - - rw->stub = fop_readv_stub (frame, pl_readv_cont, - fd, size, offset, flags, - xdata); - if (!rw->stub) { - op_errno = ENOMEM; - op_ret = -1; - GF_FREE (rw); - goto unlock; - } +pl_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; + pl_rw_req_t *rw = NULL; + posix_lock_t region = { + .list = + { + 0, + }, + }; + gf_boolean_t enabled = _gf_false; + gf_boolean_t can_block = _gf_true; + int op_ret = 0; + int op_errno = 0; + int allowed = 1; + + GF_VALIDATE_OR_GOTO("locks", this, unwind); + + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + + if (!frame->local) { + frame->local = mem_get0(this->local_pool); + local = frame->local; + local->inode = inode_ref(fd->inode); + local->fd = fd_ref(fd); + } + + pl_inode = pl_inode_get(this, fd->inode, local); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + if (frame->root->pid < 0) + enabled = _gf_false; + else + enabled = pl_is_mandatory_locking_enabled(pl_inode); + + if (enabled) { + region.fl_start = offset; + region.fl_end = offset + size - 1; + region.client = frame->root->client; + region.fd_num = fd_to_fdnum(fd); + region.client_pid = 
frame->root->pid; + region.owner = frame->root->lk_owner; + + pthread_mutex_lock(&pl_inode->mutex); + { + allowed = pl_is_fop_allowed(pl_inode, ®ion, fd, GF_FOP_READ, + &can_block); + if (allowed == 1) { + if (pl_inode->mlock_enforced && + pl_inode->track_fop_wind_count) { + pl_inode->fop_wind_count++; + } + goto unlock; + } else if (!can_block) { + op_errno = EAGAIN; + op_ret = -1; + goto unlock; + } - rw->region = region; + rw = GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t); + if (!rw) { + op_errno = ENOMEM; + op_ret = -1; + goto unlock; + } - list_add_tail (&rw->list, &pl_inode->rw_list); - } - unlock: - pthread_mutex_unlock (&pl_inode->mutex); - } + rw->stub = fop_readv_stub(frame, pl_readv_cont, fd, size, offset, + flags, xdata); + if (!rw->stub) { + op_errno = ENOMEM; + op_ret = -1; + GF_FREE(rw); + goto unlock; + } + rw->region = region; - if (wind_needed) { - STACK_WIND (frame, pl_readv_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv, - fd, size, offset, flags, xdata); + list_add_tail(&rw->list, &pl_inode->rw_list); } + unlock: + pthread_mutex_unlock(&pl_inode->mutex); + } + + if (allowed == 1) { + STACK_WIND(frame, pl_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); + } +unwind: + if (op_ret == -1) + PL_STACK_UNWIND(readv, xdata, frame, op_ret, op_errno, NULL, 0, NULL, + NULL, NULL); - if (op_ret == -1) - STACK_UNWIND_STRICT (readv, frame, -1, op_errno, - NULL, 0, NULL, NULL, NULL); - - return 0; + return 0; } - int -pl_writev_cont (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int count, off_t offset, - uint32_t flags, struct iobref *iobref, dict_t *xdata) +pl_writev_cont(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) { - STACK_WIND (frame, pl_writev_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev, - fd, vector, count, offset, flags, iobref, xdata); + pl_track_io_fop_count(frame->local, this, INCREMENT); - return 0; -} + STACK_WIND(frame, pl_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; +} int -pl_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - uint32_t flags, struct iobref *iobref, dict_t *xdata) -{ - posix_locks_private_t *priv = NULL; - pl_inode_t *pl_inode = NULL; - pl_rw_req_t *rw = NULL; - posix_lock_t region = {.list = {0, }, }; - int op_ret = 0; - int op_errno = 0; - char wind_needed = 1; - - priv = this->private; - pl_inode = pl_inode_get (this, fd->inode); - - if (priv->mandatory && pl_inode->mandatory) { - region.fl_start = offset; - region.fl_end = offset + iov_length (vector, count) - 1; - region.client = frame->root->client; - region.fd_num = fd_to_fdnum(fd); - region.client_pid = frame->root->pid; - region.owner = frame->root->lk_owner; - - pthread_mutex_lock (&pl_inode->mutex); - { - wind_needed = __rw_allowable (pl_inode, ®ion, - GF_FOP_WRITE); - if (wind_needed) - goto unlock; - - if (fd->flags & O_NONBLOCK) { - gf_log (this->name, GF_LOG_TRACE, - "returning EAGAIN because fd is " - "O_NONBLOCK"); - op_errno = EAGAIN; - op_ret = -1; - goto unlock; - } - - rw = GF_CALLOC (1, sizeof (*rw), - gf_locks_mt_pl_rw_req_t); - if (!rw) { - op_errno = ENOMEM; - op_ret = -1; - goto unlock; - } - - rw->stub = fop_writev_stub (frame, pl_writev_cont, - fd, vector, count, offset, - flags, iobref, xdata); - if (!rw->stub) { - op_errno 
= ENOMEM; - op_ret = -1; - GF_FREE (rw); - goto unlock; - } +pl_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; + pl_rw_req_t *rw = NULL; + posix_lock_t region = { + .list = + { + 0, + }, + }; + gf_boolean_t enabled = _gf_false; + gf_boolean_t can_block = _gf_true; + int op_ret = 0; + int op_errno = 0; + int allowed = 1; + + GF_VALIDATE_OR_GOTO("locks", this, unwind); + + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + + if (!frame->local) { + frame->local = mem_get0(this->local_pool); + local = frame->local; + local->inode = inode_ref(fd->inode); + local->fd = fd_ref(fd); + } + + pl_inode = pl_inode_get(this, fd->inode, local); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + if (frame->root->pid < 0) + enabled = _gf_false; + else + enabled = pl_is_mandatory_locking_enabled(pl_inode); + + if (enabled) { + region.fl_start = offset; + region.fl_end = offset + iov_length(vector, count) - 1; + region.client = frame->root->client; + region.fd_num = fd_to_fdnum(fd); + region.client_pid = frame->root->pid; + region.owner = frame->root->lk_owner; + + pthread_mutex_lock(&pl_inode->mutex); + { + allowed = pl_is_fop_allowed(pl_inode, ®ion, fd, GF_FOP_WRITE, + &can_block); + if (allowed == 1) { + if (pl_inode->mlock_enforced && + pl_inode->track_fop_wind_count) { + pl_inode->fop_wind_count++; + } + goto unlock; + } else if (!can_block) { + if (pl_inode->mlock_enforced) { + op_errno = EBUSY; + } else { + op_errno = EAGAIN; + } - rw->region = region; + op_ret = -1; + goto unlock; + } - list_add_tail (&rw->list, &pl_inode->rw_list); - } - unlock: - pthread_mutex_unlock (&pl_inode->mutex); - } + rw = GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t); + if (!rw) { + op_errno = ENOMEM; + op_ret = -1; + goto unlock; + } + rw->stub = fop_writev_stub(frame, pl_writev_cont, fd, vector, count, + offset, flags, iobref, xdata); + if (!rw->stub) { + op_errno = ENOMEM; + op_ret = -1; + GF_FREE(rw); + goto unlock; + } - if (wind_needed) - STACK_WIND (frame, pl_writev_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev, - fd, vector, count, offset, flags, iobref, xdata); + rw->region = region; - if (op_ret == -1) - STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL, - NULL); + list_add_tail(&rw->list, &pl_inode->rw_list); + } + unlock: + pthread_mutex_unlock(&pl_inode->mutex); + } + + if (allowed == 1) { + STACK_WIND(frame, pl_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + } +unwind: + if (op_ret == -1) + PL_STACK_UNWIND(writev, xdata, frame, op_ret, op_errno, NULL, NULL, + NULL); - return 0; + return 0; } static int -__fd_has_locks (pl_inode_t *pl_inode, fd_t *fd) +__fd_has_locks(pl_inode_t *pl_inode, fd_t *fd) { - int found = 0; - posix_lock_t *l = NULL; + posix_lock_t *l = NULL; - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (l->fd_num == fd_to_fdnum(fd)) { - found = 1; - break; - } + list_for_each_entry(l, &pl_inode->ext_list, list) + { + if (l->fd_num == fd_to_fdnum(fd)) { + return 1; } + } - return found; + return 0; } static posix_lock_t * -lock_dup (posix_lock_t *lock) +lock_dup(posix_lock_t *lock) { - posix_lock_t *new_lock = NULL; - - new_lock = new_posix_lock (&lock->user_flock, lock->client, - lock->client_pid, &lock->owner, - (fd_t *)lock->fd_num); - return new_lock; + int32_t op_errno = 
0; + return new_posix_lock(&lock->user_flock, lock->client, lock->client_pid, + &lock->owner, (fd_t *)lock->fd_num, lock->lk_flags, + lock->blocking, &op_errno); } static int -__dup_locks_to_fdctx (pl_inode_t *pl_inode, fd_t *fd, - pl_fdctx_t *fdctx) -{ - posix_lock_t *l = NULL; - posix_lock_t *duplock = NULL; - int ret = 0; - - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (l->fd_num == fd_to_fdnum(fd)) { - duplock = lock_dup (l); - if (!duplock) { - ret = -1; - break; - } +__dup_locks_to_fdctx(pl_inode_t *pl_inode, fd_t *fd, pl_fdctx_t *fdctx) +{ + posix_lock_t *l = NULL; + posix_lock_t *duplock = NULL; + int ret = 0; + + list_for_each_entry(l, &pl_inode->ext_list, list) + { + if (l->fd_num == fd_to_fdnum(fd)) { + duplock = lock_dup(l); + if (!duplock) { + ret = -1; + break; + } - list_add_tail (&duplock->list, &fdctx->locks_list); - } + list_add_tail(&duplock->list, &fdctx->locks_list); } + } - return ret; + return ret; } static int -__copy_locks_to_fdctx (pl_inode_t *pl_inode, fd_t *fd, - pl_fdctx_t *fdctx) +__copy_locks_to_fdctx(pl_inode_t *pl_inode, fd_t *fd, pl_fdctx_t *fdctx) { - int ret = 0; - - ret = __dup_locks_to_fdctx (pl_inode, fd, fdctx); - if (ret) - goto out; - -out: - return ret; - + return __dup_locks_to_fdctx(pl_inode, fd, fdctx); } static void -pl_mark_eol_lock (posix_lock_t *lock) +pl_mark_eol_lock(posix_lock_t *lock) { - lock->user_flock.l_type = GF_LK_EOL; - return; + lock->user_flock.l_type = GF_LK_EOL; + return; } static posix_lock_t * -__get_next_fdctx_lock (pl_fdctx_t *fdctx) +__get_next_fdctx_lock(pl_fdctx_t *fdctx) { - posix_lock_t *lock = NULL; + posix_lock_t *lock = NULL; - GF_ASSERT (fdctx); + GF_ASSERT(fdctx); - if (list_empty (&fdctx->locks_list)) { - gf_log (THIS->name, GF_LOG_DEBUG, - "fdctx lock list empty"); - goto out; - } + if (list_empty(&fdctx->locks_list)) { + gf_log(THIS->name, GF_LOG_DEBUG, "fdctx lock list empty"); + goto out; + } - lock = list_entry (fdctx->locks_list.next, typeof (*lock), - list); + lock = list_entry(fdctx->locks_list.next, typeof(*lock), list); - GF_ASSERT (lock); + GF_ASSERT(lock); - list_del_init (&lock->list); + list_del_init(&lock->list); out: - return lock; + return lock; } static int -__set_next_lock_fd (pl_fdctx_t *fdctx, posix_lock_t *reqlock) +__set_next_lock_fd(pl_fdctx_t *fdctx, posix_lock_t *reqlock) { - posix_lock_t *lock = NULL; - int ret = 0; + posix_lock_t *lock = NULL; + int ret = 0; - GF_ASSERT (fdctx); + GF_ASSERT(fdctx); - lock = __get_next_fdctx_lock (fdctx); - if (!lock) { - gf_log (THIS->name, GF_LOG_DEBUG, - "marking EOL in reqlock"); - pl_mark_eol_lock (reqlock); - goto out; - } + lock = __get_next_fdctx_lock(fdctx); + if (!lock) { + gf_log(THIS->name, GF_LOG_DEBUG, "marking EOL in reqlock"); + pl_mark_eol_lock(reqlock); + goto out; + } - reqlock->user_flock = lock->user_flock; - reqlock->fl_start = lock->fl_start; - reqlock->fl_type = lock->fl_type; - reqlock->fl_end = lock->fl_end; - reqlock->owner = lock->owner; + reqlock->user_flock = lock->user_flock; + reqlock->fl_start = lock->fl_start; + reqlock->fl_type = lock->fl_type; + reqlock->fl_end = lock->fl_end; + reqlock->owner = lock->owner; out: - if (lock) - __destroy_lock (lock); + if (lock) + __destroy_lock(lock); - return ret; + return ret; } static int -pl_getlk_fd (xlator_t *this, pl_inode_t *pl_inode, - fd_t *fd, posix_lock_t *reqlock) +pl_getlk_fd(xlator_t *this, pl_inode_t *pl_inode, fd_t *fd, + posix_lock_t *reqlock) { - uint64_t tmp = 0; - pl_fdctx_t *fdctx = NULL; - int ret = 0; + uint64_t tmp = 0; + pl_fdctx_t *fdctx = 
NULL; + int ret = 0; + + pthread_mutex_lock(&pl_inode->mutex); + { + if (!__fd_has_locks(pl_inode, fd)) { + pthread_mutex_unlock(&pl_inode->mutex); + gf_log(this->name, GF_LOG_DEBUG, "fd=%p has no active locks", fd); + ret = 0; + goto out; + } - pthread_mutex_lock (&pl_inode->mutex); - { - if (!__fd_has_locks (pl_inode, fd)) { - gf_log (this->name, GF_LOG_DEBUG, - "fd=%p has no active locks", fd); - ret = 0; - goto unlock; - } + gf_log(this->name, GF_LOG_DEBUG, "There are active locks on fd"); - gf_log (this->name, GF_LOG_DEBUG, - "There are active locks on fd"); + ret = fd_ctx_get(fd, this, &tmp); + fdctx = (pl_fdctx_t *)(long)tmp; - ret = fd_ctx_get (fd, this, &tmp); - fdctx = (pl_fdctx_t *)(long) tmp; + if (list_empty(&fdctx->locks_list)) { + gf_log(this->name, GF_LOG_TRACE, + "no fdctx -> copying all locks on fd"); - if (list_empty (&fdctx->locks_list)) { - gf_log (this->name, GF_LOG_TRACE, - "no fdctx -> copying all locks on fd"); + ret = __copy_locks_to_fdctx(pl_inode, fd, fdctx); + if (ret) { + goto unlock; + } - ret = __copy_locks_to_fdctx (pl_inode, fd, fdctx); - if (ret) { - goto unlock; - } + ret = __set_next_lock_fd(fdctx, reqlock); - ret = __set_next_lock_fd (fdctx, reqlock); - - } else { - gf_log (this->name, GF_LOG_TRACE, - "fdctx present -> returning the next lock"); - ret = __set_next_lock_fd (fdctx, reqlock); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "could not get next lock of fd"); - goto unlock; - } - } + } else { + gf_log(this->name, GF_LOG_TRACE, + "fdctx present -> returning the next lock"); + ret = __set_next_lock_fd(fdctx, reqlock); + if (ret) { + pthread_mutex_unlock(&pl_inode->mutex); + gf_log(this->name, GF_LOG_DEBUG, + "could not get next lock of fd"); + goto out; + } } + } unlock: - pthread_mutex_unlock (&pl_inode->mutex); - return ret; - + pthread_mutex_unlock(&pl_inode->mutex); +out: + return ret; } int -pl_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) -{ - pl_inode_t *pl_inode = NULL; - int op_ret = 0; - int op_errno = 0; - int can_block = 0; - posix_lock_t *reqlock = NULL; - posix_lock_t *conf = NULL; - int ret = 0; - - if ((flock->l_start < 0) || (flock->l_len < 0)) { - op_ret = -1; - op_errno = EINVAL; - goto unwind; - } +pl_metalock_is_active(pl_inode_t *pl_inode) +{ + if (list_empty(&pl_inode->metalk_list)) + return 0; + else + return 1; +} - pl_inode = pl_inode_get (this, fd->inode); - if (!pl_inode) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } +void +__pl_queue_lock(pl_inode_t *pl_inode, posix_lock_t *reqlock) +{ + list_add_tail(&reqlock->list, &pl_inode->queued_locks); +} - reqlock = new_posix_lock (flock, frame->root->client, frame->root->pid, - &frame->root->lk_owner, fd); +int +pl_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + pl_inode_t *pl_inode = NULL; + int op_ret = 0; + int op_errno = 0; + int can_block = 0; + posix_lock_t *reqlock = NULL; + posix_lock_t *conf = NULL; + uint32_t lk_flags = 0; + posix_locks_private_t *priv = this->private; + pl_local_t *local = NULL; + short lock_type = 0; + + int ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_flags); + if (ret == 0) { + if (priv->mandatory_mode == MLK_NONE) + gf_log(this->name, GF_LOG_DEBUG, + "Lock flags received " + "in a non-mandatory locking environment, " + "continuing"); + else + gf_log(this->name, GF_LOG_DEBUG, + "Lock flags received, " + "continuing"); + } + + if ((flock->l_start < 0) || ((flock->l_start + flock->l_len) < 0)) { + op_ret = -1; 
+ op_errno = EINVAL; + goto unwind; + } + + /* As per 'man 3 fcntl', the value of l_len may be + * negative. In such cases, lock request should be + * considered for the range starting at 'l_start+l_len' + * and ending at 'l_start-1'. Update the fields accordingly. + */ + if (flock->l_len < 0) { + flock->l_start += flock->l_len; + flock->l_len = labs(flock->l_len); + } + + local = mem_get0(this->local_pool); + if (!local) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } else { + frame->local = local; + local->fd = fd_ref(fd); + } - if (!reqlock) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } + pl_inode = pl_inode_get(this, fd->inode, local); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + reqlock = new_posix_lock(flock, frame->root->client, frame->root->pid, + &frame->root->lk_owner, fd, lk_flags, can_block, + &op_errno); - pl_trace_in (this, frame, fd, NULL, cmd, flock, NULL); + if (!reqlock) { + op_ret = -1; + goto unwind; + } - switch (cmd) { + pl_trace_in(this, frame, fd, NULL, cmd, flock, NULL); + switch (cmd) { case F_RESLK_LCKW: - can_block = 1; + can_block = 1; - /* fall through */ + /* fall through */ case F_RESLK_LCK: - memcpy (&reqlock->user_flock, flock, sizeof (struct gf_flock)); - reqlock->frame = frame; - reqlock->this = this; + reqlock->frame = frame; + reqlock->this = this; - ret = pl_reserve_setlk (this, pl_inode, reqlock, - can_block); - if (ret < 0) { - if (can_block) - goto out; + ret = pl_reserve_setlk(this, pl_inode, reqlock, can_block); + if (ret < 0) { + if (can_block) + goto out; - op_ret = -1; - op_errno = -ret; - __destroy_lock (reqlock); - goto unwind; - } - /* Finally a getlk and return the call */ - conf = pl_getlk (pl_inode, reqlock); - if (conf) - posix_lock_to_flock (conf, flock); - break; + op_ret = -1; + op_errno = -ret; + __destroy_lock(reqlock); + goto unwind; + } + /* Finally a getlk and return the call */ + conf = pl_getlk(pl_inode, reqlock); + if (conf) + posix_lock_to_flock(conf, flock); + break; case F_RESLK_UNLCK: - reqlock->frame = frame; - reqlock->this = this; - ret = pl_reserve_unlock (this, pl_inode, reqlock); - if (ret < 0) { - op_ret = -1; - op_errno = -ret; - } - __destroy_lock (reqlock); - goto unwind; + reqlock->frame = frame; + reqlock->this = this; + ret = pl_reserve_unlock(this, pl_inode, reqlock); + if (ret < 0) { + op_ret = -1; + op_errno = -ret; + } + __destroy_lock(reqlock); + goto unwind; - break; + break; case F_GETLK_FD: - reqlock->frame = frame; - reqlock->this = this; - ret = pl_verify_reservelk (this, pl_inode, reqlock, can_block); - GF_ASSERT (ret >= 0); - - ret = pl_getlk_fd (this, pl_inode, fd, reqlock); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "getting locks on fd failed"); - op_ret = -1; - op_errno = ENOLCK; - goto unwind; - } + reqlock->frame = frame; + reqlock->this = this; + ret = pl_verify_reservelk(this, pl_inode, reqlock, can_block); + GF_ASSERT(ret >= 0); + + ret = pl_getlk_fd(this, pl_inode, fd, reqlock); + if (ret < 0) { + gf_log(this->name, GF_LOG_DEBUG, "getting locks on fd failed"); + op_ret = -1; + op_errno = ENOLCK; + goto unwind; + } - gf_log (this->name, GF_LOG_TRACE, - "Replying with a lock on fd for healing"); + gf_log(this->name, GF_LOG_TRACE, + "Replying with a lock on fd for healing"); - posix_lock_to_flock (reqlock, flock); - __destroy_lock (reqlock); + posix_lock_to_flock(reqlock, flock); + __destroy_lock(reqlock); - break; + break; #if F_GETLK != F_GETLK64 case F_GETLK64: #endif case F_GETLK: - conf = pl_getlk (pl_inode, reqlock); - 
posix_lock_to_flock (conf, flock); - __destroy_lock (reqlock); + conf = pl_getlk(pl_inode, reqlock); + posix_lock_to_flock(conf, flock); + __destroy_lock(reqlock); - break; + break; #if F_SETLKW != F_SETLKW64 case F_SETLKW64: #endif case F_SETLKW: - can_block = 1; - reqlock->frame = frame; - reqlock->this = this; - - /* fall through */ + can_block = 1; + reqlock->frame = frame; + reqlock->this = this; + reqlock->blocking = can_block; + /* fall through */ #if F_SETLK != F_SETLK64 case F_SETLK64: #endif case F_SETLK: - memcpy (&reqlock->user_flock, flock, sizeof (struct gf_flock)); - ret = pl_verify_reservelk (this, pl_inode, reqlock, can_block); - if (ret < 0) { - gf_log (this->name, GF_LOG_TRACE, - "Lock blocked due to conflicting reserve lock"); - goto out; + reqlock->frame = frame; + reqlock->this = this; + lock_type = flock->l_type; + + pthread_mutex_lock(&pl_inode->mutex); + { + if (pl_inode->migrated) { + op_errno = EREMOTE; + pthread_mutex_unlock(&pl_inode->mutex); + STACK_UNWIND_STRICT(lk, frame, -1, op_errno, flock, xdata); + + __destroy_lock(reqlock); + goto out; } - ret = pl_setlk (this, pl_inode, reqlock, - can_block); + } + pthread_mutex_unlock(&pl_inode->mutex); + + ret = pl_verify_reservelk(this, pl_inode, reqlock, can_block); + if (ret < 0) { + gf_log(this->name, GF_LOG_TRACE, + "Lock blocked due to conflicting reserve lock"); + goto out; + } + if (reqlock->fl_type != F_UNLCK && pl_inode->mlock_enforced) { + ret = pl_lock_preempt(pl_inode, reqlock); if (ret == -1) { - if ((can_block) && (F_UNLCK != flock->l_type)) { - pl_trace_block (this, frame, fd, NULL, cmd, flock, NULL); - goto out; - } - gf_log (this->name, GF_LOG_DEBUG, "returning EAGAIN"); - op_ret = -1; - op_errno = EAGAIN; - __destroy_lock (reqlock); - - } else if ((0 == ret) && (F_UNLCK == flock->l_type)) { - /* For NLM's last "unlock on fd" detection */ - if (pl_locks_by_fd (pl_inode, fd)) - flock->l_type = F_RDLCK; - else - flock->l_type = F_UNLCK; + gf_log(this->name, GF_LOG_ERROR, "lock preempt failed"); + op_ret = -1; + op_errno = EAGAIN; + __destroy_lock(reqlock); + goto out; } - } -unwind: - pl_trace_out (this, frame, fd, NULL, cmd, flock, op_ret, op_errno, NULL); - pl_update_refkeeper (this, fd->inode); + pl_trace_block(this, frame, fd, NULL, cmd, flock, NULL); + goto unwind; + } + ret = pl_setlk(this, pl_inode, reqlock, can_block); + if (ret == -1) { + if ((can_block) && (F_UNLCK != lock_type)) { + goto out; + } + gf_log(this->name, GF_LOG_DEBUG, "returning EAGAIN"); + op_ret = -1; + op_errno = EAGAIN; + __destroy_lock(reqlock); + } else if (ret == -2) { + goto out; + } else if ((0 == ret) && (F_UNLCK == flock->l_type)) { + /* For NLM's last "unlock on fd" detection */ + if (pl_locks_by_fd(pl_inode, fd)) + flock->l_type = F_RDLCK; + else + flock->l_type = F_UNLCK; + } + } - STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, flock, xdata); +unwind: + pl_trace_out(this, frame, fd, NULL, cmd, flock, op_ret, op_errno, NULL); + pl_update_refkeeper(this, fd->inode); + + PL_STACK_UNWIND(lk, xdata, frame, op_ret, op_errno, flock, xdata); out: - return 0; + return 0; } - /* TODO: this function just logs, no action required?? 
*/ int -pl_forget (xlator_t *this, - inode_t *inode) +pl_forget(xlator_t *this, inode_t *inode) { - pl_inode_t *pl_inode = NULL; + pl_inode_t *pl_inode = NULL; + + posix_lock_t *ext_tmp = NULL; + posix_lock_t *ext_l = NULL; + struct list_head posixlks_released; - posix_lock_t *ext_tmp = NULL; - posix_lock_t *ext_l = NULL; - struct list_head posixlks_released; + pl_inode_lock_t *ino_tmp = NULL; + pl_inode_lock_t *ino_l = NULL; + struct list_head inodelks_released; - pl_inode_lock_t *ino_tmp = NULL; - pl_inode_lock_t *ino_l = NULL; - struct list_head inodelks_released; + pl_rw_req_t *rw_tmp = NULL; + pl_rw_req_t *rw_req = NULL; - pl_rw_req_t *rw_tmp = NULL; - pl_rw_req_t *rw_req = NULL; + pl_entry_lock_t *entry_tmp = NULL; + pl_entry_lock_t *entry_l = NULL; + struct list_head entrylks_released; - pl_entry_lock_t *entry_tmp = NULL; - pl_entry_lock_t *entry_l = NULL; - struct list_head entrylks_released; + pl_dom_list_t *dom = NULL; + pl_dom_list_t *dom_tmp = NULL; - pl_dom_list_t *dom = NULL; - pl_dom_list_t *dom_tmp = NULL; + INIT_LIST_HEAD(&posixlks_released); + INIT_LIST_HEAD(&inodelks_released); + INIT_LIST_HEAD(&entrylks_released); - INIT_LIST_HEAD (&posixlks_released); - INIT_LIST_HEAD (&inodelks_released); - INIT_LIST_HEAD (&entrylks_released); + pl_inode = pl_inode_get(this, inode, NULL); + if (!pl_inode) + return 0; - pl_inode = pl_inode_get (this, inode); + pthread_mutex_lock(&pl_inode->mutex); + { + if (!list_empty(&pl_inode->rw_list)) { + gf_log(this->name, GF_LOG_WARNING, + "Pending R/W requests found, releasing."); + + list_for_each_entry_safe(rw_req, rw_tmp, &pl_inode->rw_list, list) + { + list_del(&rw_req->list); + call_stub_destroy(rw_req->stub); + GF_FREE(rw_req); + } + } - pthread_mutex_lock (&pl_inode->mutex); - { + if (!list_empty(&pl_inode->ext_list)) { + gf_log(this->name, GF_LOG_WARNING, + "Pending fcntl locks found, releasing."); - if (!list_empty (&pl_inode->rw_list)) { - gf_log (this->name, GF_LOG_WARNING, - "Pending R/W requests found, releasing."); + list_for_each_entry_safe(ext_l, ext_tmp, &pl_inode->ext_list, list) + { + __delete_lock(ext_l); + if (ext_l->blocked) { + list_add_tail(&ext_l->list, &posixlks_released); + continue; + } + __destroy_lock(ext_l); + } + } - list_for_each_entry_safe (rw_req, rw_tmp, &pl_inode->rw_list, - list) { + list_for_each_entry_safe(dom, dom_tmp, &pl_inode->dom_list, inode_list) + { + if (!list_empty(&dom->inodelk_list)) { + gf_log(this->name, GF_LOG_WARNING, + "Pending inode locks found, releasing."); - list_del (&rw_req->list); - GF_FREE (rw_req); - } + list_for_each_entry_safe(ino_l, ino_tmp, &dom->inodelk_list, + list) + { + __delete_inode_lock(ino_l); + __pl_inodelk_unref(ino_l); } - if (!list_empty (&pl_inode->ext_list)) { - gf_log (this->name, GF_LOG_WARNING, - "Pending fcntl locks found, releasing."); + list_splice_init(&dom->blocked_inodelks, &inodelks_released); + } + if (!list_empty(&dom->entrylk_list)) { + gf_log(this->name, GF_LOG_WARNING, + "Pending entry locks found, releasing."); - list_for_each_entry_safe (ext_l, ext_tmp, &pl_inode->ext_list, - list) { + list_for_each_entry_safe(entry_l, entry_tmp, &dom->entrylk_list, + domain_list) + { + list_del_init(&entry_l->domain_list); - __delete_lock (pl_inode, ext_l); - if (ext_l->blocked) { - list_add_tail (&ext_l->list, &posixlks_released); - continue; - } - __destroy_lock (ext_l); - } + GF_FREE((char *)entry_l->basename); + GF_FREE(entry_l->connection_id); + GF_FREE(entry_l); } + list_splice_init(&dom->blocked_entrylks, &entrylks_released); + } - 
list_for_each_entry_safe (dom, dom_tmp, &pl_inode->dom_list, inode_list) { + list_del(&dom->inode_list); + gf_log("posix-locks", GF_LOG_TRACE, " Cleaning up domain: %s", + dom->domain); + GF_FREE((char *)(dom->domain)); + GF_FREE(dom); + } + } + pthread_mutex_unlock(&pl_inode->mutex); - if (!list_empty (&dom->inodelk_list)) { - gf_log (this->name, GF_LOG_WARNING, - "Pending inode locks found, releasing."); + if (!list_empty(&posixlks_released)) { + list_for_each_entry_safe(ext_l, ext_tmp, &posixlks_released, list) + { + STACK_UNWIND_STRICT(lk, ext_l->frame, -1, 0, &ext_l->user_flock, + NULL); + __destroy_lock(ext_l); + } + } - list_for_each_entry_safe (ino_l, ino_tmp, &dom->inodelk_list, list) { - __delete_inode_lock (ino_l); - __pl_inodelk_unref (ino_l); - } + if (!list_empty(&inodelks_released)) { + list_for_each_entry_safe(ino_l, ino_tmp, &inodelks_released, + blocked_locks) + { + STACK_UNWIND_STRICT(inodelk, ino_l->frame, -1, 0, NULL); + __pl_inodelk_unref(ino_l); + } + } - list_splice_init (&dom->blocked_inodelks, &inodelks_released); + if (!list_empty(&entrylks_released)) { + list_for_each_entry_safe(entry_l, entry_tmp, &entrylks_released, + blocked_locks) + { + STACK_UNWIND_STRICT(entrylk, entry_l->frame, -1, 0, NULL); + GF_FREE((char *)entry_l->basename); + GF_FREE(entry_l->connection_id); + GF_FREE(entry_l); + } + } + pthread_mutex_destroy(&pl_inode->mutex); - } - if (!list_empty (&dom->entrylk_list)) { - gf_log (this->name, GF_LOG_WARNING, - "Pending entry locks found, releasing."); + GF_FREE(pl_inode); - list_for_each_entry_safe (entry_l, entry_tmp, &dom->entrylk_list, domain_list) { - list_del_init (&entry_l->domain_list); + return 0; +} - GF_FREE ((char *)entry_l->basename); - GF_FREE (entry_l->connection_id); - GF_FREE (entry_l); - } +int +pl_release(xlator_t *this, fd_t *fd) +{ + pl_inode_t *pl_inode = NULL; + uint64_t tmp_pl_inode = 0; + int ret = -1; + uint64_t tmp = 0; + pl_fdctx_t *fdctx = NULL; - list_splice_init (&dom->blocked_entrylks, &entrylks_released); - } + if (fd == NULL) { + goto out; + } - list_del (&dom->inode_list); - gf_log ("posix-locks", GF_LOG_TRACE, - " Cleaning up domain: %s", dom->domain); - GF_FREE ((char *)(dom->domain)); - GF_FREE (dom); - } + ret = inode_ctx_get(fd->inode, this, &tmp_pl_inode); + if (ret != 0) + goto clean; - } - pthread_mutex_unlock (&pl_inode->mutex); + pl_inode = (pl_inode_t *)(long)tmp_pl_inode; - list_for_each_entry_safe (ext_l, ext_tmp, &posixlks_released, list) { + pl_trace_release(this, fd); - STACK_UNWIND_STRICT (lk, ext_l->frame, -1, 0, - &ext_l->user_flock, NULL); - __destroy_lock (ext_l); - } + gf_log(this->name, GF_LOG_TRACE, "Releasing all locks with fd %p", fd); - list_for_each_entry_safe (ino_l, ino_tmp, &inodelks_released, blocked_locks) { + delete_locks_of_fd(this, pl_inode, fd); + pl_update_refkeeper(this, fd->inode); - STACK_UNWIND_STRICT (inodelk, ino_l->frame, -1, 0, NULL); - __pl_inodelk_unref (ino_l); - } +clean: + ret = fd_ctx_del(fd, this, &tmp); + if (ret) { + gf_log(this->name, GF_LOG_DEBUG, "Could not get fdctx"); + goto out; + } + + fdctx = (pl_fdctx_t *)(long)tmp; - list_for_each_entry_safe (entry_l, entry_tmp, &entrylks_released, blocked_locks) { + GF_FREE(fdctx); +out: + return ret; +} - STACK_UNWIND_STRICT (entrylk, entry_l->frame, -1, 0, NULL); - GF_FREE ((char *)entry_l->basename); - GF_FREE (entry_l->connection_id); - GF_FREE (entry_l); +int +pl_releasedir(xlator_t *this, fd_t *fd) +{ + int ret = -1; + uint64_t tmp = 0; + pl_fdctx_t *fdctx = NULL; - } + if (fd == NULL) { + goto out; + } - 
GF_FREE (pl_inode); + ret = fd_ctx_del(fd, this, &tmp); + if (ret) { + gf_log(this->name, GF_LOG_DEBUG, "Could not get fdctx"); + goto out; + } - return 0; + fdctx = (pl_fdctx_t *)(long)tmp; + + GF_FREE(fdctx); +out: + return ret; } -int -pl_release (xlator_t *this, fd_t *fd) +static int32_t +pl_request_link_count(dict_t **pxdata) { - pl_inode_t *pl_inode = NULL; - uint64_t tmp_pl_inode = 0; - int ret = -1; - uint64_t tmp = 0; - pl_fdctx_t *fdctx = NULL; + dict_t *xdata; - if (fd == NULL) { - goto out; + xdata = *pxdata; + if (xdata == NULL) { + xdata = dict_new(); + if (xdata == NULL) { + return ENOMEM; } + } else { + dict_ref(xdata); + } - ret = inode_ctx_get (fd->inode, this, &tmp_pl_inode); - if (ret != 0) - goto out; + if (dict_set_uint32(xdata, GET_LINK_COUNT, 0) != 0) { + dict_unref(xdata); + return ENOMEM; + } - pl_inode = (pl_inode_t *)(long)tmp_pl_inode; + *pxdata = xdata; - pl_trace_release (this, fd); + return 0; +} - gf_log (this->name, GF_LOG_TRACE, - "Releasing all locks with fd %p", fd); +static int32_t +pl_check_link_count(dict_t *xdata) +{ + int32_t count; - delete_locks_of_fd (this, pl_inode, fd); - pl_update_refkeeper (this, fd->inode); + /* In case we are unable to read the link count from xdata, we take a + * conservative approach and return -2, which will prevent the inode from + * being considered deleted. In fact it will cause link tracking for this + * inode to be disabled completely to avoid races. */ - ret = fd_ctx_del (fd, this, &tmp); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "Could not get fdctx"); - goto out; - } + if (xdata == NULL) { + return -2; + } - fdctx = (pl_fdctx_t *)(long)tmp; + if (dict_get_int32(xdata, GET_LINK_COUNT, &count) != 0) { + return -2; + } - GF_FREE (fdctx); -out: - return ret; + return count; } -int -pl_releasedir (xlator_t *this, fd_t *fd) +int32_t +pl_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) { - int ret = -1; - uint64_t tmp = 0; - pl_fdctx_t *fdctx = NULL; + pl_inode_t *pl_inode; - if (fd == NULL) { - goto out; + if (op_ret >= 0) { + pl_inode = pl_inode_get(this, inode, NULL); + if (pl_inode == NULL) { + PL_STACK_UNWIND(lookup, xdata, frame, -1, ENOMEM, NULL, NULL, NULL, + NULL); + return 0; } - ret = fd_ctx_del (fd, this, &tmp); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "Could not get fdctx"); - goto out; + pthread_mutex_lock(&pl_inode->mutex); + + /* We only update the link count if we previously didn't know it. + * Doing it always can lead to races since lookup is not executed + * atomically most of the times. */ + if (pl_inode->links == -2) { + pl_inode->links = pl_check_link_count(xdata); + if (buf->ia_type == IA_IFDIR) { + /* Directories have at least 2 links. To avoid special handling + * for directories, we simply decrement the value here to make + * them equivalent to regular files. 
*/ + pl_inode->links--; + } } - fdctx = (pl_fdctx_t *)(long)tmp; + pthread_mutex_unlock(&pl_inode->mutex); + } - GF_FREE (fdctx); -out: - return ret; + PL_STACK_UNWIND(lookup, xdata, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + return 0; } int32_t -__get_posixlk_count (xlator_t *this, pl_inode_t *pl_inode) +pl_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - posix_lock_t *lock = NULL; - int32_t count = 0; - - list_for_each_entry (lock, &pl_inode->ext_list, list) { - - count++; - } + int32_t error; + + error = pl_request_link_count(&xdata); + if (error == 0) { + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + dict_unref(xdata); + } else { + STACK_UNWIND_STRICT(lookup, frame, -1, error, NULL, NULL, NULL, NULL); + } + return 0; +} - return count; +int32_t +pl_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) +{ + PL_STACK_UNWIND(fstat, xdata, frame, op_ret, op_errno, buf, xdata); + return 0; } int32_t -get_posixlk_count (xlator_t *this, inode_t *inode) +pl_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - pl_inode_t *pl_inode = NULL; - uint64_t tmp_pl_inode = 0; - int ret = 0; - int32_t count = 0; + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; +} - ret = inode_ctx_get (inode, this, &tmp_pl_inode); - if (ret != 0) { - goto out; - } +int +pl_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *entries, dict_t *xdata) +{ + pl_local_t *local = NULL; + gf_dirent_t *entry = NULL; - pl_inode = (pl_inode_t *)(long) tmp_pl_inode; + if (op_ret <= 0) + goto unwind; - pthread_mutex_lock (&pl_inode->mutex); - { - count =__get_posixlk_count (this, pl_inode); - } - pthread_mutex_unlock (&pl_inode->mutex); + local = frame->local; + if (!local) + goto unwind; -out: - return count; + list_for_each_entry(entry, &entries->list, list) + { + pl_set_xdata_response(this, local, local->fd->inode, entry->inode, + entry->d_name, entry->dict, 0); + } + +unwind: + PL_STACK_UNWIND(readdirp, xdata, frame, op_ret, op_errno, entries, xdata); + + return 0; } -void -pl_parent_entrylk_xattr_fill (xlator_t *this, inode_t *parent, - char *basename, dict_t *dict) +int +pl_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - uint32_t entrylk = 0; - int ret = -1; + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, xdata); + + return 0; +} + +lock_migration_info_t * +gf_mig_info_for_lock(posix_lock_t *lock) +{ + lock_migration_info_t *new = GF_MALLOC(sizeof(lock_migration_info_t), + gf_common_mt_lock_mig); + if (new == NULL) { + goto out; + } + + INIT_LIST_HEAD(&new->list); + + posix_lock_to_flock(lock, &new->flock); + + new->lk_flags = lock->lk_flags; + + new->client_uid = gf_strdup(lock->client_uid); - if (!parent || !basename || !strlen (basename)) - goto out; - entrylk = check_entrylk_on_basename (this, parent, basename); out: - ret = dict_set_uint32 (dict, GLUSTERFS_PARENT_ENTRYLK, entrylk); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - " dict_set failed on key %s", GLUSTERFS_PARENT_ENTRYLK); - } + return new; } 
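/* Illustrative sketch (editor's addition, not part of this patch): the link
 * tracking introduced above in pl_request_link_count(), pl_check_link_count()
 * and pl_lookup_cbk() treats pl_inode->links as a small state machine:
 * -2 means the count is unknown and deletion tracking stays disabled for the
 * inode (the conservative fallback when xdata cannot be read), while a
 * non-negative value is the effective link count, with directories normalised
 * down by one so files and directories can share the same "removed" check.
 * A hypothetical helper making the convention explicit could look like:
 *
 *     static inline gf_boolean_t
 *     pl_links_known(pl_inode_t *pl_inode)
 *     {
 *         return pl_inode->links != -2;
 *     }
 *
 * pl_link_cbk() further below only increments the count when it is already
 * known (links >= 0), which keeps the convention consistent across fops.
 */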
-void -pl_entrylk_xattr_fill (xlator_t *this, inode_t *inode, - dict_t *dict) +int +pl_fill_active_locks(pl_inode_t *pl_inode, lock_migration_info_t *lmi) { - int32_t count = 0; - int ret = -1; + posix_lock_t *temp = NULL; + lock_migration_info_t *newlock = NULL; + int count = 0; + + pthread_mutex_lock(&pl_inode->mutex); + { + if (list_empty(&pl_inode->ext_list)) { + count = 0; + goto unlock; + } - count = get_entrylk_count (this, inode); - ret = dict_set_int32 (dict, GLUSTERFS_ENTRYLK_COUNT, count); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - " dict_set failed on key %s", GLUSTERFS_ENTRYLK_COUNT); + list_for_each_entry(temp, &pl_inode->ext_list, list) + { + if (temp->blocked) + continue; + + newlock = gf_mig_info_for_lock(temp); + if (!newlock) { + pthread_mutex_unlock(&pl_inode->mutex); + gf_msg(THIS->name, GF_LOG_ERROR, 0, 0, "lock_dup failed"); + count = -1; + goto out; + } + + list_add_tail(&newlock->list, &lmi->list); + count++; } + } +unlock: + pthread_mutex_unlock(&pl_inode->mutex); +out: + return count; } -void -pl_inodelk_xattr_fill (xlator_t *this, inode_t *inode, dict_t *dict, - gf_boolean_t per_dom) +/* This function reads only active locks */ +static int +pl_getactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - int32_t count = 0; - int ret = -1; - char *domname = NULL; + pl_inode_t *pl_inode = NULL; + lock_migration_info_t locks; + int op_ret = 0; + int op_errno = 0; + int count = 0; + INIT_LIST_HEAD(&locks.list); - if (per_dom){ - ret = dict_get_str (dict, GLUSTERFS_INODELK_DOM_COUNT, - &domname); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to get " - "value for key %s",GLUSTERFS_INODELK_DOM_COUNT); - goto out; - } - } + pl_inode = pl_inode_get(this, loc->inode, NULL); + if (!pl_inode) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "pl_inode_get failed"); - count = get_inodelk_count (this, inode, domname); + op_ret = -1; + op_errno = ENOMEM; + goto out; + } - ret = dict_set_int32 (dict, GLUSTERFS_INODELK_COUNT, count); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, "Failed to set count for " - "key %s", GLUSTERFS_INODELK_COUNT); - } + count = pl_fill_active_locks(pl_inode, &locks); + + op_ret = count; out: - return; + STACK_UNWIND_STRICT(getactivelk, frame, op_ret, op_errno, &locks, NULL); + + gf_free_mig_locks(&locks); + + return 0; } void -pl_posixlk_xattr_fill (xlator_t *this, inode_t *inode, - dict_t *dict) +pl_metalk_unref(pl_meta_lock_t *lock) { - int32_t count = 0; - int ret = -1; + lock->ref--; + if (!lock->ref) { + GF_FREE(lock->client_uid); + GF_FREE(lock); + } +} - count = get_posixlk_count (this, inode); - ret = dict_set_int32 (dict, GLUSTERFS_POSIXLK_COUNT, count); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - " dict_set failed on key %s", GLUSTERFS_POSIXLK_COUNT); - } +void +__pl_metalk_ref(pl_meta_lock_t *lock) +{ + lock->ref++; +} +pl_meta_lock_t * +new_meta_lock(call_frame_t *frame, xlator_t *this) +{ + pl_meta_lock_t *lock = GF_CALLOC(1, sizeof(*lock), + gf_locks_mt_pl_meta_lock_t); + + if (!lock) { + gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, + "mem allocation" + " failed for meta lock"); + goto out; + } + + INIT_LIST_HEAD(&lock->list); + INIT_LIST_HEAD(&lock->client_list); + + lock->client_uid = gf_strdup(frame->root->client->client_uid); + if (!lock->client_uid) { + gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, + "mem allocation" + " failed for client_uid"); + GF_FREE(lock); + lock = NULL; + goto out; + } + + __pl_metalk_ref(lock); +out: + return lock; } -int32_t -pl_lookup_cbk (call_frame_t 
*frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *xdata, - struct iatt *postparent) +int +pl_insert_metalk(pl_inode_t *pl_inode, pl_ctx_t *ctx, pl_meta_lock_t *lock) { - pl_local_t *local = NULL; + int ret = 0; - GF_VALIDATE_OR_GOTO (this->name, frame->local, out); + if (!pl_inode || !ctx || !lock) { + gf_msg(THIS->name, GF_LOG_INFO, 0, 0, "NULL parameter"); + ret = -1; + goto out; + } - if (op_ret) - goto out; + lock->pl_inode = pl_inode; - local = frame->local; + /* refer function pl_inode_setlk for more info for this ref. + * This should be unrefed on meta-unlock triggered by rebalance or + * in cleanup with client disconnect*/ + /*TODO: unref this in cleanup code for disconnect and meta-unlock*/ + pl_inode->inode = inode_ref(pl_inode->inode); - if (local->parent_entrylk_req) - pl_parent_entrylk_xattr_fill (this, local->loc.parent, - (char*)local->loc.name, xdata); - if (local->entrylk_count_req) - pl_entrylk_xattr_fill (this, inode, xdata); - if (local->inodelk_count_req) - pl_inodelk_xattr_fill (this, inode, xdata, _gf_false); - if (local->inodelk_dom_count_req) - pl_inodelk_xattr_fill (this, inode, xdata, _gf_true); - if (local->posixlk_count_req) - pl_posixlk_xattr_fill (this, inode, xdata); + /* NOTE:In case of a client-server disconnect we need to cleanup metalk. + * Hence, adding the metalk to pl_ctx_t as well. The mutex lock order + * should always be on ctx and then on pl_inode*/ + + pthread_mutex_lock(&ctx->lock); + { + pthread_mutex_lock(&pl_inode->mutex); + { + list_add_tail(&lock->list, &pl_inode->metalk_list); + } + pthread_mutex_unlock(&pl_inode->mutex); + list_add_tail(&lock->client_list, &ctx->metalk_list); + } + pthread_mutex_unlock(&ctx->lock); out: - local = frame->local; - frame->local = NULL; - - if (local != NULL) { - loc_wipe (&local->loc); - mem_put (local); - } - - STACK_UNWIND_STRICT ( - lookup, - frame, - op_ret, - op_errno, - inode, - buf, - xdata, - postparent); - return 0; + return ret; } int32_t -pl_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xdata) +pl_metalk(call_frame_t *frame, xlator_t *this, inode_t *inode) { - pl_local_t *local = NULL; - int ret = -1; + pl_inode_t *pl_inode = NULL; + int ret = 0; + pl_meta_lock_t *reqlk = NULL; + pl_ctx_t *ctx = NULL; + + pl_inode = pl_inode_get(this, inode, NULL); + if (!pl_inode) { + gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, + "pl_inode mem allocation failedd"); + + ret = -1; + goto out; + } + + /* Non rebalance process trying to do metalock */ + if (frame->root->pid != GF_CLIENT_PID_DEFRAG) { + ret = -1; + goto out; + } + + /* Note: In the current scheme of glusterfs where lock migration is + * experimental, (ideally) the rebalance process which is migrating + * the file should request for a metalock. Hence, the metalock count + * should not be more than one for an inode. In future, if there is a + * need for meta-lock from other clients, the following block can be + * removed. + * + * Since pl_metalk is called as part of setxattr operation, any client + * process(non-rebalance) residing outside trusted network can exhaust + * memory of the server node by issuing setxattr repetitively on the + * metalock key. 
The following code makes sure that more than + * one metalock cannot be granted on an inode*/ + pthread_mutex_lock(&pl_inode->mutex); + { + if (pl_metalock_is_active(pl_inode)) { + ret = -1; + } + } + pthread_mutex_unlock(&pl_inode->mutex); + + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, EINVAL, 0, + "More than one meta-lock cannot be granted on" + " the inode"); + goto out; + } + + if (frame->root->client) { + ctx = pl_ctx_get(frame->root->client, this); + if (!ctx) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "pl_ctx_get failed"); + + ret = -1; + goto out; + } + } else { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "frame-root-client " + "is NULL"); + + ret = -1; + goto out; + } + + reqlk = new_meta_lock(frame, this); + if (!reqlk) { + ret = -1; + goto out; + } + + ret = pl_insert_metalk(pl_inode, ctx, reqlk); + if (ret < 0) { + pl_metalk_unref(reqlk); + } - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); +out: + return ret; +} - local = mem_get0 (this->local_pool); - GF_VALIDATE_OR_GOTO (this->name, local, out); +static void +__unwind_queued_locks(pl_inode_t *pl_inode, struct list_head *tmp_list) +{ + if (list_empty(&pl_inode->queued_locks)) + return; - if (xdata) { - if (dict_get (xdata, GLUSTERFS_ENTRYLK_COUNT)) - local->entrylk_count_req = 1; - if (dict_get (xdata, GLUSTERFS_INODELK_COUNT)) - local->inodelk_count_req = 1; - if (dict_get (xdata, GLUSTERFS_INODELK_DOM_COUNT)) - local->inodelk_dom_count_req = 1; - if (dict_get (xdata, GLUSTERFS_POSIXLK_COUNT)) - local->posixlk_count_req = 1; - if (dict_get (xdata, GLUSTERFS_PARENT_ENTRYLK)) - local->parent_entrylk_req = 1; - } + list_splice_init(&pl_inode->queued_locks, tmp_list); +} - frame->local = local; - loc_copy (&local->loc, loc); - - STACK_WIND (frame, - pl_lookup_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, - loc, xdata); - ret = 0; -out: - if (ret == -1) - STACK_UNWIND_STRICT (lookup, frame, -1, 0, NULL, - NULL, NULL, NULL); +static void +__unwind_blocked_locks(pl_inode_t *pl_inode, struct list_head *tmp_list) +{ + posix_lock_t *lock = NULL; + posix_lock_t *tmp = NULL; - return 0; + if (list_empty(&pl_inode->ext_list)) + return; + + list_for_each_entry_safe(lock, tmp, &pl_inode->ext_list, list) + { + if (!lock->blocking) + continue; + + list_del_init(&lock->list); + list_add_tail(&lock->list, tmp_list); + } } + int -pl_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries, dict_t *xdata) +pl_metaunlock(call_frame_t *frame, xlator_t *this, inode_t *inode, dict_t *dict) { - pl_local_t *local = NULL; - gf_dirent_t *entry = NULL; + pl_inode_t *pl_inode = NULL; + int ret = 0; + pl_meta_lock_t *meta_lock = NULL; + pl_meta_lock_t *tmp_metalk = NULL; + pl_ctx_t *ctx = NULL; + posix_lock_t *posix_lock = NULL; + posix_lock_t *tmp_posixlk = NULL; + struct list_head tmp_posixlk_list; + + INIT_LIST_HEAD(&tmp_posixlk_list); + + if (frame->root->client) { + ctx = pl_ctx_get(frame->root->client, this); + if (!ctx) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "pl_ctx_get failed"); + + ret = -1; + goto out; + } + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "frame-root-client is " + "NULL"); + ret = -1; + goto out; + } + + pl_inode = pl_inode_get(this, inode, NULL); + if (!pl_inode) { + ret = -1; + goto out; + } + + pthread_mutex_lock(&ctx->lock); + { + pthread_mutex_lock(&pl_inode->mutex); + { + /* Unwind queued locks regardless of migration status */ + __unwind_queued_locks(pl_inode, &tmp_posixlk_list); - local = 
frame->local; + /* Unwind blocked locks only for successful migration */ + if (dict_get_sizen(dict, "status")) { + /* unwind all blocked locks */ + __unwind_blocked_locks(pl_inode, &tmp_posixlk_list); + } - if (op_ret <= 0) - goto unwind; + /* unlock metalk */ + /* if this list is empty then pl_inode->metalk_list + * should be empty too. meta lock should in all cases + * be added/removed from both pl_ctx_t and pl_inode */ + + if (list_empty(&ctx->metalk_list)) + goto unlock; - list_for_each_entry (entry, &entries->list, list) { - if (local->entrylk_count_req) - pl_entrylk_xattr_fill (this, entry->inode, entry->dict); - if (local->inodelk_count_req) - pl_inodelk_xattr_fill (this, entry->inode, entry->dict, - _gf_false); - if (local->inodelk_dom_count_req) - pl_inodelk_xattr_fill (this, entry->inode, entry->dict, - _gf_true); - if (local->posixlk_count_req) - pl_posixlk_xattr_fill (this, entry->inode, entry->dict); + list_for_each_entry_safe(meta_lock, tmp_metalk, &ctx->metalk_list, + client_list) + { + list_del_init(&meta_lock->client_list); + + pl_inode = meta_lock->pl_inode; + + list_del_init(&meta_lock->list); + + pl_metalk_unref(meta_lock); + + /* The corresponding ref is taken in + * pl_insert_metalk*/ + inode_unref(pl_inode->inode); + } + + if (dict_get_sizen(dict, "status")) + pl_inode->migrated = _gf_true; + else + pl_inode->migrated = _gf_false; } + unlock: -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); + pthread_mutex_unlock(&pl_inode->mutex); + } + pthread_mutex_unlock(&ctx->lock); + +out: + list_for_each_entry_safe(posix_lock, tmp_posixlk, &tmp_posixlk_list, list) + { + list_del_init(&posix_lock->list); - if (local) - mem_put (local); + STACK_UNWIND_STRICT(lk, posix_lock->frame, -1, EREMOTE, + &posix_lock->user_flock, NULL); - return 0; + __destroy_lock(posix_lock); + } + + return ret; } -int -pl_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, dict_t *dict) +int32_t +pl_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - pl_local_t *local = NULL; - - local = mem_get0 (this->local_pool); - GF_VALIDATE_OR_GOTO (this->name, local, out); + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; + local = frame->local; + if (local && local->update_mlock_enforced_flag && op_ret != -1) { + pl_inode = pl_inode_get(this, local->inode, NULL); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } - if (dict) { - if (dict_get (dict, GLUSTERFS_ENTRYLK_COUNT)) - local->entrylk_count_req = 1; - if (dict_get (dict, GLUSTERFS_INODELK_COUNT)) - local->inodelk_count_req = 1; - if (dict_get (dict, GLUSTERFS_INODELK_DOM_COUNT)) - local->inodelk_dom_count_req = 1; - if (dict_get (dict, GLUSTERFS_POSIXLK_COUNT)) - local->posixlk_count_req = 1; + pthread_mutex_lock(&pl_inode->mutex); + { + while (pl_inode->fop_wind_count > 0) { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "waiting for existing fops (count %d) to drain for " + "gfid %s", + pl_inode->fop_wind_count, uuid_utoa(pl_inode->gfid)); + pthread_cond_wait(&pl_inode->check_fop_wind_count, + &pl_inode->mutex); + } + pl_inode->mlock_enforced = _gf_true; + pl_inode->check_mlock_info = _gf_false; } + pthread_mutex_unlock(&pl_inode->mutex); + } - frame->local = local; +unwind: + PL_STACK_UNWIND_FOR_CLIENT(setxattr, xdata, frame, op_ret, op_errno, xdata); + return 0; +} - STACK_WIND (frame, pl_readdirp_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, - fd, size, 
offset, dict); +int32_t +pl_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int flags, dict_t *xdata) +{ + int op_ret = 0; + int op_errno = EINVAL; + dict_t *xdata_rsp = NULL; + char *name = NULL; + posix_locks_private_t *priv = this->private; - return 0; -out: - STACK_UNWIND_STRICT (readdirp, frame, -1, ENOMEM, NULL, NULL); - return 0; -} + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + + if (dict_get_sizen(dict, GF_META_LOCK_KEY)) { + op_ret = pl_metalk(frame, this, loc->inode); + + } else if (dict_get_sizen(dict, GF_META_UNLOCK_KEY)) { + op_ret = pl_metaunlock(frame, this, loc->inode, dict); + } else { + goto usual; + } + PL_STACK_UNWIND_FOR_CLIENT(setxattr, xdata_rsp, frame, op_ret, op_errno, + xdata_rsp); + return 0; + +usual: + PL_CHECK_LOCK_ENFORCE_KEY(frame, dict, name, this, loc, ((fd_t *)NULL), + priv); + + STACK_WIND(frame, pl_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata); + return 0; + +unwind: + PL_STACK_UNWIND_FOR_CLIENT(setxattr, xdata, frame, op_ret, op_errno, xdata); + + return 0; +} void -pl_dump_lock (char *str, int size, struct gf_flock *flock, - gf_lkowner_t *owner, void *trans, char *conn_id, - time_t *granted_time, time_t *blkd_time, gf_boolean_t active) +pl_dump_lock(char *str, int size, struct gf_flock *flock, gf_lkowner_t *owner, + void *trans, char *conn_id, time_t *granted_time, + time_t *blkd_time, gf_boolean_t active) { - char *type_str = NULL; - char granted[32] = {0,}; - char blocked[32] = {0,}; - - switch (flock->l_type) { + char *type_str = NULL; + char granted[GF_TIMESTR_SIZE] = { + 0, + }; + char blocked[GF_TIMESTR_SIZE] = { + 0, + }; + + if (granted_time) + gf_time_fmt(granted, sizeof(granted), *granted_time, gf_timefmt_FT); + if (blkd_time) + gf_time_fmt(blocked, sizeof(blocked), *blkd_time, gf_timefmt_FT); + switch (flock->l_type) { case F_RDLCK: - type_str = "READ"; - break; + type_str = "READ"; + break; case F_WRLCK: - type_str = "WRITE"; - break; + type_str = "WRITE"; + break; case F_UNLCK: - type_str = "UNLOCK"; - break; + type_str = "UNLOCK"; + break; default: - type_str = "UNKNOWN"; - break; + type_str = "UNKNOWN"; + break; + } + + if (active) { + if (blkd_time && *blkd_time == 0) { + snprintf(str, size, RANGE_GRNTD_FMT, type_str, flock->l_whence, + (unsigned long long)flock->l_start, + (unsigned long long)flock->l_len, + (unsigned long long)flock->l_pid, lkowner_utoa(owner), + trans, conn_id, granted); + } else { + snprintf(str, size, RANGE_BLKD_GRNTD_FMT, type_str, flock->l_whence, + (unsigned long long)flock->l_start, + (unsigned long long)flock->l_len, + (unsigned long long)flock->l_pid, lkowner_utoa(owner), + trans, conn_id, blocked, granted); } + } else { + snprintf(str, size, RANGE_BLKD_FMT, type_str, flock->l_whence, + (unsigned long long)flock->l_start, + (unsigned long long)flock->l_len, + (unsigned long long)flock->l_pid, lkowner_utoa(owner), trans, + conn_id, blocked); + } +} - if (active) { - if (blkd_time && *blkd_time == 0) { - snprintf (str, size, RANGE_GRNTD_FMT, - type_str, flock->l_whence, - (unsigned long long) flock->l_start, - (unsigned long long) flock->l_len, - (unsigned long long) flock->l_pid, - lkowner_utoa (owner), trans, conn_id, - ctime_r (granted_time, granted)); - } else { - snprintf (str, size, RANGE_BLKD_GRNTD_FMT, - type_str, flock->l_whence, - (unsigned long long) flock->l_start, - (unsigned long long) flock->l_len, - (unsigned long long) flock->l_pid, - lkowner_utoa (owner), trans, conn_id, - ctime_r 
(blkd_time, blocked), - ctime_r (granted_time, granted)); - } - } - else { - snprintf (str, size, RANGE_BLKD_FMT, - type_str, flock->l_whence, - (unsigned long long) flock->l_start, - (unsigned long long) flock->l_len, - (unsigned long long) flock->l_pid, - lkowner_utoa (owner), trans, conn_id, - ctime_r (blkd_time, blocked)); +void +__dump_entrylks(pl_inode_t *pl_inode) +{ + pl_dom_list_t *dom = NULL; + pl_entry_lock_t *lock = NULL; + char blocked[GF_TIMESTR_SIZE] = { + 0, + }; + char granted[GF_TIMESTR_SIZE] = { + 0, + }; + int count = 0; + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + char *k = "xlator.feature.locks.lock-dump.domain.entrylk"; + + char tmp[4098]; + + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + count = 0; + + gf_proc_dump_build_key(key, "lock-dump.domain", "domain"); + gf_proc_dump_write(key, "%s", dom->domain); + + list_for_each_entry(lock, &dom->entrylk_list, domain_list) + { + gf_time_fmt(granted, sizeof(granted), lock->granted_time, + gf_timefmt_FT); + gf_proc_dump_build_key(key, k, "entrylk[%d](ACTIVE)", count); + if (lock->blkd_time == 0) { + snprintf(tmp, sizeof(tmp), ENTRY_GRNTD_FMT, + lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" + : "ENTRYLK_WRLCK", + lock->basename, (unsigned long long)lock->client_pid, + lkowner_utoa(&lock->owner), lock->client, + lock->connection_id, granted); + } else { + gf_time_fmt(blocked, sizeof(blocked), lock->blkd_time, + gf_timefmt_FT); + snprintf(tmp, sizeof(tmp), ENTRY_BLKD_GRNTD_FMT, + lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" + : "ENTRYLK_WRLCK", + lock->basename, (unsigned long long)lock->client_pid, + lkowner_utoa(&lock->owner), lock->client, + lock->connection_id, blocked, granted); + } + + gf_proc_dump_write(key, "%s", tmp); + + count++; } + list_for_each_entry(lock, &dom->blocked_entrylks, blocked_locks) + { + gf_time_fmt(blocked, sizeof(blocked), lock->blkd_time, + gf_timefmt_FT); + + gf_proc_dump_build_key(key, k, "entrylk[%d](BLOCKED)", count); + snprintf( + tmp, sizeof(tmp), ENTRY_BLKD_FMT, + lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" : "ENTRYLK_WRLCK", + lock->basename, (unsigned long long)lock->client_pid, + lkowner_utoa(&lock->owner), lock->client, lock->connection_id, + blocked); + + gf_proc_dump_write(key, "%s", tmp); + + count++; + } + } } void -__dump_entrylks (pl_inode_t *pl_inode) -{ - pl_dom_list_t *dom = NULL; - pl_entry_lock_t *lock = NULL; - char blocked[32] = {0,}; - char granted[32] = {0,}; - int count = 0; - char key[GF_DUMP_MAX_BUF_LEN] = {0,}; - - char tmp[256]; - - list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { - - count = 0; - - gf_proc_dump_build_key(key, - "lock-dump.domain", - "domain"); - gf_proc_dump_write(key, "%s", dom->domain); - - list_for_each_entry (lock, &dom->entrylk_list, domain_list) { - - gf_proc_dump_build_key(key, - "xlator.feature.locks.lock-dump.domain.entrylk", - "entrylk[%d](ACTIVE)", count ); - if (lock->blkd_time.tv_sec == 0 && lock->blkd_time.tv_usec == 0) { - snprintf (tmp, 256, ENTRY_GRNTD_FMT, - lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" : - "ENTRYLK_WRLCK", lock->basename, - (unsigned long long) lock->client_pid, - lkowner_utoa (&lock->owner), lock->client, - lock->connection_id, - ctime_r (&lock->granted_time.tv_sec, granted)); - } else { - snprintf (tmp, 256, ENTRY_BLKD_GRNTD_FMT, - lock->type == ENTRYLK_RDLCK ? 
"ENTRYLK_RDLCK" : - "ENTRYLK_WRLCK", lock->basename, - (unsigned long long) lock->client_pid, - lkowner_utoa (&lock->owner), lock->client, - lock->connection_id, - ctime_r (&lock->blkd_time.tv_sec, blocked), - ctime_r (&lock->granted_time.tv_sec, granted)); - } +dump_entrylks(pl_inode_t *pl_inode) +{ + pthread_mutex_lock(&pl_inode->mutex); + { + __dump_entrylks(pl_inode); + } + pthread_mutex_unlock(&pl_inode->mutex); +} - gf_proc_dump_write(key, tmp); +void +__dump_inodelks(pl_inode_t *pl_inode) +{ + pl_dom_list_t *dom = NULL; + pl_inode_lock_t *lock = NULL; + int count = 0; + char key[GF_DUMP_MAX_BUF_LEN]; - count++; - } + char tmp[4098]; - list_for_each_entry (lock, &dom->blocked_entrylks, blocked_locks) { + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + count = 0; - gf_proc_dump_build_key(key, - "xlator.feature.locks.lock-dump.domain.entrylk", - "entrylk[%d](BLOCKED)", count ); - snprintf (tmp, 256, ENTRY_BLKD_FMT, - lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" : - "ENTRYLK_WRLCK", lock->basename, - (unsigned long long) lock->client_pid, - lkowner_utoa (&lock->owner), lock->client, - lock->connection_id, - ctime_r (&lock->blkd_time.tv_sec, blocked)); + gf_proc_dump_build_key(key, "lock-dump.domain", "domain"); + gf_proc_dump_write(key, "%s", dom->domain); - gf_proc_dump_write(key, tmp); + list_for_each_entry(lock, &dom->inodelk_list, list) + { + gf_proc_dump_build_key(key, "inodelk", "inodelk[%d](ACTIVE)", + count); - count++; - } + SET_FLOCK_PID(&lock->user_flock, lock); + pl_dump_lock(tmp, sizeof(tmp), &lock->user_flock, &lock->owner, + lock->client, lock->connection_id, &lock->granted_time, + &lock->blkd_time, _gf_true); + gf_proc_dump_write(key, "%s", tmp); + count++; } + list_for_each_entry(lock, &dom->blocked_inodelks, blocked_locks) + { + gf_proc_dump_build_key(key, "inodelk", "inodelk[%d](BLOCKED)", + count); + SET_FLOCK_PID(&lock->user_flock, lock); + pl_dump_lock(tmp, sizeof(tmp), &lock->user_flock, &lock->owner, + lock->client, lock->connection_id, 0, &lock->blkd_time, + _gf_false); + gf_proc_dump_write(key, "%s", tmp); + + count++; + } + } } void -dump_entrylks (pl_inode_t *pl_inode) +dump_inodelks(pl_inode_t *pl_inode) { - pthread_mutex_lock (&pl_inode->mutex); - { - __dump_entrylks (pl_inode); - } - pthread_mutex_unlock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + __dump_inodelks(pl_inode); + } + pthread_mutex_unlock(&pl_inode->mutex); +} +void +__dump_posixlks(pl_inode_t *pl_inode) +{ + posix_lock_t *lock = NULL; + int count = 0; + char key[GF_DUMP_MAX_BUF_LEN]; + + char tmp[4098]; + + list_for_each_entry(lock, &pl_inode->ext_list, list) + { + SET_FLOCK_PID(&lock->user_flock, lock); + gf_proc_dump_build_key(key, "posixlk", "posixlk[%d](%s)", count, + lock->blocked ? "BLOCKED" : "ACTIVE"); + pl_dump_lock(tmp, sizeof(tmp), &lock->user_flock, &lock->owner, + lock->client, lock->client_uid, &lock->granted_time, + &lock->blkd_time, (lock->blocked) ? 
_gf_false : _gf_true); + gf_proc_dump_write(key, "%s", tmp); + + count++; + } } void -__dump_inodelks (pl_inode_t *pl_inode) +dump_posixlks(pl_inode_t *pl_inode) { - pl_dom_list_t *dom = NULL; - pl_inode_lock_t *lock = NULL; - int count = 0; - char key[GF_DUMP_MAX_BUF_LEN]; + pthread_mutex_lock(&pl_inode->mutex); + { + __dump_posixlks(pl_inode); + } + pthread_mutex_unlock(&pl_inode->mutex); +} - char tmp[256]; +int32_t +pl_dump_inode_priv(xlator_t *this, inode_t *inode) +{ + int ret = -1; + uint64_t tmp_pl_inode = 0; + pl_inode_t *pl_inode = NULL; + char *pathname = NULL; + gf_boolean_t section_added = _gf_false; + + int count = 0; + + if (!inode) { + errno = EINVAL; + goto out; + } + + ret = TRY_LOCK(&inode->lock); + if (ret) + goto out; + { + ret = __inode_ctx_get(inode, this, &tmp_pl_inode); + if (ret) + goto unlock; + } +unlock: + UNLOCK(&inode->lock); + if (ret) + goto out; + + pl_inode = (pl_inode_t *)(long)tmp_pl_inode; + if (!pl_inode) { + ret = -1; + goto out; + } + + gf_proc_dump_add_section("xlator.features.locks.%s.inode", this->name); + section_added = _gf_true; + + /*We are safe to call __inode_path since we have the + * inode->table->lock */ + __inode_path(inode, NULL, &pathname); + if (pathname) + gf_proc_dump_write("path", "%s", pathname); + + gf_proc_dump_write("mandatory", "%d", pl_inode->mandatory); + + ret = pthread_mutex_trylock(&pl_inode->mutex); + if (ret) + goto out; + { + count = __get_entrylk_count(this, pl_inode); + if (count) { + gf_proc_dump_write("entrylk-count", "%d", count); + __dump_entrylks(pl_inode); + } - list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { + count = __get_inodelk_count(this, pl_inode, NULL); + if (count) { + gf_proc_dump_write("inodelk-count", "%d", count); + __dump_inodelks(pl_inode); + } - count = 0; + count = __get_posixlk_count(pl_inode); + if (count) { + gf_proc_dump_write("posixlk-count", "%d", count); + __dump_posixlks(pl_inode); + } - gf_proc_dump_build_key(key, - "lock-dump.domain", - "domain"); - gf_proc_dump_write(key, "%s", dom->domain); + gf_proc_dump_write("links", "%d", pl_inode->links); + gf_proc_dump_write("removes_pending", "%u", pl_inode->remove_running); + gf_proc_dump_write("removed", "%u", pl_inode->removed); + } + pthread_mutex_unlock(&pl_inode->mutex); - list_for_each_entry (lock, &dom->inodelk_list, list) { +out: + GF_FREE(pathname); + + if (ret && inode) { + if (!section_added) + gf_proc_dump_add_section( + "xlator.features.locks.%s." 
+ "inode", + this->name); + gf_proc_dump_write("Unable to print lock state", + "(Lock " + "acquisition failure) %s", + uuid_utoa(inode->gfid)); + } + return ret; +} - gf_proc_dump_build_key(key, - "inodelk", - "inodelk[%d](ACTIVE)",count ); +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; - SET_FLOCK_PID (&lock->user_flock, lock); - pl_dump_lock (tmp, 256, &lock->user_flock, - &lock->owner, - lock->client, lock->connection_id, - &lock->granted_time.tv_sec, - &lock->blkd_time.tv_sec, - _gf_true); - gf_proc_dump_write(key, tmp); + if (!this) + return ret; - count++; - } + ret = xlator_mem_acct_init(this, gf_locks_mt_end + 1); - list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) { + if (ret != 0) { + gf_log(this->name, GF_LOG_ERROR, + "Memory accounting init" + "failed"); + return ret; + } - gf_proc_dump_build_key(key, - "inodelk", - "inodelk[%d](BLOCKED)",count ); - SET_FLOCK_PID (&lock->user_flock, lock); - pl_dump_lock (tmp, 256, &lock->user_flock, - &lock->owner, - lock->client, lock->connection_id, - 0, &lock->blkd_time.tv_sec, - _gf_false); - gf_proc_dump_write(key, tmp); + return ret; +} - count++; - } +pl_ctx_t * +pl_ctx_get(client_t *client, xlator_t *xlator) +{ + void *tmp = NULL; + pl_ctx_t *ctx = NULL; + pl_ctx_t *setted_ctx = NULL; - } + client_ctx_get(client, xlator, &tmp); + + ctx = tmp; + + if (ctx != NULL) + goto out; + + ctx = GF_CALLOC(1, sizeof(pl_ctx_t), gf_locks_mt_posix_lock_t); + + if (ctx == NULL) + goto out; + pthread_mutex_init(&ctx->lock, NULL); + INIT_LIST_HEAD(&ctx->inodelk_lockers); + INIT_LIST_HEAD(&ctx->entrylk_lockers); + INIT_LIST_HEAD(&ctx->metalk_list); + + setted_ctx = client_ctx_set(client, xlator, ctx); + if (ctx != setted_ctx) { + pthread_mutex_destroy(&ctx->lock); + GF_FREE(ctx); + ctx = setted_ctx; + } +out: + return ctx; } -void -dump_inodelks (pl_inode_t *pl_inode) +int +pl_metalk_client_cleanup(xlator_t *this, pl_ctx_t *ctx) { - pthread_mutex_lock (&pl_inode->mutex); + pl_meta_lock_t *meta_lock = NULL; + pl_meta_lock_t *tmp_metalk = NULL; + pl_inode_t *pl_inode = NULL; + posix_lock_t *posix_lock = NULL; + posix_lock_t *tmp_posixlk = NULL; + struct list_head tmp_posixlk_list; + + INIT_LIST_HEAD(&tmp_posixlk_list); + + pthread_mutex_lock(&ctx->lock); + { + /* if this list is empty then pl_inode->metalk_list should be + * empty too. 
meta lock should in all cases be added/removed + * from both pl_ctx_t and pl_inode */ + if (list_empty(&ctx->metalk_list)) + goto unlock; + + list_for_each_entry_safe(meta_lock, tmp_metalk, &ctx->metalk_list, + client_list) { - __dump_inodelks (pl_inode); + list_del_init(&meta_lock->client_list); + + pl_inode = meta_lock->pl_inode; + + pthread_mutex_lock(&pl_inode->mutex); + + { + /* Since the migration status is unknown here + * unwind all queued and blocked locks to check + * migration status and find the correct + * destination */ + __unwind_queued_locks(pl_inode, &tmp_posixlk_list); + + __unwind_blocked_locks(pl_inode, &tmp_posixlk_list); + + list_del_init(&meta_lock->list); + + pl_metalk_unref(meta_lock); + } + pthread_mutex_unlock(&pl_inode->mutex); + + /* The corresponding ref is taken in + * pl_insert_metalk*/ + inode_unref(pl_inode->inode); } - pthread_mutex_unlock (&pl_inode->mutex); + } + +unlock: + pthread_mutex_unlock(&ctx->lock); + list_for_each_entry_safe(posix_lock, tmp_posixlk, &tmp_posixlk_list, list) + { + list_del_init(&posix_lock->list); + + STACK_UNWIND_STRICT(lk, posix_lock->frame, -1, EREMOTE, + &posix_lock->user_flock, NULL); + + __destroy_lock(posix_lock); + } + return 0; } -void -__dump_posixlks (pl_inode_t *pl_inode) +static int +pl_client_disconnect_cbk(xlator_t *this, client_t *client) { - posix_lock_t *lock = NULL; - int count = 0; - char key[GF_DUMP_MAX_BUF_LEN]; + pl_ctx_t *pl_ctx = pl_ctx_get(client, this); + if (pl_ctx) { + pl_inodelk_client_cleanup(this, pl_ctx); + pl_entrylk_client_cleanup(this, pl_ctx); + pl_metalk_client_cleanup(this, pl_ctx); + } + + return 0; +} - char tmp[256]; +static int +pl_client_destroy_cbk(xlator_t *this, client_t *client) +{ + void *tmp = NULL; + pl_ctx_t *pl_ctx = NULL; - list_for_each_entry (lock, &pl_inode->ext_list, list) { + pl_client_disconnect_cbk(this, client); - SET_FLOCK_PID (&lock->user_flock, lock); - gf_proc_dump_build_key(key, - "posixlk", - "posixlk[%d](%s)", - count, - lock->blocked ? "BLOCKED" : "ACTIVE"); - pl_dump_lock (tmp, 256, &lock->user_flock, - &lock->owner, lock->client, NULL, - &lock->granted_time.tv_sec, &lock->blkd_time.tv_sec, - (lock->blocked)? 
_gf_false: _gf_true); - gf_proc_dump_write(key, tmp); + client_ctx_del(client, this, &tmp); - count++; - } + if (tmp == NULL) + return 0; + + pl_ctx = tmp; + + GF_ASSERT(list_empty(&pl_ctx->inodelk_lockers)); + GF_ASSERT(list_empty(&pl_ctx->entrylk_lockers)); + + pthread_mutex_destroy(&pl_ctx->lock); + GF_FREE(pl_ctx); + + return 0; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + posix_locks_private_t *priv = this->private; + int ret = -1; + char *tmp_str = NULL; + + GF_OPTION_RECONF("trace", priv->trace, options, bool, out); + + GF_OPTION_RECONF("monkey-unlocking", priv->monkey_unlocking, options, bool, + out); + + GF_OPTION_RECONF("revocation-secs", priv->revocation_secs, options, uint32, + out); + + GF_OPTION_RECONF("revocation-clear-all", priv->revocation_clear_all, + options, bool, out); + + GF_OPTION_RECONF("revocation-max-blocked", priv->revocation_max_blocked, + options, uint32, out); + + GF_OPTION_RECONF("notify-contention", priv->notify_contention, options, + bool, out); + + GF_OPTION_RECONF("notify-contention-delay", priv->notify_contention_delay, + options, uint32, out); + + GF_OPTION_RECONF("mandatory-locking", tmp_str, options, str, out); + + GF_OPTION_RECONF("enforce-mandatory-lock", priv->mlock_enforced, options, + bool, out); + + if (!strcmp(tmp_str, "forced")) + priv->mandatory_mode = MLK_FORCED; + else if (!strcmp(tmp_str, "file")) + priv->mandatory_mode = MLK_FILE_BASED; + else if (!strcmp(tmp_str, "optimal")) + priv->mandatory_mode = MLK_OPTIMAL; + else + priv->mandatory_mode = MLK_NONE; + + ret = 0; + +out: + return ret; +} + +int +init(xlator_t *this) +{ + posix_locks_private_t *priv = NULL; + xlator_list_t *trav = NULL; + char *tmp_str = NULL; + int ret = -1; + + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_CRITICAL, + "FATAL: posix-locks should have exactly one child"); + goto out; + } + + if (!this->parents) { + gf_log(this->name, GF_LOG_WARNING, + "Volume is dangling. 
Please check the volume file."); + } + + trav = this->children; + while (trav->xlator->children) + trav = trav->xlator->children; + + if (strncmp("storage/", trav->xlator->type, 8)) { + gf_log(this->name, GF_LOG_CRITICAL, + "'locks' translator is not loaded over a storage " + "translator"); + goto out; + } + + priv = GF_CALLOC(1, sizeof(*priv), gf_locks_mt_posix_locks_private_t); + + GF_OPTION_INIT("mandatory-locking", tmp_str, str, out); + if (!strcmp(tmp_str, "forced")) + priv->mandatory_mode = MLK_FORCED; + else if (!strcmp(tmp_str, "file")) + priv->mandatory_mode = MLK_FILE_BASED; + else if (!strcmp(tmp_str, "optimal")) + priv->mandatory_mode = MLK_OPTIMAL; + else + priv->mandatory_mode = MLK_NONE; + + tmp_str = NULL; + + GF_OPTION_INIT("trace", priv->trace, bool, out); + + GF_OPTION_INIT("monkey-unlocking", priv->monkey_unlocking, bool, out); + + GF_OPTION_INIT("revocation-secs", priv->revocation_secs, uint32, out); + + GF_OPTION_INIT("revocation-clear-all", priv->revocation_clear_all, bool, + out); + + GF_OPTION_INIT("revocation-max-blocked", priv->revocation_max_blocked, + uint32, out); + + GF_OPTION_INIT("notify-contention", priv->notify_contention, bool, out); + + GF_OPTION_INIT("notify-contention-delay", priv->notify_contention_delay, + uint32, out); + + GF_OPTION_INIT("enforce-mandatory-lock", priv->mlock_enforced, bool, out); + + this->local_pool = mem_pool_new(pl_local_t, 32); + if (!this->local_pool) { + ret = -1; + gf_log(this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto out; + } + + this->private = priv; + ret = 0; + +out: + if (ret) { + GF_FREE(priv); + } + return ret; } void -dump_posixlks (pl_inode_t *pl_inode) +fini(xlator_t *this) { - pthread_mutex_lock (&pl_inode->mutex); - { - __dump_posixlks (pl_inode); - } - pthread_mutex_unlock (&pl_inode->mutex); + posix_locks_private_t *priv = this->private; + if (!priv) + return; + this->private = NULL; + if (this->local_pool) { + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } + GF_FREE(priv->brickname); + GF_FREE(priv); + + return; +} + +int +pl_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *flock, dict_t *xdata); + +int +pl_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata); + +int +pl_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata); +int +pl_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata); + +int32_t +pl_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, struct iatt *preoldparent, + struct iatt *postoldparent, struct iatt *prenewparent, + struct iatt *postnewparent, dict_t *xdata) +{ + pl_inode_remove_cbk(this, cookie, op_ret < 0 ? 
op_errno : 0); + + PL_STACK_UNWIND(rename, xdata, frame, op_ret, op_errno, buf, preoldparent, + postoldparent, prenewparent, postnewparent, xdata); + + return 0; } int32_t -pl_dump_inode_priv (xlator_t *this, inode_t *inode) +pl_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { + int32_t error; - int ret = -1; - uint64_t tmp_pl_inode = 0; - pl_inode_t *pl_inode = NULL; - char *pathname = NULL; - gf_boolean_t section_added = _gf_false; + error = PL_INODE_REMOVE(rename, frame, this, oldloc, newloc, pl_rename, + pl_rename_cbk, oldloc, newloc, xdata); + if (error > 0) { + STACK_UNWIND_STRICT(rename, frame, -1, error, NULL, NULL, NULL, NULL, + NULL, NULL); + } - int count = 0; + return 0; +} - if (!inode) { - errno = EINVAL; - goto out; +posix_lock_t * +gf_lkmig_info_to_posix_lock(call_frame_t *frame, lock_migration_info_t *lmi) +{ + posix_lock_t *lock = GF_CALLOC(1, sizeof(posix_lock_t), + gf_locks_mt_posix_lock_t); + if (!lock) + goto out; + + lock->fl_start = lmi->flock.l_start; + lock->fl_type = lmi->flock.l_type; + + if (lmi->flock.l_len == 0) + lock->fl_end = LLONG_MAX; + else + lock->fl_end = lmi->flock.l_start + lmi->flock.l_len - 1; + + lock->client = frame->root->client; + + lock->lk_flags = lmi->lk_flags; + + lock->client_uid = gf_strdup(lmi->client_uid); + if (lock->client_uid == NULL) { + GF_FREE(lock); + lock = NULL; + goto out; + } + + lock->client_pid = lmi->flock.l_pid; + lock->owner = lmi->flock.l_owner; + + INIT_LIST_HEAD(&lock->list); + +out: + return lock; +} + +/* This function is supposed to write the active locks from the source brick(in + * rebalance context) and write here. Hence, will add the locks directly to the + * pl_inode->ext_list*/ +int +pl_write_active_locks(call_frame_t *frame, pl_inode_t *pl_inode, + lock_migration_info_t *locklist) +{ + posix_lock_t *newlock = NULL; + lock_migration_info_t *temp = NULL; + int ret = 0; + + pthread_mutex_lock(&pl_inode->mutex); + { + /* Just making sure the activelk list is empty. Should not + * happen though*/ + if (!list_empty(&pl_inode->ext_list)) { + pthread_mutex_unlock(&pl_inode->mutex); + gf_msg(THIS->name, GF_LOG_ERROR, 0, 0, "invalid locks found"); + + ret = -1; + goto out; } - ret = TRY_LOCK (&inode->lock); - if (ret) - goto out; - { - ret = __inode_ctx_get (inode, this, &tmp_pl_inode); - if (ret) - goto unlock; + /* This list also should not be empty */ + if (list_empty(&locklist->list)) { + pthread_mutex_unlock(&pl_inode->mutex); + gf_msg(THIS->name, GF_LOG_ERROR, 0, 0, "empty lock list"); + + ret = -1; + goto out; } -unlock: - UNLOCK (&inode->lock); - if (ret) - goto out; - pl_inode = (pl_inode_t *)(long)tmp_pl_inode; - if (!pl_inode) { + list_for_each_entry(temp, &locklist->list, list) + { + newlock = gf_lkmig_info_to_posix_lock(frame, temp); + if (!newlock) { + pthread_mutex_unlock(&pl_inode->mutex); + gf_msg(THIS->name, GF_LOG_ERROR, 0, 0, + "mem allocation failed for newlock"); + ret = -1; goto out; + } + list_add_tail(&newlock->list, &pl_inode->ext_list); } + } + /*TODO: What if few lock add failed with ENOMEM. 
Should the already + * added locks be clearted */ + pthread_mutex_unlock(&pl_inode->mutex); +out: + return ret; +} - gf_proc_dump_add_section("xlator.features.locks.%s.inode", this->name); - section_added = _gf_true; +static int +pl_setactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc, + lock_migration_info_t *locklist, dict_t *xdata) +{ + int op_ret = 0; + int op_errno = 0; + int ret = 0; - /*We are safe to call __inode_path since we have the - * inode->table->lock */ - __inode_path (inode, NULL, &pathname); - if (pathname) - gf_proc_dump_write ("path", "%s", pathname); + pl_inode_t *pl_inode = pl_inode_get(this, loc->inode, NULL); + if (!pl_inode) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "pl_inode_get failed"); - gf_proc_dump_write("mandatory", "%d", pl_inode->mandatory); + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + ret = pl_write_active_locks(frame, pl_inode, locklist); - ret = pthread_mutex_trylock (&pl_inode->mutex); - if (ret) - goto out; - { - count = __get_entrylk_count (this, pl_inode); - if (count) { - gf_proc_dump_write("entrylk-count", "%d", count); - __dump_entrylks (pl_inode); - } + op_ret = ret; - count = __get_inodelk_count (this, pl_inode, NULL); - if (count) { - gf_proc_dump_write("inodelk-count", "%d", count); - __dump_inodelks (pl_inode); - } +out: + STACK_UNWIND_STRICT(setactivelk, frame, op_ret, op_errno, NULL); - count = __get_posixlk_count (this, pl_inode); - if (count) { - gf_proc_dump_write("posixlk-count", "%d", count); - __dump_posixlks (pl_inode); - } - } - pthread_mutex_unlock (&pl_inode->mutex); + return 0; +} -out: - GF_FREE (pathname); +int32_t +pl_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + pl_inode_remove_cbk(this, cookie, op_ret < 0 ? op_errno : 0); - if (ret && inode) { - if (!section_added) - gf_proc_dump_add_section ("xlator.features.locks.%s." 
- "inode", this->name); - gf_proc_dump_write ("Unable to print lock state", "(Lock " - "acquisition failure) %s", - uuid_utoa (inode->gfid)); - } - return ret; + PL_STACK_UNWIND(unlink, xdata, frame, op_ret, op_errno, preparent, + postparent, xdata); + + return 0; } int32_t -mem_acct_init (xlator_t *this) +pl_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { - int ret = -1; + int32_t error; - if (!this) - return ret; + error = PL_INODE_REMOVE(unlink, frame, this, loc, NULL, pl_unlink, + pl_unlink_cbk, loc, xflag, xdata); + if (error > 0) { + STACK_UNWIND_STRICT(unlink, frame, -1, error, NULL, NULL, NULL); + } - ret = xlator_mem_acct_init (this, gf_locks_mt_end + 1); + return 0; +} - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } +int32_t +pl_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(mkdir, xdata, frame, op_ret, op_errno, inode, + buf, preparent, postparent, xdata); + return 0; +} - return ret; +int +pl_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); + return 0; } +int32_t +pl_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(stat, xdata, frame, op_ret, op_errno, buf, + xdata); + return 0; +} -pl_ctx_t* -pl_ctx_get (client_t *client, xlator_t *xlator) +int +pl_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - void *tmp = NULL; - pl_ctx_t *ctx = NULL; + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; +} - client_ctx_get (client, xlator, &tmp); +int32_t +pl_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(mknod, xdata, frame, op_ret, op_errno, inode, + buf, preparent, postparent, xdata); + return 0; +} - ctx = tmp; +int +pl_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); + return 0; +} - if (ctx != NULL) - goto out; +int32_t +pl_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + pl_inode_remove_cbk(this, cookie, op_ret < 0 ? 
op_errno : 0); - ctx = GF_CALLOC (1, sizeof (pl_ctx_t), gf_locks_mt_posix_lock_t); + PL_STACK_UNWIND_FOR_CLIENT(rmdir, xdata, frame, op_ret, op_errno, preparent, + postparent, xdata); - if (ctx == NULL) - goto out; + return 0; +} - pthread_mutex_init (&ctx->lock, NULL); - INIT_LIST_HEAD (&ctx->inodelk_lockers); - INIT_LIST_HEAD (&ctx->entrylk_lockers); +int +pl_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) +{ + int32_t error; - if (client_ctx_set (client, xlator, ctx) != 0) { - pthread_mutex_destroy (&ctx->lock); - GF_FREE (ctx); - ctx = NULL; - } -out: - return ctx; + error = PL_INODE_REMOVE(rmdir, frame, this, loc, NULL, pl_rmdir, + pl_rmdir_cbk, loc, xflags, xdata); + if (error > 0) { + STACK_UNWIND_STRICT(rmdir, frame, -1, error, NULL, NULL, NULL); + } + + return 0; +} + +int32_t +pl_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(symlink, xdata, frame, op_ret, op_errno, inode, + buf, preparent, postparent, xdata); + return 0; } +int +pl_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_symlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata); + return 0; +} -static int -pl_client_disconnect_cbk (xlator_t *this, client_t *client) +int32_t +pl_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - pl_ctx_t *pl_ctx = NULL; + pl_inode_t *pl_inode = (pl_inode_t *)cookie; - pl_ctx = pl_ctx_get (client, this); + if (op_ret >= 0) { + pthread_mutex_lock(&pl_inode->mutex); - pl_inodelk_client_cleanup (this, pl_ctx); + /* TODO: can happen pl_inode->links == 0 ? 
*/ + if (pl_inode->links >= 0) { + pl_inode->links++; + } - pl_entrylk_client_cleanup (this, pl_ctx); + pthread_mutex_unlock(&pl_inode->mutex); + } - return 0; + PL_STACK_UNWIND_FOR_CLIENT(link, xdata, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; } - -static int -pl_client_destroy_cbk (xlator_t *this, client_t *client) +int +pl_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - void *tmp = NULL; - pl_ctx_t *pl_ctx = NULL; + pl_inode_t *pl_inode; - pl_client_disconnect_cbk (this, client); + pl_inode = pl_inode_get(this, oldloc->inode, NULL); + if (pl_inode == NULL) { + STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, + NULL); + return 0; + } - client_ctx_del (client, this, &tmp); + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), oldloc, newloc); + STACK_WIND_COOKIE(frame, pl_link_cbk, pl_inode, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; +} - if (tmp == NULL) - return 0; +int32_t +pl_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(fsync, xdata, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; +} - pl_ctx = tmp; +int +pl_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; +} - GF_ASSERT (list_empty(&pl_ctx->inodelk_lockers)); - GF_ASSERT (list_empty(&pl_ctx->entrylk_lockers)); +int32_t +pl_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(readdir, xdata, frame, op_ret, op_errno, entries, + xdata); + return 0; +} - pthread_mutex_destroy (&pl_ctx->lock); - GF_FREE (pl_ctx); +int +pl_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata); + return 0; +} - return 0; +int32_t +pl_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(fsyncdir, xdata, frame, op_ret, op_errno, xdata); + return 0; } +int +pl_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_fsyncdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsyncdir, fd, datasync, xdata); + return 0; +} + +int32_t +pl_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct statvfs *buf, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(statfs, xdata, frame, op_ret, op_errno, buf, + xdata); + return 0; +} int -init (xlator_t *this) +pl_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_statfs_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, loc, xdata); + return 0; +} + +int32_t +pl_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) 
{ - posix_locks_private_t *priv = NULL; - xlator_list_t *trav = NULL; - data_t *mandatory = NULL; - data_t *trace = NULL; - int ret = -1; + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: posix-locks should have exactly one child"); - goto out; + local = frame->local; + if (local && local->update_mlock_enforced_flag && op_ret != -1) { + pl_inode = pl_inode_get(this, local->inode, NULL); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; } - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "Volume is dangling. Please check the volume file."); + pthread_mutex_lock(&pl_inode->mutex); + { + pl_inode->mlock_enforced = _gf_false; + pl_inode->check_mlock_info = _gf_false; + pl_inode->track_fop_wind_count = _gf_true; } + pthread_mutex_unlock(&pl_inode->mutex); + } - trav = this->children; - while (trav->xlator->children) - trav = trav->xlator->children; +unwind: + PL_STACK_UNWIND_FOR_CLIENT(removexattr, xdata, frame, op_ret, op_errno, + xdata); + return 0; +} - if (strncmp ("storage/", trav->xlator->type, 8)) { - gf_log (this->name, GF_LOG_CRITICAL, - "'locks' translator is not loaded over a storage " - "translator"); - goto out; - } +int +pl_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + int op_ret = 0; + int op_errno = EINVAL; + posix_locks_private_t *priv = this->private; - priv = GF_CALLOC (1, sizeof (*priv), - gf_locks_mt_posix_locks_private_t); + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); - mandatory = dict_get (this->options, "mandatory-locks"); - if (mandatory) - gf_log (this->name, GF_LOG_WARNING, - "mandatory locks not supported in this minor release."); + PL_CHECK_LOCK_ENFORCE_KEY(frame, ((dict_t *)NULL), name, this, loc, + ((fd_t *)NULL), priv); - trace = dict_get (this->options, "trace"); - if (trace) { - if (gf_string2boolean (trace->data, - &priv->trace) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "'trace' takes on only boolean values."); - goto out; - } + STACK_WIND(frame, pl_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; + +unwind: + PL_STACK_UNWIND_FOR_CLIENT(removexattr, xdata, frame, op_ret, op_errno, + NULL); + + return 0; +} + +int32_t +pl_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; + + local = frame->local; + if (local && local->update_mlock_enforced_flag && op_ret != -1) { + pl_inode = pl_inode_get(this, local->inode, NULL); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; } - this->local_pool = mem_pool_new (pl_local_t, 32); - if (!this->local_pool) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "failed to create local_t's memory pool"); - goto out; + pthread_mutex_lock(&pl_inode->mutex); + { + pl_inode->mlock_enforced = _gf_false; + pl_inode->check_mlock_info = _gf_false; } + pthread_mutex_unlock(&pl_inode->mutex); + } - this->private = priv; - ret = 0; +unwind: + PL_STACK_UNWIND_FOR_CLIENT(fremovexattr, xdata, frame, op_ret, op_errno, + xdata); + return 0; +} -out: - if (ret) { - GF_FREE (priv); - } - return ret; +int +pl_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + int op_ret = -1; + int op_errno = EINVAL; + posix_locks_private_t *priv = this->private; + + 
PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + + PL_CHECK_LOCK_ENFORCE_KEY(frame, ((dict_t *)NULL), name, this, + ((loc_t *)NULL), fd, priv); + + STACK_WIND(frame, pl_fremovexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + return 0; + +unwind: + PL_STACK_UNWIND_FOR_CLIENT(fremovexattr, xdata, frame, op_ret, op_errno, + NULL); + return 0; } +int32_t +pl_rchecksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, uint32_t weak_cksum, + uint8_t *strong_cksum, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(rchecksum, xdata, frame, op_ret, op_errno, + weak_cksum, strong_cksum, xdata); + return 0; +} int -fini (xlator_t *this) +pl_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + int32_t len, dict_t *xdata) { - posix_locks_private_t *priv = NULL; + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_rchecksum_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata); + return 0; +} - priv = this->private; - if (!priv) - return 0; - this->private = NULL; - GF_FREE (priv->brickname); - GF_FREE (priv); +int32_t +pl_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(xattrop, xdata, frame, op_ret, op_errno, dict, + xdata); + return 0; +} - return 0; +int +pl_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_xattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, loc, optype, xattr, xdata); + return 0; } +int32_t +pl_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(fxattrop, xdata, frame, op_ret, op_errno, dict, + xdata); + return 0; +} int -pl_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, - dict_t *xdata); +pl_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_fxattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, fd, optype, xattr, xdata); + return 0; +} + +int32_t +pl_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(setattr, xdata, frame, op_ret, op_errno, statpre, + statpost, xdata); + return 0; +} int -pl_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, - dict_t *xdata); +pl_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; +} + +int32_t +pl_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(fsetattr, xdata, frame, op_ret, op_errno, + statpre, statpost, xdata); + return 0; +} int -pl_entrylk 
(call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata); +pl_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; +} + +int32_t +pl_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(fallocate, xdata, frame, op_ret, op_errno, pre, + post, xdata); + return 0; +} + +int +pl_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size, + off_t offset, size_t len, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset, len, + xdata); + return 0; +} + +int32_t +pl_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, const char *path, + struct iatt *buf, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(readlink, xdata, frame, op_ret, op_errno, path, + buf, xdata); + return 0; +} int -pl_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata); +pl_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_readlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, loc, size, xdata); + return 0; +} + +int32_t +pl_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(access, xdata, frame, op_ret, op_errno, xdata); + return 0; +} + +int +pl_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_access_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->access, loc, mask, xdata); + return 0; +} + +int32_t +pl_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, off_t offset, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(seek, xdata, frame, op_ret, op_errno, offset, + xdata); + return 0; +} + +int32_t +pl_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_seek_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->seek, fd, offset, what, xdata); + return 0; +} struct xlator_fops fops = { - .lookup = pl_lookup, - .create = pl_create, - .truncate = pl_truncate, - .ftruncate = pl_ftruncate, - .open = pl_open, - .readv = pl_readv, - .writev = pl_writev, - .lk = pl_lk, - .inodelk = pl_inodelk, - .finodelk = pl_finodelk, - .entrylk = pl_entrylk, - .fentrylk = pl_fentrylk, - .flush = pl_flush, - .opendir = pl_opendir, - .readdirp = pl_readdirp, - .getxattr = pl_getxattr, - .fgetxattr = pl_fgetxattr, - .fsetxattr = pl_fsetxattr, + .lookup = pl_lookup, + .create = pl_create, + .fstat = pl_fstat, + .truncate = pl_truncate, + .ftruncate = pl_ftruncate, + .discard = pl_discard, + .zerofill = pl_zerofill, + .open = pl_open, + 
.readv = pl_readv, + .writev = pl_writev, + .lk = pl_lk, + .inodelk = pl_inodelk, + .finodelk = pl_finodelk, + .entrylk = pl_entrylk, + .fentrylk = pl_fentrylk, + .flush = pl_flush, + .opendir = pl_opendir, + .readdirp = pl_readdirp, + .setxattr = pl_setxattr, + .fsetxattr = pl_fsetxattr, + .getxattr = pl_getxattr, + .fgetxattr = pl_fgetxattr, + .removexattr = pl_removexattr, + .fremovexattr = pl_fremovexattr, + .rename = pl_rename, + .getactivelk = pl_getactivelk, + .setactivelk = pl_setactivelk, + .unlink = pl_unlink, + .access = pl_access, + .readlink = pl_readlink, + .fallocate = pl_fallocate, + .fsetattr = pl_fsetattr, + .setattr = pl_setattr, + .fxattrop = pl_fxattrop, + .xattrop = pl_xattrop, + .rchecksum = pl_rchecksum, + .statfs = pl_statfs, + .fsyncdir = pl_fsyncdir, + .readdir = pl_readdir, + .symlink = pl_symlink, + .link = pl_link, + .rmdir = pl_rmdir, + .mknod = pl_mknod, + .stat = pl_stat, + .seek = pl_seek, }; struct xlator_dumpops dumpops = { - .inodectx = pl_dump_inode_priv, + .inodectx = pl_dump_inode_priv, }; struct xlator_cbks cbks = { - .forget = pl_forget, - .release = pl_release, - .releasedir = pl_releasedir, - .client_destroy = pl_client_destroy_cbk, - .client_disconnect = pl_client_disconnect_cbk, + .forget = pl_forget, + .release = pl_release, + .releasedir = pl_releasedir, + .client_destroy = pl_client_destroy_cbk, + .client_disconnect = pl_client_disconnect_cbk, }; - struct volume_options options[] = { - { .key = { "mandatory-locks", "mandatory" }, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = { "trace" }, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {NULL} }, + {.key = {"mandatory-locking"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "off", + .op_version = {GD_OP_VERSION_3_8_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"locks"}, + .description = "Specifies the mandatory-locking mode. Valid options " + "are 'file' to use linux style mandatory locks, " + "'forced' to use volume strictly under mandatory lock " + "semantics only and 'optimal' to treat advisory and " + "mandatory locks separately on their own."}, + {.key = {"trace"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"locks"}, + .description = "Trace the different lock requests " + "to logs."}, + {.key = {"monkey-unlocking"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE, + .tags = {"locks"}, + .description = "Ignore a random number of unlock requests. 
Useful " + "for testing/creating robust lock recovery mechanisms."}, + { + .key = {"revocation-secs"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = INT_MAX, + .default_value = "0", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"locks"}, + .description = "Maximum time a lock can be taken out, before" + "being revoked.", + }, + { + .key = {"revocation-clear-all"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"locks"}, + .description = "If set to true, will revoke BOTH granted and blocked " + "(pending) lock requests if a revocation threshold is " + "hit.", + }, + {.key = {"revocation-max-blocked"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = INT_MAX, + .default_value = "0", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"locks"}, + .description = "A number of blocked lock requests after which a lock " + "will be revoked to allow the others to proceed. Can " + "be used in conjunction w/ revocation-clear-all."}, + {.key = {"notify-contention"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "yes", + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .op_version = {GD_OP_VERSION_4_0_0}, + .tags = {"locks", "contention"}, + .description = "When this option is enabled and a lock request " + "conflicts with a currently granted lock, an upcall " + "notification will be sent to the current owner of " + "the lock to request it to be released as soon as " + "possible."}, + {.key = {"notify-contention-delay"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, /* An upcall notification is sent every time a conflict is + * detected. */ + .max = 60, + .default_value = "5", + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .op_version = {GD_OP_VERSION_4_0_0}, + .tags = {"locks", "contention", "timeout"}, + .description = "This value determines the minimum amount of time " + "(in seconds) between upcall contention notifications " + "on the same inode. If multiple lock requests are " + "received during this period, only one upcall will " + "be sent."}, + {.key = {"enforce-mandatory-lock"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .flags = OPT_FLAG_SETTABLE, + .op_version = {GD_OP_VERSION_6_0}, + .description = "option to enable lock enforcement"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "locks", + .category = GF_MAINTAINED, }; diff --git a/xlators/features/locks/src/reservelk.c b/xlators/features/locks/src/reservelk.c index 11abd26d85f..604691fd887 100644 --- a/xlators/features/locks/src/reservelk.c +++ b/xlators/features/locks/src/reservelk.c @@ -7,437 +7,376 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "compat.h" -#include "xlator.h" -#include "inode.h" -#include "logging.h" -#include "common-utils.h" -#include "list.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/xlator.h> +#include <glusterfs/logging.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/list.h> #include "locks.h" #include "common.h" -void -__delete_reserve_lock (posix_lock_t *lock) -{ - list_del (&lock->list); -} - -void -__destroy_reserve_lock (posix_lock_t *lock) -{ - GF_FREE (lock); -} - /* Return true if the two reservelks have exactly same lock boundaries */ int -reservelks_equal (posix_lock_t *l1, posix_lock_t *l2) +reservelks_equal(posix_lock_t *l1, posix_lock_t *l2) { - if ((l1->fl_start == l2->fl_start) && - (l1->fl_end == l2->fl_end)) - return 1; + if ((l1->fl_start == l2->fl_start) && (l1->fl_end == l2->fl_end)) + return 1; - return 0; + return 0; } /* Determine if lock is grantable or not */ static posix_lock_t * -__reservelk_grantable (pl_inode_t *pl_inode, posix_lock_t *lock) +__reservelk_grantable(pl_inode_t *pl_inode, posix_lock_t *lock) { - xlator_t *this = NULL; - posix_lock_t *l = NULL; - posix_lock_t *ret_lock = NULL; - - this = THIS; - - if (list_empty (&pl_inode->reservelk_list)) { - gf_log (this->name, GF_LOG_TRACE, - "No reservelks in list"); - goto out; - } - list_for_each_entry (l, &pl_inode->reservelk_list, list){ - if (reservelks_equal (lock, l)) { - ret_lock = l; - break; - } + xlator_t *this = THIS; + posix_lock_t *l = NULL; + posix_lock_t *ret_lock = NULL; + + if (list_empty(&pl_inode->reservelk_list)) { + gf_log(this->name, GF_LOG_TRACE, "No reservelks in list"); + goto out; + } + list_for_each_entry(l, &pl_inode->reservelk_list, list) + { + if (reservelks_equal(lock, l)) { + ret_lock = l; + break; } + } out: - return ret_lock; + return ret_lock; } -static inline int -__same_owner_reservelk (posix_lock_t *l1, posix_lock_t *l2) +static int +__same_owner_reservelk(posix_lock_t *l1, posix_lock_t *l2) { - return (is_same_lkowner (&l1->owner, &l2->owner)); - + return (is_same_lkowner(&l1->owner, &l2->owner)); } static posix_lock_t * -__matching_reservelk (pl_inode_t *pl_inode, posix_lock_t *lock) +__matching_reservelk(pl_inode_t *pl_inode, posix_lock_t *lock) { - posix_lock_t *l = NULL; + posix_lock_t *l = NULL; - if (list_empty (&pl_inode->reservelk_list)) { - gf_log ("posix-locks", GF_LOG_TRACE, - "reservelk list empty"); - return NULL; - } + if (list_empty(&pl_inode->reservelk_list)) { + gf_log("posix-locks", GF_LOG_TRACE, "reservelk list empty"); + return NULL; + } - list_for_each_entry (l, &pl_inode->reservelk_list, list) { - if (reservelks_equal (l, lock)) { - gf_log ("posix-locks", GF_LOG_TRACE, - "equal reservelk found"); - break; - } + list_for_each_entry(l, &pl_inode->reservelk_list, list) + { + if (reservelks_equal(l, lock)) { + gf_log("posix-locks", GF_LOG_TRACE, "equal reservelk found"); + break; } + } - return l; + return l; } static int -__reservelk_conflict (xlator_t *this, pl_inode_t *pl_inode, - posix_lock_t *lock) +__reservelk_conflict(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock) { - posix_lock_t *conf = NULL; - int ret = 0; - - conf = __matching_reservelk (pl_inode, lock); - if (conf) { - gf_log (this->name, GF_LOG_TRACE, - "Matching reservelk found"); - if (__same_owner_reservelk (lock, conf)) { - list_del_init (&conf->list); - gf_log (this->name, GF_LOG_TRACE, - "Removing the matching reservelk for setlk to 
progress"); - GF_FREE (conf); - ret = 0; - } else { - gf_log (this->name, GF_LOG_TRACE, - "Conflicting reservelk found"); - ret = 1; - } - + int ret = 0; + + posix_lock_t *conf = __matching_reservelk(pl_inode, lock); + if (conf) { + gf_log(this->name, GF_LOG_TRACE, "Matching reservelk found"); + if (__same_owner_reservelk(lock, conf)) { + list_del_init(&conf->list); + gf_log(this->name, GF_LOG_TRACE, + "Removing the matching reservelk for setlk to progress"); + __destroy_lock(conf); + ret = 0; + } else { + gf_log(this->name, GF_LOG_TRACE, "Conflicting reservelk found"); + ret = 1; } - return ret; - + } + return ret; } int -pl_verify_reservelk (xlator_t *this, pl_inode_t *pl_inode, - posix_lock_t *lock, int can_block) +pl_verify_reservelk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, + const int can_block) { - int ret = 0; - - pthread_mutex_lock (&pl_inode->mutex); - { - if (__reservelk_conflict (this, pl_inode, lock)) { - gf_log (this->name, GF_LOG_TRACE, - "Found conflicting reservelk. Blocking until reservelk is unlocked."); - lock->blocked = can_block; - list_add_tail (&lock->list, &pl_inode->blocked_calls); - ret = -1; - goto unlock; - } - - gf_log (this->name, GF_LOG_TRACE, - "no conflicting reservelk found. Call continuing"); - ret = 0; - + int ret = 0; + + pthread_mutex_lock(&pl_inode->mutex); + { + if (__reservelk_conflict(this, pl_inode, lock)) { + lock->blocked = can_block; + list_add_tail(&lock->list, &pl_inode->blocked_calls); + pthread_mutex_unlock(&pl_inode->mutex); + gf_log(this->name, GF_LOG_TRACE, + "Found conflicting reservelk. Blocking until reservelk is " + "unlocked."); + ret = -1; + goto out; } -unlock: - pthread_mutex_unlock (&pl_inode->mutex); - - return ret; - + } + pthread_mutex_unlock(&pl_inode->mutex); + gf_log(this->name, GF_LOG_TRACE, + "no conflicting reservelk found. Call continuing"); + ret = 0; +out: + return ret; } - /* Determines if lock can be granted and adds the lock. If the lock * is blocking, adds it to the blocked_reservelks. */ static int -__lock_reservelk (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, - int can_block) +__lock_reservelk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, + const int can_block) { - posix_lock_t *conf = NULL; - int ret = -EINVAL; - - conf = __reservelk_grantable (pl_inode, lock); - if (conf){ - ret = -EAGAIN; - if (can_block == 0) - goto out; + int ret = -EINVAL; - list_add_tail (&lock->list, &pl_inode->blocked_reservelks); + posix_lock_t *conf = __reservelk_grantable(pl_inode, lock); + if (conf) { + ret = -EAGAIN; + if (can_block == 0) + goto out; - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => Blocked", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); + list_add_tail(&lock->list, &pl_inode->blocked_reservelks); + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) lk-owner:%s %" PRId64 " - %" PRId64 " => Blocked", + lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", lock->client_pid, + lkowner_utoa(&lock->owner), lock->user_flock.l_start, + lock->user_flock.l_len); - goto out; - } + goto out; + } - list_add (&lock->list, &pl_inode->reservelk_list); + list_add(&lock->list, &pl_inode->reservelk_list); - ret = 0; + ret = 0; out: - return ret; + return ret; } static posix_lock_t * -find_matching_reservelk (posix_lock_t *lock, pl_inode_t *pl_inode) +find_matching_reservelk(posix_lock_t *lock, pl_inode_t *pl_inode) { - posix_lock_t *l = NULL; - list_for_each_entry (l, &pl_inode->reservelk_list, list) { - if (reservelks_equal (l, lock)) - return l; - } - return NULL; + posix_lock_t *l = NULL; + list_for_each_entry(l, &pl_inode->reservelk_list, list) + { + if (reservelks_equal(l, lock)) + return l; + } + return NULL; } /* Set F_UNLCK removes a lock which has the exact same lock boundaries * as the UNLCK lock specifies. If such a lock is not found, returns invalid */ static posix_lock_t * -__reserve_unlock_lock (xlator_t *this, posix_lock_t *lock, pl_inode_t *pl_inode) +__reserve_unlock_lock(xlator_t *this, posix_lock_t *lock, pl_inode_t *pl_inode) { - - posix_lock_t *conf = NULL; - - conf = find_matching_reservelk (lock, pl_inode); - if (!conf) { - gf_log (this->name, GF_LOG_DEBUG, - " Matching lock not found for unlock"); - goto out; - } - __delete_reserve_lock (conf); - gf_log (this->name, GF_LOG_DEBUG, - " Matching lock found for unlock"); + posix_lock_t *conf = find_matching_reservelk(lock, pl_inode); + if (!conf) { + gf_log(this->name, GF_LOG_DEBUG, " Matching lock not found for unlock"); + goto out; + } + __delete_lock(conf); + gf_log(this->name, GF_LOG_DEBUG, " Matching lock found for unlock"); out: - return conf; - - + return conf; } static void -__grant_blocked_reserve_locks (xlator_t *this, pl_inode_t *pl_inode, - struct list_head *granted) +__grant_blocked_reserve_locks(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted) { - int bl_ret = 0; - posix_lock_t *bl = NULL; - posix_lock_t *tmp = NULL; - - struct list_head blocked_list; + int bl_ret = 0; + posix_lock_t *bl = NULL; + posix_lock_t *tmp = NULL; - INIT_LIST_HEAD (&blocked_list); - list_splice_init (&pl_inode->blocked_reservelks, &blocked_list); + struct list_head blocked_list; - list_for_each_entry_safe (bl, tmp, &blocked_list, list) { + INIT_LIST_HEAD(&blocked_list); + list_splice_init(&pl_inode->blocked_reservelks, &blocked_list); - list_del_init (&bl->list); + list_for_each_entry_safe(bl, tmp, &blocked_list, list) + { + list_del_init(&bl->list); - bl_ret = __lock_reservelk (this, pl_inode, bl, 1); + bl_ret = __lock_reservelk(this, pl_inode, bl, 1); - if (bl_ret == 0) { - list_add (&bl->list, granted); - } + if (bl_ret == 0) { + list_add(&bl->list, granted); } - return; + } + return; } /* Grant all reservelks blocked on lock(s) */ void -grant_blocked_reserve_locks (xlator_t *this, pl_inode_t *pl_inode) +grant_blocked_reserve_locks(xlator_t *this, pl_inode_t *pl_inode) { - struct list_head granted; - posix_lock_t *lock = NULL; - posix_lock_t *tmp = NULL; + struct list_head granted; + posix_lock_t *lock = NULL; + posix_lock_t *tmp = NULL; - INIT_LIST_HEAD (&granted); - - if (list_empty (&pl_inode->blocked_reservelks)) { - gf_log (this->name, GF_LOG_TRACE, - "No blocked locks to be granted"); - return; - } - - pthread_mutex_lock (&pl_inode->mutex); - { - __grant_blocked_reserve_locks (this, pl_inode, &granted); - } - pthread_mutex_unlock (&pl_inode->mutex); - - list_for_each_entry_safe (lock, tmp, &granted, list) { - gf_log (this->name, GF_LOG_TRACE, - "%s 
(pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => Granted", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - - STACK_UNWIND_STRICT (lk, lock->frame, 0, 0, &lock->user_flock, - NULL); - } + INIT_LIST_HEAD(&granted); + if (list_empty(&pl_inode->blocked_reservelks)) { + gf_log(this->name, GF_LOG_TRACE, "No blocked locks to be granted"); + return; + } + + pthread_mutex_lock(&pl_inode->mutex); + { + __grant_blocked_reserve_locks(this, pl_inode, &granted); + } + pthread_mutex_unlock(&pl_inode->mutex); + + list_for_each_entry_safe(lock, tmp, &granted, list) + { + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => Granted", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid, + lkowner_utoa(&lock->owner), lock->user_flock.l_start, + lock->user_flock.l_len); + + STACK_UNWIND_STRICT(lk, lock->frame, 0, 0, &lock->user_flock, NULL); + } } static void -__grant_blocked_lock_calls (xlator_t *this, pl_inode_t *pl_inode, - struct list_head *granted) +__grant_blocked_lock_calls(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted) { - int bl_ret = 0; - posix_lock_t *bl = NULL; - posix_lock_t *tmp = NULL; + int bl_ret = 0; + posix_lock_t *bl = NULL; + posix_lock_t *tmp = NULL; - struct list_head blocked_list; + struct list_head blocked_list; - INIT_LIST_HEAD (&blocked_list); - list_splice_init (&pl_inode->blocked_reservelks, &blocked_list); + INIT_LIST_HEAD(&blocked_list); + list_splice_init(&pl_inode->blocked_reservelks, &blocked_list); - list_for_each_entry_safe (bl, tmp, &blocked_list, list) { + list_for_each_entry_safe(bl, tmp, &blocked_list, list) + { + list_del_init(&bl->list); - list_del_init (&bl->list); + bl_ret = pl_verify_reservelk(this, pl_inode, bl, bl->blocked); - bl_ret = pl_verify_reservelk (this, pl_inode, bl, bl->blocked); - - if (bl_ret == 0) { - list_add_tail (&bl->list, granted); - } + if (bl_ret == 0) { + list_add_tail(&bl->list, granted); } - return; + } + return; } void -grant_blocked_lock_calls (xlator_t *this, pl_inode_t *pl_inode) +grant_blocked_lock_calls(xlator_t *this, pl_inode_t *pl_inode) { - struct list_head granted; - posix_lock_t *lock = NULL; - posix_lock_t *tmp = NULL; - fd_t *fd = NULL; - - int can_block = 0; - int32_t cmd = 0; - int ret = 0; - - if (list_empty (&pl_inode->blocked_calls)) { - gf_log (this->name, GF_LOG_TRACE, - "No blocked lock calls to be granted"); - return; - } + struct list_head granted; + posix_lock_t *lock = NULL; + posix_lock_t *tmp = NULL; + fd_t *fd = NULL; - pthread_mutex_lock (&pl_inode->mutex); - { - __grant_blocked_lock_calls (this, pl_inode, &granted); - } - pthread_mutex_unlock (&pl_inode->mutex); - - list_for_each_entry_safe (lock, tmp, &granted, list) { - fd = fd_from_fdnum (lock); - - if (lock->blocked) { - can_block = 1; - cmd = F_SETLKW; - } - else - cmd = F_SETLK; - - lock->blocked = 0; - ret = pl_setlk (this, pl_inode, lock, can_block); - if (ret == -1) { - if (can_block) { - pl_trace_block (this, lock->frame, fd, NULL, - cmd, &lock->user_flock, NULL); - continue; - } else { - gf_log (this->name, GF_LOG_DEBUG, "returning EAGAIN"); - pl_trace_out (this, lock->frame, fd, NULL, cmd, - &lock->user_flock, -1, EAGAIN, NULL); - pl_update_refkeeper (this, fd->inode); - STACK_UNWIND_STRICT (lk, lock->frame, -1, - EAGAIN, &lock->user_flock, - NULL); - __destroy_lock (lock); - } - } + int can_block = 0; + int32_t cmd = 0; + int ret = 0; + if 
(list_empty(&pl_inode->blocked_calls)) { + gf_log(this->name, GF_LOG_TRACE, "No blocked lock calls to be granted"); + return; + } + + pthread_mutex_lock(&pl_inode->mutex); + { + __grant_blocked_lock_calls(this, pl_inode, &granted); + } + pthread_mutex_unlock(&pl_inode->mutex); + + list_for_each_entry_safe(lock, tmp, &granted, list) + { + fd = fd_from_fdnum(lock); + + if (lock->blocked) { + can_block = 1; + cmd = F_SETLKW; + } else + cmd = F_SETLK; + + lock->blocked = 0; + ret = pl_setlk(this, pl_inode, lock, can_block); + if (ret == -1) { + if (can_block) { + continue; + } else { + gf_log(this->name, GF_LOG_DEBUG, "returning EAGAIN"); + pl_trace_out(this, lock->frame, fd, NULL, cmd, + &lock->user_flock, -1, EAGAIN, NULL); + pl_update_refkeeper(this, fd->inode); + STACK_UNWIND_STRICT(lk, lock->frame, -1, EAGAIN, + &lock->user_flock, NULL); + __destroy_lock(lock); + } } - + } } - int -pl_reserve_unlock (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock) +pl_reserve_unlock(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock) { - posix_lock_t *retlock = NULL; - int ret = -1; - - pthread_mutex_lock (&pl_inode->mutex); - { - retlock = __reserve_unlock_lock (this, lock, pl_inode); - if (!retlock) { - gf_log (this->name, GF_LOG_DEBUG, - "Bad Unlock issued on Inode lock"); - ret = -EINVAL; - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "Reservelk Unlock successful"); - __destroy_reserve_lock (retlock); - ret = 0; + posix_lock_t *retlock = NULL; + int ret = -1; + + pthread_mutex_lock(&pl_inode->mutex); + { + retlock = __reserve_unlock_lock(this, lock, pl_inode); + if (!retlock) { + pthread_mutex_unlock(&pl_inode->mutex); + gf_log(this->name, GF_LOG_DEBUG, "Bad Unlock issued on Inode lock"); + ret = -EINVAL; + goto out; } -out: - pthread_mutex_unlock (&pl_inode->mutex); - - grant_blocked_reserve_locks (this, pl_inode); - grant_blocked_lock_calls (this, pl_inode); - return ret; + gf_log(this->name, GF_LOG_TRACE, "Reservelk Unlock successful"); + __destroy_lock(retlock); + ret = 0; + } + pthread_mutex_unlock(&pl_inode->mutex); +out: + grant_blocked_reserve_locks(this, pl_inode); + grant_blocked_lock_calls(this, pl_inode); + return ret; } int -pl_reserve_setlk (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, - int can_block) +pl_reserve_setlk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, + int can_block) { - int ret = -EINVAL; - - pthread_mutex_lock (&pl_inode->mutex); - { - - ret = __lock_reservelk (this, pl_inode, lock, can_block); - if (ret < 0) - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => NOK", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - else - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => OK", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->fl_start, - lock->fl_end); - - } - pthread_mutex_unlock (&pl_inode->mutex); - return ret; + int ret = -EINVAL; + + pthread_mutex_lock(&pl_inode->mutex); + { + ret = __lock_reservelk(this, pl_inode, lock, can_block); + } + pthread_mutex_unlock(&pl_inode->mutex); + + if (ret < 0) + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => NOK", + lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", lock->client_pid, + lkowner_utoa(&lock->owner), lock->user_flock.l_start, + lock->user_flock.l_len); + else + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => OK", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid, + lkowner_utoa(&lock->owner), lock->fl_start, lock->fl_end); + + return ret; } diff --git a/xlators/features/locks/tests/unit-test.c b/xlators/features/locks/tests/unit-test.c index d2cca32dec3..d285b12b5aa 100644 --- a/xlators/features/locks/tests/unit-test.c +++ b/xlators/features/locks/tests/unit-test.c @@ -7,59 +7,71 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "compat.h" -#include "xlator.h" -#include "inode.h" -#include "logging.h" -#include "common-utils.h" -#include "list.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/xlator.h> +#include <glusterfs/logging.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/list.h> #include "locks.h" #include "common.h" -#define expect(cond) if (!(cond)) { goto out; } +#define expect(cond) \ + if (!(cond)) { \ + goto out; \ + } -extern int lock_name (pl_inode_t *, const char *, entrylk_type); -extern int unlock_name (pl_inode_t *, const char *, entrylk_type); +extern int +lock_name(pl_inode_t *, const char *, entrylk_type); +extern int +unlock_name(pl_inode_t *, const char *, entrylk_type); -int main (int argc, char **argv) +int +main(int argc, char **argv) { - int ret = 1; - int r = -1; + int ret = 1; + int r = -1; + + pl_inode_t *pinode = CALLOC(sizeof(pl_inode_t), 1); + pthread_mutex_init(&pinode->dir_lock_mutex, NULL); + INIT_LIST_HEAD(&pinode->gf_dir_locks); - pl_inode_t *pinode = CALLOC (sizeof (pl_inode_t), 1); - pthread_mutex_init (&pinode->dir_lock_mutex, NULL); - INIT_LIST_HEAD (&pinode->gf_dir_locks); + r = lock_name(pinode, NULL, ENTRYLK_WRLCK); + expect(r == 0); + { + r = lock_name(pinode, "foo", ENTRYLK_WRLCK); + expect(r == -EAGAIN); + } + r = unlock_name(pinode, NULL, ENTRYLK_WRLCK); + expect(r == 0); - r = lock_name (pinode, NULL, ENTRYLK_WRLCK); expect (r == 0); - { - r = lock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == -EAGAIN); - } - r = unlock_name (pinode, NULL, ENTRYLK_WRLCK); expect (r == 0); + r = lock_name(pinode, "foo", ENTRYLK_RDLCK); + expect(r == 0); + { + r = lock_name(pinode, "foo", ENTRYLK_RDLCK); + expect(r == 0); + { + r = lock_name(pinode, "foo", ENTRYLK_WRLCK); + expect(r == -EAGAIN); + } + r = unlock_name(pinode, "foo", ENTRYLK_RDLCK); + expect(r == 0); + } + r = unlock_name(pinode, "foo", ENTRYLK_RDLCK); + expect(r == 0); - r = lock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0); - { - r = lock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0); - { - r = lock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == -EAGAIN); - } - r = unlock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0); - } - r = unlock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0); - - r = lock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == 0); - r = unlock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == 0); + r = lock_name(pinode, "foo", ENTRYLK_WRLCK); + expect(r == 0); + r = unlock_name(pinode, "foo", ENTRYLK_WRLCK); + expect(r == 0); - r = lock_name (pinode, "baz", ENTRYLK_WRLCK); expect (r == 0); - r = lock_name (pinode, "baz", ENTRYLK_RDLCK); expect (r == -EAGAIN); + r = lock_name(pinode, 
"baz", ENTRYLK_WRLCK); + expect(r == 0); + r = lock_name(pinode, "baz", ENTRYLK_RDLCK); + expect(r == -EAGAIN); - ret = 0; + ret = 0; out: - return ret; + return ret; } |

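For readers following the reworked unit test above: the entrylk behaviour it exercises reduces to a simple compatibility rule. The sketch below is illustrative only, with hypothetical names and simplified state; the real lock_name()/unlock_name() in entrylk.c keep per-basename lists of granted and blocked locks rather than the counters used here.

/* Illustrative sketch (not the xlator's code): how basename locks combine.
 * A NULL basename stands for the whole directory, read locks on the same
 * basename may coexist, and a write lock excludes anything overlapping it.
 * Returns 0 when a new request is compatible, -EAGAIN when it would block,
 * mirroring the return convention of lock_name() in the test above. */
#include <errno.h>
#include <string.h>

struct name_lock {
    const char *basename; /* NULL means "all names in the directory" */
    int readers;          /* granted read locks on this basename */
    int writers;          /* granted write locks (0 or 1) */
};

static int
names_overlap(const char *a, const char *b)
{
    /* NULL overlaps every name; otherwise only identical names clash. */
    return (a == NULL || b == NULL || strcmp(a, b) == 0);
}

static int
would_grant(const struct name_lock *held, const char *basename, int is_write)
{
    if (!names_overlap(held->basename, basename))
        return 0; /* disjoint basenames never conflict */
    if (held->writers || (is_write && held->readers))
        return -EAGAIN; /* writer held, or write request against readers */
    return 0; /* read locks on the same basename share */
}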