/* Copyright (c) 2006-2012 Red Hat, Inc. This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser General Public License, version 3 or any later version (LGPLv3 or later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ #include #include #include #include #include #include #include "locks.h" #include "clear.h" #include "common.h" void __delete_inode_lock(pl_inode_lock_t *lock) { list_del_init(&lock->list); } static void __pl_inodelk_ref(pl_inode_lock_t *lock) { lock->ref++; } void __pl_inodelk_unref(pl_inode_lock_t *lock) { lock->ref--; if (!lock->ref) { GF_FREE(lock->connection_id); GF_FREE(lock); } } /* Check if 2 inodelks are conflicting on type. Only 2 shared locks don't * conflict */ static int inodelk_type_conflict(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { if (l2->fl_type == F_WRLCK || l1->fl_type == F_WRLCK) return 1; return 0; } void pl_print_inodelk(char *str, int size, int cmd, struct gf_flock *flock, const char *domain) { char *cmd_str = NULL; char *type_str = NULL; switch (cmd) { #if F_GETLK != F_GETLK64 case F_GETLK64: #endif case F_GETLK: cmd_str = "GETLK"; break; #if F_SETLK != F_SETLK64 case F_SETLK64: #endif case F_SETLK: cmd_str = "SETLK"; break; #if F_SETLKW != F_SETLKW64 case F_SETLKW64: #endif case F_SETLKW: cmd_str = "SETLKW"; break; default: cmd_str = "UNKNOWN"; break; } switch (flock->l_type) { case F_RDLCK: type_str = "READ"; break; case F_WRLCK: type_str = "WRITE"; break; case F_UNLCK: type_str = "UNLOCK"; break; default: type_str = "UNKNOWN"; break; } snprintf(str, size, "lock=INODELK, cmd=%s, type=%s, " "domain: %s, start=%llu, len=%llu, pid=%llu", cmd_str, type_str, domain, (unsigned long long)flock->l_start, (unsigned long long)flock->l_len, (unsigned long long)flock->l_pid); } /* Determine if the two inodelks overlap reach other's lock regions */ static int inodelk_overlap(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start)); } /* Returns true if the 2 inodelks have the same owner */ static int same_inodelk_owner(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { return (is_same_lkowner(&l1->owner, &l2->owner) && (l1->client == l2->client)); } /* Returns true if the 2 inodelks conflict with each other */ static int inodelk_conflict(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { return (inodelk_overlap(l1, l2) && inodelk_type_conflict(l1, l2)); } /* * Check to see if the candidate lock overlaps/conflicts with the * requested lock. If so, determine how old the lock is and return * true if it exceeds the configured threshold, false otherwise. */ static inline gf_boolean_t __stale_inodelk(xlator_t *this, pl_inode_lock_t *candidate_lock, pl_inode_lock_t *requested_lock, time_t *lock_age_sec) { posix_locks_private_t *priv = NULL; priv = this->private; /* Question: Should we just prune them all given the * chance? Or just the locks we are attempting to acquire? */ if (inodelk_conflict(candidate_lock, requested_lock)) { *lock_age_sec = gf_time() - candidate_lock->granted_time; if (*lock_age_sec > priv->revocation_secs) return _gf_true; } return _gf_false; } /* Examine any locks held on this inode and potentially revoke the lock * if the age exceeds revocation_secs. We will clear _only_ those locks * which are granted, and then grant those locks which are blocked. * * Depending on how this patch works in the wild, we may expand this and * introduce a heuristic which clears blocked locks as well if they * are beyond a threshold. */ static gf_boolean_t __inodelk_prune_stale(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, pl_inode_lock_t *lock) { posix_locks_private_t *priv = NULL; pl_inode_lock_t *tmp = NULL; pl_inode_lock_t *lk = NULL; gf_boolean_t revoke_lock = _gf_false; int bcount = 0; int gcount = 0; int op_errno = 0; clrlk_args args; args.opts = NULL; time_t lk_age_sec = 0; uint32_t max_blocked = 0; char *reason_str = NULL; priv = this->private; args.type = CLRLK_INODE; if (priv->revocation_clear_all == _gf_true) args.kind = CLRLK_ALL; else args.kind = CLRLK_GRANTED; if (list_empty(&dom->inodelk_list)) goto out; pthread_mutex_lock(&pinode->mutex); list_for_each_entry_safe(lk, tmp, &dom->inodelk_list, list) { if (__stale_inodelk(this, lk, lock, &lk_age_sec) == _gf_true) { revoke_lock = _gf_true; reason_str = "age"; break; } } max_blocked = priv->revocation_max_blocked; if (max_blocked != 0 && revoke_lock == _gf_false) { list_for_each_entry_safe(lk, tmp, &dom->blocked_inodelks, blocked_locks) { max_blocked--; if (max_blocked == 0) { revoke_lock = _gf_true; reason_str = "max blocked"; break; } } } pthread_mutex_unlock(&pinode->mutex); out: if (revoke_lock == _gf_true) { clrlk_clear_inodelk(this, pinode, dom, &args, &bcount, &gcount, &op_errno); gf_log(this->name, GF_LOG_WARNING, "Lock revocation [reason: %s; gfid: %s; domain: %s; " "age: %ld sec] - Inode lock revoked: %d granted & %d " "blocked locks cleared", reason_str, uuid_utoa(pinode->gfid), dom->domain, lk_age_sec, gcount, bcount); } return revoke_lock; } void inodelk_contention_notify_check(xlator_t *this, pl_inode_lock_t *lock, struct timespec *now, struct list_head *contend) { posix_locks_private_t *priv; int64_t elapsed; priv = this->private; /* If this lock is in a list, it means that we are about to send a * notification for it, so no need to do anything else. */ if (!list_empty(&lock->contend)) { return; } elapsed = now->tv_sec; elapsed -= lock->contention_time.tv_sec; if (now->tv_nsec < lock->contention_time.tv_nsec) { elapsed--; } if (elapsed < priv->notify_contention_delay) { return; } /* All contention notifications will be sent outside of the locked * region. This means that currently granted locks might have already * been unlocked by that time. To avoid the lock or the inode to be * destroyed before we process them, we take an additional reference * on both. */ inode_ref(lock->pl_inode->inode); __pl_inodelk_ref(lock); lock->contention_time = *now; list_add_tail(&lock->contend, contend); } void inodelk_contention_notify(xlator_t *this, struct list_head *contend) { struct gf_upcall up; struct gf_upcall_inodelk_contention lc; pl_inode_lock_t *lock; pl_inode_t *pl_inode; client_t *client; gf_boolean_t notify; while (!list_empty(contend)) { lock = list_first_entry(contend, pl_inode_lock_t, contend); pl_inode = lock->pl_inode; pthread_mutex_lock(&pl_inode->mutex); /* If the lock has already been released, no notification is * sent. We clear the notification time in this case. */ notify = !list_empty(&lock->list); if (!notify) { lock->contention_time.tv_sec = 0; lock->contention_time.tv_nsec = 0; } else { memcpy(&lc.flock, &lock->user_flock, sizeof(lc.flock)); lc.pid = lock->client_pid; lc.domain = lock->volume; lc.xdata = NULL; gf_uuid_copy(up.gfid, lock->pl_inode->gfid); client = (client_t *)lock->client; if (client == NULL) { /* A NULL client can be found if the inodelk * was issued by a server side xlator. */ up.client_uid = NULL; } else { up.client_uid = client->client_uid; } } pthread_mutex_unlock(&pl_inode->mutex); if (notify) { up.event_type = GF_UPCALL_INODELK_CONTENTION; up.data = &lc; if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) { gf_msg_debug(this->name, 0, "Inodelk contention notification " "failed"); } else { gf_msg_debug(this->name, 0, "Inodelk contention notification " "sent"); } } pthread_mutex_lock(&pl_inode->mutex); list_del_init(&lock->contend); __pl_inodelk_unref(lock); pthread_mutex_unlock(&pl_inode->mutex); inode_unref(pl_inode->inode); } } /* Determine if lock is grantable or not */ static pl_inode_lock_t * __inodelk_grantable(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock, struct timespec *now, struct list_head *contend) { pl_inode_lock_t *l = NULL; pl_inode_lock_t *ret = NULL; list_for_each_entry(l, &dom->inodelk_list, list) { if (inodelk_conflict(lock, l) && !same_inodelk_owner(lock, l)) { if (ret == NULL) { ret = l; if (contend == NULL) { break; } } inodelk_contention_notify_check(this, l, now, contend); } } return ret; } static pl_inode_lock_t * __blocked_lock_conflict(pl_dom_list_t *dom, pl_inode_lock_t *lock) { pl_inode_lock_t *l = NULL; list_for_each_entry(l, &dom->blocked_inodelks, blocked_locks) { if (inodelk_conflict(lock, l)) { return l; } } return NULL; } static int __owner_has_lock(pl_dom_list_t *dom, pl_inode_lock_t *newlock) { pl_inode_lock_t *lock = NULL; list_for_each_entry(lock, &dom->inodelk_list, list) { if (same_inodelk_owner(lock, newlock)) return 1; } list_for_each_entry(lock, &dom->blocked_inodelks, blocked_locks) { if (same_inodelk_owner(lock, newlock)) return 1; } return 0; } static int __lock_blocked_add(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock, int can_block) { if (can_block == 0) { goto out; } lock->blkd_time = gf_time(); list_add_tail(&lock->blocked_locks, &dom->blocked_inodelks); gf_msg_trace(this->name, 0, "%s (pid=%d) (lk-owner=%s) %" PRId64 " - " "%" PRId64 " => Blocked", lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid, lkowner_utoa(&lock->owner), lock->user_flock.l_start, lock->user_flock.l_len); pl_trace_block(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock, lock->volume); out: return -EAGAIN; } /* Determines if lock can be granted and adds the lock. If the lock * is blocking, adds it to the blocked_inodelks list of the domain. */ static int __lock_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom, struct timespec *now, struct list_head *contend) { pl_inode_lock_t *conf = NULL; int ret; ret = pl_inode_remove_inodelk(pl_inode, lock); if (ret < 0) { return ret; } if (ret == 0) { conf = __inodelk_grantable(this, dom, lock, now, contend); } if ((ret > 0) || (conf != NULL)) { return __lock_blocked_add(this, dom, lock, can_block); } /* To prevent blocked locks starvation, check if there are any blocked * locks thay may conflict with this lock. If there is then don't grant * the lock. BUT grant the lock if the owner already has lock to allow * nested locks. * Example: * SHD from Machine1 takes (gfid, 0-infinity) and is granted. * SHD from machine2 takes (gfid, 0-infinity) and is blocked. * When SHD from Machine1 takes (gfid, 0-128KB) it * needs to be granted, without which the earlier lock on 0-infinity * will not be unlocked by SHD from Machine1. * TODO: Find why 'owner_has_lock' is checked even for blocked locks. */ if (__blocked_lock_conflict(dom, lock) && !(__owner_has_lock(dom, lock))) { if (can_block != 0) { gf_log(this->name, GF_LOG_DEBUG, "Lock is grantable, but blocking to prevent " "starvation"); } return __lock_blocked_add(this, dom, lock, can_block); } __pl_inodelk_ref(lock); lock->granted_time = gf_time(); list_add(&lock->list, &dom->inodelk_list); return 0; } /* Return true if the two inodelks have exactly same lock boundaries */ static int inodelks_equal(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { if ((l1->fl_start == l2->fl_start) && (l1->fl_end == l2->fl_end)) return 1; return 0; } static pl_inode_lock_t * find_matching_inodelk(pl_inode_lock_t *lock, pl_dom_list_t *dom) { pl_inode_lock_t *l = NULL; list_for_each_entry(l, &dom->inodelk_list, list) { if (inodelks_equal(l, lock) && same_inodelk_owner(l, lock)) return l; } return NULL; } /* Set F_UNLCK removes a lock which has the exact same lock boundaries * as the UNLCK lock specifies. If such a lock is not found, returns invalid */ static pl_inode_lock_t * __inode_unlock_lock(xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom) { pl_inode_lock_t *conf = NULL; inode_t *inode = NULL; inode = lock->pl_inode->inode; conf = find_matching_inodelk(lock, dom); if (!conf) { gf_log(this->name, GF_LOG_ERROR, " Matching lock not found for unlock %llu-%llu, by %s " "on %p for gfid:%s", (unsigned long long)lock->fl_start, (unsigned long long)lock->fl_end, lkowner_utoa(&lock->owner), lock->client, inode ? uuid_utoa(inode->gfid) : "UNKNOWN"); goto out; } __delete_inode_lock(conf); gf_log(this->name, GF_LOG_DEBUG, " Matching lock found for unlock %llu-%llu, by %s on %p for gfid:%s", (unsigned long long)lock->fl_start, (unsigned long long)lock->fl_end, lkowner_utoa(&lock->owner), lock->client, inode ? uuid_utoa(inode->gfid) : "UNKNOWN"); out: return conf; } void __grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, struct list_head *granted, pl_dom_list_t *dom, struct timespec *now, struct list_head *contend) { pl_inode_lock_t *bl = NULL; pl_inode_lock_t *tmp = NULL; struct list_head blocked_list; INIT_LIST_HEAD(&blocked_list); list_splice_init(&dom->blocked_inodelks, &blocked_list); list_for_each_entry_safe(bl, tmp, &blocked_list, blocked_locks) { list_del_init(&bl->blocked_locks); bl->status = __lock_inodelk(this, pl_inode, bl, 1, dom, now, contend); if (bl->status != -EAGAIN) { list_add_tail(&bl->blocked_locks, granted); } } } void unwind_granted_inodes(xlator_t *this, pl_inode_t *pl_inode, struct list_head *granted) { pl_inode_lock_t *lock; pl_inode_lock_t *tmp; int32_t op_ret; int32_t op_errno; list_for_each_entry_safe(lock, tmp, granted, blocked_locks) { if (lock->status == 0) { op_ret = 0; op_errno = 0; gf_log(this->name, GF_LOG_TRACE, "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => Granted", lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid, lkowner_utoa(&lock->owner), lock->user_flock.l_start, lock->user_flock.l_len); } else { op_ret = -1; op_errno = -lock->status; } pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock, op_ret, op_errno, lock->volume); STACK_UNWIND_STRICT(inodelk, lock->frame, op_ret, op_errno, NULL); lock->frame = NULL; } pthread_mutex_lock(&pl_inode->mutex); { list_for_each_entry_safe(lock, tmp, granted, blocked_locks) { list_del_init(&lock->blocked_locks); __pl_inodelk_unref(lock); } } pthread_mutex_unlock(&pl_inode->mutex); } /* Grant all inodelks blocked on a lock */ void grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, struct timespec *now, struct list_head *contend) { struct list_head granted; INIT_LIST_HEAD(&granted); pthread_mutex_lock(&pl_inode->mutex); { __grant_blocked_inode_locks(this, pl_inode, &granted, dom, now, contend); } pthread_mutex_unlock(&pl_inode->mutex); unwind_granted_inodes(this, pl_inode, &granted); } static void pl_inodelk_log_cleanup(pl_inode_lock_t *lock) { pl_inode_t *pl_inode = NULL; pl_inode = lock->pl_inode; gf_log(THIS->name, GF_LOG_WARNING, "releasing lock on %s held by " "{client=%p, pid=%" PRId64 " lk-owner=%s}", uuid_utoa(pl_inode->gfid), lock->client, (uint64_t)lock->client_pid, lkowner_utoa(&lock->owner)); } /* Release all inodelks from this client */ int pl_inodelk_client_cleanup(xlator_t *this, pl_ctx_t *ctx) { posix_locks_private_t *priv; pl_inode_lock_t *tmp = NULL; pl_inode_lock_t *l = NULL; pl_dom_list_t *dom = NULL; pl_inode_t *pl_inode = NULL; struct list_head *pcontend = NULL; struct list_head released; struct list_head unwind; struct list_head contend; struct timespec now = {}; priv = this->private; INIT_LIST_HEAD(&released); INIT_LIST_HEAD(&unwind); if (priv->notify_contention) { pcontend = &contend; INIT_LIST_HEAD(pcontend); timespec_now(&now); } pthread_mutex_lock(&ctx->lock); { list_for_each_entry_safe(l, tmp, &ctx->inodelk_lockers, client_list) { pl_inodelk_log_cleanup(l); pl_inode = l->pl_inode; pthread_mutex_lock(&pl_inode->mutex); { /* If the inodelk object is part of granted list but not * blocked list, then perform the following actions: * i. delete the object from granted list; * ii. grant other locks (from other clients) that may * have been blocked on this inodelk; and * iii. unref the object. * * If the inodelk object (L1) is part of both granted * and blocked lists, then this means that a parallel * unlock on another inodelk (L2 say) may have 'granted' * L1 and added it to 'granted' list in * __grant_blocked_inode_locks() (although using the * 'blocked_locks' member). In that case, the cleanup * codepath must try and grant other overlapping * blocked inodelks from other clients, now that L1 is * out of their way and then unref L1 in the end, and * leave it to the other thread (the one executing * unlock codepath) to unwind L1's frame, delete it from * blocked_locks list, and perform the last unref on L1. * * If the inodelk object (L1) is part of blocked list * only, the cleanup code path must: * i. delete it from the blocked_locks list inside * this critical section, * ii. unwind its frame with EAGAIN, * iii. try and grant blocked inode locks from other * clients that were otherwise grantable, but just * got blocked to avoid leaving L1 to starve * forever. * iv. unref the object. */ list_del_init(&l->client_list); if (!list_empty(&l->list)) { __delete_inode_lock(l); list_add_tail(&l->client_list, &released); } else { list_del_init(&l->blocked_locks); list_add_tail(&l->client_list, &unwind); } } pthread_mutex_unlock(&pl_inode->mutex); } } pthread_mutex_unlock(&ctx->lock); if (!list_empty(&unwind)) { list_for_each_entry_safe(l, tmp, &unwind, client_list) { list_del_init(&l->client_list); if (l->frame) STACK_UNWIND_STRICT(inodelk, l->frame, -1, EAGAIN, NULL); list_add_tail(&l->client_list, &released); } } if (!list_empty(&released)) { list_for_each_entry_safe(l, tmp, &released, client_list) { list_del_init(&l->client_list); pl_inode = l->pl_inode; dom = get_domain(pl_inode, l->volume); grant_blocked_inode_locks(this, pl_inode, dom, &now, pcontend); pthread_mutex_lock(&pl_inode->mutex); { __pl_inodelk_unref(l); } pthread_mutex_unlock(&pl_inode->mutex); inode_unref(pl_inode->inode); } } if (pcontend != NULL) { inodelk_contention_notify(this, pcontend); } return 0; } static int pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom, inode_t *inode) { posix_locks_private_t *priv = NULL; int ret = -EINVAL; pl_inode_lock_t *retlock = NULL; gf_boolean_t unref = _gf_true; gf_boolean_t need_inode_unref = _gf_false; struct list_head *pcontend = NULL; struct list_head contend; struct list_head wake; struct timespec now = {}; short fl_type; lock->pl_inode = pl_inode; fl_type = lock->fl_type; priv = this->private; /* Ideally, AFTER a successful lock (both blocking and non-blocking) or * an unsuccessful blocking lock operation, the inode needs to be ref'd. * * But doing so might give room to a race where the lock-requesting * client could send a DISCONNECT just before this thread refs the inode * after the locking is done, and the epoll thread could unref the inode * in cleanup which means the inode's refcount would come down to 0, and * the call to pl_forget() at this point destroys @pl_inode. Now when * the io-thread executing this function tries to access pl_inode, * it could crash on account of illegal memory access. * * To get around this problem, the inode is ref'd once even before * adding the lock into client_list as a precautionary measure. * This way even if there are DISCONNECTs, there will always be 1 extra * ref on the inode, so @pl_inode is still alive until after the * current stack unwinds. */ pl_inode->inode = inode_ref(inode); if (priv->revocation_secs != 0) { if (lock->fl_type != F_UNLCK) { __inodelk_prune_stale(this, pl_inode, dom, lock); } else if (priv->monkey_unlocking == _gf_true) { if (pl_does_monkey_want_stuck_lock()) { pthread_mutex_lock(&pl_inode->mutex); { __pl_inodelk_unref(lock); } pthread_mutex_unlock(&pl_inode->mutex); inode_unref(pl_inode->inode); gf_log(this->name, GF_LOG_WARNING, "MONKEY LOCKING (forcing stuck lock)!"); return 0; } } } if (priv->notify_contention) { pcontend = &contend; INIT_LIST_HEAD(pcontend); timespec_now(&now); } INIT_LIST_HEAD(&wake); if (ctx) pthread_mutex_lock(&ctx->lock); pthread_mutex_lock(&pl_inode->mutex); { if (lock->fl_type != F_UNLCK) { ret = __lock_inodelk(this, pl_inode, lock, can_block, dom, &now, pcontend); if (ret == 0) { lock->frame = NULL; gf_log(this->name, GF_LOG_TRACE, "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => OK", lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid, lkowner_utoa(&lock->owner), lock->fl_start, lock->fl_end); } else if (ret == -EAGAIN) { gf_log(this->name, GF_LOG_TRACE, "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => NOK", lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid, lkowner_utoa(&lock->owner), lock->user_flock.l_start, lock->user_flock.l_len); if (can_block) { unref = _gf_false; } } /* For all but the case where a non-blocking lock attempt fails * with -EAGAIN, the extra ref taken at the start of this function * must be negated. */ need_inode_unref = (ret != 0) && ((ret != -EAGAIN) || !can_block); if (ctx && !need_inode_unref) { list_add_tail(&lock->client_list, &ctx->inodelk_lockers); } } else { /* Irrespective of whether unlock succeeds or not, * the extra inode ref that was done at the start of * this function must be negated. Towards this, * @need_inode_unref flag is set unconditionally here. */ need_inode_unref = _gf_true; retlock = __inode_unlock_lock(this, lock, dom); if (!retlock) { gf_log(this->name, GF_LOG_DEBUG, "Bad Unlock issued on Inode lock"); ret = -EINVAL; goto out; } list_del_init(&retlock->client_list); __pl_inodelk_unref(retlock); pl_inode_remove_unlocked(this, pl_inode, &wake); ret = 0; } out: if (unref) __pl_inodelk_unref(lock); } pthread_mutex_unlock(&pl_inode->mutex); if (ctx) pthread_mutex_unlock(&ctx->lock); pl_inode_remove_wake(&wake); /* The following (extra) unref corresponds to the ref that * was done at the time the lock was granted. */ if ((fl_type == F_UNLCK) && (ret == 0)) { inode_unref(pl_inode->inode); grant_blocked_inode_locks(this, pl_inode, dom, &now, pcontend); } if (need_inode_unref) { inode_unref(pl_inode->inode); } if (pcontend != NULL) { inodelk_contention_notify(this, pcontend); } return ret; } /* Create a new inode_lock_t */ static pl_inode_lock_t * new_inode_lock(struct gf_flock *flock, client_t *client, pid_t client_pid, call_frame_t *frame, xlator_t *this, const char *volume, char *conn_id, int32_t *op_errno) { pl_inode_lock_t *lock = NULL; if (!pl_is_lk_owner_valid(&frame->root->lk_owner, frame->root->client)) { *op_errno = EINVAL; goto out; } lock = GF_CALLOC(1, sizeof(*lock), gf_locks_mt_pl_inode_lock_t); if (!lock) { *op_errno = ENOMEM; goto out; } lock->fl_start = flock->l_start; lock->fl_type = flock->l_type; if (flock->l_len == 0) lock->fl_end = LLONG_MAX; else lock->fl_end = flock->l_start + flock->l_len - 1; lock->client = client; lock->client_pid = client_pid; lock->volume = volume; lock->owner = frame->root->lk_owner; lock->frame = frame; lock->this = this; if (conn_id) { lock->connection_id = gf_strdup(conn_id); } INIT_LIST_HEAD(&lock->list); INIT_LIST_HEAD(&lock->blocked_locks); INIT_LIST_HEAD(&lock->client_list); INIT_LIST_HEAD(&lock->contend); __pl_inodelk_ref(lock); out: return lock; } int32_t _pl_convert_volume(const char *volume, char **res) { char *mdata_vol = NULL; int ret = 0; mdata_vol = strrchr(volume, ':'); // if the volume already ends with :metadata don't bother if (mdata_vol && (strcmp(mdata_vol, ":metadata") == 0)) return 0; ret = gf_asprintf(res, "%s:metadata", volume); if (ret <= 0) return ENOMEM; return 0; } int32_t _pl_convert_volume_for_special_range(struct gf_flock *flock, const char *volume, char **res) { int32_t ret = 0; if ((flock->l_start == LLONG_MAX - 1) && (flock->l_len == 0)) { ret = _pl_convert_volume(volume, res); } return ret; } /* Common inodelk code called from pl_inodelk and pl_finodelk */ int pl_common_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, inode_t *inode, int32_t cmd, struct gf_flock *flock, loc_t *loc, fd_t *fd, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; int ret = -1; GF_UNUSED int dict_ret = -1; int can_block = 0; short lock_type = 0; pl_inode_t *pinode = NULL; pl_inode_lock_t *reqlock = NULL; pl_dom_list_t *dom = NULL; char *res = NULL; char *res1 = NULL; char *conn_id = NULL; pl_ctx_t *ctx = NULL; if (xdata) dict_ret = dict_get_str(xdata, "connection-id", &conn_id); VALIDATE_OR_GOTO(frame, out); VALIDATE_OR_GOTO(inode, unwind); VALIDATE_OR_GOTO(flock, unwind); if ((flock->l_start < 0) || (flock->l_len < 0)) { op_errno = EINVAL; goto unwind; } op_errno = _pl_convert_volume_for_special_range(flock, volume, &res); if (op_errno) goto unwind; if (res) volume = res; pl_trace_in(this, frame, fd, loc, cmd, flock, volume); if (frame->root->client) { ctx = pl_ctx_get(frame->root->client, this); if (!ctx) { op_errno = ENOMEM; gf_log(this->name, GF_LOG_INFO, "pl_ctx_get() failed"); goto unwind; } } pinode = pl_inode_get(this, inode, NULL); if (!pinode) { op_errno = ENOMEM; goto unwind; } dom = get_domain(pinode, volume); if (!dom) { op_errno = ENOMEM; goto unwind; } reqlock = new_inode_lock(flock, frame->root->client, frame->root->pid, frame, this, dom->domain, conn_id, &op_errno); if (!reqlock) { op_ret = -1; goto unwind; } switch (cmd) { case F_SETLKW: can_block = 1; /* fall through */ case F_SETLK: lock_type = flock->l_type; memcpy(&reqlock->user_flock, flock, sizeof(struct gf_flock)); ret = pl_inode_setlk(this, ctx, pinode, reqlock, can_block, dom, inode); if (ret < 0) { if (ret == -EAGAIN) { if (can_block && (F_UNLCK != lock_type)) { goto out; } gf_log(this->name, GF_LOG_TRACE, "returning EAGAIN"); } else { gf_log(this->name, GF_LOG_TRACE, "returning %d", ret); } op_errno = -ret; goto unwind; } break; default: op_errno = ENOTSUP; gf_log(this->name, GF_LOG_DEBUG, "Lock command F_GETLK not supported for [f]inodelk " "(cmd=%d)", cmd); goto unwind; } op_ret = 0; unwind: if (flock != NULL) pl_trace_out(this, frame, fd, loc, cmd, flock, op_ret, op_errno, volume); STACK_UNWIND_STRICT(inodelk, frame, op_ret, op_errno, NULL); out: GF_FREE(res); GF_FREE(res1); return 0; } int pl_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { pl_common_inodelk(frame, this, volume, loc->inode, cmd, flock, loc, NULL, xdata); return 0; } int pl_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { pl_common_inodelk(frame, this, volume, fd->inode, cmd, flock, NULL, fd, xdata); return 0; } static int32_t __get_inodelk_dom_count(pl_dom_list_t *dom) { pl_inode_lock_t *lock = NULL; int32_t count = 0; list_for_each_entry(lock, &dom->inodelk_list, list) { count++; } list_for_each_entry(lock, &dom->blocked_inodelks, blocked_locks) { count++; } return count; } /* Returns the no. of locks (blocked/granted) held on a given domain name * If @domname is NULL, returns the no. of locks in all the domains present. * If @domname is non-NULL and non-existent, returns 0 */ int32_t __get_inodelk_count(xlator_t *this, pl_inode_t *pl_inode, char *domname) { int32_t count = 0; pl_dom_list_t *dom = NULL; list_for_each_entry(dom, &pl_inode->dom_list, inode_list) { if (domname) { if (strcmp(domname, dom->domain) == 0) { count = __get_inodelk_dom_count(dom); goto out; } } else { /* Counting locks from all domains */ count += __get_inodelk_dom_count(dom); } } out: return count; } int32_t get_inodelk_count(xlator_t *this, inode_t *inode, char *domname) { pl_inode_t *pl_inode = NULL; uint64_t tmp_pl_inode = 0; int ret = 0; int32_t count = 0; ret = inode_ctx_get(inode, this, &tmp_pl_inode); if (ret != 0) { goto out; } pl_inode = (pl_inode_t *)(long)tmp_pl_inode; pthread_mutex_lock(&pl_inode->mutex); { count = __get_inodelk_count(this, pl_inode, domname); } pthread_mutex_unlock(&pl_inode->mutex); out: return count; }