summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorXavier Hernandez <xhernandez@datalab.es>2016-06-15 14:42:19 +0200
committerPranith Kumar Karampuri <pkarampu@redhat.com>2018-01-16 10:37:22 +0000
commit7ba7a4b27d124f4ee16fe4776a4670cd5b0160c4 (patch)
tree73cca7226576b7ce3e480dc991aa70b9cc7cab07
parent3e9a9c029fac359477fb26d9cc7803749ba038b2 (diff)
locks: added inodelk/entrylk contention upcall notifications
The locks xlator now is able to send a contention notification to the current owner of the lock. This is only a notification that can be used to improve performance of some client side operations that might benefit from extended duration of lock ownership. Nothing is done if the lock owner decides to ignore the message and to not release the lock. For forced release of acquired resources, leases must be used. Change-Id: I7f1ad32a0b4b445505b09908a050080ad848f8e0 Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
-rw-r--r--libglusterfs/src/upcall-utils.h17
-rw-r--r--rpc/rpc-lib/src/protocol-common.h2
-rw-r--r--rpc/xdr/src/glusterfs3.h156
-rw-r--r--rpc/xdr/src/glusterfs4-xdr.x17
-rw-r--r--rpc/xdr/src/libgfxdr.sym4
-rw-r--r--tests/basic/ec/lock-contention.t62
-rw-r--r--tests/volume.rc12
-rw-r--r--xlators/cluster/ec/src/ec-common.c253
-rw-r--r--xlators/cluster/ec/src/ec-common.h1
-rw-r--r--xlators/cluster/ec/src/ec-types.h2
-rw-r--r--xlators/cluster/ec/src/ec.c77
-rw-r--r--xlators/features/locks/src/clear.c36
-rw-r--r--xlators/features/locks/src/common.h12
-rw-r--r--xlators/features/locks/src/entrylk.c290
-rw-r--r--xlators/features/locks/src/inodelk.c309
-rw-r--r--xlators/features/locks/src/locks.h8
-rw-r--r--xlators/features/locks/src/pl-messages.h4
-rw-r--r--xlators/features/locks/src/posix.c40
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c20
-rw-r--r--xlators/protocol/client/src/client-callback.c97
-rw-r--r--xlators/protocol/client/src/client-messages.h4
-rw-r--r--xlators/protocol/server/src/server.c44
22 files changed, 1203 insertions, 264 deletions
diff --git a/libglusterfs/src/upcall-utils.h b/libglusterfs/src/upcall-utils.h
index 3b5dce33e45..48d10382c10 100644
--- a/libglusterfs/src/upcall-utils.h
+++ b/libglusterfs/src/upcall-utils.h
@@ -63,6 +63,8 @@ typedef enum {
GF_UPCALL_EVENT_NULL,
GF_UPCALL_CACHE_INVALIDATION,
GF_UPCALL_RECALL_LEASE,
+ GF_UPCALL_INODELK_CONTENTION,
+ GF_UPCALL_ENTRYLK_CONTENTION,
} gf_upcall_event_t;
struct gf_upcall {
@@ -88,4 +90,19 @@ struct gf_upcall_recall_lease {
dict_t *dict;
};
+struct gf_upcall_inodelk_contention {
+ struct gf_flock flock;
+ pid_t pid;
+ const char *domain;
+ dict_t *xdata;
+};
+
+struct gf_upcall_entrylk_contention {
+ uint32_t type;
+ pid_t pid;
+ const char *name;
+ const char *domain;
+ dict_t *xdata;
+};
+
#endif /* _UPCALL_UTILS_H */
diff --git a/rpc/rpc-lib/src/protocol-common.h b/rpc/rpc-lib/src/protocol-common.h
index aee34302205..aafd94400c6 100644
--- a/rpc/rpc-lib/src/protocol-common.h
+++ b/rpc/rpc-lib/src/protocol-common.h
@@ -150,6 +150,8 @@ enum gf_cbk_procnum {
GF_CBK_CHILD_DOWN,
GF_CBK_RECALL_LEASE,
GF_CBK_STATEDUMP,
+ GF_CBK_INODELK_CONTENTION,
+ GF_CBK_ENTRYLK_CONTENTION,
GF_CBK_MAXVALUE,
};
diff --git a/rpc/xdr/src/glusterfs3.h b/rpc/xdr/src/glusterfs3.h
index 2da5594a347..eef39416b5c 100644
--- a/rpc/xdr/src/glusterfs3.h
+++ b/rpc/xdr/src/glusterfs3.h
@@ -419,6 +419,162 @@ gf_proto_cache_invalidation_to_upcall (xlator_t *this,
return ret;
}
+static inline int
+gf_proto_inodelk_contention_to_upcall (struct gfs4_inodelk_contention_req *lc,
+ struct gf_upcall *gf_up_data)
+{
+ struct gf_upcall_inodelk_contention *tmp = NULL;
+ xlator_t *this = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ this = THIS;
+
+ GF_VALIDATE_OR_GOTO(this->name, lc, out);
+ GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out);
+
+ tmp = (struct gf_upcall_inodelk_contention *)gf_up_data->data;
+
+ gf_uuid_copy(gf_up_data->gfid, (unsigned char *)lc->gfid);
+
+ gf_proto_flock_to_flock(&lc->flock, &tmp->flock);
+ tmp->pid = lc->pid;
+ tmp->domain = lc->domain;
+ if ((tmp->domain != NULL) && (*tmp->domain == 0)) {
+ tmp->domain = NULL;
+ }
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, tmp->xdata, lc->xdata.xdata_val,
+ lc->xdata.xdata_len, ret, op_errno, out);
+
+ ret = 0;
+
+out:
+ if (ret < 0) {
+ ret = -op_errno;
+ }
+
+ return ret;
+}
+
+static inline int
+gf_proto_inodelk_contention_from_upcall (xlator_t *this,
+ struct gfs4_inodelk_contention_req *lc,
+ struct gf_upcall *gf_up_data)
+{
+ struct gf_upcall_inodelk_contention *tmp = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ GF_VALIDATE_OR_GOTO(this->name, lc, out);
+ GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out);
+
+ tmp = (struct gf_upcall_inodelk_contention *)gf_up_data->data;
+
+ gf_uuid_copy((unsigned char *)lc->gfid, gf_up_data->gfid);
+
+ gf_proto_flock_from_flock(&lc->flock, &tmp->flock);
+ lc->pid = tmp->pid;
+ lc->domain = (char *)tmp->domain;
+ if (lc->domain == NULL) {
+ lc->domain = "";
+ }
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, tmp->xdata, &lc->xdata.xdata_val,
+ lc->xdata.xdata_len, op_errno, out);
+
+ ret = 0;
+
+out:
+ if (ret < 0) {
+ ret = -op_errno;
+ }
+
+ return ret;
+}
+
+static inline int
+gf_proto_entrylk_contention_to_upcall (struct gfs4_entrylk_contention_req *lc,
+ struct gf_upcall *gf_up_data)
+{
+ struct gf_upcall_entrylk_contention *tmp = NULL;
+ xlator_t *this = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ this = THIS;
+
+ GF_VALIDATE_OR_GOTO(this->name, lc, out);
+ GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out);
+
+ tmp = (struct gf_upcall_entrylk_contention *)gf_up_data->data;
+
+ gf_uuid_copy(gf_up_data->gfid, (unsigned char *)lc->gfid);
+
+ tmp->type = lc->type;
+ tmp->name = lc->name;
+ if ((tmp->name != NULL) && (*tmp->name == 0)) {
+ tmp->name = NULL;
+ }
+ tmp->pid = lc->pid;
+ tmp->domain = lc->domain;
+ if ((tmp->domain != NULL) && (*tmp->domain == 0)) {
+ tmp->domain = NULL;
+ }
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, tmp->xdata, lc->xdata.xdata_val,
+ lc->xdata.xdata_len, ret, op_errno, out);
+
+ ret = 0;
+
+out:
+ if (ret < 0) {
+ ret = -op_errno;
+ }
+
+ return ret;
+}
+
+static inline int
+gf_proto_entrylk_contention_from_upcall (xlator_t *this,
+ struct gfs4_entrylk_contention_req *lc,
+ struct gf_upcall *gf_up_data)
+{
+ struct gf_upcall_entrylk_contention *tmp = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ GF_VALIDATE_OR_GOTO(this->name, lc, out);
+ GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out);
+
+ tmp = (struct gf_upcall_entrylk_contention *)gf_up_data->data;
+
+ gf_uuid_copy((unsigned char *)lc->gfid, gf_up_data->gfid);
+
+ lc->type = tmp->type;
+ lc->name = (char *)tmp->name;
+ if (lc->name == NULL) {
+ lc->name = "";
+ }
+ lc->pid = tmp->pid;
+ lc->domain = (char *)tmp->domain;
+ if (lc->domain == NULL) {
+ lc->domain = "";
+ }
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, tmp->xdata, &lc->xdata.xdata_val,
+ lc->xdata.xdata_len, op_errno, out);
+
+ ret = 0;
+
+out:
+ if (ret < 0) {
+ ret = -op_errno;
+ }
+
+ return ret;
+}
+
extern int dict_to_xdr (dict_t *this, gfx_dict *xdict);
extern int xdr_to_dict (gfx_dict *xdict, dict_t **to);
diff --git a/rpc/xdr/src/glusterfs4-xdr.x b/rpc/xdr/src/glusterfs4-xdr.x
index 7396b566fa7..ef0cfde0802 100644
--- a/rpc/xdr/src/glusterfs4-xdr.x
+++ b/rpc/xdr/src/glusterfs4-xdr.x
@@ -86,3 +86,20 @@ struct gfs4_namelink_req {
string bname<>;
opaque xdata<>;
};
+
+struct gfs4_inodelk_contention_req {
+ opaque gfid[16];
+ struct gf_proto_flock flock;
+ unsigned int pid;
+ string domain<>;
+ opaque xdata<>;
+};
+
+struct gfs4_entrylk_contention_req {
+ opaque gfid[16];
+ unsigned int type;
+ unsigned int pid;
+ string name<>;
+ string domain<>;
+ opaque xdata<>;
+};
diff --git a/rpc/xdr/src/libgfxdr.sym b/rpc/xdr/src/libgfxdr.sym
index 8af956ef5a9..83f1efc732a 100644
--- a/rpc/xdr/src/libgfxdr.sym
+++ b/rpc/xdr/src/libgfxdr.sym
@@ -155,8 +155,12 @@ xdr_gfs3_xattrop_req
xdr_gfs3_xattrop_rsp
xdr_gfs3_zerofill_req
xdr_gfs3_zerofill_rsp
+xdr_gfs4_entrylk_contention_req
+xdr_gfs4_entrylk_contention_rsp
xdr_gfs4_icreate_req
xdr_gfs4_icreate_rsp
+xdr_gfs4_inodelk_contention_req
+xdr_gfs4_inodelk_contention_rsp
xdr_gfs4_namelink_req
xdr_gfs4_namelink_rsp
xdr_gf_set_lk_ver_req
diff --git a/tests/basic/ec/lock-contention.t b/tests/basic/ec/lock-contention.t
new file mode 100644
index 00000000000..8f86cee16ad
--- /dev/null
+++ b/tests/basic/ec/lock-contention.t
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# This test verifies that when 'lock-notify-contention' option is enabled,
+# locks xlator actually sends an upcall notification that causes the acquired
+# lock from one client to be released before it's supposed to when another
+# client accesses the file.
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+function elapsed_time() {
+ local start="`date +%s`"
+
+ if [[ "test" == `cat "$1"` ]]; then
+ echo "$((`date +%s` - ${start}))"
+ fi
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2}
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 performance.quick-read off
+TEST $CLI volume set $V0 performance.read-ahead off
+TEST $CLI volume set $V0 performance.io-cache off
+TEST $CLI volume set $V0 features.locks-notify-contention off
+TEST $CLI volume set $V0 disperse.eager-lock on
+TEST $CLI volume set $V0 disperse.eager-lock-timeout 6
+TEST $CLI volume set $V0 disperse.other-eager-lock on
+TEST $CLI volume set $V0 disperse.other-eager-lock-timeout 6
+TEST $CLI volume start $V0
+
+TEST $GFS --direct-io-mode=yes --volfile-id=/$V0 --volfile-server=$H0 $M0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M0
+
+TEST $GFS --direct-io-mode=yes --volfile-id=/$V0 --volfile-server=$H0 $M1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M1
+
+TEST $(echo "test" >$M0/file)
+
+# With locks-notify-contention set to off, accessing the file from another
+# client should take 6 seconds. Checking against 3 seconds to be safe.
+elapsed="$(elapsed_time $M1/file)"
+TEST [[ ${elapsed} -ge 3 ]]
+
+elapsed="$(elapsed_time $M0/file)"
+TEST [[ ${elapsed} -ge 3 ]]
+
+TEST $CLI volume set $V0 features.locks-notify-contention on
+
+# With locks-notify-contention set to on, accessing the file from another
+# client should be fast. Checking against 3 seconds to be safe.
+elapsed="$(elapsed_time $M1/file)"
+TEST [[ ${elapsed} -le 3 ]]
+
+elapsed="$(elapsed_time $M0/file)"
+TEST [[ ${elapsed} -le 3 ]]
+
+cleanup
diff --git a/tests/volume.rc b/tests/volume.rc
index 3a48b43d2e0..3ee83624058 100644
--- a/tests/volume.rc
+++ b/tests/volume.rc
@@ -93,7 +93,8 @@ function remove_brick_status_completed_field {
function get_mount_process_pid {
local vol=$1
- ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol " | awk '{print $2}' | head -1
+ local mnt=$2
+ ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol .*$mnt" | awk '{print $2}' | head -1
}
function get_nfs_pid ()
@@ -126,7 +127,8 @@ function generate_statedump {
function generate_mount_statedump {
local vol=$1
- generate_statedump $(get_mount_process_pid $vol)
+ local mnt=$2
+ generate_statedump $(get_mount_process_pid $vol $mnt)
}
function cleanup_mount_statedump {
@@ -205,14 +207,16 @@ function ec_child_up_status {
local vol=$1
local dist_id=$2
local brick_id=$(($3 + 1))
- local mask=$(ec_get_info $vol $dist_id "childs_up_mask" $(generate_mount_statedump $vol))
+ local mnt=$4
+ local mask=$(ec_get_info $vol $dist_id "childs_up_mask" $(generate_mount_statedump $vol $mnt))
echo "${mask: -$brick_id:1}"
}
function ec_child_up_count {
local vol=$1
local dist_id=$2
- ec_get_info $vol $dist_id "childs_up" $(generate_mount_statedump $vol)
+ local mnt=$3
+ ec_get_info $vol $dist_id "childs_up" $(generate_mount_statedump $vol $mnt)
}
function ec_child_up_status_shd {
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index cb627a92c9c..fbc7ac97aa0 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -1847,6 +1847,67 @@ gf_boolean_t ec_lock_acquire(ec_lock_link_t *link)
return _gf_true;
}
+static ec_lock_link_t *
+ec_lock_timer_cancel(xlator_t *xl, ec_lock_t *lock)
+{
+ ec_lock_link_t *timer_link;
+
+ /* If we don't have any timer, there's nothing to cancel. */
+ if (lock->timer == NULL) {
+ return NULL;
+ }
+
+ /* We are trying to access a lock that has an unlock timer active.
+ * This means that the lock must be idle, i.e. no fop can be in the
+ * owner, waiting or frozen lists. It also means that the lock cannot
+ * have been marked as being released (this is done without timers).
+ * There should only be one owner reference, but it's possible that
+ * some fops are being prepared to use this lock. */
+ GF_ASSERT ((lock->refs_owners == 1) &&
+ list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+ /* We take the timer_link before cancelling the timer, since a
+ * successful cancellation will destroy it. It must not be NULL
+ * because it references the fop responsible for the delayed unlock
+ * that we are currently trying to cancel. */
+ timer_link = lock->timer->data;
+ GF_ASSERT(timer_link != NULL);
+
+ if (gf_timer_call_cancel(xl->ctx, lock->timer) < 0) {
+ /* It's too late to avoid the execution of the timer callback.
+ * Since we need to be sure that the callback has access to all
+ * needed resources, we cannot resume the execution of the
+ * timer fop now. This will be done in the callback. */
+ timer_link = NULL;
+ } else {
+ /* The timer has been cancelled. The fop referenced by
+ * timer_link holds the last reference. The caller is
+ * responsible to release it when not needed anymore. */
+ ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+ }
+
+ /* We have two options here:
+ *
+ * 1. The timer has been successfully cancelled.
+ *
+ * This is the easiest case and we can continue with the currently
+ * acquired lock.
+ *
+ * 2. The timer callback has already been fired.
+ *
+ * In this case we have not been able to cancel the timer before
+ * the timer callback has been fired, but we also know that
+ * lock->timer != NULL. This means that the timer callback is still
+ * trying to acquire the inode mutex that we currently own. We are
+ * safe until we release it. In this case we can safely clear
+ * lock->timer. This will cause that the timer callback does nothing
+ * once it acquires the mutex.
+ */
+ lock->timer = NULL;
+
+ return timer_link;
+}
+
static gf_boolean_t
ec_lock_assign_owner(ec_lock_link_t *link)
{
@@ -1891,61 +1952,7 @@ ec_lock_assign_owner(ec_lock_link_t *link)
* empty. */
GF_ASSERT(list_empty(&lock->frozen));
- if (lock->timer != NULL) {
- /* We are trying to acquire a lock that has an unlock timer active.
- * This means that the lock must be idle, i.e. no fop can be in the
- * owner, waiting or frozen lists. It also means that the lock cannot
- * have been marked as being released (this is done without timers).
- * There should only be one owner reference, but it's possible that
- * some fops are being prepared to use this lock.
- */
- GF_ASSERT ((lock->refs_owners == 1) &&
- list_empty(&lock->owners) && list_empty(&lock->waiting));
-
- /* We take the timer_link before cancelling the timer, since a
- * successful cancellation will destroy it. It must not be NULL
- * because it references the fop responsible for the delayed unlock
- * that we are currently trying to cancel. */
- timer_link = lock->timer->data;
- GF_ASSERT(timer_link != NULL);
-
- if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) < 0) {
- /* It's too late to avoid the execution of the timer callback.
- * Since we need to be sure that the callback has access to all
- * needed resources, we cannot resume the execution of the timer
- * fop now. This will be done in the callback.
- */
- timer_link = NULL;
- } else {
- /* The timer has been cancelled, so we need to release the owner
- * reference that was held by the fop waiting for the timer. This
- * can be the last reference, but we'll immediately increment it
- * for the current fop, so no need to check it.
- */
- lock->refs_owners--;
-
- ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
- }
-
- /* We have two options here:
- *
- * 1. The timer has been successfully cancelled.
- *
- * This is the easiest case and we can continue with the currently
- * acquired lock.
- *
- * 2. The timer callback has already been fired.
- *
- * In this case we have not been able to cancel the timer before
- * the timer callback has been fired, but we also know that
- * lock->timer != NULL. This means that the timer callback is still
- * trying to acquire the inode mutex that we currently own. We are
- * safe until we release it. In this case we can safely clear
- * lock->timer. This will cause that the timer callback does nothing
- * once it acquires the mutex.
- */
- lock->timer = NULL;
- }
+ timer_link = ec_lock_timer_cancel(fop->xl, lock);
if (!list_empty(&lock->owners)) {
/* There are other owners of this lock. We can only take ownership if
@@ -1965,7 +1972,13 @@ ec_lock_assign_owner(ec_lock_link_t *link)
}
list_add_tail(&link->owner_list, &lock->owners);
- lock->refs_owners++;
+
+ /* If timer_link is not NULL, it means that we have inherited the owner
+ * reference assigned to the timer fop. In this case we simply reuse it.
+ * Otherwise we need to increase the number of owners. */
+ if (timer_link == NULL) {
+ lock->refs_owners++;
+ }
assigned = _gf_true;
@@ -2383,6 +2396,48 @@ ec_unlock_now(ec_lock_link_t *link)
ec_resume(link->fop, 0);
}
+void
+ec_lock_release(ec_t *ec, inode_t *inode)
+{
+ ec_lock_t *lock;
+ ec_inode_t *ctx;
+ ec_lock_link_t *timer_link = NULL;
+
+ LOCK(&inode->lock);
+
+ ctx = __ec_inode_get(inode, ec->xl);
+ if (ctx == NULL) {
+ goto done;
+ }
+ lock = ctx->inode_lock;
+ if ((lock == NULL) || !lock->acquired || lock->release) {
+ goto done;
+ }
+
+ gf_msg_debug(ec->xl->name, 0,
+ "Releasing inode %p due to lock contention", inode);
+
+ /* The lock is not marked to be released, so the frozen list should be
+ * empty. */
+ GF_ASSERT(list_empty(&lock->frozen));
+
+ timer_link = ec_lock_timer_cancel(ec->xl, lock);
+
+ /* We mark the lock to be released as soon as possible. */
+ lock->release = _gf_true;
+
+done:
+ UNLOCK(&inode->lock);
+
+ /* If we have cancelled the timer, we need to start the unlock of the
+ * inode. If there was a timer but we have been unable to cancel it
+ * because it was just triggered, the timer callback will take care
+ * of releasing the inode. */
+ if (timer_link != NULL) {
+ ec_unlock_now(timer_link);
+ }
+}
+
void ec_unlock_timer_add(ec_lock_link_t *link);
void
@@ -2470,9 +2525,60 @@ void ec_unlock_timer_cbk(void *data)
ec_unlock_timer_del(data);
}
+static gf_boolean_t
+ec_eager_lock_used(ec_t *ec, ec_fop_data_t *fop)
+{
+ /* Fops with no locks at this point mean that they are sent as sub-fops
+ * of other higher level fops. In this case we simply assume that the
+ * parent fop will take correct care of the eager lock. */
+ if (fop->lock_count == 0) {
+ return _gf_true;
+ }
+
+ /* We may have more than one lock, but this only happens in the rename
+ * fop, and both locks will reference an inode of the same type (a
+ * directory in this case), so we only need to check the first lock. */
+ if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) {
+ return ec->eager_lock;
+ }
+
+ return ec->other_eager_lock;
+}
+
+static uint32_t
+ec_eager_lock_timeout(ec_t *ec, ec_lock_t *lock)
+{
+ if (lock->loc.inode->ia_type == IA_IFREG) {
+ return ec->eager_lock_timeout;
+ }
+
+ return ec->other_eager_lock_timeout;
+}
+
+static gf_boolean_t
+ec_lock_delay_create(ec_lock_link_t *link)
+{
+ struct timespec delay;
+ ec_fop_data_t *fop = link->fop;
+ ec_lock_t *lock = link->lock;
+
+ delay.tv_sec = ec_eager_lock_timeout(fop->xl->private, lock);
+ delay.tv_nsec = 0;
+ lock->timer = gf_timer_call_after(fop->xl->ctx, delay,
+ ec_unlock_timer_cbk, link);
+ if (lock->timer == NULL) {
+ gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
+ EC_MSG_UNLOCK_DELAY_FAILED,
+ "Unable to delay an unlock");
+
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
void ec_unlock_timer_add(ec_lock_link_t *link)
{
- struct timespec delay;
ec_fop_data_t *fop = link->fop;
ec_lock_t *lock = link->lock;
gf_boolean_t now = _gf_false;
@@ -2526,19 +2632,12 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock,
lock->release);
- delay.tv_sec = 1;
- delay.tv_nsec = 0;
- lock->timer = gf_timer_call_after(fop->xl->ctx, delay,
- ec_unlock_timer_cbk, link);
- if (lock->timer == NULL) {
- gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
- EC_MSG_UNLOCK_DELAY_FAILED,
- "Unable to delay an unlock");
-
+ if (!ec_lock_delay_create(link)) {
/* We are unable to create a new timer. We immediately release
* the lock. */
lock->release = now = _gf_true;
}
+
} else {
ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock,
lock->release);
@@ -2583,26 +2682,6 @@ void ec_flush_size_version(ec_fop_data_t * fop)
ec_update_info(&fop->locks[0]);
}
-static gf_boolean_t
-ec_use_eager_lock(ec_t *ec, ec_fop_data_t *fop)
-{
- /* Fops with no locks at this point mean that they are sent as sub-fops
- * of other higher level fops. In this case we simply assume that the
- * parent fop will take correct care of the eager lock. */
- if (fop->lock_count == 0) {
- return _gf_true;
- }
-
- /* We may have more than one lock, but this only happens in the rename
- * fop, and both locks will reference an inode of the same type (a
- * directory in this case), so we only need to check the first lock. */
- if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) {
- return ec->eager_lock;
- }
-
- return ec->other_eager_lock;
-}
-
static void
ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe,
ec_fop_data_t *fop)
@@ -2708,7 +2787,7 @@ void ec_lock_reuse(ec_fop_data_t *fop)
ec = fop->xl->private;
cbk = fop->answer;
- if (ec_use_eager_lock(ec, fop) && cbk != NULL) {
+ if (ec_eager_lock_used(ec, fop) && cbk != NULL) {
if (cbk->xdata != NULL) {
if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT,
&count) == 0) && (count > 1)) {
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index c3e291585ef..99e2f0653be 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -102,6 +102,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags,
void ec_lock(ec_fop_data_t * fop);
void ec_lock_reuse(ec_fop_data_t *fop);
void ec_unlock(ec_fop_data_t * fop);
+void ec_lock_release(ec_t *ec, inode_t *inode);
gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
uint64_t *size);
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index 23dc434bc42..15b4c77abfe 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -669,6 +669,8 @@ struct _ec {
uint32_t background_heals;
uint32_t heal_wait_qlen;
uint32_t self_heal_window_size; /* max size of read/writes */
+ uint32_t eager_lock_timeout;
+ uint32_t other_eager_lock_timeout;
struct list_head pending_fops;
struct list_head heal_waiting;
struct list_head healing;
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 4c80f1283f1..30b0bdcb29c 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -271,6 +271,11 @@ reconfigure (xlator_t *this, dict_t *options)
bool, failed);
GF_OPTION_RECONF ("other-eager-lock", ec->other_eager_lock, options,
bool, failed);
+ GF_OPTION_RECONF ("eager-lock-timeout", ec->eager_lock_timeout,
+ options, uint32, failed);
+ GF_OPTION_RECONF ("other-eager-lock-timeout",
+ ec->other_eager_lock_timeout, options, uint32,
+ failed);
GF_OPTION_RECONF ("background-heals", background_heals, options,
uint32, failed);
GF_OPTION_RECONF ("heal-wait-qlength", heal_wait_qlen, options,
@@ -453,6 +458,43 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
}
}
+static gf_boolean_t
+ec_upcall(ec_t *ec, struct gf_upcall *upcall)
+{
+ struct gf_upcall_cache_invalidation *ci = NULL;
+ struct gf_upcall_inodelk_contention *lc = NULL;
+ inode_t *inode;
+
+ switch (upcall->event_type) {
+ case GF_UPCALL_CACHE_INVALIDATION:
+ ci = upcall->data;
+ ci->flags |= UP_INVAL_ATTR;
+ return _gf_true;
+
+ case GF_UPCALL_INODELK_CONTENTION:
+ lc = upcall->data;
+ if (strcmp(lc->domain, ec->xl->name) != 0) {
+ /* The lock is not owned by EC, ignore it. */
+ return _gf_true;
+ }
+ inode = inode_find(((xlator_t *)ec->xl->graph->top)->itable,
+ upcall->gfid);
+ /* If inode is not found, it means that it's already released,
+ * so we can ignore it. Probably it has been released and
+ * destroyed while the contention notification was being sent.
+ */
+ if (inode != NULL) {
+ ec_lock_release(ec, inode);
+ inode_unref(inode);
+ }
+
+ return _gf_false;
+
+ default:
+ return _gf_true;
+ }
+}
+
int32_t
ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
{
@@ -464,19 +506,13 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
dict_t *output = NULL;
gf_boolean_t propagate = _gf_true;
int32_t orig_event = event;
- struct gf_upcall *up_data = NULL;
- struct gf_upcall_cache_invalidation *up_ci = NULL;
uintptr_t mask = 0;
gf_msg_trace (this->name, 0, "NOTIFY(%d): %p, %p",
event, data, data2);
if (event == GF_EVENT_UPCALL) {
- up_data = (struct gf_upcall *)data;
- if (up_data->event_type == GF_UPCALL_CACHE_INVALIDATION) {
- up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
- up_ci->flags |= UP_INVAL_ATTR;
- }
+ propagate = ec_upcall(ec, data);
goto done;
}
@@ -664,6 +700,10 @@ init (xlator_t *this)
GF_OPTION_INIT ("iam-self-heal-daemon", ec->shd.iamshd, bool, failed);
GF_OPTION_INIT ("eager-lock", ec->eager_lock, bool, failed);
GF_OPTION_INIT ("other-eager-lock", ec->other_eager_lock, bool, failed);
+ GF_OPTION_INIT ("eager-lock-timeout", ec->eager_lock_timeout, uint32,
+ failed);
+ GF_OPTION_INIT ("other-eager-lock-timeout", ec->other_eager_lock_timeout,
+ uint32, failed);
GF_OPTION_INIT ("background-heals", ec->background_heals, uint32, failed);
GF_OPTION_INIT ("heal-wait-qlength", ec->heal_wait_qlen, uint32, failed);
GF_OPTION_INIT ("self-heal-window-size", ec->self_heal_window_size, uint32,
@@ -1456,6 +1496,29 @@ struct volume_options options[] =
.description = "It's equivalent to the eager-lock option but for non "
"regular files."
},
+ { .key = {"eager-lock-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 60,
+ .default_value = "1",
+ .op_version = { GD_OP_VERSION_4_0_0 },
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = { "disperse", "locks", "timeout" },
+ .description = "Maximum time (in seconds) that a lock on an inode is "
+ "kept held if no new operations on the inode are "
+ "received."
+ },
+ { .key = {"other-eager-lock-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 60,
+ .default_value = "1",
+ .op_version = { GD_OP_VERSION_4_0_0 },
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = { "disperse", "locks", "timeout" },
+ .description = "It's equivalent ot eager-lock-timeout option but for "
+ "non regular files."
+ },
{ .key = {"background-heals"},
.type = GF_OPTION_TYPE_INT,
.min = 0,/*Disabling background heals*/
diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c
index a76d6beacb1..1609fc416d2 100644
--- a/xlators/features/locks/src/clear.c
+++ b/xlators/features/locks/src/clear.c
@@ -200,6 +200,7 @@ int
clrlk_clear_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom,
clrlk_args *args, int *blkd, int *granted, int *op_errno)
{
+ posix_locks_private_t *priv;
pl_inode_lock_t *ilock = NULL;
pl_inode_lock_t *tmp = NULL;
struct gf_flock ulock = {0, };
@@ -207,9 +208,20 @@ clrlk_clear_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom,
int bcount = 0;
int gcount = 0;
gf_boolean_t chk_range = _gf_false;
+ struct list_head *pcontend = NULL;
struct list_head released;
+ struct list_head contend;
+ struct timespec now = { };
INIT_LIST_HEAD (&released);
+
+ priv = this->private;
+ if (priv->notify_contention) {
+ pcontend = &contend;
+ INIT_LIST_HEAD (pcontend);
+ timespec_now(&now);
+ }
+
if (clrlk_get_lock_range (args->opts, &ulock, &chk_range)) {
*op_errno = EINVAL;
goto out;
@@ -283,7 +295,10 @@ granted:
ret = 0;
out:
- grant_blocked_inode_locks (this, pl_inode, dom);
+ grant_blocked_inode_locks (this, pl_inode, dom, &now, pcontend);
+ if (pcontend != NULL) {
+ inodelk_contention_notify(this, pcontend);
+ }
*blkd = bcount;
*granted = gcount;
return ret;
@@ -294,15 +309,27 @@ int
clrlk_clear_entrylk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom,
clrlk_args *args, int *blkd, int *granted, int *op_errno)
{
+ posix_locks_private_t *priv;
pl_entry_lock_t *elock = NULL;
pl_entry_lock_t *tmp = NULL;
int bcount = 0;
int gcount = 0;
int ret = -1;
+ struct list_head *pcontend = NULL;
struct list_head removed;
struct list_head released;
+ struct list_head contend;
+ struct timespec now;
INIT_LIST_HEAD (&released);
+
+ priv = this->private;
+ if (priv->notify_contention) {
+ pcontend = &contend;
+ INIT_LIST_HEAD (pcontend);
+ timespec_now(&now);
+ }
+
if (args->kind & CLRLK_BLOCKED)
goto blkd;
@@ -361,12 +388,15 @@ granted:
list_del_init (&elock->domain_list);
list_add_tail (&elock->domain_list, &removed);
- __pl_entrylk_unref (elock);
+ __pl_entrylk_unref (elock);
}
}
pthread_mutex_unlock (&pl_inode->mutex);
- grant_blocked_entry_locks (this, pl_inode, dom);
+ grant_blocked_entry_locks (this, pl_inode, dom, &now, pcontend);
+ if (pcontend != NULL) {
+ entrylk_contention_notify(this, pcontend);
+ }
ret = 0;
out:
diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h
index 3729ca24bed..50c156feb38 100644
--- a/xlators/features/locks/src/common.h
+++ b/xlators/features/locks/src/common.h
@@ -69,7 +69,11 @@ get_domain (pl_inode_t *pl_inode, const char *volume);
void
grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
- pl_dom_list_t *dom);
+ pl_dom_list_t *dom, struct timespec *now,
+ struct list_head *contend);
+
+void
+inodelk_contention_notify (xlator_t *this, struct list_head *contend);
void
__delete_inode_lock (pl_inode_lock_t *lock);
@@ -79,7 +83,11 @@ __pl_inodelk_unref (pl_inode_lock_t *lock);
void
grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
- pl_dom_list_t *dom);
+ pl_dom_list_t *dom, struct timespec *now,
+ struct list_head *contend);
+
+void
+entrylk_contention_notify (xlator_t *this, struct list_head *contend);
void pl_update_refkeeper (xlator_t *this, inode_t *inode);
diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c
index 6698516fc83..008d05a34c4 100644
--- a/xlators/features/locks/src/entrylk.c
+++ b/xlators/features/locks/src/entrylk.c
@@ -13,17 +13,19 @@
#include "logging.h"
#include "common-utils.h"
#include "list.h"
+#include "upcall-utils.h"
#include "locks.h"
#include "clear.h"
#include "common.h"
+#include "pl-messages.h"
void
__pl_entrylk_unref (pl_entry_lock_t *lock)
{
lock->ref--;
if (!lock->ref) {
- GF_FREE ((char *)lock->basename);
+ GF_FREE ((char *)lock->basename);
GF_FREE (lock->connection_id);
GF_FREE (lock);
}
@@ -39,7 +41,7 @@ __pl_entrylk_ref (pl_entry_lock_t *lock)
static pl_entry_lock_t *
new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type,
- const char *domain, call_frame_t *frame, char *conn_id)
+ const char *domain, call_frame_t *frame, char *conn_id)
{
pl_entry_lock_t *newlock = NULL;
@@ -55,7 +57,7 @@ new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type,
newlock->client_pid = frame->root->pid;
newlock->volume = domain;
newlock->owner = frame->root->lk_owner;
- newlock->frame = frame;
+ newlock->frame = frame;
newlock->this = frame->this;
if (conn_id) {
@@ -64,9 +66,9 @@ new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type,
INIT_LIST_HEAD (&newlock->domain_list);
INIT_LIST_HEAD (&newlock->blocked_locks);
- INIT_LIST_HEAD (&newlock->client_list);
+ INIT_LIST_HEAD (&newlock->client_list);
- __pl_entrylk_ref (newlock);
+ __pl_entrylk_ref (newlock);
out:
return newlock;
}
@@ -201,6 +203,113 @@ out:
return revoke_lock;
}
+static gf_boolean_t
+__entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock,
+ struct timespec *now)
+{
+ posix_locks_private_t *priv;
+ int64_t elapsed;
+
+ priv = this->private;
+
+ /* If this lock is in a list, it means that we are about to send a
+ * notification for it, so no need to do anything else. */
+ if (!list_empty(&lock->contend)) {
+ return _gf_false;
+ }
+
+ elapsed = now->tv_sec;
+ elapsed -= lock->contention_time.tv_sec;
+ if (now->tv_nsec < lock->contention_time.tv_nsec) {
+ elapsed--;
+ }
+ if (elapsed < priv->notify_contention_delay) {
+ return _gf_false;
+ }
+
+ /* All contention notifications will be sent outside of the locked
+ * region. This means that currently granted locks might have already
+ * been unlocked by that time. To avoid the lock or the inode to be
+ * destroyed before we process them, we take an additional reference
+ * on both. */
+ inode_ref(lock->pinode->inode);
+ __pl_entrylk_ref(lock);
+
+ lock->contention_time = *now;
+
+ return _gf_true;
+}
+
+void
+entrylk_contention_notify(xlator_t *this, struct list_head *contend)
+{
+ struct gf_upcall up;
+ struct gf_upcall_entrylk_contention lc;
+ pl_entry_lock_t *lock;
+ pl_inode_t *pl_inode;
+ client_t *client;
+ gf_boolean_t notify;
+
+ while (!list_empty(contend)) {
+ lock = list_first_entry(contend, pl_entry_lock_t, contend);
+
+ pl_inode = lock->pinode;
+
+ pthread_mutex_lock(&pl_inode->mutex);
+
+ /* If the lock has already been released, no notification is
+ * sent. We clear the notification time in this case. */
+ notify = !list_empty(&lock->domain_list);
+ if (!notify) {
+ lock->contention_time.tv_sec = 0;
+ lock->contention_time.tv_nsec = 0;
+ } else {
+ lc.type = lock->type;
+ lc.name = lock->basename;
+ lc.pid = lock->client_pid;
+ lc.domain = lock->volume;
+ lc.xdata = NULL;
+
+ gf_uuid_copy(up.gfid, lock->pinode->gfid);
+ client = (client_t *)lock->client;
+ if (client == NULL) {
+ /* A NULL client can be found if the entrylk
+ * was issued by a server side xlator. */
+ up.client_uid = NULL;
+ } else {
+ up.client_uid = client->client_uid;
+ }
+ }
+
+ pthread_mutex_unlock(&pl_inode->mutex);
+
+ if (notify) {
+ up.event_type = GF_UPCALL_ENTRYLK_CONTENTION;
+ up.data = &lc;
+
+ if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) {
+ gf_msg_debug(this->name, 0,
+ "Entrylk contention notification "
+ "failed");
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Entrylk contention notification "
+ "sent");
+ }
+ }
+
+ pthread_mutex_lock(&pl_inode->mutex);
+
+ list_del_init(&lock->contend);
+ __pl_entrylk_unref(lock);
+
+ pthread_mutex_unlock(&pl_inode->mutex);
+
+ inode_unref(pl_inode->inode);
+ }
+}
+
+
/**
* entrylk_grantable - is this lock grantable?
* @inode: inode in which to look
@@ -208,19 +317,27 @@ out:
* @type: type of lock
*/
static pl_entry_lock_t *
-__entrylk_grantable (pl_dom_list_t *dom, pl_entry_lock_t *lock)
+__entrylk_grantable (xlator_t *this, pl_dom_list_t *dom, pl_entry_lock_t *lock,
+ struct timespec *now, struct list_head *contend)
{
pl_entry_lock_t *tmp = NULL;
-
- if (list_empty (&dom->entrylk_list))
- return NULL;
+ pl_entry_lock_t *ret = NULL;
list_for_each_entry (tmp, &dom->entrylk_list, domain_list) {
- if (__conflicting_entrylks (tmp, lock))
- return tmp;
+ if (__conflicting_entrylks (tmp, lock)) {
+ if (ret == NULL) {
+ ret = tmp;
+ if (contend == NULL) {
+ break;
+ }
+ }
+ if (__entrylk_needs_contention_notify(this, tmp, now)) {
+ list_add_tail(&tmp->contend, contend);
+ }
+ }
}
- return NULL;
+ return ret;
}
static pl_entry_lock_t *
@@ -228,9 +345,6 @@ __blocked_entrylk_conflict (pl_dom_list_t *dom, pl_entry_lock_t *lock)
{
pl_entry_lock_t *tmp = NULL;
- if (list_empty (&dom->blocked_entrylks))
- return NULL;
-
list_for_each_entry (tmp, &dom->blocked_entrylks, blocked_locks) {
if (names_conflict (tmp->basename, lock->basename))
return lock;
@@ -426,6 +540,27 @@ __find_matching_lock (pl_dom_list_t *dom, pl_entry_lock_t *lock)
return NULL;
}
+static int
+__lock_blocked_add(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom,
+ pl_entry_lock_t *lock, int nonblock)
+{
+ struct timeval now;
+
+ gettimeofday(&now, NULL);
+
+ if (nonblock)
+ goto out;
+
+ lock->blkd_time = now;
+ list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks);
+
+ gf_msg_trace (this->name, 0, "Blocking lock: {pinode=%p, basename=%s}",
+ pinode, lock->basename);
+
+out:
+ return -EAGAIN;
+}
+
/**
* __lock_entrylk - lock a name in a directory
* @inode: inode for the directory in which to lock
@@ -439,24 +574,15 @@ __find_matching_lock (pl_dom_list_t *dom, pl_entry_lock_t *lock)
int
__lock_entrylk (xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock,
- int nonblock, pl_dom_list_t *dom)
+ int nonblock, pl_dom_list_t *dom, struct timespec *now,
+ struct list_head *contend)
{
pl_entry_lock_t *conf = NULL;
int ret = -EAGAIN;
- conf = __entrylk_grantable (dom, lock);
+ conf = __entrylk_grantable (this, dom, lock, now, contend);
if (conf) {
- ret = -EAGAIN;
- if (nonblock)
- goto out;
-
- gettimeofday (&lock->blkd_time, NULL);
- list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks);
-
- gf_log (this->name, GF_LOG_TRACE,
- "Blocking lock: {pinode=%p, basename=%s}",
- pinode, lock->basename);
-
+ ret = __lock_blocked_add(this, pinode, dom, lock, nonblock);
goto out;
}
@@ -471,20 +597,15 @@ __lock_entrylk (xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock,
* granted, without which self-heal can't progress.
* TODO: Find why 'owner_has_lock' is checked even for blocked locks.
*/
- if (__blocked_entrylk_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) {
- ret = -EAGAIN;
- if (nonblock)
- goto out;
-
- gettimeofday (&lock->blkd_time, NULL);
- list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Lock is grantable, but blocking to prevent starvation");
- gf_log (this->name, GF_LOG_TRACE,
- "Blocking lock: {pinode=%p, basename=%s}",
- pinode, lock->basename);
+ if (__blocked_entrylk_conflict (dom, lock) &&
+ !(__owner_has_lock (dom, lock))) {
+ if (nonblock == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Lock is grantable, but blocking to prevent "
+ "starvation");
+ }
+ ret = __lock_blocked_add(this, pinode, dom, lock, nonblock);
goto out;
}
@@ -551,7 +672,8 @@ out:
void
__grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
- pl_dom_list_t *dom, struct list_head *granted)
+ pl_dom_list_t *dom, struct list_head *granted,
+ struct timespec *now, struct list_head *contend)
{
int bl_ret = 0;
pl_entry_lock_t *bl = NULL;
@@ -566,7 +688,8 @@ __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
list_del_init (&bl->blocked_locks);
- bl_ret = __lock_entrylk (bl->this, pl_inode, bl, 0, dom);
+ bl_ret = __lock_entrylk (bl->this, pl_inode, bl, 0, dom, now,
+ contend);
if (bl_ret == 0) {
list_add (&bl->blocked_locks, granted);
@@ -578,7 +701,8 @@ __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
/* Grants locks if possible which are blocked on a lock */
void
grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
- pl_dom_list_t *dom)
+ pl_dom_list_t *dom, struct timespec *now,
+ struct list_head *contend)
{
struct list_head granted_list;
pl_entry_lock_t *tmp = NULL;
@@ -589,7 +713,7 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
pthread_mutex_lock (&pl_inode->mutex);
{
__grant_blocked_entry_locks (this, pl_inode, dom,
- &granted_list);
+ &granted_list, now, contend);
}
pthread_mutex_unlock (&pl_inode->mutex);
@@ -610,8 +734,6 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
}
}
pthread_mutex_unlock (&pl_inode->mutex);
-
- return;
}
@@ -637,9 +759,18 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
int nonblock = 0;
gf_boolean_t need_inode_unref = _gf_false;
posix_locks_private_t *priv = NULL;
+ struct list_head *pcontend = NULL;
+ struct list_head contend;
+ struct timespec now = { };
priv = this->private;
+ if (priv->notify_contention) {
+ pcontend = &contend;
+ INIT_LIST_HEAD(pcontend);
+ timespec_now(&now);
+ }
+
if (xdata)
dict_ret = dict_get_str (xdata, "connection-id", &conn_id);
@@ -722,7 +853,8 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
{
reqlock->pinode = pinode;
- ret = __lock_entrylk (this, pinode, reqlock, nonblock, dom);
+ ret = __lock_entrylk (this, pinode, reqlock, nonblock,
+ dom, &now, pcontend);
if (ret == 0) {
reqlock->frame = NULL;
op_ret = 0;
@@ -778,7 +910,7 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
if (ctx)
pthread_mutex_unlock (&ctx->lock);
- grant_blocked_entry_locks (this, pinode, dom);
+ grant_blocked_entry_locks (this, pinode, dom, &now, pcontend);
break;
@@ -810,6 +942,10 @@ unwind:
cmd, type);
}
+ if (pcontend != NULL) {
+ entrylk_contention_notify(this, pcontend);
+ }
+
return 0;
}
@@ -868,27 +1004,37 @@ pl_entrylk_log_cleanup (pl_entry_lock_t *lock)
int
pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
{
+ posix_locks_private_t *priv;
pl_entry_lock_t *tmp = NULL;
pl_entry_lock_t *l = NULL;
- pl_dom_list_t *dom = NULL;
+ pl_dom_list_t *dom = NULL;
pl_inode_t *pinode = NULL;
-
+ struct list_head *pcontend = NULL;
struct list_head released;
struct list_head unwind;
+ struct list_head contend;
+ struct timespec now = { };
INIT_LIST_HEAD (&released);
INIT_LIST_HEAD (&unwind);
- pthread_mutex_lock (&ctx->lock);
+ priv = this->private;
+ if (priv->notify_contention) {
+ pcontend = &contend;
+ INIT_LIST_HEAD (pcontend);
+ timespec_now(&now);
+ }
+
+ pthread_mutex_lock (&ctx->lock);
{
list_for_each_entry_safe (l, tmp, &ctx->entrylk_lockers,
- client_list) {
- pl_entrylk_log_cleanup (l);
+ client_list) {
+ pl_entrylk_log_cleanup (l);
- pinode = l->pinode;
+ pinode = l->pinode;
- pthread_mutex_lock (&pinode->mutex);
- {
+ pthread_mutex_lock (&pinode->mutex);
+ {
/* If the entrylk object is part of granted list but not
* blocked list, then perform the following actions:
* i. delete the object from granted list;
@@ -931,38 +1077,42 @@ pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
&unwind);
}
}
- pthread_mutex_unlock (&pinode->mutex);
+ pthread_mutex_unlock (&pinode->mutex);
}
- }
+ }
pthread_mutex_unlock (&ctx->lock);
list_for_each_entry_safe (l, tmp, &unwind, client_list) {
list_del_init (&l->client_list);
- if (l->frame)
- STACK_UNWIND_STRICT (entrylk, l->frame, -1, EAGAIN,
- NULL);
+ if (l->frame)
+ STACK_UNWIND_STRICT (entrylk, l->frame, -1, EAGAIN,
+ NULL);
list_add_tail (&l->client_list, &released);
}
list_for_each_entry_safe (l, tmp, &released, client_list) {
list_del_init (&l->client_list);
- pinode = l->pinode;
+ pinode = l->pinode;
- dom = get_domain (pinode, l->volume);
+ dom = get_domain (pinode, l->volume);
- grant_blocked_entry_locks (this, pinode, dom);
+ grant_blocked_entry_locks (this, pinode, dom, &now, pcontend);
- pthread_mutex_lock (&pinode->mutex);
- {
- __pl_entrylk_unref (l);
- }
- pthread_mutex_unlock (&pinode->mutex);
+ pthread_mutex_lock (&pinode->mutex);
+ {
+ __pl_entrylk_unref (l);
+ }
+ pthread_mutex_unlock (&pinode->mutex);
inode_unref (pinode->inode);
}
+ if (pcontend != NULL) {
+ entrylk_contention_notify(this, pcontend);
+ }
+
return 0;
}
diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c
index 64ffb00c18c..890ac8b6d00 100644
--- a/xlators/features/locks/src/inodelk.c
+++ b/xlators/features/locks/src/inodelk.c
@@ -13,10 +13,12 @@
#include "logging.h"
#include "common-utils.h"
#include "list.h"
+#include "upcall-utils.h"
#include "locks.h"
#include "clear.h"
#include "common.h"
+#include "pl-messages.h"
void
__delete_inode_lock (pl_inode_lock_t *lock)
@@ -229,22 +231,134 @@ out:
return revoke_lock;
}
+static gf_boolean_t
+__inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock,
+ struct timespec *now)
+{
+ posix_locks_private_t *priv;
+ int64_t elapsed;
+
+ priv = this->private;
+
+ /* If this lock is in a list, it means that we are about to send a
+ * notification for it, so no need to do anything else. */
+ if (!list_empty(&lock->contend)) {
+ return _gf_false;
+ }
+
+ elapsed = now->tv_sec;
+ elapsed -= lock->contention_time.tv_sec;
+ if (now->tv_nsec < lock->contention_time.tv_nsec) {
+ elapsed--;
+ }
+ if (elapsed < priv->notify_contention_delay) {
+ return _gf_false;
+ }
+
+ /* All contention notifications will be sent outside of the locked
+ * region. This means that currently granted locks might have already
+ * been unlocked by that time. To avoid the lock or the inode to be
+ * destroyed before we process them, we take an additional reference
+ * on both. */
+ inode_ref(lock->pl_inode->inode);
+ __pl_inodelk_ref(lock);
+
+ lock->contention_time = *now;
+
+ return _gf_true;
+}
+
+void
+inodelk_contention_notify(xlator_t *this, struct list_head *contend)
+{
+ struct gf_upcall up;
+ struct gf_upcall_inodelk_contention lc;
+ pl_inode_lock_t *lock;
+ pl_inode_t *pl_inode;
+ client_t *client;
+ gf_boolean_t notify;
+
+ while (!list_empty(contend)) {
+ lock = list_first_entry(contend, pl_inode_lock_t, contend);
+
+ pl_inode = lock->pl_inode;
+
+ pthread_mutex_lock(&pl_inode->mutex);
+
+ /* If the lock has already been released, no notification is
+ * sent. We clear the notification time in this case. */
+ notify = !list_empty(&lock->list);
+ if (!notify) {
+ lock->contention_time.tv_sec = 0;
+ lock->contention_time.tv_nsec = 0;
+ } else {
+ memcpy(&lc.flock, &lock->user_flock, sizeof(lc.flock));
+ lc.pid = lock->client_pid;
+ lc.domain = lock->volume;
+ lc.xdata = NULL;
+
+ gf_uuid_copy(up.gfid, lock->pl_inode->gfid);
+ client = (client_t *)lock->client;
+ if (client == NULL) {
+ /* A NULL client can be found if the inodelk
+ * was issued by a server side xlator. */
+ up.client_uid = NULL;
+ } else {
+ up.client_uid = client->client_uid;
+ }
+ }
+
+ pthread_mutex_unlock(&pl_inode->mutex);
+
+ if (notify) {
+ up.event_type = GF_UPCALL_INODELK_CONTENTION;
+ up.data = &lc;
+
+ if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) {
+ gf_msg_debug(this->name, 0,
+ "Inodelk contention notification "
+ "failed");
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Inodelk contention notification "
+ "sent");
+ }
+ }
+
+ pthread_mutex_lock(&pl_inode->mutex);
+
+ list_del_init(&lock->contend);
+ __pl_inodelk_unref(lock);
+
+ pthread_mutex_unlock(&pl_inode->mutex);
+
+ inode_unref(pl_inode->inode);
+ }
+}
+
/* Determine if lock is grantable or not */
static pl_inode_lock_t *
-__inodelk_grantable (pl_dom_list_t *dom, pl_inode_lock_t *lock)
+__inodelk_grantable (xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock,
+ struct timespec *now, struct list_head *contend)
{
pl_inode_lock_t *l = NULL;
pl_inode_lock_t *ret = NULL;
- if (list_empty (&dom->inodelk_list))
- goto out;
+
list_for_each_entry (l, &dom->inodelk_list, list){
if (inodelk_conflict (lock, l) &&
!same_inodelk_owner (lock, l)) {
- ret = l;
- goto out;
+ if (ret == NULL) {
+ ret = l;
+ if (contend == NULL) {
+ break;
+ }
+ }
+ if (__inodelk_needs_contention_notify(this, l, now)) {
+ list_add_tail(&l->contend, contend);
+ }
}
}
-out:
+
return ret;
}
@@ -252,20 +366,14 @@ static pl_inode_lock_t *
__blocked_lock_conflict (pl_dom_list_t *dom, pl_inode_lock_t *lock)
{
pl_inode_lock_t *l = NULL;
- pl_inode_lock_t *ret = NULL;
-
- if (list_empty (&dom->blocked_inodelks))
- return NULL;
list_for_each_entry (l, &dom->blocked_inodelks, blocked_locks) {
if (inodelk_conflict (lock, l)) {
- ret = l;
- goto out;
+ return l;
}
}
-out:
- return ret;
+ return NULL;
}
static int
@@ -286,35 +394,45 @@ __owner_has_lock (pl_dom_list_t *dom, pl_inode_lock_t *newlock)
return 0;
}
+static int
+__lock_blocked_add(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock,
+ int can_block)
+{
+ struct timeval now;
+
+ gettimeofday(&now, NULL);
+
+ if (can_block == 0) {
+ goto out;
+ }
+
+ lock->blkd_time = now;
+ list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks);
+
+ gf_msg_trace (this->name, 0, "%s (pid=%d) (lk-owner=%s) %"PRId64" - "
+ "%"PRId64" => Blocked",
+ lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
+ lock->client_pid, lkowner_utoa (&lock->owner),
+ lock->user_flock.l_start, lock->user_flock.l_len);
+
+out:
+ return -EAGAIN;
+}
/* Determines if lock can be granted and adds the lock. If the lock
* is blocking, adds it to the blocked_inodelks list of the domain.
*/
static int
__lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
- int can_block, pl_dom_list_t *dom)
+ int can_block, pl_dom_list_t *dom, struct timespec *now,
+ struct list_head *contend)
{
pl_inode_lock_t *conf = NULL;
int ret = -EINVAL;
- conf = __inodelk_grantable (dom, lock);
+ conf = __inodelk_grantable (this, dom, lock, now, contend);
if (conf) {
- ret = -EAGAIN;
- if (can_block == 0)
- goto out;
-
- gettimeofday (&lock->blkd_time, NULL);
- list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks);
-
- gf_log (this->name, GF_LOG_TRACE,
- "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => Blocked",
- lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
- lock->client_pid,
- lkowner_utoa (&lock->owner),
- lock->user_flock.l_start,
- lock->user_flock.l_len);
-
-
+ ret = __lock_blocked_add(this, dom, lock, can_block);
goto out;
}
@@ -330,25 +448,15 @@ __lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
* will not be unlocked by SHD from Machine1.
* TODO: Find why 'owner_has_lock' is checked even for blocked locks.
*/
- if (__blocked_lock_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) {
- ret = -EAGAIN;
- if (can_block == 0)
- goto out;
-
- gettimeofday (&lock->blkd_time, NULL);
- list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Lock is grantable, but blocking to prevent starvation");
- gf_log (this->name, GF_LOG_TRACE,
- "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => Blocked",
- lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
- lock->client_pid,
- lkowner_utoa (&lock->owner),
- lock->user_flock.l_start,
- lock->user_flock.l_len);
-
+ if (__blocked_lock_conflict (dom, lock) &&
+ !(__owner_has_lock (dom, lock))) {
+ if (can_block != 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Lock is grantable, but blocking to prevent "
+ "starvation");
+ }
+ ret = __lock_blocked_add(this, dom, lock, can_block);
goto out;
}
__pl_inodelk_ref (lock);
@@ -417,7 +525,8 @@ out:
static void
__grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
- struct list_head *granted, pl_dom_list_t *dom)
+ struct list_head *granted, pl_dom_list_t *dom,
+ struct timespec *now, struct list_head *contend)
{
int bl_ret = 0;
pl_inode_lock_t *bl = NULL;
@@ -432,7 +541,8 @@ __grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
list_del_init (&bl->blocked_locks);
- bl_ret = __lock_inodelk (this, pl_inode, bl, 1, dom);
+ bl_ret = __lock_inodelk (this, pl_inode, bl, 1, dom, now,
+ contend);
if (bl_ret == 0) {
list_add (&bl->blocked_locks, granted);
@@ -444,7 +554,8 @@ __grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
/* Grant all inodelks blocked on a lock */
void
grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
- pl_dom_list_t *dom)
+ pl_dom_list_t *dom, struct timespec *now,
+ struct list_head *contend)
{
struct list_head granted;
pl_inode_lock_t *lock;
@@ -454,7 +565,8 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
pthread_mutex_lock (&pl_inode->mutex);
{
- __grant_blocked_inode_locks (this, pl_inode, &granted, dom);
+ __grant_blocked_inode_locks (this, pl_inode, &granted, dom,
+ now, contend);
}
pthread_mutex_unlock (&pl_inode->mutex);
@@ -471,7 +583,7 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
&lock->user_flock, 0, 0, lock->volume);
STACK_UNWIND_STRICT (inodelk, lock->frame, 0, 0, NULL);
- lock->frame = NULL;
+ lock->frame = NULL;
}
pthread_mutex_lock (&pl_inode->mutex);
@@ -488,9 +600,9 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
static void
pl_inodelk_log_cleanup (pl_inode_lock_t *lock)
{
- pl_inode_t *pl_inode = NULL;
+ pl_inode_t *pl_inode = NULL;
- pl_inode = lock->pl_inode;
+ pl_inode = lock->pl_inode;
gf_log (THIS->name, GF_LOG_WARNING, "releasing lock on %s held by "
"{client=%p, pid=%"PRId64" lk-owner=%s}",
@@ -503,27 +615,38 @@ pl_inodelk_log_cleanup (pl_inode_lock_t *lock)
int
pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
{
+ posix_locks_private_t *priv;
pl_inode_lock_t *tmp = NULL;
pl_inode_lock_t *l = NULL;
- pl_dom_list_t *dom = NULL;
+ pl_dom_list_t *dom = NULL;
pl_inode_t *pl_inode = NULL;
-
+ struct list_head *pcontend = NULL;
struct list_head released;
struct list_head unwind;
+ struct list_head contend;
+ struct timespec now = { };
+
+ priv = this->private;
INIT_LIST_HEAD (&released);
INIT_LIST_HEAD (&unwind);
- pthread_mutex_lock (&ctx->lock);
+ if (priv->notify_contention) {
+ pcontend = &contend;
+ INIT_LIST_HEAD (pcontend);
+ timespec_now(&now);
+ }
+
+ pthread_mutex_lock (&ctx->lock);
{
list_for_each_entry_safe (l, tmp, &ctx->inodelk_lockers,
- client_list) {
- pl_inodelk_log_cleanup (l);
+ client_list) {
+ pl_inodelk_log_cleanup (l);
- pl_inode = l->pl_inode;
+ pl_inode = l->pl_inode;
- pthread_mutex_lock (&pl_inode->mutex);
- {
+ pthread_mutex_lock (&pl_inode->mutex);
+ {
/* If the inodelk object is part of granted list but not
* blocked list, then perform the following actions:
* i. delete the object from granted list;
@@ -567,45 +690,49 @@ pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
&unwind);
}
}
- pthread_mutex_unlock (&pl_inode->mutex);
+ pthread_mutex_unlock (&pl_inode->mutex);
}
- }
+ }
pthread_mutex_unlock (&ctx->lock);
list_for_each_entry_safe (l, tmp, &unwind, client_list) {
list_del_init (&l->client_list);
if (l->frame)
- STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN,
- NULL);
+ STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN,
+ NULL);
list_add_tail (&l->client_list, &released);
-
}
list_for_each_entry_safe (l, tmp, &released, client_list) {
list_del_init (&l->client_list);
- pl_inode = l->pl_inode;
+ pl_inode = l->pl_inode;
- dom = get_domain (pl_inode, l->volume);
+ dom = get_domain (pl_inode, l->volume);
- grant_blocked_inode_locks (this, pl_inode, dom);
+ grant_blocked_inode_locks (this, pl_inode, dom, &now,
+ pcontend);
- pthread_mutex_lock (&pl_inode->mutex);
- {
- __pl_inodelk_unref (l);
- }
- pthread_mutex_unlock (&pl_inode->mutex);
+ pthread_mutex_lock (&pl_inode->mutex);
+ {
+ __pl_inodelk_unref (l);
+ }
+ pthread_mutex_unlock (&pl_inode->mutex);
inode_unref (pl_inode->inode);
}
+ if (pcontend != NULL) {
+ inodelk_contention_notify(this, pcontend);
+ }
+
return 0;
}
static int
pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
- pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom,
+ pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom,
inode_t *inode)
{
posix_locks_private_t *priv = NULL;
@@ -613,9 +740,12 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
pl_inode_lock_t *retlock = NULL;
gf_boolean_t unref = _gf_true;
gf_boolean_t need_inode_unref = _gf_false;
+ struct list_head *pcontend = NULL;
+ struct list_head contend;
+ struct timespec now = { };
short fl_type;
- lock->pl_inode = pl_inode;
+ lock->pl_inode = pl_inode;
fl_type = lock->fl_type;
priv = this->private;
@@ -657,12 +787,19 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
}
}
+ if (priv->notify_contention) {
+ pcontend = &contend;
+ INIT_LIST_HEAD(pcontend);
+ timespec_now(&now);
+ }
+
if (ctx)
pthread_mutex_lock (&ctx->lock);
pthread_mutex_lock (&pl_inode->mutex);
{
if (lock->fl_type != F_UNLCK) {
- ret = __lock_inodelk (this, pl_inode, lock, can_block, dom);
+ ret = __lock_inodelk (this, pl_inode, lock, can_block,
+ dom, &now, pcontend);
if (ret == 0) {
lock->frame = NULL;
gf_log (this->name, GF_LOG_TRACE,
@@ -725,13 +862,18 @@ out:
*/
if ((fl_type == F_UNLCK) && (ret == 0)) {
inode_unref (pl_inode->inode);
- grant_blocked_inode_locks (this, pl_inode, dom);
+ grant_blocked_inode_locks (this, pl_inode, dom, &now,
+ pcontend);
}
if (need_inode_unref) {
inode_unref (pl_inode->inode);
}
+ if (pcontend != NULL) {
+ inodelk_contention_notify(this, pcontend);
+ }
+
return ret;
}
@@ -771,7 +913,8 @@ new_inode_lock (struct gf_flock *flock, client_t *client, pid_t client_pid,
INIT_LIST_HEAD (&lock->list);
INIT_LIST_HEAD (&lock->blocked_locks);
- INIT_LIST_HEAD (&lock->client_list);
+ INIT_LIST_HEAD (&lock->client_list);
+ INIT_LIST_HEAD (&lock->contend);
__pl_inodelk_ref (lock);
return lock;
diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h
index 3d3b327f56c..c2edfff8f00 100644
--- a/xlators/features/locks/src/locks.h
+++ b/xlators/features/locks/src/locks.h
@@ -70,6 +70,7 @@ typedef struct __posix_lock posix_lock_t;
struct __pl_inode_lock {
struct list_head list;
struct list_head blocked_locks; /* list_head pointing to blocked_inodelks */
+ struct list_head contend; /* list of contending locks */
int ref;
short fl_type;
@@ -86,6 +87,8 @@ struct __pl_inode_lock {
struct timeval blkd_time; /*time at which lock was queued into blkd list*/
struct timeval granted_time; /*time at which lock was queued into active list*/
+ /*last time at wich lock contention was detected and notified*/
+ struct timespec contention_time;
/* These two together serve to uniquely identify each process
across nodes */
@@ -120,6 +123,7 @@ typedef struct _pl_dom_list pl_dom_list_t;
struct __entry_lock {
struct list_head domain_list; /* list_head back to pl_dom_list_t */
struct list_head blocked_locks; /* list_head back to blocked_entrylks */
+ struct list_head contend; /* list of contending locks */
int ref;
call_frame_t *frame;
@@ -133,6 +137,8 @@ struct __entry_lock {
struct timeval blkd_time; /*time at which lock was queued into blkd list*/
struct timeval granted_time; /*time at which lock was queued into active list*/
+ /*last time at wich lock contention was detected and notified*/
+ struct timespec contention_time;
void *client;
gf_lkowner_t owner;
@@ -194,6 +200,8 @@ typedef struct {
uint32_t revocation_secs;
gf_boolean_t revocation_clear_all;
uint32_t revocation_max_blocked;
+ gf_boolean_t notify_contention;
+ uint32_t notify_contention_delay;
} posix_locks_private_t;
diff --git a/xlators/features/locks/src/pl-messages.h b/xlators/features/locks/src/pl-messages.h
index 7a1e3f488e7..e5a276f35b5 100644
--- a/xlators/features/locks/src/pl-messages.h
+++ b/xlators/features/locks/src/pl-messages.h
@@ -24,7 +24,9 @@
*/
GLFS_MSGID(PL,
- PL_MSG_LOCK_NUMBER
+ PL_MSG_LOCK_NUMBER,
+ PL_MSG_INODELK_CONTENTION_FAILED,
+ PL_MSG_ENTRYLK_CONTENTION_FAILED
);
#endif /* !_PL_MESSAGES_H_ */
diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c
index 78bf160058c..82d77db1164 100644
--- a/xlators/features/locks/src/posix.c
+++ b/xlators/features/locks/src/posix.c
@@ -3641,6 +3641,13 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("revocation-max-blocked",
priv->revocation_max_blocked, options,
uint32, out);
+
+ GF_OPTION_RECONF ("notify-contention", priv->notify_contention,
+ options, bool, out);
+
+ GF_OPTION_RECONF ("notify-contention-delay",
+ priv->notify_contention_delay, options, uint32, out);
+
ret = 0;
out:
@@ -3705,6 +3712,12 @@ init (xlator_t *this)
GF_OPTION_INIT ("revocation-max-blocked", priv->revocation_max_blocked,
uint32, out);
+ GF_OPTION_INIT ("notify-contention", priv->notify_contention, bool,
+ out);
+
+ GF_OPTION_INIT ("notify-contention-delay",
+ priv->notify_contention_delay, uint32, out);
+
this->local_pool = mem_pool_new (pl_local_t, 32);
if (!this->local_pool) {
ret = -1;
@@ -4461,5 +4474,32 @@ struct volume_options options[] = {
"will be revoked to allow the others to proceed. Can "
"be used in conjunction w/ revocation-clear-all."
},
+ { .key = {"notify-contention"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .op_version = { GD_OP_VERSION_4_0_0 },
+ .tags = { "locks", "contention" },
+ .description = "When this option is enabled and a lock request "
+ "conflicts with a currently granted lock, an upcall "
+ "notification will be sent to the current owner of "
+ "the lock to request it to be released as soon as "
+ "possible."
+ },
+ { .key = {"notify-contention-delay"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0, /* An upcall notification is sent every time a conflict is
+ * detected. */
+ .max = 60,
+ .default_value = "5",
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .op_version = { GD_OP_VERSION_4_0_0 },
+ .tags = { "locks", "contention", "timeout" },
+ .description = "This value determines the minimum amount of time "
+ "(in seconds) between upcall contention notifications "
+ "on the same inode. If multiple lock requests are "
+ "received during this period, only one upcall will "
+ "be sent."
+ },
{ .key = {NULL} },
};
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index da85cb5297f..e0ec3368ca7 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1484,6 +1484,16 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = GD_OP_VERSION_3_13_0,
.flags = VOLOPT_FLAG_CLIENT_OPT
},
+ { .key = "disperse.eager-lock-timeout",
+ .voltype = "cluster/disperse",
+ .op_version = GD_OP_VERSION_4_0_0,
+ .flags = VOLOPT_FLAG_CLIENT_OPT
+ },
+ { .key = "disperse.other-eager-lock-timeout",
+ .voltype = "cluster/disperse",
+ .op_version = GD_OP_VERSION_4_0_0,
+ .flags = VOLOPT_FLAG_CLIENT_OPT
+ },
{ .key = "cluster.quorum-type",
.voltype = "cluster/replicate",
.option = "quorum-type",
@@ -3489,6 +3499,16 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = GD_OP_VERSION_3_9_0,
.type = NO_DOC,
},
+ { .option = "notify-contention",
+ .key = "features.locks-notify-contention",
+ .voltype = "features/locks",
+ .op_version = GD_OP_VERSION_4_0_0,
+ },
+ { .option = "notify-contention-delay",
+ .key = "features.locks-notify-contention-delay",
+ .voltype = "features/locks",
+ .op_version = GD_OP_VERSION_4_0_0,
+ },
{ .key = "disperse.shd-max-threads",
.voltype = "cluster/disperse",
.op_version = GD_OP_VERSION_3_9_0,
diff --git a/xlators/protocol/client/src/client-callback.c b/xlators/protocol/client/src/client-callback.c
index 51164e57230..b2f9a225887 100644
--- a/xlators/protocol/client/src/client-callback.c
+++ b/xlators/protocol/client/src/client-callback.c
@@ -173,6 +173,101 @@ out:
return 0;
}
+int
+client_cbk_inodelk_contention (struct rpc_clnt *rpc, void *mydata, void *data)
+{
+ int ret = -1;
+ struct iovec *iov = NULL;
+ struct gf_upcall upcall_data = {0,};
+ struct gf_upcall_inodelk_contention lc = {{0,},};
+ gfs4_inodelk_contention_req proto_lc = {{0,},};
+
+ GF_VALIDATE_OR_GOTO ("client-callback", rpc, out);
+ GF_VALIDATE_OR_GOTO ("client-callback", mydata, out);
+ GF_VALIDATE_OR_GOTO ("client-callback", data, out);
+
+ iov = (struct iovec *)data;
+ ret = xdr_to_generic (*iov, &proto_lc,
+ (xdrproc_t)xdr_gfs4_inodelk_contention_req);
+
+ if (ret < 0) {
+ gf_msg (THIS->name, GF_LOG_WARNING, -ret,
+ PC_MSG_INODELK_CONTENTION_FAIL,
+ "XDR decode of inodelk contention failed.");
+ goto out;
+ }
+
+ upcall_data.data = &lc;
+ ret = gf_proto_inodelk_contention_to_upcall (&proto_lc, &upcall_data);
+ if (ret < 0)
+ goto out;
+
+ upcall_data.event_type = GF_UPCALL_INODELK_CONTENTION;
+
+ default_notify (THIS, GF_EVENT_UPCALL, &upcall_data);
+
+out:
+ if (proto_lc.domain)
+ free (proto_lc.domain);
+
+ if (proto_lc.xdata.xdata_val)
+ free (proto_lc.xdata.xdata_val);
+
+ if (lc.xdata)
+ dict_unref (lc.xdata);
+
+ return ret;
+}
+
+int
+client_cbk_entrylk_contention (struct rpc_clnt *rpc, void *mydata, void *data)
+{
+ int ret = -1;
+ struct iovec *iov = NULL;
+ struct gf_upcall upcall_data = {0,};
+ struct gf_upcall_entrylk_contention lc = {0,};
+ gfs4_entrylk_contention_req proto_lc = {{0,},};
+
+ GF_VALIDATE_OR_GOTO ("client-callback", rpc, out);
+ GF_VALIDATE_OR_GOTO ("client-callback", mydata, out);
+ GF_VALIDATE_OR_GOTO ("client-callback", data, out);
+
+ iov = (struct iovec *)data;
+ ret = xdr_to_generic (*iov, &proto_lc,
+ (xdrproc_t)xdr_gfs4_entrylk_contention_req);
+
+ if (ret < 0) {
+ gf_msg (THIS->name, GF_LOG_WARNING, -ret,
+ PC_MSG_ENTRYLK_CONTENTION_FAIL,
+ "XDR decode of entrylk contention failed.");
+ goto out;
+ }
+
+ upcall_data.data = &lc;
+ ret = gf_proto_entrylk_contention_to_upcall (&proto_lc, &upcall_data);
+ if (ret < 0)
+ goto out;
+
+ upcall_data.event_type = GF_UPCALL_ENTRYLK_CONTENTION;
+
+ default_notify (THIS, GF_EVENT_UPCALL, &upcall_data);
+
+out:
+ if (proto_lc.name)
+ free (proto_lc.name);
+
+ if (proto_lc.domain)
+ free (proto_lc.domain);
+
+ if (proto_lc.xdata.xdata_val)
+ free (proto_lc.xdata.xdata_val);
+
+ if (lc.xdata)
+ dict_unref (lc.xdata);
+
+ return ret;
+}
+
rpcclnt_cb_actor_t gluster_cbk_actors[GF_CBK_MAXVALUE] = {
[GF_CBK_NULL] = {"NULL", GF_CBK_NULL, client_cbk_null },
[GF_CBK_FETCHSPEC] = {"FETCHSPEC", GF_CBK_FETCHSPEC, client_cbk_fetchspec },
@@ -181,6 +276,8 @@ rpcclnt_cb_actor_t gluster_cbk_actors[GF_CBK_MAXVALUE] = {
[GF_CBK_CHILD_UP] = {"CHILD_UP", GF_CBK_CHILD_UP, client_cbk_child_up },
[GF_CBK_CHILD_DOWN] = {"CHILD_DOWN", GF_CBK_CHILD_DOWN, client_cbk_child_down },
[GF_CBK_RECALL_LEASE] = {"RECALL_LEASE", GF_CBK_RECALL_LEASE, client_cbk_recall_lease },
+ [GF_CBK_INODELK_CONTENTION] = {"INODELK_CONTENTION", GF_CBK_INODELK_CONTENTION, client_cbk_inodelk_contention },
+ [GF_CBK_ENTRYLK_CONTENTION] = {"ENTRYLK_CONTENTION", GF_CBK_ENTRYLK_CONTENTION, client_cbk_entrylk_contention },
};
diff --git a/xlators/protocol/client/src/client-messages.h b/xlators/protocol/client/src/client-messages.h
index 86b721bd593..5f146c67efe 100644
--- a/xlators/protocol/client/src/client-messages.h
+++ b/xlators/protocol/client/src/client-messages.h
@@ -89,7 +89,9 @@ GLFS_MSGID(PC,
PC_MSG_CACHE_INVALIDATION_FAIL,
PC_MSG_CHILD_STATUS,
PC_MSG_GFID_NULL,
- PC_MSG_RECALL_LEASE_FAIL
+ PC_MSG_RECALL_LEASE_FAIL,
+ PC_MSG_INODELK_CONTENTION_FAIL,
+ PC_MSG_ENTRYLK_CONTENTION_FAIL
);
#endif /* !_PC_MESSAGES_H__ */
diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c
index 7154355e690..66122318c79 100644
--- a/xlators/protocol/server/src/server.c
+++ b/xlators/protocol/server/src/server.c
@@ -1349,6 +1349,8 @@ server_process_event_upcall (xlator_t *this, void *data)
enum gf_cbk_procnum cbk_procnum = GF_CBK_NULL;
gfs3_cbk_cache_invalidation_req gf_c_req = {0,};
gfs3_recall_lease_req gf_recall_lease = {{0,},};
+ gfs4_inodelk_contention_req gf_inodelk_contention = {{0},};
+ gfs4_entrylk_contention_req gf_entrylk_contention = {{0},};
xdrproc_t xdrproc;
GF_VALIDATE_OR_GOTO(this->name, data, out);
@@ -1358,7 +1360,16 @@ server_process_event_upcall (xlator_t *this, void *data)
upcall_data = (struct gf_upcall *)data;
client_uid = upcall_data->client_uid;
- GF_VALIDATE_OR_GOTO(this->name, client_uid, out);
+ /* client_uid could be NULL if the upcall was intended for a server's
+ * child xlator (so no client_uid available) but it hasn't handled
+ * the notification. For this reason we silently ignore any upcall
+ * request with a NULL client_uid, but -1 will be returned.
+ */
+ if (client_uid == NULL) {
+ gf_msg_debug(this->name, 0,
+ "NULL client_uid for an upcall request");
+ goto out;
+ }
switch (upcall_data->event_type) {
case GF_UPCALL_CACHE_INVALIDATION:
@@ -1381,6 +1392,28 @@ server_process_event_upcall (xlator_t *this, void *data)
cbk_procnum = GF_CBK_RECALL_LEASE;
xdrproc = (xdrproc_t)xdr_gfs3_recall_lease_req;
break;
+ case GF_UPCALL_INODELK_CONTENTION:
+ ret = gf_proto_inodelk_contention_from_upcall (this,
+ &gf_inodelk_contention,
+ upcall_data);
+ if (ret < 0)
+ goto out;
+
+ up_req = &gf_inodelk_contention;
+ cbk_procnum = GF_CBK_INODELK_CONTENTION;
+ xdrproc = (xdrproc_t)xdr_gfs4_inodelk_contention_req;
+ break;
+ case GF_UPCALL_ENTRYLK_CONTENTION:
+ ret = gf_proto_entrylk_contention_from_upcall (this,
+ &gf_entrylk_contention,
+ upcall_data);
+ if (ret < 0)
+ goto out;
+
+ up_req = &gf_entrylk_contention;
+ cbk_procnum = GF_CBK_ENTRYLK_CONTENTION;
+ xdrproc = (xdrproc_t)xdr_gfs4_entrylk_contention_req;
+ break;
default:
gf_msg (this->name, GF_LOG_WARNING, EINVAL,
PS_MSG_INVALID_ENTRY,
@@ -1417,11 +1450,10 @@ server_process_event_upcall (xlator_t *this, void *data)
pthread_mutex_unlock (&conf->mutex);
ret = 0;
out:
- if ((gf_c_req.xdata).xdata_val)
- GF_FREE ((gf_c_req.xdata).xdata_val);
-
- if ((gf_recall_lease.xdata).xdata_val)
- GF_FREE ((gf_recall_lease.xdata).xdata_val);
+ GF_FREE ((gf_c_req.xdata).xdata_val);
+ GF_FREE ((gf_recall_lease.xdata).xdata_val);
+ GF_FREE ((gf_inodelk_contention.xdata).xdata_val);
+ GF_FREE ((gf_entrylk_contention.xdata).xdata_val);
return ret;
}