path: root/xlators/cluster/afr/src/afr-transaction.c
author     Pranith Kumar K <pkarampu@redhat.com>          2018-01-31 16:41:14 +0530
committer  Pranith Kumar Karampuri <pkarampu@redhat.com>  2018-03-14 13:32:35 +0000
commit     346714305f9de30d5f78494091770c1555c601bb (patch)
tree       ebdb744fd1858a98495e77069cb4e670b2ef87c6  /xlators/cluster/afr/src/afr-transaction.c
parent     f32f85c4e6c8128643e1f88fe981a63680e79fe0 (diff)
cluster/afr: Make AFR eager-locking similar to EC
Problem:
1) AFR's eager-lock only works for data transactions.
2) When there are conflicting writes, a write to a conflicting region initiates an unlock of the eager-lock, leading to extra pre-ops and post-ops on the file. When eager-lock goes off, it causes extra fsyncs for random-write workloads in AFR.

Solution (modeled after EC):
In EC, when there is a conflicting write, it waits for the current write to complete before it winds the conflicting write. This leads to better utilization of the network and the disk, because we avoid the extra xattrops, FSYNCs and inodelk/unlock calls. Fd-based counters are moved to inode-based counters. The solution is modeled on EC's locking, but it is not identical, because AFR has to preserve backward compatibility.

Lifecycle of the lock:
======================
The first transaction is added to the inode->owners list and an inodelk is sent on the wire. All subsequent transactions are put in the inode->waiters list until the first transaction completes the inodelk and the [f]xattrop. Once the [f]xattrop also completes, every request in the inode->waiters list is checked for conflicts with the locks already in the inode->owners list; if there is no conflict, it is added to the inode->owners list and resumed to perform its transaction. When these transactions complete the fop phase, they are moved to the inode->post_op list and resume the transactions that were paused because of conflicts. Post-op and unlock are not issued on the wire until the last transaction on that inode.

When the last transaction has to perform its post-op, it can choose to sleep for post-op-delay-secs. If another transaction arrives during that time, it wakes up the sleeping transaction, takes over ownership of the lock, and the cycle continues. If post-op-delay-secs expires, the timer thread wakes up the sleeping transaction, which sets lock->release to true and starts doing the post-op and then the unlock. Any transactions that arrive during this time are put in the inode->frozen list. Once the previous unlock completes, the frozen list is moved to the waiters list, the first element of the waiters list is moved to the owners list, the lock is attempted and the cycle continues.

This is the general idea. There is additional logic at delay time, when a new transaction arrives, and in the flush fop, to wake up existing sleeping transactions or decide whether to delay a transaction; this is subject to change with future enhancements.

Fixes: #418
BUG: 1549606
Change-Id: I88b570bbcf332a27c82d2767dfa82472f60055dc
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
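The following is a minimal, self-contained C sketch of the list transitions described above, not the AFR implementation: the real code keeps these queues as GlusterFS list_head lists inside afr_lock_t in the inode context and protects them with inode->lock; the txn/eager_lock types and the push/pop/txn_arrives/unlock_done helpers here are hypothetical stand-ins that only model how a transaction is routed through the four lists.

```c
#include <stdio.h>

struct txn {
        int         id;
        struct txn *next;
};

/* The four queues a transaction moves through, standing in for the
 * inode->owners, inode->waiters, inode->post_op and inode->frozen lists
 * described above (the real lists are FIFO list_heads; simple LIFO
 * stacks are enough for this one-element-at-a-time example). */
struct eager_lock {
        struct txn *owners;   /* holding the inodelk, fop in progress      */
        struct txn *waiters;  /* waiting for inodelk + pre-op to complete  */
        struct txn *post_op;  /* fop done, waiting to issue the post-op    */
        struct txn *frozen;   /* arrived while post-op/unlock is in flight */
        int         release;  /* lock->release: stop admitting new owners  */
};

static void push (struct txn **q, struct txn *t)
{
        t->next = *q;
        *q = t;
}

static struct txn *pop (struct txn **q)
{
        struct txn *t = *q;

        if (t)
                *q = t->next;
        return t;
}

/* A new transaction either becomes the first owner (and would wind the
 * inodelk), queues behind the current owners, or is frozen when the
 * unlock has already been decided. */
static void txn_arrives (struct eager_lock *l, struct txn *t)
{
        if (l->release)
                push (&l->frozen, t);
        else if (!l->owners)
                push (&l->owners, t);   /* first transaction: take the lock */
        else
                push (&l->waiters, t);  /* piggyback on the owner's lock    */
}

/* When the unlock completes, frozen transactions become waiters and the
 * first waiter restarts the cycle as the new owner. */
static void unlock_done (struct eager_lock *l)
{
        l->release = 0;
        while (l->frozen)
                push (&l->waiters, pop (&l->frozen));
        if (l->waiters)
                push (&l->owners, pop (&l->waiters));
}

int main (void)
{
        struct eager_lock l = {0};
        struct txn a = {1, NULL}, b = {2, NULL};

        txn_arrives (&l, &a);   /* 'a' owns the eager-lock, inodelk winds   */
        pop (&l.owners);        /* 'a' finished its fop and is the last one */
        l.release = 1;          /* delay expired: post-op + unlock start    */
        txn_arrives (&l, &b);   /* 'b' arrives mid-unlock: goes to frozen   */
        unlock_done (&l);       /* 'b': frozen -> waiters -> owners         */
        printf ("next owner: %d\n", l.owners ? l.owners->id : -1);
        return 0;
}
```

Compiling and running this prints "next owner: 2", mirroring how a transaction that arrives while the unlock is in flight is parked in the frozen list and only becomes an owner once the previous unlock completes. The real code additionally handles pre-op/changelog inheritance, conflict checks for overlapping writes and the delayed post-op timer, all of which this sketch omits.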
Diffstat (limited to 'xlators/cluster/afr/src/afr-transaction.c')
-rw-r--r--  xlators/cluster/afr/src/afr-transaction.c  | 913
1 file changed, 516 insertions(+), 397 deletions(-)
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index a253c0835f5..ec72d46fb36 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -25,6 +25,18 @@ typedef enum {
AFR_TRANSACTION_POST_OP,
} afr_xattrop_type_t;
+static void
+afr_lock_resume_shared (struct list_head *list);
+
+void
+__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared);
+
+void
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this);
+
+int
+afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this);
+
gf_boolean_t
afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this);
@@ -168,13 +180,14 @@ afr_transaction_fop (call_frame_t *frame, xlator_t *this)
return 0;
}
-
int
afr_transaction_done (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- gf_boolean_t unwind = _gf_false;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t unwind = _gf_false;
+ afr_lock_t *lock = NULL;
+ afr_local_t *lock_local = NULL;
priv = this->private;
local = frame->local;
@@ -188,6 +201,31 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this)
if (unwind)/*It definitely did post-op*/
afr_zero_fill_stat (local);
}
+
+ if (local->transaction.do_eager_unlock) {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK (&local->inode->lock);
+ {
+ lock->acquired = _gf_false;
+ lock->release = _gf_false;
+ list_splice_init (&lock->frozen,
+ &lock->waiting);
+ if (list_empty (&lock->waiting))
+ goto unlock;
+ lock_local = list_entry (lock->waiting.next,
+ afr_local_t,
+ transaction.wait_list);
+ list_del_init (&lock_local->transaction.wait_list);
+ list_add (&lock_local->transaction.owner_list,
+ &lock->owners);
+ }
+unlock:
+ UNLOCK (&local->inode->lock);
+ }
+ if (lock_local) {
+ afr_lock (lock_local->transaction.frame,
+ lock_local->transaction.frame->this);
+ }
local->transaction.unwind (frame, this);
AFR_STACK_DESTROY (frame);
@@ -195,6 +233,52 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this)
return 0;
}
+static void
+afr_lock_fail_shared (afr_local_t *local, struct list_head *list)
+{
+ afr_local_t *each = NULL;
+
+ while (!list_empty(list)) {
+ each = list_entry (list->next, afr_local_t,
+ transaction.wait_list);
+ list_del_init(&each->transaction.wait_list);
+ each->op_ret = -1;
+ each->op_errno = local->op_errno;
+ afr_transaction_done (each->transaction.frame,
+ each->transaction.frame->this);
+ }
+}
+
+static void
+afr_handle_lock_acquire_failure (afr_local_t *local, gf_boolean_t locked)
+{
+ struct list_head shared;
+ afr_lock_t *lock = NULL;
+
+ if (!local->transaction.eager_lock_on)
+ goto out;
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+
+ INIT_LIST_HEAD (&shared);
+ LOCK (&local->inode->lock);
+ {
+ list_splice_init (&lock->waiting, &shared);
+ }
+ UNLOCK (&local->inode->lock);
+
+ afr_lock_fail_shared (local, &shared);
+ local->transaction.do_eager_unlock = _gf_true;
+out:
+ if (locked) {
+ local->internal_lock.lock_cbk = afr_transaction_done;
+ afr_unlock (local->transaction.frame,
+ local->transaction.frame->this);
+ } else {
+ afr_transaction_done (local->transaction.frame,
+ local->transaction.frame->this);
+ }
+}
call_frame_t*
afr_transaction_detach_fop_frame (call_frame_t *frame)
@@ -334,6 +418,7 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
int pre_op_sources_count = 0;
+ int i = 0;
priv = this->private;
local = frame->local;
@@ -345,11 +430,11 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
/* If arbiter is the only source, do not proceed. */
if (pre_op_sources_count < 2 &&
local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) {
- local->internal_lock.lock_cbk = afr_transaction_done;
local->op_ret = -1;
local->op_errno = ENOTCONN;
- afr_restore_lk_owner (frame);
- afr_unlock (frame, this);
+ for (i = 0; i < priv->child_count; i++)
+ local->transaction.failed_subvols[i] = 1;
+ afr_changelog_post_op (frame, this);/*uninherit should happen*/
} else {
afr_transaction_fop (frame, this);
}
@@ -362,14 +447,16 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- fd_t *fd = NULL;
int i = 0;
int ret = 0;
+ int failure_count = 0;
+ struct list_head shared;
+ afr_lock_t *lock = NULL;
local = frame->local;
priv = this->private;
- fd = local->fd;
+ INIT_LIST_HEAD (&shared);
if (local->transaction.type == AFR_DATA_TRANSACTION &&
!local->transaction.inherited) {
ret = afr_write_subvol_set (frame, this);
@@ -394,22 +481,31 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
just now, before OP */
afr_changelog_pre_op_update (frame, this);
- /* The wake up needs to happen independent of
- what type of fop arrives here. If it was
- a write, then it has already inherited the
- lock and changelog. If it was not a write,
- then the presumption of the optimization (of
- optimizing for successive write operations)
- fails.
- */
- if (fd)
- afr_delayed_changelog_wake_up (this, fd);
+ if (!local->transaction.eager_lock_on ||
+ local->transaction.inherited)
+ goto fop;
+ failure_count = AFR_COUNT (local->transaction.failed_subvols,
+ priv->child_count);
+ if (failure_count == priv->child_count) {
+ afr_handle_lock_acquire_failure (local, _gf_true);
+ } else {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK (&local->inode->lock);
+ {
+ lock->acquired = _gf_true;
+ __afr_transaction_wake_shared (local, &shared);
+ }
+ UNLOCK (&local->inode->lock);
+ }
+
+fop:
if (priv->arbiter_count == 1) {
afr_txn_arbitrate_fop (frame, this);
} else {
afr_transaction_fop (frame, this);
}
+ afr_lock_resume_shared (&shared);
return 0;
}
@@ -486,30 +582,14 @@ afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this)
}
-afr_inodelk_t*
-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom)
-{
- afr_inodelk_t *inodelk = NULL;
- int i = 0;
-
- for (i = 0; int_lock->inodelk[i].domain; i++) {
- inodelk = &int_lock->inodelk[i];
- if (strcmp (dom, inodelk->domain) == 0)
- return inodelk;
- }
- return NULL;
-}
-
unsigned char*
afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)
{
unsigned char *locked_nodes = NULL;
- afr_inodelk_t *inodelk = NULL;
switch (type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- locked_nodes = inodelk->locked_nodes;
+ locked_nodes = int_lock->locked_nodes;
break;
case AFR_ENTRY_TRANSACTION:
@@ -834,27 +914,19 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- fd_t *fd = NULL;
+ afr_inode_ctx_t *ctx = NULL;
int i = 0;
gf_boolean_t ret = _gf_false;
- afr_fd_ctx_t *fd_ctx = NULL;
int type = 0;
local = frame->local;
priv = this->private;
- fd = local->fd;
+ ctx = local->inode_ctx;
type = afr_index_for_transaction_type (local->transaction.type);
if (type != AFR_DATA_TRANSACTION)
return !local->transaction.dirtied;
- if (!fd)
- return !local->transaction.dirtied;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- return _gf_false;
-
if (local->transaction.no_uninherit)
return _gf_false;
@@ -868,34 +940,34 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
if (local->transaction.uninherit_done)
return local->transaction.uninherit_value;
- LOCK(&fd->lock);
+ LOCK(&local->inode->lock);
{
for (i = 0; i < priv->child_count; i++) {
if (local->transaction.pre_op[i] !=
- fd_ctx->pre_op_done[type][i]) {
+ ctx->pre_op_done[type][i]) {
ret = !local->transaction.dirtied;
goto unlock;
}
}
- if (fd_ctx->inherited[type]) {
+ if (ctx->inherited[type]) {
ret = _gf_true;
- fd_ctx->inherited[type]--;
- } else if (fd_ctx->on_disk[type]) {
+ ctx->inherited[type]--;
+ } else if (ctx->on_disk[type]) {
ret = _gf_false;
- fd_ctx->on_disk[type]--;
+ ctx->on_disk[type]--;
} else {
/* ASSERT */
ret = _gf_false;
}
- if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) {
+ if (!ctx->inherited[type] && !ctx->on_disk[type]) {
for (i = 0; i < priv->child_count; i++)
- fd_ctx->pre_op_done[type][i] = 0;
+ ctx->pre_op_done[type][i] = 0;
}
}
unlock:
- UNLOCK(&fd->lock);
+ UNLOCK(&local->inode->lock);
local->transaction.uninherit_done = _gf_true;
local->transaction.uninherit_value = ret;
@@ -909,31 +981,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- fd_t *fd = NULL;
int i = 0;
gf_boolean_t ret = _gf_false;
- afr_fd_ctx_t *fd_ctx = NULL;
int type = 0;
local = frame->local;
priv = this->private;
- fd = local->fd;
if (local->transaction.type != AFR_DATA_TRANSACTION)
return _gf_false;
type = afr_index_for_transaction_type (local->transaction.type);
- if (!fd)
- return _gf_false;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- return _gf_false;
-
- LOCK(&fd->lock);
+ LOCK(&local->inode->lock);
{
- if (!fd_ctx->on_disk[type]) {
+ if (!local->inode_ctx->on_disk[type]) {
/* nothing to inherit yet */
ret = _gf_false;
goto unlock;
@@ -941,21 +1003,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
for (i = 0; i < priv->child_count; i++) {
if (local->transaction.pre_op[i] !=
- fd_ctx->pre_op_done[type][i]) {
+ local->inode_ctx->pre_op_done[type][i]) {
/* either inherit exactly, or don't */
ret = _gf_false;
goto unlock;
}
}
- fd_ctx->inherited[type]++;
+ local->inode_ctx->inherited[type]++;
ret = _gf_true;
local->transaction.inherited = _gf_true;
}
unlock:
- UNLOCK(&fd->lock);
+ UNLOCK(&local->inode->lock);
return ret;
}
@@ -966,22 +1028,16 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- fd_t *fd = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
int i = 0;
gf_boolean_t ret = _gf_false;
int type = 0;
local = frame->local;
priv = this->private;
- fd = local->fd;
- if (!fd)
- return _gf_false;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- return _gf_false;
+ if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
+ return _gf_false;
if (local->transaction.inherited)
/* was already inherited in afr_changelog_pre_op */
@@ -997,26 +1053,26 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
ret = _gf_false;
- LOCK(&fd->lock);
+ LOCK(&local->inode->lock);
{
- if (!fd_ctx->on_disk[type]) {
+ if (!local->inode_ctx->on_disk[type]) {
for (i = 0; i < priv->child_count; i++)
- fd_ctx->pre_op_done[type][i] =
+ local->inode_ctx->pre_op_done[type][i] =
(!local->transaction.failed_subvols[i]);
} else {
for (i = 0; i < priv->child_count; i++)
- if (fd_ctx->pre_op_done[type][i] !=
+ if (local->inode_ctx->pre_op_done[type][i] !=
(!local->transaction.failed_subvols[i])) {
local->transaction.no_uninherit = 1;
goto unlock;
}
}
- fd_ctx->on_disk[type]++;
+ local->inode_ctx->on_disk[type]++;
ret = _gf_true;
}
unlock:
- UNLOCK(&fd->lock);
+ UNLOCK(&local->inode->lock);
return ret;
}
@@ -1322,6 +1378,9 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
afr_init_optimistic_changelog_for_txn (this, local);
+ if (afr_changelog_pre_op_inherit (frame, this))
+ goto next;
+
/* This condition should not be met with present code, as
* transaction.done will be called if locks are not acquired on even a
* single node.
@@ -1347,9 +1406,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
goto err;
}
- if (afr_changelog_pre_op_inherit (frame, this))
- goto next;
-
if (call_count < priv->child_count)
pre_nop = _gf_false;
@@ -1406,7 +1462,7 @@ err:
local->op_ret = -1;
local->op_errno = op_errno;
- afr_unlock (frame, this);
+ afr_handle_lock_acquire_failure (local, _gf_true);
if (xdata_req)
dict_unref (xdata_req);
@@ -1416,31 +1472,6 @@ err:
int
-afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- if (int_lock->lock_op_ret < 0) {
- gf_msg (this->name, GF_LOG_INFO,
- 0, AFR_MSG_BLOCKING_LKS_FAILED,
- "Blocking inodelks failed.");
- afr_transaction_done (frame, this);
- } else {
-
- gf_msg_debug (this->name, 0,
- "Blocking inodelks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
- }
-
- return 0;
-}
-
-
-int
afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
@@ -1453,7 +1484,7 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
if (int_lock->lock_op_ret < 0) {
gf_msg_debug (this->name, 0,
"Non blocking inodelks failed. Proceeding to blocking");
- int_lock->lock_cbk = afr_post_blocking_inodelk_cbk;
+ int_lock->lock_cbk = afr_internal_lock_finish;
afr_blocking_lock (frame, this);
} else {
@@ -1467,31 +1498,6 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
int
-afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- if (int_lock->lock_op_ret < 0) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_BLOCKING_LKS_FAILED,
- "Blocking entrylks failed.");
- afr_transaction_done (frame, this);
- } else {
-
- gf_msg_debug (this->name, 0,
- "Blocking entrylks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
- }
-
- return 0;
-}
-
-
-int
afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
@@ -1504,7 +1510,7 @@ afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
if (int_lock->lock_op_ret < 0) {
gf_msg_debug (this->name, 0,
"Non blocking entrylks failed. Proceeding to blocking");
- int_lock->lock_cbk = afr_post_blocking_entrylk_cbk;
+ int_lock->lock_cbk = afr_internal_lock_finish;
afr_blocking_lock (frame, this);
} else {
@@ -1565,29 +1571,28 @@ int
afr_set_transaction_flock (xlator_t *this, afr_local_t *local)
{
afr_internal_lock_t *int_lock = NULL;
- afr_inodelk_t *inodelk = NULL;
afr_private_t *priv = NULL;
int_lock = &local->internal_lock;
- inodelk = afr_get_inodelk (int_lock, int_lock->domain);
priv = this->private;
- if ((priv->arbiter_count || priv->full_lock) &&
+ if ((priv->arbiter_count || local->transaction.eager_lock_on ||
+ priv->full_lock) &&
local->transaction.type == AFR_DATA_TRANSACTION) {
/*Lock entire file to avoid network split brains.*/
- inodelk->flock.l_len = 0;
- inodelk->flock.l_start = 0;
+ int_lock->flock.l_len = 0;
+ int_lock->flock.l_start = 0;
} else {
- inodelk->flock.l_len = local->transaction.len;
- inodelk->flock.l_start = local->transaction.start;
+ int_lock->flock.l_len = local->transaction.len;
+ int_lock->flock.l_start = local->transaction.start;
}
- inodelk->flock.l_type = F_WRLCK;
+ int_lock->flock.l_type = F_WRLCK;
return 0;
}
int
-afr_lock_rec (call_frame_t *frame, xlator_t *this)
+afr_lock (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
@@ -1628,74 +1633,153 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)
return 0;
}
+static gf_boolean_t
+afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
+{
+ uint64_t start1 = local1->transaction.start;
+ uint64_t start2 = local2->transaction.start;
+ uint64_t end1 = 0;
+ uint64_t end2 = 0;
+
+ if (local1->transaction.len)
+ end1 = start1 + local1->transaction.len - 1;
+ else
+ end1 = ULLONG_MAX;
+
+ if (local2->transaction.len)
+ end2 = start2 + local2->transaction.len - 1;
+ else
+ end2 = ULLONG_MAX;
-int
-afr_lock (call_frame_t *frame, xlator_t *this)
+ return ((end1 >= start2) && (end2 >= start1));
+}
+
+gf_boolean_t
+afr_has_lock_conflict (afr_local_t *local, gf_boolean_t waitlist_check)
{
- afr_set_lock_number (frame, this);
+ afr_local_t *each = NULL;
+ afr_lock_t *lock = NULL;
- return afr_lock_rec (frame, this);
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ /*
+ * Once full file lock is acquired in eager-lock phase, overlapping
+ * writes do not compete for inode-locks, instead are transferred to the
+ * next writes. Because of this overlapping writes are not ordered.
+ * This can cause inconsistencies in replication.
+ * Example:
+ * Two overlapping writes w1, w2 are sent in parallel on same fd
+ * in two threads t1, t2.
+ * Both threads can execute afr_writev_wind in the following manner.
+ * t1 winds w1 on brick-0
+ * t2 winds w2 on brick-0
+ * t2 winds w2 on brick-1
+ * t1 winds w1 on brick-1
+ *
+ * This check makes sure the locks are not transferred for
+ * overlapping writes.
+ */
+ list_for_each_entry (each, &lock->owners, transaction.owner_list) {
+ if (afr_locals_overlap (each, local)) {
+ return _gf_true;
+ }
+ }
+
+ if (!waitlist_check)
+ return _gf_false;
+ list_for_each_entry (each, &lock->waiting, transaction.wait_list) {
+ if (afr_locals_overlap (each, local)) {
+ return _gf_true;
+ }
+ }
+ return _gf_false;
}
/* }}} */
-
-int
-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
+static void
+afr_copy_inodelk_vars (afr_internal_lock_t *dst, afr_internal_lock_t *src,
+ xlator_t *this)
{
- afr_changelog_pre_op (frame, this);
+ afr_private_t *priv = this->private;
- return 0;
+ dst->domain = src->domain;
+ dst->flock.l_len = src->flock.l_len;
+ dst->flock.l_start = src->flock.l_start;
+ dst->flock.l_type = src->flock.l_type;
+ dst->lock_count = src->lock_count;
+ memcpy (dst->locked_nodes, src->locked_nodes,
+ priv->child_count * sizeof (*dst->locked_nodes));
}
-
void
-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this)
+__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ gf_boolean_t conflict = _gf_false;
+ afr_local_t *each = NULL;
+ afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type];
- /* call this function from any of the related optimizations
- which benefit from delaying post op are enabled, namely:
-
- - changelog piggybacking
- - eager locking
- */
+ while (!conflict) {
+ if (list_empty (&lock->waiting))
+ return;
+ each = list_entry(lock->waiting.next, afr_local_t,
+ transaction.wait_list);
+ if (afr_has_lock_conflict (each, _gf_false)) {
+ conflict = _gf_true;
+ }
+ if (conflict && !list_empty (&lock->owners))
+ return;
+ afr_copy_inodelk_vars (&each->internal_lock,
+ &local->internal_lock,
+ each->transaction.frame->this);
+ list_move_tail (&each->transaction.wait_list, shared);
+ list_add_tail(&each->transaction.owner_list, &lock->owners);
+ }
+}
- priv = this->private;
- if (!priv)
- return;
+static void
+afr_lock_resume_shared (struct list_head *list)
+{
+ afr_local_t *each = NULL;
- if (!priv->post_op_delay_secs)
- return;
+ while (!list_empty(list)) {
+ each = list_entry(list->next, afr_local_t,
+ transaction.wait_list);
+ list_del_init(&each->transaction.wait_list);
+ afr_changelog_pre_op (each->transaction.frame,
+ each->transaction.frame->this);
+ }
+}
- local = frame->local;
- if (!local)
- return;
+int
+afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = frame->local;
+ afr_lock_t *lock = NULL;
- if (!local->transaction.eager_lock_on)
- return;
- if (!local->fd)
- return;
+ local->internal_lock.lock_cbk = NULL;
+ if (!local->transaction.eager_lock_on) {
+ if (local->internal_lock.lock_op_ret < 0) {
+ afr_transaction_done (frame, this);
+ return 0;
+ }
+ afr_changelog_pre_op (frame, this);
+ } else {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (local->internal_lock.lock_op_ret < 0) {
+ afr_handle_lock_acquire_failure (local, _gf_false);
+ } else {
+ lock->event_generation = local->event_generation;
+ afr_changelog_pre_op (frame, this);
+ }
+ }
- if (local->op == GF_FOP_WRITE)
- local->delayed_post_op = _gf_true;
+ return 0;
}
gf_boolean_t
-afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
+afr_are_multiple_fds_opened (afr_local_t *local, xlator_t *this)
{
- afr_fd_ctx_t *fd_ctx = NULL;
-
- if (!fd) {
- /* If false is returned, it may keep on taking eager-lock
- * which may lead to starvation, so return true to avoid that.
- */
- gf_msg_callingfn (this->name, GF_LOG_ERROR, EBADF,
- AFR_MSG_INVALID_ARG, "Invalid fd");
- return _gf_true;
- }
/* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
* is taken mount2 opened the same file, it won't be able to
* perform any data operations until mount1 releases eager-lock.
@@ -1703,11 +1787,7 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
* if open-fd-count is > 1
*/
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- return _gf_true;
-
- if (fd_ctx->open_fd_count > 1)
+ if (local->inode_ctx->open_fd_count > 1)
return _gf_true;
return _gf_false;
@@ -1715,24 +1795,45 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
gf_boolean_t
-is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this)
+afr_is_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this,
+ int delay)
{
- afr_local_t *local = NULL;
- gf_boolean_t res = _gf_false;
+ afr_local_t *local = NULL;
+ afr_lock_t *lock = NULL;
+ gf_boolean_t res = _gf_false;
local = frame->local;
- if (!local)
+ lock = &local->inode_ctx->lock[local->transaction.type];
+
+ if (!afr_txn_nothing_failed (frame, this)) {
+ lock->release = _gf_true;
goto out;
+ }
- if (!local->delayed_post_op)
+ if (afr_are_multiple_fds_opened (local, this)) {
+ lock->release = _gf_true;
goto out;
+ }
- //Mark pending changelog ASAP
- if (!afr_txn_nothing_failed (frame, this))
+ if (!list_empty (&lock->owners))
+ goto out;
+ else
+ GF_ASSERT (list_empty (&lock->waiting));
+
+ if (lock->release) {
+ goto out;
+ }
+
+ if (!delay) {
goto out;
+ }
- if (local->fd && afr_are_multiple_fds_opened (local->fd, this))
+ if ((local->op != GF_FOP_WRITE) &&
+ (local->op != GF_FOP_FXATTROP)) {
+ /*Only allow writes but shard does [f]xattrops on writes, so
+ * they are fine too*/
goto out;
+ }
res = _gf_true;
out:
@@ -1743,50 +1844,61 @@ out:
void
afr_delayed_changelog_wake_up_cbk (void *data)
{
- fd_t *fd = NULL;
+ afr_lock_t *lock = NULL;
+ afr_local_t *local = data;
+ afr_local_t *timer_local = NULL;
+ struct list_head shared;
- fd = data;
-
- afr_delayed_changelog_wake_up (THIS, fd);
+ INIT_LIST_HEAD (&shared);
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK (&local->inode->lock);
+ {
+ timer_local = list_entry(lock->post_op.next,
+ afr_local_t,
+ transaction.owner_list);
+ if (list_empty (&lock->owners) && (local == timer_local)) {
+ GF_ASSERT (list_empty (&lock->waiting));
+ /*Last owner*/
+ lock->release = _gf_true;
+ lock->delay_timer = NULL;
+ }
+ }
+ UNLOCK (&local->inode->lock);
+ afr_changelog_post_op_now (local->transaction.frame,
+ local->transaction.frame->this);
}
/* SET operation */
int
-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
+afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local)
{
- afr_fd_ctx_t *fdctx = NULL;
-
- fdctx = afr_fd_ctx_get (fd, this);
-
- LOCK(&fd->lock);
+ LOCK(&local->inode->lock);
{
- fdctx->witnessed_unstable_write = _gf_true;
+ local->inode_ctx->witnessed_unstable_write = _gf_true;
}
- UNLOCK(&fd->lock);
+ UNLOCK(&local->inode->lock);
return 0;
}
/* TEST and CLEAR operation */
gf_boolean_t
-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
+afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode)
{
- afr_fd_ctx_t *fdctx = NULL;
+ afr_inode_ctx_t *ctx = NULL;
gf_boolean_t witness = _gf_false;
- fdctx = afr_fd_ctx_get (fd, this);
- if (!fdctx)
- return _gf_true;
-
- LOCK(&fd->lock);
+ LOCK(&inode->lock);
{
- if (fdctx->witnessed_unstable_write) {
+ (void)__afr_inode_ctx_get (this, inode, &ctx);
+
+ if (ctx->witnessed_unstable_write) {
witness = _gf_true;
- fdctx->witnessed_unstable_write = _gf_false;
+ ctx->witnessed_unstable_write = _gf_false;
}
}
- UNLOCK (&fd->lock);
+ UNLOCK (&inode->lock);
return witness;
}
@@ -1929,7 +2041,7 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
mark a flag in the fdctx whenever an unstable write is witnessed.
*/
- if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
+ if (!afr_fd_has_witnessed_unstable_write (this, local->inode)) {
afr_changelog_post_op_now (frame, this);
return 0;
}
@@ -1947,87 +2059,64 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
return 0;
}
-
void
-afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
- call_stub_t *stub)
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
{
- afr_fd_ctx_t *fd_ctx = NULL;
- call_frame_t *prev_frame = NULL;
- struct timespec delta = {0, };
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ struct timespec delta = {0, };
+ afr_private_t *priv = NULL;
+ afr_local_t *local = frame->local;
+ afr_lock_t *lock = NULL;
+ gf_boolean_t post_op = _gf_true;
+ struct list_head shared;
priv = this->private;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- goto out;
-
delta.tv_sec = priv->post_op_delay_secs;
delta.tv_nsec = 0;
- pthread_mutex_lock (&fd_ctx->delay_lock);
- {
- prev_frame = fd_ctx->delay_frame;
- fd_ctx->delay_frame = NULL;
- if (fd_ctx->delay_timer)
- gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer);
- fd_ctx->delay_timer = NULL;
- if (!frame)
- goto unlock;
- fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta,
- afr_delayed_changelog_wake_up_cbk,
- fd);
- fd_ctx->delay_frame = frame;
- }
-unlock:
- pthread_mutex_unlock (&fd_ctx->delay_lock);
-
-out:
- if (prev_frame) {
- local = prev_frame->local;
- local->transaction.resume_stub = stub;
- afr_changelog_post_op_now (prev_frame, this);
- } else if (stub) {
- call_resume (stub);
- }
-}
-
-
-void
-afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- if (is_afr_delayed_changelog_post_op_needed (frame, this))
- afr_delayed_changelog_post_op (this, frame, local->fd, NULL);
- else
- afr_changelog_post_op_safe (frame, this);
-}
-
+ INIT_LIST_HEAD (&shared);
+ if (!local->transaction.eager_lock_on)
+ goto out;
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK (&local->inode->lock);
+ {
+ list_del_init (&local->transaction.owner_list);
+ list_add (&local->transaction.owner_list, &lock->post_op);
+ __afr_transaction_wake_shared (local, &shared);
+
+ if (!afr_is_delayed_changelog_post_op_needed (frame, this,
+ delta.tv_sec)) {
+ if (list_empty (&lock->owners))
+ lock->release = _gf_true;
+ goto unlock;
+ }
-/* Wake up the sleeping/delayed post-op, and also register
- a stub to have it resumed after this transaction
- completely finishes.
+ GF_ASSERT (lock->delay_timer == NULL);
+ lock->delay_timer = gf_timer_call_after (this->ctx, delta,
+ afr_delayed_changelog_wake_up_cbk,
+ local);
+ if (!lock->delay_timer) {
+ lock->release = _gf_true;
+ } else {
+ post_op = _gf_false;
+ }
- The @stub gets saved in @local and gets resumed in
- afr_local_cleanup()
- */
-void
-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub)
-{
- afr_delayed_changelog_post_op (this, NULL, fd, stub);
-}
+ }
+unlock:
+ UNLOCK (&local->inode->lock);
+ if (!list_empty (&shared)) {
+ afr_lock_resume_shared (&shared);
+ }
-void
-afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd)
-{
- afr_delayed_changelog_post_op (this, NULL, fd, NULL);
+out:
+ if (post_op) {
+ if (!local->transaction.eager_lock_on || lock->release) {
+ afr_changelog_post_op_safe (frame, this);
+ } else {
+ afr_changelog_post_op_now (frame, this);
+ }
+ }
}
int
@@ -2037,13 +2126,6 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this)
local = frame->local;
- if (local->transaction.eager_lock_on) {
- /* We don't need to retain "local" in the
- fd list anymore, writes to all subvols
- are finished by now */
- afr_remove_eager_lock_stub (local);
- }
-
afr_restore_lk_owner (frame);
afr_handle_symmetric_errors (frame, this);
@@ -2074,114 +2156,149 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
local->transaction.failed_subvols[child_index] = 1;
}
-
-
static gf_boolean_t
-afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
+__need_previous_lock_unlocked (afr_local_t *local)
{
- uint64_t start1 = local1->transaction.start;
- uint64_t start2 = local2->transaction.start;
- uint64_t end1 = 0;
- uint64_t end2 = 0;
-
- if (local1->transaction.len)
- end1 = start1 + local1->transaction.len - 1;
- else
- end1 = ULLONG_MAX;
+ afr_lock_t *lock = NULL;
- if (local2->transaction.len)
- end2 = start2 + local2->transaction.len - 1;
- else
- end2 = ULLONG_MAX;
+ if (!local->transaction.eager_lock_on)
+ return _gf_true;
- return ((end1 >= start2) && (end2 >= start1));
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (!lock->acquired)
+ return _gf_false;
+ if (lock->acquired && lock->event_generation != local->event_generation)
+ return _gf_true;
+ return _gf_false;
}
void
-afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)
+__afr_eager_lock_handle (afr_local_t *local, gf_boolean_t *take_lock,
+ gf_boolean_t *do_pre_op, afr_local_t **timer_local)
{
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fdctx = NULL;
- afr_local_t *each = NULL;
+ afr_lock_t *lock = NULL;
+ afr_local_t *owner_local = NULL;
+ xlator_t *this = local->transaction.frame->this;
- priv = this->private;
-
- if (!local->fd)
- return;
-
- if (local->transaction.type != AFR_DATA_TRANSACTION)
- return;
+ if (local->fd && !afr_are_multiple_fds_opened (local, this)) {
+ local->transaction.eager_lock_on = _gf_true;
+ }
- if (!priv->eager_lock)
- return;
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (__need_previous_lock_unlocked (local)) {
+ if (!list_empty (&lock->owners)) {
+ lock->release = _gf_true;
+ } else if (lock->delay_timer) {
+ lock->release = _gf_true;
+ if (gf_timer_call_cancel (this->ctx,
+ lock->delay_timer)) {
+ /* It will be put in frozen list
+ * in the code flow below*/
+ } else {
+ *timer_local = list_entry(lock->post_op.next,
+ afr_local_t,
+ transaction.owner_list);
+ lock->delay_timer = NULL;
+ }
+ }
+ if (!local->transaction.eager_lock_on)
+ goto out;
+ }
- fdctx = afr_fd_ctx_get (local->fd, this);
- if (!fdctx)
- return;
+ if (lock->release) {
+ list_add_tail (&local->transaction.wait_list,
+ &lock->frozen);
+ *take_lock = _gf_false;
+ goto out;
+ }
- if (afr_are_multiple_fds_opened (local->fd, this))
- return;
- /*
- * Once full file lock is acquired in eager-lock phase, overlapping
- * writes do not compete for inode-locks, instead are transferred to the
- * next writes. Because of this overlapping writes are not ordered.
- * This can cause inconsistencies in replication.
- * Example:
- * Two overlapping writes w1, w2 are sent in parallel on same fd
- * in two threads t1, t2.
- * Both threads can execute afr_writev_wind in the following manner.
- * t1 winds w1 on brick-0
- * t2 winds w2 on brick-0
- * t2 winds w2 on brick-1
- * t1 winds w1 on brick-1
- *
- * This check makes sure the locks are not transferred for
- * overlapping writes.
- */
- LOCK (&local->fd->lock);
- {
- list_for_each_entry (each, &fdctx->eager_locked,
- transaction.eager_locked) {
- if (afr_locals_overlap (each, local)) {
- local->transaction.eager_lock_on = _gf_false;
- goto unlock;
- }
+ if (lock->delay_timer) {
+ *take_lock = _gf_false;
+ if (gf_timer_call_cancel (this->ctx,
+ lock->delay_timer)) {
+ list_add_tail (&local->transaction.wait_list,
+ &lock->frozen);
+ } else {
+ *timer_local = list_entry(lock->post_op.next,
+ afr_local_t,
+ transaction.owner_list);
+ afr_copy_inodelk_vars (&local->internal_lock,
+ &(*timer_local)->internal_lock,
+ this);
+ lock->delay_timer = NULL;
+ *do_pre_op = _gf_true;
+ list_add_tail (&local->transaction.owner_list,
+ &lock->owners);
}
+ goto out;
+ }
- local->transaction.eager_lock_on = _gf_true;
- list_add_tail (&local->transaction.eager_locked,
- &fdctx->eager_locked);
+ if (!list_empty (&lock->owners)) {
+ if (!lock->acquired ||
+ afr_has_lock_conflict (local, _gf_true)) {
+ list_add_tail (&local->transaction.wait_list,
+ &lock->waiting);
+ *take_lock = _gf_false;
+ goto out;
+ }
+ owner_local = list_entry (lock->owners.next,
+ afr_local_t,
+ transaction.owner_list);
+ afr_copy_inodelk_vars (&local->internal_lock,
+ &owner_local->internal_lock,
+ this);
+ *take_lock = _gf_false;
+ *do_pre_op = _gf_true;
}
-unlock:
- UNLOCK (&local->fd->lock);
+
+ if (lock->acquired)
+ GF_ASSERT (!(*take_lock));
+ list_add_tail (&local->transaction.owner_list, &lock->owners);
+out:
+ return;
}
void
-afr_transaction_start (call_frame_t *frame, xlator_t *this)
+afr_transaction_start (afr_local_t *local, xlator_t *this)
{
- afr_local_t *local = frame->local;
- fd_t *fd = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t take_lock = _gf_true;
+ gf_boolean_t do_pre_op = _gf_false;
+ afr_local_t *timer_local = NULL;
- afr_transaction_eager_lock_init (local, this);
+ priv = this->private;
- if (local->fd && local->transaction.eager_lock_on)
- afr_set_lk_owner (frame, this, local->fd);
- else
- afr_set_lk_owner (frame, this, frame->root);
+ if (local->transaction.type != AFR_DATA_TRANSACTION &&
+ local->transaction.type != AFR_METADATA_TRANSACTION)
+ goto lock_phase;
- if (!local->transaction.eager_lock_on && local->loc.inode) {
- fd = fd_lookup (local->loc.inode, frame->root->pid);
- if (fd == NULL)
- fd = fd_lookup_anonymous (local->loc.inode,
- GF_ANON_FD_FLAGS);
+ if (!priv->eager_lock)
+ goto lock_phase;
- if (fd) {
- afr_delayed_changelog_wake_up (this, fd);
- fd_unref (fd);
- }
+ LOCK (&local->inode->lock);
+ {
+ __afr_eager_lock_handle (local, &take_lock, &do_pre_op,
+ &timer_local);
}
+ UNLOCK (&local->inode->lock);
+lock_phase:
+ if (!local->transaction.eager_lock_on) {
+ afr_set_lk_owner (local->transaction.frame, this,
+ local->transaction.frame->root);
+ } else {
+ afr_set_lk_owner (local->transaction.frame, this, local->inode);
+ }
+
- afr_lock (frame, this);
+ if (take_lock) {
+ afr_lock (local->transaction.frame, this);
+ } else if (do_pre_op) {
+ afr_changelog_pre_op (local->transaction.frame, this);
+ }
+ /*Always call delayed_changelog_wake_up_cbk after calling pre-op above
+ * so that any inheriting can happen*/
+ if (timer_local)
+ afr_delayed_changelog_wake_up_cbk (timer_local);
}
int
@@ -2194,7 +2311,7 @@ afr_write_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
goto fail;
}
- afr_transaction_start (frame, this);
+ afr_transaction_start (local, this);
return 0;
fail:
local->transaction.unwind (frame, this);
@@ -2212,6 +2329,7 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
local = frame->local;
priv = this->private;
+ local->transaction.frame = frame;
local->transaction.type = type;
@@ -2224,11 +2342,9 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
if (ret < 0)
goto out;
- if (type == AFR_ENTRY_TRANSACTION ||
- type == AFR_ENTRY_RENAME_TRANSACTION) {
- afr_transaction_start (frame, this);
- ret = 0;
- goto out;
+
+ if (type != AFR_METADATA_TRANSACTION) {
+ goto txn_start;
}
ret = afr_inode_get_readable (frame, local->inode, this,
@@ -2238,10 +2354,13 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
event_generation)) {
afr_inode_refresh (frame, this, local->inode, local->loc.gfid,
afr_write_txn_refresh_done);
- } else {
- afr_transaction_start (frame, this);
+ ret = 0;
+ goto out;
}
+
+txn_start:
ret = 0;
+ afr_transaction_start (local, this);
out:
return ret;
}