author    Gluster Ant <bugzilla-bot@gluster.org>    2018-09-12 17:52:45 +0530
committer Nigel Babu <nigelb@redhat.com>            2018-09-12 17:52:45 +0530
commit    e16868dede6455cab644805af6fe1ac312775e13 (patch)
tree      15aebdb4fff2d87cf8a72f836816b3aa634da58d /xlators/cluster/afr/src/afr-transaction.c
parent    45a71c0548b6fd2c757aa2e7b7671a1411948894 (diff)
Land part 2 of clang-format changes
Change-Id: Ia84cc24c8924e6d22d02ac15f611c10e26db99b4
Signed-off-by: Nigel Babu <nigelb@redhat.com>
Diffstat (limited to 'xlators/cluster/afr/src/afr-transaction.c')
-rw-r--r--  xlators/cluster/afr/src/afr-transaction.c  3887
1 file changed, 1903 insertions(+), 1984 deletions(-)
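
The hunks below are purely mechanical: clang-format re-indents the file and tightens the call style (no space before the argument list), with no intended change in behaviour. As a rough sketch, assuming a .clang-format config is present at the repository root (the exact config location and options are not part of this commit), the reformat can be reproduced and checked for drift with:

    clang-format -i xlators/cluster/afr/src/afr-transaction.c
    git diff --stat xlators/cluster/afr/src/afr-transaction.c

If the file is already formatted, the second command should report no changes.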
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 9c587db0562..3a542ceca43 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -21,57 +21,57 @@
#include <signal.h>
typedef enum {
- AFR_TRANSACTION_PRE_OP,
- AFR_TRANSACTION_POST_OP,
+ AFR_TRANSACTION_PRE_OP,
+ AFR_TRANSACTION_POST_OP,
} afr_xattrop_type_t;
static void
-afr_lock_resume_shared (struct list_head *list);
+afr_lock_resume_shared(struct list_head *list);
void
-__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared);
+__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared);
void
-afr_changelog_post_op (call_frame_t *frame, xlator_t *this);
+afr_changelog_post_op(call_frame_t *frame, xlator_t *this);
int
-afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this);
+afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this);
gf_boolean_t
-afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this);
+afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this);
gf_boolean_t
-afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this);
+afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this);
int
-afr_changelog_call_count (afr_transaction_type type,
- unsigned char *pre_op_subvols,
- unsigned char *failed_subvols,
- unsigned int child_count);
+afr_changelog_call_count(afr_transaction_type type,
+ unsigned char *pre_op_subvols,
+ unsigned char *failed_subvols,
+ unsigned int child_count);
int
-afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr,
- afr_changelog_resume_t changelog_resume,
- afr_xattrop_type_t op);
+afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr,
+ afr_changelog_resume_t changelog_resume,
+ afr_xattrop_type_t op);
void
-afr_zero_fill_stat (afr_local_t *local)
-{
- if (!local)
- return;
- if (local->transaction.type == AFR_DATA_TRANSACTION ||
- local->transaction.type == AFR_METADATA_TRANSACTION) {
- gf_zero_fill_stat (&local->cont.inode_wfop.prebuf);
- gf_zero_fill_stat (&local->cont.inode_wfop.postbuf);
- } else if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
- local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
- gf_zero_fill_stat (&local->cont.dir_fop.buf);
- gf_zero_fill_stat (&local->cont.dir_fop.preparent);
- gf_zero_fill_stat (&local->cont.dir_fop.postparent);
- if (local->transaction.type == AFR_ENTRY_TRANSACTION)
- return;
- gf_zero_fill_stat (&local->cont.dir_fop.prenewparent);
- gf_zero_fill_stat (&local->cont.dir_fop.postnewparent);
- }
+afr_zero_fill_stat(afr_local_t *local)
+{
+ if (!local)
+ return;
+ if (local->transaction.type == AFR_DATA_TRANSACTION ||
+ local->transaction.type == AFR_METADATA_TRANSACTION) {
+ gf_zero_fill_stat(&local->cont.inode_wfop.prebuf);
+ gf_zero_fill_stat(&local->cont.inode_wfop.postbuf);
+ } else if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ gf_zero_fill_stat(&local->cont.dir_fop.buf);
+ gf_zero_fill_stat(&local->cont.dir_fop.preparent);
+ gf_zero_fill_stat(&local->cont.dir_fop.postparent);
+ if (local->transaction.type == AFR_ENTRY_TRANSACTION)
+ return;
+ gf_zero_fill_stat(&local->cont.dir_fop.prenewparent);
+ gf_zero_fill_stat(&local->cont.dir_fop.postnewparent);
+ }
}
/* In case of errors afr needs to choose which xdata from lower xlators it needs
@@ -79,2483 +79,2402 @@ afr_zero_fill_stat (afr_local_t *local)
* any good subvols which failed. Give preference to errnos other than
* ENOTCONN even if the child is source */
void
-afr_pick_error_xdata (afr_local_t *local, afr_private_t *priv,
- inode_t *inode1, unsigned char *readable1,
- inode_t *inode2, unsigned char *readable2)
-{
- int s = -1;/*selection*/
- int i = 0;
- unsigned char *readable = NULL;
-
- if (local->xdata_rsp) {
- dict_unref (local->xdata_rsp);
- local->xdata_rsp = NULL;
- }
-
- readable = alloca0 (priv->child_count * sizeof (*readable));
- if (inode2 && readable2) {/*rename fop*/
- AFR_INTERSECT (readable, readable1, readable2,
- priv->child_count);
- } else {
- memcpy (readable, readable1,
- sizeof (*readable) * priv->child_count);
- }
-
+afr_pick_error_xdata(afr_local_t *local, afr_private_t *priv, inode_t *inode1,
+ unsigned char *readable1, inode_t *inode2,
+ unsigned char *readable2)
+{
+ int s = -1; /*selection*/
+ int i = 0;
+ unsigned char *readable = NULL;
+
+ if (local->xdata_rsp) {
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = NULL;
+ }
+
+ readable = alloca0(priv->child_count * sizeof(*readable));
+ if (inode2 && readable2) { /*rename fop*/
+ AFR_INTERSECT(readable, readable1, readable2, priv->child_count);
+ } else {
+ memcpy(readable, readable1, sizeof(*readable) * priv->child_count);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+
+ if (local->replies[i].op_ret >= 0)
+ continue;
+
+ if (local->replies[i].op_errno == ENOTCONN)
+ continue;
+
+ /*Order is important in the following condition*/
+ if ((s < 0) || (!readable[s] && readable[i]))
+ s = i;
+ }
+
+ if (s != -1 && local->replies[s].xdata) {
+ local->xdata_rsp = dict_ref(local->replies[s].xdata);
+ } else if (s == -1) {
for (i = 0; i < priv->child_count; i++) {
- if (!local->replies[i].valid)
- continue;
-
- if (local->replies[i].op_ret >= 0)
- continue;
+ if (!local->replies[i].valid)
+ continue;
- if (local->replies[i].op_errno == ENOTCONN)
- continue;
+ if (local->replies[i].op_ret >= 0)
+ continue;
- /*Order is important in the following condition*/
- if ((s < 0) || (!readable[s] && readable[i]))
- s = i;
- }
-
- if (s != -1 && local->replies[s].xdata) {
- local->xdata_rsp = dict_ref (local->replies[s].xdata);
- } else if (s == -1) {
- for (i = 0; i < priv->child_count; i++) {
- if (!local->replies[i].valid)
- continue;
-
- if (local->replies[i].op_ret >= 0)
- continue;
-
- if (!local->replies[i].xdata)
- continue;
- local->xdata_rsp = dict_ref (local->replies[i].xdata);
- break;
- }
+ if (!local->replies[i].xdata)
+ continue;
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ break;
}
+ }
}
gf_boolean_t
-afr_needs_changelog_update (afr_local_t *local)
+afr_needs_changelog_update(afr_local_t *local)
{
- if (local->transaction.type == AFR_DATA_TRANSACTION)
- return _gf_true;
- if (!local->optimistic_change_log)
- return _gf_true;
- return _gf_false;
+ if (local->transaction.type == AFR_DATA_TRANSACTION)
+ return _gf_true;
+ if (!local->optimistic_change_log)
+ return _gf_true;
+ return _gf_false;
}
gf_boolean_t
-afr_changelog_has_quorum (afr_local_t *local, xlator_t *this)
+afr_changelog_has_quorum(afr_local_t *local, xlator_t *this)
{
- afr_private_t *priv = NULL;
- int i = 0;
- unsigned char *success_children = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ unsigned char *success_children = NULL;
- priv = this->private;
- success_children = alloca0 (priv->child_count);
+ priv = this->private;
+ success_children = alloca0(priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.failed_subvols[i]) {
- success_children[i] = 1;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.failed_subvols[i]) {
+ success_children[i] = 1;
}
+ }
- if (afr_has_quorum (success_children, this)) {
- return _gf_true;
- }
+ if (afr_has_quorum(success_children, this)) {
+ return _gf_true;
+ }
- return _gf_false;
+ return _gf_false;
}
-
gf_boolean_t
-afr_is_write_subvol_valid (call_frame_t *frame, xlator_t *this)
+afr_is_write_subvol_valid(call_frame_t *frame, xlator_t *this)
{
- int i = 0;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- uint64_t write_subvol = 0;
- unsigned char *writable = NULL;
- uint16_t datamap = 0;
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ uint64_t write_subvol = 0;
+ unsigned char *writable = NULL;
+ uint16_t datamap = 0;
- local = frame->local;
- priv = this->private;
- writable = alloca0 (priv->child_count);
+ local = frame->local;
+ priv = this->private;
+ writable = alloca0(priv->child_count);
- write_subvol = afr_write_subvol_get (frame, this);
- datamap = (write_subvol & 0x00000000ffff0000) >> 16;
- for (i = 0; i < priv->child_count; i++) {
- if (datamap & (1 << i))
- writable[i] = 1;
+ write_subvol = afr_write_subvol_get(frame, this);
+ datamap = (write_subvol & 0x00000000ffff0000) >> 16;
+ for (i = 0; i < priv->child_count; i++) {
+ if (datamap & (1 << i))
+ writable[i] = 1;
- if (writable[i] && !local->transaction.failed_subvols[i])
- return _gf_true;
- }
+ if (writable[i] && !local->transaction.failed_subvols[i])
+ return _gf_true;
+ }
- return _gf_false;
+ return _gf_false;
}
int
-afr_transaction_fop (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- unsigned char *failed_subvols = NULL;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- failed_subvols = local->transaction.failed_subvols;
- call_count = priv->child_count - AFR_COUNT (failed_subvols,
- priv->child_count);
- /* Fail if pre-op did not succeed on quorum no. of bricks. */
- if (!afr_changelog_has_quorum (local, this) || !call_count) {
- local->op_ret = -1;
- /* local->op_errno is already captured in changelog cbk. */
- afr_transaction_resume (frame, this);
- return 0;
- }
+afr_transaction_fop(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ unsigned char *failed_subvols = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ failed_subvols = local->transaction.failed_subvols;
+ call_count = priv->child_count -
+ AFR_COUNT(failed_subvols, priv->child_count);
+ /* Fail if pre-op did not succeed on quorum no. of bricks. */
+ if (!afr_changelog_has_quorum(local, this) || !call_count) {
+ local->op_ret = -1;
+ /* local->op_errno is already captured in changelog cbk. */
+ afr_transaction_resume(frame, this);
+ return 0;
+ }
- /* Fail if at least one writeable brick isn't up.*/
- if (local->transaction.type == AFR_DATA_TRANSACTION &&
- !afr_is_write_subvol_valid (frame, this)) {
- local->op_ret = -1;
- local->op_errno = EIO;
- afr_transaction_resume (frame, this);
- return 0;
- }
+ /* Fail if at least one writeable brick isn't up.*/
+ if (local->transaction.type == AFR_DATA_TRANSACTION &&
+ !afr_is_write_subvol_valid(frame, this)) {
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ afr_transaction_resume(frame, this);
+ return 0;
+ }
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i] && !failed_subvols[i]) {
- local->transaction.wind (frame, this, i);
+ local->call_count = call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] && !failed_subvols[i]) {
+ local->transaction.wind(frame, this, i);
- if (!--call_count)
- break;
- }
+ if (!--call_count)
+ break;
}
+ }
- return 0;
+ return 0;
}
int
-afr_transaction_done (call_frame_t *frame, xlator_t *this)
+afr_transaction_done(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- gf_boolean_t unwind = _gf_false;
- afr_lock_t *lock = NULL;
- afr_local_t *lock_local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t unwind = _gf_false;
+ afr_lock_t *lock = NULL;
+ afr_local_t *lock_local = NULL;
- priv = this->private;
- local = frame->local;
+ priv = this->private;
+ local = frame->local;
- if (priv->consistent_metadata) {
- LOCK (&frame->lock);
- {
- unwind = (local->transaction.main_frame != NULL);
- }
- UNLOCK (&frame->lock);
- if (unwind)/*It definitely did post-op*/
- afr_zero_fill_stat (local);
+ if (priv->consistent_metadata) {
+ LOCK(&frame->lock);
+ {
+ unwind = (local->transaction.main_frame != NULL);
}
+ UNLOCK(&frame->lock);
+ if (unwind) /*It definitely did post-op*/
+ afr_zero_fill_stat(local);
+ }
- if (local->transaction.do_eager_unlock) {
- lock = &local->inode_ctx->lock[local->transaction.type];
- LOCK (&local->inode->lock);
- {
- lock->acquired = _gf_false;
- lock->release = _gf_false;
- list_splice_init (&lock->frozen,
- &lock->waiting);
- if (list_empty (&lock->waiting))
- goto unlock;
- lock_local = list_entry (lock->waiting.next,
- afr_local_t,
- transaction.wait_list);
- list_del_init (&lock_local->transaction.wait_list);
- list_add (&lock_local->transaction.owner_list,
- &lock->owners);
- }
-unlock:
- UNLOCK (&local->inode->lock);
- }
- if (lock_local) {
- afr_lock (lock_local->transaction.frame,
- lock_local->transaction.frame->this);
- }
- local->transaction.unwind (frame, this);
+ if (local->transaction.do_eager_unlock) {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ lock->acquired = _gf_false;
+ lock->release = _gf_false;
+ list_splice_init(&lock->frozen, &lock->waiting);
+ if (list_empty(&lock->waiting))
+ goto unlock;
+ lock_local = list_entry(lock->waiting.next, afr_local_t,
+ transaction.wait_list);
+ list_del_init(&lock_local->transaction.wait_list);
+ list_add(&lock_local->transaction.owner_list, &lock->owners);
+ }
+ unlock:
+ UNLOCK(&local->inode->lock);
+ }
+ if (lock_local) {
+ afr_lock(lock_local->transaction.frame,
+ lock_local->transaction.frame->this);
+ }
+ local->transaction.unwind(frame, this);
- AFR_STACK_DESTROY (frame);
+ AFR_STACK_DESTROY(frame);
- return 0;
+ return 0;
}
static void
-afr_lock_fail_shared (afr_local_t *local, struct list_head *list)
-{
- afr_local_t *each = NULL;
-
- while (!list_empty(list)) {
- each = list_entry (list->next, afr_local_t,
- transaction.wait_list);
- list_del_init(&each->transaction.wait_list);
- each->op_ret = -1;
- each->op_errno = local->op_errno;
- afr_transaction_done (each->transaction.frame,
- each->transaction.frame->this);
- }
+afr_lock_fail_shared(afr_local_t *local, struct list_head *list)
+{
+ afr_local_t *each = NULL;
+
+ while (!list_empty(list)) {
+ each = list_entry(list->next, afr_local_t, transaction.wait_list);
+ list_del_init(&each->transaction.wait_list);
+ each->op_ret = -1;
+ each->op_errno = local->op_errno;
+ afr_transaction_done(each->transaction.frame,
+ each->transaction.frame->this);
+ }
}
static void
-afr_handle_lock_acquire_failure (afr_local_t *local, gf_boolean_t locked)
+afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked)
{
- struct list_head shared;
- afr_lock_t *lock = NULL;
+ struct list_head shared;
+ afr_lock_t *lock = NULL;
- if (!local->transaction.eager_lock_on)
- goto out;
+ if (!local->transaction.eager_lock_on)
+ goto out;
- lock = &local->inode_ctx->lock[local->transaction.type];
+ lock = &local->inode_ctx->lock[local->transaction.type];
- INIT_LIST_HEAD (&shared);
- LOCK (&local->inode->lock);
- {
- lock->release = _gf_true;
- list_splice_init (&lock->waiting, &shared);
- }
- UNLOCK (&local->inode->lock);
+ INIT_LIST_HEAD(&shared);
+ LOCK(&local->inode->lock);
+ {
+ lock->release = _gf_true;
+ list_splice_init(&lock->waiting, &shared);
+ }
+ UNLOCK(&local->inode->lock);
- afr_lock_fail_shared (local, &shared);
- local->transaction.do_eager_unlock = _gf_true;
+ afr_lock_fail_shared(local, &shared);
+ local->transaction.do_eager_unlock = _gf_true;
out:
- if (locked) {
- local->internal_lock.lock_cbk = afr_transaction_done;
- afr_unlock (local->transaction.frame,
- local->transaction.frame->this);
- } else {
- afr_transaction_done (local->transaction.frame,
- local->transaction.frame->this);
- }
+ if (locked) {
+ local->internal_lock.lock_cbk = afr_transaction_done;
+ afr_unlock(local->transaction.frame, local->transaction.frame->this);
+ } else {
+ afr_transaction_done(local->transaction.frame,
+ local->transaction.frame->this);
+ }
}
-call_frame_t*
-afr_transaction_detach_fop_frame (call_frame_t *frame)
+call_frame_t *
+afr_transaction_detach_fop_frame(call_frame_t *frame)
{
- afr_local_t * local = NULL;
- call_frame_t *fop_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *fop_frame = NULL;
- local = frame->local;
+ local = frame->local;
- afr_handle_inconsistent_fop (frame, &local->op_ret, &local->op_errno);
- LOCK (&frame->lock);
- {
- fop_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ afr_handle_inconsistent_fop(frame, &local->op_ret, &local->op_errno);
+ LOCK(&frame->lock);
+ {
+ fop_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK(&frame->lock);
- return fop_frame;
+ return fop_frame;
}
-
static void
-afr_save_lk_owner (call_frame_t *frame)
+afr_save_lk_owner(call_frame_t *frame)
{
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- local->saved_lk_owner = frame->root->lk_owner;
+ local->saved_lk_owner = frame->root->lk_owner;
}
-
static void
-afr_restore_lk_owner (call_frame_t *frame)
+afr_restore_lk_owner(call_frame_t *frame)
{
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- frame->root->lk_owner = local->saved_lk_owner;
+ frame->root->lk_owner = local->saved_lk_owner;
}
void
-__mark_all_success (call_frame_t *frame, xlator_t *this)
+__mark_all_success(call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- local->transaction.failed_subvols[i] = 0;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ local->transaction.failed_subvols[i] = 0;
+ }
}
void
-afr_compute_pre_op_sources (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_transaction_type type = -1;
- dict_t *xdata = NULL;
- int **matrix = NULL;
- int idx = -1;
- int i = 0;
- int j = 0;
-
- priv = this->private;
- local = frame->local;
- type = local->transaction.type;
- idx = afr_index_for_transaction_type (type);
- matrix = ALLOC_MATRIX (priv->child_count, int);
-
- for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.changelog_xdata[i])
- continue;
- xdata = local->transaction.changelog_xdata[i];
- afr_selfheal_fill_matrix (this, matrix, i, idx, xdata);
- }
-
- memset (local->transaction.pre_op_sources, 1, priv->child_count);
-
- /*If lock or pre-op failed on a brick, it is not a source. */
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.failed_subvols[i])
- local->transaction.pre_op_sources[i] = 0;
- }
-
- /* If brick is blamed by others, it is not a source. */
- for (i = 0; i < priv->child_count; i++)
- for (j = 0; j < priv->child_count; j++)
- if (matrix[i][j] != 0)
- local->transaction.pre_op_sources[j] = 0;
+afr_compute_pre_op_sources(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_transaction_type type = -1;
+ dict_t *xdata = NULL;
+ int **matrix = NULL;
+ int idx = -1;
+ int i = 0;
+ int j = 0;
+
+ priv = this->private;
+ local = frame->local;
+ type = local->transaction.type;
+ idx = afr_index_for_transaction_type(type);
+ matrix = ALLOC_MATRIX(priv->child_count, int);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.changelog_xdata[i])
+ continue;
+ xdata = local->transaction.changelog_xdata[i];
+ afr_selfheal_fill_matrix(this, matrix, i, idx, xdata);
+ }
+
+ memset(local->transaction.pre_op_sources, 1, priv->child_count);
+
+ /*If lock or pre-op failed on a brick, it is not a source. */
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ local->transaction.pre_op_sources[i] = 0;
+ }
+
+ /* If brick is blamed by others, it is not a source. */
+ for (i = 0; i < priv->child_count; i++)
+ for (j = 0; j < priv->child_count; j++)
+ if (matrix[i][j] != 0)
+ local->transaction.pre_op_sources[j] = 0;
}
gf_boolean_t
-afr_has_arbiter_fop_cbk_quorum (call_frame_t *frame)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- xlator_t *this = NULL;
- gf_boolean_t fop_failed = _gf_false;
- unsigned char *pre_op_sources = NULL;
- int i = 0;
-
- local = frame->local;
- this = frame->this;
- priv = this->private;
- pre_op_sources = local->transaction.pre_op_sources;
-
- /* If the fop failed on the brick, it is not a source. */
- for (i = 0; i < priv->child_count; i++)
- if (local->transaction.failed_subvols[i])
- pre_op_sources[i] = 0;
-
- switch (AFR_COUNT (pre_op_sources, priv->child_count)) {
+afr_has_arbiter_fop_cbk_quorum(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t *this = NULL;
+ gf_boolean_t fop_failed = _gf_false;
+ unsigned char *pre_op_sources = NULL;
+ int i = 0;
+
+ local = frame->local;
+ this = frame->this;
+ priv = this->private;
+ pre_op_sources = local->transaction.pre_op_sources;
+
+ /* If the fop failed on the brick, it is not a source. */
+ for (i = 0; i < priv->child_count; i++)
+ if (local->transaction.failed_subvols[i])
+ pre_op_sources[i] = 0;
+
+ switch (AFR_COUNT(pre_op_sources, priv->child_count)) {
case 1:
- if (pre_op_sources[ARBITER_BRICK_INDEX])
- fop_failed = _gf_true;
- break;
- case 0:
+ if (pre_op_sources[ARBITER_BRICK_INDEX])
fop_failed = _gf_true;
- break;
- }
+ break;
+ case 0:
+ fop_failed = _gf_true;
+ break;
+ }
- if (fop_failed)
- return _gf_false;
+ if (fop_failed)
+ return _gf_false;
- return _gf_true;
+ return _gf_true;
}
void
-afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int pre_op_sources_count = 0;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
-
- afr_compute_pre_op_sources (frame, this);
- pre_op_sources_count = AFR_COUNT (local->transaction.pre_op_sources,
- priv->child_count);
-
- /* If arbiter is the only source, do not proceed. */
- if (pre_op_sources_count < 2 &&
- local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) {
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- for (i = 0; i < priv->child_count; i++)
- local->transaction.failed_subvols[i] = 1;
- }
+afr_txn_arbitrate_fop(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int pre_op_sources_count = 0;
+ int i = 0;
- afr_transaction_fop (frame, this);
+ priv = this->private;
+ local = frame->local;
- return;
+ afr_compute_pre_op_sources(frame, this);
+ pre_op_sources_count = AFR_COUNT(local->transaction.pre_op_sources,
+ priv->child_count);
+
+ /* If arbiter is the only source, do not proceed. */
+ if (pre_op_sources_count < 2 &&
+ local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) {
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ for (i = 0; i < priv->child_count; i++)
+ local->transaction.failed_subvols[i] = 1;
+ }
+
+ afr_transaction_fop(frame, this);
+
+ return;
}
int
-afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int ret = 0;
- int failure_count = 0;
- struct list_head shared;
- afr_lock_t *lock = NULL;
-
- local = frame->local;
- priv = this->private;
-
- INIT_LIST_HEAD (&shared);
- if (local->transaction.type == AFR_DATA_TRANSACTION &&
- !local->transaction.inherited) {
- ret = afr_write_subvol_set (frame, this);
- if (ret) {
- /*act as if operation failed on all subvols*/
- local->op_ret = -1;
- local->op_errno = -ret;
- for (i = 0; i < priv->child_count; i++)
- local->transaction.failed_subvols[i] = 1;
- }
- }
-
- if (local->pre_op_compat)
- /* old mode, pre-op was done as afr_changelog_do()
- just now, before OP */
- afr_changelog_pre_op_update (frame, this);
-
- if (!local->transaction.eager_lock_on ||
- local->transaction.inherited)
- goto fop;
- failure_count = AFR_COUNT (local->transaction.failed_subvols,
- priv->child_count);
- if (failure_count == priv->child_count) {
- afr_handle_lock_acquire_failure (local, _gf_true);
- return 0;
- } else {
- lock = &local->inode_ctx->lock[local->transaction.type];
- LOCK (&local->inode->lock);
- {
- lock->acquired = _gf_true;
- __afr_transaction_wake_shared (local, &shared);
- }
- UNLOCK (&local->inode->lock);
+afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int ret = 0;
+ int failure_count = 0;
+ struct list_head shared;
+ afr_lock_t *lock = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ INIT_LIST_HEAD(&shared);
+ if (local->transaction.type == AFR_DATA_TRANSACTION &&
+ !local->transaction.inherited) {
+ ret = afr_write_subvol_set(frame, this);
+ if (ret) {
+ /*act as if operation failed on all subvols*/
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ for (i = 0; i < priv->child_count; i++)
+ local->transaction.failed_subvols[i] = 1;
+ }
+ }
+
+ if (local->pre_op_compat)
+ /* old mode, pre-op was done as afr_changelog_do()
+ just now, before OP */
+ afr_changelog_pre_op_update(frame, this);
+
+ if (!local->transaction.eager_lock_on || local->transaction.inherited)
+ goto fop;
+ failure_count = AFR_COUNT(local->transaction.failed_subvols,
+ priv->child_count);
+ if (failure_count == priv->child_count) {
+ afr_handle_lock_acquire_failure(local, _gf_true);
+ return 0;
+ } else {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ lock->acquired = _gf_true;
+ __afr_transaction_wake_shared(local, &shared);
}
+ UNLOCK(&local->inode->lock);
+ }
fop:
- /* Perform fops with the lk-owner from top xlator.
- * Eg: lk-owner of posix-lk and flush should be same,
- * flush cant clear the posix-lks without that lk-owner.
- */
- afr_save_lk_owner (frame);
- frame->root->lk_owner =
- local->transaction.main_frame->root->lk_owner;
+ /* Perform fops with the lk-owner from top xlator.
+ * Eg: lk-owner of posix-lk and flush should be same,
+ * flush cant clear the posix-lks without that lk-owner.
+ */
+ afr_save_lk_owner(frame);
+ frame->root->lk_owner = local->transaction.main_frame->root->lk_owner;
- if (priv->arbiter_count == 1) {
- afr_txn_arbitrate_fop (frame, this);
- } else {
- afr_transaction_fop (frame, this);
- }
+ if (priv->arbiter_count == 1) {
+ afr_txn_arbitrate_fop(frame, this);
+ } else {
+ afr_transaction_fop(frame, this);
+ }
- afr_lock_resume_shared (&shared);
- return 0;
+ afr_lock_resume_shared(&shared);
+ return 0;
}
int
-afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending)
+afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending)
{
- int i = 0;
- int ret = 0;
-
- for (i = 0; i < priv->child_count; i++) {
+ int i = 0;
+ int ret = 0;
- ret = dict_set_static_bin (xattr, priv->pending_key[i],
- pending[i],
- AFR_NUM_CHANGE_LOGS * sizeof (int));
- /* 3 = data+metadata+entry */
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], pending[i],
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ /* 3 = data+metadata+entry */
- if (ret)
- break;
- }
+ if (ret)
+ break;
+ }
- return ret;
+ return ret;
}
/* {{{ pending */
int
-afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this)
+afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */
- if (!afr_changelog_has_quorum (local, this)) {
- local->op_ret = -1;
- /*local->op_errno is already captured in changelog cbk*/
- }
+ /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */
+ if (!afr_changelog_has_quorum(local, this)) {
+ local->op_ret = -1;
+ /*local->op_errno is already captured in changelog cbk*/
+ }
- if (local->transaction.resume_stub) {
- call_resume (local->transaction.resume_stub);
- local->transaction.resume_stub = NULL;
- }
+ if (local->transaction.resume_stub) {
+ call_resume(local->transaction.resume_stub);
+ local->transaction.resume_stub = NULL;
+ }
- int_lock->lock_cbk = afr_transaction_done;
- afr_unlock (frame, this);
+ int_lock->lock_cbk = afr_transaction_done;
+ afr_unlock(frame, this);
- return 0;
+ return 0;
}
-
-unsigned char*
-afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)
+unsigned char *
+afr_locked_nodes_get(afr_transaction_type type, afr_internal_lock_t *int_lock)
{
- unsigned char *locked_nodes = NULL;
- switch (type) {
+ unsigned char *locked_nodes = NULL;
+ switch (type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- locked_nodes = int_lock->locked_nodes;
- break;
+ locked_nodes = int_lock->locked_nodes;
+ break;
case AFR_ENTRY_TRANSACTION:
case AFR_ENTRY_RENAME_TRANSACTION:
- /*Because same set of subvols participate in all lockee
- * entities*/
- locked_nodes = int_lock->lockee[0].locked_nodes;
- break;
- }
- return locked_nodes;
+ /*Because same set of subvols participate in all lockee
+ * entities*/
+ locked_nodes = int_lock->lockee[0].locked_nodes;
+ break;
+ }
+ return locked_nodes;
}
-
int
-afr_changelog_call_count (afr_transaction_type type,
- unsigned char *pre_op_subvols,
- unsigned char *failed_subvols,
- unsigned int child_count)
+afr_changelog_call_count(afr_transaction_type type,
+ unsigned char *pre_op_subvols,
+ unsigned char *failed_subvols,
+ unsigned int child_count)
{
- int i = 0;
- int call_count = 0;
+ int i = 0;
+ int call_count = 0;
- for (i = 0; i < child_count; i++) {
- if (pre_op_subvols[i] && !failed_subvols[i]) {
- call_count++;
- }
+ for (i = 0; i < child_count; i++) {
+ if (pre_op_subvols[i] && !failed_subvols[i]) {
+ call_count++;
}
+ }
- if (type == AFR_ENTRY_RENAME_TRANSACTION)
- call_count *= 2;
+ if (type == AFR_ENTRY_RENAME_TRANSACTION)
+ call_count *= 2;
- return call_count;
+ return call_count;
}
-
gf_boolean_t
-afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this)
+afr_txn_nothing_failed(call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- if (priv->thin_arbiter_count) {
- /* We need to perform post-op even if 1 data brick was down
- * before the txn started.*/
- if (AFR_COUNT (local->transaction.failed_subvols,
- priv->child_count))
- return _gf_false;
- }
+ if (priv->thin_arbiter_count) {
+ /* We need to perform post-op even if 1 data brick was down
+ * before the txn started.*/
+ if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count))
+ return _gf_false;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i] &&
- local->transaction.failed_subvols[i])
- return _gf_false;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] &&
+ local->transaction.failed_subvols[i])
+ return _gf_false;
+ }
- return _gf_true;
+ return _gf_true;
}
void
-afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this)
+afr_handle_symmetric_errors(call_frame_t *frame, xlator_t *this)
{
- if (afr_is_symmetric_error (frame, this))
- __mark_all_success (frame, this);
+ if (afr_is_symmetric_error(frame, this))
+ __mark_all_success(frame, this);
}
gf_boolean_t
-afr_has_quorum (unsigned char *subvols, xlator_t *this)
-{
- unsigned int quorum_count = 0;
- afr_private_t *priv = NULL;
- unsigned int up_children_count = 0;
-
- priv = this->private;
- up_children_count = AFR_COUNT (subvols, priv->child_count);
-
- if (priv->quorum_count == AFR_QUORUM_AUTO) {
- /*
- * Special case for auto-quorum with an even number of nodes.
- *
- * A replica set with even count N can only handle the same
- * number of failures as odd N-1 before losing "vanilla"
- * quorum, and the probability of more simultaneous failures is
- * actually higher. For example, with a 1% chance of failure
- * we'd have a 0.03% chance of two simultaneous failures with
- * N=3 but a 0.06% chance with N=4. However, the special case
- * is necessary for N=2 because there's no real quorum in that
- * case (i.e. can't normally survive *any* failures). In that
- * case, we treat the first node as a tie-breaker, allowing
- * quorum to be retained in some cases while still honoring the
- * all-important constraint that there can not simultaneously
- * be two partitioned sets of nodes each believing they have
- * quorum. Of two equally sized sets, the one without that
- * first node will lose.
- *
- * It turns out that the special case is beneficial for higher
- * values of N as well. Continuing the example above, the
- * probability of losing quorum with N=4 and this type of
- * quorum is (very) slightly lower than with N=3 and vanilla
- * quorum. The difference becomes even more pronounced with
- * higher N. Therefore, even though such replica counts are
- * unlikely to be seen in practice, we might as well use the
- * "special" quorum then as well.
- */
- if ((up_children_count * 2) == priv->child_count) {
- return subvols[0];
- }
- }
+afr_has_quorum(unsigned char *subvols, xlator_t *this)
+{
+ unsigned int quorum_count = 0;
+ afr_private_t *priv = NULL;
+ unsigned int up_children_count = 0;
- if (priv->quorum_count == AFR_QUORUM_AUTO) {
- quorum_count = priv->child_count/2 + 1;
- } else {
- quorum_count = priv->quorum_count;
+ priv = this->private;
+ up_children_count = AFR_COUNT(subvols, priv->child_count);
+
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ /*
+ * Special case for auto-quorum with an even number of nodes.
+ *
+ * A replica set with even count N can only handle the same
+ * number of failures as odd N-1 before losing "vanilla"
+ * quorum, and the probability of more simultaneous failures is
+ * actually higher. For example, with a 1% chance of failure
+ * we'd have a 0.03% chance of two simultaneous failures with
+ * N=3 but a 0.06% chance with N=4. However, the special case
+ * is necessary for N=2 because there's no real quorum in that
+ * case (i.e. can't normally survive *any* failures). In that
+ * case, we treat the first node as a tie-breaker, allowing
+ * quorum to be retained in some cases while still honoring the
+ * all-important constraint that there can not simultaneously
+ * be two partitioned sets of nodes each believing they have
+ * quorum. Of two equally sized sets, the one without that
+ * first node will lose.
+ *
+ * It turns out that the special case is beneficial for higher
+ * values of N as well. Continuing the example above, the
+ * probability of losing quorum with N=4 and this type of
+ * quorum is (very) slightly lower than with N=3 and vanilla
+ * quorum. The difference becomes even more pronounced with
+ * higher N. Therefore, even though such replica counts are
+ * unlikely to be seen in practice, we might as well use the
+ * "special" quorum then as well.
+ */
+ if ((up_children_count * 2) == priv->child_count) {
+ return subvols[0];
}
+ }
- if (up_children_count >= quorum_count)
- return _gf_true;
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ quorum_count = priv->child_count / 2 + 1;
+ } else {
+ quorum_count = priv->quorum_count;
+ }
- return _gf_false;
+ if (up_children_count >= quorum_count)
+ return _gf_true;
+
+ return _gf_false;
}
static gf_boolean_t
-afr_has_fop_quorum (call_frame_t *frame)
+afr_has_fop_quorum(call_frame_t *frame)
{
- xlator_t *this = frame->this;
- afr_local_t *local = frame->local;
- unsigned char *locked_nodes = NULL;
+ xlator_t *this = frame->this;
+ afr_local_t *local = frame->local;
+ unsigned char *locked_nodes = NULL;
- locked_nodes = afr_locked_nodes_get (local->transaction.type,
- &local->internal_lock);
- return afr_has_quorum (locked_nodes, this);
+ locked_nodes = afr_locked_nodes_get(local->transaction.type,
+ &local->internal_lock);
+ return afr_has_quorum(locked_nodes, this);
}
static gf_boolean_t
-afr_has_fop_cbk_quorum (call_frame_t *frame)
+afr_has_fop_cbk_quorum(call_frame_t *frame)
{
- afr_local_t *local = frame->local;
- xlator_t *this = frame->this;
- afr_private_t *priv = this->private;
- unsigned char *success = alloca0(priv->child_count);
- int i = 0;
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ unsigned char *success = alloca0(priv->child_count);
+ int i = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i])
- if (!local->transaction.failed_subvols[i])
- success[i] = 1;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i])
+ if (!local->transaction.failed_subvols[i])
+ success[i] = 1;
+ }
- return afr_has_quorum (success, this);
+ return afr_has_quorum(success, this);
}
gf_boolean_t
-afr_need_dirty_marking (call_frame_t *frame, xlator_t *this)
+afr_need_dirty_marking(call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = this->private;
- afr_local_t *local = NULL;
- gf_boolean_t need_dirty = _gf_false;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = NULL;
+ gf_boolean_t need_dirty = _gf_false;
- local = frame->local;
+ local = frame->local;
- if (!priv->quorum_count || !local->optimistic_change_log)
- return _gf_false;
+ if (!priv->quorum_count || !local->optimistic_change_log)
+ return _gf_false;
- if (local->transaction.type == AFR_DATA_TRANSACTION ||
- local->transaction.type == AFR_METADATA_TRANSACTION)
- return _gf_false;
+ if (local->transaction.type == AFR_DATA_TRANSACTION ||
+ local->transaction.type == AFR_METADATA_TRANSACTION)
+ return _gf_false;
- if (AFR_COUNT (local->transaction.failed_subvols, priv->child_count) ==
- priv->child_count)
- return _gf_false;
+ if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) ==
+ priv->child_count)
+ return _gf_false;
- if (priv->arbiter_count) {
- if (!afr_has_arbiter_fop_cbk_quorum (frame))
- need_dirty = _gf_true;
- } else if (!afr_has_fop_cbk_quorum (frame)) {
- need_dirty = _gf_true;
- }
+ if (priv->arbiter_count) {
+ if (!afr_has_arbiter_fop_cbk_quorum(frame))
+ need_dirty = _gf_true;
+ } else if (!afr_has_fop_cbk_quorum(frame)) {
+ need_dirty = _gf_true;
+ }
- return need_dirty;
+ return need_dirty;
}
void
-afr_handle_quorum (call_frame_t *frame, xlator_t *this)
+afr_handle_quorum(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- const char *file = NULL;
- uuid_t gfid = {0};
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ const char *file = NULL;
+ uuid_t gfid = {0};
- local = frame->local;
- priv = frame->this->private;
+ local = frame->local;
+ priv = frame->this->private;
- if (priv->quorum_count == 0)
- return;
-
- /* If the fop already failed return right away to preserve errno */
- if (local->op_ret == -1)
- return;
+ if (priv->quorum_count == 0)
+ return;
- /*
- * Network split may happen just after the fops are unwound, so check
- * if the fop succeeded in a way it still follows quorum. If it doesn't,
- * mark the fop as failure, mark the changelogs so it reflects that
- * failure.
- *
- * Scenario:
- * There are 3 mounts on 3 machines(node1, node2, node3) all writing to
- * single file. Network split happened in a way that node1 can't see
- * node2, node3. Node2, node3 both of them can't see node1. Now at the
- * time of sending write all the bricks are up. Just after write fop is
- * wound on node1, network split happens. Node1 thinks write fop failed
- * on node2, node3 so marks pending changelog for those 2 extended
- * attributes on node1. Node2, node3 thinks writes failed on node1 so
- * they mark pending changelog for node1. When the network is stable
- * again the file already is in split-brain. These checks prevent
- * marking pending changelog on other subvolumes if the fop doesn't
- * succeed in a way it is still following quorum. So with this fix what
- * is happening is, node1 will have all pending changelog(FOOL) because
- * the write succeeded only on node1 but failed on node2, node3 so
- * instead of marking pending changelogs on node2, node3 it just treats
- * the fop as failure and goes into DIRTY state. Where as node2, node3
- * say they are sources and have pending changelog to node1 so there is
- * no split-brain with the fix. The problem is eliminated completely.
- */
+ /* If the fop already failed return right away to preserve errno */
+ if (local->op_ret == -1)
+ return;
- if (priv->arbiter_count) {
- if (afr_has_arbiter_fop_cbk_quorum (frame))
- return;
- } else if (afr_has_fop_cbk_quorum (frame)) {
- return;
- }
+ /*
+ * Network split may happen just after the fops are unwound, so check
+ * if the fop succeeded in a way it still follows quorum. If it doesn't,
+ * mark the fop as failure, mark the changelogs so it reflects that
+ * failure.
+ *
+ * Scenario:
+ * There are 3 mounts on 3 machines(node1, node2, node3) all writing to
+ * single file. Network split happened in a way that node1 can't see
+ * node2, node3. Node2, node3 both of them can't see node1. Now at the
+ * time of sending write all the bricks are up. Just after write fop is
+ * wound on node1, network split happens. Node1 thinks write fop failed
+ * on node2, node3 so marks pending changelog for those 2 extended
+ * attributes on node1. Node2, node3 thinks writes failed on node1 so
+ * they mark pending changelog for node1. When the network is stable
+ * again the file already is in split-brain. These checks prevent
+ * marking pending changelog on other subvolumes if the fop doesn't
+ * succeed in a way it is still following quorum. So with this fix what
+ * is happening is, node1 will have all pending changelog(FOOL) because
+ * the write succeeded only on node1 but failed on node2, node3 so
+ * instead of marking pending changelogs on node2, node3 it just treats
+ * the fop as failure and goes into DIRTY state. Where as node2, node3
+ * say they are sources and have pending changelog to node1 so there is
+ * no split-brain with the fix. The problem is eliminated completely.
+ */
+
+ if (priv->arbiter_count) {
+ if (afr_has_arbiter_fop_cbk_quorum(frame))
+ return;
+ } else if (afr_has_fop_cbk_quorum(frame)) {
+ return;
+ }
- if (afr_need_dirty_marking (frame, this))
- goto set_response;
+ if (afr_need_dirty_marking(frame, this))
+ goto set_response;
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i])
- afr_transaction_fop_failed (frame, frame->this, i);
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i])
+ afr_transaction_fop_failed(frame, frame->this, i);
+ }
set_response:
- local->op_ret = -1;
- local->op_errno = afr_final_errno (local, priv);
- if (local->op_errno == 0)
- local->op_errno = afr_quorum_errno (priv);
-
- if (local->fd) {
- gf_uuid_copy (gfid, local->fd->inode->gfid);
- file = uuid_utoa (gfid);
- } else {
- loc_path (&local->loc, local->loc.name);
- file = local->loc.path;
- }
-
- gf_msg (frame->this->name, GF_LOG_WARNING, local->op_errno,
- AFR_MSG_QUORUM_FAIL, "%s: Failing %s as quorum is not met",
- file, gf_fop_list[local->op]);
-
- switch (local->transaction.type) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ if (local->op_errno == 0)
+ local->op_errno = afr_quorum_errno(priv);
+
+ if (local->fd) {
+ gf_uuid_copy(gfid, local->fd->inode->gfid);
+ file = uuid_utoa(gfid);
+ } else {
+ loc_path(&local->loc, local->loc.name);
+ file = local->loc.path;
+ }
+
+ gf_msg(frame->this->name, GF_LOG_WARNING, local->op_errno,
+ AFR_MSG_QUORUM_FAIL, "%s: Failing %s as quorum is not met", file,
+ gf_fop_list[local->op]);
+
+ switch (local->transaction.type) {
case AFR_ENTRY_TRANSACTION:
case AFR_ENTRY_RENAME_TRANSACTION:
- afr_pick_error_xdata (local, priv, local->parent,
- local->readable, local->parent2,
- local->readable2);
- break;
+ afr_pick_error_xdata(local, priv, local->parent, local->readable,
+ local->parent2, local->readable2);
+ break;
default:
- afr_pick_error_xdata (local, priv, local->inode,
- local->readable, NULL, NULL);
- break;
- }
+ afr_pick_error_xdata(local, priv, local->inode, local->readable,
+ NULL, NULL);
+ break;
+ }
}
int
-afr_fill_ta_loc (xlator_t *this, loc_t *loc)
-{
- afr_private_t *priv = NULL;
-
- priv = this->private;
- loc->parent = inode_ref (priv->root_inode);
- gf_uuid_copy (loc->pargfid, loc->parent->gfid);
- loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
- gf_uuid_copy (loc->gfid, priv->ta_gfid);
- loc->inode = inode_new (loc->parent->table);
- if (!loc->inode) {
- loc_wipe(loc);
- return -ENOMEM;
- }
- return 0;
+afr_fill_ta_loc(xlator_t *this, loc_t *loc)
+{
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ loc->parent = inode_ref(priv->root_inode);
+ gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+ loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+ gf_uuid_copy(loc->gfid, priv->ta_gfid);
+ loc->inode = inode_new(loc->parent->table);
+ if (!loc->inode) {
+ loc_wipe(loc);
+ return -ENOMEM;
+ }
+ return 0;
}
int
-afr_changelog_thin_arbiter_post_op (xlator_t *this, afr_local_t *local)
-{
- int ret = 0;
- afr_private_t *priv = NULL;
- dict_t *xattr = NULL;
- int failed_count = 0;
- struct gf_flock flock = {0, };
- loc_t loc = {0,};
- int i = 0;
+afr_changelog_thin_arbiter_post_op(xlator_t *this, afr_local_t *local)
+{
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int failed_count = 0;
+ struct gf_flock flock = {
+ 0,
+ };
+ loc_t loc = {
+ 0,
+ };
+ int i = 0;
+
+ priv = this->private;
+ if (!priv->thin_arbiter_count)
+ return 0;
- priv = this->private;
- if (!priv->thin_arbiter_count)
- return 0;
+ failed_count = AFR_COUNT(local->transaction.failed_subvols,
+ priv->child_count);
+ if (!failed_count)
+ return 0;
+ GF_ASSERT(failed_count == 1);
+ ret = afr_fill_ta_loc(this, &loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc.name);
+ goto out;
+ }
+
+ xattr = dict_new();
+ if (!xattr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_static_bin(xattr, priv->pending_key[i],
+ local->pending[i],
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret)
+ goto out;
+ }
+
+ flock.l_type = F_WRLCK;
+ flock.l_start = 0;
+ flock.l_len = 0;
+
+ /*TODO: Convert to two domain locking. */
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, &loc, F_SETLKW, &flock, NULL, NULL);
+ if (ret)
+ goto out;
+
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
+
+ if (ret == -EINVAL) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_THIN_ARB,
+ "Thin-arbiter has denied post-op on %s for gfid %s.",
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX],
+ uuid_utoa(local->inode->gfid));
+
+ } else if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Post-op on thin-arbiter id file %s failed for gfid %s.",
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX],
+ uuid_utoa(local->inode->gfid));
+ }
+ flock.l_type = F_UNLCK;
+ syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY,
+ &loc, F_SETLK, &flock, NULL, NULL);
+out:
+ if (xattr)
+ dict_unref(xattr);
- failed_count = AFR_COUNT (local->transaction.failed_subvols,
- priv->child_count);
- if (!failed_count)
- return 0;
+ return ret;
+}
- GF_ASSERT (failed_count == 1);
- ret = afr_fill_ta_loc (this, &loc);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
- "Failed to populate thin-arbiter loc for: %s.",
- loc.name);
- goto out;
- }
+int
+afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = NULL;
+ dict_t *xattr = NULL;
+ int i = 0;
+ int ret = 0;
+ int idx = 0;
+ int nothing_failed = 1;
+ gf_boolean_t need_undirty = _gf_false;
+
+ afr_handle_quorum(frame, this);
+ local = frame->local;
+ idx = afr_index_for_transaction_type(local->transaction.type);
+
+ xattr = dict_new();
+ if (!xattr) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ afr_changelog_post_op_done(frame, this);
+ goto out;
+ }
- xattr = dict_new ();
- if (!xattr) {
- ret = -ENOMEM;
- goto out;
- }
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_static_bin (xattr, priv->pending_key[i],
- local->pending[i],
- AFR_NUM_CHANGE_LOGS * sizeof (int));
- if (ret)
- goto out;
+ nothing_failed = afr_txn_nothing_failed(frame, this);
+
+ if (afr_changelog_pre_op_uninherit(frame, this))
+ need_undirty = _gf_false;
+ else
+ need_undirty = _gf_true;
+
+ if (local->op_ret < 0 && !nothing_failed) {
+ if (afr_need_dirty_marking(frame, this)) {
+ local->dirty[idx] = hton32(1);
+ goto set_dirty;
}
- flock.l_type = F_WRLCK;
- flock.l_start = 0;
- flock.l_len = 0;
+ afr_changelog_post_op_done(frame, this);
+ goto out;
+ }
- /*TODO: Convert to two domain locking. */
- ret = syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX],
- AFR_TA_DOM_NOTIFY, &loc, F_SETLKW, &flock,
- NULL, NULL);
- if (ret)
- goto out;
+ if (nothing_failed && !need_undirty) {
+ afr_changelog_post_op_done(frame, this);
+ goto out;
+ }
+
+ if (local->transaction.in_flight_sb) {
+ local->op_ret = -1;
+ local->op_errno = local->transaction.in_flight_sb_errno;
+ afr_changelog_post_op_done(frame, this);
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ local->pending[i][idx] = hton32(1);
+ }
+
+ ret = afr_set_pending_dict(priv, xattr, local->pending);
+ if (ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ afr_changelog_post_op_done(frame, this);
+ goto out;
+ }
- ret = syncop_xattrop (priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
- GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
+ ret = afr_changelog_thin_arbiter_post_op(this, local);
+ if (ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ afr_changelog_post_op_done(frame, this);
+ goto out;
+ }
- if (ret == -EINVAL) {
- gf_msg (this->name, GF_LOG_INFO, -ret, AFR_MSG_THIN_ARB,
- "Thin-arbiter has denied post-op on %s for gfid %s.",
- priv->pending_key[THIN_ARBITER_BRICK_INDEX],
- uuid_utoa (local->inode->gfid));
+ if (need_undirty)
+ local->dirty[idx] = hton32(-1);
+ else
+ local->dirty[idx] = hton32(0);
- } else if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
- "Post-op on thin-arbiter id file %s failed for gfid %s.",
- priv->pending_key[THIN_ARBITER_BRICK_INDEX],
- uuid_utoa (local->inode->gfid));
- }
- flock.l_type = F_UNLCK;
- syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX],
- AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL);
+set_dirty:
+ ret = dict_set_static_bin(xattr, AFR_DIRTY, local->dirty,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ afr_changelog_post_op_done(frame, this);
+ goto out;
+ }
+
+ afr_changelog_do(frame, this, xattr, afr_changelog_post_op_done,
+ AFR_TRANSACTION_POST_OP);
out:
- if (xattr)
- dict_unref (xattr);
+ if (xattr)
+ dict_unref(xattr);
- return ret;
+ return 0;
}
-int
-afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = this->private;
- afr_local_t *local = NULL;
- dict_t *xattr = NULL;
- int i = 0;
- int ret = 0;
- int idx = 0;
- int nothing_failed = 1;
- gf_boolean_t need_undirty = _gf_false;
-
- afr_handle_quorum (frame, this);
- local = frame->local;
- idx = afr_index_for_transaction_type (local->transaction.type);
-
- xattr = dict_new ();
- if (!xattr) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- afr_changelog_post_op_done (frame, this);
- goto out;
- }
+gf_boolean_t
+afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_inode_ctx_t *ctx = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
- nothing_failed = afr_txn_nothing_failed (frame, this);
+ local = frame->local;
+ priv = this->private;
+ ctx = local->inode_ctx;
- if (afr_changelog_pre_op_uninherit (frame, this))
- need_undirty = _gf_false;
- else
- need_undirty = _gf_true;
+ type = afr_index_for_transaction_type(local->transaction.type);
+ if (type != AFR_DATA_TRANSACTION)
+ return !local->transaction.dirtied;
- if (local->op_ret < 0 && !nothing_failed) {
- if (afr_need_dirty_marking (frame, this)) {
- local->dirty[idx] = hton32(1);
- goto set_dirty;
- }
+ if (local->transaction.no_uninherit)
+ return _gf_false;
- afr_changelog_post_op_done (frame, this);
- goto out;
- }
+ /* This function must be idempotent. So check if we
+ were called before and return the same answer again.
- if (nothing_failed && !need_undirty) {
- afr_changelog_post_op_done (frame, this);
- goto out;
- }
+ It is important to keep this function idempotent for
+ the call in afr_changelog_post_op_safe() to not have
+ side effects on the call from afr_changelog_post_op_now()
+ */
+ if (local->transaction.uninherit_done)
+ return local->transaction.uninherit_value;
- if (local->transaction.in_flight_sb) {
- local->op_ret = -1;
- local->op_errno = local->transaction.in_flight_sb_errno;
- afr_changelog_post_op_done (frame, this);
- goto out;
+ LOCK(&local->inode->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] != ctx->pre_op_done[type][i]) {
+ ret = !local->transaction.dirtied;
+ goto unlock;
+ }
+ }
+
+ if (ctx->inherited[type]) {
+ ret = _gf_true;
+ ctx->inherited[type]--;
+ } else if (ctx->on_disk[type]) {
+ ret = _gf_false;
+ ctx->on_disk[type]--;
+ } else {
+ /* ASSERT */
+ ret = _gf_false;
}
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.failed_subvols[i])
- local->pending[i][idx] = hton32(1);
- }
-
- ret = afr_set_pending_dict (priv, xattr, local->pending);
- if (ret < 0) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- afr_changelog_post_op_done (frame, this);
- goto out;
- }
-
- ret = afr_changelog_thin_arbiter_post_op (this, local);
- if (ret < 0) {
- local->op_ret = -1;
- local->op_errno = -ret;
- afr_changelog_post_op_done (frame, this);
- goto out;
+ if (!ctx->inherited[type] && !ctx->on_disk[type]) {
+ for (i = 0; i < priv->child_count; i++)
+ ctx->pre_op_done[type][i] = 0;
}
-
- if (need_undirty)
- local->dirty[idx] = hton32(-1);
- else
- local->dirty[idx] = hton32(0);
-
-set_dirty:
- ret = dict_set_static_bin (xattr, AFR_DIRTY, local->dirty,
- sizeof(int) * AFR_NUM_CHANGE_LOGS);
- if (ret) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- afr_changelog_post_op_done (frame, this);
- goto out;
- }
-
- afr_changelog_do (frame, this, xattr, afr_changelog_post_op_done,
- AFR_TRANSACTION_POST_OP);
-out:
- if (xattr)
- dict_unref (xattr);
-
- return 0;
-}
-
-
-gf_boolean_t
-afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_inode_ctx_t *ctx = NULL;
- int i = 0;
- gf_boolean_t ret = _gf_false;
- int type = 0;
-
- local = frame->local;
- priv = this->private;
- ctx = local->inode_ctx;
-
- type = afr_index_for_transaction_type (local->transaction.type);
- if (type != AFR_DATA_TRANSACTION)
- return !local->transaction.dirtied;
-
- if (local->transaction.no_uninherit)
- return _gf_false;
-
- /* This function must be idempotent. So check if we
- were called before and return the same answer again.
-
- It is important to keep this function idempotent for
- the call in afr_changelog_post_op_safe() to not have
- side effects on the call from afr_changelog_post_op_now()
- */
- if (local->transaction.uninherit_done)
- return local->transaction.uninherit_value;
-
- LOCK(&local->inode->lock);
- {
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i] !=
- ctx->pre_op_done[type][i]) {
- ret = !local->transaction.dirtied;
- goto unlock;
- }
- }
-
- if (ctx->inherited[type]) {
- ret = _gf_true;
- ctx->inherited[type]--;
- } else if (ctx->on_disk[type]) {
- ret = _gf_false;
- ctx->on_disk[type]--;
- } else {
- /* ASSERT */
- ret = _gf_false;
- }
-
- if (!ctx->inherited[type] && !ctx->on_disk[type]) {
- for (i = 0; i < priv->child_count; i++)
- ctx->pre_op_done[type][i] = 0;
- }
- }
+ }
unlock:
- UNLOCK(&local->inode->lock);
+ UNLOCK(&local->inode->lock);
- local->transaction.uninherit_done = _gf_true;
- local->transaction.uninherit_value = ret;
+ local->transaction.uninherit_done = _gf_true;
+ local->transaction.uninherit_value = ret;
- return ret;
+ return ret;
}
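Editorial aside, not part of the patch: the comment at the top of afr_changelog_pre_op_uninherit() insists on idempotency. A minimal standalone sketch of that cache-the-first-answer pattern, with invented names (the real code uses transaction.uninherit_done and transaction.uninherit_value for the same purpose):

    #include <stdbool.h>

    struct once_bool {
        bool done;  /* has the answer been computed already? */
        bool value; /* the cached answer                     */
    };

    /* The first call computes and caches; every later call returns the cached
     * value, so a second caller triggers no additional side effects. */
    static bool
    answer_once(struct once_bool *memo, bool (*compute)(void *), void *arg)
    {
        if (memo->done)
            return memo->value;
        memo->value = compute(arg);
        memo->done = true;
        return memo->value;
    }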
-
gf_boolean_t
-afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
+afr_changelog_pre_op_inherit(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- gf_boolean_t ret = _gf_false;
- int type = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- if (local->transaction.type != AFR_DATA_TRANSACTION)
- return _gf_false;
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ return _gf_false;
- type = afr_index_for_transaction_type (local->transaction.type);
+ type = afr_index_for_transaction_type(local->transaction.type);
- LOCK(&local->inode->lock);
- {
- if (!local->inode_ctx->on_disk[type]) {
- /* nothing to inherit yet */
- ret = _gf_false;
- goto unlock;
- }
+ LOCK(&local->inode->lock);
+ {
+ if (!local->inode_ctx->on_disk[type]) {
+ /* nothing to inherit yet */
+ ret = _gf_false;
+ goto unlock;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i] !=
- local->inode_ctx->pre_op_done[type][i]) {
- /* either inherit exactly, or don't */
- ret = _gf_false;
- goto unlock;
- }
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] !=
+ local->inode_ctx->pre_op_done[type][i]) {
+ /* either inherit exactly, or don't */
+ ret = _gf_false;
+ goto unlock;
+ }
+ }
- local->inode_ctx->inherited[type]++;
+ local->inode_ctx->inherited[type]++;
- ret = _gf_true;
+ ret = _gf_true;
- local->transaction.inherited = _gf_true;
- }
+ local->transaction.inherited = _gf_true;
+ }
unlock:
- UNLOCK(&local->inode->lock);
+ UNLOCK(&local->inode->lock);
- return ret;
+ return ret;
}
-
gf_boolean_t
-afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- gf_boolean_t ret = _gf_false;
- int type = 0;
-
- local = frame->local;
- priv = this->private;
-
- if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
- local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
- return _gf_false;
-
- if (local->transaction.inherited)
- /* was already inherited in afr_changelog_pre_op */
- return _gf_false;
-
- if (!local->transaction.dirtied)
- return _gf_false;
-
- if (!afr_txn_nothing_failed (frame, this))
- return _gf_false;
-
- type = afr_index_for_transaction_type (local->transaction.type);
-
- ret = _gf_false;
-
- LOCK(&local->inode->lock);
- {
- if (!local->inode_ctx->on_disk[type]) {
- for (i = 0; i < priv->child_count; i++)
- local->inode_ctx->pre_op_done[type][i] =
- (!local->transaction.failed_subvols[i]);
- } else {
- for (i = 0; i < priv->child_count; i++)
- if (local->inode_ctx->pre_op_done[type][i] !=
- (!local->transaction.failed_subvols[i])) {
- local->transaction.no_uninherit = 1;
- goto unlock;
- }
- }
- local->inode_ctx->on_disk[type]++;
-
- ret = _gf_true;
- }
-unlock:
- UNLOCK(&local->inode->lock);
+afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
- return ret;
-}
+ local = frame->local;
+ priv = this->private;
+ if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
+ return _gf_false;
-int
-afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- int call_count = -1;
- int child_index = -1;
+ if (local->transaction.inherited)
+ /* was already inherited in afr_changelog_pre_op */
+ return _gf_false;
- local = frame->local;
- child_index = (long) cookie;
+ if (!local->transaction.dirtied)
+ return _gf_false;
- if (op_ret == -1) {
- local->op_errno = op_errno;
- afr_transaction_fop_failed (frame, this, child_index);
- }
+ if (!afr_txn_nothing_failed(frame, this))
+ return _gf_false;
- if (xattr)
- local->transaction.changelog_xdata[child_index] = dict_ref (xattr);
+ type = afr_index_for_transaction_type(local->transaction.type);
- call_count = afr_frame_return (frame);
+ ret = _gf_false;
- if (call_count == 0) {
- local->transaction.changelog_resume (frame, this);
+ LOCK(&local->inode->lock);
+ {
+ if (!local->inode_ctx->on_disk[type]) {
+ for (i = 0; i < priv->child_count; i++)
+ local->inode_ctx->pre_op_done[type][i] =
+ (!local->transaction.failed_subvols[i]);
+ } else {
+ for (i = 0; i < priv->child_count; i++)
+ if (local->inode_ctx->pre_op_done[type][i] !=
+ (!local->transaction.failed_subvols[i])) {
+ local->transaction.no_uninherit = 1;
+ goto unlock;
+ }
}
+ local->inode_ctx->on_disk[type]++;
- return 0;
+ ret = _gf_true;
+ }
+unlock:
+ UNLOCK(&local->inode->lock);
+
+ return ret;
}
-void
-afr_changelog_populate_xdata (call_frame_t *frame, afr_xattrop_type_t op,
- dict_t **xdata, dict_t **newloc_xdata)
-{
- int i = 0;
- int ret = 0;
- char *key = NULL;
- const char *name = NULL;
- dict_t *xdata1 = NULL;
- dict_t *xdata2 = NULL;
- xlator_t *this = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- gf_boolean_t need_entry_key_set = _gf_true;
-
- local = frame->local;
- this = THIS;
- priv = this->private;
-
- if (local->transaction.type == AFR_DATA_TRANSACTION ||
- local->transaction.type == AFR_METADATA_TRANSACTION)
- goto out;
+int
+afr_changelog_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int child_index = -1;
- if (!priv->esh_granular)
- goto out;
+ local = frame->local;
+ child_index = (long)cookie;
- xdata1 = dict_new();
- if (!xdata1)
- goto out;
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ afr_transaction_fop_failed(frame, this, child_index);
+ }
+
+ if (xattr)
+ local->transaction.changelog_xdata[child_index] = dict_ref(xattr);
- name = local->loc.name;
- if (local->op == GF_FOP_LINK)
- name = local->newloc.name;
+ call_count = afr_frame_return(frame);
- switch (op) {
+ if (call_count == 0) {
+ local->transaction.changelog_resume(frame, this);
+ }
+
+ return 0;
+}
+
+void
+afr_changelog_populate_xdata(call_frame_t *frame, afr_xattrop_type_t op,
+ dict_t **xdata, dict_t **newloc_xdata)
+{
+ int i = 0;
+ int ret = 0;
+ char *key = NULL;
+ const char *name = NULL;
+ dict_t *xdata1 = NULL;
+ dict_t *xdata2 = NULL;
+ xlator_t *this = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t need_entry_key_set = _gf_true;
+
+ local = frame->local;
+ this = THIS;
+ priv = this->private;
+
+ if (local->transaction.type == AFR_DATA_TRANSACTION ||
+ local->transaction.type == AFR_METADATA_TRANSACTION)
+ goto out;
+
+ if (!priv->esh_granular)
+ goto out;
+
+ xdata1 = dict_new();
+ if (!xdata1)
+ goto out;
+
+ name = local->loc.name;
+ if (local->op == GF_FOP_LINK)
+ name = local->newloc.name;
+
+ switch (op) {
case AFR_TRANSACTION_PRE_OP:
- key = GF_XATTROP_ENTRY_IN_KEY;
- break;
+ key = GF_XATTROP_ENTRY_IN_KEY;
+ break;
case AFR_TRANSACTION_POST_OP:
- if (afr_txn_nothing_failed (frame, this)) {
- key = GF_XATTROP_ENTRY_OUT_KEY;
- for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.failed_subvols[i])
- continue;
- need_entry_key_set = _gf_false;
- break;
- }
- /* If the transaction itself did not fail and there
- * are no failed subvolumes, check whether the fop
- * failed due to a symmetric error. If it did, do
- * not set the ENTRY_OUT xattr which would end up
- * deleting a name index which was created possibly by
- * an earlier entry txn that may have failed on some
- * of the sub-volumes.
- */
- if (local->op_ret)
- need_entry_key_set = _gf_false;
- } else {
- key = GF_XATTROP_ENTRY_IN_KEY;
+ if (afr_txn_nothing_failed(frame, this)) {
+ key = GF_XATTROP_ENTRY_OUT_KEY;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.failed_subvols[i])
+ continue;
+ need_entry_key_set = _gf_false;
+ break;
}
- break;
- }
+ /* If the transaction itself did not fail and there
+ * are no failed subvolumes, check whether the fop
+ * failed due to a symmetric error. If it did, do
+ * not set the ENTRY_OUT xattr which would end up
+ * deleting a name index which was created possibly by
+ * an earlier entry txn that may have failed on some
+ * of the sub-volumes.
+ */
+ if (local->op_ret)
+ need_entry_key_set = _gf_false;
+ } else {
+ key = GF_XATTROP_ENTRY_IN_KEY;
+ }
+ break;
+ }
- if (need_entry_key_set) {
- ret = dict_set_str (xdata1, key, (char *)name);
- if (ret)
- gf_msg (THIS->name, GF_LOG_ERROR, 0,
- AFR_MSG_DICT_SET_FAILED,
- "%s/%s: Could not set %s key during xattrop",
- uuid_utoa (local->loc.pargfid), local->loc.name,
- key);
- if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
- xdata2 = dict_new ();
- if (!xdata2)
- goto out;
-
- ret = dict_set_str (xdata2, key,
- (char *)local->newloc.name);
- if (ret)
- gf_msg (THIS->name, GF_LOG_ERROR, 0,
- AFR_MSG_DICT_SET_FAILED,
- "%s/%s: Could not set %s key during "
- "xattrop",
- uuid_utoa (local->newloc.pargfid),
- local->newloc.name, key);
- }
+ if (need_entry_key_set) {
+ ret = dict_set_str(xdata1, key, (char *)name);
+ if (ret)
+ gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "%s/%s: Could not set %s key during xattrop",
+ uuid_utoa(local->loc.pargfid), local->loc.name, key);
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ xdata2 = dict_new();
+ if (!xdata2)
+ goto out;
+
+ ret = dict_set_str(xdata2, key, (char *)local->newloc.name);
+ if (ret)
+ gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "%s/%s: Could not set %s key during "
+ "xattrop",
+ uuid_utoa(local->newloc.pargfid), local->newloc.name,
+ key);
}
+ }
- *xdata = xdata1;
- *newloc_xdata = xdata2;
- xdata1 = xdata2 = NULL;
+ *xdata = xdata1;
+ *newloc_xdata = xdata2;
+ xdata1 = xdata2 = NULL;
out:
- if (xdata1)
- dict_unref (xdata1);
- return;
+ if (xdata1)
+ dict_unref(xdata1);
+ return;
}
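Editorial aside, not part of the patch: the switch above decides which granular-entry-heal key, if any, goes into xdata. A hedged restatement of that rule as a standalone helper, with invented names standing in for GF_XATTROP_ENTRY_IN_KEY and GF_XATTROP_ENTRY_OUT_KEY:

    #include <stddef.h>

    enum txn_phase { PHASE_PRE_OP, PHASE_POST_OP };

    /* Returns the key to set in xdata, or NULL when no key should be set. */
    static const char *
    pick_entry_key(enum txn_phase phase, int txn_failed, int any_subvol_failed,
                   int op_ret)
    {
        if (phase == PHASE_PRE_OP || txn_failed)
            return "entry-in";  /* create or keep the name index          */
        if (any_subvol_failed || op_ret)
            return NULL;        /* symmetric error: leave the index alone */
        return "entry-out";     /* full success: the name index may go    */
    }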
int
-afr_changelog_prepare (xlator_t *this, call_frame_t *frame, int *call_count,
- afr_changelog_resume_t changelog_resume,
- afr_xattrop_type_t op, dict_t **xdata,
- dict_t **newloc_xdata)
+afr_changelog_prepare(xlator_t *this, call_frame_t *frame, int *call_count,
+ afr_changelog_resume_t changelog_resume,
+ afr_xattrop_type_t op, dict_t **xdata,
+ dict_t **newloc_xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- *call_count = afr_changelog_call_count (local->transaction.type,
- local->transaction.pre_op,
- local->transaction.failed_subvols,
- priv->child_count);
+ *call_count = afr_changelog_call_count(
+ local->transaction.type, local->transaction.pre_op,
+ local->transaction.failed_subvols, priv->child_count);
- if (*call_count == 0) {
- changelog_resume (frame, this);
- return -1;
- }
+ if (*call_count == 0) {
+ changelog_resume(frame, this);
+ return -1;
+ }
- afr_changelog_populate_xdata (frame, op, xdata, newloc_xdata);
- local->call_count = *call_count;
+ afr_changelog_populate_xdata(frame, op, xdata, newloc_xdata);
+ local->call_count = *call_count;
- local->transaction.changelog_resume = changelog_resume;
- return 0;
+ local->transaction.changelog_resume = changelog_resume;
+ return 0;
}
int
-afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr,
- afr_changelog_resume_t changelog_resume,
- afr_xattrop_type_t op)
+afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr,
+ afr_changelog_resume_t changelog_resume, afr_xattrop_type_t op)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- dict_t *xdata = NULL;
- dict_t *newloc_xdata = NULL;
- int i = 0;
- int call_count = 0;
- int ret = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ dict_t *newloc_xdata = NULL;
+ int i = 0;
+ int call_count = 0;
+ int ret = 0;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.changelog_xdata[i]) {
- dict_unref (local->transaction.changelog_xdata[i]);
- local->transaction.changelog_xdata[i] = NULL;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.changelog_xdata[i]) {
+ dict_unref(local->transaction.changelog_xdata[i]);
+ local->transaction.changelog_xdata[i] = NULL;
}
+ }
- ret = afr_changelog_prepare (this, frame, &call_count, changelog_resume,
- op, &xdata, &newloc_xdata);
+ ret = afr_changelog_prepare(this, frame, &call_count, changelog_resume, op,
+ &xdata, &newloc_xdata);
- if (ret)
- return 0;
+ if (ret)
+ return 0;
- for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.pre_op[i] ||
- local->transaction.failed_subvols[i])
- continue;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i] ||
+ local->transaction.failed_subvols[i])
+ continue;
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- if (!local->fd) {
- STACK_WIND_COOKIE (frame, afr_changelog_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, xattr,
- xdata);
- } else {
- STACK_WIND_COOKIE (frame, afr_changelog_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr,
- xdata);
- }
- break;
- case AFR_ENTRY_RENAME_TRANSACTION:
-
- STACK_WIND_COOKIE (frame, afr_changelog_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr,
- newloc_xdata);
- call_count--;
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ if (!local->fd) {
+ STACK_WIND_COOKIE(
+ frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->xattrop,
+ &local->loc, GF_XATTROP_ADD_ARRAY, xattr, xdata);
+ } else {
+ STACK_WIND_COOKIE(
+ frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->fxattrop,
+ local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata);
+ }
+ break;
+ case AFR_ENTRY_RENAME_TRANSACTION:
+
+ STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr, newloc_xdata);
+ call_count--;
/* fall through */
- case AFR_ENTRY_TRANSACTION:
- if (local->fd)
- STACK_WIND_COOKIE (frame, afr_changelog_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr,
- xdata);
- else
- STACK_WIND_COOKIE (frame, afr_changelog_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr,
- xdata);
- break;
- }
-
- if (!--call_count)
- break;
+ case AFR_ENTRY_TRANSACTION:
+ if (local->fd)
+ STACK_WIND_COOKIE(
+ frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->fxattrop,
+ local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata);
+ else
+ STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr, xdata);
+ break;
}
- if (xdata)
- dict_unref (xdata);
- if (newloc_xdata)
- dict_unref (newloc_xdata);
- return 0;
+ if (!--call_count)
+ break;
+ }
+
+ if (xdata)
+ dict_unref(xdata);
+ if (newloc_xdata)
+ dict_unref(newloc_xdata);
+ return 0;
}
static void
-afr_init_optimistic_changelog_for_txn (xlator_t *this, afr_local_t *local)
+afr_init_optimistic_changelog_for_txn(xlator_t *this, afr_local_t *local)
{
- int locked_count = 0;
- afr_private_t *priv = NULL;
+ int locked_count = 0;
+ afr_private_t *priv = NULL;
- priv = this->private;
+ priv = this->private;
- locked_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);
- if (priv->optimistic_change_log && locked_count == priv->child_count)
- local->optimistic_change_log = 1;
+ locked_count = AFR_COUNT(local->transaction.pre_op, priv->child_count);
+ if (priv->optimistic_change_log && locked_count == priv->child_count)
+ local->optimistic_change_log = 1;
- return;
+ return;
}
int
-afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = this->private;
- int i = 0;
- int ret = 0;
- int call_count = 0;
- int op_errno = 0;
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- unsigned char *locked_nodes = NULL;
- int idx = -1;
- gf_boolean_t pre_nop = _gf_true;
- dict_t *xdata_req = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- idx = afr_index_for_transaction_type (local->transaction.type);
-
- locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock);
-
- for (i = 0; i < priv->child_count; i++) {
- if (locked_nodes[i]) {
- local->transaction.pre_op[i] = 1;
- call_count++;
- } else {
- local->transaction.failed_subvols[i] = 1;
- }
- }
-
- afr_init_optimistic_changelog_for_txn (this, local);
-
- if (afr_changelog_pre_op_inherit (frame, this))
- goto next;
-
- /* This condition should not be met with present code, as
- * transaction.done will be called if locks are not acquired on even a
- * single node.
- */
- if (call_count == 0) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Check if the fop can be performed on at least
- * quorum number of nodes.
- */
- if (priv->quorum_count && !afr_has_fop_quorum (frame)) {
- op_errno = int_lock->lock_op_errno;
- if (op_errno == 0)
- op_errno = afr_quorum_errno (priv);
- goto err;
+afr_changelog_pre_op(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ int i = 0;
+ int ret = 0;
+ int call_count = 0;
+ int op_errno = 0;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ unsigned char *locked_nodes = NULL;
+ int idx = -1;
+ gf_boolean_t pre_nop = _gf_true;
+ dict_t *xdata_req = NULL;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ idx = afr_index_for_transaction_type(local->transaction.type);
+
+ locked_nodes = afr_locked_nodes_get(local->transaction.type, int_lock);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_nodes[i]) {
+ local->transaction.pre_op[i] = 1;
+ call_count++;
+ } else {
+ local->transaction.failed_subvols[i] = 1;
+ }
+ }
+
+ afr_init_optimistic_changelog_for_txn(this, local);
+
+ if (afr_changelog_pre_op_inherit(frame, this))
+ goto next;
+
+ /* This condition should not be met with present code, as
+ * transaction.done will be called if locks are not acquired on even a
+ * single node.
+ */
+ if (call_count == 0) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+
+ /* Check if the fop can be performed on at least
+ * quorum number of nodes.
+ */
+ if (priv->quorum_count && !afr_has_fop_quorum(frame)) {
+ op_errno = int_lock->lock_op_errno;
+ if (op_errno == 0)
+ op_errno = afr_quorum_errno(priv);
+ goto err;
+ }
+
+ xdata_req = dict_new();
+ if (!xdata_req) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (call_count < priv->child_count)
+ pre_nop = _gf_false;
+
+ /* Set an all-zero pending changelog so that in the cbk, we can get the
+ * current on-disk values. In a replica 3 volume with arbiter enabled,
+ * these values are needed to arrive at a go/ no-go of the fop phase to
+ * avoid ending up in split-brain.*/
+
+ ret = afr_set_pending_dict(priv, xdata_req, local->pending);
+ if (ret < 0) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (afr_needs_changelog_update(local)) {
+ local->dirty[idx] = hton32(1);
+
+ ret = dict_set_static_bin(xdata_req, AFR_DIRTY, local->dirty,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto err;
}
- xdata_req = dict_new();
- if (!xdata_req) {
- op_errno = ENOMEM;
- goto err;
- }
-
- if (call_count < priv->child_count)
- pre_nop = _gf_false;
-
- /* Set an all-zero pending changelog so that in the cbk, we can get the
- * current on-disk values. In a replica 3 volume with arbiter enabled,
- * these values are needed to arrive at a go/ no-go of the fop phase to
- * avoid ending up in split-brain.*/
-
- ret = afr_set_pending_dict (priv, xdata_req, local->pending);
- if (ret < 0) {
- op_errno = ENOMEM;
- goto err;
- }
-
- if (afr_needs_changelog_update (local)) {
-
- local->dirty[idx] = hton32(1);
-
- ret = dict_set_static_bin (xdata_req, AFR_DIRTY, local->dirty,
- sizeof(int) * AFR_NUM_CHANGE_LOGS);
- if (ret) {
- op_errno = ENOMEM;
- goto err;
- }
+ pre_nop = _gf_false;
+ local->transaction.dirtied = 1;
+ }
- pre_nop = _gf_false;
- local->transaction.dirtied = 1;
- }
+ if (pre_nop)
+ goto next;
- if (pre_nop)
- goto next;
+ if (!local->pre_op_compat) {
+ dict_copy(xdata_req, local->xdata_req);
+ goto next;
+ }
- if (!local->pre_op_compat) {
- dict_copy (xdata_req, local->xdata_req);
- goto next;
- }
+ afr_changelog_do(frame, this, xdata_req, afr_transaction_perform_fop,
+ AFR_TRANSACTION_PRE_OP);
- afr_changelog_do (frame, this, xdata_req, afr_transaction_perform_fop,
- AFR_TRANSACTION_PRE_OP);
+ if (xdata_req)
+ dict_unref(xdata_req);
- if (xdata_req)
- dict_unref (xdata_req);
-
- return 0;
+ return 0;
next:
- afr_transaction_perform_fop (frame, this);
+ afr_transaction_perform_fop(frame, this);
- if (xdata_req)
- dict_unref (xdata_req);
+ if (xdata_req)
+ dict_unref(xdata_req);
- return 0;
+ return 0;
err:
- local->internal_lock.lock_cbk = afr_transaction_done;
- local->op_ret = -1;
- local->op_errno = op_errno;
+ local->internal_lock.lock_cbk = afr_transaction_done;
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- afr_handle_lock_acquire_failure (local, _gf_true);
+ afr_handle_lock_acquire_failure(local, _gf_true);
- if (xdata_req)
- dict_unref (xdata_req);
+ if (xdata_req)
+ dict_unref(xdata_req);
- return 0;
+ return 0;
}
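Editorial aside, not part of the patch: the hton32()/dict_set_static_bin() pair above ships the dirty counter as an array of network-byte-order 32-bit integers indexed by transaction type. A standalone sketch of that encoding, with an invented NUM_CHANGE_LOGS standing in for AFR_NUM_CHANGE_LOGS:

    #include <arpa/inet.h> /* htonl() */
    #include <stdint.h>
    #include <string.h>

    enum { NUM_CHANGE_LOGS = 3 }; /* data, metadata, entry */

    /* Zero every slot and mark only the index of this transaction type:
     * +1 marks it dirty in the pre-op, -1 or 0 undirties it in the post-op. */
    static void
    encode_changelog(uint32_t out[NUM_CHANGE_LOGS], int idx, int32_t delta)
    {
        memset(out, 0, sizeof(uint32_t) * NUM_CHANGE_LOGS);
        out[idx] = htonl((uint32_t)delta);
    }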
-
int
-afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
+afr_post_nonblocking_inodelk_cbk(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- /* Initiate blocking locks if non-blocking has failed */
- if (int_lock->lock_op_ret < 0) {
- gf_msg_debug (this->name, 0,
- "Non blocking inodelks failed. Proceeding to blocking");
- int_lock->lock_cbk = afr_internal_lock_finish;
- afr_blocking_lock (frame, this);
- } else {
+ /* Initiate blocking locks if non-blocking has failed */
+ if (int_lock->lock_op_ret < 0) {
+ gf_msg_debug(this->name, 0,
+ "Non blocking inodelks failed. Proceeding to blocking");
+ int_lock->lock_cbk = afr_internal_lock_finish;
+ afr_blocking_lock(frame, this);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Non blocking inodelks done. Proceeding to FOP");
+ afr_internal_lock_finish(frame, this);
+ }
- gf_msg_debug (this->name, 0,
- "Non blocking inodelks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
- }
-
- return 0;
+ return 0;
}
-
int
-afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
+afr_post_nonblocking_entrylk_cbk(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
- /* Initiate blocking locks if non-blocking has failed */
- if (int_lock->lock_op_ret < 0) {
- gf_msg_debug (this->name, 0,
- "Non blocking entrylks failed. Proceeding to blocking");
- int_lock->lock_cbk = afr_internal_lock_finish;
- afr_blocking_lock (frame, this);
- } else {
+ local = frame->local;
+ int_lock = &local->internal_lock;
- gf_msg_debug (this->name, 0,
- "Non blocking entrylks done. Proceeding to FOP");
+ /* Initiate blocking locks if non-blocking has failed */
+ if (int_lock->lock_op_ret < 0) {
+ gf_msg_debug(this->name, 0,
+ "Non blocking entrylks failed. Proceeding to blocking");
+ int_lock->lock_cbk = afr_internal_lock_finish;
+ afr_blocking_lock(frame, this);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Non blocking entrylks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
- }
+ afr_internal_lock_finish(frame, this);
+ }
- return 0;
+ return 0;
}
-
int
-afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this)
+afr_post_blocking_rename_cbk(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- if (int_lock->lock_op_ret < 0) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_BLOCKING_LKS_FAILED,
- "Blocking entrylks failed.");
-
- afr_transaction_done (frame, this);
- } else {
+ if (int_lock->lock_op_ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_BLOCKING_LKS_FAILED,
+ "Blocking entrylks failed.");
- gf_msg_debug (this->name, 0,
- "Blocking entrylks done. Proceeding to FOP");
+ afr_transaction_done(frame, this);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Blocking entrylks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
- }
- return 0;
+ afr_internal_lock_finish(frame, this);
+ }
+ return 0;
}
int
-afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this)
+afr_post_lower_unlock_cbk(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- GF_ASSERT (!int_lock->higher_locked);
+ GF_ASSERT(!int_lock->higher_locked);
- int_lock->lock_cbk = afr_post_blocking_rename_cbk;
- afr_blocking_lock (frame, this);
+ int_lock->lock_cbk = afr_post_blocking_rename_cbk;
+ afr_blocking_lock(frame, this);
- return 0;
+ return 0;
}
-
int
-afr_set_transaction_flock (xlator_t *this, afr_local_t *local)
+afr_set_transaction_flock(xlator_t *this, afr_local_t *local)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_private_t *priv = NULL;
- int_lock = &local->internal_lock;
- priv = this->private;
+ int_lock = &local->internal_lock;
+ priv = this->private;
- if ((priv->arbiter_count || local->transaction.eager_lock_on ||
- priv->full_lock) &&
- local->transaction.type == AFR_DATA_TRANSACTION) {
- /*Lock entire file to avoid network split brains.*/
- int_lock->flock.l_len = 0;
- int_lock->flock.l_start = 0;
- } else {
- int_lock->flock.l_len = local->transaction.len;
- int_lock->flock.l_start = local->transaction.start;
- }
- int_lock->flock.l_type = F_WRLCK;
+ if ((priv->arbiter_count || local->transaction.eager_lock_on ||
+ priv->full_lock) &&
+ local->transaction.type == AFR_DATA_TRANSACTION) {
+ /*Lock entire file to avoid network split brains.*/
+ int_lock->flock.l_len = 0;
+ int_lock->flock.l_start = 0;
+ } else {
+ int_lock->flock.l_len = local->transaction.len;
+ int_lock->flock.l_start = local->transaction.start;
+ }
+ int_lock->flock.l_type = F_WRLCK;
- return 0;
+ return 0;
}
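Editorial aside, not part of the patch: afr_set_transaction_flock() either locks the whole file or only the byte range touched by the fop. The same choice written against the standard struct flock, with a hypothetical helper name:

    #include <fcntl.h>
    #include <string.h>
    #include <sys/types.h>

    static void
    fill_write_lock(struct flock *fl, int lock_whole_file, off_t start,
                    off_t len)
    {
        memset(fl, 0, sizeof(*fl));
        if (lock_whole_file) {
            fl->l_start = 0;
            fl->l_len = 0; /* l_len == 0 means "up to end of file" */
        } else {
            fl->l_start = start;
            fl->l_len = len;
        }
        fl->l_type = F_WRLCK;
        fl->l_whence = SEEK_SET;
    }

In the code above, the whole-file case corresponds to "arbiter enabled, eager-lock on, or full-lock set, and the transaction is a data transaction".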
int
-afr_lock (call_frame_t *frame, xlator_t *this)
+afr_lock(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- int_lock->domain = this->name;
+ int_lock->domain = this->name;
- switch (local->transaction.type) {
+ switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- afr_set_transaction_flock (this, local);
+ afr_set_transaction_flock(this, local);
- int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk;
+ int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk;
- afr_nonblocking_inodelk (frame, this);
- break;
+ afr_nonblocking_inodelk(frame, this);
+ break;
case AFR_ENTRY_RENAME_TRANSACTION:
- int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk;
- afr_nonblocking_entrylk (frame, this);
- break;
+ int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk;
+ afr_nonblocking_entrylk(frame, this);
+ break;
case AFR_ENTRY_TRANSACTION:
- int_lock->lk_basename = local->transaction.basename;
- if (local->transaction.parent_loc.path)
- int_lock->lk_loc = &local->transaction.parent_loc;
- else
- GF_ASSERT (local->fd);
+ int_lock->lk_basename = local->transaction.basename;
+ if (local->transaction.parent_loc.path)
+ int_lock->lk_loc = &local->transaction.parent_loc;
+ else
+ GF_ASSERT(local->fd);
- int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk;
- afr_nonblocking_entrylk (frame, this);
- break;
- }
+ int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk;
+ afr_nonblocking_entrylk(frame, this);
+ break;
+ }
- return 0;
+ return 0;
}
static gf_boolean_t
-afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
+afr_locals_overlap(afr_local_t *local1, afr_local_t *local2)
{
- uint64_t start1 = local1->transaction.start;
- uint64_t start2 = local2->transaction.start;
- uint64_t end1 = 0;
- uint64_t end2 = 0;
+ uint64_t start1 = local1->transaction.start;
+ uint64_t start2 = local2->transaction.start;
+ uint64_t end1 = 0;
+ uint64_t end2 = 0;
- if (local1->transaction.len)
- end1 = start1 + local1->transaction.len - 1;
- else
- end1 = ULLONG_MAX;
+ if (local1->transaction.len)
+ end1 = start1 + local1->transaction.len - 1;
+ else
+ end1 = ULLONG_MAX;
- if (local2->transaction.len)
- end2 = start2 + local2->transaction.len - 1;
- else
- end2 = ULLONG_MAX;
+ if (local2->transaction.len)
+ end2 = start2 + local2->transaction.len - 1;
+ else
+ end2 = ULLONG_MAX;
- return ((end1 >= start2) && (end2 >= start1));
+ return ((end1 >= start2) && (end2 >= start1));
}
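Editorial aside, not part of the patch: afr_locals_overlap() treats a zero length as "to end of file". The same predicate on plain integers:

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    ranges_overlap(uint64_t start1, uint64_t len1, uint64_t start2,
                   uint64_t len2)
    {
        uint64_t end1 = len1 ? start1 + len1 - 1 : UINT64_MAX;
        uint64_t end2 = len2 ? start2 + len2 - 1 : UINT64_MAX;

        return (end1 >= start2) && (end2 >= start1);
    }

For example, a write covering offsets 0-4095 overlaps one covering 4095-4104 (end1 = 4095 >= start2 = 4095), while it does not overlap one starting at offset 4096.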
gf_boolean_t
-afr_has_lock_conflict (afr_local_t *local, gf_boolean_t waitlist_check)
-{
- afr_local_t *each = NULL;
- afr_lock_t *lock = NULL;
-
- lock = &local->inode_ctx->lock[local->transaction.type];
- /*
- * Once full file lock is acquired in eager-lock phase, overlapping
- * writes do not compete for inode-locks, instead are transferred to the
- * next writes. Because of this overlapping writes are not ordered.
- * This can cause inconsistencies in replication.
- * Example:
- * Two overlapping writes w1, w2 are sent in parallel on same fd
- * in two threads t1, t2.
- * Both threads can execute afr_writev_wind in the following manner.
- * t1 winds w1 on brick-0
- * t2 winds w2 on brick-0
- * t2 winds w2 on brick-1
- * t1 winds w1 on brick-1
- *
- * This check makes sure the locks are not transferred for
- * overlapping writes.
- */
- list_for_each_entry (each, &lock->owners, transaction.owner_list) {
- if (afr_locals_overlap (each, local)) {
- return _gf_true;
- }
- }
-
- if (!waitlist_check)
- return _gf_false;
- list_for_each_entry (each, &lock->waiting, transaction.wait_list) {
- if (afr_locals_overlap (each, local)) {
- return _gf_true;
- }
- }
+afr_has_lock_conflict(afr_local_t *local, gf_boolean_t waitlist_check)
+{
+ afr_local_t *each = NULL;
+ afr_lock_t *lock = NULL;
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ /*
+ * Once full file lock is acquired in eager-lock phase, overlapping
+ * writes do not compete for inode-locks, instead are transferred to the
+ * next writes. Because of this overlapping writes are not ordered.
+ * This can cause inconsistencies in replication.
+ * Example:
+ * Two overlapping writes w1, w2 are sent in parallel on same fd
+ * in two threads t1, t2.
+ * Both threads can execute afr_writev_wind in the following manner.
+ * t1 winds w1 on brick-0
+ * t2 winds w2 on brick-0
+ * t2 winds w2 on brick-1
+ * t1 winds w1 on brick-1
+ *
+ * This check makes sure the locks are not transferred for
+ * overlapping writes.
+ */
+ list_for_each_entry(each, &lock->owners, transaction.owner_list)
+ {
+ if (afr_locals_overlap(each, local)) {
+ return _gf_true;
+ }
+ }
+
+ if (!waitlist_check)
return _gf_false;
+ list_for_each_entry(each, &lock->waiting, transaction.wait_list)
+ {
+ if (afr_locals_overlap(each, local)) {
+ return _gf_true;
+ }
+ }
+ return _gf_false;
}
-
/* }}} */
static void
-afr_copy_inodelk_vars (afr_internal_lock_t *dst, afr_internal_lock_t *src,
- xlator_t *this)
+afr_copy_inodelk_vars(afr_internal_lock_t *dst, afr_internal_lock_t *src,
+ xlator_t *this)
{
- afr_private_t *priv = this->private;
+ afr_private_t *priv = this->private;
- dst->domain = src->domain;
- dst->flock.l_len = src->flock.l_len;
- dst->flock.l_start = src->flock.l_start;
- dst->flock.l_type = src->flock.l_type;
- dst->lock_count = src->lock_count;
- memcpy (dst->locked_nodes, src->locked_nodes,
- priv->child_count * sizeof (*dst->locked_nodes));
+ dst->domain = src->domain;
+ dst->flock.l_len = src->flock.l_len;
+ dst->flock.l_start = src->flock.l_start;
+ dst->flock.l_type = src->flock.l_type;
+ dst->lock_count = src->lock_count;
+ memcpy(dst->locked_nodes, src->locked_nodes,
+ priv->child_count * sizeof(*dst->locked_nodes));
}
void
-__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared)
-{
- gf_boolean_t conflict = _gf_false;
- afr_local_t *each = NULL;
- afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type];
-
- while (!conflict) {
- if (list_empty (&lock->waiting))
- return;
- each = list_entry(lock->waiting.next, afr_local_t,
- transaction.wait_list);
- if (afr_has_lock_conflict (each, _gf_false)) {
- conflict = _gf_true;
- }
- if (conflict && !list_empty (&lock->owners))
- return;
- afr_copy_inodelk_vars (&each->internal_lock,
- &local->internal_lock,
- each->transaction.frame->this);
- list_move_tail (&each->transaction.wait_list, shared);
- list_add_tail(&each->transaction.owner_list, &lock->owners);
+__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared)
+{
+ gf_boolean_t conflict = _gf_false;
+ afr_local_t *each = NULL;
+ afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type];
+
+ while (!conflict) {
+ if (list_empty(&lock->waiting))
+ return;
+ each = list_entry(lock->waiting.next, afr_local_t,
+ transaction.wait_list);
+ if (afr_has_lock_conflict(each, _gf_false)) {
+ conflict = _gf_true;
}
+ if (conflict && !list_empty(&lock->owners))
+ return;
+ afr_copy_inodelk_vars(&each->internal_lock, &local->internal_lock,
+ each->transaction.frame->this);
+ list_move_tail(&each->transaction.wait_list, shared);
+ list_add_tail(&each->transaction.owner_list, &lock->owners);
+ }
}
static void
-afr_lock_resume_shared (struct list_head *list)
+afr_lock_resume_shared(struct list_head *list)
{
- afr_local_t *each = NULL;
+ afr_local_t *each = NULL;
- while (!list_empty(list)) {
- each = list_entry(list->next, afr_local_t,
- transaction.wait_list);
- list_del_init(&each->transaction.wait_list);
- afr_changelog_pre_op (each->transaction.frame,
- each->transaction.frame->this);
- }
+ while (!list_empty(list)) {
+ each = list_entry(list->next, afr_local_t, transaction.wait_list);
+ list_del_init(&each->transaction.wait_list);
+ afr_changelog_pre_op(each->transaction.frame,
+ each->transaction.frame->this);
+ }
}
int
-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
+afr_internal_lock_finish(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = frame->local;
- afr_lock_t *lock = NULL;
+ afr_local_t *local = frame->local;
+ afr_lock_t *lock = NULL;
-
- local->internal_lock.lock_cbk = NULL;
- if (!local->transaction.eager_lock_on) {
- if (local->internal_lock.lock_op_ret < 0) {
- afr_transaction_done (frame, this);
- return 0;
- }
- afr_changelog_pre_op (frame, this);
+ local->internal_lock.lock_cbk = NULL;
+ if (!local->transaction.eager_lock_on) {
+ if (local->internal_lock.lock_op_ret < 0) {
+ afr_transaction_done(frame, this);
+ return 0;
+ }
+ afr_changelog_pre_op(frame, this);
+ } else {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (local->internal_lock.lock_op_ret < 0) {
+ afr_handle_lock_acquire_failure(local, _gf_false);
} else {
- lock = &local->inode_ctx->lock[local->transaction.type];
- if (local->internal_lock.lock_op_ret < 0) {
- afr_handle_lock_acquire_failure (local, _gf_false);
- } else {
- lock->event_generation = local->event_generation;
- afr_changelog_pre_op (frame, this);
- }
+ lock->event_generation = local->event_generation;
+ afr_changelog_pre_op(frame, this);
}
+ }
- return 0;
+ return 0;
}
gf_boolean_t
-afr_are_multiple_fds_opened (afr_local_t *local, xlator_t *this)
+afr_are_multiple_fds_opened(afr_local_t *local, xlator_t *this)
{
- /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
- * is taken mount2 opened the same file, it won't be able to
- * perform any data operations until mount1 releases eager-lock.
- * To avoid such scenario do not enable eager-lock for this transaction
- * if open-fd-count is > 1
- */
+ /* Let's say mount1 has eager-lock(full-lock) and after the eager-lock
+ * is taken mount2 opened the same file, it won't be able to
+ * perform any data operations until mount1 releases eager-lock.
+ * To avoid such a scenario do not enable eager-lock for this transaction
+ * if open-fd-count is > 1
+ */
- if (local->inode_ctx->open_fd_count > 1)
- return _gf_true;
+ if (local->inode_ctx->open_fd_count > 1)
+ return _gf_true;
- return _gf_false;
+ return _gf_false;
}
-
gf_boolean_t
-afr_is_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this,
- int delay)
-{
- afr_local_t *local = NULL;
- afr_lock_t *lock = NULL;
- gf_boolean_t res = _gf_false;
-
- local = frame->local;
- lock = &local->inode_ctx->lock[local->transaction.type];
-
- if (!afr_txn_nothing_failed (frame, this)) {
- lock->release = _gf_true;
- goto out;
- }
-
- if (afr_are_multiple_fds_opened (local, this)) {
- lock->release = _gf_true;
- goto out;
- }
-
- if (!list_empty (&lock->owners))
- goto out;
- else
- GF_ASSERT (list_empty (&lock->waiting));
-
- if (lock->release) {
- goto out;
- }
-
- if (!delay) {
- goto out;
- }
-
- if ((local->op != GF_FOP_WRITE) &&
- (local->op != GF_FOP_FXATTROP)) {
- /*Only allow writes but shard does [f]xattrops on writes, so
- * they are fine too*/
- goto out;
- }
-
- res = _gf_true;
+afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this,
+ int delay)
+{
+ afr_local_t *local = NULL;
+ afr_lock_t *lock = NULL;
+ gf_boolean_t res = _gf_false;
+
+ local = frame->local;
+ lock = &local->inode_ctx->lock[local->transaction.type];
+
+ if (!afr_txn_nothing_failed(frame, this)) {
+ lock->release = _gf_true;
+ goto out;
+ }
+
+ if (afr_are_multiple_fds_opened(local, this)) {
+ lock->release = _gf_true;
+ goto out;
+ }
+
+ if (!list_empty(&lock->owners))
+ goto out;
+ else
+ GF_ASSERT(list_empty(&lock->waiting));
+
+ if (lock->release) {
+ goto out;
+ }
+
+ if (!delay) {
+ goto out;
+ }
+
+ if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP)) {
+ /*Only allow writes but shard does [f]xattrops on writes, so
+ * they are fine too*/
+ goto out;
+ }
+
+ res = _gf_true;
out:
- return res;
+ return res;
}
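Editorial aside, not part of the patch: condensed into a single predicate with invented names, the checks in afr_is_delayed_changelog_post_op_needed() amount to the sketch below; the real function additionally flips lock->release in the failure and multi-fd cases, which the sketch leaves out.

    #include <stdbool.h>

    static bool
    should_delay_post_op(bool nothing_failed, bool multiple_fds_open,
                         bool other_lock_owners, bool release_requested,
                         int delay_secs, bool is_write_or_fxattrop)
    {
        if (!nothing_failed || multiple_fds_open)
            return false; /* something went wrong: finish and release now */
        if (other_lock_owners || release_requested || !delay_secs)
            return false; /* lock busy, release pending, or no delay set  */
        return is_write_or_fxattrop; /* only writes/[f]xattrops benefit   */
    }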
-
void
-afr_delayed_changelog_wake_up_cbk (void *data)
-{
- afr_lock_t *lock = NULL;
- afr_local_t *local = data;
- afr_local_t *timer_local = NULL;
- struct list_head shared;
-
- INIT_LIST_HEAD (&shared);
- lock = &local->inode_ctx->lock[local->transaction.type];
- LOCK (&local->inode->lock);
- {
- timer_local = list_entry(lock->post_op.next,
- afr_local_t,
- transaction.owner_list);
- if (list_empty (&lock->owners) && (local == timer_local)) {
- GF_ASSERT (list_empty (&lock->waiting));
- /*Last owner*/
- lock->release = _gf_true;
- lock->delay_timer = NULL;
- }
- }
- UNLOCK (&local->inode->lock);
- afr_changelog_post_op_now (local->transaction.frame,
- local->transaction.frame->this);
+afr_delayed_changelog_wake_up_cbk(void *data)
+{
+ afr_lock_t *lock = NULL;
+ afr_local_t *local = data;
+ afr_local_t *timer_local = NULL;
+ struct list_head shared;
+
+ INIT_LIST_HEAD(&shared);
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ timer_local = list_entry(lock->post_op.next, afr_local_t,
+ transaction.owner_list);
+ if (list_empty(&lock->owners) && (local == timer_local)) {
+ GF_ASSERT(list_empty(&lock->waiting));
+ /*Last owner*/
+ lock->release = _gf_true;
+ lock->delay_timer = NULL;
+ }
+ }
+ UNLOCK(&local->inode->lock);
+ afr_changelog_post_op_now(local->transaction.frame,
+ local->transaction.frame->this);
}
-
/* SET operation */
int
-afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local)
+afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local)
{
- LOCK(&local->inode->lock);
- {
- local->inode_ctx->witnessed_unstable_write = _gf_true;
- }
- UNLOCK(&local->inode->lock);
+ LOCK(&local->inode->lock);
+ {
+ local->inode_ctx->witnessed_unstable_write = _gf_true;
+ }
+ UNLOCK(&local->inode->lock);
- return 0;
+ return 0;
}
/* TEST and CLEAR operation */
gf_boolean_t
-afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode)
+afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode)
{
- afr_inode_ctx_t *ctx = NULL;
- gf_boolean_t witness = _gf_false;
+ afr_inode_ctx_t *ctx = NULL;
+ gf_boolean_t witness = _gf_false;
- LOCK(&inode->lock);
- {
- (void)__afr_inode_ctx_get (this, inode, &ctx);
+ LOCK(&inode->lock);
+ {
+ (void)__afr_inode_ctx_get(this, inode, &ctx);
- if (ctx->witnessed_unstable_write) {
- witness = _gf_true;
- ctx->witnessed_unstable_write = _gf_false;
- }
+ if (ctx->witnessed_unstable_write) {
+ witness = _gf_true;
+ ctx->witnessed_unstable_write = _gf_false;
}
- UNLOCK (&inode->lock);
+ }
+ UNLOCK(&inode->lock);
- return witness;
+ return witness;
}
-
int
-afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *pre,
- struct iatt *post, dict_t *xdata)
+afr_changelog_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- int child_index = (long) cookie;
- int call_count = -1;
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = (long)cookie;
+ int call_count = -1;
+ afr_local_t *local = NULL;
- priv = this->private;
- local = frame->local;
+ priv = this->private;
+ local = frame->local;
- if (op_ret != 0) {
- /* Failure of fsync() is as good as failure of previous
- write(). So treat it like one.
- */
- gf_msg (this->name, GF_LOG_WARNING,
- op_errno, AFR_MSG_FSYNC_FAILED,
- "fsync(%s) failed on subvolume %s. Transaction was %s",
- uuid_utoa (local->fd->inode->gfid),
- priv->children[child_index]->name,
- gf_fop_list[local->op]);
+ if (op_ret != 0) {
+ /* Failure of fsync() is as good as failure of previous
+ write(). So treat it like one.
+ */
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, AFR_MSG_FSYNC_FAILED,
+ "fsync(%s) failed on subvolume %s. Transaction was %s",
+ uuid_utoa(local->fd->inode->gfid),
+ priv->children[child_index]->name, gf_fop_list[local->op]);
- afr_transaction_fop_failed (frame, this, child_index);
- }
+ afr_transaction_fop_failed(frame, this, child_index);
+ }
- call_count = afr_frame_return (frame);
+ call_count = afr_frame_return(frame);
- if (call_count == 0)
- afr_changelog_post_op_now (frame, this);
+ if (call_count == 0)
+ afr_changelog_post_op_now(frame, this);
- return 0;
+ return 0;
}
-
int
-afr_changelog_fsync (call_frame_t *frame, xlator_t *this)
+afr_changelog_fsync(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- int i = 0;
- int call_count = 0;
- afr_private_t *priv = NULL;
- dict_t *xdata = NULL;
- GF_UNUSED int ret = -1;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ GF_UNUSED int ret = -1;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);
+ call_count = AFR_COUNT(local->transaction.pre_op, priv->child_count);
- if (!call_count) {
- /* will go straight to unlock */
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
+ if (!call_count) {
+ /* will go straight to unlock */
+ afr_changelog_post_op_now(frame, this);
+ return 0;
+ }
- local->call_count = call_count;
+ local->call_count = call_count;
- xdata = dict_new();
- if (xdata)
- ret = dict_set_int32 (xdata, "batch-fsync", 1);
+ xdata = dict_new();
+ if (xdata)
+ ret = dict_set_int32(xdata, "batch-fsync", 1);
- for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.pre_op[i])
- continue;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
- STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk,
- (void *) (long) i, priv->children[i],
- priv->children[i]->fops->fsync, local->fd,
- 1, xdata);
- if (!--call_count)
- break;
- }
+ STACK_WIND_COOKIE(frame, afr_changelog_fsync_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->fsync,
+ local->fd, 1, xdata);
+ if (!--call_count)
+ break;
+ }
- if (xdata)
- dict_unref (xdata);
+ if (xdata)
+ dict_unref(xdata);
- return 0;
+ return 0;
}
-
int
-afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
+afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
+ local = frame->local;
+ priv = this->private;
- if (afr_changelog_pre_op_uninherit (frame, this) &&
- afr_txn_nothing_failed (frame, this)) {
- /* just detected that this post-op is about to
- be optimized away as a new write() has
- already piggybacked on this frame's changelog.
- */
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
+ if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
+ afr_changelog_post_op_now(frame, this);
+ return 0;
+ }
- /* Calling afr_changelog_post_op_now() now will result in
- issuing ->[f]xattrop().
-
- Performing a hard POST-OP (->[f]xattrop() FOP) is a more
- responsible operation that what it might appear on the surface.
-
- The changelog of a file (in the xattr of the file on the server)
- stores information (pending count) about the state of the file
- on the OTHER server. This changelog is blindly trusted, and must
- therefore be updated in such a way it remains trustworthy. This
- implies that decrementing the pending count (essentially "clearing
- the dirty flag") must be done STRICTLY after we are sure that the
- operation on the other server has reached stable storage.
-
- While the backend filesystem on that server will eventually flush
- it to stable storage, we (being in userspace) have no mechanism
- to get notified when the write became "stable".
-
- This means we need take matter into our own hands and issue an
- fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
- and get an acknowledgement for it. And we need to wait for the
- fsync() acknowledgement before initiating the hard POST-OP.
-
- However if the FD itself was opened in O_SYNC or O_DSYNC then
- we are already guaranteed that the writes were made stable as
- part of the FOP itself. The same holds true for NFS stable
- writes which happen on an anonymous FD with O_DSYNC or O_SYNC
- flag set in the writev() @flags param. For all other write types,
- mark a flag in the fdctx whenever an unstable write is witnessed.
+ if (afr_changelog_pre_op_uninherit(frame, this) &&
+ afr_txn_nothing_failed(frame, this)) {
+ /* just detected that this post-op is about to
+ be optimized away as a new write() has
+ already piggybacked on this frame's changelog.
*/
+ afr_changelog_post_op_now(frame, this);
+ return 0;
+ }
+
+ /* Calling afr_changelog_post_op_now() now will result in
+ issuing ->[f]xattrop().
+
+ Performing a hard POST-OP (->[f]xattrop() FOP) is a more
+ responsible operation than what it might appear on the surface.
+
+ The changelog of a file (in the xattr of the file on the server)
+ stores information (pending count) about the state of the file
+ on the OTHER server. This changelog is blindly trusted, and must
+ therefore be updated in such a way it remains trustworthy. This
+ implies that decrementing the pending count (essentially "clearing
+ the dirty flag") must be done STRICTLY after we are sure that the
+ operation on the other server has reached stable storage.
+
+ While the backend filesystem on that server will eventually flush
+ it to stable storage, we (being in userspace) have no mechanism
+ to get notified when the write became "stable".
+
+ This means we need to take matters into our own hands and issue an
+ fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
+ and get an acknowledgement for it. And we need to wait for the
+ fsync() acknowledgement before initiating the hard POST-OP.
+
+ However if the FD itself was opened in O_SYNC or O_DSYNC then
+ we are already guaranteed that the writes were made stable as
+ part of the FOP itself. The same holds true for NFS stable
+ writes which happen on an anonymous FD with O_DSYNC or O_SYNC
+ flag set in the writev() @flags param. For all other write types,
+ mark a flag in the fdctx whenever an unstable write is witnessed.
+ */
+
+ if (!afr_fd_has_witnessed_unstable_write(this, local->inode)) {
+ afr_changelog_post_op_now(frame, this);
+ return 0;
+ }
- if (!afr_fd_has_witnessed_unstable_write (this, local->inode)) {
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
-
- /* Check whether users want durability and perform fsync/post-op
- * accordingly.
- */
- if (priv->ensure_durability) {
- /* Time to fsync() */
- afr_changelog_fsync (frame, this);
- } else {
- afr_changelog_post_op_now (frame, this);
- }
+ /* Check whether users want durability and perform fsync/post-op
+ * accordingly.
+ */
+ if (priv->ensure_durability) {
+ /* Time to fsync() */
+ afr_changelog_fsync(frame, this);
+ } else {
+ afr_changelog_post_op_now(frame, this);
+ }
- return 0;
+ return 0;
}
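Editorial aside, not part of the patch: stripped of the piggyback (uninherit) shortcut, afr_changelog_post_op_safe() boils down to the decision sketched below with invented names. The pending counters may only be cleared once the write is known to be on stable storage, so an fsync() is interposed when an unstable write was seen on this fd and durability is requested.

    #include <stdbool.h>

    enum post_op_path { POST_OP_NOW, FSYNC_THEN_POST_OP };

    static enum post_op_path
    choose_post_op_path(bool data_txn_on_fd, bool witnessed_unstable_write,
                        bool ensure_durability)
    {
        if (!data_txn_on_fd)
            return POST_OP_NOW; /* nothing that needs to be made durable   */
        if (witnessed_unstable_write && ensure_durability)
            return FSYNC_THEN_POST_OP; /* clear pending only after fsync() */
        return POST_OP_NOW;
    }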
void
-afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
-{
- struct timespec delta = {0, };
- afr_private_t *priv = NULL;
- afr_local_t *local = frame->local;
- afr_lock_t *lock = NULL;
- gf_boolean_t post_op = _gf_true;
- struct list_head shared;
-
- priv = this->private;
- delta.tv_sec = priv->post_op_delay_secs;
- delta.tv_nsec = 0;
-
- INIT_LIST_HEAD (&shared);
- if (!local->transaction.eager_lock_on)
- goto out;
-
- lock = &local->inode_ctx->lock[local->transaction.type];
- LOCK (&local->inode->lock);
- {
- list_del_init (&local->transaction.owner_list);
- list_add (&local->transaction.owner_list, &lock->post_op);
- __afr_transaction_wake_shared (local, &shared);
-
- if (!afr_is_delayed_changelog_post_op_needed (frame, this,
- delta.tv_sec)) {
- if (list_empty (&lock->owners))
- lock->release = _gf_true;
- goto unlock;
- }
-
- GF_ASSERT (lock->delay_timer == NULL);
- lock->delay_timer = gf_timer_call_after (this->ctx, delta,
- afr_delayed_changelog_wake_up_cbk,
- local);
- if (!lock->delay_timer) {
- lock->release = _gf_true;
- } else {
- post_op = _gf_false;
- }
+afr_changelog_post_op(call_frame_t *frame, xlator_t *this)
+{
+ struct timespec delta = {
+ 0,
+ };
+ afr_private_t *priv = NULL;
+ afr_local_t *local = frame->local;
+ afr_lock_t *lock = NULL;
+ gf_boolean_t post_op = _gf_true;
+ struct list_head shared;
+
+ priv = this->private;
+ delta.tv_sec = priv->post_op_delay_secs;
+ delta.tv_nsec = 0;
+
+ INIT_LIST_HEAD(&shared);
+ if (!local->transaction.eager_lock_on)
+ goto out;
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ list_del_init(&local->transaction.owner_list);
+ list_add(&local->transaction.owner_list, &lock->post_op);
+ __afr_transaction_wake_shared(local, &shared);
+
+ if (!afr_is_delayed_changelog_post_op_needed(frame, this,
+ delta.tv_sec)) {
+ if (list_empty(&lock->owners))
+ lock->release = _gf_true;
+ goto unlock;
+ }
- }
+ GF_ASSERT(lock->delay_timer == NULL);
+ lock->delay_timer = gf_timer_call_after(
+ this->ctx, delta, afr_delayed_changelog_wake_up_cbk, local);
+ if (!lock->delay_timer) {
+ lock->release = _gf_true;
+ } else {
+ post_op = _gf_false;
+ }
+ }
unlock:
- UNLOCK (&local->inode->lock);
+ UNLOCK(&local->inode->lock);
- if (!list_empty (&shared)) {
- afr_lock_resume_shared (&shared);
- }
+ if (!list_empty(&shared)) {
+ afr_lock_resume_shared(&shared);
+ }
out:
- if (post_op) {
- if (!local->transaction.eager_lock_on || lock->release) {
- afr_changelog_post_op_safe (frame, this);
- } else {
- afr_changelog_post_op_now (frame, this);
- }
+ if (post_op) {
+ if (!local->transaction.eager_lock_on || lock->release) {
+ afr_changelog_post_op_safe(frame, this);
+ } else {
+ afr_changelog_post_op_now(frame, this);
}
+ }
}
int
-afr_transaction_resume (call_frame_t *frame, xlator_t *this)
+afr_transaction_resume(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- afr_restore_lk_owner (frame);
+ afr_restore_lk_owner(frame);
- afr_handle_symmetric_errors (frame, this);
+ afr_handle_symmetric_errors(frame, this);
- if (!local->pre_op_compat)
- /* new mode, pre-op was done along
- with OP */
- afr_changelog_pre_op_update (frame, this);
+ if (!local->pre_op_compat)
+ /* new mode, pre-op was done along
+ with OP */
+ afr_changelog_pre_op_update(frame, this);
- afr_changelog_post_op (frame, this);
+ afr_changelog_post_op(frame, this);
- return 0;
+ return 0;
}
-
/**
* afr_transaction_fop_failed - inform that an fop failed
*/
void
-afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
- int child_index)
+afr_transaction_fop_failed(call_frame_t *frame, xlator_t *this, int child_index)
{
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- local->transaction.failed_subvols[child_index] = 1;
+ local->transaction.failed_subvols[child_index] = 1;
}
static gf_boolean_t
-__need_previous_lock_unlocked (afr_local_t *local)
+__need_previous_lock_unlocked(afr_local_t *local)
{
- afr_lock_t *lock = NULL;
+ afr_lock_t *lock = NULL;
- if (!local->transaction.eager_lock_on)
- return _gf_true;
+ if (!local->transaction.eager_lock_on)
+ return _gf_true;
- lock = &local->inode_ctx->lock[local->transaction.type];
- if (!lock->acquired)
- return _gf_false;
- if (lock->acquired && lock->event_generation != local->event_generation)
- return _gf_true;
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (!lock->acquired)
return _gf_false;
+ if (lock->acquired && lock->event_generation != local->event_generation)
+ return _gf_true;
+ return _gf_false;
}
void
-__afr_eager_lock_handle (afr_local_t *local, gf_boolean_t *take_lock,
- gf_boolean_t *do_pre_op, afr_local_t **timer_local)
-{
- afr_lock_t *lock = NULL;
- afr_local_t *owner_local = NULL;
- xlator_t *this = local->transaction.frame->this;
-
- if (local->fd && !afr_are_multiple_fds_opened (local, this)) {
- local->transaction.eager_lock_on = _gf_true;
- afr_set_lk_owner (local->transaction.frame, this, local->inode);
- }
-
- lock = &local->inode_ctx->lock[local->transaction.type];
- if (__need_previous_lock_unlocked (local)) {
- if (!list_empty (&lock->owners)) {
- lock->release = _gf_true;
- } else if (lock->delay_timer) {
- lock->release = _gf_true;
- if (gf_timer_call_cancel (this->ctx,
- lock->delay_timer)) {
- /* It will be put in frozen list
- * in the code flow below*/
- } else {
- *timer_local = list_entry(lock->post_op.next,
- afr_local_t,
- transaction.owner_list);
- lock->delay_timer = NULL;
- }
- }
- if (!local->transaction.eager_lock_on)
- goto out;
- }
-
- if (lock->release) {
- list_add_tail (&local->transaction.wait_list,
- &lock->frozen);
- *take_lock = _gf_false;
- goto out;
- }
-
- if (lock->delay_timer) {
- *take_lock = _gf_false;
- if (gf_timer_call_cancel (this->ctx,
- lock->delay_timer)) {
- list_add_tail (&local->transaction.wait_list,
- &lock->frozen);
- } else {
- *timer_local = list_entry(lock->post_op.next,
- afr_local_t,
- transaction.owner_list);
- afr_copy_inodelk_vars (&local->internal_lock,
- &(*timer_local)->internal_lock,
- this);
- lock->delay_timer = NULL;
- *do_pre_op = _gf_true;
- list_add_tail (&local->transaction.owner_list,
- &lock->owners);
- }
- goto out;
- }
-
- if (!list_empty (&lock->owners)) {
- if (!lock->acquired ||
- afr_has_lock_conflict (local, _gf_true)) {
- list_add_tail (&local->transaction.wait_list,
- &lock->waiting);
- *take_lock = _gf_false;
- goto out;
- }
- owner_local = list_entry (lock->owners.next,
- afr_local_t,
+__afr_eager_lock_handle(afr_local_t *local, gf_boolean_t *take_lock,
+ gf_boolean_t *do_pre_op, afr_local_t **timer_local)
+{
+ afr_lock_t *lock = NULL;
+ afr_local_t *owner_local = NULL;
+ xlator_t *this = local->transaction.frame->this;
+
+ if (local->fd && !afr_are_multiple_fds_opened(local, this)) {
+ local->transaction.eager_lock_on = _gf_true;
+ afr_set_lk_owner(local->transaction.frame, this, local->inode);
+ }
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (__need_previous_lock_unlocked(local)) {
+ if (!list_empty(&lock->owners)) {
+ lock->release = _gf_true;
+ } else if (lock->delay_timer) {
+ lock->release = _gf_true;
+ if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) {
+ /* It will be put on the frozen list
+  * in the code flow below. */
+ } else {
+ *timer_local = list_entry(lock->post_op.next, afr_local_t,
transaction.owner_list);
- afr_copy_inodelk_vars (&local->internal_lock,
- &owner_local->internal_lock,
- this);
- *take_lock = _gf_false;
- *do_pre_op = _gf_true;
+ lock->delay_timer = NULL;
+ }
}
-
- if (lock->acquired)
- GF_ASSERT (!(*take_lock));
- list_add_tail (&local->transaction.owner_list, &lock->owners);
+ if (!local->transaction.eager_lock_on)
+ goto out;
+ }
+
+ if (lock->release) {
+ list_add_tail(&local->transaction.wait_list, &lock->frozen);
+ *take_lock = _gf_false;
+ goto out;
+ }
+
+ if (lock->delay_timer) {
+ *take_lock = _gf_false;
+ if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) {
+ list_add_tail(&local->transaction.wait_list, &lock->frozen);
+ } else {
+ *timer_local = list_entry(lock->post_op.next, afr_local_t,
+ transaction.owner_list);
+ afr_copy_inodelk_vars(&local->internal_lock,
+ &(*timer_local)->internal_lock, this);
+ lock->delay_timer = NULL;
+ *do_pre_op = _gf_true;
+ list_add_tail(&local->transaction.owner_list, &lock->owners);
+ }
+ goto out;
+ }
+
+ if (!list_empty(&lock->owners)) {
+ if (!lock->acquired || afr_has_lock_conflict(local, _gf_true)) {
+ list_add_tail(&local->transaction.wait_list, &lock->waiting);
+ *take_lock = _gf_false;
+ goto out;
+ }
+ owner_local = list_entry(lock->owners.next, afr_local_t,
+ transaction.owner_list);
+ afr_copy_inodelk_vars(&local->internal_lock,
+ &owner_local->internal_lock, this);
+ *take_lock = _gf_false;
+ *do_pre_op = _gf_true;
+ }
+
+ if (lock->acquired)
+ GF_ASSERT(!(*take_lock));
+ list_add_tail(&local->transaction.owner_list, &lock->owners);
out:
- return;
+ return;
}
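
__afr_eager_lock_handle() above resolves each incoming transaction to one of three outcomes: take the eager lock (first owner), piggy-back on an already held lock and run the pre-op directly, or queue on the waiting/frozen lists. A simplified model of that decision, assuming hypothetical names (txn_action_t, struct eager_lock_state, eager_lock_decision) and deliberately omitting the delay-timer cancellation race, in which a successfully cancelled timer lets the new transaction inherit the pre-op instead of waiting:

/* sketch only -- condensed model, not the real AFR structures */
#include <stdbool.h>
#include <stdio.h>

typedef enum {
    TXN_TAKE_LOCK, /* first owner: acquire the eager lock */
    TXN_DO_PRE_OP, /* share the held lock and run the pre-op directly */
    TXN_WAIT,      /* queue on the waiting or frozen list */
} txn_action_t;

struct eager_lock_state {
    bool release;     /* models lock->release */
    bool delay_timer; /* models lock->delay_timer != NULL */
    bool has_owners;  /* models !list_empty(&lock->owners) */
    bool acquired;    /* models lock->acquired */
    bool conflict;    /* models afr_has_lock_conflict(local, _gf_true) */
};

static txn_action_t
eager_lock_decision(const struct eager_lock_state *s)
{
    if (s->release)
        return TXN_WAIT;     /* goes to the frozen list */
    if (s->delay_timer)
        return TXN_WAIT;     /* frozen list (timer-cancel race omitted) */
    if (s->has_owners) {
        if (!s->acquired || s->conflict)
            return TXN_WAIT; /* waiting list */
        return TXN_DO_PRE_OP; /* inherit the held lock */
    }
    return TXN_TAKE_LOCK;
}

int
main(void)
{
    struct eager_lock_state s = { .has_owners = true, .acquired = true };
    printf("%d\n", eager_lock_decision(&s)); /* TXN_DO_PRE_OP */
    return 0;
}
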
void
-afr_transaction_start (afr_local_t *local, xlator_t *this)
+afr_transaction_start(afr_local_t *local, xlator_t *this)
{
- afr_private_t *priv = NULL;
- gf_boolean_t take_lock = _gf_true;
- gf_boolean_t do_pre_op = _gf_false;
- afr_local_t *timer_local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t take_lock = _gf_true;
+ gf_boolean_t do_pre_op = _gf_false;
+ afr_local_t *timer_local = NULL;
- priv = this->private;
+ priv = this->private;
- if (local->transaction.type != AFR_DATA_TRANSACTION &&
- local->transaction.type != AFR_METADATA_TRANSACTION)
- goto lock_phase;
+ if (local->transaction.type != AFR_DATA_TRANSACTION &&
+ local->transaction.type != AFR_METADATA_TRANSACTION)
+ goto lock_phase;
- if (!priv->eager_lock)
- goto lock_phase;
+ if (!priv->eager_lock)
+ goto lock_phase;
- LOCK (&local->inode->lock);
- {
- __afr_eager_lock_handle (local, &take_lock, &do_pre_op,
- &timer_local);
- }
- UNLOCK (&local->inode->lock);
+ LOCK(&local->inode->lock);
+ {
+ __afr_eager_lock_handle(local, &take_lock, &do_pre_op, &timer_local);
+ }
+ UNLOCK(&local->inode->lock);
lock_phase:
- if (!local->transaction.eager_lock_on) {
- afr_set_lk_owner (local->transaction.frame, this,
- local->transaction.frame->root);
- }
-
-
- if (take_lock) {
- afr_lock (local->transaction.frame, this);
- } else if (do_pre_op) {
- afr_changelog_pre_op (local->transaction.frame, this);
- }
- /*Always call delayed_changelog_wake_up_cbk after calling pre-op above
- * so that any inheriting can happen*/
- if (timer_local)
- afr_delayed_changelog_wake_up_cbk (timer_local);
+ if (!local->transaction.eager_lock_on) {
+ afr_set_lk_owner(local->transaction.frame, this,
+ local->transaction.frame->root);
+ }
+
+ if (take_lock) {
+ afr_lock(local->transaction.frame, this);
+ } else if (do_pre_op) {
+ afr_changelog_pre_op(local->transaction.frame, this);
+ }
+ /* Always call delayed_changelog_wake_up_cbk after calling pre-op above
+  * so that any inheritance of the pre-op can happen. */
+ if (timer_local)
+ afr_delayed_changelog_wake_up_cbk(timer_local);
}
int
-afr_write_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
+afr_write_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
{
- afr_local_t *local = frame->local;
+ afr_local_t *local = frame->local;
- if (err) {
- AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err);
- goto fail;
- }
+ if (err) {
+ AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err);
+ goto fail;
+ }
- afr_transaction_start (local, this);
- return 0;
+ afr_transaction_start(local, this);
+ return 0;
fail:
- local->transaction.unwind (frame, this);
- AFR_STACK_DESTROY (frame);
- return 0;
+ local->transaction.unwind(frame, this);
+ AFR_STACK_DESTROY(frame);
+ return 0;
}
int
-afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int ret = -1;
- int event_generation = 0;
-
- local = frame->local;
- priv = this->private;
- local->transaction.frame = frame;
-
- local->transaction.type = type;
-
- if (priv->quorum_count && !afr_has_quorum (local->child_up, this)) {
- ret = -afr_quorum_errno(priv);
- goto out;
- }
-
- if (!afr_is_consistent_io_possible (local, priv, &ret)) {
- ret = -ret; /*op_errno to ret conversion*/
- goto out;
- }
-
- ret = afr_transaction_local_init (local, this);
- if (ret < 0)
- goto out;
-
-
- if (type != AFR_METADATA_TRANSACTION) {
- goto txn_start;
- }
-
- ret = afr_inode_get_readable (frame, local->inode, this,
- local->readable, &event_generation, type);
- if (ret < 0 || afr_is_inode_refresh_reqd (local->inode, this,
- priv->event_generation,
- event_generation)) {
- afr_inode_refresh (frame, this, local->inode, local->loc.gfid,
- afr_write_txn_refresh_done);
- ret = 0;
- goto out;
- }
+afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ int event_generation = 0;
+
+ local = frame->local;
+ priv = this->private;
+ local->transaction.frame = frame;
+
+ local->transaction.type = type;
+
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this)) {
+ ret = -afr_quorum_errno(priv);
+ goto out;
+ }
+
+ if (!afr_is_consistent_io_possible(local, priv, &ret)) {
+ ret = -ret; /*op_errno to ret conversion*/
+ goto out;
+ }
+
+ ret = afr_transaction_local_init(local, this);
+ if (ret < 0)
+ goto out;
+
+ if (type != AFR_METADATA_TRANSACTION) {
+ goto txn_start;
+ }
+
+ ret = afr_inode_get_readable(frame, local->inode, this, local->readable,
+ &event_generation, type);
+ if (ret < 0 ||
+ afr_is_inode_refresh_reqd(local->inode, this, priv->event_generation,
+ event_generation)) {
+ afr_inode_refresh(frame, this, local->inode, local->loc.gfid,
+ afr_write_txn_refresh_done);
+ ret = 0;
+ goto out;
+ }
txn_start:
- ret = 0;
- afr_transaction_start (local, this);
+ ret = 0;
+ afr_transaction_start(local, this);
out:
- return ret;
+ return ret;
}
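
afr_transaction() above gates every write transaction behind three checks before afr_transaction_start() runs: client-side quorum, consistent-I/O availability, and, for metadata transactions only, a possible inode refresh that defers the start to afr_write_txn_refresh_done(). A condensed sketch of that ordering, assuming hypothetical names (struct txn_checks, gate_transaction) and stand-in errno values rather than the real -afr_quorum_errno(priv) and op_errno-to-ret conversion:

/* sketch only -- hypothetical condensation of the checks above */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum txn_type { TXN_DATA, TXN_METADATA, TXN_ENTRY };

struct txn_checks {
    bool quorum_ok;        /* !priv->quorum_count || afr_has_quorum(...) */
    bool consistent_io_ok; /* afr_is_consistent_io_possible(...) */
    bool refresh_needed;   /* afr_is_inode_refresh_reqd(...), metadata only */
};

/* 0 -> the transaction starts now or after an inode refresh;
 * negative errno -> the fop is failed up front, mirroring 'ret' above */
static int
gate_transaction(enum txn_type type, const struct txn_checks *c)
{
    if (!c->quorum_ok)
        return -ENOTCONN; /* stand-in for -afr_quorum_errno(priv) */
    if (!c->consistent_io_ok)
        return -EIO;      /* stand-in for the op_errno conversion */
    if (type == TXN_METADATA && c->refresh_needed)
        return 0;         /* refresh first; txn resumes from the callback */
    return 0;             /* start the transaction immediately */
}

int
main(void)
{
    struct txn_checks c = { .quorum_ok = true, .consistent_io_ok = true };
    printf("%d\n", gate_transaction(TXN_DATA, &c)); /* 0 */
    return 0;
}
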