From 25e8e74eb7b81ccd114a9833371a3f72d284c48d Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Wed, 15 Apr 2015 22:22:08 +0530 Subject: afr: add arbitration support Add logic in afr to work in conjunction with the arbiter xlator when a replica 3 arbiter volume is created. More specifically, this patch: * Enables full locks for afr data transactions for such volumes. * Removes the upfront marking of pending xattrs at the time of pre-op and defers it to post-op. (This is an arbiter-independent change and is made for all afr transactions.) * After the pre-op stage, checks whether we can proceed with the fop stage without ending up in split-brain by examining the changelog xattrs. * Unwinds the fop with failure if only one source was available at the time of pre-op and the fop happened to fail on that particular source brick. * Skips data self-heal if arbiter brick is the only source available. * Adds the arbiter-count option to the shd graph. This patch is a part of the arbiter logic implementation for 3-way AFR, details of which can be found at http://review.gluster.org/#/c/9656/ Change-Id: I9603db9d04de5626eb2f4d8d959ef5b46113561d BUG: 1199985 Signed-off-by: Ravishankar N Reviewed-on: http://review.gluster.org/10258 Tested-by: Gluster Build System Reviewed-by: Pranith Kumar Karampuri --- xlators/cluster/afr/src/afr-common.c | 25 ++++ xlators/cluster/afr/src/afr-dir-write.c | 2 + xlators/cluster/afr/src/afr-inode-write.c | 12 +- xlators/cluster/afr/src/afr-self-heal-data.c | 6 + xlators/cluster/afr/src/afr-self-heal.h | 3 + xlators/cluster/afr/src/afr-transaction.c | 183 ++++++++++++++++++++++----- xlators/cluster/afr/src/afr-transaction.h | 2 + xlators/cluster/afr/src/afr.h | 6 + xlators/mgmt/glusterd/src/glusterd-volgen.c | 16 +-- 9 files changed, 208 insertions(+), 47 deletions(-) (limited to 'xlators') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 8993b164b91..8fbca0b6f42 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ 
b/xlators/cluster/afr/src/afr-common.c @@ -993,6 +993,17 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) afr_entry_lockee_cleanup (&local->internal_lock); GF_FREE (local->transaction.pre_op); + + GF_FREE (local->transaction.pre_op_sources); + if (local->transaction.pre_op_xdata) { + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op_xdata[i]) + continue; + dict_unref (local->transaction.pre_op_xdata[i]); + } + GF_FREE (local->transaction.pre_op_xdata); + } + GF_FREE (local->transaction.eager_lock); GF_FREE (local->transaction.fop_subvols); GF_FREE (local->transaction.failed_subvols); @@ -4055,6 +4066,20 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (!local->transaction.pre_op) goto out; + if (priv->arbiter_count == 1) { + local->transaction.pre_op_xdata = + GF_CALLOC (sizeof (*local->transaction.pre_op_xdata), + priv->child_count, gf_afr_mt_dict_t); + if (!local->transaction.pre_op_xdata) + goto out; + + local->transaction.pre_op_sources = + GF_CALLOC (sizeof (*local->transaction.pre_op_sources), + priv->child_count, gf_afr_mt_char); + if (!local->transaction.pre_op_sources) + goto out; + } + local->transaction.fop_subvols = GF_CALLOC (sizeof (*local->transaction.fop_subvols), priv->child_count, gf_afr_mt_char); diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index f7ca9108092..8a2c0e46e40 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -168,6 +168,8 @@ __afr_dir_write_finalize (call_frame_t *frame, xlator_t *this) local->replies[i].postparent2; } } + + afr_txn_arbitrate_fop_cbk (frame, this); } diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index f712166e44d..f9fde44e9e4 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -37,7 +37,7 @@ #include "protocol-common.h" #include 
"afr-transaction.h" - +#include "afr-self-heal.h" static void __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this) @@ -97,6 +97,8 @@ __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this) } } } + + afr_txn_arbitrate_fop_cbk (frame, this); } @@ -342,6 +344,7 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) { call_frame_t *transaction_frame = NULL; afr_local_t *local = NULL; + afr_private_t *priv = NULL; int ret = -1; int op_errno = ENOMEM; @@ -350,6 +353,7 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) goto out; local = frame->local; + priv = this->private; transaction_frame->local = local; frame->local = NULL; @@ -379,6 +383,12 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) local->transaction.start = local->cont.writev.offset; local->transaction.len = iov_length (local->cont.writev.vector, local->cont.writev.count); + + /*Lock entire file to avoid network split brains.*/ + if (priv->arbiter_count == 1) { + local->transaction.start = 0; + local->transaction.len = 0; + } } ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index a8a7326e4ec..7567fe9f851 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -684,6 +684,12 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd, source = ret; + if (priv->arbiter_count == 1 && source == ARBITER_BRICK_INDEX && + AFR_COUNT (sources, priv->child_count) == 1) { + did_sh = _gf_false; + goto unlock; + } + ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks, locked_replies, locked_replies[source].poststat.ia_size); diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 32be2480234..956f075e25b 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -141,6 +141,9 @@ 
afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, afr_transaction_type type, unsigned char *locked_on, unsigned char *sources, unsigned char *sinks, uint64_t *witness); +int +afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol, int idx, + dict_t *xdata); int afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index d2111060035..a2023884465 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -15,7 +15,7 @@ #include "afr.h" #include "afr-transaction.h" - +#include "afr-self-heal.h" #include gf_boolean_t @@ -139,14 +139,130 @@ __mark_all_success (call_frame_t *frame, xlator_t *this) } } +void +afr_compute_pre_op_sources (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_transaction_type type = -1; + dict_t *xdata = NULL; + int **matrix = NULL; + int idx = -1; + int i = 0; + int j = 0; + + priv = this->private; + local = frame->local; + type = local->transaction.type; + idx = afr_index_for_transaction_type (type); + matrix = ALLOC_MATRIX (priv->child_count, int); + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op_xdata[i]) + continue; + xdata = local->transaction.pre_op_xdata[i]; + afr_selfheal_fill_matrix (this, matrix, i, idx, xdata); + } + + memset (local->transaction.pre_op_sources, 1, priv->child_count); + + /*If lock or pre-op failed on a brick, it is not a source. */ + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + local->transaction.pre_op_sources[i] = 0; + } + + /* If brick is blamed by others, it is not a source. */ + for (i = 0; i < priv->child_count; i++) + for (j = 0; j < priv->child_count; j++) + if (matrix[i][j] != 0) + local->transaction.pre_op_sources[j] = 0; + + /*We don't need the xattrs any more. 
*/ + for (i = 0; i < priv->child_count; i++) + if (local->transaction.pre_op_xdata[i]) { + dict_unref (local->transaction.pre_op_xdata[i]); + local->transaction.pre_op_xdata[i] = NULL; + } +} + +void +afr_txn_arbitrate_fop_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t fop_failed = _gf_false; + unsigned char *pre_op_sources = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + pre_op_sources = local->transaction.pre_op_sources; + + if (priv->arbiter_count != 1 || local->op_ret < 0) + return; + + /* If the fop failed on the brick, it is not a source. */ + for (i = 0; i < priv->child_count; i++) + if (local->transaction.failed_subvols[i]) + pre_op_sources[i] = 0; + + switch (AFR_COUNT (pre_op_sources, priv->child_count)) { + case 1: + if (pre_op_sources[ARBITER_BRICK_INDEX]) + fop_failed = _gf_true; + break; + case 0: + fop_failed = _gf_true; + break; + } + + if (fop_failed) { + local->op_ret = -1; + local->op_errno = ENOTCONN; + } + + return; +} + +void +afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int pre_op_sources_count = 0; + + priv = this->private; + local = frame->local; + + afr_compute_pre_op_sources (frame, this); + pre_op_sources_count = AFR_COUNT (local->transaction.pre_op_sources, + priv->child_count); + + /* If arbiter is the only source, do not proceed. 
*/ + if (pre_op_sources_count < 2 && + local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) { + local->internal_lock.lock_cbk = local->transaction.done; + local->op_ret = -1; + local->op_errno = ENOTCONN; + afr_restore_lk_owner (frame); + afr_unlock (frame, this); + } else { + local->transaction.fop (frame, this); + } + + return; +} int afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; + afr_private_t *priv = NULL; fd_t *fd = NULL; local = frame->local; + priv = this->private; fd = local->fd; /* Perform fops with the lk-owner from top xlator. @@ -172,12 +288,15 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) */ if (fd) afr_delayed_changelog_wake_up (this, fd); - local->transaction.fop (frame, this); + if (priv->arbiter_count == 1) { + afr_txn_arbitrate_fop (frame, this); + } else { + local->transaction.fop (frame, this); + } return 0; } - static int __changelog_enabled (afr_private_t *priv, afr_transaction_type type) { @@ -372,11 +491,16 @@ afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) { afr_private_t *priv = NULL; afr_local_t *local = NULL; + int pre_op_count = 0; int i = 0; local = frame->local; priv = this->private; + pre_op_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); + if (pre_op_count < priv->child_count) + return _gf_false; + for (i = 0; i < priv->child_count; i++) { if (local->transaction.failed_subvols[i]) return _gf_false; @@ -591,9 +715,6 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) else need_undirty = _gf_true; - //If the fop fails on all the subvols then pending markers are placed - //for every subvol on all subvolumes. Which is nothing but split-brain. - //Avoid this by not doing post-op in case of failures. 
if (local->op_ret < 0) { afr_changelog_post_op_done (frame, this); goto out; @@ -846,12 +967,22 @@ afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { afr_local_t *local = NULL; + afr_private_t *priv = NULL; int call_count = -1; + int child_index = -1; local = frame->local; + priv = this->private; + child_index = (long) cookie; if (op_ret == -1) - afr_transaction_fop_failed (frame, this, (long) cookie); + afr_transaction_fop_failed (frame, this, child_index); + + if (priv->arbiter_count == 1 && !op_ret) { + if (xattr) + local->transaction.pre_op_xdata[child_index] = + dict_ref (xattr); + } call_count = afr_frame_return (frame); @@ -964,7 +1095,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; unsigned char *locked_nodes = NULL; - unsigned char *pending_subvols = NULL; int idx = -1; gf_boolean_t pre_nop = _gf_true; dict_t *xdata_req = NULL; @@ -975,15 +1105,13 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock); - pending_subvols = alloca0 (priv->child_count); - for (i = 0; i < priv->child_count; i++) { if (locked_nodes[i]) { local->transaction.pre_op[i] = 1; call_count++; } else { - pending_subvols[i] = 1; - } + local->transaction.failed_subvols[i] = 1; + } } /* This condition should not be met with present code, as @@ -1009,28 +1137,21 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) goto err; } - pre_nop = _gf_true; - if (afr_changelog_pre_op_inherit (frame, this)) goto next; - if (call_count < priv->child_count) { - /* For subvols we are not performing operation on, - mark them as pending up-front along with the FOP - so that we can safely defer unmarking dirty until - later. 
- */ - for (i = 0; i < priv->child_count; i++) { - if (pending_subvols[i]) - local->pending[i][idx] = hton32(1); - } - ret = afr_set_pending_dict (priv, xdata_req, - local->pending); - if (ret < 0) { - op_errno = ENOMEM; - goto err; - } - pre_nop = _gf_false; + if (call_count < priv->child_count) + pre_nop = _gf_false; + + /* Set an all-zero pending changelog so that in the cbk, we can get the + * current on-disk values. In a replica 3 volume with arbiter enabled, + * these values are needed to arrive at a go/ no-go of the fop phase to + * avoid ending up in split-brain.*/ + + ret = afr_set_pending_dict (priv, xdata_req, local->pending); + if (ret < 0) { + op_errno = ENOMEM; + goto err; } if (call_count > 1 && diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index c3ce333b771..47d43d88991 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -16,6 +16,8 @@ void afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index); +void +afr_txn_arbitrate_fop_cbk (call_frame_t *frame, xlator_t *this); int afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index f7bc6ea0f94..6cb708ffbd7 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -39,6 +39,8 @@ #define AFR_DOM_COUNT_MAX 3 #define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ +#define ARBITER_BRICK_INDEX 2 + typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this); typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol); @@ -656,6 +658,10 @@ typedef struct _afr_local { unsigned char *pre_op; + /* For arbiter configuration only. 
*/ + dict_t **pre_op_xdata; + unsigned char *pre_op_sources; + /* @fop_subvols: subvolumes on which FOP will be attempted */ unsigned char *fop_subvols; diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index fe281dd33db..a56d6d5ccca 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -4059,19 +4059,6 @@ out: return ret; } -static int -volgen_graph_build_replicate_clusters (volgen_graph_t *graph, - glusterd_volinfo_t *volinfo) -{ - char *replicate_args[] = {"cluster/replicate", - "%s-replicate-%d"}; - - return volgen_link_bricks_from_list_tail (graph, volinfo, "cluster/replicate", - "%s-replicate-%d", - volinfo->brick_count, - volinfo->replica_count); -} - static int build_shd_clusters (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, dict_t *set_dict) @@ -4086,8 +4073,7 @@ build_shd_clusters (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, switch (volinfo->type) { case GF_CLUSTER_TYPE_REPLICATE: case GF_CLUSTER_TYPE_STRIPE_REPLICATE: - clusters = volgen_graph_build_replicate_clusters (graph, - volinfo); + clusters = volgen_graph_build_afr_clusters (graph, volinfo); break; case GF_CLUSTER_TYPE_DISPERSE: -- cgit