summaryrefslogtreecommitdiffstats
path: root/xlators/cluster
diff options
context:
space:
mode:
authorPranith Kumar K <pkarampu@redhat.com>2014-04-21 03:13:58 +0530
committerNiels de Vos <ndevos@redhat.com>2014-05-10 13:30:48 -0700
commit46bf591eb87d743baacd30a85b99fd47db4fb055 (patch)
tree20ba994c07283e4d91d45f68ef844614da3045e3 /xlators/cluster
parentba328340e878c3156418bb3443c35a7db02a0f4b (diff)
cluster/afr: Fix bugs in quorum implementation
- Have common place to perform quorum fop wind check - Check if fop succeeded in a way that matches quorum to avoid marking changelog in split-brain. Change-Id: I663072ece0e1de6e1ee9fccb03e1b6c968793bc5 BUG: 1066996 Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> Reviewed-on: http://review.gluster.org/7513 Reviewed-by: Ravishankar N <ravishankar@redhat.com> Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Niels de Vos <ndevos@redhat.com>
Diffstat (limited to 'xlators/cluster')
-rw-r--r--xlators/cluster/afr/src/afr-common.c51
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.c16
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c30
-rw-r--r--xlators/cluster/afr/src/afr-open.c4
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c142
-rw-r--r--xlators/cluster/afr/src/afr-transaction.h3
-rw-r--r--xlators/cluster/afr/src/afr.c6
-rw-r--r--xlators/cluster/afr/src/afr.h17
8 files changed, 166 insertions, 103 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 2a54a79ed4a..a31cc2cb015 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -4034,6 +4034,8 @@ afr_notify (xlator_t *this, int32_t event,
int up_child = AFR_ALL_CHILDREN;
dict_t *input = NULL;
dict_t *output = NULL;
+ gf_boolean_t had_quorum = _gf_false;
+ gf_boolean_t has_quorum = _gf_false;
priv = this->private;
@@ -4071,6 +4073,8 @@ afr_notify (xlator_t *this, int32_t event,
goto out;
}
+ had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up,
+ this);
switch (event) {
case GF_EVENT_CHILD_UP:
LOCK (&priv->lock);
@@ -4163,6 +4167,16 @@ afr_notify (xlator_t *this, int32_t event,
break;
}
+ if (priv->quorum_count) {
+ has_quorum = afr_has_quorum (priv->child_up, this);
+ if (!had_quorum && has_quorum)
+ gf_log (this->name, GF_LOG_INFO, "Client-quorum"
+ " is met");
+ if (had_quorum && !has_quorum)
+ gf_log (this->name, GF_LOG_WARNING,
+ "Client-quorum is not met");
+ }
+
/* have all subvolumes reported status once by now? */
have_heard_from_all = 1;
for (i = 0; i < priv->child_count; i++) {
@@ -4546,43 +4560,6 @@ out:
return ret;
}
-gf_boolean_t
-afr_have_quorum (char *logname, afr_private_t *priv)
-{
- unsigned int quorum = 0;
-
- GF_VALIDATE_OR_GOTO(logname,priv,out);
-
- quorum = priv->quorum_count;
- if (quorum != AFR_QUORUM_AUTO) {
- return (priv->up_count >= (priv->down_count + quorum));
- }
-
- quorum = priv->child_count / 2 + 1;
- if (priv->up_count >= (priv->down_count + quorum)) {
- return _gf_true;
- }
-
- /*
- * Special case for even numbers of nodes: if we have exactly half
- * and that includes the first ("senior-most") node, then that counts
- * as quorum even if it wouldn't otherwise. This supports e.g. N=2
- * while preserving the critical property that there can only be one
- * such group.
- */
- if ((priv->child_count % 2) == 0) {
- quorum = priv->child_count / 2;
- if (priv->up_count >= (priv->down_count + quorum)) {
- if (priv->child_up[0]) {
- return _gf_true;
- }
- }
- }
-
-out:
- return _gf_false;
-}
-
void
afr_priv_destroy (afr_private_t *priv)
{
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index 1943b719bb5..3cdec64536d 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -432,8 +432,6 @@ afr_create (call_frame_t *frame, xlator_t *this,
priv = this->private;
- QUORUM_CHECK(create,out);
-
transaction_frame = copy_frame (frame);
if (!transaction_frame) {
op_errno = ENOMEM;
@@ -637,8 +635,6 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
priv = this->private;
- QUORUM_CHECK(mknod,out);
-
transaction_frame = copy_frame (frame);
if (!transaction_frame) {
op_errno = ENOMEM;
@@ -842,8 +838,6 @@ afr_mkdir (call_frame_t *frame, xlator_t *this,
priv = this->private;
- QUORUM_CHECK(mkdir,out);
-
transaction_frame = copy_frame (frame);
if (!transaction_frame) {
op_errno = ENOMEM;
@@ -1046,8 +1040,6 @@ afr_link (call_frame_t *frame, xlator_t *this,
priv = this->private;
- QUORUM_CHECK(link,out);
-
transaction_frame = copy_frame (frame);
if (!transaction_frame) {
op_errno = ENOMEM;
@@ -1249,8 +1241,6 @@ afr_symlink (call_frame_t *frame, xlator_t *this,
priv = this->private;
- QUORUM_CHECK(symlink,out);
-
transaction_frame = copy_frame (frame);
if (!transaction_frame) {
op_errno = ENOMEM;
@@ -1464,8 +1454,6 @@ afr_rename (call_frame_t *frame, xlator_t *this,
priv = this->private;
- QUORUM_CHECK(rename,out);
-
transaction_frame = copy_frame (frame);
if (!transaction_frame) {
op_errno = ENOMEM;
@@ -1685,8 +1673,6 @@ afr_unlink (call_frame_t *frame, xlator_t *this,
priv = this->private;
- QUORUM_CHECK(unlink,out);
-
transaction_frame = copy_frame (frame);
if (!transaction_frame) {
op_errno = ENOMEM;
@@ -1889,8 +1875,6 @@ afr_rmdir (call_frame_t *frame, xlator_t *this,
priv = this->private;
- QUORUM_CHECK(rmdir,out);
-
transaction_frame = copy_frame (frame);
if (!transaction_frame) {
op_errno = ENOMEM;
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index d62847defa3..5561e9c36d7 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -548,8 +548,6 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
goto out;
}
- QUORUM_CHECK(writev,out);
-
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -730,8 +728,6 @@ afr_truncate (call_frame_t *frame, xlator_t *this,
priv = this->private;
- QUORUM_CHECK(truncate,out);
-
transaction_frame = copy_frame (frame);
if (!transaction_frame) {
op_errno = ENOMEM;
@@ -745,6 +741,7 @@ afr_truncate (call_frame_t *frame, xlator_t *this,
if (ret < 0)
goto out;
+ local->op = GF_FOP_TRUNCATE;
local->cont.truncate.offset = offset;
local->transaction.fop = afr_truncate_wind;
@@ -974,8 +971,6 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this,
op_errno = EIO;
goto out;
}
- QUORUM_CHECK(ftruncate,out);
-
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -1153,8 +1148,6 @@ afr_setattr (call_frame_t *frame, xlator_t *this,
priv = this->private;
- QUORUM_CHECK(setattr,out);
-
transaction_frame = copy_frame (frame);
if (!transaction_frame) {
op_errno = ENOMEM;
@@ -1168,6 +1161,7 @@ afr_setattr (call_frame_t *frame, xlator_t *this,
if (ret < 0)
goto out;
+ local->op = GF_FOP_SETATTR;
local->cont.setattr.in_buf = *buf;
local->cont.setattr.valid = valid;
@@ -1351,8 +1345,6 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,
goto out;
}
- QUORUM_CHECK(fsetattr,out);
-
transaction_frame = copy_frame (frame);
if (!transaction_frame) {
op_errno = ENOMEM;
@@ -1366,6 +1358,7 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,
if (ret < 0)
goto out;
+ local->op = GF_FOP_FSETATTR;
local->cont.fsetattr.in_buf = *buf;
local->cont.fsetattr.valid = valid;
@@ -1540,7 +1533,6 @@ afr_setxattr (call_frame_t *frame, xlator_t *this,
priv = this->private;
- QUORUM_CHECK(setxattr,out);
transaction_frame = copy_frame (frame);
if (!transaction_frame) {
op_errno = ENOMEM;
@@ -1554,6 +1546,7 @@ afr_setxattr (call_frame_t *frame, xlator_t *this,
if (ret < 0)
goto out;
+ local->op = GF_FOP_SETXATTR;
local->cont.setxattr.dict = dict_ref (dict);
local->cont.setxattr.flags = flags;
@@ -1730,8 +1723,6 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,
goto out;
}
- QUORUM_CHECK(fsetxattr,out);
-
AFR_LOCAL_ALLOC_OR_GOTO (local, out);
ret = afr_local_init (local, priv, &op_errno);
@@ -1747,6 +1738,7 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,
local->op_ret = -1;
+ local->op = GF_FOP_FSETXATTR;
local->cont.fsetxattr.dict = dict_ref (dict);
local->cont.fsetxattr.flags = flags;
@@ -1922,8 +1914,6 @@ afr_removexattr (call_frame_t *frame, xlator_t *this,
priv = this->private;
- QUORUM_CHECK(removexattr,out);
-
transaction_frame = copy_frame (frame);
if (!transaction_frame) {
op_errno = ENOMEM;
@@ -1937,6 +1927,7 @@ afr_removexattr (call_frame_t *frame, xlator_t *this,
if (ret < 0)
goto out;
+ local->op = GF_FOP_REMOVEXATTR;
local->cont.removexattr.name = gf_strdup (name);
local->transaction.fop = afr_removexattr_wind;
@@ -2111,8 +2102,6 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this,
goto out;
}
- QUORUM_CHECK(fremovexattr, out);
-
transaction_frame = copy_frame (frame);
if (!transaction_frame) {
goto out;
@@ -2130,6 +2119,7 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this,
local->op_ret = -1;
+ local->op = GF_FOP_FREMOVEXATTR;
local->cont.removexattr.name = gf_strdup (name);
local->transaction.fop = afr_fremovexattr_wind;
@@ -2354,8 +2344,6 @@ afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
op_errno = EIO;
goto out;
}
- QUORUM_CHECK(fallocate,out);
-
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -2581,8 +2569,6 @@ afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
op_errno = EIO;
goto out;
}
- QUORUM_CHECK(discard, out);
-
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -2825,8 +2811,6 @@ afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
op_errno = EIO;
goto out;
}
- QUORUM_CHECK(zerofill, out);
-
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
index 643a5d692df..9404b6e847b 100644
--- a/xlators/cluster/afr/src/afr-open.c
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -203,10 +203,6 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
priv = this->private;
- if (flags & (O_CREAT|O_TRUNC)) {
- QUORUM_CHECK(open,out);
- }
-
if (afr_is_split_brain (this, loc->inode)) {
/* self-heal failed */
gf_log (this->name, GF_LOG_WARNING,
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 20306e46924..91af75b503b 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -640,6 +640,131 @@ afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this)
local->transaction.type);
}
+gf_boolean_t
+afr_has_quorum (unsigned char *subvols, xlator_t *this)
+{
+ unsigned int quorum_count = 0;
+ afr_private_t *priv = NULL;
+ unsigned int up_children_count = 0;
+
+ priv = this->private;
+ up_children_count = afr_up_children_count (subvols, priv->child_count);
+
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ quorum_count = priv->child_count/2 + 1;
+ } else {
+ quorum_count = priv->quorum_count;
+ }
+
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ /*
+ * Special case for even numbers of nodes in auto-quorum:
+ * if we have exactly half children up
+ * and that includes the first ("senior-most") node, then that counts
+ * as quorum even if it wouldn't otherwise. This supports e.g. N=2
+ * while preserving the critical property that there can only be one
+ * such group.
+ */
+ if ((priv->child_count % 2 == 0) &&
+ (up_children_count == (priv->child_count/2)))
+ return subvols[0];
+ }
+
+ if (up_children_count >= quorum_count)
+ return _gf_true;
+
+ return _gf_false;
+}
+
+static gf_boolean_t
+afr_has_fop_quorum (call_frame_t *frame)
+{
+ xlator_t *this = frame->this;
+ afr_local_t *local = frame->local;
+ unsigned char *locked_nodes = NULL;
+
+ locked_nodes = afr_locked_nodes_get (local->transaction.type,
+ &local->internal_lock);
+ return afr_has_quorum (locked_nodes, this);
+}
+
+static gf_boolean_t
+afr_has_fop_cbk_quorum (call_frame_t *frame)
+{
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ unsigned char *success = alloca(priv->child_count);
+ int i = 0;
+ int j = 0;
+
+ j = afr_index_for_transaction_type (local->transaction.type);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->pending[i][j])
+ success[i] = 1;
+ else
+ success[i] = 0;
+ }
+
+ return afr_has_quorum (success, this);
+}
+
+void
+afr_handle_quorum (call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ uuid_t gfid = {0};
+
+ local = frame->local;
+ priv = frame->this->private;
+
+ if (priv->quorum_count == 0)
+ return;
+
+ /*
+ * Network split may happen just after the fops are unwound, so check
+ * if the fop succeeded in a way it still follows quorum. If it doesn't,
+ * mark the fop as failure, mark the changelogs so it reflects that
+ * failure.
+ *
+ * Scenario:
+ * There are 3 mounts on 3 machines(node1, node2, node3) all writing to
+ * single file. Network split happened in a way that node1 can't see
+ * node2, node3. Node2, node3 both of them can't see node1. Now at the
+ * time of sending write all the bricks are up. Just after write fop is
+ * wound on node1, network split happens. Node1 thinks write fop failed
+ * on node2, node3 so marks pending changelog for those 2 extended
+ * attributes on node1. Node2, node3 thinks writes failed on node1 so
+ * they mark pending changelog for node1. When the network is stable
+ * again the file already is in split-brain. These checks prevent
+ * marking pending changelog on other subvolumes if the fop doesn't
+ * succeed in a way it is still following quorum. So with this fix what
+ * is happening is, node1 will have all pending changelog(FOOL) because
+ * the write succeeded only on node1 but failed on node2, node3 so
+ * instead of marking pending changelogs on node2, node3 it just treats
+ * the fop as failure and goes into FOOL state. Where as node2, node3
+ * say they are sources and have pending changelog to node1 so there is
+ * no split-brain with the fix. The problem is eliminated completely.
+ */
+ if (afr_has_fop_cbk_quorum (frame))
+ return;
+
+ if (local->fd)
+ uuid_copy (gfid, local->fd->inode->gfid);
+ else
+ loc_gfid (&local->loc, gfid);
+
+ gf_log (frame->this->name, GF_LOG_WARNING, "%s: Failing %s as quorum "
+ "is not met", uuid_utoa (gfid), gf_fop_list[local->op]);
+ for (i = 0; i < priv->child_count; i++)
+ __mark_child_dead (local->pending, priv->child_count, i,
+ local->transaction.type);
+ local->op_ret = -1;
+ local->op_errno = EROFS;
+}
+
int
afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
{
@@ -663,6 +788,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
afr_data_handle_quota_errors (frame, this);
afr_dir_fop_handle_all_fop_failures (frame);
+ afr_handle_quorum (frame);
if (local->fd)
afr_transaction_rm_stale_children (frame, this,
@@ -1320,12 +1446,24 @@ afr_lock (call_frame_t *frame, xlator_t *this)
int
afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+
+ if (priv->quorum_count && !afr_has_fop_quorum (frame)) {
+ local->op_ret = -1;
+ local->op_errno = EROFS;
+ local->internal_lock.lock_cbk = local->transaction.done;
+ afr_unlock (frame, this);
+ goto out;
+ }
+
if (__fop_changelog_needed (frame, this)) {
afr_changelog_pre_op (frame, this);
} else {
afr_transaction_perform_fop (frame, this);
}
+out:
return 0;
}
@@ -1833,9 +1971,7 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
child_index, local->transaction.type);
}
-
-
- static gf_boolean_t
+static gf_boolean_t
afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
{
uint64_t start1 = local1->transaction.start;
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index fa626fd0d6e..bcc40db99d3 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -48,4 +48,7 @@ afr_any_fops_failed (afr_local_t *local, afr_private_t *priv);
gf_boolean_t
afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_has_quorum (unsigned char *subvols, xlator_t *this);
#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index c724eb2ae42..c020cbccac5 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -497,9 +497,6 @@ struct xlator_fops fops = {
.finodelk = afr_finodelk,
.entrylk = afr_entrylk,
.fentrylk = afr_fentrylk,
- .fallocate = afr_fallocate,
- .discard = afr_discard,
- .zerofill = afr_zerofill,
/* inode read */
.access = afr_access,
@@ -520,6 +517,9 @@ struct xlator_fops fops = {
.fsetattr = afr_fsetattr,
.removexattr = afr_removexattr,
.fremovexattr = afr_fremovexattr,
+ .fallocate = afr_fallocate,
+ .discard = afr_discard,
+ .zerofill = afr_zerofill,
/* dir read */
.opendir = afr_opendir,
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index e86300d0a22..59d08a1d0b7 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1136,9 +1136,6 @@ int
afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child,
int flags);
-gf_boolean_t
-afr_have_quorum (char *logname, afr_private_t *priv);
-
void
afr_matrix_cleanup (int32_t **pending, unsigned int m);
@@ -1167,20 +1164,6 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count);
*/
#define AFR_QUORUM_AUTO INT_MAX
-/*
- * Having this as a macro will make debugging a bit weirder, but does reduce
- * the probability of functions handling this check inconsistently.
- */
-#define QUORUM_CHECK(_func,_label) do { \
- if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \
- gf_log(this->name,GF_LOG_WARNING, \
- "failing "#_func" due to lack of quorum"); \
- op_errno = EROFS; \
- goto _label; \
- } \
-} while (0);
-
-
#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO."
#define AFR_SBRAIN_CHECK_FD(fd, label) do { \