From eb04dab9992f8f5d4b2d45e1ca10032fededcff1 Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Tue, 29 Apr 2014 05:49:21 +0530 Subject: cluster/afr: Fix bugs in quorum implementation - Have common place to perform quorum fop wind check - Check if fop succeeded in a way that matches quorum to avoid marking changelog in split-brain. BUG: 1066996 Change-Id: Ibc5b80e01dc206b2abbea2d29e26f3c60ff4f204 Signed-off-by: Pranith Kumar K Reviewed-on: http://review.gluster.org/7600 Tested-by: Gluster Build System Reviewed-by: Ravishankar N --- xlators/cluster/afr/src/afr-common.c | 14 +++ xlators/cluster/afr/src/afr-dir-write.c | 16 ---- xlators/cluster/afr/src/afr-inode-write.c | 62 +------------ xlators/cluster/afr/src/afr-open.c | 4 - xlators/cluster/afr/src/afr-transaction.c | 144 +++++++++++++++++++++++++++++- xlators/cluster/afr/src/afr-transaction.h | 1 + xlators/cluster/afr/src/afr.c | 9 +- xlators/cluster/afr/src/afr.h | 16 ---- 8 files changed, 164 insertions(+), 102 deletions(-) (limited to 'xlators') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 164a651bad5..4c7692bd671 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -3144,6 +3144,8 @@ afr_notify (xlator_t *this, int32_t event, int up_child = -1; dict_t *input = NULL; dict_t *output = NULL; + gf_boolean_t had_quorum = _gf_false; + gf_boolean_t has_quorum = _gf_false; priv = this->private; @@ -3181,6 +3183,8 @@ afr_notify (xlator_t *this, int32_t event, goto out; } + had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up, + this); switch (event) { case GF_EVENT_CHILD_UP: LOCK (&priv->lock); @@ -3279,6 +3283,16 @@ afr_notify (xlator_t *this, int32_t event, break; } + if (priv->quorum_count) { + has_quorum = afr_has_quorum (priv->child_up, this); + if (!had_quorum && has_quorum) + gf_log (this->name, GF_LOG_INFO, "Client-quorum" + " is met"); + if (had_quorum && !has_quorum) + gf_log (this->name, GF_LOG_WARNING, + "Client-quorum is not met"); + } + /* have all subvolumes reported status once by now? */ have_heard_from_all = 1; for (i = 0; i < priv->child_count; i++) { diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 465dde54f9c..d59536fcb48 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -447,8 +447,6 @@ afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, priv = this->private; - QUORUM_CHECK(create,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -585,8 +583,6 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, priv = this->private; - QUORUM_CHECK(mknod,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -718,8 +714,6 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, priv = this->private; - QUORUM_CHECK(mkdir,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -849,8 +843,6 @@ afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, priv = this->private; - QUORUM_CHECK(link,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -981,8 +973,6 @@ afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, priv = this->private; - QUORUM_CHECK(symlink,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -1116,8 +1106,6 @@ afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, priv = this->private; - QUORUM_CHECK(rename,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) op_errno = ENOMEM; @@ -1271,8 +1259,6 @@ afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, priv = this->private; - QUORUM_CHECK(unlink,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -1400,8 +1386,6 @@ afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, priv = this->private; - QUORUM_CHECK(rmdir,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 00e0d26768b..0c3ba920937 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -229,7 +229,7 @@ afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this) continue; if (local->replies[i].op_ret < local->op_ret) - afr_transaction_fop_failed(frame, this, i); + afr_transaction_fop_failed (frame, this, i); } } @@ -402,14 +402,9 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; int op_errno = ENOMEM; - priv = this->private; - - QUORUM_CHECK(writev,out); - local = AFR_FRAME_INIT (frame, op_errno); if (!local) goto out; @@ -528,16 +523,11 @@ int afr_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, dict_t *xdata) { - afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; int op_errno = ENOMEM; - priv = this->private; - - QUORUM_CHECK(truncate,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -652,16 +642,11 @@ int afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; int op_errno = ENOMEM; - priv = this->private; - - QUORUM_CHECK(ftruncate,out); - transaction_frame = copy_frame (frame); if (!frame) goto out; @@ -769,16 +754,11 @@ int afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; int op_errno = ENOMEM; - priv = this->private; - - QUORUM_CHECK(setattr,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -879,16 +859,11 @@ int afr_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; int op_errno = ENOMEM; - priv = this->private; - - QUORUM_CHECK(fsetattr,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -991,7 +966,6 @@ int afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; @@ -1003,10 +977,6 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, op_errno, out); - priv = this->private; - - QUORUM_CHECK(setxattr,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -1107,7 +1077,6 @@ int afr_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; @@ -1119,10 +1088,6 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this, GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, op_errno, out); - priv = this->private; - - QUORUM_CHECK(fsetxattr,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -1226,7 +1191,6 @@ int afr_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; @@ -1238,10 +1202,6 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", name, op_errno, out); - priv = this->private; - - QUORUM_CHECK(removexattr,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -1339,7 +1299,6 @@ int afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; @@ -1351,10 +1310,6 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", name, op_errno, out); - priv = this->private; - - QUORUM_CHECK(fremovexattr, out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -1455,16 +1410,11 @@ int afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, off_t offset, size_t len, dict_t *xdata) { - afr_private_t *priv = NULL; call_frame_t *transaction_frame = NULL; afr_local_t *local = NULL; int ret = -1; int op_errno = ENOMEM; - priv = this->private; - - QUORUM_CHECK(fallocate,out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -1573,16 +1523,11 @@ int afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, size_t len, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; int op_errno = ENOMEM; - priv = this->private; - - QUORUM_CHECK(discard, out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; @@ -1687,16 +1632,11 @@ int afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, size_t len, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; int op_errno = ENOMEM; - priv = this->private; - - QUORUM_CHECK(discard, out); - transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index f86aa7fd80d..83da86053f4 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -131,10 +131,6 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, priv = this->private; - if (flags & (O_CREAT|O_TRUNC)) { - QUORUM_CHECK(open,out); - } - local = AFR_FRAME_INIT (frame, op_errno); if (!local) goto out; diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 205ff759ef5..270e2212913 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -430,6 +430,135 @@ afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this) __mark_all_success (frame, this); } +gf_boolean_t +afr_has_quorum (unsigned char *subvols, xlator_t *this) +{ + unsigned int quorum_count = 0; + afr_private_t *priv = NULL; + unsigned int up_children_count = 0; + + priv = this->private; + up_children_count = AFR_COUNT (subvols, priv->child_count); + + if (priv->quorum_count == AFR_QUORUM_AUTO) { + /* + * Special case for even numbers of nodes in auto-quorum: + * if we have exactly half children up + * and that includes the first ("senior-most") node, then that counts + * as quorum even if it wouldn't otherwise. This supports e.g. N=2 + * while preserving the critical property that there can only be one + * such group. + */ + if ((priv->child_count % 2 == 0) && + (up_children_count == (priv->child_count/2))) + return subvols[0]; + } + + if (priv->quorum_count == AFR_QUORUM_AUTO) { + quorum_count = priv->child_count/2 + 1; + } else { + quorum_count = priv->quorum_count; + } + + if (up_children_count >= quorum_count) + return _gf_true; + + return _gf_false; +} + +static gf_boolean_t +afr_has_fop_quorum (call_frame_t *frame) +{ + xlator_t *this = frame->this; + afr_local_t *local = frame->local; + unsigned char *locked_nodes = NULL; + + locked_nodes = afr_locked_nodes_get (local->transaction.type, + &local->internal_lock); + return afr_has_quorum (locked_nodes, this); +} + +static gf_boolean_t +afr_has_fop_cbk_quorum (call_frame_t *frame) +{ + afr_local_t *local = frame->local; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + unsigned char *success = alloca0(priv->child_count); + int i = 0; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) + if (!local->transaction.failed_subvols[i]) + success[i] = 1; + } + + return afr_has_quorum (success, this); +} + +void +afr_handle_quorum (call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + const char *file = NULL; + uuid_t gfid = {0}; + + local = frame->local; + priv = frame->this->private; + + if (priv->quorum_count == 0) + return; + + /* + * Network split may happen just after the fops are unwound, so check + * if the fop succeeded in a way it still follows quorum. If it doesn't, + * mark the fop as failure, mark the changelogs so it reflects that + * failure. + * + * Scenario: + * There are 3 mounts on 3 machines(node1, node2, node3) all writing to + * single file. Network split happened in a way that node1 can't see + * node2, node3. Node2, node3 both of them can't see node1. Now at the + * time of sending write all the bricks are up. Just after write fop is + * wound on node1, network split happens. Node1 thinks write fop failed + * on node2, node3 so marks pending changelog for those 2 extended + * attributes on node1. Node2, node3 thinks writes failed on node1 so + * they mark pending changelog for node1. When the network is stable + * again the file already is in split-brain. These checks prevent + * marking pending changelog on other subvolumes if the fop doesn't + * succeed in a way it is still following quorum. So with this fix what + * is happening is, node1 will have all pending changelog(FOOL) because + * the write succeeded only on node1 but failed on node2, node3 so + * instead of marking pending changelogs on node2, node3 it just treats + * the fop as failure and goes into DIRTY state. Where as node2, node3 + * say they are sources and have pending changelog to node1 so there is + * no split-brain with the fix. The problem is eliminated completely. + */ + + if (afr_has_fop_cbk_quorum (frame)) + return; + + if (local->fd) { + uuid_copy (gfid, local->fd->inode->gfid); + file = uuid_utoa (gfid); + } else { + loc_path (&local->loc, local->loc.name); + file = local->loc.path; + } + + gf_log (frame->this->name, GF_LOG_WARNING, "%s: Failing %s as quorum " + "is not met", file, gf_fop_list[local->op]); + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) + afr_transaction_fop_failed (frame, frame->this, i); + } + + local->op_ret = -1; + local->op_errno = EROFS; +} int afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) @@ -443,6 +572,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) int nothing_failed = 1; gf_boolean_t need_undirty = _gf_false; + afr_handle_quorum (frame); local = frame->local; idx = afr_index_for_transaction_type (local->transaction.type); @@ -453,6 +583,14 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) else need_undirty = _gf_true; + //If the fop fails on all the subvols then pending markers are placed + //for every subvol on all subvolumes. Which is nothing but split-brain. + //Avoid this by not doing post-op in case of failures. + if (local->op_ret < 0) { + afr_changelog_post_op_done (frame, this); + goto out; + } + if (nothing_failed && !need_undirty) { afr_changelog_post_op_done (frame, this); goto out; @@ -842,7 +980,10 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) } } - /* TBD: quorum check w/ call_count */ + if (priv->quorum_count && !afr_has_fop_quorum (frame)) { + op_errno = EROFS; + goto err; + } if (call_count == 0) { op_errno = ENOTCONN; @@ -1057,7 +1198,6 @@ afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this) return 0; } - int afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this) { diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index 77cc8eed019..c3ce333b771 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -49,5 +49,6 @@ int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol); int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this); int __afr_txn_write_done (call_frame_t *frame, xlator_t *this); call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame); +gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this); #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index ead08425f2c..6af136e9163 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -180,6 +180,9 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options, uint32, out); fix_quorum_options(this,priv,qtype); + if (priv->quorum_count && !afr_has_quorum (priv->child_up, this)) + gf_log (this->name, GF_LOG_WARNING, "Client-quorum is not met"); + GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options, uint32, out); @@ -456,9 +459,6 @@ struct xlator_fops fops = { .finodelk = afr_finodelk, .entrylk = afr_entrylk, .fentrylk = afr_fentrylk, - .fallocate = afr_fallocate, - .discard = afr_discard, - .zerofill = afr_zerofill, /* inode read */ .access = afr_access, @@ -479,6 +479,9 @@ struct xlator_fops fops = { .fsetattr = afr_fsetattr, .removexattr = afr_removexattr, .fremovexattr = afr_fremovexattr, + .fallocate = afr_fallocate, + .discard = afr_discard, + .zerofill = afr_zerofill, /* dir read */ .opendir = afr_opendir, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 36042f7b2e5..674ea0bf72e 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -922,9 +922,6 @@ int afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, int flags); -gf_boolean_t -afr_have_quorum (char *logname, afr_private_t *priv); - void afr_matrix_cleanup (int32_t **pending, unsigned int m); @@ -940,19 +937,6 @@ afr_filter_xattrs (dict_t *xattr); */ #define AFR_QUORUM_AUTO INT_MAX -/* - * Having this as a macro will make debugging a bit weirder, but does reduce - * the probability of functions handling this check inconsistently. - */ -#define QUORUM_CHECK(_func,_label) do { \ - if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \ - gf_log(this->name,GF_LOG_WARNING, \ - "failing "#_func" due to lack of quorum"); \ - op_errno = EROFS; \ - goto _label; \ - } \ -} while (0); - int afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); -- cgit