From 76d5e5d5b51eb2ffe5a0608bf8869650bb76585f Mon Sep 17 00:00:00 2001 From: Jeff Darcy Date: Tue, 20 Sep 2011 09:49:57 -0400 Subject: Add quorum checks to avoid split-brain. Change-Id: I2f123ef93989862aa796903a45682981d5d7fc3c BUG: 3533 Reviewed-on: http://review.gluster.com/473 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- xlators/cluster/afr/src/afr-common.c | 32 +++++++++++++++++++++++++++++++ xlators/cluster/afr/src/afr-dir-write.c | 17 +++++++++++++++- xlators/cluster/afr/src/afr-inode-write.c | 14 ++++++++++++++ xlators/cluster/afr/src/afr-open.c | 4 ++++ xlators/cluster/afr/src/afr.c | 6 ++++++ xlators/cluster/afr/src/afr.h | 18 +++++++++++++++++ 6 files changed, 90 insertions(+), 1 deletion(-) (limited to 'xlators') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index e25ff235324..01a092e2553 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -3872,3 +3872,35 @@ afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, out: return ret; } + +gf_boolean_t +afr_have_quorum (char *logname, afr_private_t *priv) +{ + unsigned int quorum = 0; + + GF_VALIDATE_OR_GOTO(logname,priv,out); + + quorum = priv->child_count / 2 + 1; + if (priv->up_count >= (priv->down_count + quorum)) { + return _gf_true; + } + + /* + * Special case for even numbers of nodes: if we have exactly half + * and that includes the first ("senior-most") node, then that counts + * as quorum even if it wouldn't otherwise. This supports e.g. N=2 + * while preserving the critical property that there can only be one + * such group. 
+ */ + if ((priv->child_count % 2) == 0) { + quorum = priv->child_count / 2; + if (priv->up_count >= (priv->down_count + quorum)) { + if (priv->child_up[0]) { + return _gf_true; + } + } + } + +out: + return _gf_false; +} diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index ca669b684f4..11df550d53e 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -47,7 +47,6 @@ #include "afr.h" #include "afr-transaction.h" - void afr_build_parent_loc (loc_t *parent, loc_t *child) { @@ -280,6 +279,8 @@ afr_create (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(create,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { goto out; @@ -503,6 +504,8 @@ afr_mknod (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(mknod,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { goto out; @@ -727,6 +730,8 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(mkdir,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { goto out; @@ -950,6 +955,8 @@ afr_link (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(link,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { goto out; @@ -1172,6 +1179,8 @@ afr_symlink (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(symlink,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { goto out; @@ -1388,6 +1397,8 @@ afr_rename (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(rename,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { goto out; @@ -1585,6 +1596,8 @@ afr_unlink (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(unlink,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { goto out; @@ -1779,6 +1792,8 @@ afr_rmdir (call_frame_t 
*frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(rmdir,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { goto out; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 3f78c7b3658..4135ba947d2 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -449,6 +449,8 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; + QUORUM_CHECK(writev,out); + ALLOC_OR_GOTO (local, afr_local_t, out); ret = AFR_LOCAL_INIT (local, priv); @@ -647,6 +649,8 @@ afr_truncate (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(truncate,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { goto out; @@ -896,6 +900,8 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(ftruncate,out); + ALLOC_OR_GOTO (local, afr_local_t, out); ret = AFR_LOCAL_INIT (local, priv); @@ -1093,6 +1099,8 @@ afr_setattr (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(setattr,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { goto out; @@ -1298,6 +1306,8 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(fsetattr,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { op_errno = ENOMEM; @@ -1487,6 +1497,8 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(setxattr,out); + ALLOC_OR_GOTO (local, afr_local_t, out); ret = AFR_LOCAL_INIT (local, priv); @@ -1671,6 +1683,8 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(removexattr,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { goto out; diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 646d23ccb53..e19847b0b35 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ 
b/xlators/cluster/afr/src/afr-open.c @@ -214,6 +214,10 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, priv = this->private; + if (flags & (O_CREAT|O_TRUNC)) { + QUORUM_CHECK(open,out); + } + if (afr_is_split_brain (this, loc->inode)) { /* self-heal failed */ gf_log (this->name, GF_LOG_WARNING, diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 30da3fc729f..7791ec86bfc 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -260,6 +260,8 @@ init (xlator_t *this) GF_OPTION_INIT ("strict-readdir", priv->strict_readdir, bool, out); + GF_OPTION_INIT ("enforce-quorum", priv->enforce_quorum, bool, out); + priv->wait_count = 1; child_count = xlator_subvolume_count (this); @@ -490,5 +492,9 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_BOOL, .default_value = "off", }, + { .key = {"enforce-quorum"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index e1f13b37655..0677b96e9fe 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -146,6 +146,7 @@ typedef struct _afr_private { struct list_head saved_fds; /* list of fds on which locks have succeeded */ gf_boolean_t optimistic_change_log; gf_boolean_t eager_lock; + gf_boolean_t enforce_quorum; char vol_uuid[UUID_SIZE + 1]; int32_t *last_event; @@ -998,4 +999,21 @@ afr_set_low_priority (call_frame_t *frame); int afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, int flags, int32_t wb_flags); + +gf_boolean_t +afr_have_quorum (char *logname, afr_private_t *priv); + +/* + * Having this as a macro will make debugging a bit weirder, but does reduce + * the probability of functions handling this check inconsistently. 
/*
 * QUORUM_CHECK(_func, _label) - refuse a modifying operation when quorum
 * has been lost.
 *
 * If priv->enforce_quorum is set and afr_have_quorum() reports no quorum,
 * log a warning that names _func, set op_errno to EROFS and jump to _label.
 * Otherwise the macro does nothing.
 *
 * The expansion site must have "this", "priv" and "op_errno" in scope, and
 * _label must be a label in the enclosing function.  _func is only
 * stringified into the log message.
 *
 * Having this as a macro will make debugging a bit weirder, but does reduce
 * the probability of functions handling this check inconsistently.
 *
 * There is deliberately no semicolon after "while (0)": the caller supplies
 * it, so the expansion is exactly one statement and stays valid as the
 * unbraced body of an if/else.
 */
#define QUORUM_CHECK(_func,_label) do {                                  \
        if (priv->enforce_quorum && !afr_have_quorum(this->name,priv)) { \
                gf_log(this->name,GF_LOG_WARNING,                        \
                       "failing "#_func" due to lack of quorum");        \
                op_errno = EROFS;                                        \
                goto _label;                                             \
        }                                                                \
} while (0)