From 3797caabb95ad8e62dee74a5331d324ddffc654f Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Mon, 2 Feb 2015 18:49:01 +0530 Subject: cluster/afr: Implementation of quorum-reads Provide a way of disabling reads when quorum is not met. Change-Id: Ic4f57c2b87a0b8514600759de3a7a47e217fe3b5 BUG: 1187885 Signed-off-by: Pranith Kumar K Reviewed-on: http://review.gluster.org/9543 Reviewed-by: Ravishankar N Tested-by: Gluster Build System --- tests/basic/afr/quorum.t | 44 +++++++++++++++++++++++-- tests/include.rc | 1 + tests/volume.rc | 8 +++++ xlators/cluster/afr/src/afr-common.c | 1 + xlators/cluster/afr/src/afr-read-txn.c | 10 ++++++ xlators/cluster/afr/src/afr-transaction.c | 11 +++++-- xlators/cluster/afr/src/afr.c | 11 +++++++ xlators/cluster/afr/src/afr.h | 1 + xlators/mgmt/glusterd/src/glusterd-volume-set.c | 4 +++ 9 files changed, 87 insertions(+), 4 deletions(-) diff --git a/tests/basic/afr/quorum.t b/tests/basic/afr/quorum.t index dbf8895e7ed..c105290445a 100644 --- a/tests/basic/afr/quorum.t +++ b/tests/basic/afr/quorum.t @@ -13,10 +13,16 @@ function test_write { #Tests for quorum-type option for replica 2 TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2}; +TEST $CLI volume set $V0 performance.quick-read off +TEST $CLI volume set $V0 performance.io-cache off +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 performance.stat-prefetch off +TEST $CLI volume set $V0 performance.read-ahead off TEST $CLI volume start $V0 -TEST $GFS -s $H0 --volfile-id=$V0 $M0; +TEST $GFS -s $H0 --volfile-id=$V0 $M0 --direct-io-mode=enable; touch $M0/a +echo abc > $M0/b TEST ! $CLI volume set $V0 cluster.quorum-type "" TEST $CLI volume set $V0 cluster.quorum-type fixed @@ -25,6 +31,11 @@ TEST $CLI volume set $V0 cluster.quorum-count 2 TEST test_write TEST kill_brick $V0 $H0 $B0/${V0}1 TEST ! 
test_write +EXPECT "abc" cat $M0/b +TEST $CLI volume set $V0 cluster.quorum-reads on +EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads +TEST ! cat $M0/b +TEST $CLI volume reset $V0 cluster.quorum-reads TEST $CLI volume set $V0 cluster.quorum-type auto EXPECT auto volume_option $V0 cluster.quorum-type @@ -33,6 +44,11 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 TEST test_write TEST kill_brick $V0 $H0 $B0/${V0}1 TEST ! test_write +EXPECT "abc" cat $M0/b +TEST $CLI volume set $V0 cluster.quorum-reads on +EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads +TEST ! cat $M0/b +TEST $CLI volume reset $V0 cluster.quorum-reads TEST $CLI volume set $V0 cluster.quorum-type none EXPECT none volume_option $V0 cluster.quorum-type @@ -40,6 +56,12 @@ TEST test_write #Default is 'none' for even number of bricks in replication TEST $CLI volume reset $V0 cluster.quorum-type TEST test_write +EXPECT "abc" cat $M0/b +TEST $CLI volume set $V0 cluster.quorum-reads on +EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads +EXPECT "abc" cat $M0/b +TEST $CLI volume reset $V0 cluster.quorum-reads + cleanup; TEST glusterd; @@ -47,10 +69,16 @@ TEST pidof glusterd #Tests for quorum-type option for replica 3 TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1,2,3}; +TEST $CLI volume set $V0 performance.quick-read off +TEST $CLI volume set $V0 performance.io-cache off +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 performance.stat-prefetch off +TEST $CLI volume set $V0 performance.read-ahead off TEST $CLI volume start $V0 -TEST $GFS -s $H0 --volfile-id=$V0 $M0; +TEST $GFS -s $H0 --volfile-id=$V0 $M0 --direct-io-mode=enable; touch $M0/a +echo abc > $M0/b TEST $CLI volume set $V0 cluster.quorum-type fixed EXPECT fixed volume_option $V0 cluster.quorum-type @@ -58,12 +86,24 @@ TEST $CLI volume set $V0 
cluster.quorum-count 3 TEST test_write TEST kill_brick $V0 $H0 $B0/${V0}1 TEST ! test_write +EXPECT "abc" cat $M0/b +TEST $CLI volume set $V0 cluster.quorum-reads on +EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads +TEST ! cat $M0/b +TEST $CLI volume reset $V0 cluster.quorum-reads + TEST $CLI volume set $V0 cluster.quorum-type auto EXPECT auto volume_option $V0 cluster.quorum-type TEST test_write TEST kill_brick $V0 $H0 $B0/${V0}3 TEST ! test_write +EXPECT "abc" cat $M0/b +TEST $CLI volume set $V0 cluster.quorum-reads on +EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads +TEST ! cat $M0/b +TEST $CLI volume reset $V0 cluster.quorum-reads + TEST $CLI volume set $V0 cluster.quorum-type none EXPECT none volume_option $V0 cluster.quorum-type diff --git a/tests/include.rc b/tests/include.rc index 8e4e4617411..81c4df91727 100644 --- a/tests/include.rc +++ b/tests/include.rc @@ -61,6 +61,7 @@ HEAL_TIMEOUT=60 MARKER_UPDATE_TIMEOUT=20 JANITOR_TIMEOUT=60 UMOUNT_TIMEOUT=5 +CONFIG_UPDATE_TIMEOUT=5 statedumpdir=`gluster --print-statedumpdir`; # Default directory for statedump diff --git a/tests/volume.rc b/tests/volume.rc index 887a9cae861..6abf68dc75c 100644 --- a/tests/volume.rc +++ b/tests/volume.rc @@ -457,3 +457,11 @@ function volgen_volume_option { local xl_option="$5" sed -e "/./{H;\$!d;}" -e "x;/volume $xl_vol/!d;/type $xl_type\/$xl_feature/!d;/option $xl_option/!d" $volfile | grep " $xl_option " | awk '{print $3}' } + +function mount_get_option_value { + local m=$1 + local subvol=$2 + local key=$3 + + grep "$key" "$m/.meta/graphs/active/$subvol/private" | awk '{print $3}' +} diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index c31fcba6c3b..a28cbf4a2a9 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -3561,6 +3561,7 @@ afr_priv_dump (xlator_t *this) gf_proc_dump_write("read_child", "%d", 
priv->read_child); gf_proc_dump_write("favorite_child", "%d", priv->favorite_child); gf_proc_dump_write("wait_count", "%u", priv->wait_count); + gf_proc_dump_write("quorum-reads", "%d", priv->quorum_reads); return 0; } diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index 29a926dbd97..ec67a20e624 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -195,6 +195,15 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, local->readfn = readfn; local->inode = inode_ref (inode); + if (priv->quorum_reads && + priv->quorum_count && !afr_has_quorum (priv->child_up, this)) { + local->op_ret = -1; + local->op_errno = ENOTCONN; + read_subvol = -1; + goto read; + } + + local->transaction.type = type; ret = afr_inode_read_subvol_type_get (inode, this, local->readable, &event_generation, type); @@ -232,6 +241,7 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, local->read_attempted[read_subvol] = 1; +read: local->readfn (frame, this, read_subvol); return 0; diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 3cb073ecfc3..afa11bba7ab 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -28,6 +28,13 @@ int afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, afr_changelog_resume_t changelog_resume); +static int32_t +afr_quorum_errno (afr_private_t *priv) +{ + if (priv->quorum_reads) + return ENOTCONN; + return EROFS; +} int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this) @@ -558,7 +565,7 @@ afr_handle_quorum (call_frame_t *frame) } local->op_ret = -1; - local->op_errno = EROFS; + local->op_errno = afr_quorum_errno (priv); } int @@ -992,7 +999,7 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) * quorum number of nodes. 
*/ if (priv->quorum_count && !afr_has_fop_quorum (frame)) { - op_errno = EROFS; + op_errno = afr_quorum_errno (priv); goto err; } diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index bf7ba3fb0ac..f435767f5e4 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -211,6 +211,9 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options, int32, out); + GF_OPTION_RECONF ("quorum-reads", priv->quorum_reads, options, + bool, out); + priv->did_discovery = _gf_false; ret = 0; @@ -359,6 +362,8 @@ init (xlator_t *this) GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out); GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out); + GF_OPTION_INIT ("quorum-reads", priv->quorum_reads, bool, out); + priv->wait_count = 1; priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, @@ -724,6 +729,12 @@ struct volume_options options[] = { "this many bricks or present. Other quorum types " "will OVERWRITE this value.", }, + { .key = {"quorum-reads"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .description = "If quorum-reads is \"true\" only allow reads if " + "quorum is met when quorum is enabled.", + }, { .key = {"node-uuid"}, .type = GF_OPTION_TYPE_STR, .description = "Local glusterd uuid string, used in starting " diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 8156eaa995e..dff70e89bb3 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -97,6 +97,7 @@ typedef struct _afr_private { gf_boolean_t pre_op_compat; /* on/off */ uint32_t post_op_delay_secs; unsigned int quorum_count; + gf_boolean_t quorum_reads; char vol_uuid[UUID_SIZE + 1]; int32_t *last_event; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index e9473658176..e35a607cfc2 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ 
b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1655,6 +1655,10 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_7_0, .validate_fn = validate_disperse_heal_enable_disable }, + { .key = "cluster.quorum-reads", + .voltype = "cluster/replicate", + .op_version = GD_OP_VERSION_3_7_0, + }, { .key = NULL } }; -- cgit