summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Jeff Darcy <jdarcy@redhat.com> 2014-03-31 18:37:38 +0000
committer: Jeff Darcy <jdarcy@redhat.com> 2014-04-22 15:20:46 +0000
commit: 46d333783a968ab39e0beade9c7a1eec8035f8b1 (patch)
tree: faf1db8cb7ea7fefb0a4d8374440fa095116fef1
parent: acd2292f085b15c2c5c28169d11f20dca90f5ec9 (diff)
nsr: add quorum enforcement
Change-Id: I0241f8c1ac97c80ae438e3d9f1ac492d63da9347 Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
-rw-r--r-- tests/basic/quorum.t 64
-rwxr-xr-x tests/basic/recon.t 4
-rw-r--r-- xlators/cluster/nsr-server/src/all-templates.c 25
-rw-r--r-- xlators/cluster/nsr-server/src/nsr-internal.h 1
-rw-r--r-- xlators/cluster/nsr-server/src/nsr.c 8
-rw-r--r-- xlators/cluster/nsr-server/src/recon_notify.c 10
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-volume-set.c 6
7 files changed, 116 insertions, 2 deletions
diff --git a/tests/basic/quorum.t b/tests/basic/quorum.t
new file mode 100644
index 0000000..b8fc9cf
--- /dev/null
+++ b/tests/basic/quorum.t
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# Test NSR quorum enforcement - start a two-brick volume, kill one brick, and
+# verify that writes then fail with EROFS ("Read-only file system").
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+function get_rep_count {
+ v=$(getfattr --only-values -e text -n trusted.nsr.rep-count $1 2> /dev/null)
+ #echo $v > /dev/tty
+ echo $v
+}
+
+function kill_a_brick {
+ for r in /var/lib/glusterd/vols/${V0}/run/*-recon.pid; do
+ rpid=$(cat $r)
+ #echo "recon PID = $rpid" > /dev/tty
+ b=$(echo $r | sed '/\(.*\):\(.*\)-recon.pid/s//\1\2.pid/')
+ bpid=$(cat $b)
+ #echo "brick PID = $bpid" > /dev/tty
+ kill -9 $bpid $rpid
+ return 0
+ done
+
+ # No bricks?!?
+ return 1
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info
+
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2}
+
+EXPECT "$V0" volinfo_field $V0 'Volume Name'
+EXPECT 'Created' volinfo_field $V0 'Status'
+EXPECT '2' brick_count $V0
+
+TEST $CLI volume set $V0 cluster.nsr on
+TEST $CLI volume set $V0 cluster.nsr.recon on
+
+TEST $CLI volume start $V0
+EXPECT 'Started' volinfo_field $V0 'Status'
+
+## Mount FUSE with caching disabled
+TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
+
+# Give the bricks a chance to connect to each other.
+EXPECT_WITHIN 10 "2" get_rep_count $M0
+
+TEST kill_a_brick
+EXPECT_WITHIN 10 "1" get_rep_count $M0
+
+# Make sure writes fail while degraded.
+tmpfile=$(mktemp)
+trap "rm $tmpfile" EXIT
+dd if=/dev/urandom of=$M0/probe bs=4k count=100 status=none 2> $tmpfile
+TEST [ x"$?" != x"0" ]
+TEST grep -qs 'Read-only file system' $tmpfile
+
+cleanup
diff --git a/tests/basic/recon.t b/tests/basic/recon.t
index fac4545..e0fbea7 100755
--- a/tests/basic/recon.t
+++ b/tests/basic/recon.t
@@ -119,6 +119,10 @@ EXPECT '2' brick_count $V0
TEST $CLI volume set $V0 cluster.nsr on
TEST $CLI volume set $V0 cluster.nsr.recon on
+# This would normally be a terrible idea, but it's handy for issuing ops that
+# will have to be reconciled later.
+TEST $CLI volume set $V0 cluster.nsr.quorum-percent 0
+
TEST $CLI volume start $V0
EXPECT 'Started' volinfo_field $V0 'Status'
diff --git a/xlators/cluster/nsr-server/src/all-templates.c b/xlators/cluster/nsr-server/src/all-templates.c
index 2f0509a..fa29de7 100644
--- a/xlators/cluster/nsr-server/src/all-templates.c
+++ b/xlators/cluster/nsr-server/src/all-templates.c
@@ -59,6 +59,31 @@ nsr_$NAME$ (call_frame_t *frame, xlator_t *this,
int from_leader;
int from_recon;
uint32_t ti = 0;
+ double must_be_up;
+ double are_up;
+
+ /*
+ * Our first goal here is to avoid "split brain surprise" for users who
+ * specify exactly 50% with two- or three-way replication. That means
+ * either a more-than check against half the total replicas or an
+ * at-least check against half of our peers (one less). Of the two,
+ * only an at-least check supports the intuitive use of 100% to mean
+ * all replicas must be present, because "more than 100%" will never
+ * succeed regardless of which count we use. This leaves us with a
+ * slightly non-traditional definition of quorum ("at least X% of peers
+ * not including ourselves") but one that's useful enough to be worth
+ * it.
+ *
+ * Note that n_children and up_children *do* include the local
+ * subvolume, so we need to subtract one in each case.
+ */
+ must_be_up = ((double)(priv->n_children - 1)) * priv->quorum_pct;
+ are_up = ((double)(priv->up_children - 1)) * 100.0;
+ if (are_up < must_be_up) {
+ /* Emulate the AFR client-side-quorum behavior. */
+ op_errno = EROFS;
+ goto err;
+ }
local = mem_get0(this->local_pool);
if (!local) {
diff --git a/xlators/cluster/nsr-server/src/nsr-internal.h b/xlators/cluster/nsr-server/src/nsr-internal.h
index fc612c1..72b61bf 100644
--- a/xlators/cluster/nsr-server/src/nsr-internal.h
+++ b/xlators/cluster/nsr-server/src/nsr-internal.h
@@ -59,6 +59,7 @@ typedef struct {
volatile uint32_t ops_in_flight;
uint32_t index;
gf_lock_t index_lock;
+ double quorum_pct;
} nsr_private_t;
typedef struct {
diff --git a/xlators/cluster/nsr-server/src/nsr.c b/xlators/cluster/nsr-server/src/nsr.c
index eda9e55..85eba09 100644
--- a/xlators/cluster/nsr-server/src/nsr.c
+++ b/xlators/cluster/nsr-server/src/nsr.c
@@ -591,6 +591,9 @@ nsr_init (xlator_t *this)
goto err;
}
+
+ GF_OPTION_INIT ("quorum-percent", priv->quorum_pct, percent, err);
+
GF_OPTION_INIT ("subvol-uuid", priv->subvol_uuid, str, err);
gf_log (this->name, GF_LOG_INFO, "subvol_uuid = %s", priv->subvol_uuid);
if (gf_asprintf(&priv->leader_key,"%s:leader",priv->subvol_uuid) <= 0) {
@@ -800,5 +803,10 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_STR,
.description = "UUID for this NSR (sub)volume"
},
+ { .key = {"quorum-percent"},
+ .type = GF_OPTION_TYPE_PERCENT,
+ .default_value = "50.0",
+ .description = "percentage of rep_count-1 that must be up"
+ },
{ .key = {NULL} },
};
diff --git a/xlators/cluster/nsr-server/src/recon_notify.c b/xlators/cluster/nsr-server/src/recon_notify.c
index 24f7cf2..1c50de2 100644
--- a/xlators/cluster/nsr-server/src/recon_notify.c
+++ b/xlators/cluster/nsr-server/src/recon_notify.c
@@ -91,8 +91,14 @@ nsr_recon_set_leader (xlator_t *this)
if (ctx->last_reconciled_term == priv->current_term)
return;
- // No majority as of yet
- if (priv->up_children <= (priv->n_children / 2))
+ /*
+ * Quorum for reconciliation is not the same as quorum for I/O. Here,
+ * we require a true majority. No +1 adjustment is applied below; the
+ * check assumes n_children and up_children include the local subvolume.
+ *
+ * TBD: re-evaluate when to reconcile (including partial)
+ */
+ if (priv->up_children <= (priv->n_children / 2))
return;
gf_log (this->name, GF_LOG_INFO,
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 39bbe0a..24a6ed7 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -889,6 +889,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.description = "enable NSR reconciliation",
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
},
+ { .key = "cluster.nsr.quorum-percent",
+ .voltype = "cluster/nsr",
+ .option = "quorum-percent",
+ .op_version = 3,
+ .description = "percent of rep_count-1 bricks that must be up"
+ },
/* Performance xlators enable/disbable options */
{ .key = "performance.write-behind",