summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-x tests/basic/nsr/nsr.t 33
-rw-r--r-- tests/volume.rc 18
-rw-r--r-- xlators/experimental/nsr-client/src/nsrc.c 110
-rw-r--r-- xlators/experimental/nsr-client/src/nsrc.h 3
-rw-r--r-- xlators/experimental/nsr-server/src/all-templates.c 12
-rw-r--r-- xlators/experimental/nsr-server/src/nsr-internal.h 2
-rw-r--r-- xlators/experimental/nsr-server/src/nsr.c 90
7 files changed, 248 insertions, 20 deletions
diff --git a/tests/basic/nsr/nsr.t b/tests/basic/nsr/nsr.t
new file mode 100755
index 00000000000..b5a4aaf1058
--- /dev/null
+++ b/tests/basic/nsr/nsr.t
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc  # basic NSR test: 3-node replica volume, write via the mount, check every brick
+. $(dirname $0)/../../volume.rc  # defines nsrc_child_up_status used below
+. $(dirname $0)/../../cluster.rc  # presumably provides launch_cluster, $CLI_1 and $H1..$H3 -- confirm
+. $(dirname $0)/../../snapshot.rc  # presumably provides verify_lvm_version / setup_lvm -- confirm
+
+cleanup;
+
+TEST verify_lvm_version;
+#Create cluster with 3 nodes
+TEST launch_cluster 3;
+TEST setup_lvm 3  # assumed to prepare the $L1..$L3 brick locations used in "volume create" -- confirm
+
+TEST $CLI_1 peer probe $H2;
+TEST $CLI_1 peer probe $H3;
+EXPECT_WITHIN $PROBE_TIMEOUT 2 peer_count;  # expect both probed peers to be counted
+
+TEST $CLI_1 volume create $V0 replica 3 $H1:$L1 $H2:$L2 $H3:$L3  # one brick per node
+TEST $CLI_1 volume set $V0 cluster.nsr on  # turn NSR on for this volume before starting it
+#TEST $CLI_1 volume set $V0 diagnostics.brick-log-level DEBUG
+TEST $CLI_1 volume start $V0
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H1 --entry-timeout=0 $M0;  # client mount on $M0
+
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" nsrc_child_up_status $V0 0  # helper prints the client's up_children value; expect all 3
+
+echo "file" > $M0/file1  # write through the mount (not wrapped in TEST, so a failure here is only caught by the stats below)
+TEST stat $L1/file1  # the file must exist under each brick path given to "volume create"
+TEST stat $L2/file1
+TEST stat $L3/file1
+
+cleanup;
diff --git a/tests/volume.rc b/tests/volume.rc
index e488aa73b1c..71b40b72d66 100644
--- a/tests/volume.rc
+++ b/tests/volume.rc
@@ -110,6 +110,24 @@ function snap_client_connected_status {
echo "$up"
}
+function _nsrc_child_up_status {
+ local vol=$1
+ #brick_id is (brick-num in volume info - 1)
+ local brick_id=$2
+ local gen_state_dump=$3
+ local fpath=$($gen_state_dump $vol)
+ up=$(grep -a -B1 child_$brick_id=$vol-client-$brick_id $fpath | head -1 | cut -f2 -d'=')
+ rm -f $fpath
+ echo "$up"
+}
+
+function nsrc_child_up_status {
+ local vol=$1
+ #brick_id is (brick-num in volume info - 1)
+ local brick_id=$2
+ _nsrc_child_up_status $vol $brick_id generate_mount_statedump
+}
+
function _afr_child_up_status {
local vol=$1
#brick_id is (brick-num in volume info - 1)
diff --git a/xlators/experimental/nsr-client/src/nsrc.c b/xlators/experimental/nsr-client/src/nsrc.c
index dd3ad20544e..13f1a2d38c5 100644
--- a/xlators/experimental/nsr-client/src/nsrc.c
+++ b/xlators/experimental/nsr-client/src/nsrc.c
@@ -18,6 +18,7 @@
#include "xlator.h"
#include "nsr-messages.h"
#include "nsrc.h"
+#include "statedump.h"
#define SCAR_LIMIT 20
#define HILITE(x) (""x"")
@@ -168,6 +169,7 @@ int32_t
nsrc_init (xlator_t *this)
{
nsrc_private_t *priv = NULL;
+ xlator_list_t *trav = NULL;
this->local_pool = mem_pool_new (nsrc_local_t, 128);
if (!this->local_pool) {
@@ -181,6 +183,10 @@ nsrc_init (xlator_t *this)
goto err;
}
+ for (trav = this->children; trav; trav = trav->next) {
+ ++(priv->n_children);
+ }
+
priv->active = FIRST_CHILD(this);
this->private = priv;
return 0;
@@ -198,33 +204,111 @@ nsrc_fini (xlator_t *this)
GF_FREE(this->private);
}
+/*
+ * Return the zero-based position of @kid in this->children, or -1 when
+ * @kid is not found in that list.
+ */
+int
+nsrc_get_child_index (xlator_t *this, xlator_t *kid)
+{
+        xlator_list_t *node = this->children;
+        int            pos  = 0;
+
+        while (node) {
+                if (node->xlator == kid)
+                        return pos;
+                node = node->next;
+                ++pos;
+        }
+
+        return -1;
+}
+
+/*
+ * Count how many of the first n_children bits are set in priv->kid_state.
+ *
+ * @priv: translator private state; kid_state is a bitmask with one bit per
+ *        child and n_children is the number of children.
+ *
+ * Returns the number of set bits among those children.
+ *
+ * The probe bit is built as an unsigned value and the scan is clamped to
+ * the width of kid_state: n_children is a uint8_t and may exceed the number
+ * of bits in the mask, and shifting a signed 1 by 31 or more is undefined.
+ */
+uint8_t
+nsrc_count_up_kids (nsrc_private_t *priv)
+{
+        const unsigned int mask_bits = sizeof (priv->kid_state) * 8;
+        unsigned int       limit     = priv->n_children;
+        unsigned int       bit       = 0;
+        uint8_t            count     = 0;
+
+        if (limit > mask_bits) {
+                limit = mask_bits;
+        }
+
+        for (bit = 0; bit < limit; ++bit) {
+                if (priv->kid_state & ((uint32_t)1 << bit)) {
+                        ++count;
+                }
+        }
+
+        return count;
+}
+
int32_t
nsrc_notify (xlator_t *this, int32_t event, void *data, ...)
{
- int32_t ret = 0;
+ int32_t ret = 0; /* stays 0 whenever the event is not handed to default_notify */
+ int32_t index = 0; /* position of the notifying child in this->children, or -1 */
+ nsrc_private_t *priv = NULL;
+
+ GF_VALIDATE_OR_GOTO (THIS->name, this, out); /* THIS->name is used because `this` is the value being checked */
+ priv = this->private;
+ GF_VALIDATE_OR_GOTO (this->name, priv, out);
switch (event) {
+ case GF_EVENT_CHILD_UP: /* record the child as up, then forward the event */
+ index = nsrc_get_child_index(this, data); /* data is treated as the child xlator that raised the event */
+ if (index >= 0) { /* -1 means data was not found among our children */
+ priv->kid_state |= (1 << index); /* set this child's bit */
+ priv->up_children = nsrc_count_up_kids(priv); /* recount from the bitmask */
+ gf_msg (this->name, GF_LOG_INFO, 0, N_MSG_GENERIC,
+ "got CHILD_UP for %s, now %u kids",
+ ((xlator_t *)data)->name,
+ priv->up_children);
+ }
+ ret = default_notify (this, event, data); /* forwarded even when the child was not recognised */
+ break;
case GF_EVENT_CHILD_DOWN:
- /*
- * TBD: handle this properly
- *
- * What we really should do is propagate this only if it caused
- * us to lose quorum, and likewise for GF_EVENT_CHILD_UP only
- * if it caused us to gain quorum. However, that requires
- * tracking child states and for now it's easier to swallow
- * these unconditionally. The consequence of failing to do
- * this is that DHT sees the first GF_EVENT_CHILD_DOWN and gets
- * confused, so it doesn't call us and doesn't get up-to-date
- * directory listings etc.
- */
+ index = nsrc_get_child_index(this, data);
+ if (index >= 0) {
+ priv->kid_state &= ~(1 << index); /* clear this child's bit */
+ priv->up_children = nsrc_count_up_kids(priv);
+ gf_msg (this->name, GF_LOG_INFO, 0, N_MSG_GENERIC,
+ "got CHILD_DOWN for %s, now %u kids",
+ ((xlator_t *)data)->name,
+ priv->up_children);
+ } /* CHILD_DOWN is only recorded here; default_notify is not called for it */
break;
default:
ret = default_notify (this, event, data);
}
+out:
return ret;
}
+/*
+ * Dump this translator's private state: add a "<type>.<name>" section,
+ * write the current up_children count, then write one child_<i> entry per
+ * child holding that child's name.  Always returns 0, including when
+ * `this` or its private data fails validation.
+ */
+int
+nsrc_priv_dump (xlator_t *this)
+{
+        char            key[GF_DUMP_MAX_BUF_LEN];
+        nsrc_private_t *priv = NULL;
+        xlator_list_t  *kid  = NULL;
+        int32_t         slot = 0;
+
+        GF_VALIDATE_OR_GOTO (THIS->name, this, out);
+        priv = this->private;
+        GF_VALIDATE_OR_GOTO (this->name, priv, out);
+
+        /* Section name for this translator instance. */
+        snprintf (key, sizeof (key), "%s.%s", this->type, this->name);
+        gf_proc_dump_add_section (key);
+
+        gf_proc_dump_write ("up_children", "%u", priv->up_children);
+
+        /* The same buffer is reused to build each per-child key. */
+        kid = this->children;
+        while (kid) {
+                snprintf (key, sizeof (key), "child_%d", slot);
+                gf_proc_dump_write (key, "%s", kid->xlator->name);
+                kid = kid->next;
+                ++slot;
+        }
+
+out:
+        return 0;
+}
+
+struct xlator_dumpops dumpops = { /* dump hooks for this translator; only the private-state hook is set */
+ .priv = nsrc_priv_dump,
+};
+
class_methods_t class_methods = {
.init = nsrc_init,
.fini = nsrc_fini,
diff --git a/xlators/experimental/nsr-client/src/nsrc.h b/xlators/experimental/nsr-client/src/nsrc.h
index 0c61d7a9fa8..15f0d7c85a0 100644
--- a/xlators/experimental/nsr-client/src/nsrc.h
+++ b/xlators/experimental/nsr-client/src/nsrc.h
@@ -13,6 +13,9 @@
typedef struct {
xlator_t *active;
+ uint8_t up_children; /* number of set child bits in kid_state; recomputed in nsrc_notify on CHILD_UP/CHILD_DOWN */
+ uint8_t n_children; /* number of entries in this->children, counted in nsrc_init */
+ uint32_t kid_state; /* bitmask: bit i is set on CHILD_UP and cleared on CHILD_DOWN for child index i */
} nsrc_private_t;
typedef struct {
diff --git a/xlators/experimental/nsr-server/src/all-templates.c b/xlators/experimental/nsr-server/src/all-templates.c
index 300abea959d..c3819d2af54 100644
--- a/xlators/experimental/nsr-server/src/all-templates.c
+++ b/xlators/experimental/nsr-server/src/all-templates.c
@@ -83,6 +83,9 @@ nsr_@NAME@ (call_frame_t *frame, xlator_t *this,
if (result == _gf_false) {
/* Emulate the AFR client-side-quorum behavior. */
+ gf_msg (this->name, GF_LOG_ERROR, EROFS,
+ N_MSG_QUORUM_NOT_MET, "Sufficient number of "
+ "subvolumes are not up to meet quorum.");
op_errno = EROFS;
goto err;
}
@@ -309,6 +312,10 @@ nsr_@NAME@_continue (call_frame_t *frame, xlator_t *this,
result = fop_quorum_check (this, (double)priv->n_children,
(double)local->successful_acks + 1);
if (result == _gf_false) {
+ gf_msg (this->name, GF_LOG_ERROR, EROFS,
+ N_MSG_QUORUM_NOT_MET, "Didn't receive enough acks "
+ "to meet quorum. Failing the operation without trying "
+ "it on the leader.");
STACK_UNWIND_STRICT (@NAME@, frame, -1, EROFS,
@ERROR_ARGS@);
} else {
@@ -406,8 +413,9 @@ nsr_@NAME@_complete (call_frame_t *frame, void *cookie, xlator_t *this,
if (result == _gf_false) {
op_ret = -1;
op_errno = EROFS;
- gf_msg_debug (this->name, 0,
- "Quorum is not met. The operation has failed.");
+ gf_msg (this->name, GF_LOG_ERROR, EROFS,
+ N_MSG_QUORUM_NOT_MET, "Quorum is not met. "
+ "The operation has failed.");
} else {
#if defined(NSR_CG_NEED_FD)
op_ret = local->successful_op_ret;
diff --git a/xlators/experimental/nsr-server/src/nsr-internal.h b/xlators/experimental/nsr-server/src/nsr-internal.h
index b8c7fc314b7..d43fbac9a53 100644
--- a/xlators/experimental/nsr-server/src/nsr-internal.h
+++ b/xlators/experimental/nsr-server/src/nsr-internal.h
@@ -74,6 +74,8 @@ typedef struct {
* TBD: re-evaluate how to manage this
*/
char term_buf[CHANGELOG_ENTRY_SIZE];
+ gf_boolean_t child_up; /* To maintain the state of *
+ * the translator */
} nsr_private_t;
typedef struct {
diff --git a/xlators/experimental/nsr-server/src/nsr.c b/xlators/experimental/nsr-server/src/nsr.c
index 48966ab15a1..0fb618f236e 100644
--- a/xlators/experimental/nsr-server/src/nsr.c
+++ b/xlators/experimental/nsr-server/src/nsr.c
@@ -860,13 +860,23 @@ nsr_get_child_index (xlator_t *this, xlator_t *kid)
int
nsr_notify (xlator_t *this, int event, void *data, ...)
{
- nsr_private_t *priv = this->private;
- int index;
+ nsr_private_t *priv = this->private;
+ int index = -1;
+ int ret = -1;
+ gf_boolean_t result = _gf_false;
+ gf_boolean_t relevant = _gf_false;
switch (event) {
case GF_EVENT_CHILD_UP:
index = nsr_get_child_index(this, data);
if (index >= 0) {
+ /* Check if the child was previously down
+ * and it's not a false CHILD_UP
+ */
+ if (!(priv->kid_state & (1 << index))) {
+ relevant = _gf_true;
+ }
+
priv->kid_state |= (1 << index);
priv->up_children = nsr_count_up_kids(priv);
gf_msg (this->name, GF_LOG_INFO, 0, N_MSG_GENERIC,
@@ -876,27 +886,96 @@ nsr_notify (xlator_t *this, int event, void *data, ...)
if (!priv->config_leader && (priv->up_children > 1)) {
priv->leader = _gf_false;
}
+
+ /* If it's not relevant, or we have already *
+ * sent CHILD_UP just break */
+ if (!relevant || priv->child_up)
+ break;
+
+ /* If it's not a leader, just send the notify up */
+ if (!priv->leader) {
+ ret = default_notify(this, event, data);
+ if (!ret)
+ priv->child_up = _gf_true;
+ break;
+ }
+
+ result = fop_quorum_check (this,
+ (double)(priv->n_children - 1),
+ (double)(priv->up_children - 1));
+ if (result == _gf_false) {
+ gf_msg (this->name, GF_LOG_INFO, 0,
+ N_MSG_GENERIC, "Not enough children "
+ "are up to meet quorum. Waiting to "
+ "send CHILD_UP from leader");
+ } else {
+ gf_msg (this->name, GF_LOG_INFO, 0,
+ N_MSG_GENERIC, "Enough children are up "
+ "to meet quorum. Sending CHILD_UP "
+ "from leader");
+ ret = default_notify(this, event, data);
+ if (!ret)
+ priv->child_up = _gf_true;
+ }
}
break;
case GF_EVENT_CHILD_DOWN:
index = nsr_get_child_index(this, data);
if (index >= 0) {
+ /* Check if the child was previously up
+ * and it's not a false CHILD_DOWN
+ */
+ if (priv->kid_state & (1 << index)) {
+ relevant = _gf_true;
+ }
priv->kid_state &= ~(1 << index);
priv->up_children = nsr_count_up_kids(priv);
gf_msg (this->name, GF_LOG_INFO, 0, N_MSG_GENERIC,
"got CHILD_DOWN for %s, now %u kids",
((xlator_t *)data)->name,
priv->up_children);
- if (!priv->config_leader && (priv->up_children < 2)) {
+ if (!priv->config_leader && (priv->up_children < 2)
+ && relevant) {
priv->leader = _gf_true;
}
+
+ /* If it's not relevant, or we have already *
+ * sent CHILD_DOWN just break */
+ if (!relevant || !priv->child_up)
+ break;
+
+ /* If it's not a leader, just break coz we shouldn't *
+ * propagate the failure from the failure till it *
+ * itself goes down *
+ */
+ if (!priv->leader) {
+ break;
+ }
+
+ result = fop_quorum_check (this,
+ (double)(priv->n_children - 1),
+ (double)(priv->up_children - 1));
+ if (result == _gf_false) {
+ gf_msg (this->name, GF_LOG_INFO, 0,
+ N_MSG_GENERIC, "Enough children are "
+ "to down to fail quorum. "
+ "Sending CHILD_DOWN from leader");
+ ret = default_notify(this, event, data);
+ if (!ret)
+ priv->child_up = _gf_false;
+ } else {
+ gf_msg (this->name, GF_LOG_INFO, 0,
+ N_MSG_GENERIC, "Not enough children "
+ "are down to fail quorum. Waiting to "
+ "send CHILD_DOWN from leader");
+ }
}
break;
default:
- ;
+ ret = default_notify(this, event, data);
}
- return default_notify(this, event, data);
+ return ret;
}
@@ -995,6 +1074,7 @@ nsr_init (xlator_t *this)
GF_OPTION_INIT ("quorum-percent", priv->quorum_pct, percent, err);
priv->leader = priv->config_leader;
+ priv->child_up = _gf_false;
if (pthread_create(&kid, NULL, nsr_flush_thread,
this) != 0) {