summary | refs | log | tree | commit | diff | stats
path: root/xlators/cluster
diff options
context:
space:
mode:
author Anuradha <atalur@redhat.com> 2015-02-25 15:09:28 +0530
committer Vijay Bellur <vbellur@redhat.com> 2015-03-19 06:33:12 -0700
commit d06692d1deec425f74747e2c463e56f7eca981c8 (patch)
tree 2bb530f08506ee80b5704b09a02fb7f1bb75dfea /xlators/cluster
parent 6f71bc02df5bd177c2f5dbf4e54b2af1525ab979 (diff)
cluster/afr : enable inspection & resolution of files in split-brain
Part 2/2 of the patch series that enables users to analyze and resolve split-brain.

This patch enables users to:
1) Inspect the files in data and metadata split-brain.
2) Resolve the split-brain.
Both are done using a series of setfattr commands.

Consider a volume "test" with 2 bricks.

1) To inspect a file f1:
   setfattr -n replica.split-brain-choice -v test-client-0 f1
   After this command is executed, if no read_subvol is found, reads will be served from test-client-0 (corresponding to brick-0).

2) To resolve split-brain:
   setfattr -n replica.split-brain-heal-finalize -v test-client-0 f1
   Executing this command resolves the data and metadata split-brain, with the subvol mentioned in the command (test-client-0 here) as the source and the rest as sinks.

Change-Id: Ia20f3ee5abd3119e3d54fcc599f1e55ac65fd179
BUG: 1191396
Signed-off-by: Anuradha <atalur@redhat.com>
Reviewed-on: http://review.gluster.org/9743
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Diffstat (limited to 'xlators/cluster')
-rw-r--r-- xlators/cluster/afr/src/afr-common.c 184
-rw-r--r-- xlators/cluster/afr/src/afr-inode-write.c 146
-rw-r--r-- xlators/cluster/afr/src/afr-read-txn.c 7
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-common.c 18
-rw-r--r-- xlators/cluster/afr/src/afr.h 14
5 files changed, 336 insertions, 33 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index f7cc202d4d1..0af46993a34 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -67,6 +67,37 @@ afr_copy_frame (call_frame_t *base)
return frame;
}
+int
+__afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
+{
+ uint64_t ctx_int = 0;
+ int ret = -1;
+ afr_inode_ctx_t *tmp_ctx = NULL;
+
+ ret = __inode_ctx_get (inode, this, &ctx_int);
+ if (ret) {
+ tmp_ctx = GF_CALLOC (1, sizeof (afr_inode_ctx_t),
+ gf_afr_mt_inode_ctx_t);
+ if (!tmp_ctx)
+ goto out;
+
+ ctx_int = (long) tmp_ctx;
+ ret = __inode_ctx_set (inode, this, &ctx_int);
+ if (ret) {
+ GF_FREE (tmp_ctx);
+ goto out;
+ }
+ tmp_ctx->spb_choice = -1;
+ tmp_ctx->read_subvol = 0;
+ } else {
+ tmp_ctx = (afr_inode_ctx_t *) ctx_int;
+ }
+
+ *ctx = tmp_ctx;
+ ret = 0;
+out:
+ return ret;
+}
/*
* INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS:
*
@@ -109,13 +140,16 @@ __afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this,
uint32_t event = 0;
uint64_t val = 0;
int i = 0;
+ afr_inode_ctx_t *ctx = NULL;
priv = this->private;
- ret = __inode_ctx_get (inode, this, &val);
+ ret = __afr_inode_ctx_get (this, inode, &ctx);
if (ret < 0)
return ret;
+ val = ctx->read_subvol;
+
metadatamap = (val & 0x000000000000ffff);
datamap = (val & 0x00000000ffff0000) >> 16;
event = (val & 0xffffffff00000000) >> 32;
@@ -143,9 +177,15 @@ __afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this,
uint16_t metadatamap = 0;
uint64_t val = 0;
int i = 0;
+ int ret = -1;
+ afr_inode_ctx_t *ctx = NULL;
priv = this->private;
+ ret = __afr_inode_ctx_get (this, inode, &ctx);
+ if (ret)
+ goto out;
+
for (i = 0; i < priv->child_count; i++) {
if (data[i])
datamap |= (1 << i);
@@ -157,9 +197,12 @@ __afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this,
(((uint64_t) datamap) << 16) |
(((uint64_t) event) << 32);
- return __inode_ctx_set (inode, this, &val);
-}
+ ctx->read_subvol = val;
+ ret = 0;
+out:
+ return ret;
+}
int
__afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this)
@@ -169,9 +212,13 @@ __afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this)
uint16_t metadatamap = 0;
uint32_t event = 0;
uint64_t val = 0;
+ afr_inode_ctx_t *ctx = NULL;
+
+ ret = __afr_inode_ctx_get (this, inode, &ctx);
+ if (ret)
+ return ret;
- ret = __inode_ctx_get (inode, this, &val);
- (void) ret;
+ val = ctx->read_subvol;
metadatamap = (val & 0x000000000000ffff) >> 0;
datamap = (val & 0x00000000ffff0000) >> 16;
@@ -181,7 +228,9 @@ __afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this)
(((uint64_t) datamap) << 16) |
(((uint64_t) event) << 32);
- return __inode_ctx_set (inode, this, &val);
+ ctx->read_subvol = val;
+
+ return ret;
}
@@ -205,6 +254,20 @@ __afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
return ret;
}
+int
+__afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this,
+ int *spb_choice)
+{
+ afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
+
+ ret = __afr_inode_ctx_get (this, inode, &ctx);
+ if (ret < 0)
+ return ret;
+
+ *spb_choice = ctx->spb_choice;
+ return 0;
+}
int
__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data,
@@ -224,6 +287,23 @@ __afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data
return ret;
}
+int
+__afr_inode_split_brain_choice_set (inode_t *inode, xlator_t *this,
+ int spb_choice)
+{
+ afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
+
+ ret = __afr_inode_ctx_get (this, inode, &ctx);
+ if (ret)
+ goto out;
+
+ ctx->spb_choice = spb_choice;
+
+ ret = 0;
+out:
+ return ret;
+}
int
__afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this)
@@ -258,6 +338,22 @@ afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data,
return ret;
}
+int
+afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this,
+ int *spb_choice)
+{
+ int ret = -1;
+
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_split_brain_choice_get (inode, this,
+ spb_choice);
+ }
+ UNLOCK(&inode->lock);
+
+ return ret;
+}
+
int
afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data,
@@ -275,6 +371,22 @@ afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data,
return ret;
}
+int
+afr_inode_split_brain_choice_set (inode_t *inode, xlator_t *this,
+ int spb_choice)
+{
+ int ret = -1;
+
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_split_brain_choice_set (inode, this,
+ spb_choice);
+ }
+ UNLOCK(&inode->lock);
+
+ return ret;
+}
+
int
afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this)
@@ -1220,6 +1332,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
gf_boolean_t locked_entry = _gf_false;
gf_boolean_t can_interpret = _gf_true;
inode_t *parent = NULL;
+ int spb_choice = -1;
priv = this->private;
local = frame->local;
@@ -1232,6 +1345,8 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
afr_inode_read_subvol_get (parent, this, readable, NULL, &event);
+ afr_inode_split_brain_choice_get (local->inode, this,
+ &spb_choice);
/* First, check if we have a gfid-change from somewhere,
If so, propagate that so that a fresh lookup can be
issued
@@ -1321,18 +1436,24 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
}
} else {
cant_interpret:
- if (read_subvol == -1)
- dict_del (replies[0].xdata, GF_CONTENT_KEY);
- else
- dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY);
+ if (read_subvol == -1) {
+ if (spb_choice >= 0)
+ read_subvol = spb_choice;
+ else
+ read_subvol = 0;
+ }
+ dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY);
}
afr_handle_quota_size (frame, this);
unwind:
- if (read_subvol == -1)
- read_subvol = 0;
-
+ if (read_subvol == -1) {
+ if (spb_choice >= 0)
+ read_subvol = spb_choice;
+ else
+ read_subvol = 0;
+ }
par_read_subvol = afr_get_parent_read_subvol (this, parent, replies,
readable);
@@ -1741,8 +1862,12 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
}
unwind:
- if (read_subvol == -1)
- read_subvol = 0;
+ if (read_subvol == -1) {
+ afr_inode_split_brain_choice_get (local->inode, this,
+ &read_subvol);
+ if (read_subvol == -1)
+ read_subvol = 0;
+ }
AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
local->inode, &local->replies[read_subvol].poststat,
@@ -3468,6 +3593,15 @@ out:
int
afr_forget (xlator_t *this, inode_t *inode)
{
+ uint64_t ctx_int = 0;
+ afr_inode_ctx_t *ctx = NULL;
+
+ inode_ctx_del (inode, this, &ctx_int);
+ if (!ctx_int)
+ return 0;
+
+ ctx = (afr_inode_ctx_t *)ctx_int;
+ GF_FREE (ctx);
return 0;
}
@@ -4594,8 +4728,26 @@ afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc)
}
out:
- AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL);
+ if (local->op == GF_FOP_GETXATTR)
+ AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL);
+ else if (local->op == GF_FOP_SETXATTR)
+ AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL);
if (dict)
dict_unref(dict);
return ret;
}
+
+int
+afr_get_child_index_from_name (xlator_t *this, char *name)
+{
+ afr_private_t *priv = this->private;
+ int index = -1;
+
+ for (index = 0; index < priv->child_count; index++) {
+ if (!strcmp (priv->children[index]->name, name))
+ goto out;
+ }
+ index = -1;
+out:
+ return index;
+}
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 0c96d069ae5..776933892ff 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -34,8 +34,8 @@
#include "common-utils.h"
#include "compat-errno.h"
#include "compat.h"
+#include "protocol-common.h"
-#include "afr.h"
#include "afr-transaction.h"
@@ -961,6 +961,145 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
return 0;
}
+int
+afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ char *data)
+{
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_SETXATTR;
+
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ ret = dict_set_int32 (local->xdata_req, "heal-op",
+ GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_str (local->xdata_req, "child-name", data);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ afr_heal_splitbrain_file (frame, this, loc);
+out:
+ if (ret < 0)
+ AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+ return 0;
+}
+
+int
+afr_set_split_brain_choice (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int spb_choice)
+{
+ int ret = -1;
+ int op_errno = ENOMEM;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ ret = afr_inode_split_brain_choice_set (loc->inode, this, spb_choice);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set"
+ "split-brain choice as %s for %s",
+ priv->children[spb_choice]->name,
+ loc->name);
+ }
+ inode_invalidate (loc->inode);
+ AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL);
+ return ret;
+}
+
+int
+afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len)
+{
+ int spb_child_index = -1;
+ char *spb_child_str = NULL;
+
+ spb_child_str = alloca0 (len + 1);
+ memcpy (spb_child_str, value, len);
+
+ if (!strcmp (spb_child_str, "none"))
+ return -2;
+
+ spb_child_index = afr_get_child_index_from_name (this,
+ spb_child_str);
+ if (spb_child_index < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Invalid subvol: %s",
+ spb_child_str);
+ }
+ return spb_child_index;
+}
+
+int
+afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame,
+ loc_t *loc, dict_t *dict)
+{
+ int len = 0;
+ void *value = NULL;
+ int spb_child_index = -1;
+ int ret = -1;
+ int op_errno = EINVAL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ ret = dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_CHOICE, &value,
+ &len);
+ if (value) {
+ spb_child_index = afr_get_split_brain_child_index (this, value,
+ len);
+ if (spb_child_index < 0) {
+ /* Case where value was "none" */
+ if (spb_child_index == -2)
+ spb_child_index = -1;
+ else {
+ ret = 1;
+ goto out;
+ }
+ }
+
+ afr_set_split_brain_choice (frame, this, loc,
+ spb_child_index);
+ ret = 0;
+ goto out;
+ }
+
+ ret = dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_RESOLVE, &value, &len);
+ if (value) {
+ spb_child_index = afr_get_split_brain_child_index (this, value,
+ len);
+ if (spb_child_index < 0) {
+ ret = 1;
+ goto out;
+ }
+
+ afr_split_brain_resolve_do (frame, this, loc,
+ priv->children[spb_child_index]->name);
+ ret = 0;
+ }
+out:
+ /* key was correct but value was invalid when ret == 1 */
+ if (ret == 1) {
+ AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+ ret = 0;
+ }
+ return ret;
+}
int
afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
@@ -977,6 +1116,11 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict,
op_errno, out);
+ ret = afr_handle_split_brain_commands (this, frame, loc, dict);
+
+ if (ret == 0)
+ return 0;
+
transaction_frame = copy_frame (frame);
if (!transaction_frame)
goto out;
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index ec67a20e624..eaa73d9be20 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -56,6 +56,7 @@ afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
int event_generation = 0;
inode_t *inode = NULL;
int ret = -1;
+ int spb_choice = -1;
local = frame->local;
inode = local->inode;
@@ -96,6 +97,12 @@ afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
local->read_attempted[read_subvol] = 1;
readfn:
+ if (read_subvol == -1) {
+ ret = afr_inode_split_brain_choice_get (inode, this,
+ &spb_choice);
+ if ((ret == 0) && spb_choice >= 0)
+ read_subvol = spb_choice;
+ }
local->readfn (frame, this, read_subvol);
return 0;
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 2441f413f3e..21b4c4414d9 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -389,9 +389,11 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
local = frame->local;
priv = this->private;
xdata_req = local->xdata_req;
+
ret = dict_get_int32 (xdata_req, "heal-op", &heal_op);
if (ret)
goto out;
+
for (i = 0; i < priv->child_count; i++) {
if (locked_on[i])
if (sources[i] || !sinks[i] || !healed_sinks[i]) {
@@ -468,22 +470,6 @@ out:
}
-int
-afr_get_child_index_from_name (xlator_t *this, char *name)
-{
- afr_private_t *priv = this->private;
- int index = -1;
-
- for (index = 0; index < priv->child_count; index++) {
- if (!strcmp (priv->children[index]->name, name))
- goto out;
- }
- index = -1;
-out:
- return index;
-}
-
-
gf_boolean_t
afr_does_witness_exist (xlator_t *this, uint64_t *witness)
{
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index d7d15c69845..0885b582d77 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -733,6 +733,11 @@ typedef struct _afr_local {
} afr_local_t;
+typedef struct _afr_inode_ctx {
+ uint64_t read_subvol;
+ int spb_choice;
+} afr_inode_ctx_t;
+
/* did a call fail due to a child failing? */
#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
((op_errno == ENOTCONN) || \
@@ -1026,4 +1031,13 @@ afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc);
int
afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc);
+
+int
+afr_inode_split_brain_choice_set (inode_t *inode, xlator_t *this,
+ int spb_choice);
+int
+afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this,
+ int *spb_choice);
+int
+afr_get_child_index_from_name (xlator_t *this, char *name);
#endif /* __AFR_H__ */