summary refs log tree commit diff stats
path: root/xlators/cluster/afr/src
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/afr/src')
-rw-r--r-- xlators/cluster/afr/src/afr-common.c 22
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-common.c 82
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-data.c 141
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-entry.c 2
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-metadata.c 2
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-name.c 3
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal.h 7
-rw-r--r-- xlators/cluster/afr/src/afr-self-heald.c 42
-rw-r--r-- xlators/cluster/afr/src/afr.c 9
-rw-r--r-- xlators/cluster/afr/src/afr.h 3
10 files changed, 291 insertions, 22 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 7425b6688f2..f5210bd7d5d 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1267,7 +1267,7 @@ afr_inode_refresh_do (call_frame_t *frame, xlator_t *this)
return 0;
}
- ret = afr_xattr_req_prepare (this, xdata);
+ ret = afr_xattr_req_prepare (this, xdata, _gf_false);
if (ret != 0) {
dict_unref (xdata);
afr_inode_refresh_done (frame, this, -ret);
@@ -1360,7 +1360,7 @@ afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
int
-afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req)
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, gf_boolean_t checksum)
{
int i = 0;
afr_private_t *priv = NULL;
@@ -1397,6 +1397,19 @@ afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req)
"Unable to set ancestry path key in dict ");
}
+ if (checksum) {
+ ret = dict_set_int32 (xattr_req, "get-checksum", 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Unable to set get-checksum in dict ");
+ }
+ ret = dict_set_int32 (xattr_req, "trusted.glusterfs.validate-status", 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Unable to set validate-status in dict ");
+ }
+ }
+
return ret;
}
@@ -1415,7 +1428,7 @@ afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this,
if (xattr_req && (xattr_req != local->xattr_req))
dict_copy (xattr_req, local->xattr_req);
- ret = afr_xattr_req_prepare (this, local->xattr_req);
+ ret = afr_xattr_req_prepare (this, local->xattr_req, _gf_false);
ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
if (ret < 0) {
@@ -5906,7 +5919,8 @@ afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode,
replies = alloca0 (sizeof (*replies) * priv->child_count);
- ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies);
+ ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies,
+ priv->shd_validate_data);
if (ret)
goto out;
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 800cf9705c9..5a9ab795a94 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -298,6 +298,51 @@ afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
return 0;
}
+/*
+ * Completion callback for the validate-status setxattr issued by
+ * afr_selfheal_update_vstatus().
+ *
+ * The update is best-effort, so a failure is only logged (op_ret < 0 is
+ * assumed to follow the usual fop convention for failure).  The barrier in
+ * the frame's afr_local_t is woken unconditionally, whatever the outcome.
+ *
+ * Always returns 0.
+ */
+static int
+afr_selfheal_vstatus_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int op_ret, int op_errno, dict_t *xdata)
+{
+        afr_local_t *local = frame->local;
+
+        if (op_ret < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "validate-status update failed (errno %d)",
+                        op_errno);
+        }
+
+        syncbarrier_wake (&local->barrier);
+        return 0;
+}
+
+/*
+ * afr_selfheal_update_vstatus:
+ *
+ * Set the "trusted.glusterfs.validate-status" xattr of @inode to
+ * @new_status on the children selected by @targets (a per-child flag array
+ * handed to AFR_ONLIST).
+ *
+ * This is best-effort: allocation and dict failures are logged and the
+ * function returns without sending anything.  Nothing is reported back to
+ * the caller.
+ */
+void
+afr_selfheal_update_vstatus (call_frame_t *frame, xlator_t *this, inode_t *inode,
+                             unsigned char *targets, char *new_status)
+{
+        loc_t   loc   = {0, };
+        dict_t *xattr = NULL;
+
+        /* Reference dropped again by loc_wipe() at the end. */
+        loc.inode = inode_ref (inode);
+
+        xattr = dict_new ();
+        if (!xattr) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "unable to allocate validate-status for %s",
+                        uuid_utoa (inode->gfid));
+                goto done;
+        }
+
+        if (dict_set_str (xattr, "trusted.glusterfs.validate-status",
+                          new_status) != 0) {
+                /* The status being set is arbitrary, not only a "clear". */
+                gf_log (this->name, GF_LOG_WARNING,
+                        "couldn't set validate-status to %s for %s",
+                        new_status, uuid_utoa (inode->gfid));
+                goto done;
+        }
+
+        AFR_ONLIST (targets, frame, afr_selfheal_vstatus_cbk, setxattr,
+                    &loc, xattr, 0, NULL);
+
+done:
+        if (xattr) {
+                dict_unref (xattr);
+        }
+        loc_wipe (&loc);
+}
+
void
afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count)
@@ -1379,7 +1424,7 @@ afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
if (xattr)
dict_copy (xattr, xattr_req);
- if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) {
+ if (afr_xattr_req_prepare (frame->this, xattr_req, _gf_false) != 0) {
dict_destroy (xattr_req);
return NULL;
}
@@ -1406,10 +1451,11 @@ afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
return inode;
}
-int
+static int
afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode,
- uuid_t gfid, struct afr_reply *replies,
- unsigned char *discover_on)
+ uuid_t gfid, struct afr_reply *replies,
+ unsigned char *discover_on,
+ gf_boolean_t checksum)
{
loc_t loc = {0, };
dict_t *xattr_req = NULL;
@@ -1423,7 +1469,7 @@ afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode,
if (!xattr_req)
return -ENOMEM;
- if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) {
+ if (afr_xattr_req_prepare (frame->this, xattr_req, checksum) != 0) {
dict_destroy (xattr_req);
return -ENOMEM;
}
@@ -1444,14 +1490,15 @@ afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode,
int
afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
- uuid_t gfid, struct afr_reply *replies)
+ uuid_t gfid, struct afr_reply *replies,
+ gf_boolean_t checksum)
{
afr_private_t *priv = NULL;
priv = frame->this->private;
return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies,
- priv->child_up);
+ priv->child_up, checksum);
}
unsigned int
@@ -1865,7 +1912,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
replies = alloca0 (sizeof (*replies) * priv->child_count);
- ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies);
+ ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies, _gf_false);
if (ret)
goto out;
@@ -1983,6 +2030,22 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
if (data_selfheal)
*data_selfheal = _gf_true;
}
+
+ if (priv->shd_validate_data && data_selfheal && !*data_selfheal) {
+ if (IA_ISREG (replies[i].poststat.ia_type)) {
+ gf_log (this->name, GF_LOG_INFO,
+ "forcing data self-heal on %s",
+ uuid_utoa (replies[i].poststat.ia_gfid));
+ /*
+ * This will force our caller (e.g.
+ * afr_selfheal_do) to call afr_selfheal_data,
+ * even though it might otherwise think
+ * everything looks OK. From there, we'll do a
+ * more thorough inspection including checksums.
+ */
+ *data_selfheal = _gf_true;
+ }
+ }
}
if (valid_cnt > 0 && link_inode) {
@@ -1999,7 +2062,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
ret = 0;
out:
if (inode)
- inode_unref (inode);
+ inode_unref (inode);
if (replies)
afr_replies_wipe (replies, priv->child_count);
@@ -2140,7 +2203,6 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
&data_selfheal,
&metadata_selfheal,
&entry_selfheal);
-
if (ret)
goto out;
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index c1e945bfd82..894c8e68f25 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -587,6 +587,135 @@ out:
return source;
}
+/*
+ * afr_move_aside:
+ *
+ * Send a setxattr carrying "trusted.move-aside" = "please" for @inode to
+ * child number @i only.
+ *
+ * Returns 0 when the setxattr succeeded, -1 on any failure (each failure is
+ * logged at ERROR level).
+ */
+static int
+afr_move_aside (call_frame_t *frame, xlator_t *this, inode_t *inode, int i)
+{
+        afr_private_t *conf   = this->private;
+        dict_t        *req    = NULL;
+        loc_t          target = {0, };
+        int            result = -1;
+
+        target.inode = inode_ref (inode);
+        req = dict_new ();
+
+        /* Each step runs only if the previous one succeeded. */
+        if (req == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to alloc move-aside dict for %s on child %d",
+                        uuid_utoa (inode->gfid), i);
+        } else if (dict_set_str (req, "trusted.move-aside", "please") != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set move-aside xattr for %s on child %d",
+                        uuid_utoa (inode->gfid), i);
+        } else if (syncop_setxattr (conf->children[i], &target, req, 0,
+                                    NULL, NULL) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to send move-aside fop for %s on child %d",
+                        uuid_utoa (inode->gfid), i);
+        } else {
+                result = 0;
+        }
+
+        if (req != NULL) {
+                dict_unref (req);
+        }
+        loc_wipe (&target);
+
+        return result;
+}
+
+/*
+ * Demote every child whose checksum differs from @good: clear it in
+ * @sources, set it in @sinks and send it the move-aside request.
+ */
+static void
+afr_validation_demote (call_frame_t *frame, xlator_t *this, inode_t *inode,
+                       unsigned char *sources, unsigned char *sinks,
+                       uint32_t *values, uint32_t good)
+{
+        afr_private_t *priv = this->private;
+        int            i    = 0;
+
+        for (i = 0; i < priv->child_count; ++i) {
+                if (values[i] == good) {
+                        continue;
+                }
+                sources[i] = 0;
+                sinks[i] = 1;
+                /* Best-effort: afr_move_aside logs its own failures. */
+                (void) afr_move_aside (frame, this, inode, i);
+        }
+}
+
+/*
+ * afr_handle_validation:
+ *
+ * When the shd-validate-data option is on, compare the per-child checksums
+ * returned in @replies and adjust @sources / @sinks accordingly.
+ *
+ * Validation only proceeds if EVERY child's reply has xdata, a
+ * "trusted.glusterfs.validate-status" value starting with "suspect", and a
+ * "checksum" value; otherwise the arrays are left untouched.
+ *
+ * Outcomes:
+ *   - all checksums equal: validate-status is set to "clean" on @sources;
+ *   - exactly one child disagrees with child 0 (or, failing that, with
+ *     child 1): the disagreeing child(ren) become sinks and get a
+ *     move-aside request;
+ *   - anything else: every child is marked as a sink and none as a source.
+ *
+ * NOTE(review): with exactly two children a mismatch satisfies the
+ * "use 0" test (1 == child_count - 1), so child 0 is trusted without any
+ * majority -- confirm this is intended.
+ */
+static void
+afr_handle_validation (call_frame_t *frame, xlator_t *this, inode_t *inode,
+                       unsigned char *sources, unsigned char *sinks,
+                       struct afr_reply *replies)
+{
+        afr_private_t *priv       = this->private;
+        uint32_t      *values     = NULL;
+        int            i          = 0;
+        int            same_as[2] = {0, 0};
+        char          *vstatus    = NULL;
+
+        if (!priv->shd_validate_data) {
+                return;
+        }
+
+        values = alloca0 (sizeof (*values) * priv->child_count);
+        for (i = 0; i < priv->child_count; ++i) {
+                if (!replies[i].xdata) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "no xdata for child %d", i);
+                        return;
+                }
+                if (dict_get_str (replies[i].xdata,
+                                  "trusted.glusterfs.validate-status",
+                                  &vstatus) != 0) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "no validate-status for child %d", i);
+                        return;
+                }
+                if (strncmp (vstatus, "suspect", 7) != 0) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "validate-status = %s for child %d",
+                                vstatus, i);
+                        return;
+                }
+                if (dict_get_uint32 (replies[i].xdata, "checksum",
+                                     &values[i]) != 0) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "no checksum for child %d", i);
+                        return;
+                }
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "checksum for child %d is 0x%x", i, values[i]);
+        }
+
+        /*
+         * Shortcut: look only for a single odd man out rather than a
+         * generalized minority.  Counting how many children match child 0
+         * and how many match child 1 is enough for that.
+         */
+        for (i = 0; i < priv->child_count; ++i) {
+                same_as[0] += (values[i] == values[0]);
+                /* values[1] exists only when there are two or more children */
+                if (priv->child_count > 1) {
+                        same_as[1] += (values[i] == values[1]);
+                }
+        }
+
+        if (same_as[0] == priv->child_count) {
+                gf_log (this->name, GF_LOG_DEBUG, "everything's OK");
+                afr_selfheal_update_vstatus (frame, this, inode,
+                                             sources, "clean");
+        } else if (same_as[0] == (priv->child_count - 1)) {
+                gf_log (this->name, GF_LOG_DEBUG, "odd man out, use 0");
+                afr_validation_demote (frame, this, inode, sources, sinks,
+                                       values, values[0]);
+        } else if (same_as[1] == (priv->child_count - 1)) {
+                gf_log (this->name, GF_LOG_DEBUG, "odd man out, use 1");
+                afr_validation_demote (frame, this, inode, sources, sinks,
+                                       values, values[1]);
+        } else {
+                gf_log (this->name, GF_LOG_WARNING, "three-way split on %s",
+                        uuid_utoa (inode->gfid));
+                /* No trustworthy copy: leave no sources at all. */
+                for (i = 0; i < priv->child_count; ++i) {
+                        sources[i] = 0;
+                        sinks[i] = 1;
+                }
+        }
+}
+
/*
* __afr_selfheal_data_prepare:
*
@@ -612,7 +741,7 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
priv = this->private;
ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
- replies);
+ replies, priv->shd_validate_data);
if (ret)
return ret;
@@ -625,6 +754,8 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
if (ret)
return ret;
+ afr_handle_validation (frame, this, inode, sources, sinks, replies);
+
/* Initialize the healed_sinks[] array optimistically to
the intersection of to-be-healed (i.e sinks[]) and
the list of servers which are up (i.e locked_on[]).
@@ -749,6 +880,14 @@ restore_time:
sources, sinks, healed_sinks,
undid_pending, AFR_DATA_TRANSACTION,
locked_replies, data_lock);
+
+ if (priv->shd_validate_data) {
+ afr_selfheal_update_vstatus (frame, this, fd->inode,
+ healed_sinks, "repaired");
+ afr_selfheal_update_vstatus (frame, this, fd->inode,
+ sources, "clean");
+ }
+
skip_undo_pending:
afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0,
data_lock);
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index 25f8ea313aa..e0a82426a33 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -489,7 +489,7 @@ __afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this,
priv = this->private;
ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
- replies);
+ replies, _gf_false);
if (ret)
return ret;
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 4570ace7ef7..85dbdf2976e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -336,7 +336,7 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i
priv = this->private;
ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
- replies);
+ replies, _gf_false);
if (ret)
return ret;
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index 8e5546a702f..db78ef81804 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -796,7 +796,8 @@ __afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *paren
replies = alloca0 (priv->child_count * sizeof(*replies));
- ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies);
+ ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies,
+ _gf_false);
if (ret)
goto out;
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 0a3d6482ca3..b33db59b50f 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -143,7 +143,8 @@ afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
int
afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
- uuid_t gfid, struct afr_reply *replies);
+ uuid_t gfid, struct afr_reply *replies,
+ gf_boolean_t checksum);
inode_t *
afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
@@ -165,6 +166,10 @@ int
afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
afr_transaction_type type, int *dirty, int **matrix);
+void
+afr_selfheal_update_vstatus (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *targets, char *new_status);
+
int
afr_sh_generic_fop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *pre,
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 26f4a80777f..9c4f3ec81f6 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -315,8 +315,8 @@ afr_shd_selfheal_name (struct subvol_healer *healer, int child, uuid_t parent,
return ret;
}
-int
-afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid)
+static int
+_afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid)
{
int ret = 0;
eh_t *eh = NULL;
@@ -377,6 +377,44 @@ out:
}
+/*
+ * afr_shd_selfheal:
+ *
+ * Wrapper around _afr_shd_selfheal() that adjusts the return value when
+ * the shd-validate-data option is enabled.
+ *
+ * According to the original author's note, a return value of 2 is the one
+ * for which afr_shd_zero_xattrop gets called, and it means "no heal was
+ * deemed necessary".  With data validation on, a data heal is forced just
+ * to run the checksum comparison, so the lower layers never produce that
+ * value themselves even when everything turns out to be fine.  Rather
+ * than thread a 2 back up through all the intervening layers, every
+ * non-negative result is rewritten to 2 here.
+ *
+ * NOTE(review): how the index entry is removed in the other cases was not
+ * understood by the original author either -- verify before relying on it.
+ *
+ * Negative (error) results are passed through untouched.
+ */
+int
+afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid)
+{
+        afr_private_t *priv   = healer->this->private;
+        int            result = _afr_shd_selfheal (healer, child, gfid);
+
+        if (!priv->shd_validate_data || (result < 0)) {
+                return result;
+        }
+
+        return 2;
+}
+
void
afr_shd_sweep_prepare (struct subvol_healer *healer)
{
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 86f667116af..f291626fff9 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -167,6 +167,9 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("pgfid-self-heal", priv->pgfid_self_heal,
options, bool, out);
+ GF_OPTION_RECONF ("shd-validate-data", priv->shd_validate_data,
+ options, bool, out);
+
GF_OPTION_RECONF ("data-self-heal-window-size",
priv->data_self_heal_window_size, options,
uint32, out);
@@ -426,6 +429,8 @@ init (xlator_t *this)
GF_OPTION_INIT ("pgfid-self-heal", priv->pgfid_self_heal, bool, out);
+ GF_OPTION_INIT ("shd-validate-data", priv->shd_validate_data, bool, out);
+
GF_OPTION_INIT ("background-self-heal-count",
priv->background_self_heal_count, uint32, out);
@@ -1112,5 +1117,9 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
},
+ { .key = {"shd-validate-data"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index b60822d0ca9..3314f865781 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -101,6 +101,7 @@ typedef struct _afr_private {
unsigned char *child_up;
int64_t *child_latency;
gf_boolean_t pgfid_self_heal;
+ gf_boolean_t shd_validate_data; /* bound to the "shd-validate-data" option */
unsigned char *local;
char **pending_key;
@@ -1101,7 +1102,7 @@ int
afr_final_errno (afr_local_t *local, afr_private_t *priv);
int
-afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req);
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, gf_boolean_t checksum);
void
afr_fix_open (fd_t *fd, xlator_t *this);