diff options
Diffstat (limited to 'xlators/cluster/afr/src')
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 22 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 82 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 141 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-entry.c | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-metadata.c | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-name.c | 3 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal.h | 7 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heald.c | 42 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 9 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 3 |
10 files changed, 291 insertions, 22 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 7425b6688f2..f5210bd7d5d 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1267,7 +1267,7 @@ afr_inode_refresh_do (call_frame_t *frame, xlator_t *this) return 0; } - ret = afr_xattr_req_prepare (this, xdata); + ret = afr_xattr_req_prepare (this, xdata, _gf_false); if (ret != 0) { dict_unref (xdata); afr_inode_refresh_done (frame, this, -ret); @@ -1360,7 +1360,7 @@ afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, int -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req) +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, gf_boolean_t checksum) { int i = 0; afr_private_t *priv = NULL; @@ -1397,6 +1397,19 @@ afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req) "Unable to set ancestry path key in dict "); } + if (checksum) { + ret = dict_set_int32 (xattr_req, "get-checksum", 1); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "Unable to set get-checksum in dict "); + } + ret = dict_set_int32 (xattr_req, "trusted.glusterfs.validate-status", 1); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "Unable to set validate-status in dict "); + } + } + return ret; } @@ -1415,7 +1428,7 @@ afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, if (xattr_req && (xattr_req != local->xattr_req)) dict_copy (xattr_req, local->xattr_req); - ret = afr_xattr_req_prepare (this, local->xattr_req); + ret = afr_xattr_req_prepare (this, local->xattr_req, _gf_false); ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); if (ret < 0) { @@ -5906,7 +5919,8 @@ afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode, replies = alloca0 (sizeof (*replies) * priv->child_count); - ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); + ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies, + priv->shd_validate_data); if (ret) goto out; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 800cf9705c9..5a9ab795a94 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -298,6 +298,51 @@ afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, return 0; } +static int +afr_selfheal_vstatus_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = frame->local; + + syncbarrier_wake (&local->barrier); + return 0; +} + +void +afr_selfheal_update_vstatus (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *targets, char *new_status) +{ + loc_t loc = {0, }; + dict_t *xattr; + + loc.inode = inode_ref (inode); + + xattr = dict_new (); + if (!xattr) { + gf_log (this->name, GF_LOG_WARNING, + "unable to allocate validate-status for %s", + uuid_utoa (inode->gfid)); + goto done; + } + + if (dict_set_str (xattr, "trusted.glusterfs.validate-status", + new_status) != 0) { + gf_log (this->name, GF_LOG_WARNING, + "couldn't clear validate-status for %s", + uuid_utoa (inode->gfid)); + goto done; + } + + AFR_ONLIST (targets, frame, afr_selfheal_vstatus_cbk, setxattr, + &loc, xattr, 0, NULL); + +done: + if (xattr) { + dict_unref (xattr); + } + loc_wipe (&loc); +} + void afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count) @@ -1379,7 +1424,7 @@ afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, if (xattr) dict_copy (xattr, xattr_req); - if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + if (afr_xattr_req_prepare (frame->this, xattr_req, _gf_false) != 0) { dict_destroy (xattr_req); return NULL; } @@ -1406,10 +1451,11 @@ afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, return inode; } -int +static int afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode, - uuid_t gfid, struct afr_reply *replies, - unsigned char *discover_on) + uuid_t gfid, struct afr_reply *replies, + unsigned char *discover_on, + gf_boolean_t checksum) { loc_t loc = {0, }; dict_t *xattr_req = NULL; @@ -1423,7 +1469,7 @@ afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode, if (!xattr_req) return -ENOMEM; - if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + if (afr_xattr_req_prepare (frame->this, xattr_req, checksum) != 0) { dict_destroy (xattr_req); return -ENOMEM; } @@ -1444,14 +1490,15 @@ afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode, int afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, - uuid_t gfid, struct afr_reply *replies) + uuid_t gfid, struct afr_reply *replies, + gf_boolean_t checksum) { afr_private_t *priv = NULL; priv = frame->this->private; return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies, - priv->child_up); + priv->child_up, checksum); } unsigned int @@ -1865,7 +1912,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, replies = alloca0 (sizeof (*replies) * priv->child_count); - ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); + ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies, _gf_false); if (ret) goto out; @@ -1983,6 +2030,22 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, if (data_selfheal) *data_selfheal = _gf_true; } + + if (priv->shd_validate_data && data_selfheal && !*data_selfheal) { + if (IA_ISREG (replies[i].poststat.ia_type)) { + gf_log (this->name, GF_LOG_INFO, + "forcing data self-heal on %s", + uuid_utoa (replies[i].poststat.ia_gfid)); + /* + * This will force our caller (e.g. + * afr_selfheal_do) to call afr_selfheal_data, + * even though it might otherwise think + * everything looks OK. From there, we'll do a + * more thorough inspection including checksums. + */ + *data_selfheal = _gf_true; + } + } } if (valid_cnt > 0 && link_inode) { @@ -1999,7 +2062,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, ret = 0; out: if (inode) - inode_unref (inode); + inode_unref (inode); if (replies) afr_replies_wipe (replies, priv->child_count); @@ -2140,7 +2203,6 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid) &data_selfheal, &metadata_selfheal, &entry_selfheal); - if (ret) goto out; diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index c1e945bfd82..894c8e68f25 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -587,6 +587,135 @@ out: return source; } +static int +afr_move_aside (call_frame_t *frame, xlator_t *this, inode_t *inode, int i) +{ + afr_private_t *priv = this->private; + dict_t *xattr = NULL; + int ret = -1; + loc_t loc = {0, }; + + loc.inode = inode_ref (inode); + + xattr = dict_new (); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "failed to alloc move-aside dict for %s on child %d", + uuid_utoa (inode->gfid), i); + goto done; + } + + if (dict_set_str (xattr, "trusted.move-aside", "please") != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set move-aside xattr for %s on child %d", + uuid_utoa (inode->gfid), i); + goto done; + } + + if (syncop_setxattr (priv->children[i], &loc, xattr, 0, + NULL, NULL) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to send move-aside fop for %s on child %d", + uuid_utoa (inode->gfid), i); + goto done; + } + + ret = 0; + +done: + if (xattr) { + dict_unref (xattr); + } + loc_wipe (&loc); + + return ret; +} + +static void +afr_handle_validation (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + struct afr_reply *replies) +{ + afr_private_t *priv = this->private; + uint32_t *values; + int i; + int same_as[2] = {0, 0}; + char *vstatus; + + if (!priv->shd_validate_data) { + return; + } + + values = alloca0 (sizeof (*values) * priv->child_count); + for (i = 0; i < priv->child_count; ++i) { + if (!replies[i].xdata) { + gf_log (this->name, GF_LOG_DEBUG, + "no xdata for child %d", i); + return; + } + if (dict_get_str (replies[i].xdata, + "trusted.glusterfs.validate-status", + &vstatus) != 0) { + gf_log (this->name, GF_LOG_DEBUG, + "no validate-status for child %d", i); + return; + } + if (strncmp (vstatus, "suspect", 7) != 0) { + gf_log (this->name, GF_LOG_DEBUG, + "validate-status = %s for child %d", vstatus, i); + return; + } + if (dict_get_uint32 (replies[i].xdata, "checksum", &values[i]) != 0) { + return; + } + gf_log (this->name, GF_LOG_DEBUG, + "checksum for child %d is 0x%x", i, values[i]); + } + + /* + * Let's take a shortcut here by looking only for a single odd + * man out instead of a more generalized minority. To do this, + * we only need to compare the third item onward to (at most) + * the first two, and we only need two counters. There's all + * sorts of ways we could optimize this implementation, but + * there's little left to be saved. + */ + for (i = 0; i < priv->child_count; ++i) { + same_as[0] += (values[i] == values[0]); + same_as[1] += (values[i] == values[1]); + } + if (same_as[0] == priv->child_count) { + gf_log (this->name, GF_LOG_DEBUG, "everything's OK"); + afr_selfheal_update_vstatus (frame, this, inode, + sources, "clean"); + } else if (same_as[0] == (priv->child_count - 1)) { + gf_log (this->name, GF_LOG_DEBUG, "odd man out, use 0"); + for (i = 0; i < priv->child_count; ++i) { + if (values[i] != values[0]) { + sources[i] = 0; + sinks[i] = 1; + afr_move_aside (frame, this, inode, i); + } + } + } else if (same_as[1] == (priv->child_count - 1)) { + gf_log (this->name, GF_LOG_DEBUG, "odd man out, use 1"); + for (i = 0; i < priv->child_count; ++i) { + if (values[i] != values[1]) { + sources[i] = 0; + sinks[i] = 1; + afr_move_aside (frame, this, inode, i); + } + } + } else { + gf_log (this->name, GF_LOG_WARNING, "three-way split on %s", + uuid_utoa (inode->gfid)); + for (i = 0; i < priv->child_count; ++i) { + sources[i] = 0; + sinks[i] = 1; + } + } +} + /* * __afr_selfheal_data_prepare: * @@ -612,7 +741,7 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, priv = this->private; ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid, - replies); + replies, priv->shd_validate_data); if (ret) return ret; @@ -625,6 +754,8 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, if (ret) return ret; + afr_handle_validation (frame, this, inode, sources, sinks, replies); + /* Initialize the healed_sinks[] array optimistically to the intersection of to-be-healed (i.e sinks[]) and the list of servers which are up (i.e locked_on[]). @@ -749,6 +880,14 @@ restore_time: sources, sinks, healed_sinks, undid_pending, AFR_DATA_TRANSACTION, locked_replies, data_lock); + + if (priv->shd_validate_data) { + afr_selfheal_update_vstatus (frame, this, fd->inode, + healed_sinks, "repaired"); + afr_selfheal_update_vstatus (frame, this, fd->inode, + sources, "clean"); + } + skip_undo_pending: afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0, data_lock); diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 25f8ea313aa..e0a82426a33 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -489,7 +489,7 @@ __afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, priv = this->private; ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid, - replies); + replies, _gf_false); if (ret) return ret; diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 4570ace7ef7..85dbdf2976e 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -336,7 +336,7 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i priv = this->private; ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid, - replies); + replies, _gf_false); if (ret) return ret; diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c index 8e5546a702f..db78ef81804 100644 --- a/xlators/cluster/afr/src/afr-self-heal-name.c +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -796,7 +796,8 @@ __afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *paren replies = alloca0 (priv->child_count * sizeof(*replies)); - ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies); + ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies, + _gf_false); if (ret) goto out; diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 0a3d6482ca3..b33db59b50f 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -143,7 +143,8 @@ afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, int afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, - uuid_t gfid, struct afr_reply *replies); + uuid_t gfid, struct afr_reply *replies, + gf_boolean_t checksum); inode_t * afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, @@ -165,6 +166,10 @@ int afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, afr_transaction_type type, int *dirty, int **matrix); +void +afr_selfheal_update_vstatus (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *targets, char *new_status); + int afr_sh_generic_fop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *pre, diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index 26f4a80777f..9c4f3ec81f6 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -315,8 +315,8 @@ afr_shd_selfheal_name (struct subvol_healer *healer, int child, uuid_t parent, return ret; } -int -afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid) +static int +_afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid) { int ret = 0; eh_t *eh = NULL; @@ -377,6 +377,44 @@ out: } +int +afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid) +{ + afr_private_t *priv = healer->this->private; + int ret = _afr_shd_selfheal (healer, child, gfid); + + /* + * You are not expected to understand this code. OK, sorry, it's a + * very old UNIX meme. I've been waiting years for an appropriate time + * to use it, and this seems as good as it's going to get. If it makes + * you feel any better, the reason I don't expect you to understand + * this code is that I don't understand it either and therefore can't + * explain it. + * + * What's going on here is that we only call afr_shd_zero_xattrop for a + * return value of two, which non-obviously means that no heal was + * deemed necessary. However, we made it seem necessary *only* because + * of data validation, so we skipped the part where we'd return that + * value normally. It was only later, and several layers deeper in the + * call hierarchy, that we realized everything was OK after all. + * Expecting to return a two at that point, and have it survive all the + * intervening layers, and not have any other untoward side effects, + * would require more optimism about this code than I've ever felt. + * Changing it here isn't entirely without risk either, but at least + * the side effects this way are easier to reason about. + * + * You might well wonder how the index entry ever gets removed in the + * other cases. I wonder too. Observation says that it does, and + * that's good enough. It's a big world, with many other mysteries in + * it. + */ + if (priv->shd_validate_data && (ret >= 0)) { + ret = 2; + } + + return ret; +} + void afr_shd_sweep_prepare (struct subvol_healer *healer) { diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 86f667116af..f291626fff9 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -167,6 +167,9 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("pgfid-self-heal", priv->pgfid_self_heal, options, bool, out); + GF_OPTION_RECONF ("shd-validate-data", priv->shd_validate_data, + options, bool, out); + GF_OPTION_RECONF ("data-self-heal-window-size", priv->data_self_heal_window_size, options, uint32, out); @@ -426,6 +429,8 @@ init (xlator_t *this) GF_OPTION_INIT ("pgfid-self-heal", priv->pgfid_self_heal, bool, out); + GF_OPTION_INIT ("shd-validate-data", priv->shd_validate_data, bool, out); + GF_OPTION_INIT ("background-self-heal-count", priv->background_self_heal_count, uint32, out); @@ -1112,5 +1117,9 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_BOOL, .default_value = "off", }, + { .key = {"shd-validate-data"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index b60822d0ca9..3314f865781 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -101,6 +101,7 @@ typedef struct _afr_private { unsigned char *child_up; int64_t *child_latency; gf_boolean_t pgfid_self_heal; + gf_boolean_t shd_validate_data;; unsigned char *local; char **pending_key; @@ -1101,7 +1102,7 @@ int afr_final_errno (afr_local_t *local, afr_private_t *priv); int -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req); +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, gf_boolean_t checksum); void afr_fix_open (fd_t *fd, xlator_t *this); |
