diff options
| author | Richard Wareing <rwareing@fb.com> | 2015-09-28 20:38:50 -0700 |
|---|---|---|
| committer | Jeff Darcy <jeff@pl.atyp.us> | 2017-07-11 00:51:25 +0000 |
| commit | 87cf185bb44eaf853de0ada71d66369685c52ae3 (patch) | |
| tree | 9a02b8c4297cfd1bfd47ec9ff066e95f5cf701f7 | |
| parent | 24d3099681c4fbeb4c9d868ee58810548335833a (diff) | |
cluster/afr: GFID unsplit improvements
Summary:
- Few improvements: handle type mis-matches (e.g. dir/file mis-matches), added in an option to control whether gfid unsplits will happen, ensured entry healing will happen in the gfid mis-match case when the option is enabled.
- Added prove test to cover entry healing & type mis-match cases
- Enable metadata split-brain resolution by default
- Enable gfid split-brain resolution by default
- Fix gfid unsplit logging bugs where it was showing null GFIDs instead of the actual chosen GFIDs
Test Plan:
- run prove -v test/basic/gfid_unsplit*
- Ran valgrind to verify leak-free state
Reviewers: moox, sshreyas
Reviewed By: sshreyas
Change-Id: Id67ddc728745ebbbaf7bdd3f9a5549e5a4cc4a20
Signed-off-by: Jeff Darcy <jdarcy@fb.com>
Change-Id: I4181233f9ba7f61ccd2ba91f0874eb2ac7cd40b5
Manually-merged-by: Jeff Darcy <jdarcy@fb.com>
Signed-off-by: Jeff Darcy <jdarcy@fb.com>
Reviewed-on: https://review.gluster.org/17739
Tested-by: Jeff Darcy <jeff@pl.atyp.us>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Jeff Darcy <jeff@pl.atyp.us>
| -rw-r--r-- | tests/basic/gfid_unsplit.t | 14 | ||||
| -rw-r--r-- | tests/basic/gfid_unsplit_type_mismatch.t | 85 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 10 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-entry.c | 17 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-name.c | 158 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 13 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 1 |
7 files changed, 193 insertions, 105 deletions
diff --git a/tests/basic/gfid_unsplit.t b/tests/basic/gfid_unsplit.t index 3fe7a6f140c..0df96bd5ed6 100644 --- a/tests/basic/gfid_unsplit.t +++ b/tests/basic/gfid_unsplit.t @@ -14,6 +14,7 @@ TEST $CLI volume info; TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3}; TEST $CLI volume set $V0 performance.stat-prefetch off TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 cluster.quorum-type none TEST $CLI volume set $V0 cluster.self-heal-daemon off TEST $CLI volume set $V0 nfs.disable off #EST $CLI volume set $V0 cluster.favorite-child-by-majority on @@ -44,6 +45,15 @@ GFID_DIR_B3="$B0/${V0}3/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}3/ #EST rm -f $B0/${V0}3/splitfile #m -rf $GFID_DIR_B3 +touch $M0/newfile + +# Synthetically force a conservative merge of the directory. We want +# to ensure that conservative merges happen in-spite of GFID mis-matches, +# since we can handle them there's no sense in not doing these. In fact, +# if we stop them it will block GFID split-brain resolution. +setfattr -n trusted.afr.patchy-client-1 -v 0x000000000000000000000002 $B0/${V0}1 +setfattr -n trusted.afr.patchy-client-2 -v 0x000000000000000000000002 $B0/${V0}1 + # Restart the down brick TEST $CLI volume start $V0 force EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 @@ -56,6 +66,10 @@ sleep 1 # Verify the file is readable TEST dd if=$M0/splitfile of=/dev/null 2>/dev/null +# Verify entry healing happened on the back-end regardless of the +# gfid-splitbrain state of the directory. +TEST stat $B0/${V0}1/splitfile + # Verify the MD5 signature of the file HEALED_MD5=$(md5sum $M0/splitfile | cut -d\ -f1) TEST [ "$MD5" == "$HEALED_MD5" ] diff --git a/tests/basic/gfid_unsplit_type_mismatch.t b/tests/basic/gfid_unsplit_type_mismatch.t new file mode 100644 index 00000000000..51e6a36445b --- /dev/null +++ b/tests/basic/gfid_unsplit_type_mismatch.t @@ -0,0 +1,85 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +# Setup a cluster with 3 replicas, and fav child by majority on +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3}; +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 nfs.disable off +TEST $CLI volume set $V0 cluster.quorum-type none +TEST $CLI volume set $V0 cluster.favorite-child-policy majority +#EST $CLI volume set $V0 cluster.favorite-child-by-majority on +#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.entry-self-heal off +TEST $CLI volume start $V0 +sleep 5 + +# Part I: FUSE Test +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \ + --attribute-timeout=0 --entry-timeout=0 + +cd $M0 +dd if=/dev/urandom of=splitfile bs=128k count=5 2>/dev/null + +MD5=$(md5sum splitfile | cut -d\ -f1) + +# Create a split-brain by downing a brick, and flipping the +# gfid on the down brick, then bring the brick back up. +TEST kill_brick $V0 $H0 $B0/${V0}1 +GFID_DIR_B1="$B0/${V0}1/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}1/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')" +rm -rf $GFID_DIR_B1 +rm -fv $B0/${V0}1/splitfile + +# Now really screw the file up, by changing it's type to a directory +# not a file...the so-called "type mismatch" situation. Our test +# should prove we can un-mangle this situation using the same strategy. +mkdir $B0/${V0}1/splitfile +touch -t 199011011510 $B0/${V0}1/splitfile +TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/splitfile +cd ~ + +touch $M0/newfile + +# Synthetically force a conservative merge of the directory. We want +# to ensure that conservative merges happen in-spite of GFID mis-matches, +# since we can handle them there's no sense in not doing these. In fact, +# if we stop them it will block GFID split-brain resolution. +setfattr -n trusted.afr.patchy-client-1 -v 0x000000000000000000000002 $B0/${V0}1 +setfattr -n trusted.afr.patchy-client-2 -v 0x000000000000000000000002 $B0/${V0}1 + +# Restart the down brick +TEST $CLI volume start $V0 force +EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 +sleep 5 +cd $M0 + +# Tickle the file to trigger the gfid unsplit +TEST stat splitfile +sleep 1 + +# Verify the file is readable +TEST dd if=splitfile of=/dev/null 2>/dev/null + +# Verify entry healing happened on the back-end regardless of the +# gfid-splitbrain state of the directory. +TEST stat $B0/${V0}1/splitfile + +# Verify the MD5 signature of the file +HEALED_MD5=$(md5sum splitfile | cut -d\ -f1) +TEST [ "$MD5" == "$HEALED_MD5" ] + +# Verify the file can be removed +TEST rm -f splitfile +cd ~ + +cleanup diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 04f16cc0122..59228b1925a 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -697,7 +697,7 @@ afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies, priv->children[i]->name, replies[i].poststat.ia_mtime, replies[i].poststat.ia_size, - uuid_utoa (inode->gfid)); + uuid_utoa (replies[i].poststat.ia_gfid)); vote_count = 0; for (k = 1; k < priv->child_count; k++) { if (replies_are_same (replies, i, k)) { @@ -734,7 +734,7 @@ afr_sh_fav_by_mtime (xlator_t *this, struct afr_reply *replies, inode_t *inode) priv->children[i]->name, replies[i].poststat.ia_mtime, replies[i].poststat.ia_mtime_nsec, - uuid_utoa (inode->gfid)); + uuid_utoa (replies[i].poststat.ia_gfid)); if (replies[i].poststat.ia_mtime > cmp_mtime) { cmp_mtime = replies[i].poststat.ia_mtime; cmp_mtime_nsec = @@ -774,7 +774,7 @@ afr_sh_fav_by_ctime (xlator_t *this, struct afr_reply *replies, inode_t *inode) priv->children[i]->name, replies[i].poststat.ia_ctime, replies[i].poststat.ia_ctime_nsec, - uuid_utoa (inode->gfid)); + uuid_utoa (replies[i].poststat.ia_gfid)); if (replies[i].poststat.ia_ctime > cmp_ctime) { cmp_ctime = replies[i].poststat.ia_ctime; cmp_ctime_nsec = @@ -812,7 +812,7 @@ afr_sh_fav_by_size (xlator_t *this, struct afr_reply *replies, inode_t *inode) "file size = %lu for gfid %s", priv->children[i]->name, replies[i].poststat.ia_size, - uuid_utoa (inode->gfid)); + uuid_utoa (replies[i].poststat.ia_gfid)); if (replies[i].poststat.ia_size > cmp_sz) { cmp_sz = replies[i].poststat.ia_size; fav_child = i; @@ -911,7 +911,7 @@ afr_mark_split_brain_source_sinks_by_policy (call_frame_t *frame, "data in file (gfid:%s) by %s (%lu bytes @ %s mtime, " "%s ctime).", priv->children[fav_child]->name, - uuid_utoa (inode->gfid), + uuid_utoa (replies[fav_child].poststat.ia_gfid), policy_str, replies[fav_child].poststat.ia_size, mtime_str, diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 09f26690588..776cc9c5c21 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -299,13 +299,16 @@ __afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, } } - /* In case of a gfid or type mismatch on the entry, return -1.*/ - ret = afr_selfheal_detect_gfid_and_type_mismatch (this, replies, - fd->inode->gfid, - name, source); - - if (ret < 0) - return ret; + /* Returning EIO here isn't needed if GFID forced heal is + * enabled. + */ + if (!priv->gfid_splitbrain_forced_heal) { + /* In case of a gfid or type mismatch on the entry, return -1.*/ + ret = afr_selfheal_detect_gfid_and_type_mismatch (this, + replies, fd->inode->gfid, name, source); + if (ret < 0) + return ret; + } for (i = 0; i < priv->child_count; i++) { if (i == source || !healed_sinks[i]) diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c index bf34215ba1f..9ca56f8bd9d 100644 --- a/xlators/cluster/afr/src/afr-self-heal-name.c +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -129,13 +129,10 @@ __afr_selfheal_do_gfid_unsplit (xlator_t *this, unsigned char *locked_on, afr_private_t *priv = NULL; call_frame_t *frame = NULL; loc_t *unsplit_loc; - int fav_child = -1; - unsigned char *fav_gfid; unsigned int i = 0; unsigned int split_count = 0; unsigned char *rename_list; int ret = 0; - char *policy_str; frame = afr_frame_create (this); @@ -148,29 +145,11 @@ __afr_selfheal_do_gfid_unsplit (xlator_t *this, unsigned char *locked_on, goto out; } - /* - * Ok, go find our favorite child by one of the active policies: - * majority -> ctime -> mtime -> size -> predefined - * we'll use this gfid as the "real" one. - */ - fav_child = afr_sh_get_fav_by_policy (this, replies, inode, - &policy_str); - if (fav_child == -1) { /* No policies are in place, bail */ - gf_log (this->name, GF_LOG_WARNING, "Unable to resolve GFID " - "split brain, there are no favorite child policies " - "set."); - ret = -EIO; - goto out; - } - fav_gfid = replies[fav_child].poststat.ia_gfid; - gf_log (this->name, GF_LOG_INFO, "Using child %d to resolve gfid " - "split-brain. GFID is %s.", fav_child, uuid_utoa (fav_gfid)); - /* Pre-compute the number of rename calls we will be doing */ for (i = 0; i < priv->child_count; i++) { if (local->child_up[i] && !gf_uuid_is_null (replies[i].poststat.ia_gfid) && - gf_uuid_compare (replies[i].poststat.ia_gfid, fav_gfid)) { + gf_uuid_compare (replies[i].poststat.ia_gfid, loc->gfid)) { split_count++; } } @@ -192,7 +171,7 @@ __afr_selfheal_do_gfid_unsplit (xlator_t *this, unsigned char *locked_on, if (locked_on[i] && local->child_up[i] && replies[i].op_errno != ENOENT && !gf_uuid_is_null (replies[i].poststat.ia_gfid) && - gf_uuid_compare (replies[i].poststat.ia_gfid, fav_gfid)) { + gf_uuid_compare (replies[i].poststat.ia_gfid, loc->gfid)) { ret = _afr_sh_create_unsplit_loc (replies, i, loc, unsplit_loc); gf_log (this->name, GF_LOG_INFO, "Renaming child %d to " @@ -234,6 +213,9 @@ __afr_selfheal_gfid_unsplit (xlator_t *this, inode_t *parent, uuid_t pargfid, loc_t loc = {0, }; call_frame_t *new_frame = NULL; afr_local_t *new_local = NULL; + int fav_child = -1; + unsigned char *fav_gfid; + char *policy_str; priv = this->private; @@ -247,29 +229,48 @@ __afr_selfheal_gfid_unsplit (xlator_t *this, inode_t *parent, uuid_t pargfid, gf_uuid_copy (parent->gfid, pargfid); - xdata = dict_new (); - if (!xdata) { - ret = -ENOMEM; - goto out; - } - - ret = dict_set_static_bin (xdata, "gfid-req", gfid, 16); - if (ret) { - ret = -ENOMEM; - goto out; - } - loc.parent = inode_ref (parent); loc.inode = inode_ref (inode); gf_uuid_copy (loc.pargfid, pargfid); loc.name = bname; + /* + * Ok, go find our favorite child by one of the active policies: + * majority -> ctime -> mtime -> size -> predefined + * we'll use this gfid as the "real" one. + */ + fav_child = afr_sh_get_fav_by_policy (this, replies, inode, + &policy_str); + if (fav_child == -1) { /* No policies are in place, bail */ + gf_log (this->name, GF_LOG_WARNING, "Unable to resolve GFID " + "split brain, there are no favorite child policies " + "set."); + ret = -EIO; + goto out; + } + fav_gfid = replies[fav_child].poststat.ia_gfid; + gf_log (this->name, GF_LOG_INFO, "Using child %d to resolve gfid " + "split-brain. GFID is %s.", fav_child, uuid_utoa (fav_gfid)); + + gf_uuid_copy (loc.gfid, fav_gfid); ret = __afr_selfheal_do_gfid_unsplit (this, locked_on, replies, inode, &loc); if (ret) goto out; + xdata = dict_new (); + if (!xdata) { + ret = -ENOMEM; + goto out; + } + + ret = dict_set_static_bin (xdata, "gfid-req", fav_gfid, 16); + if (ret) { + ret = -ENOMEM; + goto out; + } + /* Clear out old replies here and wind lookup on all locked * subvolumes to achieve two things: * a. gfid heal on those subvolumes that do not have gfid associated @@ -309,6 +310,7 @@ __afr_selfheal_assign_gfid (xlator_t *this, inode_t *parent, uuid_t pargfid, loc_t loc = {0, }; call_frame_t *new_frame = NULL; afr_local_t *new_local = NULL; + int i; priv = this->private; @@ -364,6 +366,25 @@ __afr_selfheal_assign_gfid (xlator_t *this, inode_t *parent, uuid_t pargfid, * __afr_selfheal_name_impunge(). */ + gf_log (this->name, GF_LOG_INFO, + "smashing gfid to %s", uuid_utoa(gfid)); + + ia_type_t ia_type = replies[0].poststat.ia_type; + for (i = 1; i < priv->child_count; ++i) { + if (replies[i].poststat.ia_type != ia_type) { + if (replies[i].poststat.ia_type == IA_INVAL) { + continue; + } + gf_log (this->name, GF_LOG_WARNING, + "type[%d] = %d (not %d)", i, + replies[i].poststat.ia_type, ia_type); + if (ia_type != IA_INVAL) { + ret = -EIO; + goto out; + } + ia_type = replies[i].poststat.ia_type; + } + } AFR_ONLIST (locked_on, new_frame, afr_selfheal_discover_cbk, lookup, &loc, xdata); @@ -547,52 +568,6 @@ afr_selfheal_name_need_heal_check (xlator_t *this, struct afr_reply *replies) return need_heal; } -static int -afr_selfheal_name_type_mismatch_check (xlator_t *this, struct afr_reply *replies, - int source, unsigned char *sources, - uuid_t pargfid, const char *bname) -{ - int i = 0; - int type_idx = -1; - ia_type_t inode_type = IA_INVAL; - afr_private_t *priv = NULL; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid) - continue; - - if (replies[i].poststat.ia_type == IA_INVAL) - continue; - - if (inode_type == IA_INVAL) { - inode_type = replies[i].poststat.ia_type; - type_idx = i; - continue; - } - - if (sources[i] || source == -1) { - if ((sources[type_idx] || source == -1) && - (inode_type != replies[i].poststat.ia_type)) { - gf_msg (this->name, GF_LOG_WARNING, 0, - AFR_MSG_SPLIT_BRAIN, - "Type mismatch for <gfid:%s>/%s: " - "%d on %s and %d on %s", - uuid_utoa(pargfid), bname, - replies[i].poststat.ia_type, - priv->children[i]->name, - replies[type_idx].poststat.ia_type, - priv->children[type_idx]->name); - - return -EIO; - } - inode_type = replies[i].poststat.ia_type; - type_idx = i; - } - } - return 0; -} static int afr_selfheal_name_gfid_mismatch_check (xlator_t *this, struct afr_reply *replies, @@ -690,7 +665,9 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, gf_boolean_t need_heal = _gf_false; gf_boolean_t is_gfid_absent = _gf_false; gf_boolean_t tried_gfid_unsplit = _gf_false; + afr_private_t *priv = NULL; + priv = this->private; need_heal = afr_selfheal_name_need_heal_check (this, replies); if (!need_heal) return 0; @@ -706,18 +683,14 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, return ret; } - ret = afr_selfheal_name_type_mismatch_check (this, replies, source, - sources, pargfid, bname); - if (ret) - return ret; - gfid_mismatch_check: ret = afr_selfheal_name_gfid_mismatch_check (this, replies, source, sources, &gfid_idx, pargfid, bname); - if (ret && tried_gfid_unsplit == _gf_true) + if (ret && tried_gfid_unsplit) { return ret; + } if (gfid_idx == -1) { if (!gfid_req || gf_uuid_is_null (gfid_req)) @@ -727,7 +700,7 @@ gfid_mismatch_check: gfid = &replies[gfid_idx].poststat.ia_gfid; } - if (ret && tried_gfid_unsplit == _gf_false) { + if (priv->gfid_splitbrain_forced_heal || ret) { ret = __afr_selfheal_gfid_unsplit (this, parent, pargfid, bname, inode, replies, gfid, locked_on); @@ -739,11 +712,12 @@ gfid_mismatch_check: } is_gfid_absent = (gfid_idx == -1) ? _gf_true : _gf_false; - ret = __afr_selfheal_assign_gfid (this, parent, pargfid, bname, inode, - replies, gfid, locked_on, - is_gfid_absent); - if (ret) + ret = __afr_selfheal_assign_gfid (this, parent, pargfid, bname, + inode, replies, gfid, + locked_on, is_gfid_absent); + if (ret) { return ret; + } if (gfid_idx == -1) { gfid_idx = afr_selfheal_gfid_idx_get (this, replies, sources); diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 6d4b3c73429..31949bdcea4 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -143,6 +143,10 @@ reconfigure (xlator_t *this, dict_t *options) priv->metadata_splitbrain_forced_heal, options, bool, out); + GF_OPTION_RECONF ("gfid-splitbrain-forced-heal", + priv->gfid_splitbrain_forced_heal, options, bool, + out); + GF_OPTION_RECONF ("background-self-heal-count", priv->background_self_heal_count, options, uint32, out); @@ -364,6 +368,9 @@ init (xlator_t *this) GF_OPTION_INIT ("metadata-splitbrain-forced-heal", priv->metadata_splitbrain_forced_heal, bool, out); + GF_OPTION_INIT ("gfid-splitbrain-forced-heal", + priv->gfid_splitbrain_forced_heal, bool, out); + GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out); if (read_subvol) { priv->read_child = xlator_subvolume_index (this, read_subvol); @@ -956,7 +963,7 @@ struct volume_options options[] = { }, { .key = {"iam-nfs-daemon"}, .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", + .default_value = "on", .description = "This option differentiates if the replicate " "translator is running as part of an NFS daemon " "or not." @@ -1023,6 +1030,10 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_BOOL, .default_value = "off", }, + { .key = {"gfid-splitbrain-forced-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, { .key = {"heal-timeout"}, .type = GF_OPTION_TYPE_INT, .min = 5, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 53abeaace11..aa2af38e8cf 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -126,6 +126,7 @@ typedef struct _afr_private { gf_boolean_t entry_change_log; /* on/off */ gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ + gf_boolean_t gfid_splitbrain_forced_heal; /* on/off */ int read_child; /* read-subvolume */ unsigned int hash_mode; /* for when read_child is not set */ int favorite_child; /* subvolume to be preferred in resolving |
