diff options
| author | Richard Wareing <rwareing@fb.com> | 2015-09-24 15:53:12 -0700 | 
|---|---|---|
| committer | Jeff Darcy <jeff@pl.atyp.us> | 2017-07-06 19:37:59 +0000 | 
| commit | 3fcf536f4983fc8a3da7c5204f3dd9b75259a7a8 (patch) | |
| tree | 2ed35204b6c6ebd318b516380d9e9707391e4253 | |
| parent | fb4445816a9775daf32b590d0158ef0271c2c866 (diff) | |
cluster/afr: Non-destructive GFID unsplit brain support for v3.6.x
Summary:
- v3.6.3 port of non-destructive GFID unsplit-brain code, almost a re-write for AFR2, but the original behavior lives on.
- This feature allows the GlusterFS filesystem to automagically resolve GFID splitbrain situations by choosing the authoritative file based on the last modification time.  Other policies such as majority or size are also possible but not implemented just yet.
- Core feature to Halo Geo-Replication, as this (gfid) form of split-brain is an everyday possibility with async mounts, so there needs to be an automated & scalable method to resolve them via the SHD or optionally in-line by FUSE clients or NFS daemons.
- Operational notes:
  1. Files or directory entries are supported, you can even write files into a directory and they will not be lost.
  2. Streamed writes to a file are fully supported while a split-brain resolution happens, i.e. the writes will not be interrupted while the unsplit takes place.
  3. Files which are determined not to be "authoritative" are renamed out of the way like so: ".unsplit_<random uuid>_<filename>"
Test Plan:
- Run prove -v tests/basic/gfid_unsplit.t
- Test output: https://phabricator.fb.com/P20041740
Reviewers: moox, dph, sshreyas
Reviewed By: sshreyas
Differential Revision: https://phabricator.fb.com/D2479409
Signature: t1:2479409:1443208319:373218aa9758a1b48db23ea5e211ec303fa92e64
Blame Revision: Change-Id: I5b3d2e79fad74b4372c02b86219e8ee98f5e29dc
Change-Id: I8ef719bcccb19ab6674647e02b72e1b36155fed9
Signed-off-by: Jeff Darcy <jdarcy@fb.com>
Reviewed-on: https://review.gluster.org/17720
Smoke: Gluster Build System <jenkins@build.gluster.org>
Tested-by: Jeff Darcy <jeff@pl.atyp.us>
Reviewed-by: Jeff Darcy <jeff@pl.atyp.us>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
| -rw-r--r-- | tests/basic/gfid_unsplit.t | 102 | ||||
| -rw-r--r-- | tests/include.rc | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 38 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-name.c | 296 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 1 | 
5 files changed, 426 insertions, 13 deletions
diff --git a/tests/basic/gfid_unsplit.t b/tests/basic/gfid_unsplit.t new file mode 100644 index 00000000000..9bb52f4533a --- /dev/null +++ b/tests/basic/gfid_unsplit.t @@ -0,0 +1,102 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +# Setup a cluster with 3 replicas, and fav child by majority on +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3}; +TEST $CLI volume set $V0 performance.stat-prefetch off +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 nfs.disable off +#EST $CLI volume set $V0 cluster.favorite-child-by-majority on +#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on +TEST $CLI volume set $V0 cluster.favorite-child-policy majority +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.entry-self-heal off +TEST $CLI volume start $V0 +sleep 5 + +# Part I: FUSE Test +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \ +  --attribute-timeout=0 --entry-timeout=0 + +dd if=/dev/urandom of=$M0/splitfile bs=128k count=5 2>/dev/null + +MD5=$(md5sum $M0/splitfile | cut -d\  -f1) + +# Create a split-brain by downing a brick, and flipping the +# gfid on the down brick, then bring the brick back up. 
+TEST kill_brick $V0 $H0 $B0/${V0}1 +GFID_DIR_B1="$B0/${V0}1/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}1/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')" +rm -rf $GFID_DIR_B1 +TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/splitfile + +GFID_DIR_B3="$B0/${V0}3/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}3/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')" +#EST rm -f $B0/${V0}3/splitfile +#m -rf $GFID_DIR_B3 + +# Restart the down brick +TEST $CLI volume start $V0 force +EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 +sleep 5 + +# Tickle the file to trigger the gfid unsplit +TEST stat $M0/splitfile +sleep 1 + +# Verify the file is readable +TEST dd if=$M0/splitfile of=/dev/null 2>/dev/null + +# Verify the MD5 signature of the file +HEALED_MD5=$(md5sum $M0/splitfile | cut -d\  -f1) +TEST [ "$MD5" == "$HEALED_MD5" ] + +# Verify the file can be removed +TEST rm -f $M0/splitfile + +# Part II: NFS test +TEST mount -t nfs -o nolock,noatime,noacl,soft,intr $H0:/$V0 $N0; + +dd if=/dev/urandom of=$N0/splitfile bs=128k count=5 2>/dev/null + +MD5=$(md5sum $N0/splitfile | cut -d\  -f1) + +# Create a split-brain by downing a brick, and flipping the +# gfid on the down brick, then bring the brick back up. 
+TEST kill_brick $V0 $H0 $B0/${V0}1 +GFID_DIR_B1="$B0/${V0}1/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}1/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')" +rm -rf $GFID_DIR_B1 +TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/splitfile + +GFID_DIR_B3="$B0/${V0}3/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}3/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')" +#EST rm -f $B0/${V0}3/splitfile +#m -rf $GFID_DIR_B3 + +# Restart the down brick +TEST $CLI volume start $V0 force +EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 +sleep 5 + +# Tickle the file to trigger the gfid unsplit +TEST stat $N0/splitfile +sleep 1 + +# Verify the file is readable +TEST dd if=$N0/splitfile of=/dev/null 2>/dev/null + +# Verify the MD5 signature of the file +HEALED_MD5=$(md5sum $N0/splitfile | cut -d\  -f1) +TEST [ "$MD5" == "$HEALED_MD5" ] + +# Verify the file can be removed +TEST rm -f $N0/splitfile + +cleanup diff --git a/tests/include.rc b/tests/include.rc index de3b90ee86b..872aec11edf 100644 --- a/tests/include.rc +++ b/tests/include.rc @@ -431,7 +431,7 @@ function cleanup()          # Prepare flags for umount          case `uname -s` in          Linux) -                flag="-l" +                flag="-l -f --no-canonicalize"                  ;;          NetBSD)                  flag="-f -R" diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 549c3c1ba71..04f16cc0122 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -664,6 +664,20 @@ out:  } +static int +replies_are_same (struct afr_reply *replies, int i, int k) +{ +        if (replies[k].poststat.ia_mtime != replies[i].poststat.ia_mtime) { +                return _gf_false; +        } +        if (replies[k].poststat.ia_size != replies[i].poststat.ia_size)  { +                return 
_gf_false; +        } + +        return gf_uuid_compare (replies[i].poststat.ia_gfid, +                                replies[k].poststat.ia_gfid) == 0; +} +  int  afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies,                          inode_t *inode) @@ -685,12 +699,8 @@ afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies,                                  replies[i].poststat.ia_size,                                  uuid_utoa (inode->gfid));                                  vote_count = 0; -                        for (k = 0; k < priv->child_count; k++) { -                                if ((replies[k].poststat.ia_mtime == -                                     replies[i].poststat.ia_mtime) && -                                    (replies[k].poststat.ia_size == -                                     replies[i].poststat.ia_size) -                                   ) { +                        for (k = 1; k < priv->child_count; k++) { +                                if (replies_are_same (replies, i, k)) {                                          vote_count++;                                  }                          } @@ -2248,3 +2258,19 @@ afr_choose_source_by_policy (afr_private_t *priv, unsigned char *sources,  out:          return source;  } + +void +afr_sh_get_source_by_policy (xlator_t *this, +                             unsigned char *sources, +                             unsigned char *healed_sinks, +                             unsigned char *locked_on, +                             struct afr_reply *replies, inode_t *inode) +{ +        int fav_child = -1; +        char *policy_str; + +        fav_child = afr_sh_get_fav_by_policy (this, replies, inode, +                                              &policy_str); +        sources[fav_child] = 1; +        healed_sinks[fav_child] = 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c index b28ce4170f1..deb3b4e59af 
100644 --- a/xlators/cluster/afr/src/afr-self-heal-name.c +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -13,6 +13,287 @@  #include "afr-self-heal.h"  #include "afr-messages.h" + + +/* + * Helper function to create the destination location for the copy + * of the directory entry we are moving out of the way. + */ +static int +_afr_sh_create_unsplit_loc (struct afr_reply *replies, const int child_idx, +                loc_t *loc, loc_t *unsplit_loc) +{ +        int     ret = 0; +        int     new_path_len = 0; +        int     new_name_len = 0; +        char    *new_path = NULL; +        char    *new_name = NULL; +        char    *tmp_gfid_str; +        const char *filename = NULL; +        uuid_t  rand_uuid; + +        tmp_gfid_str = alloca (sizeof (UUID0_STR)); + +        /* +         * All of these allocations will be cleaned up +         * @ afr_sh_gfid_unsplit_rename_done via loc_wipe. +         */ +        if (loc_copy (unsplit_loc, loc)) { +                ret = EINVAL; +                goto err; +        } + +        inode_unref (unsplit_loc->inode); +        unsplit_loc->inode = inode_new (loc->inode->table); +        unsplit_loc->parent = inode_ref (loc->parent); +        gf_uuid_copy (unsplit_loc->inode->gfid, +                      replies[child_idx].poststat.ia_gfid); +        unsplit_loc->inode->ia_type = loc->inode->ia_type; + +        gf_uuid_generate (rand_uuid); +        /* Note: Use re-entrant version of uuid_utoa! */ +        tmp_gfid_str = uuid_utoa_r (rand_uuid, tmp_gfid_str); + +        /* Copy the GFIDs, file + parent directory */ +        gf_uuid_copy (unsplit_loc->gfid, rand_uuid); +        gf_uuid_copy (unsplit_loc->pargfid, +                      replies[child_idx].postparent.ia_gfid); + +        filename = loc->name; + +        /* +         * New path: Add 11 for null + ".unsplit_" + "_". We _could_ nuke +         * tmp_gfid_str entirely here, iff we assume the uuid_utoa +         * formatting to _never_change.  
If we assume this we can just add +         * 32 to the length and call uuid_utoa directly in the snprintf. +         */ +        new_path_len = strlen (filename) + strlen (tmp_gfid_str) + 11; +        new_path = GF_CALLOC (1, new_path_len, gf_common_mt_char); +        if (!new_path) { +                ret = ENOMEM; +                goto err; +        } +        snprintf (new_path, new_path_len, ".unsplit_%s_%s", tmp_gfid_str, +                filename); +        unsplit_loc->path = new_path; + +        /* New name: Add 11 for null + ".unsplit_" + "_" */ +        new_name_len = strlen (loc->name) + strlen (tmp_gfid_str) + 11; +        new_name = GF_CALLOC (1, new_name_len, gf_common_mt_char); +        if (!new_name) { +                ret = ENOMEM; +                goto err; +        } +        snprintf (new_name, new_name_len, ".unsplit_%s_%s", tmp_gfid_str, +                loc->name); +        unsplit_loc->name = new_name; + +        return 0; +err: +        GF_FREE (new_path); +        GF_FREE (new_name); +        return ret; +} + +static int +_afr_gfid_unsplit_rename_cbk (call_frame_t *frame, void *cookie, +                xlator_t *this, int32_t op_ret, int32_t op_errno, +                struct iatt *buf, struct iatt *preoldparent, +                struct iatt *postoldparent, struct iatt *prenewparent, +                struct iatt *postnewparent, dict_t *xdata) +{ +        afr_local_t             *local = NULL; +        int                     child_index = (long) cookie; + +        local = frame->local; + +        if ((op_ret == -1) && (op_errno != ENOENT)) { +                gf_log (this->name, GF_LOG_ERROR, +                        "rename entry %s/%s failed, on child %d reason, %s", +                        uuid_utoa (local->loc.pargfid), +                        local->loc.name, child_index, strerror (op_errno)); +        } +        gf_log (this->name, GF_LOG_DEBUG, +                "GFID unsplit successful on %s/%s, on child %d", +                
uuid_utoa (local->loc.pargfid), local->loc.name, child_index); + +        syncbarrier_wake (&local->barrier); +        return 0; +} +int +__afr_selfheal_do_gfid_unsplit (xlator_t *this, unsigned char *locked_on, +                                struct afr_reply *replies, inode_t *inode, +                                loc_t *loc) +{ +        afr_local_t     *local = NULL; +        afr_private_t   *priv = NULL; +        call_frame_t   *frame    = NULL; +        loc_t           *unsplit_loc; +        int             fav_child = -1; +        unsigned char   *fav_gfid; +        unsigned int    i = 0; +        unsigned int    split_count = 0; +        unsigned char   *rename_list; +        int             ret = 0; +        char            *policy_str; + +        frame = afr_frame_create (this); + +        local = frame->local;     // Local variables for our frame +        priv = this->private;     // xlator specific variables +        rename_list = alloca0 (priv->child_count); + +        if (loc_copy (&local->loc, loc)) { +                ret = ENOMEM; +                goto out; +        } + +        /* +         * Ok, go find our favorite child by one of the active policies: +         * majority -> ctime -> mtime -> size -> predefined +         * we'll use this gfid as the "real" one. +         */ +        fav_child = afr_sh_get_fav_by_policy (this, replies, inode, +                                              &policy_str); +        if (fav_child == -1) {  /* No policies are in place, bail */ +                gf_log (this->name, GF_LOG_WARNING, "Unable to resolve GFID " +                        "split brain, there are no favorite child policies " +                        "set."); +                ret = -EIO; +                goto out; +        } +        fav_gfid = replies[fav_child].poststat.ia_gfid; +        gf_log (this->name, GF_LOG_INFO, "Using child %d to resolve gfid " +                "split-brain.  
GFID is %s.", fav_child, uuid_utoa (fav_gfid)); + +        /* Pre-compute the number of rename calls we will be doing */ +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i] && +                    !gf_uuid_is_null (replies[i].poststat.ia_gfid) && +                    gf_uuid_compare (replies[i].poststat.ia_gfid, fav_gfid)) { +                        split_count++; +                } +        } + +        gf_log (this->name, GF_LOG_INFO, "Found %d split-brained gfid's.", +                split_count); + +        local->unsplit_locs = GF_CALLOC (priv->child_count, +            sizeof (*unsplit_loc), gf_afr_mt_loc_t); +        if (!local->unsplit_locs) { +                ret = ENOMEM; +                goto out; +        } + +        afr_local_replies_wipe (local, priv); +        local->call_count = 0; +        for (i = 0; i < priv->child_count; i++) { +                unsplit_loc = &local->unsplit_locs[i]; +                if (locked_on[i] && local->child_up[i] && +                    replies[i].op_errno != ENOENT && +                    !gf_uuid_is_null (replies[i].poststat.ia_gfid) && +                    gf_uuid_compare (replies[i].poststat.ia_gfid, fav_gfid)) { +                        ret = _afr_sh_create_unsplit_loc (replies, i, +                                                          loc, unsplit_loc); +                        gf_log (this->name, GF_LOG_INFO, "Renaming child %d to " +                                " %s/%s to resolve gfid split-brain.", i, +                                uuid_utoa (unsplit_loc->pargfid), +                                unsplit_loc->name); +                        rename_list[i] = 1; +                        /* frame, rfn, cky, obj, fn, params */ +                        STACK_WIND_COOKIE (frame, +                                _afr_gfid_unsplit_rename_cbk, +                                (void *) (long) i, +                                priv->children[i], +                          
      priv->children[i]->fops->rename, +                                loc, unsplit_loc, NULL); +                        local->call_count++; +                } +        } +        syncbarrier_wait (&local->barrier, local->call_count); + +out: +        for (i = 0; i < priv->child_count; i++) { +                if (rename_list[i]) +                        loc_wipe (&local->unsplit_locs[i]); +        } +        if (frame) +                AFR_STACK_DESTROY (frame); +        return ret; +} + +int +__afr_selfheal_gfid_unsplit (xlator_t *this, inode_t *parent, uuid_t pargfid, +                             const char *bname, inode_t *inode, +                             struct afr_reply *replies, void *gfid, +                             unsigned char *locked_on) +{ +        int             ret          = 0; +        afr_private_t  *priv         = NULL; +        dict_t         *xdata        = NULL; +        loc_t           loc          = {0, }; +        call_frame_t   *new_frame    = NULL; +        afr_local_t    *new_local    = NULL; + +        priv = this->private; + +        new_frame = afr_frame_create (this); +        if (!new_frame) { +                ret = -ENOMEM; +                goto out; +        } + +        new_local = new_frame->local; + +        gf_uuid_copy (parent->gfid, pargfid); + +        xdata = dict_new (); +        if (!xdata) { +                ret = -ENOMEM; +                goto out; +        } + +        ret = dict_set_static_bin (xdata, "gfid-req", gfid, 16); +        if (ret) { +                ret = -ENOMEM; +                goto out; +        } + +        loc.parent = inode_ref (parent); +        loc.inode = inode_ref (inode); +        gf_uuid_copy (loc.pargfid, pargfid); +        loc.name = bname; + +        ret = __afr_selfheal_do_gfid_unsplit (this, locked_on, replies, +                                              inode, &loc); + +        if (ret) +                goto out; + +        /* Clear out old replies here and wind lookup on 
all locked +         * subvolumes to achieve two things: +         *   a. gfid heal on those subvolumes that do not have gfid associated +         *      with the inode, and +         *   b. refresh replies, which can be consumed by +         *      __afr_selfheal_name_impunge(). +         */ +        afr_replies_wipe (replies, priv->child_count); +        /* This sends out lookups to all bricks and blocks once we have +         * them. +         */ +        AFR_ONLIST (locked_on, new_frame, afr_selfheal_discover_cbk, lookup, +                    &loc, xdata); +        afr_replies_copy (replies, new_local->replies, priv->child_count); +out: +        loc_wipe (&loc); +        if (xdata) +                dict_unref (xdata); +        if (new_frame) +                AFR_STACK_DESTROY (new_frame); + +        return ret; +} +  int  __afr_selfheal_assign_gfid (xlator_t *this, inode_t *parent, uuid_t pargfid,                              const char *bname, inode_t *inode, @@ -429,12 +710,6 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,          if (ret)                  return ret; -        ret = afr_selfheal_name_gfid_mismatch_check (this, replies, source, -                                                     sources, &gfid_idx, -                                                     pargfid, bname); -        if (ret) -                return ret; -  	if (gfid_idx == -1) {                  if (!gfid_req || gf_uuid_is_null (gfid_req))                          return -1; @@ -443,6 +718,15 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,                  gfid = &replies[gfid_idx].poststat.ia_gfid;          } +        ret = afr_selfheal_name_gfid_mismatch_check (this, replies, source, +                                                     sources, &gfid_idx, +                                                     pargfid, bname); +        if (ret) +                ret = __afr_selfheal_gfid_unsplit (this, parent, 
pargfid, +                    bname, inode, replies, gfid, locked_on); +        if (ret) +                return ret; +          is_gfid_absent = (gfid_idx == -1) ? _gf_true : _gf_false;  	ret = __afr_selfheal_assign_gfid (this, parent, pargfid, bname, inode,                                            replies, gfid, locked_on, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index bbb444c7974..53abeaace11 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -823,6 +823,7 @@ typedef struct _afr_local {          gf_boolean_t need_full_crawl;          gf_boolean_t is_read_txn; +        loc_t           *unsplit_locs;   /* Un-split targets */  } afr_local_t;  | 
