diff options
| -rw-r--r-- | tests/basic/gfid_unsplit.t | 102 | ||||
| -rw-r--r-- | tests/include.rc | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 38 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-name.c | 296 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 1 | 
5 files changed, 426 insertions, 13 deletions
diff --git a/tests/basic/gfid_unsplit.t b/tests/basic/gfid_unsplit.t new file mode 100644 index 00000000000..9bb52f4533a --- /dev/null +++ b/tests/basic/gfid_unsplit.t @@ -0,0 +1,102 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +# Setup a cluster with 3 replicas, and fav child by majority on +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3}; +TEST $CLI volume set $V0 performance.stat-prefetch off +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 nfs.disable off +#EST $CLI volume set $V0 cluster.favorite-child-by-majority on +#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on +TEST $CLI volume set $V0 cluster.favorite-child-policy majority +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.entry-self-heal off +TEST $CLI volume start $V0 +sleep 5 + +# Part I: FUSE Test +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \ +  --attribute-timeout=0 --entry-timeout=0 + +dd if=/dev/urandom of=$M0/splitfile bs=128k count=5 2>/dev/null + +MD5=$(md5sum $M0/splitfile | cut -d\  -f1) + +# Create a split-brain by downing a brick, and flipping the +# gfid on the down brick, then bring the brick back up. +TEST kill_brick $V0 $H0 $B0/${V0}1 +GFID_DIR_B1="$B0/${V0}1/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}1/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')" +rm -rf $GFID_DIR_B1 +TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/splitfile + +GFID_DIR_B3="$B0/${V0}3/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}3/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')" +#EST rm -f $B0/${V0}3/splitfile +#m -rf $GFID_DIR_B3 + +# Restart the down brick +TEST $CLI volume start $V0 force +EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 +sleep 5 + +# Tickle the file to trigger the gfid unsplit +TEST stat $M0/splitfile +sleep 1 + +# Verify the file is readable +TEST dd if=$M0/splitfile of=/dev/null 2>/dev/null + +# Verify the MD5 signature of the file +HEALED_MD5=$(md5sum $M0/splitfile | cut -d\  -f1) +TEST [ "$MD5" == "$HEALED_MD5" ] + +# Verify the file can be removed +TEST rm -f $M0/splitfile + +# Part II: NFS test +TEST mount -t nfs -o nolock,noatime,noacl,soft,intr $H0:/$V0 $N0; + +dd if=/dev/urandom of=$N0/splitfile bs=128k count=5 2>/dev/null + +MD5=$(md5sum $N0/splitfile | cut -d\  -f1) + +# Create a split-brain by downing a brick, and flipping the +# gfid on the down brick, then bring the brick back up. +TEST kill_brick $V0 $H0 $B0/${V0}1 +GFID_DIR_B1="$B0/${V0}1/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}1/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')" +rm -rf $GFID_DIR_B1 +TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/splitfile + +GFID_DIR_B3="$B0/${V0}3/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}3/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')" +#EST rm -f $B0/${V0}3/splitfile +#m -rf $GFID_DIR_B3 + +# Restart the down brick +TEST $CLI volume start $V0 force +EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 +sleep 5 + +# Tickle the file to trigger the gfid unsplit +TEST stat $N0/splitfile +sleep 1 + +# Verify the file is readable +TEST dd if=$N0/splitfile of=/dev/null 2>/dev/null + +# Verify the MD5 signature of the file +HEALED_MD5=$(md5sum $N0/splitfile | cut -d\  -f1) +TEST [ "$MD5" == "$HEALED_MD5" ] + +# Verify the file can be removed +TEST rm -f $N0/splitfile + +cleanup diff --git a/tests/include.rc b/tests/include.rc index de3b90ee86b..872aec11edf 100644 --- a/tests/include.rc +++ b/tests/include.rc @@ -431,7 +431,7 @@ function cleanup()          # Prepare flags for umount          case `uname -s` in          Linux) -                flag="-l" +                flag="-l -f --no-canonicalize"                  ;;          NetBSD)                  flag="-f -R" diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 549c3c1ba71..04f16cc0122 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -664,6 +664,20 @@ out:  } +static int +replies_are_same (struct afr_reply *replies, int i, int k) +{ +        if (replies[k].poststat.ia_mtime != replies[i].poststat.ia_mtime) { +                return _gf_false; +        } +        if (replies[k].poststat.ia_size != replies[i].poststat.ia_size)  { +                return _gf_false; +        } + +        return gf_uuid_compare (replies[i].poststat.ia_gfid, +                                replies[k].poststat.ia_gfid) == 0; +} +  int  afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies,                          inode_t *inode) @@ -685,12 +699,8 @@ afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies,                                  replies[i].poststat.ia_size,                                  uuid_utoa (inode->gfid));                                  vote_count = 0; -                        for (k = 0; k < priv->child_count; k++) { -                                if ((replies[k].poststat.ia_mtime == -                                     replies[i].poststat.ia_mtime) && -                                    (replies[k].poststat.ia_size == -                                     replies[i].poststat.ia_size) -                                   ) { +                        for (k = 1; k < priv->child_count; k++) { +                                if (replies_are_same (replies, i, k)) {                                          vote_count++;                                  }                          } @@ -2248,3 +2258,19 @@ afr_choose_source_by_policy (afr_private_t *priv, unsigned char *sources,  out:          return source;  } + +void +afr_sh_get_source_by_policy (xlator_t *this, +                             unsigned char *sources, +                             unsigned char *healed_sinks, +                             unsigned char *locked_on, +                             struct afr_reply *replies, inode_t *inode) +{ +        int fav_child = -1; +        char *policy_str; + +        fav_child = afr_sh_get_fav_by_policy (this, replies, inode, +                                              &policy_str); +        sources[fav_child] = 1; +        healed_sinks[fav_child] = 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c index b28ce4170f1..deb3b4e59af 100644 --- a/xlators/cluster/afr/src/afr-self-heal-name.c +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -13,6 +13,287 @@  #include "afr-self-heal.h"  #include "afr-messages.h" + + +/* + * Helper function to create the destination location for the copy + * of the directory entry we are moving out of the way. + */ +static int +_afr_sh_create_unsplit_loc (struct afr_reply *replies, const int child_idx, +                loc_t *loc, loc_t *unsplit_loc) +{ +        int     ret = 0; +        int     new_path_len = 0; +        int     new_name_len = 0; +        char    *new_path = NULL; +        char    *new_name = NULL; +        char    *tmp_gfid_str; +        const char *filename = NULL; +        uuid_t  rand_uuid; + +        tmp_gfid_str = alloca (sizeof (UUID0_STR)); + +        /* +         * All of these allocations will be cleaned up +         * @ afr_sh_gfid_unsplit_rename_done via loc_wipe. +         */ +        if (loc_copy (unsplit_loc, loc)) { +                ret = EINVAL; +                goto err; +        } + +        inode_unref (unsplit_loc->inode); +        unsplit_loc->inode = inode_new (loc->inode->table); +        unsplit_loc->parent = inode_ref (loc->parent); +        gf_uuid_copy (unsplit_loc->inode->gfid, +                      replies[child_idx].poststat.ia_gfid); +        unsplit_loc->inode->ia_type = loc->inode->ia_type; + +        gf_uuid_generate (rand_uuid); +        /* Note: Use re-entrant version of uuid_utoa! */ +        tmp_gfid_str = uuid_utoa_r (rand_uuid, tmp_gfid_str); + +        /* Copy the GFIDs, file + parent directory */ +        gf_uuid_copy (unsplit_loc->gfid, rand_uuid); +        gf_uuid_copy (unsplit_loc->pargfid, +                      replies[child_idx].postparent.ia_gfid); + +        filename = loc->name; + +        /* +         * New path: Add 11 for null + ".unsplit_" + "_". We _could_ nuke +         * tmp_gfid_str entirely here, iff we assume the uuid_utoa +         * formatting to _never_change.  If we assume this we can just add +         * 32 to the length and call uuid_utoa directly in the snprintf. +         */ +        new_path_len = strlen (filename) + strlen (tmp_gfid_str) + 11; +        new_path = GF_CALLOC (1, new_path_len, gf_common_mt_char); +        if (!new_path) { +                ret = ENOMEM; +                goto err; +        } +        snprintf (new_path, new_path_len, ".unsplit_%s_%s", tmp_gfid_str, +                filename); +        unsplit_loc->path = new_path; + +        /* New name: Add 11 for null + ".unsplit_" + "_" */ +        new_name_len = strlen (loc->name) + strlen (tmp_gfid_str) + 11; +        new_name = GF_CALLOC (1, new_name_len, gf_common_mt_char); +        if (!new_name) { +                ret = ENOMEM; +                goto err; +        } +        snprintf (new_name, new_name_len, ".unsplit_%s_%s", tmp_gfid_str, +                loc->name); +        unsplit_loc->name = new_name; + +        return 0; +err: +        GF_FREE (new_path); +        GF_FREE (new_name); +        return ret; +} + +static int +_afr_gfid_unsplit_rename_cbk (call_frame_t *frame, void *cookie, +                xlator_t *this, int32_t op_ret, int32_t op_errno, +                struct iatt *buf, struct iatt *preoldparent, +                struct iatt *postoldparent, struct iatt *prenewparent, +                struct iatt *postnewparent, dict_t *xdata) +{ +        afr_local_t             *local = NULL; +        int                     child_index = (long) cookie; + +        local = frame->local; + +        if ((op_ret == -1) && (op_errno != ENOENT)) { +                gf_log (this->name, GF_LOG_ERROR, +                        "rename entry %s/%s failed, on child %d reason, %s", +                        uuid_utoa (local->loc.pargfid), +                        local->loc.name, child_index, strerror (op_errno)); +        } +        gf_log (this->name, GF_LOG_DEBUG, +                "GFID unsplit successful on %s/%s, on child %d", +                uuid_utoa (local->loc.pargfid), local->loc.name, child_index); + +        syncbarrier_wake (&local->barrier); +        return 0; +} +int +__afr_selfheal_do_gfid_unsplit (xlator_t *this, unsigned char *locked_on, +                                struct afr_reply *replies, inode_t *inode, +                                loc_t *loc) +{ +        afr_local_t     *local = NULL; +        afr_private_t   *priv = NULL; +        call_frame_t   *frame    = NULL; +        loc_t           *unsplit_loc; +        int             fav_child = -1; +        unsigned char   *fav_gfid; +        unsigned int    i = 0; +        unsigned int    split_count = 0; +        unsigned char   *rename_list; +        int             ret = 0; +        char            *policy_str; + +        frame = afr_frame_create (this); + +        local = frame->local;     // Local variables for our frame +        priv = this->private;     // xlator specific variables +        rename_list = alloca0 (priv->child_count); + +        if (loc_copy (&local->loc, loc)) { +                ret = ENOMEM; +                goto out; +        } + +        /* +         * Ok, go find our favorite child by one of the active policies: +         * majority -> ctime -> mtime -> size -> predefined +         * we'll use this gfid as the "real" one. +         */ +        fav_child = afr_sh_get_fav_by_policy (this, replies, inode, +                                              &policy_str); +        if (fav_child == -1) {  /* No policies are in place, bail */ +                gf_log (this->name, GF_LOG_WARNING, "Unable to resolve GFID " +                        "split brain, there are no favorite child policies " +                        "set."); +                ret = -EIO; +                goto out; +        } +        fav_gfid = replies[fav_child].poststat.ia_gfid; +        gf_log (this->name, GF_LOG_INFO, "Using child %d to resolve gfid " +                "split-brain.  GFID is %s.", fav_child, uuid_utoa (fav_gfid)); + +        /* Pre-compute the number of rename calls we will be doing */ +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i] && +                    !gf_uuid_is_null (replies[i].poststat.ia_gfid) && +                    gf_uuid_compare (replies[i].poststat.ia_gfid, fav_gfid)) { +                        split_count++; +                } +        } + +        gf_log (this->name, GF_LOG_INFO, "Found %d split-brained gfid's.", +                split_count); + +        local->unsplit_locs = GF_CALLOC (priv->child_count, +            sizeof (*unsplit_loc), gf_afr_mt_loc_t); +        if (!local->unsplit_locs) { +                ret = ENOMEM; +                goto out; +        } + +        afr_local_replies_wipe (local, priv); +        local->call_count = 0; +        for (i = 0; i < priv->child_count; i++) { +                unsplit_loc = &local->unsplit_locs[i]; +                if (locked_on[i] && local->child_up[i] && +                    replies[i].op_errno != ENOENT && +                    !gf_uuid_is_null (replies[i].poststat.ia_gfid) && +                    gf_uuid_compare (replies[i].poststat.ia_gfid, fav_gfid)) { +                        ret = _afr_sh_create_unsplit_loc (replies, i, +                                                          loc, unsplit_loc); +                        gf_log (this->name, GF_LOG_INFO, "Renaming child %d to " +                                " %s/%s to resolve gfid split-brain.", i, +                                uuid_utoa (unsplit_loc->pargfid), +                                unsplit_loc->name); +                        rename_list[i] = 1; +                        /* frame, rfn, cky, obj, fn, params */ +                        STACK_WIND_COOKIE (frame, +                                _afr_gfid_unsplit_rename_cbk, +                                (void *) (long) i, +                                priv->children[i], +                                priv->children[i]->fops->rename, +                                loc, unsplit_loc, NULL); +                        local->call_count++; +                } +        } +        syncbarrier_wait (&local->barrier, local->call_count); + +out: +        for (i = 0; i < priv->child_count; i++) { +                if (rename_list[i]) +                        loc_wipe (&local->unsplit_locs[i]); +        } +        if (frame) +                AFR_STACK_DESTROY (frame); +        return ret; +} + +int +__afr_selfheal_gfid_unsplit (xlator_t *this, inode_t *parent, uuid_t pargfid, +                             const char *bname, inode_t *inode, +                             struct afr_reply *replies, void *gfid, +                             unsigned char *locked_on) +{ +        int             ret          = 0; +        afr_private_t  *priv         = NULL; +        dict_t         *xdata        = NULL; +        loc_t           loc          = {0, }; +        call_frame_t   *new_frame    = NULL; +        afr_local_t    *new_local    = NULL; + +        priv = this->private; + +        new_frame = afr_frame_create (this); +        if (!new_frame) { +                ret = -ENOMEM; +                goto out; +        } + +        new_local = new_frame->local; + +        gf_uuid_copy (parent->gfid, pargfid); + +        xdata = dict_new (); +        if (!xdata) { +                ret = -ENOMEM; +                goto out; +        } + +        ret = dict_set_static_bin (xdata, "gfid-req", gfid, 16); +        if (ret) { +                ret = -ENOMEM; +                goto out; +        } + +        loc.parent = inode_ref (parent); +        loc.inode = inode_ref (inode); +        gf_uuid_copy (loc.pargfid, pargfid); +        loc.name = bname; + +        ret = __afr_selfheal_do_gfid_unsplit (this, locked_on, replies, +                                              inode, &loc); + +        if (ret) +                goto out; + +        /* Clear out old replies here and wind lookup on all locked +         * subvolumes to achieve two things: +         *   a. gfid heal on those subvolumes that do not have gfid associated +         *      with the inode, and +         *   b. refresh replies, which can be consumed by +         *      __afr_selfheal_name_impunge(). +         */ +        afr_replies_wipe (replies, priv->child_count); +        /* This sends out lookups to all bricks and blocks once we have +         * them. +         */ +        AFR_ONLIST (locked_on, new_frame, afr_selfheal_discover_cbk, lookup, +                    &loc, xdata); +        afr_replies_copy (replies, new_local->replies, priv->child_count); +out: +        loc_wipe (&loc); +        if (xdata) +                dict_unref (xdata); +        if (new_frame) +                AFR_STACK_DESTROY (new_frame); + +        return ret; +} +  int  __afr_selfheal_assign_gfid (xlator_t *this, inode_t *parent, uuid_t pargfid,                              const char *bname, inode_t *inode, @@ -429,12 +710,6 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,          if (ret)                  return ret; -        ret = afr_selfheal_name_gfid_mismatch_check (this, replies, source, -                                                     sources, &gfid_idx, -                                                     pargfid, bname); -        if (ret) -                return ret; -  	if (gfid_idx == -1) {                  if (!gfid_req || gf_uuid_is_null (gfid_req))                          return -1; @@ -443,6 +718,15 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,                  gfid = &replies[gfid_idx].poststat.ia_gfid;          } +        ret = afr_selfheal_name_gfid_mismatch_check (this, replies, source, +                                                     sources, &gfid_idx, +                                                     pargfid, bname); +        if (ret) +                ret = __afr_selfheal_gfid_unsplit (this, parent, pargfid, +                    bname, inode, replies, gfid, locked_on); +        if (ret) +                return ret; +          is_gfid_absent = (gfid_idx == -1) ? _gf_true : _gf_false;  	ret = __afr_selfheal_assign_gfid (this, parent, pargfid, bname, inode,                                            replies, gfid, locked_on, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index bbb444c7974..53abeaace11 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -823,6 +823,7 @@ typedef struct _afr_local {          gf_boolean_t need_full_crawl;          gf_boolean_t is_read_txn; +        loc_t           *unsplit_locs;   /* Un-split targets */  } afr_local_t;  | 
