diff options
| author | Jeff Darcy <jdarcy@redhat.com> | 2014-05-07 19:31:30 +0000 | 
|---|---|---|
| committer | Vijay Bellur <vbellur@redhat.com> | 2015-05-09 21:55:09 -0700 | 
| commit | 243d61575c093c03b9beb014bf9d097646836e95 (patch) | |
| tree | afaccb59310013c4f7c6bb867231c4d8988a697c | |
| parent | 58ef6a233f43bc644be55d2b5510b12718a6835e (diff) | |
dht: make lookup-unhashed=auto do something actually useful
The key concept here is to determine whether a directory is "clean" by
comparing its last-known-good topology to the current one for the
volume.  These are stored as "commit hashes" on the directory and the
volume root, respectively.  The volume's commit hash changes whenever a
brick is added or removed and a fix-layout is done.  A directory's
commit hash changes only when a full rebalance (not just fix-layout)
is done on it.  If all bricks are present and have a directory
commit hash that matches the volume commit hash, then we can assume
that every file is in its "proper" place. Therefore, if we look for
a file in that proper place and don't find it, we can assume it's not
on any other subvolume and *safely* skip the global (broadcast to all)
lookup.
Change-Id: Id6ce4593ba1f7daffa74cfab591cb45960629ae3
BUG: 1220064
Reviewed-on-master: http://review.gluster.org/#/c/7702/
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Signed-off-by: Shyam <srangana@redhat.com>
Reviewed-on: http://review.gluster.org/10729
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Krishnan Parthasarathi <kparthas@redhat.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
| -rw-r--r-- | libglusterfs/src/glusterfs.h | 1 | ||||
| -rwxr-xr-x | tests/bugs/distribute/bug-907072.t | 18 | ||||
| -rwxr-xr-x | tests/bugs/distribute/bug-921408.t | 4 | ||||
| -rwxr-xr-x | tests/bugs/glusterd/bug-1070734.t | 7 | ||||
| -rwxr-xr-x | tests/features/unhashed-auto.t | 99 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 87 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 29 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-layout.c | 69 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 109 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-selfheal.c | 316 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 15 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-brick-ops.c | 5 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-op-sm.c | 105 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-rebalance.c | 7 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd.h | 1 | 
15 files changed, 781 insertions, 91 deletions
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index b26580f1ec9..3843bb76ed9 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -227,6 +227,7 @@                                                         (iabuf)->ia_type) & ~S_IFMT)\                                       == DHT_LINKFILE_MODE)  #define DHT_LINKFILE_STR "linkto" +#define DHT_COMMITHASH_STR "commithash"  #define DHT_SKIP_NON_LINKTO_UNLINK "unlink-only-if-dht-linkto-file"  #define DHT_SKIP_OPEN_FD_UNLINK "dont-unlink-for-open-fd" diff --git a/tests/bugs/distribute/bug-907072.t b/tests/bugs/distribute/bug-907072.t index 1e8bd280f32..a4d98831380 100755 --- a/tests/bugs/distribute/bug-907072.t +++ b/tests/bugs/distribute/bug-907072.t @@ -17,10 +17,11 @@ TEST glusterfs -s $H0 --volfile-id $V0 $M0;  TEST mkdir $M0/test; -OLD_LAYOUT0=`get_layout $B0/${V0}0/test`; -OLD_LAYOUT1=`get_layout $B0/${V0}1/test`; -OLD_LAYOUT2=`get_layout $B0/${V0}2/test`; -OLD_LAYOUT3=`get_layout $B0/${V0}3/test`; +# Extract the layout sans the commit hash +OLD_LAYOUT0=`get_layout $B0/${V0}0/test | cut -c11-34`; +OLD_LAYOUT1=`get_layout $B0/${V0}1/test | cut -c11-34`; +OLD_LAYOUT2=`get_layout $B0/${V0}2/test | cut -c11-34`; +OLD_LAYOUT3=`get_layout $B0/${V0}3/test | cut -c11-34`;  TEST killall glusterfsd; @@ -36,10 +37,11 @@ EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0  TEST glusterfs -s $H0 --volfile-id $V0 $M0;  TEST stat $M0/test; -NEW_LAYOUT0=`get_layout $B0/${V0}0/test`; -NEW_LAYOUT1=`get_layout $B0/${V0}1/test`; -NEW_LAYOUT2=`get_layout $B0/${V0}2/test`; -NEW_LAYOUT3=`get_layout $B0/${V0}3/test`; +# Extract the layout sans the commit hash +NEW_LAYOUT0=`get_layout $B0/${V0}0/test | cut -c11-34`; +NEW_LAYOUT1=`get_layout $B0/${V0}1/test | cut -c11-34`; +NEW_LAYOUT2=`get_layout $B0/${V0}2/test | cut -c11-34`; +NEW_LAYOUT3=`get_layout $B0/${V0}3/test | cut -c11-34`;  EXPECT $OLD_LAYOUT0 echo $NEW_LAYOUT0;  EXPECT $OLD_LAYOUT1 echo $NEW_LAYOUT1; diff --git 
a/tests/bugs/distribute/bug-921408.t b/tests/bugs/distribute/bug-921408.t index b1887f8ae22..559114bb85a 100755 --- a/tests/bugs/distribute/bug-921408.t +++ b/tests/bugs/distribute/bug-921408.t @@ -37,7 +37,7 @@ addbr_rebal_till_layout_change()                  then                          break                  fi -                NEW_LAYOUT=`get_layout $B0/${V0}0` +                NEW_LAYOUT=`get_layout $B0/${V0}0 | cut -c11-34`                  if [ $OLD_LAYOUT == $NEW_LAYOUT ]                  then                          i=`expr $i + 1`; @@ -64,7 +64,7 @@ TEST touch $M0/test/test  fd=`fd_available`  TEST fd_open $fd "rw" $M0/test/test -OLD_LAYOUT=`get_layout $B0/${V0}0` +OLD_LAYOUT=`get_layout $B0/${V0}0 | cut -c11-34`  addbr_rebal_till_layout_change 1 diff --git a/tests/bugs/glusterd/bug-1070734.t b/tests/bugs/glusterd/bug-1070734.t index b5a53c24cab..5db60e0cfe6 100755 --- a/tests/bugs/glusterd/bug-1070734.t +++ b/tests/bugs/glusterd/bug-1070734.t @@ -65,8 +65,11 @@ TEST [ -f ${OTHERBRICK}/DIR/file ]  #Check the DIR on HASHED should have got zeroed layout and the \  #OTHERBRICK should have got full layout -EXPECT "0x00000001000000000000000000000000" dht_get_layout $HASHED/DIR ; -EXPECT "0x000000010000000000000000ffffffff" dht_get_layout $OTHERBRICK/DIR; +shorter_layout () { +	dht_get_layout $1 | cut -c 19- +} +EXPECT "0000000000000000" shorter_layout $HASHED/DIR ; +EXPECT "00000000ffffffff" shorter_layout $OTHERBRICK/DIR;  ## Before killing daemon to avoid deadlocks  EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0 diff --git a/tests/features/unhashed-auto.t b/tests/features/unhashed-auto.t new file mode 100755 index 00000000000..97663c20e10 --- /dev/null +++ b/tests/features/unhashed-auto.t @@ -0,0 +1,99 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc +. 
$(dirname $0)/../dht.rc + +NFILES=100 + +touch_files () { +	for i in $(seq 1 $NFILES); do +		touch $(printf $M0/dir/file%02d $i) +	done +} + +count_files () { +	found=0 +	for i in $(seq 1 $NFILES); do +		if [ -f $(printf $M0/dir/file%02d $i) ]; then +			found=$((found+1)) +		fi +	done +	echo "found $found files" > /dev/tty +	echo $found +} + +wait_for_rebalance () { +	while true; do +		tmp=$(rebalance_completed) +		if [ $tmp -eq 1 ]; then +			sleep 1 +		else +			break +		fi +	done +} + +get_xattr () { +	cmd="getfattr --absolute-names --only-values -n trusted.glusterfs.dht" +	$cmd $1 | od -tx1 -An | tr -d ' ' +} + +cleanup + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info + +TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2} +EXPECT "$V0" volinfo_field $V0 'Volume Name' +EXPECT 'Created' volinfo_field $V0 'Status' + +TEST $CLI volume set $V0 cluster.lookup-unhashed auto + +TEST $CLI volume start $V0 +EXPECT 'Started' volinfo_field $V0 'Status' + +# Create some files for later tests. +TEST $GFS -s $H0 --volfile-id $V0 $M0 +TEST mkdir $M0/dir +TEST touch_files +TEST umount $M0 + +# Add a brick and do the fix-layout part of rebalance to update directory layouts +# (including their directory commit hashes). +TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3 +EXPECT '3' brick_count $V0 +TEST $CLI volume rebalance $V0 fix-layout start +TEST wait_for_rebalance + +# Now for the sneaky part.  *Undo* the part of rebalance that updated the volume +# commit hash, forcing a false match between that and the directory commit hashes. +TEST setfattr -x trusted.glusterfs.dht.commithash $B0/${V0}1 +TEST setfattr -x trusted.glusterfs.dht.commithash $B0/${V0}2 +TEST setfattr -x trusted.glusterfs.dht.commithash $B0/${V0}3 + +# Mount and check that we do *not* see all of the files.  This indicates that we +# correctly skipped the broadcast lookup that would have found them. 
+TEST $GFS -s $H0 --volfile-id $V0 $M0 +TEST [ $(count_files) -ne 100 ] +TEST umount $M0 + +# Do the fix-layout again to generate a new volume commit hash. +TEST $CLI volume rebalance $V0 fix-layout start +TEST wait_for_rebalance + +# Mount and check that we *do* see all of the files.  This indicates that we saw +# the mismatch and did the broadcast lookup this time. +TEST $GFS -s $H0 --volfile-id $V0 $M0 +TEST [ $(count_files) -eq 100 ] +TEST umount $M0 + +# Do a *full* rebalance and verify that the directory commit hash changed. +old_val=$(get_xattr $B0/${V0}1/dir) +TEST $CLI volume rebalance $V0 start +TEST wait_for_rebalance +new_val=$(get_xattr $B0/${V0}1/dir) +TEST [ ! x"$old_val" = x"$new_val" ] + +cleanup diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 6c0afdbec90..37e07ad77da 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -210,6 +210,7 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)          int              ret = -1;          dht_layout_t    *layout = NULL;          dht_conf_t      *conf = NULL; +        uint32_t         vol_commit_hash = 0;          local = discover_frame->local;          layout = local->layout; @@ -279,6 +280,15 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)                          dht_layout_set (this, local->inode, layout);          } +        if (!conf->vch_forced) { +                ret = dict_get_uint32 (local->xattr, +                                       conf->commithash_xattr_name, +                                       &vol_commit_hash); +                if (ret == 0) { +                        conf->vol_commit_hash = vol_commit_hash; +                } +        } +          DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno,                            local->inode, &local->stbuf, local->xattr,                            &local->postparent); @@ -459,6 +469,12 @@ 
dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc)                          "%s: Failed to set dictionary value:key = %s",                          loc->path, conf->link_xattr_name); +        if (__is_root_gfid(local->loc.gfid)) { +                ret = dict_set_uint32 (local->xattr_req, +                                       conf->commithash_xattr_name, +                                       sizeof(uint32_t)); +        } +          call_cnt        = conf->subvolume_cnt;          local->call_cnt = call_cnt; @@ -655,6 +671,7 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          call_frame_t *copy          = NULL;          dht_local_t  *copy_local    = NULL;          char gfid[GF_UUID_BUF_SIZE] = {0}; +        uint32_t      vol_commit_hash = 0;          GF_VALIDATE_OR_GOTO ("dht", frame, err);          GF_VALIDATE_OR_GOTO ("dht", this, err); @@ -667,6 +684,14 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          if (!conf)                  goto out; +        if (!conf->vch_forced) { +                ret = dict_get_uint32 (xattr, conf->commithash_xattr_name, +                                       &vol_commit_hash); +                if (ret == 0) { +                        conf->vol_commit_hash = vol_commit_hash; +                } +        } +          gf_uuid_unparse (local->loc.gfid, gfid);          LOCK (&frame->lock); @@ -1852,6 +1877,7 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          call_frame_t *prev          = NULL;          int           ret           = 0;          dht_layout_t *parent_layout = NULL; +        uint32_t      vol_commit_hash = 0;          GF_VALIDATE_OR_GOTO ("dht", frame, err);          GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -1875,6 +1901,14 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                        "fresh_lookup returned for %s with op_ret %d and "                        "op_errno %d", loc->path, op_ret, 
op_errno); +        if (!conf->vch_forced) { +                ret = dict_get_uint32 (xattr, conf->commithash_xattr_name, +                                       &vol_commit_hash); +                if (ret == 0) { +                        conf->vol_commit_hash = vol_commit_hash; +                } +        } +          if (ENTRY_MISSING (op_ret, op_errno)) {                  gf_msg_debug (this->name, 0,                                "Entry %s missing on subvol %s", @@ -1891,7 +1925,10 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                                                          &parent_layout);                          if (ret || !parent_layout)                                  goto out; -                        if (parent_layout->search_unhashed) { +                        if (parent_layout->commit_hash +                                  != conf->vol_commit_hash) { +                                gf_log (this->name, GF_LOG_DEBUG, +                                        "hashes don't match, do global lookup");                                  local->op_errno = ENOENT;                                  dht_lookup_everywhere (frame, this, loc);                                  return 0; @@ -2078,6 +2115,12 @@ dht_lookup (call_frame_t *frame, xlator_t *this,                  return 0;          } +        if (__is_root_gfid(loc->gfid)) { +                ret = dict_set_uint32 (local->xattr_req, +                                       conf->commithash_xattr_name, +                                       sizeof(uint32_t)); +        } +          if (!hashed_subvol)                  hashed_subvol = dht_subvol_get_hashed (this, loc);          local->hashed_subvol = hashed_subvol; @@ -3238,8 +3281,9 @@ dht_fsetxattr (call_frame_t *frame, xlator_t *this,          conf = this->private; -        GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, -                                   op_errno, err); +        if (!conf->defrag) +                
GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, +                                           op_errno, err);          local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR);          if (!local) { @@ -3338,6 +3382,7 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,          char          value[4096] = {0,};          gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA;          int           call_cnt = 0; +        uint32_t      new_hash = 0;          VALIDATE_OR_GOTO (frame, err);          VALIDATE_OR_GOTO (this, err); @@ -3350,8 +3395,10 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,          methods = conf->methods;          GF_VALIDATE_OR_GOTO (this->name, conf->methods, err); -        GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, -                                   op_errno, err); +        /* Rebalance daemon is allowed to set internal keys */ +        if (!conf->defrag) +                GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, +                                           op_errno, err);          local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR);          if (!local) { @@ -3489,6 +3536,22 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,                  gf_log (this->name, GF_LOG_INFO,                          "fixing the layout of %s", loc->path); +                ret = dict_get_uint32(xattr, "new-commit-hash", &new_hash); +                if (ret == 0) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "updating commit hash for %s from %u to %u", +                                uuid_utoa(loc->gfid), +                                layout->commit_hash, new_hash); +                        layout->commit_hash = new_hash; + +                        ret = dht_update_commit_hash_for_layout (frame); +                        if (ret) { +                                op_errno = ENOTCONN; +                                goto err; +                 
       } +                        return ret; +                } +                  ret = dht_fix_directory_layout (frame, dht_common_setxattr_cbk,                                                  layout);                  if (ret) { @@ -5377,6 +5440,8 @@ dht_mkdir (call_frame_t *frame, xlator_t *this,                  goto err;          } +        local->layout->commit_hash = conf->vol_commit_hash; +          STACK_WIND (frame, dht_mkdir_hashed_cbk,                      hashed_subvol,                      hashed_subvol->fops->mkdir, @@ -6570,10 +6635,12 @@ dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,                  ret  = snprintf (string, max_string_len,                                   "[Subvol_name: %s, Err: %d , Start: " -                                 "%"PRIu32 " , Stop: %"PRIu32 " ], ", +                                 "%"PRIu32 " , Stop: %"PRIu32 " , Hash: %" +                                 PRIu32 " ], ",                                   layout->list[i].xlator->name,                                   layout->list[i].err, layout->list[i].start, -                                 layout->list[i].stop); +                                 layout->list[i].stop, +                                 layout->list[i].commit_hash);                  if (ret < 0)                          return; @@ -6602,10 +6669,12 @@ dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,                  ret  =  snprintf (output_string + off, len - off,                                    "[Subvol_name: %s, Err: %d , Start: " -                                  "%"PRIu32 " , Stop: %"PRIu32 " ], ", +                                  "%"PRIu32 " , Stop: %"PRIu32 " , Hash: %" +                                  PRIu32  " ], ",                                    layout->list[i].xlator->name,                                    layout->list[i].err, layout->list[i].start, -                                  layout->list[i].stop); +                               
   layout->list[i].stop, +                                  layout->list[i].commit_hash);                  if (ret < 0)                          goto err; diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 9a6ed1a889a..45b6cc9e80b 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -31,6 +31,7 @@  #define DHT_PATHINFO_HEADER         "DISTRIBUTE:"  #define DHT_FILE_MIGRATE_DOMAIN     "dht.file.migrate"  #define DHT_LAYOUT_HEAL_DOMAIN      "dht.layout.heal" +#define DHT_LAYOUT_HASH_INVALID     1  #include <fnmatch.h> @@ -48,6 +49,20 @@ struct dht_layout {                                             special key */          int                cnt;          int                preset; +        /* +         * The last *configuration* state for which this directory was known +         * to be in balance.  The corresponding vol_commit_hash changes +         * whenever bricks are added or removed.  This value changes when a +         * (full) rebalance is complete.  If they match, it's safe to assume +         * that every file is where it should be and there's no need to do +         * lookups for files elsewhere.  If they don't, then we have to do a +         * global lookup to be sure. +         */ +        uint32_t           commit_hash; +        /* +         * The *runtime* state of the volume, changes when connections to +         * bricks are made or lost. 
+         */          int                gen;          int                type;          int                ref; /* use with dht_conf_t->layout_lock */ @@ -59,6 +74,7 @@ struct dht_layout {                                    */                  uint32_t   start;                  uint32_t   stop; +                uint32_t   commit_hash;                  xlator_t  *xlator;          } list[];  }; @@ -325,6 +341,7 @@ struct gf_defrag_info_ {          uuid_t                       node_uuid;          struct timeval               start_time;          gf_boolean_t                 stats; +        uint32_t                     new_commit_hash;          gf_defrag_pattern_list_t    *defrag_pattern;          int                          tier_promote_frequency;          int                          tier_demote_frequency; @@ -422,6 +439,7 @@ struct dht_conf {          /* Support variable xattr names. */          char            *xattr_name;          char            *link_xattr_name; +        char            *commithash_xattr_name;          char            *wild_xattr_name;          /* Support size-weighted rebalancing (heterogeneous bricks). */ @@ -436,6 +454,13 @@ struct dht_conf {          /*local subvol storage for rebalance*/          xlator_t       **local_subvols;          int32_t          local_subvols_cnt; + +        /* +         * "Commit hash" for this volume topology.  Changed whenever bricks +         * are added or removed. 
+         */ +        uint32_t        vol_commit_hash; +        gf_boolean_t    vch_forced;  };  typedef struct dht_conf dht_conf_t; @@ -576,7 +601,7 @@ int dht_layouts_init (xlator_t *this, dht_conf_t *conf);  int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,                        int       op_ret, int op_errno, dict_t *xattr); -int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, +int     dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,                               int       pos, int32_t **disk_layout_p);  int dht_disk_layout_merge (xlator_t   *this, dht_layout_t *layout,                             int         pos, void *disk_layout_raw, int disk_layout_len); @@ -631,6 +656,7 @@ xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,  int       dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx);  int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode); +int dht_layout_index_for_subvol (dht_layout_t *layout, xlator_t *subvol);  int           dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout);;  void          dht_layout_unref (xlator_t *this, dht_layout_t *layout);  dht_layout_t *dht_layout_ref (xlator_t *this, dht_layout_t *layout); @@ -649,6 +675,7 @@ int dht_rename_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                           struct iatt      *preparent, struct iatt *postparent,                           dict_t *xdata); +int dht_update_commit_hash_for_layout (call_frame_t *frame);  int dht_fix_directory_layout (call_frame_t *frame,                                dht_selfheal_dir_cbk_t  dir_cbk,                                dht_layout_t           *layout); diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index 2ed15c5e43c..f88c786a55b 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -267,7 +267,7 @@ dht_disk_layout_extract 
(xlator_t *this, dht_layout_t *layout,                  goto out;          } -        disk_layout[0] = hton32 (1); +        disk_layout[0] = hton32 (layout->list[pos].commit_hash);          disk_layout[1] = hton32 (layout->type);          disk_layout[2] = hton32 (layout->list[pos].start);          disk_layout[3] = hton32 (layout->list[pos].stop); @@ -288,10 +288,10 @@ int  dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,  		       int pos, void *disk_layout_raw, int disk_layout_len)  { -        int      cnt = 0;          int      type = 0;          int      start_off = 0;          int      stop_off = 0; +        int      commit_hash = 0;          int      disk_layout[4];  	if (!disk_layout_raw) { @@ -305,14 +305,6 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,          memcpy (disk_layout, disk_layout_raw, disk_layout_len); -        cnt  = ntoh32 (disk_layout[0]); -        if (cnt != 1) { -                gf_msg (this->name, GF_LOG_ERROR, 0, -                        DHT_MSG_INVALID_DISK_LAYOUT, -                        "Invalid disk layout: Invalid count %d", cnt); -                return -1; -        } -          type = ntoh32 (disk_layout[1]);  	switch (type) {          case DHT_HASH_TYPE_DM_USER: @@ -330,21 +322,22 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,  		return -1;  	} +        commit_hash = ntoh32 (disk_layout[0]);          start_off = ntoh32 (disk_layout[2]);          stop_off  = ntoh32 (disk_layout[3]); +        layout->list[pos].commit_hash = commit_hash;          layout->list[pos].start = start_off;          layout->list[pos].stop  = stop_off;          gf_msg_trace (this->name, 0, -                      "merged to layout: %u - %u (type %d) from %s", -                      start_off, stop_off, type, +                      "merged to layout: %u - %u (type %d, hash %d) from %s", +                      start_off, stop_off, commit_hash, type,                        layout->list[pos].xlator->name);          
return 0;  } -  int  dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,                    int op_ret, int op_errno, dict_t *xattr) @@ -397,6 +390,13 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,                          subvol->name);                  goto out;          } + +        if (layout->commit_hash == 0) { +                layout->commit_hash = layout->list[i].commit_hash; +        } else if (layout->commit_hash != layout->list[i].commit_hash) { +                layout->commit_hash = DHT_LAYOUT_HASH_INVALID; +        } +          layout->list[i].err = 0;  out: @@ -409,6 +409,7 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j)  {          uint32_t  start_swap = 0;          uint32_t  stop_swap = 0; +        uint32_t  commit_hash_swap = 0;          xlator_t *xlator_swap = 0;          int       err_swap = 0; @@ -416,16 +417,19 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j)          stop_swap   = layout->list[i].stop;          xlator_swap = layout->list[i].xlator;          err_swap    = layout->list[i].err; +        commit_hash_swap = layout->list[i].commit_hash;          layout->list[i].start  = layout->list[j].start;          layout->list[i].stop   = layout->list[j].stop;          layout->list[i].xlator = layout->list[j].xlator;          layout->list[i].err    = layout->list[j].err; +        layout->list[i].commit_hash = layout->list[j].commit_hash;          layout->list[j].start  = start_swap;          layout->list[j].stop   = stop_swap;          layout->list[j].xlator = xlator_swap;          layout->list[j].err    = err_swap; +        layout->list[j].commit_hash = commit_hash_swap;  }  void @@ -728,9 +732,9 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,          int         dict_ret = 0;          int32_t     disk_layout[4];          void       *disk_layout_raw = NULL; -        int32_t     count = -1;          uint32_t    start_off = -1;      
    uint32_t    stop_off = -1; +        uint32_t    commit_hash = -1;          dht_conf_t *conf = this->private;          char        gfid[GF_UUID_BUF_SIZE] = {0}; @@ -779,27 +783,21 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,          memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout)); -        count  = ntoh32 (disk_layout[0]); -        if (count != 1) { -                gf_msg (this->name, GF_LOG_ERROR, 0, -                        DHT_MSG_INVALID_DISK_LAYOUT, -                        "Invalid disk layout: invalid count %d," -                        "path = %s, gfid = %s ", count, loc->path, gfid); -                ret = -1; -                goto out; -        } -          start_off = ntoh32 (disk_layout[2]);          stop_off  = ntoh32 (disk_layout[3]); +        commit_hash = ntoh32 (disk_layout[0]);          if ((layout->list[pos].start != start_off) -            || (layout->list[pos].stop != stop_off)) { +            || (layout->list[pos].stop != stop_off) +            || (layout->list[pos].commit_hash != commit_hash)) {                  gf_log (this->name, GF_LOG_INFO, -                        "subvol: %s; inode layout - %"PRIu32" - %"PRIu32"; " -                        "disk layout - %"PRIu32" - %"PRIu32, +                        "subvol: %s; inode layout - %"PRIu32" - %"PRIu32 +                        " - %"PRIu32"; " +                        "disk layout - %"PRIu32" - %"PRIu32" - %"PRIu32,                          layout->list[pos].xlator->name,                          layout->list[pos].start, layout->list[pos].stop, -                        start_off, stop_off); +                        layout->list[pos].commit_hash, +                        start_off, stop_off, commit_hash);                  ret = 1;          } else {                  ret = 0; @@ -839,3 +837,18 @@ dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode)  out:          return ret;  } + +int +dht_layout_index_for_subvol 
(dht_layout_t *layout, xlator_t *subvol) +{ +        int i = 0, ret = -1; + +        for (i = 0; i < layout->cnt; i++) { +                if (layout->list[i].xlator == subvol) { +                        ret = i; +                        break; +                } +        } + +        return ret; +} diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 60f7314efe0..fae856d969f 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -2337,6 +2337,46 @@ out:  }  int +gf_defrag_settle_hash (xlator_t *this, gf_defrag_info_t *defrag, +                       loc_t *loc, dict_t *fix_layout) +{ +        int     ret; + +        /* +         * Now we're ready to update the directory commit hash for the volume +         * root, so that hash miscompares and broadcast lookups can stop. +         * However, we want to skip that if fix-layout is all we did.  In +         * that case, we want the miscompares etc. to continue until a real +         * rebalance is complete. 
+         */ +        if (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX +            || defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER +            || defrag->cmd == GF_DEFRAG_CMD_START_TIER) { +                return 0; +        } + +        ret = dict_set_uint32 (fix_layout, "new-commit-hash", +                               defrag->new_commit_hash); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Failed to set new-commit-hash"); +                return -1; +        } + +        ret = syncop_setxattr (this, loc, fix_layout, 0, NULL, NULL); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, +                        "fix layout on %s failed", loc->path); +                return -1; +        } + +        /* TBD: find more efficient solution than adding/deleting every time */ +        dict_del(fix_layout, "new-commit-hash"); + +        return 0; +} + +int  gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,                    dict_t *fix_layout, dict_t *migrate_data)  { @@ -2422,6 +2462,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,                          if (ret) {                                  gf_log (this->name, GF_LOG_ERROR, "Child loc"                                          " build failed"); +                                ret = -1;                                  goto out;                          } @@ -2487,9 +2528,16 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,                                          "Fix layout failed for %s",                                          entry_loc.path);                                  defrag->total_failures++; +                                ret = -1;                                  goto out;                          } +                        if (gf_defrag_settle_hash (this, defrag, &entry_loc, +                            fix_layout) != 0) { +             
                   defrag->total_failures++; +                                ret = -1; +                                goto out; +                        }                  }                  gf_dirent_free (&entries);                  free_entries = _gf_false; @@ -2573,6 +2621,36 @@ gf_defrag_start_crawl (void *data)                  goto out;          } +        /* +         * Unfortunately, we can't do special xattrs (like fix.layout) and +         * real ones in the same call currently, and changing it seems +         * riskier than just doing two calls. +         */ + +        gf_log (this->name, GF_LOG_INFO, "%s using commit hash %u", +                __func__, conf->vol_commit_hash); + +        ret = dict_set_uint32 (fix_layout, conf->commithash_xattr_name, +                               conf->vol_commit_hash); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Failed to set %s", conf->commithash_xattr_name); +                defrag->total_failures++; +                ret = -1; +                goto out; +        } + +        ret = syncop_setxattr (this, &loc, fix_layout, 0, NULL, NULL); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed", +                        loc.path); +                defrag->total_failures++; +                ret = -1; +                goto out; +        } + +        /* We now return to our regularly scheduled program. 
*/ +          ret = dict_set_str (fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes");          if (ret) {                  gf_msg (this->name, GF_LOG_ERROR, 0, @@ -2580,10 +2658,13 @@ gf_defrag_start_crawl (void *data)                          "Failed to start rebalance:"                          "Failed to set dictionary value: key = %s",                          GF_XATTR_FIX_LAYOUT_KEY); +                defrag->total_failures++;                  ret = -1;                  goto out;          } +        defrag->new_commit_hash = conf->vol_commit_hash; +          ret = syncop_setxattr (this, &loc, fix_layout, 0, NULL, NULL);          if (ret) {                  gf_msg (this->name, GF_LOG_ERROR, 0, @@ -2599,19 +2680,18 @@ gf_defrag_start_crawl (void *data)              (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {                  migrate_data = dict_new ();                  if (!migrate_data) { +                        defrag->total_failures++;                          ret = -1;                          goto out;                  } -                if (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) -                        ret = dict_set_str (migrate_data, -                                            GF_XATTR_FILE_MIGRATE_KEY, -                                            "force"); -                else -                        ret = dict_set_str (migrate_data, -                                            GF_XATTR_FILE_MIGRATE_KEY, -                                            "non-force"); -                if (ret) +                ret = dict_set_str (migrate_data, GF_XATTR_FILE_MIGRATE_KEY, +                        (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) +                        ?  
"force" : "non-force"); +                if (ret) { +                        defrag->total_failures++; +                        ret = -1;                          goto out; +                }                  /* Find local subvolumes */                  ret = syncop_getxattr (this, &loc, &dict, @@ -2670,6 +2750,17 @@ gf_defrag_start_crawl (void *data)          ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout,                                      migrate_data); +        if (ret) { +                defrag->total_failures++; +                ret = -1; +                goto out; +        } + +        if (gf_defrag_settle_hash (this, defrag, &loc, fix_layout) != 0) { +                defrag->total_failures++; +                ret = -1; +                goto out; +        }          if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) {                  methods = conf->methods; diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index cc093e1199f..c881a361804 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -23,11 +23,14 @@  #define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,path)    do {           \                  layout->list[i].start = srt;                            \                  layout->list[i].stop  = srt + chunk - 1;                \ +                layout->list[i].commit_hash = layout->commit_hash;      \                                                                          \                  gf_msg_trace (this->name, 0,                            \ -                              "gave fix: %u - %u on %s for %s",         \ +                              "gave fix: %u - %u, with commit-hash %u"  \ +                              " on %s for %s",                          \                                layout->list[i].start,                    \                                layout->list[i].stop,                     \ +                              
layout->list[i].commit_hash,              \                                layout->list[i].xlator->name, path);      \          } while (0) @@ -448,6 +451,7 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem,                         dht_layout_t **ondisk)  {          gf_boolean_t             fixit                 = _gf_true; +          dht_local_t             *local                 = NULL;          int                      layout_span           = 0;          int                      decommissioned_bricks = 0; @@ -482,6 +486,10 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem,          if (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt)                  goto out; +        /* If commit hashes are being updated, let it through */ +        if ((*inmem)->commit_hash != (*ondisk)->commit_hash) +                goto out; +          layout_span = dht_layout_span (*ondisk);          decommissioned_bricks @@ -497,6 +505,7 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem,                  fixit = _gf_false;  out: +          return fixit;  } @@ -756,6 +765,7 @@ dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)          dummy = dht_layout_new (this, 1);          if (!dummy)                  goto out; +        dummy->commit_hash = layout->commit_hash;          for (i = 0; i < conf->subvolume_cnt; i++) {                  if (_gf_false ==                      dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { @@ -1474,6 +1484,8 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,  		new_layout->list[i].xlator = layout->list[i].xlator;          } +        new_layout->commit_hash = layout->commit_hash; +          if (priv->du_stats) {                  for (i = 0; i < priv->subvolume_cnt; ++i) {                          gf_log (this->name, GF_LOG_INFO, @@ -1653,6 +1665,11 @@ dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc,          overlaps = 
local->selfheal.overlaps_cnt;          if (holes || overlaps) { +                /* If the layout has anomolies which would change the hash +                 * ranges, then we need to reset the commit_hash for this +                 * directory, as the layout would change and things may not +                 * be in place as expected */ +                layout->commit_hash = DHT_LAYOUT_HASH_INVALID;                  dht_selfheal_layout_new_directory (frame, loc, layout);                  ret = 0;          } @@ -1934,3 +1951,300 @@ dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data)          DHT_STACK_DESTROY (sync_frame);          return 0;  } + +/* EXIT: dht_update_commit_hash_for_layout */ +int +dht_update_commit_hash_for_layout_done (call_frame_t *frame, void *cookie, +                       xlator_t *this, int32_t op_ret, int32_t op_errno, +                       dict_t *xdata) +{ +        dht_local_t  *local = NULL; + +        local = frame->local; + +        /* preserve oldest error */ +        if (op_ret && !local->op_ret) { +                local->op_ret = op_ret; +                local->op_errno = op_errno; +        } + +        DHT_STACK_UNWIND (setxattr, frame, local->op_ret, +                          local->op_errno, NULL); + +        return 0; +} + +int +dht_update_commit_hash_for_layout_unlock (call_frame_t *frame, xlator_t *this) +{ +        dht_local_t  *local = NULL; +        int ret = 0; + +        local = frame->local; + +        ret = dht_unlock_inodelk (frame, local->lock.locks, +                                  local->lock.lk_count, +                                  dht_update_commit_hash_for_layout_done); +        if (ret < 0) { +                /* preserve oldest error, just ... 
*/ +                if (!local->op_ret) { +                        local->op_errno = errno; +                        local->op_ret = -1; +                } + +                gf_msg (this->name, GF_LOG_WARNING, errno, +                        DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, +                        "Winding unlock failed: stale locks left on brick" +                        " %s", local->loc.path); + +                dht_update_commit_hash_for_layout_done (frame, NULL, this, +                                                        0, 0, NULL); +        } + +        return 0; +} + +int +dht_update_commit_hash_for_layout_cbk (call_frame_t *frame, void *cookie, +                                       xlator_t *this, int op_ret, +                                       int op_errno, dict_t *xdata) +{ +        dht_local_t  *local = NULL; +        int           this_call_cnt = 0; + +        local = frame->local; + +        LOCK (&frame->lock); +        /* store first failure, just because */ +        if (op_ret && !local->op_ret) { +                local->op_ret = op_ret; +                local->op_errno = op_errno; +        } +        UNLOCK (&frame->lock); + +        this_call_cnt = dht_frame_return (frame); + +        if (is_last_call (this_call_cnt)) { +                dht_update_commit_hash_for_layout_unlock (frame, this); +        } + +        return 0; +} + +int +dht_update_commit_hash_for_layout_resume (call_frame_t *frame, void *cookie, +                                          xlator_t *this, int32_t op_ret, +                                          int32_t op_errno, dict_t *xdata) +{ +        dht_local_t   *local = NULL; +        int            count = 1, ret = -1, i = 0, j = 0; +        dht_conf_t    *conf = NULL; +        dht_layout_t  *layout = NULL; +        int32_t       *disk_layout = NULL; +        dict_t        **xattr = NULL; + +        local = frame->local; +        conf = frame->this->private; +        count = conf->local_subvols_cnt; +        
layout = local->layout; + +        if (op_ret < 0) { +                goto err_done; +        } + +        /* We precreate the xattr list as we cannot change call count post the +         * first wind as we may never continue from there. So we finish prep +         * work before winding the setxattrs */ +        xattr = GF_CALLOC (count, sizeof (*xattr), gf_common_mt_char); +        if (!xattr) { +                local->op_errno = errno; + +                gf_msg (this->name, GF_LOG_WARNING, errno, +                        DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, +                        "Directory commit hash update failed:" +                        " %s: Allocation failed", local->loc.path); + +                goto err; +        } + +        for (i = 0; i < count; i++) { +                /* find the layout index for the subvolume */ +                ret = dht_layout_index_for_subvol (layout, +                                                   conf->local_subvols[i]); +                if (ret < 0) { +                        local->op_errno = ENOENT; + +                        gf_msg (this->name, GF_LOG_WARNING, 0, +                                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, +                                "Directory commit hash update failed:" +                                " %s: (subvol %s) Failed to find disk layout", +                                local->loc.path, conf->local_subvols[i]->name); + +                        goto err; +                } +                j = ret; + +                /* update the commit hash for the layout */ +                layout->list[j].commit_hash = layout->commit_hash; + +                /* extract the current layout */ +                ret = dht_disk_layout_extract (this, layout, j, &disk_layout); +                if (ret == -1) { +                        local->op_errno = errno; + +                        gf_msg (this->name, GF_LOG_WARNING, errno, +                                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, +    
                            "Directory commit hash update failed:" +                                " %s: (subvol %s) Failed to extract disk" +                                " layout", local->loc.path, +                                conf->local_subvols[i]->name); + +                        goto err; +                } + +                xattr[i] = get_new_dict (); +                if (!xattr[i]) { +                        local->op_errno = errno; + +                        gf_msg (this->name, GF_LOG_WARNING, errno, +                                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, +                                "Directory commit hash update failed:" +                                " %s: Allocation failed", local->loc.path); + +                        goto err; +                } + +                ret = dict_set_bin (xattr[i], conf->xattr_name, +                                    disk_layout, 4 * 4); +                if (ret != 0) { +                        local->op_errno = ENOMEM; + +                        gf_msg (this->name, GF_LOG_WARNING, 0, +                                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, +                                "Directory self heal xattr failed:" +                                "%s: (subvol %s) Failed to set xattr" +                                " dictionary,", local->loc.path, +                                conf->local_subvols[i]->name); + +                        goto err; +                } +                disk_layout = NULL; + +                gf_msg_trace (this->name, 0, +                              "setting commit hash %u on subvolume %s" +                              " for %s", layout->list[j].commit_hash, +                              conf->local_subvols[i]->name, local->loc.path); +        } + +        /* wind the setting of the commit hash across the local subvols */ +        local->call_cnt = count; +        local->op_ret = 0; +        local->op_errno = 0; +        for (i = 0; i < count; i++) { +      
          dict_ref (xattr[i]); + +                STACK_WIND (frame, dht_update_commit_hash_for_layout_cbk, +                            conf->local_subvols[i], +                            conf->local_subvols[i]->fops->setxattr, +                            &local->loc, xattr[i], 0, NULL); + +                dict_unref (xattr[i]); +        } + +        return 0; +err: +        if (xattr) { +                for (i = 0; i < count; i++) { +                        if (xattr[i]) +                                dict_destroy (xattr[i]); +                } + +                GF_FREE (xattr); +        } + +        GF_FREE (disk_layout); + +        local->op_ret = -1; + +        dht_update_commit_hash_for_layout_unlock (frame, this); + +        return 0; +err_done: +        local->op_ret = -1; + +        dht_update_commit_hash_for_layout_done (frame, NULL, this, 0, 0, NULL); + +        return 0; +} + +/* ENTER: dht_update_commit_hash_for_layout (see EXIT above) + * This function is invoked from rebalance only. + * As a result, the check here is simple enough to see if defrag is present + * in the conf, as other data would be populated appropriately if so. + * If ever this was to be used in other code paths, checks would need to + * change. + * + * Functional details: + *  - Lock the inodes on the subvols that we want the commit hash updated + *  - Update each layout with the inode layout, modified to take in the new + *    commit hash. + *  - Unlock and return. 
+ */ +int +dht_update_commit_hash_for_layout (call_frame_t *frame) +{ +        dht_local_t   *local = NULL; +        int            count = 1, ret = -1, i = 0; +        dht_lock_t   **lk_array = NULL; +        dht_conf_t    *conf = NULL; + +        GF_VALIDATE_OR_GOTO ("dht", frame, err); +        GF_VALIDATE_OR_GOTO (frame->this->name, frame->local, err); + +        local = frame->local; +        conf = frame->this->private; + +        if (!conf->defrag) +                goto err; + +        count = conf->local_subvols_cnt; +        lk_array = GF_CALLOC (count, sizeof (*lk_array), +                              gf_common_mt_char); +        if (lk_array == NULL) +                goto err; + +        for (i = 0; i < count; i++) { +                lk_array[i] = dht_lock_new (frame->this, +                                            conf->local_subvols[i], +                                            &local->loc, F_WRLCK, +                                            DHT_LAYOUT_HEAL_DOMAIN); +                if (lk_array[i] == NULL) +                        goto err; +        } + +        local->lock.locks = lk_array; +        local->lock.lk_count = count; + +        ret = dht_blocking_inodelk (frame, lk_array, count, +                                    dht_update_commit_hash_for_layout_resume); +        if (ret < 0) { +                local->lock.locks = NULL; +                local->lock.lk_count = 0; +                goto err; +        } + +        return 0; +err: +        if (lk_array != NULL) { +                int tmp_count = 0, i = 0; + +                for (i = 0; (i < count) && (lk_array[i]); i++, tmp_count++) { +                        ; +                } + +                dht_lock_array_free (lk_array, tmp_count); +                GF_FREE (lk_array); +        } + +        return -1; +} diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index ffabc820d70..a1f72a85112 100644 --- a/xlators/cluster/dht/src/dht-shared.c 
+++ b/xlators/cluster/dht/src/dht-shared.c @@ -569,6 +569,7 @@ dht_init (xlator_t *this)          int                              cmd            = 0;          char                            *node_uuid      = NULL;          int                              throttle_count = 0; +        uint32_t                         commit_hash    = 0;          GF_VALIDATE_OR_GOTO ("dht", this, err); @@ -590,6 +591,15 @@ dht_init (xlator_t *this)                  goto err;          } +        /* We get the commit-hash to set only for rebalance process */ +        if (dict_get_uint32 (this->options, +                             "commit-hash", &commit_hash) == 0) { +                gf_log (this->name, GF_LOG_INFO, "%s using commit hash %u", +                        __func__, commit_hash); +                conf->vol_commit_hash = commit_hash; +                conf->vch_forced = _gf_true; +        } +          ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd);          if (cmd) { @@ -760,6 +770,8 @@ dht_init (xlator_t *this)          GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err);          gf_asprintf (&conf->link_xattr_name, "%s."DHT_LINKFILE_STR,                       conf->xattr_name); +        gf_asprintf (&conf->commithash_xattr_name, "%s."DHT_COMMITHASH_STR, +                     conf->xattr_name);          gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name);          if (!conf->link_xattr_name || !conf->wild_xattr_name) {                  goto err; @@ -871,6 +883,9 @@ struct volume_options options[] = {          { .key  = {"rebalance-cmd"},            .type = GF_OPTION_TYPE_INT,          }, +        { .key = {"commit-hash"}, +          .type = GF_OPTION_TYPE_INT, +        },          { .key = {"node-uuid"},            .type = GF_OPTION_TYPE_STR,          }, diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index 92d15c615be..019766c5d83 100644 --- 
a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -1997,6 +1997,8 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)          dict_t                  *bricks_dict   = NULL;          char                    *brick_tmpstr  = NULL;          int                      start_remove  = 0; +        uint32_t                 commit_hash   = 0; +          this = THIS;          GF_ASSERT (this); @@ -2262,6 +2264,9 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)                  break;          }          if (!force && need_rebalance) { +                if (dict_get_uint32(dict, "commit-hash", &commit_hash) == 0) { +                        volinfo->rebal.commit_hash = commit_hash; +                }                  /* perform the rebalance operations */                  ret = glusterd_handle_defrag_start                          (volinfo, err_str, sizeof (err_str), diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index 858f0771ca6..bc0763483fd 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -3404,6 +3404,36 @@ out:  }  int +gd_set_commit_hash (dict_t *dict) +{ +        struct timeval          tv; +        uint32_t                hash; + +        /* +         * We need a commit hash that won't conflict with others we might have +         * set, or zero which is the implicit value if we never have.  Using +         * seconds<<3 like this ensures that we'll only get a collision if two +         * consecutive rebalances are separated by exactly 2^29 seconds - about +         * 17 years - and even then there's only a 1/8 chance of a collision in +         * the low order bits.  It's far more likely that this code will have +         * changed completely by then.  If not, call me in 2031. +         * +         * P.S. Time zone changes?  Yeah, right. 
+         */ +        gettimeofday (&tv, NULL); +        hash = tv.tv_sec << 3; + +        /* +         * Make sure at least one of those low-order bits is set.  The extra +         * shifting is because not all machines have sub-millisecond time +         * resolution. +         */ +        hash |= 1 << ((tv.tv_usec >> 10) % 3); + +        return dict_set_uint32 (dict, "commit-hash", hash); +} + +int  glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)  {          int                     ret = -1; @@ -3415,6 +3445,7 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)          uint32_t                status_cmd = GF_CLI_STATUS_NONE;          char                    *errstr = NULL;          xlator_t                *this = NULL; +        gf_boolean_t            do_common = _gf_false;          GF_ASSERT (req); @@ -3503,12 +3534,6 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)                          }                          break; -                case GD_OP_SYNC_VOLUME: -                        { -                                dict_copy (dict, req_dict); -                                break; -                        } -                  case GD_OP_REMOVE_BRICK:                          {                                  dict_t *dict = ctx; @@ -3525,6 +3550,10 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)                                  if (ret)                                          goto out; +                                if (gd_set_commit_hash(dict) != 0) { +                                        goto out; +                                } +                                  dict_destroy (req_dict);                                  req_dict = dict_ref (dict);                          } @@ -3544,8 +3573,10 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)                                          dict_copy (dict, 
req_dict);                                          break;                                  } +                                do_common = _gf_true;                          } -                        /*fall-through*/ +                        break; +                  case GD_OP_DELETE_VOLUME:                  case GD_OP_START_VOLUME:                  case GD_OP_STOP_VOLUME: @@ -3555,7 +3586,6 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)                  case GD_OP_LOG_ROTATE:                  case GD_OP_QUOTA:                  case GD_OP_PROFILE_VOLUME: -                case GD_OP_REBALANCE:                  case GD_OP_HEAL_VOLUME:                  case GD_OP_STATEDUMP_VOLUME:                  case GD_OP_CLEARLOCKS_VOLUME: @@ -3563,49 +3593,62 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)                  case GD_OP_BARRIER:                  case GD_OP_BITROT:                          { -                                ret = dict_get_str (dict, "volname", &volname); -                                if (ret) { -                                        gf_log (this->name, GF_LOG_CRITICAL, -                                                "volname is not present in " -                                                "operation ctx"); -                                        goto out; -                                } - -                                if (strcasecmp (volname, "all")) { -                                        ret = glusterd_dict_set_volid (dict, -                                                                       volname, -                                                                     op_errstr); -                                        if (ret) -                                                goto out; -                                } -                                dict_copy (dict, req_dict); +                                do_common = _gf_true;                          }   
                       break; -                case GD_OP_COPY_FILE: +                case GD_OP_REBALANCE:                          { -                                dict_copy (dict, req_dict); -                                break; +                                if (gd_set_commit_hash(dict) != 0) { +                                        goto out; +                                } +                                do_common = _gf_true;                          } +                        break; +                case GD_OP_SYNC_VOLUME: +                case GD_OP_COPY_FILE:                  case GD_OP_SYS_EXEC:                          {                                  dict_copy (dict, req_dict); -                                break;                          } +                        break;                  case GD_OP_GANESHA:                          {                                  dict_copy (dict, req_dict); -                                break;                          } +                        break;                  default:                          break;          } -        *req = req_dict; -        ret = 0; +        /* +         * This has been moved out of the switch so that multiple ops with +         * other special needs can all "fall through" to it. 
+         */ +        if (do_common) { +                ret = dict_get_str (dict, "volname", &volname); +                if (ret) { +                        gf_log (this->name, GF_LOG_CRITICAL, +                                "volname is not present in " +                                "operation ctx"); +                        goto out; +                } + +                if (strcasecmp (volname, "all")) { +                        ret = glusterd_dict_set_volid (dict, +                                                       volname, +                                                     op_errstr); +                        if (ret) +                                goto out; +                } +                dict_copy (dict, req_dict); +        } + +       *req = req_dict; +       ret = 0;  out:          return ret; diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c index 48d9a706042..cf8ee3a79f7 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c +++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c @@ -284,6 +284,9 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,          runner_argprintf ( &runner, "*dht.rebalance-cmd=%d",cmd);          runner_add_arg (&runner, "--xlator-option");          runner_argprintf (&runner, "*dht.node-uuid=%s", uuid_utoa(MY_UUID)); +        runner_add_arg (&runner, "--xlator-option"); +        runner_argprintf (&runner, "*dht.commit-hash=%u", +                          volinfo->rebal.commit_hash);          runner_add_arg (&runner, "--socket-file");          runner_argprintf (&runner, "%s",sockfile);          runner_add_arg (&runner, "--pid-file"); @@ -716,6 +719,7 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict)          char                    *task_id_str = NULL;          dict_t                  *ctx = NULL;          xlator_t                *this = NULL; +        uint32_t                commit_hash;          
this = THIS;          GF_ASSERT (this); @@ -804,6 +808,9 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict)                          glusterd_store_perform_node_state_store (volinfo);                          break;                  } +                if (dict_get_uint32 (dict, "commit-hash", &commit_hash) == 0) { +                        volinfo->rebal.commit_hash = commit_hash; +                }                  ret = glusterd_handle_defrag_start (volinfo, msg, sizeof (msg),                                                      cmd, NULL, GD_OP_REBALANCE);                  break; diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 3f2ff45f1a1..5341192e84a 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -286,6 +286,7 @@ struct glusterd_rebalance_ {          glusterd_op_t            op;          dict_t                  *dict; /* Dict to store misc information                                          * like list of bricks being removed */ +        uint32_t                 commit_hash;  };  typedef struct glusterd_rebalance_ glusterd_rebalance_t;  | 
