-rwxr-xr-x  tests/bugs/bug-902610.t                          |  44
-rwxr-xr-x  tests/features/weighted-rebalance.t              |  91
-rw-r--r--  xlators/cluster/dht/src/dht-common.h             |   4
-rw-r--r--  xlators/cluster/dht/src/dht-diskusage.c          |  27
-rw-r--r--  xlators/cluster/dht/src/dht-selfheal.c           | 136
-rw-r--r--  xlators/cluster/dht/src/dht-shared.c             |  13
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-volume-set.c  |   4
7 files changed, 269 insertions(+), 50 deletions(-)
diff --git a/tests/bugs/bug-902610.t b/tests/bugs/bug-902610.t
index 00ba03adfce..3f26fdde970 100755
--- a/tests/bugs/bug-902610.t
+++ b/tests/bugs/bug-902610.t
@@ -8,27 +8,33 @@ cleanup;
function get_layout()
{
layout1=`getfattr -n trusted.glusterfs.dht -e hex $1 2>&1|grep dht |cut -d = -f2`
+ layout1_s=$(echo $layout1 | cut -c 19-26)
+ layout1_e=$(echo $layout1 | cut -c 27-34)
+ #echo "layout1 from $layout1_s to $layout1_e" > /dev/tty
layout2=`getfattr -n trusted.glusterfs.dht -e hex $2 2>&1|grep dht |cut -d = -f2`
+ layout2_s=$(echo $layout2 | cut -c 19-26)
+ layout2_e=$(echo $layout2 | cut -c 27-34)
+ #echo "layout2 from $layout2_s to $layout2_e" > /dev/tty
+
+ if [ x"$layout2_s" = x"00000000" ]; then
+ # Reverse so we only have the real logic in one place.
+ tmp_s=$layout1_s
+ tmp_e=$layout1_e
+ layout1_s=$layout2_s
+ layout1_e=$layout2_e
+ layout2_s=$tmp_s
+ layout2_e=$tmp_e
+ fi
+
+ # Figure out where the join point is.
+ target=$(python -c "print '%08x' % (0x$layout1_e + 1)")
+ #echo "target for layout2 = $target" > /dev/tty
+
+ # The second layout should cover everything that the first doesn't.
+ if [ x"$layout2_s" = x"$target" -a x"$layout2_e" = x"ffffffff" ]; then
+ return 0
+ fi
- if [ $layout1 == "0x0000000100000000000000007ffffffe" ]
- then
- if [ $layout2 == "0x00000001000000007fffffffffffffff" ]
- then
- return 0
- else
- return 1
- fi
- fi
-
- if [ $layout2 == "0x0000000100000000000000007ffffffe" ]
- then
- if [ $layout1 == "0x00000001000000007fffffffffffffff" ]
- then
- return 0
- else
- return 1
- fi
- fi
return 1
}
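
For reference, the trusted.glusterfs.dht value parsed above is four
big-endian 32-bit words in hex; the third and fourth words are the
range start and stop, which is why the script cuts characters 19-26
and 27-34, skipping the leading "0x" and the first two words. A
minimal standalone C sketch of the same parsing, with a hypothetical
helper name that is not part of this change:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Extract the start/stop range from a dump such as
     * "0x0000000100000000000000007ffffffe" (illustration only). */
    static int
    parse_dht_range (const char *hex, uint32_t *start, uint32_t *stop)
    {
            char word[9] = {0};

            /* "0x" plus four 8-digit words is 34 characters. */
            if (!hex || strlen (hex) < 34)
                    return -1;
            memcpy (word, hex + 18, 8);   /* chars 19-26: range start */
            *start = strtoul (word, NULL, 16);
            memcpy (word, hex + 26, 8);   /* chars 27-34: range stop */
            *stop = strtoul (word, NULL, 16);
            return 0;
    }

    int
    main (void)
    {
            uint32_t s = 0, e = 0;

            parse_dht_range ("0x0000000100000000000000007ffffffe", &s, &e);
            printf ("%08x - %08x\n", s, e);   /* 00000000 - 7ffffffe */
            return 0;
    }

With two bricks, the test then only needs to assert that the second
range starts one past the first range's stop and ends at ffffffff.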
diff --git a/tests/features/weighted-rebalance.t b/tests/features/weighted-rebalance.t
new file mode 100755
index 00000000000..a5e746970ae
--- /dev/null
+++ b/tests/features/weighted-rebalance.t
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../dht.rc
+
+NFILES=1000
+
+touch_files () {
+ for i in $(seq 1 $NFILES); do
+ touch $(printf $M0/dir/file%02d $i) 2> /dev/null
+ done
+}
+
+count_files () {
+ found=0
+ for i in $(seq 1 $NFILES); do
+ if [ -f $(printf $1/dir/file%02d $i) ]; then
+ found=$((found+1))
+ fi
+ done
+ echo $found
+}
+
+wait_for_rebalance () {
+ while true; do
+ rebalance_completed
+ if [ $? -eq 1 ]; then
+ sleep 1
+ else
+ break
+ fi
+ done
+}
+
+get_xattr () {
+ cmd="getfattr --absolute-names --only-values -n trusted.glusterfs.dht"
+ $cmd $1 | od -tx1 -An | tr -d ' '
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info
+
+TEST mkdir ${B0}/${V0}{1,2}
+
+TEST truncate --size $((40*1024*1024)) ${B0}/disk1
+TEST mkfs.xfs -f -i size=512 ${B0}/disk1
+TEST mount -o loop ${B0}/disk1 ${B0}/${V0}1
+
+TEST truncate --size $((80*1024*1024)) ${B0}/disk2
+TEST mkfs.xfs -f -i size=512 ${B0}/disk2
+TEST mount -o loop ${B0}/disk2 ${B0}/${V0}2
+
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2}
+EXPECT "$V0" volinfo_field $V0 'Volume Name'
+EXPECT 'Created' volinfo_field $V0 'Status'
+
+TEST $CLI volume start $V0
+EXPECT 'Started' volinfo_field $V0 'Status'
+
+# Create some files for later tests.
+TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
+TEST mkdir $M0/dir
+TEST touch_files
+TEST umount $M0
+
+# Check that the larger brick got more of the files.
+nfiles=$(count_files ${B0}/${V0}2)
+echo $nfiles $(get_xattr ${B0}/${V0}1) $(get_xattr ${B0}/${V0}2) > /dev/tty
+TEST [ $nfiles -ge 580 ]
+
+# Turn off the size-weighted rebalance.
+TEST $CLI volume set $V0 cluster.weighted-rebalance off
+
+# Rebalance again and check that the distribution is even again.
+TEST $CLI volume rebalance $V0 start force
+TEST wait_for_rebalance
+nfiles=$(count_files ${B0}/${V0}2)
+echo $nfiles $(get_xattr ${B0}/${V0}1) $(get_xattr ${B0}/${V0}2) > /dev/tty
+TEST [ $nfiles -le 580 ]
+
+exit 0
+
+$CLI volume stop $V0
+umount ${B0}/${V0}{1,2}
+rm -f ${B0}/disk{1,2}
+
+cleanup
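
About the 580 thresholds above: with a 40MB and an 80MB brick,
weighted allocation should steer roughly 80/(40+80), i.e. about two
thirds of the 1000 files (~667), to the larger brick, while an even
split would put ~500 there; 580 sits between the two with slack for
hash variance. A quick standalone check of that arithmetic, not part
of the test itself:

    #include <stdio.h>

    int
    main (void)
    {
            unsigned chunks1 = 40, chunks2 = 80, nfiles = 1000;
            /* Expected share of the larger brick under weighting. */
            unsigned expect2 = nfiles * chunks2 / (chunks1 + chunks2);

            printf ("expected on brick 2: ~%u of %u files\n",
                    expect2, nfiles);   /* ~666 of 1000 */
            return 0;
    }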
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 5dee622a2a1..54f885d18b0 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -208,6 +208,7 @@ struct dht_du {
double avail_inodes;
uint64_t avail_space;
uint32_t log;
+ uint32_t chunks;
};
typedef struct dht_du dht_du_t;
@@ -315,6 +316,9 @@ struct dht_conf {
char *xattr_name;
char *link_xattr_name;
char *wild_xattr_name;
+
+ /* Support size-weighted rebalancing (heterogeneous bricks). */
+ gf_boolean_t do_weighting;
gf_boolean_t randomize_by_gfid;
};
typedef struct dht_conf dht_conf_t;
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 8664f550ba2..a2dc43c32aa 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -37,6 +37,8 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
double percent = 0;
double percent_inodes = 0;
uint64_t bytes = 0;
+ uint32_t bpc; /* blocks per chunk */
+ uint32_t chunks = 0;
conf = this->private;
prev = cookie;
@@ -50,17 +52,28 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (statvfs && statvfs->f_blocks) {
percent = (statvfs->f_bavail * 100) / statvfs->f_blocks;
bytes = (statvfs->f_bavail * statvfs->f_frsize);
+ /*
+ * A 32-bit count of 1MB chunks allows a maximum brick size of
+ * ~4PB. It's possible that we could see a single local FS
+ * bigger than that some day, but this code is likely to be
+ * irrelevant by then. Meanwhile, it's more important to keep
+ * the chunk size small so the layout-calculation code that
+ * uses this value can be tested on normal machines.
+ */
+ bpc = (1 << 20) / statvfs->f_bsize;
+ chunks = (statvfs->f_blocks + bpc - 1) / bpc;
}
if (statvfs && statvfs->f_files) {
percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files;
} else {
- /* set percent inodes to 100 for dynamically allocated inode filesystems
- this logic holds good so that, distribute has nothing to worry about
- total inodes rather let the 'create()' to be scheduled on the hashed
- subvol regardless of the total inodes. since we have no awareness on
- loosing inodes this logic fits well
- */
+ /*
+ * Set percent inodes to 100 for dynamically allocated inode
+ * filesystems. The rationale is that distribute need not
+ * worry about total inodes; rather, let the 'create()' be
+ * scheduled on the hashed subvol regardless of the total
+ * inodes.
+ */
percent_inodes = 100;
}
@@ -71,6 +84,7 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
conf->du_stats[i].avail_percent = percent;
conf->du_stats[i].avail_space = bytes;
conf->du_stats[i].avail_inodes = percent_inodes;
+ conf->du_stats[i].chunks = chunks;
gf_msg_debug (this->name, 0,
"subvolume '%s': avail_percent "
"is: %.2f and avail_space "
@@ -80,6 +94,7 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
conf->du_stats[i].avail_percent,
conf->du_stats[i].avail_space,
conf->du_stats[i].avail_inodes);
+ break; /* no point in looping further */
}
}
UNLOCK (&conf->subvolume_lock);
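
To make the chunk arithmetic concrete: with a typical 4KB block size,
bpc = (1 << 20) / 4096 = 256 blocks per 1MB chunk, so a brick
reporting 10,000,000 blocks (about 38GB) rounds up to 39,063 chunks,
and a 32-bit chunk count caps a brick at 2^32 MB = 4PB. A standalone
sketch of the same computation; the sample statvfs numbers are made
up:

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
            uint64_t f_blocks = 10000000;   /* sample statvfs value */
            uint32_t f_bsize = 4096;
            uint32_t bpc = (1 << 20) / f_bsize;   /* blocks per 1MB chunk */
            uint32_t chunks = (f_blocks + bpc - 1) / bpc;   /* round up */

            printf ("bpc = %u, chunks = %u\n", bpc, chunks);   /* 256, 39063 */
            return 0;
    }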
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index f476e44e0c1..a92dba89d2b 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -20,11 +20,11 @@
#include "dht-messages.h"
#include "glusterfs-acl.h"
-#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \
+#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,path) do { \
layout->list[i].start = srt; \
layout->list[i].stop = srt + chunk - 1; \
\
- gf_msg_trace (this->name, 0, \
+ gf_msg_trace (this->name, 0, \
"gave fix: %u - %u on %s for %s", \
layout->list[i].start, \
layout->list[i].stop, \
@@ -952,6 +952,18 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,
new_layout->list[i].xlator = layout->list[i].xlator;
}
+ if (priv->du_stats) {
+ for (i = 0; i < priv->subvolume_cnt; ++i) {
+ gf_log (this->name, GF_LOG_INFO,
+ "subvolume %d (%s): %u chunks", i,
+ priv->subvolumes[i]->name,
+ priv->du_stats[i].chunks);
+ }
+ }
+ else {
+ gf_log (this->name, GF_LOG_WARNING, "no du stats ?!?");
+ }
+
/* First give it a layout as though it is a new directory. This
ensures rotation to kick in */
dht_layout_sort_volname (new_layout);
@@ -976,6 +988,32 @@ done:
}
+/*
+ * Having to call this twice for each entry in the layout is pretty horrible,
+ * that's what all of this layout-sorting nonsense gets us.
+ */
+uint32_t
+dht_get_chunks_from_xl (xlator_t *parent, xlator_t *child)
+{
+ dht_conf_t *priv = parent->private;
+ xlator_list_t *trav;
+ uint32_t index = 0;
+
+ if (!priv->du_stats) {
+ return 0;
+ }
+
+ for (trav = parent->children; trav; trav = trav->next) {
+ if (trav->xlator == child) {
+ return priv->du_stats[index].chunks;
+ }
+ ++index;
+ }
+
+ return 0;
+}
+
+
void
dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
dht_layout_t *layout)
@@ -984,44 +1022,92 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
uint32_t chunk = 0;
int i = 0;
uint32_t start = 0;
- int cnt = 0;
+ int bricks_to_use = 0;
int err = 0;
int start_subvol = 0;
+ uint32_t curr_size;
+ uint32_t total_size = 0;
+ int real_i;
+ dht_conf_t *priv;
+ gf_boolean_t weight_by_size;
+ int bricks_used = 0;
this = frame->this;
+ priv = this->private;
+ weight_by_size = priv->do_weighting;
+
+ bricks_to_use = dht_get_layout_count (this, layout, 1);
+ GF_ASSERT (bricks_to_use > 0);
- cnt = dht_get_layout_count (this, layout, 1);
+ bricks_used = 0;
+ for (i = 0; i < layout->cnt; ++i) {
+ err = layout->list[i].err;
+ if ((err != -1) && (err != ENOENT)) {
+ continue;
+ }
+ curr_size = dht_get_chunks_from_xl (this,
+ layout->list[i].xlator);
+ if (!curr_size) {
+ weight_by_size = _gf_false;
+ break;
+ }
+ total_size += curr_size;
+ if (++bricks_used >= bricks_to_use) {
+ break;
+ }
+ }
- chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1);
+ if (weight_by_size) {
+ /* We know total_size is not zero. */
+ chunk = ((unsigned long) 0xffffffff) / total_size;
+ gf_log (this->name, GF_LOG_INFO,
+ "chunk size = 0xffffffff / %u = 0x%x",
+ total_size, chunk);
+ }
+ else {
+ chunk = ((unsigned long) 0xffffffff) / bricks_used;
+ }
start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout);
/* clear out the range, as we are re-computing here */
DHT_RESET_LAYOUT_RANGE (layout);
- for (i = start_subvol; i < layout->cnt; i++) {
- err = layout->list[i].err;
- if (err == -1 || err == ENOENT) {
- DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
- cnt, loc->path);
- if (--cnt == 0) {
- layout->list[i].stop = 0xffffffff;
- goto done;
- }
- start += chunk;
- }
- }
- for (i = 0; i < start_subvol; i++) {
+ /*
+ * OK, what's this "real_i" stuff about? This used to be two loops -
+ * from start_subvol to layout->cnt-1, then from 0 to start_subvol-1.
+ * That way is practically an open invitation to bugs when only one
+ * of the loops is updated. Using real_i and modulo operators to make
+ * it one loop avoids this problem. Remember, folks: it's everyone's
+ * responsibility to help stamp out copy/paste abuse.
+ */
+ bricks_used = 0;
+ for (real_i = 0; real_i < layout->cnt; real_i++) {
+ i = (real_i + start_subvol) % layout->cnt;
err = layout->list[i].err;
- if (err == -1 || err == ENOENT) {
- DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
- cnt, loc->path);
- if (--cnt == 0) {
- layout->list[i].stop = 0xffffffff;
- goto done;
+ if ((err != -1) && (err != ENOENT)) {
+ continue;
+ }
+ if (weight_by_size) {
+ curr_size = dht_get_chunks_from_xl (this,
+ layout->list[i].xlator);
+ if (!curr_size) {
+ continue;
}
- start += chunk;
}
+ else {
+ curr_size = 1;
+ }
+ gf_log (this->name, GF_LOG_INFO,
+ "assigning range size 0x%x to %s", chunk * curr_size,
+ layout->list[i].xlator->name);
+ DHT_SET_LAYOUT_RANGE(layout, i, start, chunk * curr_size,
+ loc->path);
+ if (++bricks_used >= bricks_to_use) {
+ layout->list[i].stop = 0xffffffff;
+ goto done;
+ }
+ start += (chunk * curr_size);
}
done:
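
Taken together, the new-directory path divides the hash space by the
total chunk count of the participating bricks and gives each brick a
range chunk * curr_size wide, extending the last brick to 0xffffffff
to absorb the integer-division remainder. A standalone sketch with
two hypothetical bricks of 40 and 80 chunks:

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
            uint32_t sizes[] = { 40, 80 };   /* chunks per brick (made up) */
            int n = 2, i;
            uint32_t total = 0, start = 0, stop = 0, chunk;

            for (i = 0; i < n; i++)
                    total += sizes[i];
            chunk = 0xffffffff / total;      /* hash units per chunk */

            for (i = 0; i < n; i++) {
                    /* Last brick absorbs the rounding remainder. */
                    stop = (i == n - 1) ? 0xffffffff
                                        : start + chunk * sizes[i] - 1;
                    printf ("brick %d: %08x - %08x\n", i, start, stop);
                    start += chunk * sizes[i];
            }
            return 0;
    }

The 80-chunk brick ends up with twice the hash range of the 40-chunk
one, which is what weighted-rebalance.t measures in file counts.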
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index 4748d2a4f61..f8faecf6870 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -419,6 +419,9 @@ dht_reconfigure (xlator_t *this, dict_t *options)
dht_init_regex (this, options, "extra-hash-regex",
&conf->extra_regex, &conf->extra_regex_valid);
+ GF_OPTION_RECONF ("weighted-rebalance", conf->do_weighting, options,
+ bool, out);
+
ret = 0;
out:
return ret;
@@ -658,6 +661,8 @@ dht_init (xlator_t *this)
goto err;
}
+ GF_OPTION_INIT ("weighted-rebalance", conf->do_weighting, bool, err);
+
this->private = conf;
return 0;
@@ -790,6 +795,14 @@ struct volume_options options[] = {
"below it."
},
+ { .key = {"weighted-rebalance"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "When enabled, files will be allocated to bricks "
+ "with a probability proportional to the size of the brick. "
+ "Otherwise, all bricks will have the same probability "
+ "(legacy behavior)."
+ },
+
/* NUFA option */
{ .key = {"local-volume-name"},
.type = GF_OPTION_TYPE_XLATOR
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 92ab3d1a3a3..5358d52a43a 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -420,6 +420,10 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = 3,
.flags = OPT_FLAG_CLIENT_OPT
},
+ { .key = "cluster.weighted-rebalance",
+ .voltype = "cluster/distribute",
+ .op_version = GD_OP_VERSION_3_6_0,
+ },
/* Switch xlator options (Distribute special case) */
{ .key = "cluster.switch",