-rwxr-xr-x  tests/bugs/bug-902610.t                          |  44
-rwxr-xr-x  tests/features/weighted-rebalance.t              |  91
-rw-r--r--  xlators/cluster/dht/src/dht-common.h             |   4
-rw-r--r--  xlators/cluster/dht/src/dht-diskusage.c          |  27
-rw-r--r--  xlators/cluster/dht/src/dht-selfheal.c           | 136
-rw-r--r--  xlators/cluster/dht/src/dht-shared.c             |  13
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-volume-set.c  |   4
7 files changed, 269 insertions(+), 50 deletions(-)
diff --git a/tests/bugs/bug-902610.t b/tests/bugs/bug-902610.t
index 00ba03adfce..3f26fdde970 100755
--- a/tests/bugs/bug-902610.t
+++ b/tests/bugs/bug-902610.t
@@ -8,27 +8,33 @@ cleanup;
function get_layout()
{
layout1=`getfattr -n trusted.glusterfs.dht -e hex $1 2>&1|grep dht |cut -d = -f2`
+ layout1_s=$(echo $layout1 | cut -c 19-26)
+ layout1_e=$(echo $layout1 | cut -c 27-34)
+ #echo "layout1 from $layout1_s to $layout1_e" > /dev/tty
layout2=`getfattr -n trusted.glusterfs.dht -e hex $2 2>&1|grep dht |cut -d = -f2`
+ layout2_s=$(echo $layout2 | cut -c 19-26)
+ layout2_e=$(echo $layout2 | cut -c 27-34)
+ #echo "layout2 from $layout2_s to $layout2_e" > /dev/tty
+
+ if [ x"$layout2_s" = x"00000000" ]; then
+ # Reverse so we only have the real logic in one place.
+ tmp_s=$layout1_s
+ tmp_e=$layout1_e
+ layout1_s=$layout2_s
+ layout1_e=$layout2_e
+ layout2_s=$tmp_s
+ layout2_e=$tmp_e
+ fi
+
+ # Figure out where the join point is.
+ target=$(python -c "print '%08x' % (0x$layout1_e + 1)")
+ #echo "target for layout2 = $target" > /dev/tty
+
+ # The second layout should cover everything that the first doesn't.
+ if [ x"$layout2_s" = x"$target" -a x"$layout2_e" = x"ffffffff" ]; then
+ return 0
+ fi
- if [ $layout1 == "0x0000000100000000000000007ffffffe" ]
- then
- if [ $layout2 == "0x00000001000000007fffffffffffffff" ]
- then
- return 0
- else
- return 1
- fi
- fi
-
- if [ $layout2 == "0x0000000100000000000000007ffffffe" ]
- then
- if [ $layout1 == "0x00000001000000007fffffffffffffff" ]
- then
- return 0
- else
- return 1
- fi
- fi
return 1
}
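
For reference, the trusted.glusterfs.dht value parsed above is four
big-endian 32-bit words in hex; the third and fourth words are the
range start and stop, which is why the script cuts characters 19-26
and 27-34, skipping the leading "0x" and the first two words. A
minimal standalone C sketch of the same parsing, with a hypothetical
helper name that is not part of this change:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Extract the start/stop range from a dump such as
     * "0x0000000100000000000000007ffffffe" (illustration only). */
    static int
    parse_dht_range (const char *hex, uint32_t *start, uint32_t *stop)
    {
            char word[9] = {0};

            /* "0x" plus four 8-digit words is 34 characters. */
            if (!hex || strlen (hex) < 34)
                    return -1;
            memcpy (word, hex + 18, 8);   /* chars 19-26: range start */
            *start = strtoul (word, NULL, 16);
            memcpy (word, hex + 26, 8);   /* chars 27-34: range stop */
            *stop = strtoul (word, NULL, 16);
            return 0;
    }

    int
    main (void)
    {
            uint32_t s = 0, e = 0;

            parse_dht_range ("0x0000000100000000000000007ffffffe", &s, &e);
            printf ("%08x - %08x\n", s, e);   /* 00000000 - 7ffffffe */
            return 0;
    }

With two bricks, the test then only needs to assert that the second
range starts one past the first range's stop and ends at ffffffff.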
diff --git a/tests/features/weighted-rebalance.t b/tests/features/weighted-rebalance.t
new file mode 100755
index 00000000000..a5e746970ae
--- /dev/null
+++ b/tests/features/weighted-rebalance.t
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../dht.rc
+
+NFILES=1000
+
+touch_files () {
+ for i in $(seq 1 $NFILES); do
+ touch $(printf $M0/dir/file%02d $i) 2> /dev/null
+ done
+}
+
+count_files () {
+ found=0
+ for i in $(seq 1 $NFILES); do
+ if [ -f $(printf $1/dir/file%02d $i) ]; then
+ found=$((found+1))
+ fi
+ done
+ echo $found
+}
+
+wait_for_rebalance () {
+ while true; do
+ rebalance_completed
+ if [ $? -eq 1 ]; then
+ sleep 1
+ else
+ break
+ fi
+ done
+}
+
+get_xattr () {
+ cmd="getfattr --absolute-names --only-values -n trusted.glusterfs.dht"
+ $cmd $1 | od -tx1 -An | tr -d ' '
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info
+
+TEST mkdir ${B0}/${V0}{1,2}
+
+TEST truncate --size $((40*1024*1024)) ${B0}/disk1
+TEST mkfs.xfs -f -i size=512 ${B0}/disk1
+TEST mount -o loop ${B0}/disk1 ${B0}/${V0}1
+
+TEST truncate --size $((80*1024*1024)) ${B0}/disk2
+TEST mkfs.xfs -f -i size=512 ${B0}/disk2
+TEST mount -o loop ${B0}/disk2 ${B0}/${V0}2
+
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2}
+EXPECT "$V0" volinfo_field $V0 'Volume Name'
+EXPECT 'Created' volinfo_field $V0 'Status'
+
+TEST $CLI volume start $V0
+EXPECT 'Started' volinfo_field $V0 'Status'
+
+# Create some files for later tests.
+TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
+TEST mkdir $M0/dir
+TEST touch_files
+TEST umount $M0
+
+# Check that the larger brick got more of the files.
+nfiles=$(count_files ${B0}/${V0}2)
+echo $nfiles $(get_xattr ${B0}/${V0}1) $(get_xattr ${B0}/${V0}2) > /dev/tty
+TEST [ $nfiles -ge 580 ]
+
+# Turn off the size-weighted rebalance.
+TEST $CLI volume set $V0 cluster.weighted-rebalance off
+
+# Rebalance again and check that the distribution is even again.
+TEST $CLI volume rebalance $V0 start force
+TEST wait_for_rebalance
+nfiles=$(count_files ${B0}/${V0}2)
+echo $nfiles $(get_xattr ${B0}/${V0}1) $(get_xattr ${B0}/${V0}2) > /dev/tty
+TEST [ $nfiles -le 580 ]
+
+exit 0
+
+$CLI volume stop $V0
+umount ${B0}/${V0}{1,2}
+rm -f ${B0}/disk{1,2}
+
+cleanup
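
About the 580 thresholds above: with a 40MB and an 80MB brick,
weighted allocation should steer roughly 80/(40+80), i.e. about two
thirds of the 1000 files (~667), to the larger brick, while an even
split would put ~500 there; 580 sits between the two with slack for
hash variance. A quick standalone check of that arithmetic, not part
of the test itself:

    #include <stdio.h>

    int
    main (void)
    {
            unsigned chunks1 = 40, chunks2 = 80, nfiles = 1000;
            /* Expected share of the larger brick under weighting. */
            unsigned expect2 = nfiles * chunks2 / (chunks1 + chunks2);

            printf ("expected on brick 2: ~%u of %u files\n",
                    expect2, nfiles);   /* ~666 of 1000 */
            return 0;
    }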
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 5dee622a2a1..54f885d18b0 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -208,6 +208,7 @@ struct dht_du {
double avail_inodes;
uint64_t avail_space;
uint32_t log;
+ uint32_t chunks;
};
typedef struct dht_du dht_du_t;
@@ -315,6 +316,9 @@ struct dht_conf {
char *xattr_name;
char *link_xattr_name;
char *wild_xattr_name;
+
+ /* Support size-weighted rebalancing (heterogeneous bricks). */
+ gf_boolean_t do_weighting;
gf_boolean_t randomize_by_gfid;
};
typedef struct dht_conf dht_conf_t;
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 8664f550ba2..a2dc43c32aa 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -37,6 +37,8 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
double percent = 0;
double percent_inodes = 0;
uint64_t bytes = 0;
+ uint32_t bpc; /* blocks per chunk */
+ uint32_t chunks = 0;
conf = this->private;
prev = cookie;
@@ -50,17 +52,28 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (statvfs && statvfs->f_blocks) {
percent = (statvfs->f_bavail * 100) / statvfs->f_blocks;
bytes = (statvfs->f_bavail * statvfs->f_frsize);
+ /*
+ * A 32-bit count of 1MB chunks allows a maximum brick size of
+ * ~4PB. It's possible that we could see a single local FS
+ * bigger than that some day, but this code is likely to be
+ * irrelevant by then. Meanwhile, it's more important to keep
+ * the chunk size small so the layout-calculation code that
+ * uses this value can be tested on normal machines.
+ */
+ bpc = (1 << 20) / statvfs->f_bsize;
+ chunks = (statvfs->f_blocks + bpc - 1) / bpc;
}
if (statvfs && statvfs->f_files) {
percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files;
} else {
- /* set percent inodes to 100 for dynamically allocated inode filesystems
- this logic holds good so that, distribute has nothing to worry about
- total inodes rather let the 'create()' to be scheduled on the hashed
- subvol regardless of the total inodes. since we have no awareness on
- loosing inodes this logic fits well
- */
+ /*
+ * Set percent inodes to 100 for dynamically allocated inode
+ * filesystems. The rationale is that distribute need not
+ * worry about total inodes; rather, let the 'create()' be
+ * scheduled on the hashed subvol regardless of the total
+ * inodes.
+ */
percent_inodes = 100;
}
@@ -71,6 +84,7 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
conf->du_stats[i].avail_percent = percent;
conf->du_stats[i].avail_space = bytes;
conf->du_stats[i].avail_inodes = percent_inodes;
+ conf->du_stats[i].chunks = chunks;
gf_msg_debug (this->name, 0,
"subvolume '%s': avail_percent "
"is: %.2f and avail_space "
@@ -80,6 +94,7 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
conf->du_stats[i].avail_percent,
conf->du_stats[i].avail_space,
conf->du_stats[i].avail_inodes);
+ break; /* no point in looping further */
}
}
UNLOCK (&conf->subvolume_lock);
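
To make the chunk arithmetic concrete: with a typical 4KB block size,
bpc = (1 << 20) / 4096 = 256 blocks per 1MB chunk, so a brick
reporting 10,000,000 blocks (about 38GB) rounds up to 39,063 chunks,
and a 32-bit chunk count caps a brick at 2^32 MB = 4PB. A standalone
sketch of the same computation; the sample statvfs numbers are made
up:

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
            uint64_t f_blocks = 10000000;   /* sample statvfs value */
            uint32_t f_bsize = 4096;
            uint32_t bpc = (1 << 20) / f_bsize;   /* blocks per 1MB chunk */
            uint32_t chunks = (f_blocks + bpc - 1) / bpc;   /* round up */

            printf ("bpc = %u, chunks = %u\n", bpc, chunks);   /* 256, 39063 */
            return 0;
    }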
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index f476e44e0c1..a92dba89d2b 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -20,11 +20,11 @@
#include "dht-messages.h"
#include "glusterfs-acl.h"
-#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \
+#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,path) do { \
layout->list[i].start = srt; \
layout->list[i].stop = srt + chunk - 1; \
\
- gf_msg_trace (this->name, 0, \
+ gf_msg_trace (this->name, 0, \
"gave fix: %u - %u on %s for %s", \
layout->list[i].start, \
layout->list[i].stop, \
@@ -952,6 +952,18 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,
new_layout->list[i].xlator = layout->list[i].xlator;
}
+ if (priv->du_stats) {
+ for (i = 0; i < priv->subvolume_cnt; ++i) {
+ gf_log (this->name, GF_LOG_INFO,
+ "subvolume %d (%s): %u chunks", i,
+ priv->subvolumes[i]->name,
+ priv->du_stats[i].chunks);
+ }
+ }
+ else {
+ gf_log (this->name, GF_LOG_WARNING, "no du stats ?!?");
+ }
+
/* First give it a layout as though it is a new directory. This
ensures rotation to kick in */
dht_layout_sort_volname (new_layout);
@@ -976,6 +988,32 @@ done:
}
+/*
+ * Having to call this twice for each entry in the layout is pretty horrible,
+ * that's what all of this layout-sorting nonsense gets us.
+ */
+uint32_t
+dht_get_chunks_from_xl (xlator_t *parent, xlator_t *child)
+{
+ dht_conf_t *priv = parent->private;
+ xlator_list_t *trav;
+ uint32_t index = 0;
+
+ if (!priv->du_stats) {
+ return 0;
+ }
+
+ for (trav = parent->children; trav; trav = trav->next) {
+ if (trav->xlator == child) {
+ return priv->du_stats[index].chunks;
+ }
+ ++index;
+ }
+
+ return 0;
+}
+
+
void
dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
dht_layout_t *layout)
@@ -984,44 +1022,92 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
uint32_t chunk = 0;
int i = 0;
uint32_t start = 0;
- int cnt = 0;
+ int bricks_to_use = 0;
int err = 0;
int start_subvol = 0;
+ uint32_t curr_size;
+ uint32_t total_size = 0;
+ int real_i;
+ dht_conf_t *priv;
+ gf_boolean_t weight_by_size;
+ int bricks_used = 0;
this = frame->this;
+ priv = this->private;
+ weight_by_size = priv->do_weighting;
+
+ bricks_to_use = dht_get_layout_count (this, layout, 1);
+ GF_ASSERT (bricks_to_use > 0);
- cnt = dht_get_layout_count (this, layout, 1);
+ bricks_used = 0;
+ for (i = 0; i < layout->cnt; ++i) {
+ err = layout->list[i].err;
+ if ((err != -1) && (err != ENOENT)) {
+ continue;
+ }
+ curr_size = dht_get_chunks_from_xl (this,
+ layout->list[i].xlator);
+ if (!curr_size) {
+ weight_by_size = _gf_false;
+ break;
+ }
+ total_size += curr_size;
+ if (++bricks_used >= bricks_to_use) {
+ break;
+ }
+ }
- chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1);
+ if (weight_by_size) {
+ /* We know total_size is not zero. */
+ chunk = ((unsigned long) 0xffffffff) / total_size;
+ gf_log (this->name, GF_LOG_INFO,
+ "chunk size = 0xffffffff / %u = 0x%x",
+ total_size, chunk);
+ }
+ else {
+ chunk = ((unsigned long) 0xffffffff) / bricks_used;
+ }
start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout);
/* clear out the range, as we are re-computing here */
DHT_RESET_LAYOUT_RANGE (layout);
- for (i = start_subvol; i < layout->cnt; i++) {
- err = layout->list[i].err;
- if (err == -1 || err == ENOENT) {
- DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
- cnt, loc->path);
- if (--cnt == 0) {
- layout->list[i].stop = 0xffffffff;
- goto done;
- }
- start += chunk;
- }
- }
- for (i = 0; i < start_subvol; i++) {
+ /*
+ * OK, what's this "real_i" stuff about? This used to be two loops -
+ * from start_subvol to layout->cnt-1, then from 0 to start_subvol-1.
+ * That way is practically an open invitation to bugs when only one
+ * of the loops is updated. Using real_i and modulo operators to make
+ * it one loop avoids this problem. Remember, folks: it's everyone's
+ * responsibility to help stamp out copy/paste abuse.
+ */
+ bricks_used = 0;
+ for (real_i = 0; real_i < layout->cnt; real_i++) {
+ i = (real_i + start_subvol) % layout->cnt;
err = layout->list[i].err;
- if (err == -1 || err == ENOENT) {
- DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
- cnt, loc->path);
- if (--cnt == 0) {
- layout->list[i].stop = 0xffffffff;
- goto done;
+ if ((err != -1) && (err != ENOENT)) {
+ continue;
+ }
+ if (weight_by_size) {
+ curr_size = dht_get_chunks_from_xl (this,
+ layout->list[i].xlator);
+ if (!curr_size) {
+ continue;
}
- start += chunk;
}
+ else {
+ curr_size = 1;
+ }
+ gf_log (this->name, GF_LOG_INFO,
+ "assigning range size 0x%x to %s", chunk * curr_size,
+ layout->list[i].xlator->name);
+ DHT_SET_LAYOUT_RANGE(layout, i, start, chunk * curr_size,
+ loc->path);
+ if (++bricks_used >= bricks_to_use) {
+ layout->list[i].stop = 0xffffffff;
+ goto done;
+ }
+ start += (chunk * curr_size);
}
done:
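
Taken together, the new-directory path divides the hash space by the
total chunk count of the participating bricks and gives each brick a
range chunk * curr_size wide, extending the last brick to 0xffffffff
to absorb the integer-division remainder. A standalone sketch with
two hypothetical bricks of 40 and 80 chunks:

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
            uint32_t sizes[] = { 40, 80 };   /* chunks per brick (made up) */
            int n = 2, i;
            uint32_t total = 0, start = 0, stop = 0, chunk;

            for (i = 0; i < n; i++)
                    total += sizes[i];
            chunk = 0xffffffff / total;      /* hash units per chunk */

            for (i = 0; i < n; i++) {
                    /* Last brick absorbs the rounding remainder. */
                    stop = (i == n - 1) ? 0xffffffff
                                        : start + chunk * sizes[i] - 1;
                    printf ("brick %d: %08x - %08x\n", i, start, stop);
                    start += chunk * sizes[i];
            }
            return 0;
    }

The 80-chunk brick ends up with twice the hash range of the 40-chunk
one, which is what weighted-rebalance.t measures in file counts.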
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index 4748d2a4f61..f8faecf6870 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -419,6 +419,9 @@ dht_reconfigure (xlator_t *this, dict_t *options)
dht_init_regex (this, options, "extra-hash-regex",
&conf->extra_regex, &conf->extra_regex_valid);
+ GF_OPTION_RECONF ("weighted-rebalance", conf->do_weighting, options,
+ bool, out);
+
ret = 0;
out:
return ret;
@@ -658,6 +661,8 @@ dht_init (xlator_t *this)
goto err;
}
+ GF_OPTION_INIT ("weighted-rebalance", conf->do_weighting, bool, err);
+
this->private = conf;
return 0;
@@ -790,6 +795,14 @@ struct volume_options options[] = {
"below it."
},
+ { .key = {"weighted-rebalance"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "When enabled, files will be allocated to bricks "
+ "with a probability proportional to the size of the brick. "
+ "Otherwise, all bricks will have the same probability "
+ "(legacy behavior)."
+ },
+
/* NUFA option */
{ .key = {"local-volume-name"},
.type = GF_OPTION_TYPE_XLATOR
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 92ab3d1a3a3..5358d52a43a 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -420,6 +420,10 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = 3,
.flags = OPT_FLAG_CLIENT_OPT
},
+ { .key = "cluster.weighted-rebalance",
+ .voltype = "cluster/distribute",
+ .op_version = GD_OP_VERSION_3_6_0,
+ },
/* Switch xlator options (Distribute special case) */
{ .key = "cluster.switch",