summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/dht/src/dht-selfheal.c
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/dht/src/dht-selfheal.c')
-rw-r--r--xlators/cluster/dht/src/dht-selfheal.c464
1 files changed, 282 insertions, 182 deletions
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index 7ceb80157..3fe96b1c7 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -17,6 +17,7 @@
#include "glusterfs.h"
#include "xlator.h"
#include "dht-common.h"
+#include "glusterfs-acl.h"
#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \
layout->list[i].start = srt; \
@@ -28,42 +29,40 @@
layout->list[i].xlator->name, path); \
} while (0)
-static inline uint32_t
-dht_find_overlap (int idx, int cnk_idx, uint32_t start, uint32_t stop,
- uint32_t chunk_size)
-{
- uint32_t overlap = 0;
- uint32_t chunk_begin = 0;
+#define DHT_RESET_LAYOUT_RANGE(layout) do { \
+ int cnt = 0; \
+ for (cnt = 0; cnt < layout->cnt; cnt++ ) { \
+ layout->list[cnt].start = 0; \
+ layout->list[cnt].stop = 0; \
+ } \
+ } while (0)
- chunk_begin = cnk_idx * chunk_size;
+static uint32_t
+dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n)
+{
+ if (o >= old->cnt || n >= new->cnt)
+ return 0;
- /* There is no chance of overlap */
- if ((chunk_begin > stop) ||
- ((chunk_begin + chunk_size) < start))
- goto out;
+ if (old->list[o].err > 0 || new->list[n].err > 0)
+ return 0;
- if ((chunk_begin <= start) &&
- ((chunk_begin + chunk_size) <= stop)) {
- overlap = ((chunk_begin + chunk_size) - start);
- goto out;
+ if (old->list[o].start == old->list[o].stop) {
+ return 0;
}
- if ((chunk_begin <= start) &&
- ((chunk_begin + chunk_size) >= stop)) {
- overlap = (stop - start);
- goto out;
+ if (new->list[n].start == new->list[n].stop) {
+ return 0;
}
- if ((chunk_begin < stop) &&
- ((chunk_begin + chunk_size) >= stop)) {
- overlap = (stop - chunk_begin);
- goto out;
- }
+ if ((old->list[o].start > new->list[n].stop) ||
+ (old->list[o].stop < new->list[n].start))
+ return 0;
-out:
- return overlap;
+ return min (old->list[o].stop, new->list[n].stop) -
+ max (old->list[o].start, new->list[n].start) + 1;
}
+
int
dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret)
{
@@ -118,7 +117,8 @@ dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int
dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
- dht_layout_t *layout, int i)
+ dht_layout_t *layout, int i,
+ xlator_t *req_subvol)
{
xlator_t *subvol = NULL;
dict_t *xattr = NULL;
@@ -126,16 +126,22 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
xlator_t *this = NULL;
int32_t *disk_layout = NULL;
dht_local_t *local = NULL;
-
+ dht_conf_t *conf = NULL;
local = frame->local;
- subvol = layout->list[i].xlator;
+ if (req_subvol)
+ subvol = req_subvol;
+ else
+ subvol = layout->list[i].xlator;
this = frame->this;
GF_VALIDATE_OR_GOTO ("", this, err);
GF_VALIDATE_OR_GOTO (this->name, layout, err);
GF_VALIDATE_OR_GOTO (this->name, local, err);
GF_VALIDATE_OR_GOTO (this->name, subvol, err);
+ VALIDATE_OR_GOTO (this->private, err);
+
+ conf = this->private;
xattr = get_new_dict ();
if (!xattr) {
@@ -150,8 +156,7 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
goto err;
}
- ret = dict_set_bin (xattr, "trusted.glusterfs.dht",
- disk_layout, 4 * 4);
+ ret = dict_set_bin (xattr, conf->xattr_name, disk_layout, 4 * 4);
if (ret == -1) {
gf_log (this->name, GF_LOG_WARNING,
"%s: (subvol %s) failed to set xattr dictionary",
@@ -182,8 +187,7 @@ err:
if (xattr)
dict_destroy (xattr);
- if (disk_layout)
- GF_FREE (disk_layout);
+ GF_FREE (disk_layout);
dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this,
-1, ENOMEM, NULL);
@@ -197,21 +201,42 @@ dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
int i = 0;
int count = 0;
xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *dummy = NULL;
local = frame->local;
this = frame->this;
+ conf = this->private;
gf_log (this->name, GF_LOG_DEBUG,
"writing the new range for all subvolumes");
- local->call_cnt = count = layout->cnt;
+ local->call_cnt = count = conf->subvolume_cnt;
for (i = 0; i < layout->cnt; i++) {
- dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i);
+ dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL);
if (--count == 0)
- break;
+ goto out;
+ }
+ /* if we are here, subvolcount > layout_count. subvols-per-directory
+ * option might be set here. We need to clear out layout from the
+ * non-participating subvolumes, else it will result in overlaps */
+ dummy = dht_layout_new (this, 1);
+ if (!dummy)
+ goto out;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (_gf_false ==
+ dht_is_subvol_in_layout (layout, conf->subvolumes[i])) {
+ dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0,
+ conf->subvolumes[i]);
+ if (--count == 0)
+ break;
+ }
}
+
+ dht_layout_unref (this, dummy);
+out:
return 0;
}
@@ -222,9 +247,12 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
int missing_xattr = 0;
int i = 0;
xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *dummy = NULL;
local = frame->local;
this = frame->this;
+ conf = this->private;
for (i = 0; i < layout->cnt; i++) {
if (layout->list[i].err != -1 || !layout->list[i].stop) {
@@ -253,11 +281,23 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
if (layout->list[i].err != -1 || !layout->list[i].stop)
continue;
- dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i);
+ dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL);
if (--missing_xattr == 0)
break;
}
+ dummy = dht_layout_new (this, 1);
+ if (!dummy)
+ goto out;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (_gf_false ==
+ dht_is_subvol_in_layout (layout, conf->subvolumes[i])) {
+ dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0,
+ conf->subvolumes[i]);
+ }
+ }
+ dht_layout_unref (this, dummy);
+out:
return 0;
}
@@ -494,7 +534,7 @@ dht_selfheal_layout_alloc_start (xlator_t *this, loc_t *loc,
uint32_t hashval = 0;
int ret = 0;
- ret = dht_hash_compute (layout->type, loc->path, &hashval);
+ ret = dht_hash_compute (this, layout->type, loc->path, &hashval);
if (ret == 0) {
start = (hashval % layout->cnt);
}
@@ -517,7 +557,7 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
for (j = 0; j < conf->subvolume_cnt; j++) {
if (conf->decommissioned_bricks[j] &&
conf->decommissioned_bricks[j] == layout->list[i].xlator) {
- layout->list[i].err = -EINVAL;
+ layout->list[i].err = EINVAL;
break;
}
}
@@ -525,9 +565,33 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
for (i = 0; i < layout->cnt; i++) {
err = layout->list[i].err;
- if (err == -1 || err == 0) {
- layout->list[i].err = -1;
+ if (err == -1 || err == 0 || err == ENOENT) {
+ /* Setting list[i].err = -1 is an indication for
+ dht_selfheal_layout_new_directory() to assign
+ a range. We set it to -1 based on any one of
+ the three criteria:
+
+ - err == -1 already, which means directory
+ existed but layout was not set on it.
+
+ - err == 0, which means directory exists and
+ has an old layout piece which will be
+ overwritten now.
+
+ - err == ENOENT, which means directory does
+ not exist (possibly racing with mkdir or
+ finishing half done mkdir). The missing
+ directory will be attempted to be recreated.
+
+ It is important to note that it is safe
+ to race with mkdir() as self-heal and
+ mkdir are idempotent operations. Both will
+ strive to set the directory and layouts to
+ the same final state.
+ */
count++;
+ if (!err)
+ layout->list[i].err = -1;
}
}
@@ -542,35 +606,118 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
}
}
- count = ((layout->spread_cnt) ? layout->spread_cnt :
- ((count) ? count : 1));
+ /* if layout->spread_cnt is set, check if it is <= available
+ * subvolumes (down brick and decommissioned bricks are considered
+ * un-availbale). Else return count (available up bricks) */
+ count = ((layout->spread_cnt &&
+ (layout->spread_cnt <= count)) ?
+ layout->spread_cnt : ((count) ? count : 1));
return count;
}
+void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
+ dht_layout_t *new_layout);
+
+void dht_layout_entry_swap (dht_layout_t *layout, int i, int j);
+void dht_layout_range_swap (dht_layout_t *layout, int i, int j);
+
+/*
+ * It's a bit icky using local variables in a macro, but it makes the rest
+ * of the code a lot clearer.
+ */
+#define OV_ENTRY(x,y) table[x*new->cnt+y]
+
+void
+dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc,
+ dht_layout_t *new, dht_layout_t *old)
+{
+ int i = 0;
+ int j = 0;
+ uint32_t curr_overlap = 0;
+ uint32_t max_overlap = 0;
+ int max_overlap_idx = -1;
+ uint32_t overlap = 0;
+ uint32_t *table = NULL;
+
+ dht_layout_sort_volname (old);
+ /* Now both old_layout->list[] and new_layout->list[]
+ are match the same xlators/subvolumes. i.e,
+ old_layout->[i] and new_layout->[i] are referring
+ to the same subvolumes
+ */
+
+ /* Build a table of overlaps between new[i] and old[j]. */
+ table = alloca(sizeof(overlap)*old->cnt*new->cnt);
+ if (!table) {
+ return;
+ }
+ memset(table,0,sizeof(overlap)*old->cnt*new->cnt);
+ for (i = 0; i < new->cnt; ++i) {
+ for (j = 0; j < old->cnt; ++j) {
+ OV_ENTRY(i,j) = dht_overlap_calc(old,j,new,i);
+ }
+ }
+
+ for (i = 0; i < new->cnt; i++) {
+ if (new->list[i].err > 0) {
+ /* Subvol might be marked for decommission
+ with EINVAL, or some other serious error
+ marked with positive errno.
+ */
+ continue;
+ }
+
+ max_overlap = 0;
+ max_overlap_idx = i;
+ for (j = (i + 1); j < new->cnt; ++j) {
+ if (new->list[j].err > 0) {
+ /* Subvol might be marked for decommission
+ with EINVAL, or some other serious error
+ marked with positive errno.
+ */
+ continue;
+ }
+ /* Calculate the overlap now. */
+ curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j);
+ /* Calculate the overlap after the proposed swap. */
+ overlap = OV_ENTRY(i,j) + OV_ENTRY(j,i);
+ /* Are we better than status quo? */
+ if (overlap > curr_overlap) {
+ overlap -= curr_overlap;
+ /* Are we better than the previous choice? */
+ if (overlap > max_overlap) {
+ max_overlap = overlap;
+ max_overlap_idx = j;
+ }
+ }
+ }
+
+ if (max_overlap_idx != i) {
+ dht_layout_range_swap (new, i, max_overlap_idx);
+ /* Need to swap the table values too. */
+ for (j = 0; j < old->cnt; ++j) {
+ overlap = OV_ENTRY(i,j);
+ OV_ENTRY(i,j) = OV_ENTRY(max_overlap_idx,j);
+ OV_ENTRY(max_overlap_idx,j) = overlap;
+ }
+ }
+ }
+}
+
+
dht_layout_t *
dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,
dht_layout_t *layout)
{
- uint32_t chunk = 0;
- uint32_t start = 0;
- uint32_t stop = 0;
- uint32_t overlap = 0;
- uint32_t max_overlap = 0;
- uint32_t chunk_begin = 0;
- int count = 0;
- int cnt = 0;
int i = 0;
- int j = 0;
- int k = 0;
- int loop_cnt = 0;
- int start_subvol = 0;
- int *fix_array = NULL;
xlator_t *this = NULL;
dht_layout_t *new_layout = NULL;
dht_conf_t *priv = NULL;
dht_local_t *local = NULL;
+ uint32_t subvol_down = 0;
+ int ret = 0;
this = frame->this;
priv = this->private;
@@ -582,114 +729,37 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,
goto done;
}
- count = cnt = dht_get_layout_count (this, layout, 0);
-
- chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1);
-
- start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout);
-
- fix_array = GF_CALLOC (sizeof (int), layout->cnt, gf_common_mt_char);
- if (!fix_array) {
- /* No fix, use the existing layout itself */
- goto done;
- }
-
new_layout = dht_layout_new (this, priv->subvolume_cnt);
if (!new_layout)
goto done;
- for (i = 0; i < new_layout->cnt; i++) {
- /* TODO: fix this in layout_alloc() itself */
- new_layout->list[i].err = -ENOENT;
- if (i < layout->cnt)
- new_layout->list[i].xlator = layout->list[i].xlator;
- }
-
- /* Check if there are any overlap in layout, and give the proper fix */
- for (i = 0; i < layout->cnt; i++) {
- /* No need to fix if 'err' is not '-1' */
- if (layout->list[i].err != -1)
- continue;
-
- /* If already existing layout is having no range, skip it */
- start = layout->list[i].start;
- stop = layout->list[i].stop;
- if ((stop - start) == 0)
- continue;
-
- max_overlap = 0;
-
- /* 'j' is used as starting point of each chunk */
- for (j = 1; j <= count; j++) {
- /* if chunk is already used, don't use it again */
- for (k = 0; k < i; k++)
- if (j == fix_array[k])
- break;
- if (k < i)
- continue;
-
- overlap = dht_find_overlap (i, (j-1), start, stop, chunk);
- if (max_overlap < overlap) {
- max_overlap = overlap;
- fix_array[i] = j;
- }
- }
-
- /* If we have any overlap, then use that itself as new
- layout for the subvolume */
- if (fix_array[i]) {
- chunk_begin = chunk * (fix_array[i] - 1);
- new_layout->list[i].err = -1;
- DHT_SET_LAYOUT_RANGE (new_layout, i, chunk_begin,
- chunk, cnt, loc->path);
- /* make sure to give (max - 1) as 'stop' range,
- if it is last chunk */
- if (fix_array[i] == count)
- new_layout->list[i].stop = 0xffffffff;
- if (--cnt == 0)
- goto done;
+ /* If a subvolume is down, do not re-write the layout. */
+ ret = dht_layout_anomalies (this, loc, layout, NULL, NULL, NULL,
+ &subvol_down, NULL, NULL);
- }
+ if (subvol_down || (ret == -1)) {
+ gf_log (this->name, GF_LOG_WARNING, "%u subvolume(s) are down"
+ ". Skipping fix layout.", subvol_down);
+ GF_FREE (new_layout);
+ return NULL;
}
- /* Now, look for layouts which are not having any overlaps
- and give it a fix */
- for (loop_cnt = 0, i = start_subvol; loop_cnt < new_layout->cnt;
- i++, loop_cnt++) {
- if (i == new_layout->cnt)
- i = 0;
-
- /* If 'fix_array[i]' is set, the layout is already fixed. */
- if (fix_array[i])
- continue;
+ for (i = 0; i < new_layout->cnt; i++) {
+ if (layout->list[i].err != ENOSPC)
+ new_layout->list[i].err = layout->list[i].err;
+ else
+ new_layout->list[i].err = -1;
- if (layout->list[i].err != -1) {
- new_layout->list[i].err = layout->list[i].err;
- continue;
- }
+ new_layout->list[i].xlator = layout->list[i].xlator;
+ }
- for (k = 1; k <= count; k++) {
- for (j = 0; j < new_layout->cnt; j++) {
- if (k == fix_array[j])
- break;
- }
- /* Didn't find any of the list begining with 'k' */
- if (j == new_layout->cnt)
- break;
- }
+ /* First give it a layout as though it is a new directory. This
+ ensures rotation to kick in */
+ dht_layout_sort_volname (new_layout);
+ dht_selfheal_layout_new_directory (frame, loc, new_layout);
- fix_array[i] = k;
- chunk_begin = (k - 1) * chunk;
- new_layout->list[i].err = -1;
- DHT_SET_LAYOUT_RANGE (new_layout, i, chunk_begin, chunk, cnt,
- loc->path);
- /* make sure to give (max - 1) as 'stop' range,
- if it is last chunk */
- if (k == count)
- new_layout->list[i].stop = 0xffffffff;
- if (--cnt == 0)
- goto done;
- }
+ /* Now selectively re-assign ranges only when it helps */
+ dht_selfheal_layout_maximize_overlap (frame, loc, new_layout, layout);
done:
if (new_layout) {
@@ -703,9 +773,6 @@ done:
local->layout = new_layout;
}
- if (fix_array)
- GF_FREE (fix_array);
-
return local->layout;
}
@@ -730,9 +797,11 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout);
+ /* clear out the range, as we are re-computing here */
+ DHT_RESET_LAYOUT_RANGE (layout);
for (i = start_subvol; i < layout->cnt; i++) {
err = layout->list[i].err;
- if (err == -1) {
+ if (err == -1 || err == ENOENT) {
DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
cnt, loc->path);
if (--cnt == 0) {
@@ -745,7 +814,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
for (i = 0; i < start_subvol; i++) {
err = layout->list[i].err;
- if (err == -1) {
+ if (err == -1 || err == ENOENT) {
DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
cnt, loc->path);
if (--cnt == 0) {
@@ -764,35 +833,17 @@ int
dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc,
dht_layout_t *layout)
{
- dht_conf_t *conf = NULL;
- xlator_t *this = NULL;
dht_local_t *local = NULL;
- int missing = -1;
- int down = -1;
- int holes = -1;
+ uint32_t holes = 0;
int ret = -1;
int i = -1;
- int overlaps = -1;
+ uint32_t overlaps = 0;
- this = frame->this;
- conf = this->private;
local = frame->local;
- missing = local->selfheal.missing;
- down = local->selfheal.down;
holes = local->selfheal.hole_cnt;
overlaps = local->selfheal.overlaps_cnt;
- if ((missing + down) == conf->subvolume_cnt) {
- dht_selfheal_layout_new_directory (frame, loc, layout);
- ret = 0;
- }
-
- if (holes <= down) {
- /* the down subvol might fill up the holes */
- ret = 0;
- }
-
if (holes || overlaps) {
dht_selfheal_layout_new_directory (frame, loc, layout);
ret = 0;
@@ -844,6 +895,9 @@ dht_fix_directory_layout (call_frame_t *frame,
/* No layout sorting required here */
tmp_layout = dht_fix_layout_of_directory (frame, &local->loc, layout);
+ if (!tmp_layout) {
+ return -1;
+ }
dht_fix_dir_xattr (frame, &local->loc, tmp_layout);
return 0;
@@ -866,9 +920,8 @@ dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
dht_layout_anomalies (this, loc, layout,
&local->selfheal.hole_cnt,
&local->selfheal.overlaps_cnt,
- &local->selfheal.missing,
- &local->selfheal.down,
- &local->selfheal.misc);
+ NULL, &local->selfheal.down,
+ &local->selfheal.misc, NULL);
down = local->selfheal.down;
misc = local->selfheal.misc;
@@ -927,3 +980,50 @@ dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
return ret;
}
+
+int
+dht_dir_attr_heal (void *data)
+{
+ call_frame_t *frame = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ int call_cnt = 0;
+ int ret = -1;
+ int i = 0;
+
+ GF_VALIDATE_OR_GOTO ("dht", data, out);
+
+ frame = data;
+ local = frame->local;
+ this = frame->this;
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", local, out);
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO ("dht", conf, out);
+
+ call_cnt = conf->subvolume_cnt;
+
+ for (i = 0; i < call_cnt; i++) {
+ subvol = conf->subvolumes[i];
+ if (!subvol || (subvol == dht_first_up_subvol (this)))
+ continue;
+ ret = syncop_setattr (subvol, &local->loc, &local->stbuf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
+ NULL, NULL);
+ if (ret)
+ gf_log ("dht", GF_LOG_ERROR, "Failed to set uid/gid on"
+ " %s on %s subvol (%s)", local->loc.path,
+ subvol->name, strerror (errno));
+ }
+out:
+ return 0;
+}
+
+int
+dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data)
+{
+ DHT_STACK_DESTROY (sync_frame);
+ return 0;
+}