diff options
Diffstat (limited to 'xlators/cluster/dht/src/dht-selfheal.c')
| -rw-r--r-- | xlators/cluster/dht/src/dht-selfheal.c | 466 |
1 files changed, 286 insertions, 180 deletions
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index 68d9416fa..3fe96b1c7 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -17,6 +17,7 @@ #include "glusterfs.h" #include "xlator.h" #include "dht-common.h" +#include "glusterfs-acl.h" #define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \ layout->list[i].start = srt; \ @@ -28,42 +29,40 @@ layout->list[i].xlator->name, path); \ } while (0) -static inline uint32_t -dht_find_overlap (int idx, int cnk_idx, uint32_t start, uint32_t stop, - uint32_t chunk_size) -{ - uint32_t overlap = 0; - uint32_t chunk_begin = 0; +#define DHT_RESET_LAYOUT_RANGE(layout) do { \ + int cnt = 0; \ + for (cnt = 0; cnt < layout->cnt; cnt++ ) { \ + layout->list[cnt].start = 0; \ + layout->list[cnt].stop = 0; \ + } \ + } while (0) - chunk_begin = cnk_idx * chunk_size; +static uint32_t +dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n) +{ + if (o >= old->cnt || n >= new->cnt) + return 0; - /* There is no chance of overlap */ - if ((chunk_begin > stop) || - ((chunk_begin + chunk_size) < start)) - goto out; + if (old->list[o].err > 0 || new->list[n].err > 0) + return 0; - if ((chunk_begin <= start) && - ((chunk_begin + chunk_size) <= stop)) { - overlap = ((chunk_begin + chunk_size) - start); - goto out; + if (old->list[o].start == old->list[o].stop) { + return 0; } - if ((chunk_begin <= start) && - ((chunk_begin + chunk_size) >= stop)) { - overlap = (stop - start); - goto out; + if (new->list[n].start == new->list[n].stop) { + return 0; } - if ((chunk_begin < stop) && - ((chunk_begin + chunk_size) >= stop)) { - overlap = (stop - chunk_begin); - goto out; - } + if ((old->list[o].start > new->list[n].stop) || + (old->list[o].stop < new->list[n].start)) + return 0; -out: - return overlap; + return min (old->list[o].stop, new->list[n].stop) - + max (old->list[o].start, new->list[n].start) + 1; } + int dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) { @@ -118,7 +117,8 @@ dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout, int i) + dht_layout_t *layout, int i, + xlator_t *req_subvol) { xlator_t *subvol = NULL; dict_t *xattr = NULL; @@ -126,16 +126,22 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, xlator_t *this = NULL; int32_t *disk_layout = NULL; dht_local_t *local = NULL; - + dht_conf_t *conf = NULL; local = frame->local; - subvol = layout->list[i].xlator; + if (req_subvol) + subvol = req_subvol; + else + subvol = layout->list[i].xlator; this = frame->this; GF_VALIDATE_OR_GOTO ("", this, err); GF_VALIDATE_OR_GOTO (this->name, layout, err); GF_VALIDATE_OR_GOTO (this->name, local, err); GF_VALIDATE_OR_GOTO (this->name, subvol, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; xattr = get_new_dict (); if (!xattr) { @@ -150,8 +156,7 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, goto err; } - ret = dict_set_bin (xattr, "trusted.glusterfs.dht", - disk_layout, 4 * 4); + ret = dict_set_bin (xattr, conf->xattr_name, disk_layout, 4 * 4); if (ret == -1) { gf_log (this->name, GF_LOG_WARNING, "%s: (subvol %s) failed to set xattr dictionary", @@ -182,8 +187,7 @@ err: if (xattr) dict_destroy (xattr); - if (disk_layout) - GF_FREE (disk_layout); + GF_FREE (disk_layout); dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this, -1, ENOMEM, NULL); @@ -197,21 +201,42 @@ dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) int i = 0; int count = 0; xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; local = frame->local; this = frame->this; + conf = this->private; gf_log (this->name, GF_LOG_DEBUG, "writing the new range for all subvolumes"); - local->call_cnt = count = layout->cnt; + local->call_cnt = count = conf->subvolume_cnt; for (i = 0; i < layout->cnt; i++) { - dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i); + dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); if (--count == 0) - break; + goto out; + } + /* if we are here, subvolcount > layout_count. subvols-per-directory + * option might be set here. We need to clear out layout from the + * non-participating subvolumes, else it will result in overlaps */ + dummy = dht_layout_new (this, 1); + if (!dummy) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == + dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, + conf->subvolumes[i]); + if (--count == 0) + break; + } } + + dht_layout_unref (this, dummy); +out: return 0; } @@ -222,9 +247,12 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) int missing_xattr = 0; int i = 0; xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; local = frame->local; this = frame->this; + conf = this->private; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err != -1 || !layout->list[i].stop) { @@ -253,11 +281,23 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) if (layout->list[i].err != -1 || !layout->list[i].stop) continue; - dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i); + dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); if (--missing_xattr == 0) break; } + dummy = dht_layout_new (this, 1); + if (!dummy) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == + dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, + conf->subvolumes[i]); + } + } + dht_layout_unref (this, dummy); +out: return 0; } @@ -494,7 +534,7 @@ dht_selfheal_layout_alloc_start (xlator_t *this, loc_t *loc, uint32_t hashval = 0; int ret = 0; - ret = dht_hash_compute (layout->type, loc->path, &hashval); + ret = dht_hash_compute (this, layout->type, loc->path, &hashval); if (ret == 0) { start = (hashval % layout->cnt); } @@ -517,7 +557,7 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) for (j = 0; j < conf->subvolume_cnt; j++) { if (conf->decommissioned_bricks[j] && conf->decommissioned_bricks[j] == layout->list[i].xlator) { - layout->list[i].err = -EINVAL; + layout->list[i].err = EINVAL; break; } } @@ -525,9 +565,33 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) for (i = 0; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1 || err == 0) { - layout->list[i].err = -1; + if (err == -1 || err == 0 || err == ENOENT) { + /* Setting list[i].err = -1 is an indication for + dht_selfheal_layout_new_directory() to assign + a range. We set it to -1 based on any one of + the three criteria: + + - err == -1 already, which means directory + existed but layout was not set on it. + + - err == 0, which means directory exists and + has an old layout piece which will be + overwritten now. + + - err == ENOENT, which means directory does + not exist (possibly racing with mkdir or + finishing half done mkdir). The missing + directory will be attempted to be recreated. + + It is important to note that it is safe + to race with mkdir() as self-heal and + mkdir are idempotent operations. Both will + strive to set the directory and layouts to + the same final state. + */ count++; + if (!err) + layout->list[i].err = -1; } } @@ -542,49 +606,126 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) } } - count = ((layout->spread_cnt) ? layout->spread_cnt : - ((count) ? count : 1)); + /* if layout->spread_cnt is set, check if it is <= available + * subvolumes (down brick and decommissioned bricks are considered + * un-availbale). Else return count (available up bricks) */ + count = ((layout->spread_cnt && + (layout->spread_cnt <= count)) ? + layout->spread_cnt : ((count) ? count : 1)); return count; } +void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, + dht_layout_t *new_layout); + +void dht_layout_entry_swap (dht_layout_t *layout, int i, int j); +void dht_layout_range_swap (dht_layout_t *layout, int i, int j); + +/* + * It's a bit icky using local variables in a macro, but it makes the rest + * of the code a lot clearer. + */ +#define OV_ENTRY(x,y) table[x*new->cnt+y] + +void +dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc, + dht_layout_t *new, dht_layout_t *old) +{ + int i = 0; + int j = 0; + uint32_t curr_overlap = 0; + uint32_t max_overlap = 0; + int max_overlap_idx = -1; + uint32_t overlap = 0; + uint32_t *table = NULL; + + dht_layout_sort_volname (old); + /* Now both old_layout->list[] and new_layout->list[] + are match the same xlators/subvolumes. i.e, + old_layout->[i] and new_layout->[i] are referring + to the same subvolumes + */ + + /* Build a table of overlaps between new[i] and old[j]. */ + table = alloca(sizeof(overlap)*old->cnt*new->cnt); + if (!table) { + return; + } + memset(table,0,sizeof(overlap)*old->cnt*new->cnt); + for (i = 0; i < new->cnt; ++i) { + for (j = 0; j < old->cnt; ++j) { + OV_ENTRY(i,j) = dht_overlap_calc(old,j,new,i); + } + } + + for (i = 0; i < new->cnt; i++) { + if (new->list[i].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. + */ + continue; + } + + max_overlap = 0; + max_overlap_idx = i; + for (j = (i + 1); j < new->cnt; ++j) { + if (new->list[j].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. + */ + continue; + } + /* Calculate the overlap now. */ + curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j); + /* Calculate the overlap after the proposed swap. */ + overlap = OV_ENTRY(i,j) + OV_ENTRY(j,i); + /* Are we better than status quo? */ + if (overlap > curr_overlap) { + overlap -= curr_overlap; + /* Are we better than the previous choice? */ + if (overlap > max_overlap) { + max_overlap = overlap; + max_overlap_idx = j; + } + } + } + + if (max_overlap_idx != i) { + dht_layout_range_swap (new, i, max_overlap_idx); + /* Need to swap the table values too. */ + for (j = 0; j < old->cnt; ++j) { + overlap = OV_ENTRY(i,j); + OV_ENTRY(i,j) = OV_ENTRY(max_overlap_idx,j); + OV_ENTRY(max_overlap_idx,j) = overlap; + } + } + } +} + + dht_layout_t * dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) { - uint32_t chunk = 0; - uint32_t start = 0; - uint32_t stop = 0; - uint32_t overlap = 0; - uint32_t max_overlap = 0; - uint32_t chunk_begin = 0; - int count = 0; - int cnt = 0; int i = 0; - int j = 0; - int k = 0; - int loop_cnt = 0; - int start_subvol = 0; - int *fix_array = NULL; xlator_t *this = NULL; dht_layout_t *new_layout = NULL; dht_conf_t *priv = NULL; dht_local_t *local = NULL; + uint32_t subvol_down = 0; + int ret = 0; this = frame->this; priv = this->private; local = frame->local; - count = cnt = dht_get_layout_count (this, layout, 0); - - chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1); - - start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout); - - fix_array = GF_CALLOC (sizeof (int), layout->cnt, gf_common_mt_char); - if (!fix_array) { - /* No fix, use the existing layout itself */ + if (layout->type == DHT_HASH_TYPE_DM_USER) { + gf_log (THIS->name, GF_LOG_DEBUG, "leaving %s alone", + loc->path); goto done; } @@ -592,98 +733,33 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc, if (!new_layout) goto done; - for (i = 0; i < new_layout->cnt; i++) { - /* TODO: fix this in layout_alloc() itself */ - new_layout->list[i].err = -ENOENT; - if (i < layout->cnt) - new_layout->list[i].xlator = layout->list[i].xlator; - } - - /* Check if there are any overlap in layout, and give the proper fix */ - for (i = 0; i < layout->cnt; i++) { - /* No need to fix if 'err' is not '-1' */ - if (layout->list[i].err != -1) - continue; - - /* If already existing layout is having no range, skip it */ - start = layout->list[i].start; - stop = layout->list[i].stop; - if ((stop - start) == 0) - continue; + /* If a subvolume is down, do not re-write the layout. */ + ret = dht_layout_anomalies (this, loc, layout, NULL, NULL, NULL, + &subvol_down, NULL, NULL); - max_overlap = 0; - - /* 'j' is used as starting point of each chunk */ - for (j = 1; j <= count; j++) { - /* if chunk is already used, don't use it again */ - for (k = 0; k < i; k++) - if (j == fix_array[k]) - break; - if (k < i) - continue; - - overlap = dht_find_overlap (i, (j-1), start, stop, chunk); - if (max_overlap < overlap) { - max_overlap = overlap; - fix_array[i] = j; - } - } - - /* If we have any overlap, then use that itself as new - layout for the subvolume */ - if (fix_array[i]) { - chunk_begin = chunk * (fix_array[i] - 1); - new_layout->list[i].err = -1; - DHT_SET_LAYOUT_RANGE (new_layout, i, chunk_begin, - chunk, cnt, loc->path); - /* make sure to give (max - 1) as 'stop' range, - if it is last chunk */ - if (fix_array[i] == count) - new_layout->list[i].stop = 0xffffffff; - if (--cnt == 0) - goto done; - - } + if (subvol_down || (ret == -1)) { + gf_log (this->name, GF_LOG_WARNING, "%u subvolume(s) are down" + ". Skipping fix layout.", subvol_down); + GF_FREE (new_layout); + return NULL; } - /* Now, look for layouts which are not having any overlaps - and give it a fix */ - for (loop_cnt = 0, i = start_subvol; loop_cnt < new_layout->cnt; - i++, loop_cnt++) { - if (i == new_layout->cnt) - i = 0; - - /* If 'fix_array[i]' is set, the layout is already fixed. */ - if (fix_array[i]) - continue; + for (i = 0; i < new_layout->cnt; i++) { + if (layout->list[i].err != ENOSPC) + new_layout->list[i].err = layout->list[i].err; + else + new_layout->list[i].err = -1; - if (layout->list[i].err != -1) { - new_layout->list[i].err = layout->list[i].err; - continue; - } + new_layout->list[i].xlator = layout->list[i].xlator; + } - for (k = 1; k <= count; k++) { - for (j = 0; j < new_layout->cnt; j++) { - if (k == fix_array[j]) - break; - } - /* Didn't find any of the list begining with 'k' */ - if (j == new_layout->cnt) - break; - } + /* First give it a layout as though it is a new directory. This + ensures rotation to kick in */ + dht_layout_sort_volname (new_layout); + dht_selfheal_layout_new_directory (frame, loc, new_layout); - fix_array[i] = k; - chunk_begin = (k - 1) * chunk; - new_layout->list[i].err = -1; - DHT_SET_LAYOUT_RANGE (new_layout, i, chunk_begin, chunk, cnt, - loc->path); - /* make sure to give (max - 1) as 'stop' range, - if it is last chunk */ - if (k == count) - new_layout->list[i].stop = 0xffffffff; - if (--cnt == 0) - goto done; - } + /* Now selectively re-assign ranges only when it helps */ + dht_selfheal_layout_maximize_overlap (frame, loc, new_layout, layout); done: if (new_layout) { @@ -697,10 +773,7 @@ done: local->layout = new_layout; } - if (fix_array) - GF_FREE (fix_array); - - return new_layout; + return local->layout; } @@ -724,9 +797,11 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout); + /* clear out the range, as we are re-computing here */ + DHT_RESET_LAYOUT_RANGE (layout); for (i = start_subvol; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1) { + if (err == -1 || err == ENOENT) { DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, cnt, loc->path); if (--cnt == 0) { @@ -739,7 +814,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, for (i = 0; i < start_subvol; i++) { err = layout->list[i].err; - if (err == -1) { + if (err == -1 || err == ENOENT) { DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, cnt, loc->path); if (--cnt == 0) { @@ -758,35 +833,17 @@ int dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) { - dht_conf_t *conf = NULL; - xlator_t *this = NULL; dht_local_t *local = NULL; - int missing = -1; - int down = -1; - int holes = -1; + uint32_t holes = 0; int ret = -1; int i = -1; - int overlaps = -1; + uint32_t overlaps = 0; - this = frame->this; - conf = this->private; local = frame->local; - missing = local->selfheal.missing; - down = local->selfheal.down; holes = local->selfheal.hole_cnt; overlaps = local->selfheal.overlaps_cnt; - if ((missing + down) == conf->subvolume_cnt) { - dht_selfheal_layout_new_directory (frame, loc, layout); - ret = 0; - } - - if (holes <= down) { - /* the down subvol might fill up the holes */ - ret = 0; - } - if (holes || overlaps) { dht_selfheal_layout_new_directory (frame, loc, layout); ret = 0; @@ -838,6 +895,9 @@ dht_fix_directory_layout (call_frame_t *frame, /* No layout sorting required here */ tmp_layout = dht_fix_layout_of_directory (frame, &local->loc, layout); + if (!tmp_layout) { + return -1; + } dht_fix_dir_xattr (frame, &local->loc, tmp_layout); return 0; @@ -860,9 +920,8 @@ dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, dht_layout_anomalies (this, loc, layout, &local->selfheal.hole_cnt, &local->selfheal.overlaps_cnt, - &local->selfheal.missing, - &local->selfheal.down, - &local->selfheal.misc); + NULL, &local->selfheal.down, + &local->selfheal.misc, NULL); down = local->selfheal.down; misc = local->selfheal.misc; @@ -921,3 +980,50 @@ dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, return ret; } + +int +dht_dir_attr_heal (void *data) +{ + call_frame_t *frame = NULL; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int ret = -1; + int i = 0; + + GF_VALIDATE_OR_GOTO ("dht", data, out); + + frame = data; + local = frame->local; + this = frame->this; + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", local, out); + conf = this->private; + GF_VALIDATE_OR_GOTO ("dht", conf, out); + + call_cnt = conf->subvolume_cnt; + + for (i = 0; i < call_cnt; i++) { + subvol = conf->subvolumes[i]; + if (!subvol || (subvol == dht_first_up_subvol (this))) + continue; + ret = syncop_setattr (subvol, &local->loc, &local->stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL); + if (ret) + gf_log ("dht", GF_LOG_ERROR, "Failed to set uid/gid on" + " %s on %s subvol (%s)", local->loc.path, + subvol->name, strerror (errno)); + } +out: + return 0; +} + +int +dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data) +{ + DHT_STACK_DESTROY (sync_frame); + return 0; +} |
