From b350bcd6a3db2e92d1baa47a5ec02efd09f76f16 Mon Sep 17 00:00:00 2001 From: Susant Palai Date: Mon, 17 Apr 2017 13:00:54 +0530 Subject: cluster/dht: Skip file migration if the subvol that meets min-free-disk criteria happens to be the same subvol containing data-file Rebalance need to figure out a new subvol in case the hashed subvol does not have enough space. In the process of figuring out the new subvol, we need to ignore the source subvol, otherwise it will lead to data loss. Test: Manual Ran the following sizeof /tmp/1: 1.5GB sizeof /brick/1: 16GB sizeof /tmp/2: 1.5GB glusterd; gluster v create test1 vm1:/brick/1 vm1:/tmp/1; gluster v start test1; mount -t glusterfs vm1:test1 /mnt; for i in {1..2000} do dd if=/dev/zero of=/mnt/file$i bs=1KB count=1 &> /dev/null; done gluster v add-brick test1 vm1:/tmp/2 gluster v set test1 min-free-disk 12GB gluster v remove-brick test1 vm1:/tmp/1 star file count and data were intact. Change-Id: Ib8fc8467a3d48a7c12958824c4f0b88e160b86c1 BUG: 1441508 Signed-off-by: Susant Palai Reviewed-on: https://review.gluster.org/17064 Smoke: Gluster Build System NetBSD-regression: NetBSD Build System CentOS-regression: Gluster Build System Reviewed-by: Raghavendra G --- xlators/cluster/dht/src/dht-common.h | 2 +- xlators/cluster/dht/src/dht-diskusage.c | 19 ++++--- xlators/cluster/dht/src/dht-rebalance.c | 96 ++++++++++++++++++++++++++------- 3 files changed, 92 insertions(+), 25 deletions(-) (limited to 'xlators') diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 4555152db54..0e082e35c57 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -1153,7 +1153,7 @@ dht_dir_has_layout (dict_t *xattr, char *name); gf_boolean_t dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator); xlator_t * -dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, +dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, xlator_t *ignore, dht_layout_t *layout, uint64_t filesize); xlator_t * dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 13698a9616d..05592154e30 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -315,7 +315,7 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, LOCK (&conf->subvolume_lock); { - avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, + avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, NULL, layout, 0); if(!avail_subvol) { @@ -340,8 +340,8 @@ out: } static inline -int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this, - dht_layout_t *layout) +int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this, xlator_t *ignore, + dht_layout_t *layout) { int ret = -1; int i = 0; @@ -349,6 +349,13 @@ int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this, if (!this || !layout) goto out; + /* this check is meant for rebalance process. The source of the file + * should be ignored for space check */ + if (this == ignore) { + goto out; + } + + /* check if subvol has layout errors, before selecting it */ for (i = 0; i < layout->cnt; i++) { if (!strcmp (layout->list[i].xlator->name, this->name) && @@ -376,7 +383,7 @@ out: /*Get subvolume which has both space and inodes more than the min criteria*/ xlator_t * -dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, +dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, xlator_t *ignore, dht_layout_t *layout, uint64_t filesize) { int i = 0; @@ -398,7 +405,7 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, /* check if subvol has layout errors and also it is not a * decommissioned brick, before selecting it */ ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i], - layout); + ignore, layout); if (ignore_subvol) continue; @@ -463,7 +470,7 @@ dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, /* check if subvol has layout errors and also it is not a * decommissioned brick, before selecting it*/ - ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i], + ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i], NULL, layout); if (ignore_subvol) continue; diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index d963cfb2b8f..c1174225138 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -719,27 +719,30 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc loc->path, to->name, strerror (-ret)); */ + ret = syncop_fsetattr (to, fd, stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL, NULL, NULL); + if (ret < 0) + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "chown failed for %s on %s (%s)", + loc->path, to->name, strerror (-ret)); + /* Fallocate does not work for size 0, hence the check. Anyway we don't * need to care about min-free-disk for 0 byte size file */ if (stbuf->ia_size > 0) { ret = syncop_fallocate (to, fd, 0, 0, stbuf->ia_size, NULL, NULL); - if (ret < 0) + if (ret < 0) { gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, "fallocate failed for %s on %s (%s)", loc->path, to->name, strerror (-ret)); + ret = -1; + goto out; + } } - ret = syncop_fsetattr (to, fd, stbuf, - (GF_SET_ATTR_UID | GF_SET_ATTR_GID), - NULL, NULL, NULL, NULL); - if (ret < 0) - gf_msg (this->name, GF_LOG_ERROR, 0, - DHT_MSG_MIGRATE_FILE_FAILED, - "chown failed for %s on %s (%s)", - loc->path, to->name, strerror (-ret)); - /* success */ ret = 0; @@ -761,7 +764,8 @@ out: static int __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf, int flag, dht_conf_t *conf, - gf_boolean_t *target_changed, xlator_t **new_subvol) + gf_boolean_t *target_changed, xlator_t **new_subvol, + gf_boolean_t *ignore_failure) { struct statvfs src_statfs = {0,}; struct statvfs dst_statfs = {0,}; @@ -773,6 +777,7 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, uint64_t dst_statfs_blocks = 1; double post_availspace = 0; double post_percent = 0; + int i = 0; this = THIS; @@ -897,13 +902,27 @@ find_new_subvol: goto out; } - *new_subvol = dht_subvol_with_free_space_inodes (this, to, - layout, stbuf->ia_size); - if (!(*new_subvol)) { + *new_subvol = dht_subvol_with_free_space_inodes (this, to, from, layout, + stbuf->ia_size); + if ((!(*new_subvol)) || (*new_subvol == from)) { gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_INSUFF_SPACE, "Could not find any subvol" - " with space accomodating the file. Consider adding " - "bricks"); + " with space accomodating the file - %s. Consider adding " + "bricks", loc->path); + + /* For remove-brick case if the source is not one of the + * removed-brick, do not mark the error as failure */ + if (conf->decommission_subvols_cnt) { + *ignore_failure = _gf_true; + for (i = 0; i < conf->decommission_subvols_cnt; i++) { + if (conf->decommissioned_bricks[i] == from) { + *ignore_failure = _gf_false; + break; + } + } + } else { + *ignore_failure = _gf_false; + } *target_changed = _gf_false; ret = -1; @@ -1382,6 +1401,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, gf_boolean_t target_changed = _gf_false; xlator_t *new_target = NULL; xlator_t *old_target = NULL; + fd_t *linkto_fd = NULL; + gf_boolean_t ignore_failure = _gf_false; defrag = conf->defrag; if (!defrag) @@ -1500,7 +1521,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, clean_dst = _gf_true; ret = __dht_check_free_space (to, from, loc, &stbuf, flag, conf, - &target_changed, &new_target); + &target_changed, &new_target, &ignore_failure); if (target_changed) { /* Can't handle for hardlinks. Marking this as failure */ if (flag == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS || stbuf.ia_nlink > 1) { @@ -1544,6 +1565,9 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, } if (ret) { + if (ignore_failure) + ret = 0; + goto out; } @@ -1791,13 +1815,47 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, } ret = syncop_setxattr (old_target, loc, dict, 0, NULL, NULL); - if (ret) { + if (ret && -ret != ESTALE && -ret != ENOENT) { gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, "failed to set xattr on %s in %s (%s)", loc->path, old_target->name, strerror (-ret)); ret = -1; goto out; + } else if (-ret == ESTALE || -ret == ENOENT) { + /* The failure ESTALE indicates that the linkto + * file on the hashed subvol might have been deleted. + * In this case will create a linkto file with new target + * as linkto xattr value*/ + linkto_fd = fd_create (loc->inode, DHT_REBALANCE_PID); + if (!linkto_fd) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: fd create failed (%s)", + loc->path, strerror (errno)); + ret = -1; + goto out; + } + ret = syncop_create (old_target, loc, O_RDWR, + DHT_LINKFILE_MODE, linkto_fd, + NULL, dict, NULL); + if (ret != 0 && -ret != EEXIST && -ret != ESTALE) { + ret = -1; + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "failed to create linkto file on %s in %s (%s)", + loc->path, old_target->name, strerror (-ret)); + goto out; + } else if (ret == 0) { + ret = syncop_fsetattr (old_target, linkto_fd, &stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL, NULL, NULL); + if (ret < 0) + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "chown failed for %s on %s (%s)", + loc->path, old_target->name, strerror (-ret)); + } } } } @@ -2043,6 +2101,8 @@ out: syncop_close (dst_fd); if (src_fd) syncop_close (src_fd); + if (linkto_fd) + syncop_close (linkto_fd); loc_wipe (&tmp_loc); -- cgit