summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSusant Palai <spalai@redhat.com>2017-04-17 13:00:54 +0530
committerRaghavendra G <rgowdapp@redhat.com>2017-04-19 01:35:14 -0400
commitb350bcd6a3db2e92d1baa47a5ec02efd09f76f16 (patch)
treeaf5f3a473131a7978bbda2bdd74582904d0d0ab2
parente3d9018f2ddc28548c0aa18960a3a524521c9ad7 (diff)
cluster/dht: Skip file migration if the subvol that meets min-free-disk
criteria happens to be the same subvol containing data-file Rebalance need to figure out a new subvol in case the hashed subvol does not have enough space. In the process of figuring out the new subvol, we need to ignore the source subvol, otherwise it will lead to data loss. Test: Manual Ran the following sizeof /tmp/1: 1.5GB sizeof /brick/1: 16GB sizeof /tmp/2: 1.5GB <start> glusterd; gluster v create test1 vm1:/brick/1 vm1:/tmp/1; gluster v start test1; mount -t glusterfs vm1:test1 /mnt; for i in {1..2000} do dd if=/dev/zero of=/mnt/file$i bs=1KB count=1 &> /dev/null; done gluster v add-brick test1 vm1:/tmp/2 gluster v set test1 min-free-disk 12GB gluster v remove-brick test1 vm1:/tmp/1 star <end> file count and data were intact. Change-Id: Ib8fc8467a3d48a7c12958824c4f0b88e160b86c1 BUG: 1441508 Signed-off-by: Susant Palai <spalai@redhat.com> Reviewed-on: https://review.gluster.org/17064 Smoke: Gluster Build System <jenkins@build.gluster.org> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
-rw-r--r--xlators/cluster/dht/src/dht-common.h2
-rw-r--r--xlators/cluster/dht/src/dht-diskusage.c19
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c96
3 files changed, 92 insertions, 25 deletions
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 4555152db54..0e082e35c57 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -1153,7 +1153,7 @@ dht_dir_has_layout (dict_t *xattr, char *name);
gf_boolean_t
dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator);
xlator_t *
-dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol,
+dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, xlator_t *ignore,
dht_layout_t *layout, uint64_t filesize);
xlator_t *
dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 13698a9616d..05592154e30 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -315,7 +315,7 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
LOCK (&conf->subvolume_lock);
{
- avail_subvol = dht_subvol_with_free_space_inodes(this, subvol,
+ avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, NULL,
layout, 0);
if(!avail_subvol)
{
@@ -340,8 +340,8 @@ out:
}
static inline
-int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this,
- dht_layout_t *layout)
+int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this, xlator_t *ignore,
+ dht_layout_t *layout)
{
int ret = -1;
int i = 0;
@@ -349,6 +349,13 @@ int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this,
if (!this || !layout)
goto out;
+ /* this check is meant for rebalance process. The source of the file
+ * should be ignored for space check */
+ if (this == ignore) {
+ goto out;
+ }
+
+
/* check if subvol has layout errors, before selecting it */
for (i = 0; i < layout->cnt; i++) {
if (!strcmp (layout->list[i].xlator->name, this->name) &&
@@ -376,7 +383,7 @@ out:
/*Get subvolume which has both space and inodes more than the min criteria*/
xlator_t *
-dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
+dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, xlator_t *ignore,
dht_layout_t *layout, uint64_t filesize)
{
int i = 0;
@@ -398,7 +405,7 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
/* check if subvol has layout errors and also it is not a
* decommissioned brick, before selecting it */
ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i],
- layout);
+ ignore, layout);
if (ignore_subvol)
continue;
@@ -463,7 +470,7 @@ dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
/* check if subvol has layout errors and also it is not a
* decommissioned brick, before selecting it*/
- ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i],
+ ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i], NULL,
layout);
if (ignore_subvol)
continue;
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index d963cfb2b8f..c1174225138 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -719,27 +719,30 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
loc->path, to->name, strerror (-ret));
*/
+ ret = syncop_fsetattr (to, fd, stbuf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
+ NULL, NULL, NULL, NULL);
+ if (ret < 0)
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "chown failed for %s on %s (%s)",
+ loc->path, to->name, strerror (-ret));
+
/* Fallocate does not work for size 0, hence the check. Anyway we don't
* need to care about min-free-disk for 0 byte size file */
if (stbuf->ia_size > 0) {
ret = syncop_fallocate (to, fd, 0, 0, stbuf->ia_size, NULL,
NULL);
- if (ret < 0)
+ if (ret < 0) {
gf_msg (this->name, GF_LOG_ERROR, 0,
DHT_MSG_MIGRATE_FILE_FAILED,
"fallocate failed for %s on %s (%s)",
loc->path, to->name, strerror (-ret));
+ ret = -1;
+ goto out;
+ }
}
- ret = syncop_fsetattr (to, fd, stbuf,
- (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
- NULL, NULL, NULL, NULL);
- if (ret < 0)
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "chown failed for %s on %s (%s)",
- loc->path, to->name, strerror (-ret));
-
/* success */
ret = 0;
@@ -761,7 +764,8 @@ out:
static int
__dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
struct iatt *stbuf, int flag, dht_conf_t *conf,
- gf_boolean_t *target_changed, xlator_t **new_subvol)
+ gf_boolean_t *target_changed, xlator_t **new_subvol,
+ gf_boolean_t *ignore_failure)
{
struct statvfs src_statfs = {0,};
struct statvfs dst_statfs = {0,};
@@ -773,6 +777,7 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
uint64_t dst_statfs_blocks = 1;
double post_availspace = 0;
double post_percent = 0;
+ int i = 0;
this = THIS;
@@ -897,13 +902,27 @@ find_new_subvol:
goto out;
}
- *new_subvol = dht_subvol_with_free_space_inodes (this, to,
- layout, stbuf->ia_size);
- if (!(*new_subvol)) {
+ *new_subvol = dht_subvol_with_free_space_inodes (this, to, from, layout,
+ stbuf->ia_size);
+ if ((!(*new_subvol)) || (*new_subvol == from)) {
gf_msg (this->name, GF_LOG_WARNING, 0,
DHT_MSG_SUBVOL_INSUFF_SPACE, "Could not find any subvol"
- " with space accomodating the file. Consider adding "
- "bricks");
+ " with space accomodating the file - %s. Consider adding "
+ "bricks", loc->path);
+
+ /* For remove-brick case if the source is not one of the
+ * removed-brick, do not mark the error as failure */
+ if (conf->decommission_subvols_cnt) {
+ *ignore_failure = _gf_true;
+ for (i = 0; i < conf->decommission_subvols_cnt; i++) {
+ if (conf->decommissioned_bricks[i] == from) {
+ *ignore_failure = _gf_false;
+ break;
+ }
+ }
+ } else {
+ *ignore_failure = _gf_false;
+ }
*target_changed = _gf_false;
ret = -1;
@@ -1382,6 +1401,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
gf_boolean_t target_changed = _gf_false;
xlator_t *new_target = NULL;
xlator_t *old_target = NULL;
+ fd_t *linkto_fd = NULL;
+ gf_boolean_t ignore_failure = _gf_false;
defrag = conf->defrag;
if (!defrag)
@@ -1500,7 +1521,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
clean_dst = _gf_true;
ret = __dht_check_free_space (to, from, loc, &stbuf, flag, conf,
- &target_changed, &new_target);
+ &target_changed, &new_target, &ignore_failure);
if (target_changed) {
/* Can't handle for hardlinks. Marking this as failure */
if (flag == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS || stbuf.ia_nlink > 1) {
@@ -1544,6 +1565,9 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
}
if (ret) {
+ if (ignore_failure)
+ ret = 0;
+
goto out;
}
@@ -1791,13 +1815,47 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
}
ret = syncop_setxattr (old_target, loc, dict, 0, NULL, NULL);
- if (ret) {
+ if (ret && -ret != ESTALE && -ret != ENOENT) {
gf_msg (this->name, GF_LOG_ERROR, 0,
DHT_MSG_MIGRATE_FILE_FAILED,
"failed to set xattr on %s in %s (%s)",
loc->path, old_target->name, strerror (-ret));
ret = -1;
goto out;
+ } else if (-ret == ESTALE || -ret == ENOENT) {
+ /* The failure ESTALE indicates that the linkto
+ * file on the hashed subvol might have been deleted.
+ * In this case will create a linkto file with new target
+ * as linkto xattr value*/
+ linkto_fd = fd_create (loc->inode, DHT_REBALANCE_PID);
+ if (!linkto_fd) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: fd create failed (%s)",
+ loc->path, strerror (errno));
+ ret = -1;
+ goto out;
+ }
+ ret = syncop_create (old_target, loc, O_RDWR,
+ DHT_LINKFILE_MODE, linkto_fd,
+ NULL, dict, NULL);
+ if (ret != 0 && -ret != EEXIST && -ret != ESTALE) {
+ ret = -1;
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to create linkto file on %s in %s (%s)",
+ loc->path, old_target->name, strerror (-ret));
+ goto out;
+ } else if (ret == 0) {
+ ret = syncop_fsetattr (old_target, linkto_fd, &stbuf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
+ NULL, NULL, NULL, NULL);
+ if (ret < 0)
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "chown failed for %s on %s (%s)",
+ loc->path, old_target->name, strerror (-ret));
+ }
}
}
}
@@ -2043,6 +2101,8 @@ out:
syncop_close (dst_fd);
if (src_fd)
syncop_close (src_fd);
+ if (linkto_fd)
+ syncop_close (linkto_fd);
loc_wipe (&tmp_loc);