summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSusant Palai <spalai@redhat.com>2018-01-18 13:06:12 +0530
committerShyamsundar Ranganathan <srangana@redhat.com>2018-02-06 14:33:30 +0000
commita6aaf29d57274c452de057cb8d7b4bf4da0466b1 (patch)
tree40c8f167a5e7428f8fa8345d56a6d30dd4ce60e9
parent0e6e25ae9e11a15771da796eff211ef1f1c43684 (diff)
cluster/dht: avoid overwriting client writes during migration
For more details on this issue see https://github.com/gluster/glusterfs/issues/308 Solution: This is a restrictive solution where a file will not be migrated if a client writes to it during the migration. This does not check if the writes from the rebalance and the client actually do overlap. If dht_writev_cbk finds that the file is being migrated (PHASE1) it will set an xattr on the destination file indicating the file was updated by a non-rebalance client. Rebalance checks if any other client has written to the dst file and aborts the file migration if it finds the xattr. updates gluster/glusterfs#308 Change-Id: I73aec28bc9dbb8da57c7425ec88c6b6af0fbc9dd Signed-off-by: Susant Palai <spalai@redhat.com> Signed-off-by: Raghavendra G <rgowdapp@redhat.com> Signed-off-by: N Balachandran <nbalacha@redhat.com> (cherry picked from commit 545a7ce6762a1b3a7b989b43a9d18b5b1b299df0)
-rw-r--r--libglusterfs/src/globals.h5
-rw-r--r--tests/basic/distribute/force-migration.t50
-rwxr-xr-xtests/basic/tier/fops-during-migration.t1
-rw-r--r--xlators/cluster/dht/src/dht-common.h2
-rw-r--r--xlators/cluster/dht/src/dht-inode-write.c28
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c103
-rw-r--r--xlators/cluster/dht/src/dht-shared.c17
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c8
-rw-r--r--xlators/storage/posix/src/posix-entry-ops.c12
-rw-r--r--xlators/storage/posix/src/posix-handle.h4
-rw-r--r--xlators/storage/posix/src/posix-helpers.c42
-rw-r--r--xlators/storage/posix/src/posix-inode-fd-ops.c9
12 files changed, 269 insertions, 12 deletions
diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h
index c6970511063..f0b0acda6b1 100644
--- a/libglusterfs/src/globals.h
+++ b/libglusterfs/src/globals.h
@@ -19,6 +19,11 @@
#define GD_MIN_OP_VERSION_KEY "minimum-operating-version"
#define GD_MAX_OP_VERSION_KEY "maximum-operating-version"
+#define GF_PROTECT_FROM_EXTERNAL_WRITES "trusted.glusterfs.protect.writes"
+#define GF_AVOID_OVERWRITE "glusterfs.avoid.overwrite"
+#define GF_CLEAN_WRITE_PROTECTION "glusterfs.clean.writexattr"
+
+
/* Gluster versions - OP-VERSION mapping
*
* 3.3.x - 1
diff --git a/tests/basic/distribute/force-migration.t b/tests/basic/distribute/force-migration.t
new file mode 100644
index 00000000000..f6c4997a505
--- /dev/null
+++ b/tests/basic/distribute/force-migration.t
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+#This tests checks if the file migration fails with force-migration
+#option set to off.
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1
+TEST $CLI volume set $V0 performance.quick-read off
+TEST $CLI volume set $V0 performance.io-cache off
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 performance.read-ahead off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+TEST touch $M0/file
+#This rename creates a link file for tile in the other brick.
+TEST mv $M0/file $M0/tile
+#Lets keep writing to the file which will have a open fd
+dd if=/dev/zero of=$M0/tile bs=1b &
+bg_pid=$!
+#Now rebalance will try to skip the file
+TEST $CLI volume set $V0 force-migration off
+TEST $CLI volume rebalance $V0 start force
+EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" rebalance_status_field $V0
+skippedcount=`gluster v rebalance $V0 status | awk 'NR==3{print $6}'`
+TEST [[ $skippedcount -eq 1 ]]
+#file should be migrated now
+TEST $CLI volume set $V0 force-migration on
+TEST $CLI volume rebalance $V0 start force
+EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" rebalance_status_field $V0
+skippedcount=`gluster v rebalance $V0 status | awk 'NR==3{print $6}'`
+rebalancedcount=`gluster v rebalance $V0 status | awk 'NR==3{print $2}'`
+TEST [[ $skippedcount -eq 0 ]]
+TEST [[ $rebalancedcount -eq 1 ]]
+kill -9 $bg_pid > /dev/null 2>&1
+wait > /dev/null 2>&1
+cleanup
+#Bad test because we are not sure writes are happening at the time of
+#rebalance. We need to write a test case which makes sure client
+#writes happen during rebalance. One way would be to set S+T bits on
+#src and write to file from client and then start rebalance. Currently
+#marking this as bad test.
+#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000
+
diff --git a/tests/basic/tier/fops-during-migration.t b/tests/basic/tier/fops-during-migration.t
index c85b4e337cc..458c01e93c5 100755
--- a/tests/basic/tier/fops-during-migration.t
+++ b/tests/basic/tier/fops-during-migration.t
@@ -22,6 +22,7 @@ function create_dist_tier_vol () {
TEST $CLI volume set $V0 performance.quick-read off
TEST $CLI volume set $V0 performance.io-cache off
TEST $CLI volume set $V0 features.ctr-enabled on
+ TEST $CLI volume set $V0 cluster.force-migration on
TEST $CLI volume start $V0
TEST $CLI volume tier $V0 attach $H0:$B0/hot/${V0}{0..$1}
TEST $CLI volume set $V0 cluster.tier-demote-frequency $DEMOTE_FREQ
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 91ba3418643..9671bbe1cbe 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -695,6 +695,8 @@ struct dht_conf {
synclock_t link_lock;
gf_boolean_t use_fallocate;
+
+ gf_boolean_t force_migration;
};
typedef struct dht_conf dht_conf_t;
diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c
index 7c596b1c099..226ee95e8b3 100644
--- a/xlators/cluster/dht/src/dht-inode-write.c
+++ b/xlators/cluster/dht/src/dht-inode-write.c
@@ -95,6 +95,33 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
/* Check if the rebalance phase1 is true */
if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+ if (!dht_is_tier_xlator (this)) {
+ if (!local->xattr_req) {
+ local->xattr_req = dict_new ();
+ if (!local->xattr_req) {
+ gf_msg (this->name, GF_LOG_ERROR,
+ DHT_MSG_NO_MEMORY,
+ ENOMEM, "insufficient memory");
+ local->op_errno = ENOMEM;
+ local->op_ret = -1;
+ goto out;
+ }
+ }
+
+ ret = dict_set_uint32 (local->xattr_req,
+ GF_PROTECT_FROM_EXTERNAL_WRITES,
+ 1);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR,
+ DHT_MSG_DICT_SET_FAILED, 0,
+ "Failed to set key %s in dictionary",
+ GF_PROTECT_FROM_EXTERNAL_WRITES);
+ local->op_errno = ENOMEM;
+ local->op_ret = -1;
+ goto out;
+ }
+ }
+
dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
@@ -146,7 +173,6 @@ dht_writev2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
return 0;
}
-
if (subvol == NULL)
goto out;
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index 70d5a5f316f..d9c3149049c 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -26,7 +26,8 @@
#define MAX_REBAL_TYPE_SIZE 16
#define FILE_CNT_INTERVAL 600 /* 10 mins */
#define ESTIMATE_START_INTERVAL 600 /* 10 mins */
-
+#define HARDLINK_MIG_INPROGRESS -2
+#define SKIP_MIGRATION_FD_POSITIVE -3
#ifndef MAX
#define MAX(a, b) (((a) > (b))?(a):(b))
#endif
@@ -680,6 +681,7 @@ __dht_rebalance_create_dst_file (xlator_t *this, xlator_t *to, xlator_t *from,
struct iatt check_stbuf= {0,};
dht_conf_t *conf = NULL;
dict_t *dict = NULL;
+ dict_t *xdata = NULL;
conf = this->private;
@@ -725,7 +727,31 @@ __dht_rebalance_create_dst_file (xlator_t *this, xlator_t *to, xlator_t *from,
goto out;
}
- ret = syncop_lookup (to, loc, &new_stbuf, NULL, NULL, NULL);
+ if (!!dht_is_tier_xlator (this)) {
+ xdata = dict_new ();
+ if (!xdata) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: dict_new failed)",
+ loc->path);
+ goto out;
+ }
+
+ ret = dict_set_int32 (xdata, GF_CLEAN_WRITE_PROTECTION, 1);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_DICT_SET_FAILED,
+ "%s: failed to set dictionary value: key = %s ",
+ loc->path, GF_CLEAN_WRITE_PROTECTION);
+ goto out;
+ }
+ }
+
+ ret = syncop_lookup (to, loc, &new_stbuf, NULL, xdata, NULL);
if (!ret) {
/* File exits in the destination, check if gfid matches */
if (gf_uuid_compare (stbuf->ia_gfid, new_stbuf.ia_gfid) != 0) {
@@ -875,6 +901,10 @@ out:
if (dict)
dict_unref (dict);
+ if (xdata)
+ dict_unref (dict);
+
+
return ret;
}
@@ -1090,9 +1120,9 @@ out:
}
static int
-__dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from,
- xlator_t *to, fd_t *src, fd_t *dst,
- uint64_t ia_size, int hole_exists,
+__dht_rebalance_migrate_data (xlator_t *this, gf_defrag_info_t *defrag,
+ xlator_t *from, xlator_t *to, fd_t *src,
+ fd_t *dst, uint64_t ia_size, int hole_exists,
int *fop_errno)
{
int ret = 0;
@@ -1102,7 +1132,10 @@ __dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from,
struct iobref *iobref = NULL;
uint64_t total = 0;
size_t read_size = 0;
+ dict_t *xdata = NULL;
+ dht_conf_t *conf = NULL;
+ conf = this->private;
/* if file size is '0', no need to enter this loop */
while (total < ia_size) {
read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) ?
@@ -1121,8 +1154,42 @@ __dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from,
ret, offset, iobref,
fop_errno);
} else {
+ if (!conf->force_migration &&
+ !dht_is_tier_xlator (this)) {
+ xdata = dict_new ();
+ if (!xdata) {
+ gf_msg ("dht", GF_LOG_ERROR, 0,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "insufficient memory");
+ ret = -1;
+ *fop_errno = ENOMEM;
+ break;
+ }
+
+ /* Fail this write and abort rebalance if we
+ * detect a write from client since migration of
+ * this file started. This is done to avoid
+ * potential data corruption due to out of order
+ * writes from rebalance and client to the same
+ * region (as compared between src and dst
+ * files). See
+ * https://github.com/gluster/glusterfs/issues/308
+ * for more details.
+ */
+ ret = dict_set_int32 (xdata,
+ GF_AVOID_OVERWRITE, 1);
+ if (ret) {
+ gf_msg ("dht", GF_LOG_ERROR, 0,
+ ENOMEM, "failed to set dict");
+ ret = -1;
+ *fop_errno = ENOMEM;
+ break;
+ }
+
+ }
+
ret = syncop_writev (to, dst, vector, count,
- offset, iobref, 0, NULL, NULL);
+ offset, iobref, 0, xdata, NULL);
if (ret < 0) {
*fop_errno = -ret;
}
@@ -1158,6 +1225,10 @@ __dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from,
else
ret = -1;
+ if (xdata) {
+ dict_unref (xdata);
+ }
+
return ret;
}
@@ -1575,7 +1646,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
-
/* Do not migrate file in case lock migration is not enabled on the
* volume*/
if (!conf->lock_migration_enabled) {
@@ -1642,7 +1712,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag, defrag, conf,
fop_errno);
if (ret) {
- if (ret == -2)
+ if (ret == HARDLINK_MIG_INPROGRESS)
ret = 0;
goto out;
}
@@ -1785,7 +1855,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
ret = __check_file_has_hardlink (this, loc, &stbuf, xattr_rsp,
flag, defrag, conf, fop_errno);
if (ret) {
- if (ret == -2)
+ if (ret == HARDLINK_MIG_INPROGRESS)
ret = 0;
goto out;
}
@@ -1794,8 +1864,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
file_has_holes = 1;
- ret = __dht_rebalance_migrate_data (defrag, from, to, src_fd, dst_fd,
- stbuf.ia_size,
+ ret = __dht_rebalance_migrate_data (this, defrag, from, to,
+ src_fd, dst_fd, stbuf.ia_size,
file_has_holes, fop_errno);
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, 0,
@@ -2280,6 +2350,17 @@ out:
}
}
+ if (!dht_is_tier_xlator (this)) {
+ lk_ret = syncop_removexattr (to, loc,
+ GF_PROTECT_FROM_EXTERNAL_WRITES,
+ NULL, NULL);
+ if (lk_ret) {
+ gf_msg (this->name, GF_LOG_WARNING, -lk_ret, 0,
+ "%s: removexattr failed key %s", loc->path,
+ GF_CLEAN_WRITE_PROTECTION);
+ }
+ }
+
if (dict)
dict_unref (dict);
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index 8c011f72530..51c9d9cb3cf 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -524,6 +524,10 @@ dht_reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("lock-migration", conf->lock_migration_enabled,
options, bool, out);
+ GF_OPTION_RECONF ("force-migration", conf->force_migration,
+ options, bool, out);
+
+
if (conf->defrag) {
if (dict_get_str (options, "rebal-throttle", &temp_str) == 0) {
ret = dht_configure_throttle (this, conf, temp_str);
@@ -810,6 +814,10 @@ dht_init (xlator_t *this)
GF_OPTION_INIT ("lock-migration", conf->lock_migration_enabled,
bool, err);
+ GF_OPTION_INIT ("force-migration", conf->force_migration,
+ bool, err);
+
+
if (defrag) {
defrag->lock_migration_enabled = conf->lock_migration_enabled;
@@ -1203,5 +1211,14 @@ struct volume_options options[] = {
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC
},
+ { .key = {"force-migration"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "If disabled, rebalance will not migrate files that "
+ "are being written to by an application",
+ .op_version = {GD_OP_VERSION_4_0_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC
+ },
+
{ .key = {NULL} },
};
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index b87f6aa8cd7..674769c0b8f 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1349,6 +1349,14 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.flags = VOLOPT_FLAG_CLIENT_OPT,
},
+ { .key = "cluster.force-migration",
+ .voltype = "cluster/distribute",
+ .option = "force-migration",
+ .value = "off",
+ .op_version = GD_OP_VERSION_4_0_0,
+ .flags = VOLOPT_FLAG_CLIENT_OPT,
+ },
+
/* NUFA xlator options (Distribute special case) */
{ .key = "cluster.nufa",
.voltype = "cluster/distribute",
diff --git a/xlators/storage/posix/src/posix-entry-ops.c b/xlators/storage/posix/src/posix-entry-ops.c
index 4d7ed5be7c8..41d8c873b4c 100644
--- a/xlators/storage/posix/src/posix-entry-ops.c
+++ b/xlators/storage/posix/src/posix-entry-ops.c
@@ -180,6 +180,7 @@ posix_lookup (call_frame_t *frame, xlator_t *this,
int32_t nlink_samepgfid = 0;
struct posix_private *priv = NULL;
posix_inode_ctx_t *ctx = NULL;
+ int ret = 0;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -248,6 +249,17 @@ posix_lookup (call_frame_t *frame, xlator_t *this,
if (xdata && (op_ret == 0)) {
xattr = posix_xattr_fill (this, real_path, loc, NULL, -1, xdata,
&buf);
+
+ if (dict_get (xdata, GF_CLEAN_WRITE_PROTECTION)) {
+ ret = sys_lremovexattr (real_path,
+ GF_PROTECT_FROM_EXTERNAL_WRITES);
+ if (ret == -1 && (errno != ENODATA && errno != ENOATTR))
+ gf_msg (this->name, GF_LOG_ERROR,
+ P_MSG_XATTR_NOT_REMOVED, errno,
+ "removexattr failed. key %s path %s",
+ GF_PROTECT_FROM_EXTERNAL_WRITES,
+ loc->path);
+ }
}
if (priv->update_pgfid_nlinks) {
diff --git a/xlators/storage/posix/src/posix-handle.h b/xlators/storage/posix/src/posix-handle.h
index 97186f91e64..8a07cf2b57d 100644
--- a/xlators/storage/posix/src/posix-handle.h
+++ b/xlators/storage/posix/src/posix-handle.h
@@ -186,4 +186,8 @@ int
posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid,
char *real_path, inode_table_t *itable);
+int
+posix_check_internal_writes (xlator_t *this, fd_t *fd, int sysfd,
+ dict_t *xdata);
+
#endif /* !_POSIX_HANDLE_H */
diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c
index 90afced604c..56c9b4afe94 100644
--- a/xlators/storage/posix/src/posix-helpers.c
+++ b/xlators/storage/posix/src/posix-helpers.c
@@ -2739,3 +2739,45 @@ posix_override_umask (mode_t mode, mode_t mode_bit)
gf_msg_debug ("posix", 0, "The value of mode is %u", mode);
return mode;
}
+
+int
+posix_check_internal_writes (xlator_t *this, fd_t *fd, int sysfd,
+ dict_t *xdata)
+{
+ int ret = 0;
+ size_t xattrsize = 0;
+ data_t *val = NULL;
+
+ LOCK (&fd->inode->lock);
+ {
+ val = dict_get (xdata, GF_PROTECT_FROM_EXTERNAL_WRITES);
+ if (val) {
+ ret = sys_fsetxattr (sysfd,
+ GF_PROTECT_FROM_EXTERNAL_WRITES,
+ val->data, val->len, 0);
+ if (ret == -1) {
+ gf_msg (this->name, GF_LOG_ERROR,
+ P_MSG_XATTR_FAILED,
+ errno, "setxattr failed key %s",
+ GF_PROTECT_FROM_EXTERNAL_WRITES);
+ }
+
+ goto out;
+ }
+
+ if (dict_get (xdata, GF_AVOID_OVERWRITE)) {
+ xattrsize = sys_fgetxattr (sysfd,
+ GF_PROTECT_FROM_EXTERNAL_WRITES,
+ NULL, 0);
+ if ((xattrsize == -1) && ((errno == ENOATTR) ||
+ (errno == ENODATA))) {
+ ret = 0;
+ } else {
+ ret = -1;
+ }
+ }
+ }
+out:
+ UNLOCK (&fd->inode->lock);
+ return ret;
+}
diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c
index c90bc6c438a..812cf792874 100644
--- a/xlators/storage/posix/src/posix-inode-fd-ops.c
+++ b/xlators/storage/posix/src/posix-inode-fd-ops.c
@@ -1579,6 +1579,15 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
_fd = pfd->fd;
+ ret = posix_check_internal_writes (this, fd, _fd, xdata);
+ if (ret < 0) {
+ gf_msg (this->name, GF_LOG_ERROR, 0, 0,
+ "possible overwrite from internal client, fd=%p", fd);
+ op_ret = -1;
+ op_errno = EBUSY;
+ goto out;
+ }
+
if (xdata) {
if (dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND))
write_append = _gf_true;