From a6aaf29d57274c452de057cb8d7b4bf4da0466b1 Mon Sep 17 00:00:00 2001 From: Susant Palai Date: Thu, 18 Jan 2018 13:06:12 +0530 Subject: cluster/dht: avoid overwriting client writes during migration For more details on this issue see https://github.com/gluster/glusterfs/issues/308 Solution: This is a restrictive solution where a file will not be migrated if a client writes to it during the migration. This does not check if the writes from the rebalance and the client actually do overlap. If dht_writev_cbk finds that the file is being migrated (PHASE1) it will set an xattr on the destination file indicating the file was updated by a non-rebalance client. Rebalance checks if any other client has written to the dst file and aborts the file migration if it finds the xattr. updates gluster/glusterfs#308 Change-Id: I73aec28bc9dbb8da57c7425ec88c6b6af0fbc9dd Signed-off-by: Susant Palai Signed-off-by: Raghavendra G Signed-off-by: N Balachandran (cherry picked from commit 545a7ce6762a1b3a7b989b43a9d18b5b1b299df0) --- xlators/cluster/dht/src/dht-common.h | 2 + xlators/cluster/dht/src/dht-inode-write.c | 28 ++++++- xlators/cluster/dht/src/dht-rebalance.c | 103 +++++++++++++++++++++--- xlators/cluster/dht/src/dht-shared.c | 17 ++++ xlators/mgmt/glusterd/src/glusterd-volume-set.c | 8 ++ xlators/storage/posix/src/posix-entry-ops.c | 12 +++ xlators/storage/posix/src/posix-handle.h | 4 + xlators/storage/posix/src/posix-helpers.c | 42 ++++++++++ xlators/storage/posix/src/posix-inode-fd-ops.c | 9 +++ 9 files changed, 213 insertions(+), 12 deletions(-) (limited to 'xlators') diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 91ba3418643..9671bbe1cbe 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -695,6 +695,8 @@ struct dht_conf { synclock_t link_lock; gf_boolean_t use_fallocate; + + gf_boolean_t force_migration; }; typedef struct dht_conf dht_conf_t; diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index 7c596b1c099..226ee95e8b3 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -95,6 +95,33 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, /* Check if the rebalance phase1 is true */ if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + if (!dht_is_tier_xlator (this)) { + if (!local->xattr_req) { + local->xattr_req = dict_new (); + if (!local->xattr_req) { + gf_msg (this->name, GF_LOG_ERROR, + DHT_MSG_NO_MEMORY, + ENOMEM, "insufficient memory"); + local->op_errno = ENOMEM; + local->op_ret = -1; + goto out; + } + } + + ret = dict_set_uint32 (local->xattr_req, + GF_PROTECT_FROM_EXTERNAL_WRITES, + 1); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, + DHT_MSG_DICT_SET_FAILED, 0, + "Failed to set key %s in dictionary", + GF_PROTECT_FROM_EXTERNAL_WRITES); + local->op_errno = ENOMEM; + local->op_ret = -1; + goto out; + } + } + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); @@ -146,7 +173,6 @@ dht_writev2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) return 0; } - if (subvol == NULL) goto out; diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 70d5a5f316f..d9c3149049c 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -26,7 +26,8 @@ #define MAX_REBAL_TYPE_SIZE 16 #define FILE_CNT_INTERVAL 600 /* 10 mins */ #define ESTIMATE_START_INTERVAL 600 /* 10 mins */ - +#define HARDLINK_MIG_INPROGRESS -2 +#define SKIP_MIGRATION_FD_POSITIVE -3 #ifndef MAX #define MAX(a, b) (((a) > (b))?(a):(b)) #endif @@ -680,6 +681,7 @@ __dht_rebalance_create_dst_file (xlator_t *this, xlator_t *to, xlator_t *from, struct iatt check_stbuf= {0,}; dht_conf_t *conf = NULL; dict_t *dict = NULL; + dict_t *xdata = NULL; conf = this->private; @@ -725,7 +727,31 @@ __dht_rebalance_create_dst_file (xlator_t *this, xlator_t *to, xlator_t *from, goto out; } - ret = syncop_lookup (to, loc, &new_stbuf, NULL, NULL, NULL); + if (!!dht_is_tier_xlator (this)) { + xdata = dict_new (); + if (!xdata) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg (this->name, GF_LOG_ERROR, ENOMEM, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: dict_new failed)", + loc->path); + goto out; + } + + ret = dict_set_int32 (xdata, GF_CLEAN_WRITE_PROTECTION, 1); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_DICT_SET_FAILED, + "%s: failed to set dictionary value: key = %s ", + loc->path, GF_CLEAN_WRITE_PROTECTION); + goto out; + } + } + + ret = syncop_lookup (to, loc, &new_stbuf, NULL, xdata, NULL); if (!ret) { /* File exits in the destination, check if gfid matches */ if (gf_uuid_compare (stbuf->ia_gfid, new_stbuf.ia_gfid) != 0) { @@ -875,6 +901,10 @@ out: if (dict) dict_unref (dict); + if (xdata) + dict_unref (dict); + + return ret; } @@ -1090,9 +1120,9 @@ out: } static int -__dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from, - xlator_t *to, fd_t *src, fd_t *dst, - uint64_t ia_size, int hole_exists, +__dht_rebalance_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, + xlator_t *from, xlator_t *to, fd_t *src, + fd_t *dst, uint64_t ia_size, int hole_exists, int *fop_errno) { int ret = 0; @@ -1102,7 +1132,10 @@ __dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from, struct iobref *iobref = NULL; uint64_t total = 0; size_t read_size = 0; + dict_t *xdata = NULL; + dht_conf_t *conf = NULL; + conf = this->private; /* if file size is '0', no need to enter this loop */ while (total < ia_size) { read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) ? @@ -1121,8 +1154,42 @@ __dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from, ret, offset, iobref, fop_errno); } else { + if (!conf->force_migration && + !dht_is_tier_xlator (this)) { + xdata = dict_new (); + if (!xdata) { + gf_msg ("dht", GF_LOG_ERROR, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "insufficient memory"); + ret = -1; + *fop_errno = ENOMEM; + break; + } + + /* Fail this write and abort rebalance if we + * detect a write from client since migration of + * this file started. This is done to avoid + * potential data corruption due to out of order + * writes from rebalance and client to the same + * region (as compared between src and dst + * files). See + * https://github.com/gluster/glusterfs/issues/308 + * for more details. + */ + ret = dict_set_int32 (xdata, + GF_AVOID_OVERWRITE, 1); + if (ret) { + gf_msg ("dht", GF_LOG_ERROR, 0, + ENOMEM, "failed to set dict"); + ret = -1; + *fop_errno = ENOMEM; + break; + } + + } + ret = syncop_writev (to, dst, vector, count, - offset, iobref, 0, NULL, NULL); + offset, iobref, 0, xdata, NULL); if (ret < 0) { *fop_errno = -ret; } @@ -1158,6 +1225,10 @@ __dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from, else ret = -1; + if (xdata) { + dict_unref (xdata); + } + return ret; } @@ -1575,7 +1646,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } - /* Do not migrate file in case lock migration is not enabled on the * volume*/ if (!conf->lock_migration_enabled) { @@ -1642,7 +1712,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag, defrag, conf, fop_errno); if (ret) { - if (ret == -2) + if (ret == HARDLINK_MIG_INPROGRESS) ret = 0; goto out; } @@ -1785,7 +1855,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, ret = __check_file_has_hardlink (this, loc, &stbuf, xattr_rsp, flag, defrag, conf, fop_errno); if (ret) { - if (ret == -2) + if (ret == HARDLINK_MIG_INPROGRESS) ret = 0; goto out; } @@ -1794,8 +1864,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, file_has_holes = 1; - ret = __dht_rebalance_migrate_data (defrag, from, to, src_fd, dst_fd, - stbuf.ia_size, + ret = __dht_rebalance_migrate_data (this, defrag, from, to, + src_fd, dst_fd, stbuf.ia_size, file_has_holes, fop_errno); if (ret) { gf_msg (this->name, GF_LOG_ERROR, 0, @@ -2280,6 +2350,17 @@ out: } } + if (!dht_is_tier_xlator (this)) { + lk_ret = syncop_removexattr (to, loc, + GF_PROTECT_FROM_EXTERNAL_WRITES, + NULL, NULL); + if (lk_ret) { + gf_msg (this->name, GF_LOG_WARNING, -lk_ret, 0, + "%s: removexattr failed key %s", loc->path, + GF_CLEAN_WRITE_PROTECTION); + } + } + if (dict) dict_unref (dict); diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 8c011f72530..51c9d9cb3cf 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -524,6 +524,10 @@ dht_reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("lock-migration", conf->lock_migration_enabled, options, bool, out); + GF_OPTION_RECONF ("force-migration", conf->force_migration, + options, bool, out); + + if (conf->defrag) { if (dict_get_str (options, "rebal-throttle", &temp_str) == 0) { ret = dht_configure_throttle (this, conf, temp_str); @@ -810,6 +814,10 @@ dht_init (xlator_t *this) GF_OPTION_INIT ("lock-migration", conf->lock_migration_enabled, bool, err); + GF_OPTION_INIT ("force-migration", conf->force_migration, + bool, err); + + if (defrag) { defrag->lock_migration_enabled = conf->lock_migration_enabled; @@ -1203,5 +1211,14 @@ struct volume_options options[] = { .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC }, + { .key = {"force-migration"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "If disabled, rebalance will not migrate files that " + "are being written to by an application", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC + }, + { .key = {NULL} }, }; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index b87f6aa8cd7..674769c0b8f 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1349,6 +1349,14 @@ struct volopt_map_entry glusterd_volopt_map[] = { .flags = VOLOPT_FLAG_CLIENT_OPT, }, + { .key = "cluster.force-migration", + .voltype = "cluster/distribute", + .option = "force-migration", + .value = "off", + .op_version = GD_OP_VERSION_4_0_0, + .flags = VOLOPT_FLAG_CLIENT_OPT, + }, + /* NUFA xlator options (Distribute special case) */ { .key = "cluster.nufa", .voltype = "cluster/distribute", diff --git a/xlators/storage/posix/src/posix-entry-ops.c b/xlators/storage/posix/src/posix-entry-ops.c index 4d7ed5be7c8..41d8c873b4c 100644 --- a/xlators/storage/posix/src/posix-entry-ops.c +++ b/xlators/storage/posix/src/posix-entry-ops.c @@ -180,6 +180,7 @@ posix_lookup (call_frame_t *frame, xlator_t *this, int32_t nlink_samepgfid = 0; struct posix_private *priv = NULL; posix_inode_ctx_t *ctx = NULL; + int ret = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -248,6 +249,17 @@ posix_lookup (call_frame_t *frame, xlator_t *this, if (xdata && (op_ret == 0)) { xattr = posix_xattr_fill (this, real_path, loc, NULL, -1, xdata, &buf); + + if (dict_get (xdata, GF_CLEAN_WRITE_PROTECTION)) { + ret = sys_lremovexattr (real_path, + GF_PROTECT_FROM_EXTERNAL_WRITES); + if (ret == -1 && (errno != ENODATA && errno != ENOATTR)) + gf_msg (this->name, GF_LOG_ERROR, + P_MSG_XATTR_NOT_REMOVED, errno, + "removexattr failed. key %s path %s", + GF_PROTECT_FROM_EXTERNAL_WRITES, + loc->path); + } } if (priv->update_pgfid_nlinks) { diff --git a/xlators/storage/posix/src/posix-handle.h b/xlators/storage/posix/src/posix-handle.h index 97186f91e64..8a07cf2b57d 100644 --- a/xlators/storage/posix/src/posix-handle.h +++ b/xlators/storage/posix/src/posix-handle.h @@ -186,4 +186,8 @@ int posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid, char *real_path, inode_table_t *itable); +int +posix_check_internal_writes (xlator_t *this, fd_t *fd, int sysfd, + dict_t *xdata); + #endif /* !_POSIX_HANDLE_H */ diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index 90afced604c..56c9b4afe94 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -2739,3 +2739,45 @@ posix_override_umask (mode_t mode, mode_t mode_bit) gf_msg_debug ("posix", 0, "The value of mode is %u", mode); return mode; } + +int +posix_check_internal_writes (xlator_t *this, fd_t *fd, int sysfd, + dict_t *xdata) +{ + int ret = 0; + size_t xattrsize = 0; + data_t *val = NULL; + + LOCK (&fd->inode->lock); + { + val = dict_get (xdata, GF_PROTECT_FROM_EXTERNAL_WRITES); + if (val) { + ret = sys_fsetxattr (sysfd, + GF_PROTECT_FROM_EXTERNAL_WRITES, + val->data, val->len, 0); + if (ret == -1) { + gf_msg (this->name, GF_LOG_ERROR, + P_MSG_XATTR_FAILED, + errno, "setxattr failed key %s", + GF_PROTECT_FROM_EXTERNAL_WRITES); + } + + goto out; + } + + if (dict_get (xdata, GF_AVOID_OVERWRITE)) { + xattrsize = sys_fgetxattr (sysfd, + GF_PROTECT_FROM_EXTERNAL_WRITES, + NULL, 0); + if ((xattrsize == -1) && ((errno == ENOATTR) || + (errno == ENODATA))) { + ret = 0; + } else { + ret = -1; + } + } + } +out: + UNLOCK (&fd->inode->lock); + return ret; +} diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c index c90bc6c438a..812cf792874 100644 --- a/xlators/storage/posix/src/posix-inode-fd-ops.c +++ b/xlators/storage/posix/src/posix-inode-fd-ops.c @@ -1579,6 +1579,15 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, _fd = pfd->fd; + ret = posix_check_internal_writes (this, fd, _fd, xdata); + if (ret < 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, + "possible overwrite from internal client, fd=%p", fd); + op_ret = -1; + op_errno = EBUSY; + goto out; + } + if (xdata) { if (dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) write_append = _gf_true; -- cgit