summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--xlators/cluster/dht/src/dht-common.c287
-rw-r--r--xlators/cluster/dht/src/dht-common.h17
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c57
-rw-r--r--xlators/storage/posix/src/posix.c76
-rw-r--r--xlators/storage/posix/src/posix.h2
5 files changed, 399 insertions, 40 deletions
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index e2d6da2da81..ad635af95b9 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -783,6 +783,80 @@ out:
return ret;
}
+int
+dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ int this_call_cnt = 0;
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
+ dht_lookup_everywhere_done (frame, this);
+ }
+
+ return 0;
+}
+
+int
+dht_lookup_unlink_stale_linkto_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+
+ /* NOTE:
+ * If stale file unlink fails either there is an open-fd or is not an
+ * dht-linkto-file then posix_unlink returns EBUSY, which is overwritten
+ * to ENOENT
+ */
+
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL,
+ NULL);
+
+ return 0;
+}
+
+/* Rebalance is performed from cached_node to hashed_node. Initial cached_node
+ * contains a non-linkto file. After migration it is converted to linkto and
+ * then unlinked. And at hashed_subvolume, first a linkto file is present,
+ * then after migration it is converted to a non-linkto file.
+ *
+ * Lets assume a file is present on cached subvolume and a new brick is added
+ * and new brick is the new_hashed subvolume. So fresh lookup on newly added
+ * hashed subvolume will fail and dht_lookup_everywhere gets called. If just
+ * before sending the dht_lookup_everywhere request rebalance is in progress,
+ *
+ * from cached subvolume it may see: Nonlinkto or linkto or No file
+ * from hashed subvolume it may see: No file or linkto file or non-linkto file
+ *
+ * So this boils down to 9 cases:
+ * at cached_subvol at hashed_subvol
+ * ---------------- -----------------
+ *
+ *a) No file No file
+ * [request reached after [Request reached before
+ * migration] Migration]
+ *
+ *b) No file Linkto File
+ *
+ *c) No file Non-Linkto File
+ *
+ *d) Linkto No-File
+ *
+ *e) Linkto Linkto
+ *
+ *f) Linkto Non-Linkto
+ *
+ *g) NonLinkto No-File
+ *
+ *h) NonLinkto Linkto
+ *
+ *i) NonLinkto NonLinkto
+ *
+ * dht_lookup_everywhere_done takes decision based on any of the above case
+ */
int
dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
@@ -792,6 +866,7 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
xlator_t *hashed_subvol = NULL;
xlator_t *cached_subvol = NULL;
dht_layout_t *layout = NULL;
+ gf_boolean_t found_non_linkto_on_hashed = _gf_false;
local = frame->local;
hashed_subvol = local->hashed_subvol;
@@ -813,19 +888,167 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
return 0;
}
+
if (!cached_subvol) {
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL,
- NULL);
+
+ if (local->skip_unlink.handle_valid_link && hashed_subvol) {
+
+ /*Purpose of "unlink-only-if-dht-linkto-file":
+ * If this lookup is performed by rebalance and this
+ * rebalance process detected hashed file and by
+ * the time it sends the lookup request to cached node,
+ * file got migrated and now at intial hashed_node,
+ * final migrated file is present. With current logic,
+ * because this process fails to find the cached_node,
+ * it will unlink the file at initial hashed_node.
+ *
+ * So we avoid this by setting key, and checking at the
+ * posix_unlink that unlink the file only if file is a
+ * linkto file and not a migrated_file.
+ */
+
+ ret = dict_set_int32 (local->xattr_req,
+ "unlink-only-if-dht-linkto-file",
+ 1);
+
+ if (ret)
+ goto dict_err;
+
+ /*Later other consumers can also use this key to avoid
+ * unlinking in case of open_fd
+ */
+
+ ret = dict_set_int32 (local->xattr_req,
+ "dont-unlink-for-open-fd", 1);
+
+dict_err:
+ if (ret) {
+ /* If for some reason, setting key in the dict
+ * fails, return with ENOENT, as with respect to
+ * this process, it detected only a stale link
+ * file.
+ *
+ * Next lookup will delete it.
+ *
+ * Performing deletion of stale link file when
+ * setting key in dict fails, may cause the data
+ * loss becase of the above mentioned race.
+ */
+
+
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT,
+ NULL, NULL, NULL, NULL);
+ } else {
+ local->skip_unlink.handle_valid_link = _gf_false;
+ STACK_WIND (frame,
+ dht_lookup_unlink_stale_linkto_cbk,
+ hashed_subvol,
+ hashed_subvol->fops->unlink,
+ &local->loc, 0, local->xattr_req);
+ }
+
+ } else {
+
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL,
+ NULL, NULL);
+ }
return 0;
}
- if (local->need_lookup_everywhere) {
- if (uuid_compare (local->gfid, local->inode->gfid)) {
- /* GFID different, return error */
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL,
- NULL, NULL, NULL);
- return 0;
+ /* At the time of dht_lookup, no file was found on hashed and that is
+ * why dht_lookup_everywhere is called, but by the time
+ * dht_lookup_everywhere
+ * reached to server, file might have already migrated. In that case we
+ * will find a migrated file at the hashed_node. In this case store the
+ * layout in context and return successfully.
+ */
+
+ if (hashed_subvol || local->need_lookup_everywhere) {
+
+ if (local->need_lookup_everywhere) {
+
+ found_non_linkto_on_hashed = _gf_true;
+
+ } else if ((local->file_count == 1) &&
+ (hashed_subvol == cached_subvol)) {
+
+ found_non_linkto_on_hashed = _gf_true;
+ }
+
+ if (found_non_linkto_on_hashed)
+ goto preset_layout;
+
+ }
+
+
+ if (hashed_subvol) {
+ if (local->skip_unlink.handle_valid_link == _gf_true) {
+ if (cached_subvol == local->skip_unlink.hash_links_to) {
+
+ if (uuid_compare (local->skip_unlink.cached_gfid,
+ local->skip_unlink.hashed_gfid)){
+
+ /*GFID different, return error*/
+ DHT_STACK_UNWIND (lookup, frame, -1,
+ ESTALE, NULL, NULL, NULL,
+ NULL);
+
+
+ }
+
+ ret = dht_layout_preset (this, cached_subvol,
+ local->loc.inode);
+ if (ret) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Could not set pre-set layout "
+ "for subvolume %s",
+ cached_subvol->name);
+ }
+
+ local->op_ret = (ret == 0) ? ret : -1;
+ local->op_errno = (ret == 0) ? ret : EINVAL;
+
+ /* Presence of local->cached_subvol validates
+ * that lookup from cached node is successful
+ */
+
+ if (!local->op_ret && local->loc.parent) {
+ dht_inode_ctx_time_update
+ (local->loc.parent, this,
+ &local->postparent, 1);
+ }
+ goto unwind_hashed_and_cached;
+ } else {
+
+ local->skip_unlink.handle_valid_link = _gf_false;
+ if (local->skip_unlink.opend_fd_count == 0) {
+ local->call_cnt = 1;
+ STACK_WIND (frame, dht_lookup_unlink_cbk,
+ hashed_subvol,
+ hashed_subvol->fops->unlink,
+ &local->loc, 0, NULL);
+ return 0;
+
+ }
+ }
+
}
+ }
+
+
+preset_layout:
+
+ if (found_non_linkto_on_hashed) {
+
+ if (local->need_lookup_everywhere) {
+ if (uuid_compare (local->gfid, local->inode->gfid)) {
+ /* GFID different, return error */
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT,
+ NULL, NULL, NULL, NULL);
+ return 0;
+ }
+ }
+
local->op_ret = 0;
local->op_errno = 0;
layout = dht_layout_for_subvol (this, cached_subvol);
@@ -902,26 +1125,15 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
cached_subvol, hashed_subvol, &local->loc);
return ret;
-}
-
-
-int
-dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
-{
- int this_call_cnt = 0;
-
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- dht_lookup_everywhere_done (frame, this);
- }
+unwind_hashed_and_cached:
+ DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
+ DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
+ local->loc.inode, &local->stbuf, local->xattr,
+ &local->postparent);
return 0;
}
-
int
dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
@@ -1008,6 +1220,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_iatt_merge (this, &local->postparent,
postparent, subvol);
+
+ uuid_copy (local->skip_unlink.cached_gfid,
+ buf->ia_gfid);
} else {
/* This is where we need 'rename' both entries logic */
gf_log (this->name, GF_LOG_WARNING,
@@ -1024,9 +1239,27 @@ unlock:
if (is_linkfile) {
ret = dict_get_int32 (xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count);
- /* Delete the linkfile only if there are no open fds on it.
- if there is a open-fd, it may be in migration */
- if (!ret && (fd_count == 0)) {
+
+ /* Any linkto file found on the non-hashed subvolume should
+ * be unlinked (performed in the "else if" block below)
+ *
+ * But if a linkto file is found on hashed subvolume, it may be
+ * pointing to vaild cached node. So unlinking of linkto
+ * file on hashed subvolume is skipped and inside
+ * dht_lookup_everywhere_done, checks are performed. If this
+ * linkto file is found as stale linkto file, it is deleted
+ * otherwise unlink is skipped.
+ */
+
+ if (local->hashed_subvol && local->hashed_subvol == subvol) {
+
+ local->skip_unlink.handle_valid_link = _gf_true;
+ local->skip_unlink.opend_fd_count = fd_count;
+ local->skip_unlink.hash_links_to = link_subvol;
+ uuid_copy (local->skip_unlink.hashed_gfid,
+ buf->ia_gfid);
+
+ } else if (!ret && (fd_count == 0)) {
gf_log (this->name, GF_LOG_INFO,
"deleting stale linkfile %s on %s",
loc->path, subvol->name);
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index dc23bfa7f0c..bc7ee19e241 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -99,6 +99,17 @@ struct dht_rebalance_ {
dict_t *xdata;
};
+
+struct dht_skip_linkto_unlink {
+
+ gf_boolean_t handle_valid_link;
+ int opend_fd_count;
+ xlator_t *hash_links_to;
+ uuid_t cached_gfid;
+ uuid_t hashed_gfid;
+};
+
+
struct dht_local {
int call_cnt;
loc_t loc;
@@ -187,6 +198,9 @@ struct dht_local {
xlator_t *first_up_subvol;
gf_boolean_t added_link;
+
+ struct dht_skip_linkto_unlink skip_unlink;
+
};
typedef struct dht_local dht_local_t;
@@ -787,4 +801,7 @@ dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol);
int
dht_subvol_status (dht_conf_t *conf, xlator_t *subvol);
+int
+dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this);
+
#endif/* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index f8eeb1fd25a..760e9b364a8 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -250,6 +250,7 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
int ret = -1;
fd_t *fd = NULL;
struct iatt new_stbuf = {0,};
+ struct iatt check_stbuf= {0,};
dht_conf_t *conf = NULL;
this = THIS;
@@ -308,6 +309,42 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
goto out;
}
+
+ /*Reason of doing lookup after create again:
+ *In the create, there is some time-gap between opening fd at the
+ *server (posix_layer) and binding it in server (incrementing fd count),
+ *so if in that time-gap, if other process sends unlink considering it
+ *as a linkto file, because inode->fd count will be 0, so file will be
+ *unlinked at the backend. And because furthur operations are performed
+ *on fd, so though migration will be done but will end with no file
+ *at the backend.
+ */
+
+
+ ret = syncop_lookup (to, loc, NULL, &check_stbuf, NULL, NULL);
+ if (!ret) {
+
+ if (uuid_compare (stbuf->ia_gfid, check_stbuf.ia_gfid) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "file %s exists in %s with different gfid,"
+ "found in lookup after create",
+ loc->path, to->name);
+ ret = -1;
+ fd_unref (fd);
+ goto out;
+ }
+
+ }
+
+ if (-ret == ENOENT) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: file does not exist"
+ "on %s (%s)", loc->path, to->name, strerror (-ret));
+ ret = -1;
+ fd_unref (fd);
+ goto out;
+ }
+
ret = syncop_fsetxattr (to, fd, xattr, 0);
if (ret == -1)
gf_log (this->name, GF_LOG_WARNING,
@@ -689,6 +726,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
dict_t *xattr_rsp = NULL;
int file_has_holes = 0;
dht_conf_t *conf = this->private;
+ int rcvd_enoent_from_src = 0;
gf_log (this->name, GF_LOG_INFO, "%s: attempting to move from %s to %s",
loc->path, from->name, to->name);
@@ -867,15 +905,30 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
}
/* Do a stat and check the gfid before unlink */
+
+ /*
+ * Cached file changes its state from non-linkto to linkto file after
+ * migrating data. If lookup from any other mount-point is performed,
+ * converted-linkto-cached file will be treated as a stale and will be
+ * unlinked. But by this time, file is already migrated. So further
+ * failure because of ENOENT should not be treated as error
+ */
+
ret = syncop_stat (from, loc, &empty_iatt);
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to do a stat on %s (%s)",
loc->path, from->name, strerror (errno));
- goto out;
+
+ if (errno != ENOENT) {
+ goto out;
+ }
+
+ rcvd_enoent_from_src = 1;
}
- if (uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0) {
+ if ((uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0 ) &&
+ (!rcvd_enoent_from_src)) {
/* take out the source from namespace */
ret = syncop_unlink (from, loc);
if (ret) {
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index da96e6d56cf..cedf71ef4de 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -1367,17 +1367,22 @@ int32_t
posix_unlink (call_frame_t *frame, xlator_t *this,
loc_t *loc, int xflag, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char *real_path = NULL;
- char *par_path = NULL;
- int32_t fd = -1;
- struct iatt stbuf = {0,};
- struct posix_private *priv = NULL;
- struct iatt preparent = {0,};
- struct iatt postparent = {0,};
- char *pgfid_xattr_key = NULL;
- int32_t nlink_samepgfid = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_path = NULL;
+ char *par_path = NULL;
+ int32_t fd = -1;
+ struct iatt stbuf = {0,};
+ struct posix_private *priv = NULL;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
+ char *pgfid_xattr_key = NULL;
+ int32_t nlink_samepgfid = 0;
+ int32_t unlink_if_linkto = 0;
+ int32_t check_open_fd = 0;
+ int32_t skip_unlink = 0;
+ ssize_t xattr_size = -1;
+ int32_t is_dht_linkto_file = 0;
DECLARE_OLD_FS_ID_VAR;
@@ -1401,6 +1406,55 @@ posix_unlink (call_frame_t *frame, xlator_t *this,
posix_handle_unset (this, stbuf.ia_gfid, NULL);
priv = this->private;
+
+ op_ret = dict_get_int32 (xdata, "dont-unlink-for-open-fd",
+ &check_open_fd);
+
+ if (!op_ret && check_open_fd) {
+
+ LOCK (&loc->inode->lock);
+
+ if (loc->inode->fd_count) {
+ skip_unlink = 1;
+ }
+
+ UNLOCK (&loc->inode->lock);
+
+ if (skip_unlink) {
+ op_ret = -1;
+ op_errno = EBUSY;
+ goto out;
+ }
+ }
+
+
+ op_ret = dict_get_int32 (xdata, "unlink-only-if-dht-linkto-file",
+ &unlink_if_linkto);
+
+ if (!op_ret && unlink_if_linkto) {
+
+ LOCK (&loc->inode->lock);
+
+ xattr_size = sys_lgetxattr (real_path, LINKTO, NULL, 0);
+
+ if (xattr_size <= 0) {
+ skip_unlink = 1;
+ } else {
+ is_dht_linkto_file = IS_DHT_LINKFILE_MODE (&stbuf);
+ if (!is_dht_linkto_file)
+ skip_unlink = 1;
+ }
+
+ UNLOCK (&loc->inode->lock);
+
+ if (skip_unlink) {
+ op_ret = -1;
+ op_errno = EBUSY;
+ goto out;
+ }
+ }
+
+
if (priv->background_unlink) {
if (IA_ISREG (loc->inode->ia_type)) {
fd = open (real_path, O_RDONLY);
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
index f2e947c7670..f899a932360 100644
--- a/xlators/storage/posix/src/posix.h
+++ b/xlators/storage/posix/src/posix.h
@@ -53,6 +53,8 @@
#define VECTOR_SIZE 64 * 1024 /* vector size 64KB*/
#define MAX_NO_VECT 1024
+#define LINKTO "trusted.glusterfs.dht.linkto"
+
#define POSIX_GFID_HANDLE_SIZE(base_path_len) (base_path_len + SLEN("/") \
+ SLEN(HANDLE_PFX) + SLEN("/") \
+ SLEN("00/") \