summaryrefslogtreecommitdiffstats
path: root/xlators
diff options
context:
space:
mode:
authorVenkatesh Somyajulu <vsomyaju@redhat.com>2014-07-15 18:17:19 +0530
committerVijay Bellur <vbellur@redhat.com>2014-07-18 03:11:06 -0700
commit74d92e322e3c9f4f70ddfbf9b0e2140922009658 (patch)
treeb569c431d68676188a2df87711d4a975887b4ba9 /xlators
parent52da727e7564963a8a244fc5cb7028315e458529 (diff)
cluster/dht: Fix races to avoid deletion of linkto file
Explanation of Race between rebalance processes: https://bugzilla.redhat.com/show_bug.cgi?id=1110694#c4 STATE 1: BRICK-1 only one brick Cached File in the system STATE 2: Add brick-2 BRICK-1 BRICK-2 STATE 3: Lookup of File on brick-2 by this node's rebalance will fail because hashed file is not created yet. So dht_lookup_everywhere is about to get called. STATE 4: As part of lookup link file at brick-2 will be created. STATE 5: getxattr to check that cached file belongs to this node is done STATE 6: dht_lookup_everywhere_cbk detects the link created by rebalance-1. It will unlink it. STATE 7: getxattr at the link file with "pathinfo" key will be called will fail as the link file is deleted by rebalance on node-2 Fix: So in the STATE 6, we should avoid the deletion of link file. Every time dht_lookup_everywhere gets called, lookup will be performed on all the nodes. So to avoid STATE 6, if linkto file is found, it is not deleted until valid case is found in dht_lookup_everywhere_done. Case 1: if linkto file points to cached node, and cached file exists, uwind with success. Case 2: if linkto does not point to current cached node, and cached file exists: a) Unlink stale link file b) Create new link file Case 3: Only linkto file exists: Delete linkto file Case 4: Only cached file Create link file (Handled event without patch) Case 5: Neither cached nor hashed file is present Return with ENOENT (handled even without patch) Change-Id: Ibf53671410d8d613b8e2e7e5d0ec30fc7dcc0298 BUG: 1116150 Signed-off-by: Venkatesh Somyajulu <vsomyaju@redhat.com> Reviewed-on: http://review.gluster.org/8231 Reviewed-by: Vijay Bellur <vbellur@redhat.com> Tested-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'xlators')
-rw-r--r--xlators/cluster/dht/src/dht-common.c287
-rw-r--r--xlators/cluster/dht/src/dht-common.h17
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c60
-rw-r--r--xlators/storage/posix/src/posix.c76
-rw-r--r--xlators/storage/posix/src/posix.h2
5 files changed, 401 insertions, 41 deletions
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 421f4917d86..c60fa56ffcf 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -850,6 +850,80 @@ out:
return ret;
}
+int
+dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ int this_call_cnt = 0;
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
+ dht_lookup_everywhere_done (frame, this);
+ }
+
+ return 0;
+}
+
+int
+dht_lookup_unlink_stale_linkto_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+
+ /* NOTE:
+ * If stale file unlink fails either there is an open-fd or is not an
+ * dht-linkto-file then posix_unlink returns EBUSY, which is overwritten
+ * to ENOENT
+ */
+
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL,
+ NULL);
+
+ return 0;
+}
+
+/* Rebalance is performed from cached_node to hashed_node. Initial cached_node
+ * contains a non-linkto file. After migration it is converted to linkto and
+ * then unlinked. And at hashed_subvolume, first a linkto file is present,
+ * then after migration it is converted to a non-linkto file.
+ *
+ * Lets assume a file is present on cached subvolume and a new brick is added
+ * and new brick is the new_hashed subvolume. So fresh lookup on newly added
+ * hashed subvolume will fail and dht_lookup_everywhere gets called. If just
+ * before sending the dht_lookup_everywhere request rebalance is in progress,
+ *
+ * from cached subvolume it may see: Nonlinkto or linkto or No file
+ * from hashed subvolume it may see: No file or linkto file or non-linkto file
+ *
+ * So this boils down to 9 cases:
+ * at cached_subvol at hashed_subvol
+ * ---------------- -----------------
+ *
+ *a) No file No file
+ * [request reached after [Request reached before
+ * migration] Migration]
+ *
+ *b) No file Linkto File
+ *
+ *c) No file Non-Linkto File
+ *
+ *d) Linkto No-File
+ *
+ *e) Linkto Linkto
+ *
+ *f) Linkto Non-Linkto
+ *
+ *g) NonLinkto No-File
+ *
+ *h) NonLinkto Linkto
+ *
+ *i) NonLinkto NonLinkto
+ *
+ * dht_lookup_everywhere_done takes decision based on any of the above case
+ */
int
dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
@@ -860,6 +934,7 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
xlator_t *cached_subvol = NULL;
dht_layout_t *layout = NULL;
char gfid[GF_UUID_BUF_SIZE] = {0};
+ gf_boolean_t found_non_linkto_on_hashed = _gf_false;
local = frame->local;
hashed_subvol = local->hashed_subvol;
@@ -884,19 +959,167 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
return 0;
}
+
if (!cached_subvol) {
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL,
- NULL);
+
+ if (local->skip_unlink.handle_valid_link && hashed_subvol) {
+
+ /*Purpose of "unlink-only-if-dht-linkto-file":
+ * If this lookup is performed by rebalance and this
+ * rebalance process detected hashed file and by
+ * the time it sends the lookup request to cached node,
+ * file got migrated and now at intial hashed_node,
+ * final migrated file is present. With current logic,
+ * because this process fails to find the cached_node,
+ * it will unlink the file at initial hashed_node.
+ *
+ * So we avoid this by setting key, and checking at the
+ * posix_unlink that unlink the file only if file is a
+ * linkto file and not a migrated_file.
+ */
+
+ ret = dict_set_int32 (local->xattr_req,
+ "unlink-only-if-dht-linkto-file",
+ 1);
+
+ if (ret)
+ goto dict_err;
+
+ /*Later other consumers can also use this key to avoid
+ * unlinking in case of open_fd
+ */
+
+ ret = dict_set_int32 (local->xattr_req,
+ "dont-unlink-for-open-fd", 1);
+
+dict_err:
+ if (ret) {
+ /* If for some reason, setting key in the dict
+ * fails, return with ENOENT, as with respect to
+ * this process, it detected only a stale link
+ * file.
+ *
+ * Next lookup will delete it.
+ *
+ * Performing deletion of stale link file when
+ * setting key in dict fails, may cause the data
+ * loss becase of the above mentioned race.
+ */
+
+
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT,
+ NULL, NULL, NULL, NULL);
+ } else {
+ local->skip_unlink.handle_valid_link = _gf_false;
+ STACK_WIND (frame,
+ dht_lookup_unlink_stale_linkto_cbk,
+ hashed_subvol,
+ hashed_subvol->fops->unlink,
+ &local->loc, 0, local->xattr_req);
+ }
+
+ } else {
+
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL,
+ NULL, NULL);
+ }
return 0;
}
- if (local->need_lookup_everywhere) {
- if (uuid_compare (local->gfid, local->inode->gfid)) {
- /* GFID different, return error */
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL,
- NULL, NULL, NULL);
- return 0;
+ /* At the time of dht_lookup, no file was found on hashed and that is
+ * why dht_lookup_everywhere is called, but by the time
+ * dht_lookup_everywhere
+ * reached to server, file might have already migrated. In that case we
+ * will find a migrated file at the hashed_node. In this case store the
+ * layout in context and return successfully.
+ */
+
+ if (hashed_subvol || local->need_lookup_everywhere) {
+
+ if (local->need_lookup_everywhere) {
+
+ found_non_linkto_on_hashed = _gf_true;
+
+ } else if ((local->file_count == 1) &&
+ (hashed_subvol == cached_subvol)) {
+
+ found_non_linkto_on_hashed = _gf_true;
+ }
+
+ if (found_non_linkto_on_hashed)
+ goto preset_layout;
+
+ }
+
+
+ if (hashed_subvol) {
+ if (local->skip_unlink.handle_valid_link == _gf_true) {
+ if (cached_subvol == local->skip_unlink.hash_links_to) {
+
+ if (uuid_compare (local->skip_unlink.cached_gfid,
+ local->skip_unlink.hashed_gfid)){
+
+ /*GFID different, return error*/
+ DHT_STACK_UNWIND (lookup, frame, -1,
+ ESTALE, NULL, NULL, NULL,
+ NULL);
+
+
+ }
+
+ ret = dht_layout_preset (this, cached_subvol,
+ local->loc.inode);
+ if (ret) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Could not set pre-set layout "
+ "for subvolume %s",
+ cached_subvol->name);
+ }
+
+ local->op_ret = (ret == 0) ? ret : -1;
+ local->op_errno = (ret == 0) ? ret : EINVAL;
+
+ /* Presence of local->cached_subvol validates
+ * that lookup from cached node is successful
+ */
+
+ if (!local->op_ret && local->loc.parent) {
+ dht_inode_ctx_time_update
+ (local->loc.parent, this,
+ &local->postparent, 1);
+ }
+ goto unwind_hashed_and_cached;
+ } else {
+
+ local->skip_unlink.handle_valid_link = _gf_false;
+ if (local->skip_unlink.opend_fd_count == 0) {
+ local->call_cnt = 1;
+ STACK_WIND (frame, dht_lookup_unlink_cbk,
+ hashed_subvol,
+ hashed_subvol->fops->unlink,
+ &local->loc, 0, NULL);
+ return 0;
+
+ }
+ }
+
}
+ }
+
+
+preset_layout:
+
+ if (found_non_linkto_on_hashed) {
+
+ if (local->need_lookup_everywhere) {
+ if (uuid_compare (local->gfid, local->inode->gfid)) {
+ /* GFID different, return error */
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT,
+ NULL, NULL, NULL, NULL);
+ return 0;
+ }
+ }
+
local->op_ret = 0;
local->op_errno = 0;
layout = dht_layout_for_subvol (this, cached_subvol);
@@ -977,26 +1200,15 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
cached_subvol, hashed_subvol, &local->loc);
return ret;
-}
-
-
-int
-dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
-{
- int this_call_cnt = 0;
-
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- dht_lookup_everywhere_done (frame, this);
- }
+unwind_hashed_and_cached:
+ DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
+ DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
+ local->loc.inode, &local->stbuf, local->xattr,
+ &local->postparent);
return 0;
}
-
int
dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
@@ -1089,6 +1301,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_iatt_merge (this, &local->postparent,
postparent, subvol);
+
+ uuid_copy (local->skip_unlink.cached_gfid,
+ buf->ia_gfid);
} else {
/* This is where we need 'rename' both entries logic */
gf_msg (this->name, GF_LOG_WARNING, 0,
@@ -1106,9 +1321,27 @@ unlock:
if (is_linkfile) {
ret = dict_get_int32 (xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count);
- /* Delete the linkfile only if there are no open fds on it.
- if there is a open-fd, it may be in migration */
- if (!ret && (fd_count == 0)) {
+
+ /* Any linkto file found on the non-hashed subvolume should
+ * be unlinked (performed in the "else if" block below)
+ *
+ * But if a linkto file is found on hashed subvolume, it may be
+ * pointing to vaild cached node. So unlinking of linkto
+ * file on hashed subvolume is skipped and inside
+ * dht_lookup_everywhere_done, checks are performed. If this
+ * linkto file is found as stale linkto file, it is deleted
+ * otherwise unlink is skipped.
+ */
+
+ if (local->hashed_subvol && local->hashed_subvol == subvol) {
+
+ local->skip_unlink.handle_valid_link = _gf_true;
+ local->skip_unlink.opend_fd_count = fd_count;
+ local->skip_unlink.hash_links_to = link_subvol;
+ uuid_copy (local->skip_unlink.hashed_gfid,
+ buf->ia_gfid);
+
+ } else if (!ret && (fd_count == 0)) {
gf_log (this->name, GF_LOG_INFO,
"deleting stale linkfile %s on %s",
loc->path, subvol->name);
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index b4c804d97b4..1a71ef41a08 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -111,6 +111,16 @@ typedef enum {
qdstatfs_action_COMPARE,
} qdstatfs_action_t;
+struct dht_skip_linkto_unlink {
+
+ gf_boolean_t handle_valid_link;
+ int opend_fd_count;
+ xlator_t *hash_links_to;
+ uuid_t cached_gfid;
+ uuid_t hashed_gfid;
+};
+
+
struct dht_local {
int call_cnt;
loc_t loc;
@@ -199,7 +209,11 @@ struct dht_local {
xlator_t *first_up_subvol;
gf_boolean_t quota_deem_statfs;
+
gf_boolean_t added_link;
+
+ struct dht_skip_linkto_unlink skip_unlink;
+
};
typedef struct dht_local dht_local_t;
@@ -815,4 +829,7 @@ dht_subvol_status (dht_conf_t *conf, xlator_t *subvol);
void
dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,
dht_layout_t *layout);
+int
+dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this);
+
#endif/* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index eca1d52b62b..96bc02075cb 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -344,6 +344,7 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
int ret = -1;
fd_t *fd = NULL;
struct iatt new_stbuf = {0,};
+ struct iatt check_stbuf= {0,};
dht_conf_t *conf = NULL;
this = THIS;
@@ -412,6 +413,43 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
goto out;
}
+
+ /*Reason of doing lookup after create again:
+ *In the create, there is some time-gap between opening fd at the
+ *server (posix_layer) and binding it in server (incrementing fd count),
+ *so if in that time-gap, if other process sends unlink considering it
+ *as a linkto file, because inode->fd count will be 0, so file will be
+ *unlinked at the backend. And because furthur operations are performed
+ *on fd, so though migration will be done but will end with no file
+ *at the backend.
+ */
+
+
+ ret = syncop_lookup (to, loc, NULL, &check_stbuf, NULL, NULL);
+ if (!ret) {
+
+ if (uuid_compare (stbuf->ia_gfid, check_stbuf.ia_gfid) != 0) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_GFID_MISMATCH,
+ "file %s exists in %s with different gfid,"
+ "found in lookup after create",
+ loc->path, to->name);
+ ret = -1;
+ fd_unref (fd);
+ goto out;
+ }
+
+ }
+
+ if (-ret == ENOENT) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_MIGRATE_FILE_FAILED, "%s: file does not exists"
+ "on %s (%s)", loc->path, to->name, strerror (-ret));
+ ret = -1;
+ fd_unref (fd);
+ goto out;
+ }
+
ret = syncop_fsetxattr (to, fd, xattr, 0);
if (ret < 0)
gf_msg (this->name, GF_LOG_WARNING, 0,
@@ -831,6 +869,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
dict_t *xattr_rsp = NULL;
int file_has_holes = 0;
dht_conf_t *conf = this->private;
+ int rcvd_enoent_from_src = 0;
gf_log (this->name, GF_LOG_INFO, "%s: attempting to move from %s to %s",
loc->path, from->name, to->name);
@@ -1043,17 +1082,32 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
}
/* Do a stat and check the gfid before unlink */
+
+ /*
+ * Cached file changes its state from non-linkto to linkto file after
+ * migrating data. If lookup from any other mount-point is performed,
+ * converted-linkto-cached file will be treated as a stale and will be
+ * unlinked. But by this time, file is already migrated. So further
+ * failure because of ENOENT should not be treated as error
+ */
+
ret = syncop_stat (from, loc, &empty_iatt);
if (ret) {
gf_msg (this->name, GF_LOG_WARNING, 0,
DHT_MSG_MIGRATE_FILE_FAILED,
"%s: failed to do a stat on %s (%s)",
loc->path, from->name, strerror (-ret));
- ret = -1;
- goto out;
+
+ if (-ret != ENOENT) {
+ ret = -1;
+ goto out;
+ }
+
+ rcvd_enoent_from_src = 1;
}
- if (uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0) {
+ if ((uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0 ) &&
+ (!rcvd_enoent_from_src)) {
/* take out the source from namespace */
ret = syncop_unlink (from, loc);
if (ret) {
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index eb0bac66d06..63b65edcda5 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -1343,17 +1343,22 @@ int32_t
posix_unlink (call_frame_t *frame, xlator_t *this,
loc_t *loc, int xflag, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char *real_path = NULL;
- char *par_path = NULL;
- int32_t fd = -1;
- struct iatt stbuf = {0,};
- struct posix_private *priv = NULL;
- struct iatt preparent = {0,};
- struct iatt postparent = {0,};
- char *pgfid_xattr_key = NULL;
- int32_t nlink_samepgfid = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_path = NULL;
+ char *par_path = NULL;
+ int32_t fd = -1;
+ struct iatt stbuf = {0,};
+ struct posix_private *priv = NULL;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
+ char *pgfid_xattr_key = NULL;
+ int32_t nlink_samepgfid = 0;
+ int32_t unlink_if_linkto = 0;
+ int32_t check_open_fd = 0;
+ int32_t skip_unlink = 0;
+ ssize_t xattr_size = -1;
+ int32_t is_dht_linkto_file = 0;
DECLARE_OLD_FS_ID_VAR;
@@ -1377,6 +1382,55 @@ posix_unlink (call_frame_t *frame, xlator_t *this,
posix_handle_unset (this, stbuf.ia_gfid, NULL);
priv = this->private;
+
+ op_ret = dict_get_int32 (xdata, "dont-unlink-for-open-fd",
+ &check_open_fd);
+
+ if (!op_ret && check_open_fd) {
+
+ LOCK (&loc->inode->lock);
+
+ if (loc->inode->fd_count) {
+ skip_unlink = 1;
+ }
+
+ UNLOCK (&loc->inode->lock);
+
+ if (skip_unlink) {
+ op_ret = -1;
+ op_errno = EBUSY;
+ goto out;
+ }
+ }
+
+
+ op_ret = dict_get_int32 (xdata, "unlink-only-if-dht-linkto-file",
+ &unlink_if_linkto);
+
+ if (!op_ret && unlink_if_linkto) {
+
+ LOCK (&loc->inode->lock);
+
+ xattr_size = sys_lgetxattr (real_path, LINKTO, NULL, 0);
+
+ if (xattr_size <= 0) {
+ skip_unlink = 1;
+ } else {
+ is_dht_linkto_file = IS_DHT_LINKFILE_MODE (&stbuf);
+ if (!is_dht_linkto_file)
+ skip_unlink = 1;
+ }
+
+ UNLOCK (&loc->inode->lock);
+
+ if (skip_unlink) {
+ op_ret = -1;
+ op_errno = EBUSY;
+ goto out;
+ }
+ }
+
+
if (priv->background_unlink) {
if (IA_ISREG (loc->inode->ia_type)) {
fd = open (real_path, O_RDONLY);
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
index c9bfc984da5..aa4a66c3051 100644
--- a/xlators/storage/posix/src/posix.h
+++ b/xlators/storage/posix/src/posix.h
@@ -53,6 +53,8 @@
#define VECTOR_SIZE 64 * 1024 /* vector size 64KB*/
#define MAX_NO_VECT 1024
+#define LINKTO "trusted.glusterfs.dht.linkto"
+
#define POSIX_GFID_HANDLE_SIZE(base_path_len) (base_path_len + SLEN("/") \
+ SLEN(GF_HIDDEN_PATH) + SLEN("/") \
+ SLEN("00/") \