summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/dht/src/dht-rebalance.c
diff options
context:
space:
mode:
authorRaghavendra G <rgowdapp@redhat.com>2018-02-08 17:12:41 +0530
committerRaghavendra G <rgowdapp@redhat.com>2018-05-07 06:04:10 +0000
commit5e356e3f470779433bfe6b0b368676062842b367 (patch)
tree79f5bcf2ed1b3b0b377cdb36176eb185103dc8f4 /xlators/cluster/dht/src/dht-rebalance.c
parent01a7d27bbaf12618fde56ca05cf9c953445493f3 (diff)
cluster/dht: fixes to parallel renames to same destination codepath
Test case: # while true; do uuid="`uuidgen`"; echo "some data" > "test$uuid"; mv "test$uuid" "test" -f || break; echo "done:$uuid"; done This script was run in parallel from multiple mountpoints Along the course of getting the above usecase working, many issues were found: Issue 1: ======= consider a case of rename (src, dst). We can encounter a situation where, * dst is a file present at the time of lookup * dst is removed by the time rename fop reaches glusterfs In this scenario, acquring inodelk on dst fails with ESTALE resulting in failure of rename. However, as per POSIX irrespective of whether dst is present or not, rename should be successful. Acquiring entrylk provides synchronization even in races like this. Algorithm: 1. Take inodelks on src and dst (if dst is present) on respective cached subvols. These inodelks are done to preserve backward compatibility with older clients, so that synchronization is preserved when a volume is mounted by clients of different versions. Once relevant older versions (3.10, 3.12, 3.13) reach EOL, this code can be removed. 2. Ignore ENOENT/ESTALE errors of inodelk on dst. 3. protect namespace of src and dst. To protect namespace of a file, take inodelk on parent on hashed subvol, then take entrylk on the same subvol on parent with basename of file. inodelk on parent is done to guard against changes to parent layout so that hashed subvol won't change during rename. 4. <rest of rename continues> 5. unlock all locks Issue 2: ======== linkfile creation in lookup codepath can race with a rename. Imagine the following scenario: * lookup finds a data-file with gfid - gfid-dst - without a corresponding linkto file on hashed-subvol. It decides to create linkto file with gfid - gfid-dst. - Note that some codepaths of dht-rename deletes linkto file of dst as first step. So, a lookup racing with an in-progress rename can easily run into this situation. * a rename (src-path:gfid-src, dst-path:gfid-dst) renames data-file and hence gfid of data-file changes to gfid-src with path dst-path. * lookup proceeds and creates linkto file - dst-path - with gfid - dst-gfid - on hashed-subvol. * rename tries to create a linkto file dst-path with src-gfid on hashed-subvol, but it fails with EEXIST. But EEXIST is ignored during linkto file creation. Now we've ended with dst-path having different gfids - dst-gfid on linkto file and src-gfid on data file. Future lookups on dst-path will always fail with ESTALE, due to differing gfids. The fix is to synchronize linkfile creation in lookup path with rename using the same mechanism of protecting namespace explained in solution of Issue 1. Once locks are acquired, before proceeding with linkfile creation, we check whether conditions for linkto file creation are still valid. If not, we skip linkto file creation. Issue 3: ======== gfid of dst-path can change by the time locks are acquired. This means, either another rename overwrote dst-path or dst-path was deleted and recreated by a different client. When this happens, cached-subvol for dst can change. If rename proceeds with old-gfid and old-cached subvol, we'll end up in inconsistent state(s) like dst-path with different gfids on different subvols, more than one data-file being present etc. Fix is to do the lookup with a new inode after protecting namespace of dst. Post lookup, we've to compare gfids and correct local state appropriately to be in sync with backend. Issue 4: ======== During revalidate lookup, if following a linkto file doesn't lead to a valid data-file, local->cached-subvol was not reset to NULL. This means we would be operating on a stale state which can lead to inconsistency. As a fix, reset it to NULL before proceeding with lookup everywhere. Issue 5: ======== Stale dentries left out in inode table on brick resulted in failures of link fop even though the file/dentry didn't exist on backend fs. A patch is submitted to fix this issue. Please check the dependency tree of current patch on gerrit for details In short, we fix the problem by not blindly trusting the inode-table. Instead we validate whether dentry is present by doing lookup on backend fs. Change-Id: I832e5c47d232f90c4edb1fafc512bf19bebde165 updates: bz#1543279 BUG: 1543279 Signed-off-by: Raghavendra G <rgowdapp@redhat.com>
Diffstat (limited to 'xlators/cluster/dht/src/dht-rebalance.c')
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c63
1 files changed, 60 insertions, 3 deletions
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index 91fb94d529c..cc03eaa1195 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -1605,7 +1605,9 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
struct gf_flock flock = {0, };
struct gf_flock plock = {0, };
loc_t tmp_loc = {0, };
- gf_boolean_t locked = _gf_false;
+ loc_t parent_loc = {0, };
+ gf_boolean_t inodelk_locked = _gf_false;
+ gf_boolean_t entrylk_locked = _gf_false;
gf_boolean_t p_locked = _gf_false;
int lk_ret = -1;
gf_defrag_info_t *defrag = NULL;
@@ -1619,6 +1621,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
gf_boolean_t target_changed = _gf_false;
xlator_t *new_target = NULL;
xlator_t *old_target = NULL;
+ xlator_t *hashed_subvol = NULL;
fd_t *linkto_fd = NULL;
@@ -1686,6 +1689,28 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
" for file: %s", loc->path);
}
+ ret = dht_build_parent_loc (this, &parent_loc, loc, fop_errno);
+ if (ret < 0) {
+ ret = -1;
+ gf_msg (this->name, GF_LOG_WARNING, *fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to build parent loc, which is needed to "
+ "acquire entrylk to synchronize with renames on this "
+ "path. Skipping migration", loc->path);
+ goto out;
+ }
+
+ hashed_subvol = dht_subvol_get_hashed (this, loc);
+ if (hashed_subvol == NULL) {
+ ret = -1;
+ gf_msg (this->name, GF_LOG_WARNING, EINVAL,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: cannot find hashed subvol which is needed to "
+ "synchronize with renames on this path. "
+ "Skipping migration", loc->path);
+ goto out;
+ }
+
flock.l_type = F_WRLCK;
tmp_loc.inode = inode_ref (loc->inode);
@@ -1710,7 +1735,26 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
- locked = _gf_true;
+ inodelk_locked = _gf_true;
+
+ /* dht_rename has changed to use entrylk on hashed subvol for
+ * synchronization. So, rebalance too has to acquire an entrylk on
+ * hashed subvol.
+ */
+ ret = syncop_entrylk (hashed_subvol, DHT_ENTRY_SYNC_DOMAIN, &parent_loc,
+ loc->name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL,
+ NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ ret = -1;
+ gf_msg (this->name, GF_LOG_WARNING, *fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to acquire entrylk on subvol %s",
+ loc->path, hashed_subvol->name);
+ goto out;
+ }
+
+ entrylk_locked = _gf_true;
/* Phase 1 - Data migration is in progress from now on */
ret = syncop_lookup (from, loc, &stbuf, NULL, dict, &xattr_rsp);
@@ -2348,7 +2392,7 @@ out:
}
}
- if (locked) {
+ if (inodelk_locked) {
flock.l_type = F_UNLCK;
lk_ret = syncop_inodelk (from, DHT_FILE_MIGRATE_DOMAIN,
@@ -2361,6 +2405,18 @@ out:
}
}
+ if (entrylk_locked) {
+ lk_ret = syncop_entrylk (hashed_subvol, DHT_ENTRY_SYNC_DOMAIN,
+ &parent_loc, loc->name, ENTRYLK_UNLOCK,
+ ENTRYLK_UNLOCK, NULL, NULL);
+ if (lk_ret < 0) {
+ gf_msg (this->name, GF_LOG_WARNING, -lk_ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to unlock entrylk on %s",
+ loc->path, hashed_subvol->name);
+ }
+ }
+
if (p_locked) {
plock.l_type = F_UNLCK;
lk_ret = syncop_lk (from, src_fd, F_SETLK, &plock, NULL, NULL);
@@ -2400,6 +2456,7 @@ out:
syncop_close (linkto_fd);
loc_wipe (&tmp_loc);
+ loc_wipe (&parent_loc);
return ret;
}