summaryrefslogtreecommitdiffstats
path: root/xlators/cluster
diff options
context:
space:
mode:
authorN Balachandran <nbalacha@redhat.com>2018-06-22 10:42:42 +0530
committerRaghavendra G <rgowdapp@redhat.com>2018-06-29 11:10:47 +0000
commitb3c2116d99a5c049e4ee0f88f35440258b49496e (patch)
tree20215379c379842c07746bda8c3bea3c54024c83 /xlators/cluster
parentc0e66dddcd8964871e0d574f927684ee7e3c4904 (diff)
cluster/dht: Do not try to use up the readdirp buffer
DHT attempts to use up the entire buffer in readdirp before unwinding in an attempt to reduce the number of calls. However, this has 2 disadvantages: 1. This can cause a stack overflow when parallel readdir is enabled. If the buffer only has a little space,rda can send back only one or two entries. If those entries are stripped out by dht_readdirp_cbk (linkto files for example) it will once again wind down to rda in an attempt to fill the buffer before unwinding to FUSE. This process can continue for several iterations, causing the stack to grow and eventually overflow, causing the process to crash. 2. If parallel readdir is disabled, dht could send readdirp calls with small buffers to the bricks, thus increasing the number of network calls. We are therefore reverting to the existing behaviour. Please note, this only mitigates the stack overflow, it does not prevent it from happening. This is still possible if a subvol has thousands of linkto files for instance. Change-Id: I291bc181c5249762d0c4fe27fa4fc2631166adf5 fixes: bz#1593548 Signed-off-by: N Balachandran <nbalacha@redhat.com>
Diffstat (limited to 'xlators/cluster')
-rw-r--r--xlators/cluster/dht/src/dht-common.c112
-rw-r--r--xlators/cluster/dht/src/dht-common.h4
-rw-r--r--xlators/cluster/dht/src/dht-helper.c3
3 files changed, 66 insertions, 53 deletions
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 9415af31bd7..07f00b4a92d 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -6886,11 +6886,15 @@ out:
return;
}
+
+/* Posix returns op_errno = ENOENT to indicate that there are no more entries
+ */
int
dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
{
dht_local_t *local = NULL;
+ gf_dirent_t entries;
gf_dirent_t *orig_entry = NULL;
gf_dirent_t *entry = NULL;
xlator_t *prev = NULL;
@@ -6907,6 +6911,8 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
inode_table_t *itable = NULL;
inode_t *inode = NULL;
+ INIT_LIST_HEAD (&entries.list);
+
prev = cookie;
local = frame->local;
itable = local->fd ? local->fd->inode->table : NULL;
@@ -6916,14 +6922,9 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
methods = &(conf->methods);
- local->op_errno = op_errno;
-
- if (op_ret < 0)
+ if (op_ret <= 0)
goto done;
- if (local->op_ret < 0)
- local->op_ret = 0;
-
if (!local->layout)
local->layout = dht_layout_get (this, local->fd->inode);
@@ -7085,21 +7086,40 @@ list:
gf_msg_debug (this->name, 0, "%s: Adding entry = %s",
prev->name, entry->d_name);
- list_add_tail (&entry->list, &local->entries.list);
- local->filled += gf_dirent_size (entry->d_name);
+ list_add_tail (&entry->list, &entries.list);
count++;
- local->op_ret++;
}
done:
- if ((op_ret == 0) && op_errno != ENOENT) {
- /* remaining buffer size is not enough to hold even one
- * dentry
- */
- goto unwind;
- }
- if ((count == 0) || (local && (local->filled < local->size))) {
+ /* We need to ensure that only the last subvolume's end-of-directory
+ * notification is respected so that directory reading does not stop
+ * before all subvolumes have been read. That could happen because the
+ * posix for each subvolume sends a ENOENT on end-of-directory but in
+ * distribute we're not concerned only with a posix's view of the
+ * directory but the aggregated namespace' view of the directory.
+ * Possible values:
+ * op_ret == 0 and op_errno != 0
+ * if op_errno != ENOENT : Error.Unwind.
+ * if op_errno == ENOENT : There are no more entries on this subvol.
+ * Move to the next one.
+ * op_ret > 0 and count == 0 :
+ * The subvol returned entries to dht but all were stripped out.
+ * For example, if they were linkto files or dirs where
+ * hashed_subvol != prev. Try to get some entries by winding
+ * to the next subvol. This can be dangerous if parallel readdir
+ * is enabled as it grows the stack.
+ *
+ * op_ret > 0 and count > 0:
+ * We found some entries. Unwind even if the buffer is not full.
+ *
+ */
+
+ op_ret = count;
+ if (count == 0) {
+ /* non-zero next_offset means that
+ * EOF is not yet hit on the current subvol
+ */
if ((next_offset == 0) || (op_errno == ENOENT)) {
next_offset = 0;
next_subvol = dht_subvol_next (this, prev);
@@ -7129,7 +7149,7 @@ done:
STACK_WIND_COOKIE (frame, dht_readdirp_cbk, next_subvol,
next_subvol, next_subvol->fops->readdirp,
- local->fd, (local->size - local->filled),
+ local->fd, local->size,
next_offset, local->xattr);
return 0;
}
@@ -7142,13 +7162,16 @@ unwind:
* distribute we're not concerned only with a posix's view of the
* directory but the aggregated namespace' view of the directory.
*/
- if ((local->op_ret >= 0) && (prev != dht_last_up_subvol (this)))
- local->op_errno = 0;
+ if (op_ret < 0)
+ op_ret = 0;
+ if (prev != dht_last_up_subvol (this))
+ op_errno = 0;
- DHT_STACK_UNWIND (readdirp, frame, local->op_ret, local->op_errno,
- &local->entries, NULL);
+ DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno,
+ &entries, NULL);
+ gf_dirent_free (&entries);
return 0;
}
@@ -7159,6 +7182,7 @@ dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dict_t *xdata)
{
dht_local_t *local = NULL;
+ gf_dirent_t entries;
gf_dirent_t *orig_entry = NULL;
gf_dirent_t *entry = NULL;
xlator_t *prev = NULL;
@@ -7170,6 +7194,8 @@ dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_conf_t *conf = NULL;
dht_methods_t *methods = NULL;
+ INIT_LIST_HEAD (&entries.list);
+
prev = cookie;
local = frame->local;
@@ -7178,14 +7204,8 @@ dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
methods = &(conf->methods);
- local->op_errno = op_errno;
-
- if (op_ret < 0) {
+ if (op_ret <= 0)
goto done;
- }
-
- if (local->op_ret < 0)
- local->op_ret = 0;
if (!local->layout)
local->layout = dht_layout_get (this, local->fd->inode);
@@ -7222,23 +7242,22 @@ dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_msg_debug (this->name, 0, "%s: Adding = entry %s",
prev->name, entry->d_name);
- list_add_tail (&entry->list, &local->entries.list);
+ list_add_tail (&entry->list, &entries.list);
count++;
- local->filled += gf_dirent_size (entry->d_name);
- local->op_ret++;
}
}
-
done:
- if ((op_ret == 0) && op_errno != ENOENT) {
- /* remaining buffer size is not enough to hold even one
- * dentry
- */
- goto unwind;
- }
-
- if ((count == 0) || (local && (local->filled < local->size))) {
- if ((op_ret <= 0) || (op_errno == ENOENT)) {
+ op_ret = count;
+ /* We need to ensure that only the last subvolume's end-of-directory
+ * notification is respected so that directory reading does not stop
+ * before all subvolumes have been read. That could happen because the
+ * posix for each subvolume sends a ENOENT on end-of-directory but in
+ * distribute we're not concerned only with a posix's view of the
+ * directory but the aggregated namespace' view of the directory.
+ */
+ if (count == 0) {
+ if ((next_offset == 0) || (op_errno == ENOENT)) {
+ next_offset = 0;
next_subvol = dht_subvol_next (this, prev);
} else {
next_subvol = prev;
@@ -7250,7 +7269,7 @@ done:
STACK_WIND_COOKIE (frame, dht_readdir_cbk, next_subvol,
next_subvol, next_subvol->fops->readdir,
- local->fd, (local->size - local->filled),
+ local->fd, local->size,
next_offset, NULL);
return 0;
}
@@ -7263,12 +7282,13 @@ unwind:
* distribute we're not concerned only with a posix's view of the
* directory but the aggregated namespace' view of the directory.
*/
- if ((local->op_ret >= 0) && (prev != dht_last_up_subvol (this)))
- local->op_errno = 0;
+ if (prev != dht_last_up_subvol (this))
+ op_errno = 0;
- DHT_STACK_UNWIND (readdir, frame, local->op_ret, local->op_errno,
- &local->entries, NULL);
+ DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno,
+ &entries, NULL);
+ gf_dirent_free (&entries);
return 0;
}
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 9a4734e1ee2..43bceb9fd2d 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -377,10 +377,6 @@ struct dht_local {
call_stub_t *stub;
int32_t parent_disk_layout[4];
- /* To hold dentries of readdir spawning across subvols */
- gf_dirent_t entries;
- size_t filled;
-
/* rename rollback */
int *ret_cache ;
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index c301b1fd47e..2f2ca1f635e 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -818,8 +818,6 @@ dht_local_wipe (xlator_t *this, dht_local_t *local)
if (local->ret_cache)
GF_FREE (local->ret_cache);
- gf_dirent_free (&local->entries);
-
mem_put (local);
}
@@ -859,7 +857,6 @@ dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop)
inode);
}
- INIT_LIST_HEAD (&local->entries.list);
frame->local = local;
out: