summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorXavi Hernandez <xhernandez@redhat.com>2020-03-01 19:49:04 +0100
committerRinku Kothiya <rkothiya@redhat.com>2020-03-16 08:18:51 +0000
commit87f1163e83339b2c2923106e0d9a45e8a327bb7a (patch)
tree7c99e035c7e49ff20dab46fe8c231ad3fc3057e9
parentf3b85b01e4b61a52598a6789c61997c5dad65bf0 (diff)
cluster/afr: fix race when bricks come up
The was a problem when self-heal was sending lookups at the same time that one of the bricks was coming up. In this case there was a chance that the number of 'up' bricks changes in the middle of sending the requests to subvolumes which caused a discrepancy in the expected number of replies and the actual number of sent requests. This discrepancy caused that AFR continued executing requests before all requests were complete. Eventually, the frame of the pending request was destroyed when the operation terminated, causing a use- after-free issue when the answer was finally received. In theory the same thing could happen in the reverse way, i.e. AFR tries to wait for more replies than sent requests, causing a hang. Backport of: > Change-Id: I7ed6108554ca379d532efb1a29b2de8085410b70 > Signed-off-by: Xavi Hernandez <xhernandez@redhat.com> > Fixes: bz#1808875 Change-Id: I7ed6108554ca379d532efb1a29b2de8085410b70 Signed-off-by: Xavi Hernandez <xhernandez@redhat.com> Fixes: bz#1809438
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c4
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-name.c4
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h7
3 files changed, 9 insertions, 6 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 0341e4a0be0..2962289c35a 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1907,17 +1907,15 @@ int
afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
afr_local_t *local = NULL;
dict_t *dict = NULL;
- priv = frame->this->private;
local = frame->local;
if (local && local->xattr_req)
dict = local->xattr_req;
return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies,
- priv->child_up, dict);
+ local->child_up, dict);
}
unsigned int
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index 7d4f2080ec3..dace07131cb 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -560,13 +560,15 @@ afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t *this,
struct afr_reply *replies = NULL;
inode_t *inode = NULL;
int first_idx = -1;
+ afr_local_t *local = NULL;
priv = this->private;
+ local = frame->local;
replies = alloca0(sizeof(*replies) * priv->child_count);
inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies,
- priv->child_up, NULL);
+ local->child_up, NULL);
if (!inode)
return -ENOMEM;
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 4190a46bd17..8f6fb0035fd 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -46,13 +46,16 @@
afr_local_t *__local = frame->local; \
afr_private_t *__priv = frame->this->private; \
int __i = 0; \
- int __count = AFR_COUNT(list, __priv->child_count); \
+ int __count = 0; \
+ unsigned char *__list = alloca(__priv->child_count); \
\
+ memcpy(__list, list, sizeof(*__list) * __priv->child_count); \
+ __count = AFR_COUNT(__list, __priv->child_count); \
__local->barrier.waitfor = __count; \
afr_local_replies_wipe(__local, __priv); \
\
for (__i = 0; __i < __priv->child_count; __i++) { \
- if (!list[__i]) \
+ if (!__list[__i]) \
continue; \
STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \
__priv->children[__i], \