From 370f05546eeedab394ca0d333a6ca6637757f1e3 Mon Sep 17 00:00:00 2001 From: Raghavendra G Date: Tue, 21 Aug 2018 09:44:15 +0530 Subject: performance/write-behind: fix fulfill and readdirp race Current invalidation of stats in wb_readdirp_cbk is prone to races. As the deleted comment explains, We cannot guarantee integrity of entry->d_stat as there are cached writes. The stat is most likely stale as it doesn't account the cached writes. However, checking for non-empty liability list here is not a fool-proof solution as there can be races like, 1. readdirp is successful on posix 2. sync of cached write is successful on posix 3. write-behind received sync response and removed the request from liability queue 4. readdirp response is processed at write-behind. In the above scenario, stat for the file is sent back in readdirp response but it is stale. The fix is to mark readdirp sessions (tracked in this patch by non-zero value of "readdirps" on parent inode) and if fulfill completes when one or more readdirp sessions are in progress, mark the inode so that wb_readdirp_cbk doesn't send iatts for that in inode in readdirp response. Note that wb_readdirp_cbk already checks for presence of a non-empty liability queue and invalidates iatt. Since the only way a liability queue can shrink is by fulfilling requests in liability queue, wb_fulfill_cbk indicates wb_readdirp_cbk that a potential race could've happened b/w readdirp and fulfill. Change-Id: I12d167bf450648baa64be1cbe1ca0fddf5379521 Signed-off-by: Raghavendra G updates: bz#1512691 --- .../performance/write-behind/src/write-behind.c | 169 +++++++++++++++++---- 1 file changed, 136 insertions(+), 33 deletions(-) diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c index b2a05881fcc..9fc8391632b 100644 --- a/xlators/performance/write-behind/src/write-behind.c +++ b/xlators/performance/write-behind/src/write-behind.c @@ -84,6 +84,13 @@ typedef struct wb_inode { writes are in progress at the same time. Modules like eager-lock in AFR depend on this behavior. */ + list_head_t invalidate_list; /* list of wb_inodes that were marked for + * iatt invalidation due to requests in + * liability queue fulfilled while there + * was a readdirp session on parent + * directory. For a directory inode, this + * list points to list of children. + */ uint64_t gen; /* Liability generation number. Represents the current 'state' of liability. Every new addition to the liability list bumps @@ -107,6 +114,7 @@ typedef struct wb_inode { size_t size; /* Size of the file to catch write after EOF. */ gf_lock_t lock; xlator_t *this; + inode_t *inode; int dontsync; /* If positive, don't pick lies for * winding. This is needed to break infinite * recursion during invocation of @@ -114,6 +122,8 @@ typedef struct wb_inode { * wb_fulfill_cbk in case of an * error during fulfill. */ + gf_atomic_int32_t readdirps; + gf_atomic_int8_t invalidate; } wb_inode_t; @@ -186,9 +196,6 @@ typedef struct wb_conf { } wb_conf_t; -void -wb_process_queue (wb_inode_t *wb_inode); - wb_inode_t * __wb_inode_ctx_get (xlator_t *this, inode_t *inode) @@ -225,6 +232,44 @@ out: } +static void +wb_set_invalidate (wb_inode_t *wb_inode, int set) +{ + int readdirps = 0; + inode_t *parent_inode = NULL; + wb_inode_t *wb_parent_inode = NULL; + + parent_inode = inode_parent (wb_inode->inode, NULL, NULL); + if (parent_inode) + wb_parent_inode = wb_inode_ctx_get (wb_inode->this, + parent_inode); + + if (wb_parent_inode) { + LOCK (&wb_parent_inode->lock); + { + readdirps = GF_ATOMIC_GET (wb_parent_inode->readdirps); + if (readdirps && set) { + GF_ATOMIC_SWAP (wb_inode->invalidate, 1); + list_del_init (&wb_inode->invalidate_list); + list_add (&wb_inode->invalidate_list, + &wb_parent_inode->invalidate_list); + } else if (readdirps == 0) { + GF_ATOMIC_SWAP (wb_inode->invalidate, 0); + list_del_init (&wb_inode->invalidate_list); + } + } + UNLOCK (&wb_parent_inode->lock); + } else { + GF_ATOMIC_SWAP (wb_inode->invalidate, 0); + } + + return; +} + +void +wb_process_queue (wb_inode_t *wb_inode); + + /* Below is a succinct explanation of the code deciding whether two regions overlap, from Pavan . @@ -646,12 +691,16 @@ __wb_inode_create (xlator_t *this, inode_t *inode) INIT_LIST_HEAD (&wb_inode->liability); INIT_LIST_HEAD (&wb_inode->temptation); INIT_LIST_HEAD (&wb_inode->wip); + INIT_LIST_HEAD (&wb_inode->invalidate_list); wb_inode->this = this; wb_inode->window_conf = conf->window_size; + wb_inode->inode = inode; LOCK_INIT (&wb_inode->lock); + GF_ATOMIC_INIT (wb_inode->invalidate, 0); + GF_ATOMIC_INIT (wb_inode->readdirps, 0); ret = __inode_ctx_put (inode, this, (uint64_t)(unsigned long)wb_inode); if (ret) { @@ -1047,6 +1096,30 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this, wb_inode = head->wb_inode; + /* There could be a readdirp session in progress. Since wb_fulfill_cbk + * can potentially remove a request from liability queue, + * wb_readdirp_cbk will miss writes on this inode (as it invalidates + * stats only if liability queue is not empty) and hence mark inode + * for invalidation of stats in readdirp response. Specifically this + * code fixes the following race mentioned in wb_readdirp_cbk: + */ + + /* + * We cannot guarantee integrity of entry->d_stat as there are cached + * writes. The stat is most likely stale as it doesn't account the + * cached writes. However, checking for non-empty liability list here is + * not a fool-proof solution as there can be races like, + * 1. readdirp is successful on posix + * 2. sync of cached write is successful on posix + * 3. write-behind received sync response and removed the request from + * liability queue + * 4. readdirp response is processed at write-behind + * + * In the above scenario, stat for the file is sent back in readdirp + * response but it is stale. + * */ + wb_set_invalidate (wb_inode, 1); + if (op_ret == -1) { wb_fulfill_err (head, op_errno); } else if (op_ret < head->total_size) { @@ -1070,14 +1143,13 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this, head->total_size += req->write_size; \ } while (0) - int wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head) { struct iovec vector[MAX_VECTOR_COUNT]; - int count = 0; - wb_request_t *req = NULL; - call_frame_t *frame = NULL; + int count = 0; + wb_request_t *req = NULL; + call_frame_t *frame = NULL; /* make sure head->total_size is updated before we run into any * errors @@ -1148,7 +1220,7 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities) conf = wb_inode->this->private; - list_for_each_entry_safe (req, tmp, liabilities, winds) { + list_for_each_entry_safe (req, tmp, liabilities, winds) { list_del_init (&req->winds); if (!head) { @@ -2442,6 +2514,48 @@ noqueue: return 0; } +static void +wb_mark_readdirp_start (xlator_t *this, inode_t *directory) +{ + wb_inode_t *wb_directory_inode = NULL; + + wb_directory_inode = wb_inode_create (this, directory); + + LOCK (&wb_directory_inode->lock); + { + GF_ATOMIC_INC (wb_directory_inode->readdirps); + } + UNLOCK (&wb_directory_inode->lock); + + return; +} + +static void +wb_mark_readdirp_end (xlator_t *this, inode_t *directory) +{ + wb_inode_t *wb_directory_inode = NULL, *wb_inode = NULL, *tmp = NULL; + int readdirps = 0; + + wb_directory_inode = wb_inode_ctx_get (this, directory); + + LOCK (&wb_directory_inode->lock); + { + readdirps = GF_ATOMIC_DEC (wb_directory_inode->readdirps); + if (readdirps) + goto unlock; + + list_for_each_entry_safe (wb_inode, tmp, + &wb_directory_inode->invalidate_list, + invalidate_list) { + list_del_init (&wb_inode->invalidate_list); + GF_ATOMIC_SWAP (wb_inode->invalidate, 0); + } + } +unlock: + UNLOCK (&wb_directory_inode->lock); + + return; +} int32_t wb_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -2451,6 +2565,10 @@ wb_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, wb_inode_t *wb_inode = NULL; gf_dirent_t *entry = NULL; inode_t *inode = NULL; + fd_t *fd = NULL; + + fd = frame->local; + frame->local = NULL; if (op_ret <= 0) goto unwind; @@ -2465,29 +2583,8 @@ wb_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, LOCK (&wb_inode->lock); { - if (!list_empty (&wb_inode->liability)) { - /* We cannot guarantee integrity of - entry->d_stat as there are cached writes. - The stat is most likely stale as it doesn't - account the cached writes. However, checking - for non-empty liability list here is not a - fool-proof solution as there can be races - like, - 1. readdirp is successful on posix - 2. sync of cached write is successful on - posix - 3. write-behind received sync response and - removed the request from liability queue - 4. readdirp response is processed at - write-behind - - In the above scenario, stat for the file is - sent back in readdirp response but it is - stale. - - For lack of better solutions I am sticking - with current solution. - */ + if (!list_empty (&wb_inode->liability) || + GF_ATOMIC_GET (wb_inode->invalidate)) { inode = entry->inode; entry->inode = NULL; @@ -2500,9 +2597,11 @@ wb_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, UNLOCK (&wb_inode->lock); } + wb_mark_readdirp_end (this, fd->inode); + unwind: - STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, - entries, xdata); + frame->local = NULL; + STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); return 0; } @@ -2511,6 +2610,10 @@ int32_t wb_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off, dict_t *xdata) { + wb_mark_readdirp_start (this, fd->inode); + + frame->local = fd; + STACK_WIND (frame, wb_readdirp_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata); -- cgit