From d7a4d256bd86aadcd60668ee37079514dfcf41f3 Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Sun, 23 Sep 2018 16:59:58 +0530 Subject: afr: thin-arbiter 2 domain locking and in-memory state 2 domain locking + xattrop for write-txn failures: -------------------------------------------------- - A post-op wound on TA takes AFR_TA_DOM_NOTIFY range lock and AFR_TA_DOM_MODIFY full lock, does xattrop on TA and releases AFR_TA_DOM_MODIFY lock and stores in-memory which brick is bad. - All further write txn failures are handled based on this in-memory value without querying the TA. - When shd heals the files, it does so by requesting full lock on AFR_TA_DOM_NOTIFY domain. Client uses this as a cue (via upcall), releases AFR_TA_DOM_NOTIFY range lock and invalidates its in-memory notion of which brick is bad. The next write txn failure is wound on TA to again update the in-memory state. - Any incomplete write txns that arrived before the AFR_TA_DOM_NOTIFY upcall release request was received are completed before the lock is released. - Any write txns received after the release request are maintained in a ta_waitq. - After the release is complete, the ta_waitq elements are spliced to a separate queue which is then processed one by one. - For fops that come in parallel when the in-memory bad brick is still unknown, only one is wound to TA on wire. The other ones are maintained in a ta_onwireq which is then processed after we get the response from TA. 
Change-Id: I32c7b61a61776663601ab0040e2f0767eca1fd64 updates: bz#1648205 Signed-off-by: Ravishankar N Signed-off-by: Ashish Pandey --- xlators/cluster/afr/src/afr-common.c | 204 +++++++++++++++++++++++++++-------- 1 file changed, 157 insertions(+), 47 deletions(-) (limited to 'xlators/cluster/afr/src/afr-common.c') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 7d352344e7a..54889e0a9b1 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -4893,6 +4893,13 @@ afr_priv_dump(xlator_t *this) gf_proc_dump_write("quorum-count", "%d", priv->quorum_count); } gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this)); + if (priv->thin_arbiter_count) { + gf_proc_dump_write("ta_child_up", "%d", priv->ta_child_up); + gf_proc_dump_write("ta_bad_child_index", "%d", + priv->ta_bad_child_index); + gf_proc_dump_write("ta_notify_dom_lock_offset", "%" PRId64, + priv->ta_notify_dom_lock_offset); + } return 0; } @@ -4904,14 +4911,19 @@ afr_priv_dump(xlator_t *this) */ static int -find_child_index(xlator_t *this, xlator_t *child) +afr_find_child_index(xlator_t *this, xlator_t *child) { afr_private_t *priv = NULL; + int child_count = -1; int i = -1; priv = this->private; + child_count = priv->child_count; + if (priv->thin_arbiter_count) { + child_count++; + } - for (i = 0; i < priv->child_count; i++) { + for (i = 0; i < child_count; i++) { if ((xlator_t *)child == priv->children[i]) break; } @@ -5307,6 +5319,103 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, priv->last_event[idx] = *event; } +void +afr_ta_lock_release_synctask(xlator_t *this) +{ + call_frame_t *ta_frame = NULL; + int ret = 0; + + ta_frame = afr_ta_frame_create(this); + if (!ta_frame) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to create ta_frame"); + return; + } + + ret = synctask_new(this->ctx->env, afr_release_notify_lock_for_ta, + afr_ta_lock_release_done, 
ta_frame, this); + if (ret) { + STACK_DESTROY(ta_frame->root); + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to release " + "AFR_TA_DOM_NOTIFY lock."); + } +} + +static void +afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall) +{ + struct gf_upcall_inodelk_contention *lc = NULL; + unsigned int inmem_count = 0; + unsigned int onwire_count = 0; + afr_private_t *priv = this->private; + + lc = upcall->data; + + if (strcmp(lc->domain, AFR_TA_DOM_NOTIFY) != 0) + return; + + if (priv->shd.iamshd) { + /* shd should ignore AFR_TA_DOM_NOTIFY release requests. */ + return; + } + LOCK(&priv->lock); + { + priv->release_ta_notify_dom_lock = _gf_true; + inmem_count = priv->ta_in_mem_txn_count; + onwire_count = priv->ta_on_wire_txn_count; + } + UNLOCK(&priv->lock); + if (inmem_count || onwire_count) + /* lock release will happen in txn code path after + * inflight or on-wire txns are over.*/ + return; + + afr_ta_lock_release_synctask(this); +} + +static void +afr_handle_upcall_event(xlator_t *this, struct gf_upcall *upcall) +{ + struct gf_upcall_cache_invalidation *up_ci = NULL; + afr_private_t *priv = this->private; + inode_t *inode = NULL; + inode_table_t *itable = NULL; + int i = 0; + + switch (upcall->event_type) { + case GF_UPCALL_INODELK_CONTENTION: + afr_handle_inodelk_contention(this, upcall); + break; + case GF_UPCALL_CACHE_INVALIDATION: + up_ci = (struct gf_upcall_cache_invalidation *)upcall->data; + + /* Since md-cache will be aggressively filtering + * lookups, the stale read issue will be more + * pronounced. 
Hence when a pending xattr is set notify + * all the md-cache clients to invalidate the existing + * stat cache and send the lookup next time */ + if (!up_ci->dict) + break; + for (i = 0; i < priv->child_count; i++) { + if (!dict_get(up_ci->dict, priv->pending_key[i])) + continue; + up_ci->flags |= UP_INVAL_ATTR; + itable = ((xlator_t *)this->graph->top)->itable; + /*Internal processes may not have itable for + *top xlator*/ + if (itable) + inode = inode_find(itable, upcall->gfid); + if (inode) + afr_inode_need_refresh_set(inode, this); + break; + } + break; + default: + break; + } +} + int32_t afr_notify(xlator_t *this, int32_t event, void *data, void *data2) { @@ -5324,10 +5433,6 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) dict_t *output = NULL; gf_boolean_t had_quorum = _gf_false; gf_boolean_t has_quorum = _gf_false; - struct gf_upcall *up_data = NULL; - struct gf_upcall_cache_invalidation *up_ci = NULL; - inode_table_t *itable = NULL; - inode_t *inode = NULL; int64_t halo_max_latency_msec = 0; int64_t child_latency_msec = -1; @@ -5355,7 +5460,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) * subsequent revalidate lookup happens on all the dht's subvolumes * which triggers afr self-heals if any. 
*/ - idx = find_child_index(this, child_xlator); + idx = afr_find_child_index(this, child_xlator); if (idx < 0) { gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP, "Received child_up from invalid subvolume"); @@ -5404,6 +5509,10 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) goto out; } + if (event == GF_EVENT_UPCALL) { + afr_handle_upcall_event(this, data); + } + LOCK(&priv->lock); { had_heard_from_all = __get_heard_from_all_status(this); @@ -5413,12 +5522,22 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) propagate = 1; break; case GF_EVENT_CHILD_UP: + if (priv->thin_arbiter_count && + (idx == AFR_CHILD_THIN_ARBITER)) { + priv->ta_child_up = 1; + break; + } __afr_handle_child_up_event(this, child_xlator, idx, child_latency_msec, &event, &call_psh, &up_child); break; case GF_EVENT_CHILD_DOWN: + if (priv->thin_arbiter_count && + (idx == AFR_CHILD_THIN_ARBITER)) { + priv->ta_child_up = 0; + break; + } __afr_handle_child_down_event(this, child_xlator, idx, child_latency_msec, &event, &call_psh, &up_child); @@ -5432,34 +5551,6 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) case GF_EVENT_SOME_DESCENDENT_DOWN: priv->last_event[idx] = event; break; - case GF_EVENT_UPCALL: - up_data = (struct gf_upcall *)data; - if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION) - break; - up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; - - /* Since md-cache will be aggressively filtering - * lookups, the stale read issue will be more - * pronounced. 
Hence when a pending xattr is set notify - * all the md-cache clients to invalidate the existing - * stat cache and send the lookup next time */ - if (!up_ci->dict) - break; - for (i = 0; i < priv->child_count; i++) { - if (dict_get(up_ci->dict, priv->pending_key[i])) { - up_ci->flags |= UP_INVAL_ATTR; - itable = ((xlator_t *)this->graph->top)->itable; - /*Internal processes may not have itable for top - * xlator*/ - if (itable) - inode = inode_find(itable, up_data->gfid); - if (inode) - afr_inode_need_refresh_set(inode, this); - - break; - } - } - break; default: propagate = 1; break; @@ -5599,6 +5690,10 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) } local->need_full_crawl = _gf_false; + if (priv->thin_arbiter_count) { + local->ta_child_up = priv->ta_child_up; + local->ta_failed_subvol = AFR_CHILD_UNKNOWN; + } INIT_LIST_HEAD(&local->healer); return 0; @@ -5712,6 +5807,8 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this) ret = 0; INIT_LIST_HEAD(&local->transaction.wait_list); INIT_LIST_HEAD(&local->transaction.owner_list); + INIT_LIST_HEAD(&local->ta_waitq); + INIT_LIST_HEAD(&local->ta_onwireq); out: return ret; } @@ -6700,9 +6797,6 @@ afr_ta_is_fop_called_from_synctask(xlator_t *this) int afr_ta_post_op_lock(xlator_t *this, loc_t *loc) { - /*Note: At any given time, only one instance of this function must - * be in progress.*/ - int ret = 0; uuid_t gfid = { 0, @@ -6717,6 +6811,11 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc) }; int32_t cmd = 0; + /* Clients must take AFR_TA_DOM_NOTIFY lock only when the previous lock + * has been released in afr_notify due to upcall notification from shd. 
+ */ + GF_ASSERT(priv->ta_notify_dom_lock_offset == 0); + if (!priv->shd.iamshd) GF_ASSERT(afr_ta_is_fop_called_from_synctask(this)); flock1.l_type = F_WRLCK; @@ -6728,14 +6827,10 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc) flock1.l_len = 0; } else { cmd = F_SETLK; - if (priv->ta_notify_dom_lock_offset) { - flock1.l_start = priv->ta_notify_dom_lock_offset; - } else { - gf_uuid_generate(gfid); - flock1.l_start = gfid_to_ino(gfid); - if (flock1.l_start < 0) - flock1.l_start = -flock1.l_start; - } + gf_uuid_generate(gfid); + flock1.l_start = gfid_to_ino(gfid); + if (flock1.l_start < 0) + flock1.l_start = -flock1.l_start; flock1.l_len = 1; } ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], @@ -6761,7 +6856,7 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc) AFR_TA_DOM_MODIFY, loc, F_SETLKW, &flock2, NULL, NULL); if (ret) { gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, - "Failed to get AFR_TA_DOM_MODIFY lock."); + "Failed to get AFR_TA_DOM_MODIFY lock on %s.", loc->name); flock1.l_type = F_UNLCK; ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock1, NULL, @@ -6826,3 +6921,18 @@ afr_ta_frame_create(xlator_t *this) afr_set_lk_owner(frame, this, lk_owner); return frame; } + +gf_boolean_t +afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local) +{ + int data_count = 0; + + data_count = AFR_COUNT(local->child_up, priv->child_count); + if (data_count == 2) { + return _gf_true; + } else if (data_count == 1 && local->ta_child_up) { + return _gf_true; + } + + return _gf_false; +} -- cgit