From d7a4d256bd86aadcd60668ee37079514dfcf41f3 Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Sun, 23 Sep 2018 16:59:58 +0530 Subject: afr: thin-arbiter 2 domain locking and in-memory state 2 domain locking + xattrop for write-txn failures: -------------------------------------------------- - A post-op wound on TA takes AFR_TA_DOM_NOTIFY range lock and AFR_TA_DOM_MODIFY full lock, does xattrop on TA and releases AFR_TA_DOM_MODIFY lock and stores in-memory which brick is bad. - All further write txn failures are handled based on this in-memory value without querying the TA. - When shd heals the files, it does so by requesting full lock on AFR_TA_DOM_NOTIFY domain. Client uses this as a cue (via upcall), releases AFR_TA_DOM_NOTIFY range lock and invalidates its in-memory notion of which brick is bad. The next write txn failure is wound on TA to again update the in-memory state. - Any incomplete write txns that arrived before the AFR_TA_DOM_NOTIFY upcall release request was received are completed before the lock is released. - Any write txns received after the release request are maintained in a ta_waitq. - After the release is complete, the ta_waitq elements are spliced to a separate queue which is then processed one by one. - For fops that come in parallel when the in-memory bad brick is still unknown, only one is wound to TA on wire. The other ones are maintained in a ta_onwireq which is then processed after we get the response from TA. 
Change-Id: I32c7b61a61776663601ab0040e2f0767eca1fd64 updates: bz#1648205 Signed-off-by: Ravishankar N Signed-off-by: Ashish Pandey --- xlators/cluster/afr/src/afr-common.c | 204 +++++++++++++++++++++++++++-------- 1 file changed, 157 insertions(+), 47 deletions(-) (limited to 'xlators/cluster/afr/src/afr-common.c') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 7d352344e7a..54889e0a9b1 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -4893,6 +4893,13 @@ afr_priv_dump(xlator_t *this) gf_proc_dump_write("quorum-count", "%d", priv->quorum_count); } gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this)); + if (priv->thin_arbiter_count) { + gf_proc_dump_write("ta_child_up", "%d", priv->ta_child_up); + gf_proc_dump_write("ta_bad_child_index", "%d", + priv->ta_bad_child_index); + gf_proc_dump_write("ta_notify_dom_lock_offset", "%" PRId64, + priv->ta_notify_dom_lock_offset); + } return 0; } @@ -4904,14 +4911,19 @@ afr_priv_dump(xlator_t *this) */ static int -find_child_index(xlator_t *this, xlator_t *child) +afr_find_child_index(xlator_t *this, xlator_t *child) { afr_private_t *priv = NULL; + int child_count = -1; int i = -1; priv = this->private; + child_count = priv->child_count; + if (priv->thin_arbiter_count) { + child_count++; + } - for (i = 0; i < priv->child_count; i++) { + for (i = 0; i < child_count; i++) { if ((xlator_t *)child == priv->children[i]) break; } @@ -5307,6 +5319,103 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, priv->last_event[idx] = *event; } +void +afr_ta_lock_release_synctask(xlator_t *this) +{ + call_frame_t *ta_frame = NULL; + int ret = 0; + + ta_frame = afr_ta_frame_create(this); + if (!ta_frame) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to create ta_frame"); + return; + } + + ret = synctask_new(this->ctx->env, afr_release_notify_lock_for_ta, + afr_ta_lock_release_done, 
ta_frame, this); + if (ret) { + STACK_DESTROY(ta_frame->root); + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to release " + "AFR_TA_DOM_NOTIFY lock."); + } +} + +static void +afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall) +{ + struct gf_upcall_inodelk_contention *lc = NULL; + unsigned int inmem_count = 0; + unsigned int onwire_count = 0; + afr_private_t *priv = this->private; + + lc = upcall->data; + + if (strcmp(lc->domain, AFR_TA_DOM_NOTIFY) != 0) + return; + + if (priv->shd.iamshd) { + /* shd should ignore AFR_TA_DOM_NOTIFY release requests. */ + return; + } + LOCK(&priv->lock); + { + priv->release_ta_notify_dom_lock = _gf_true; + inmem_count = priv->ta_in_mem_txn_count; + onwire_count = priv->ta_on_wire_txn_count; + } + UNLOCK(&priv->lock); + if (inmem_count || onwire_count) + /* lock release will happen in txn code path after + * inflight or on-wire txns are over.*/ + return; + + afr_ta_lock_release_synctask(this); +} + +static void +afr_handle_upcall_event(xlator_t *this, struct gf_upcall *upcall) +{ + struct gf_upcall_cache_invalidation *up_ci = NULL; + afr_private_t *priv = this->private; + inode_t *inode = NULL; + inode_table_t *itable = NULL; + int i = 0; + + switch (upcall->event_type) { + case GF_UPCALL_INODELK_CONTENTION: + afr_handle_inodelk_contention(this, upcall); + break; + case GF_UPCALL_CACHE_INVALIDATION: + up_ci = (struct gf_upcall_cache_invalidation *)upcall->data; + + /* Since md-cache will be aggressively filtering + * lookups, the stale read issue will be more + * pronounced. 
Hence when a pending xattr is set notify + * all the md-cache clients to invalidate the existing + * stat cache and send the lookup next time */ + if (!up_ci->dict) + break; + for (i = 0; i < priv->child_count; i++) { + if (!dict_get(up_ci->dict, priv->pending_key[i])) + continue; + up_ci->flags |= UP_INVAL_ATTR; + itable = ((xlator_t *)this->graph->top)->itable; + /*Internal processes may not have itable for + *top xlator*/ + if (itable) + inode = inode_find(itable, upcall->gfid); + if (inode) + afr_inode_need_refresh_set(inode, this); + break; + } + break; + default: + break; + } +} + int32_t afr_notify(xlator_t *this, int32_t event, void *data, void *data2) { @@ -5324,10 +5433,6 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) dict_t *output = NULL; gf_boolean_t had_quorum = _gf_false; gf_boolean_t has_quorum = _gf_false; - struct gf_upcall *up_data = NULL; - struct gf_upcall_cache_invalidation *up_ci = NULL; - inode_table_t *itable = NULL; - inode_t *inode = NULL; int64_t halo_max_latency_msec = 0; int64_t child_latency_msec = -1; @@ -5355,7 +5460,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) * subsequent revalidate lookup happens on all the dht's subvolumes * which triggers afr self-heals if any. 
*/ - idx = find_child_index(this, child_xlator); + idx = afr_find_child_index(this, child_xlator); if (idx < 0) { gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP, "Received child_up from invalid subvolume"); @@ -5404,6 +5509,10 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) goto out; } + if (event == GF_EVENT_UPCALL) { + afr_handle_upcall_event(this, data); + } + LOCK(&priv->lock); { had_heard_from_all = __get_heard_from_all_status(this); @@ -5413,12 +5522,22 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) propagate = 1; break; case GF_EVENT_CHILD_UP: + if (priv->thin_arbiter_count && + (idx == AFR_CHILD_THIN_ARBITER)) { + priv->ta_child_up = 1; + break; + } __afr_handle_child_up_event(this, child_xlator, idx, child_latency_msec, &event, &call_psh, &up_child); break; case GF_EVENT_CHILD_DOWN: + if (priv->thin_arbiter_count && + (idx == AFR_CHILD_THIN_ARBITER)) { + priv->ta_child_up = 0; + break; + } __afr_handle_child_down_event(this, child_xlator, idx, child_latency_msec, &event, &call_psh, &up_child); @@ -5432,34 +5551,6 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) case GF_EVENT_SOME_DESCENDENT_DOWN: priv->last_event[idx] = event; break; - case GF_EVENT_UPCALL: - up_data = (struct gf_upcall *)data; - if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION) - break; - up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; - - /* Since md-cache will be aggressively filtering - * lookups, the stale read issue will be more - * pronounced. 
Hence when a pending xattr is set notify - * all the md-cache clients to invalidate the existing - * stat cache and send the lookup next time */ - if (!up_ci->dict) - break; - for (i = 0; i < priv->child_count; i++) { - if (dict_get(up_ci->dict, priv->pending_key[i])) { - up_ci->flags |= UP_INVAL_ATTR; - itable = ((xlator_t *)this->graph->top)->itable; - /*Internal processes may not have itable for top - * xlator*/ - if (itable) - inode = inode_find(itable, up_data->gfid); - if (inode) - afr_inode_need_refresh_set(inode, this); - - break; - } - } - break; default: propagate = 1; break; @@ -5599,6 +5690,10 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) } local->need_full_crawl = _gf_false; + if (priv->thin_arbiter_count) { + local->ta_child_up = priv->ta_child_up; + local->ta_failed_subvol = AFR_CHILD_UNKNOWN; + } INIT_LIST_HEAD(&local->healer); return 0; @@ -5712,6 +5807,8 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this) ret = 0; INIT_LIST_HEAD(&local->transaction.wait_list); INIT_LIST_HEAD(&local->transaction.owner_list); + INIT_LIST_HEAD(&local->ta_waitq); + INIT_LIST_HEAD(&local->ta_onwireq); out: return ret; } @@ -6700,9 +6797,6 @@ afr_ta_is_fop_called_from_synctask(xlator_t *this) int afr_ta_post_op_lock(xlator_t *this, loc_t *loc) { - /*Note: At any given time, only one instance of this function must - * be in progress.*/ - int ret = 0; uuid_t gfid = { 0, @@ -6717,6 +6811,11 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc) }; int32_t cmd = 0; + /* Clients must take AFR_TA_DOM_NOTIFY lock only when the previous lock + * has been released in afr_notify due to upcall notification from shd. 
+ */ + GF_ASSERT(priv->ta_notify_dom_lock_offset == 0); + if (!priv->shd.iamshd) GF_ASSERT(afr_ta_is_fop_called_from_synctask(this)); flock1.l_type = F_WRLCK; @@ -6728,14 +6827,10 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc) flock1.l_len = 0; } else { cmd = F_SETLK; - if (priv->ta_notify_dom_lock_offset) { - flock1.l_start = priv->ta_notify_dom_lock_offset; - } else { - gf_uuid_generate(gfid); - flock1.l_start = gfid_to_ino(gfid); - if (flock1.l_start < 0) - flock1.l_start = -flock1.l_start; - } + gf_uuid_generate(gfid); + flock1.l_start = gfid_to_ino(gfid); + if (flock1.l_start < 0) + flock1.l_start = -flock1.l_start; flock1.l_len = 1; } ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], @@ -6761,7 +6856,7 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc) AFR_TA_DOM_MODIFY, loc, F_SETLKW, &flock2, NULL, NULL); if (ret) { gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, - "Failed to get AFR_TA_DOM_MODIFY lock."); + "Failed to get AFR_TA_DOM_MODIFY lock on %s.", loc->name); flock1.l_type = F_UNLCK; ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock1, NULL, @@ -6826,3 +6921,18 @@ afr_ta_frame_create(xlator_t *this) afr_set_lk_owner(frame, this, lk_owner); return frame; } + +gf_boolean_t +afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local) +{ + int data_count = 0; + + data_count = AFR_COUNT(local->child_up, priv->child_count); + if (data_count == 2) { + return _gf_true; + } else if (data_count == 1 && local->ta_child_up) { + return _gf_true; + } + + return _gf_false; +} -- cgit