diff options
| -rw-r--r-- | tests/basic/afr/ta.t | 10 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 204 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-read-txn.c | 8 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 485 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.h | 5 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 22 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 31 | 
7 files changed, 684 insertions, 81 deletions
diff --git a/tests/basic/afr/ta.t b/tests/basic/afr/ta.t index 20c29784eaf..d8beb1be461 100644 --- a/tests/basic/afr/ta.t +++ b/tests/basic/afr/ta.t @@ -32,11 +32,11 @@ EXPECT "000000010000000200000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/bri  TEST ! ls $B0/brick0/b.txt  TEST ls $B0/brick1/b.txt -#This is not working as expected -#TEST ta_kill_brick ta -#TEST touch $M0/c.txt -#TEST ! ls $B0/brick0/c.txt -#TEST ! ls $B0/brick1/c.txt +TEST ta_kill_brick ta +# Entry create must fail since only 1 brick is up. +TEST ! touch $M0/c.txt +TEST ! ls $B0/brick0/c.txt +TEST ! ls $B0/brick1/c.txt  TEST ta_start_brick_process brick0  EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 7d352344e7a..54889e0a9b1 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -4893,6 +4893,13 @@ afr_priv_dump(xlator_t *this)          gf_proc_dump_write("quorum-count", "%d", priv->quorum_count);      }      gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this)); +    if (priv->thin_arbiter_count) { +        gf_proc_dump_write("ta_child_up", "%d", priv->ta_child_up); +        gf_proc_dump_write("ta_bad_child_index", "%d", +                           priv->ta_bad_child_index); +        gf_proc_dump_write("ta_notify_dom_lock_offset", "%" PRId64, +                           priv->ta_notify_dom_lock_offset); +    }      return 0;  } @@ -4904,14 +4911,19 @@ afr_priv_dump(xlator_t *this)   */  static int -find_child_index(xlator_t *this, xlator_t *child) +afr_find_child_index(xlator_t *this, xlator_t *child)  {      afr_private_t *priv = NULL; +    int child_count = -1;      int i = -1;      priv = this->private; +    child_count = priv->child_count; +    if (priv->thin_arbiter_count) { +        child_count++; +    } -    for (i = 0; i < priv->child_count; i++) { +    for (i = 0; i < child_count; i++) {          if ((xlator_t *)child == priv->children[i])              break;      } @@ -5307,6 +5319,103 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,      priv->last_event[idx] = *event;  } +void +afr_ta_lock_release_synctask(xlator_t *this) +{ +    call_frame_t *ta_frame = NULL; +    int ret = 0; + +    ta_frame = afr_ta_frame_create(this); +    if (!ta_frame) { +        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, +               "Failed to create ta_frame"); +        return; +    } + +    ret = synctask_new(this->ctx->env, afr_release_notify_lock_for_ta, +                       afr_ta_lock_release_done, ta_frame, this); +    if (ret) { +        STACK_DESTROY(ta_frame->root); +        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, +               "Failed to release " +               "AFR_TA_DOM_NOTIFY lock."); +    } +} + +static void +afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall) +{ +    struct gf_upcall_inodelk_contention *lc = NULL; +    unsigned int inmem_count = 0; +    unsigned int onwire_count = 0; +    afr_private_t *priv = this->private; + +    lc = upcall->data; + +    if (strcmp(lc->domain, AFR_TA_DOM_NOTIFY) != 0) +        return; + +    if (priv->shd.iamshd) { +        /* shd should ignore AFR_TA_DOM_NOTIFY release requests. */ +        return; +    } +    LOCK(&priv->lock); +    { +        priv->release_ta_notify_dom_lock = _gf_true; +        inmem_count = priv->ta_in_mem_txn_count; +        onwire_count = priv->ta_on_wire_txn_count; +    } +    UNLOCK(&priv->lock); +    if (inmem_count || onwire_count) +        /* lock release will happen in txn code path after +         * inflight or on-wire txns are over.*/ +        return; + +    afr_ta_lock_release_synctask(this); +} + +static void +afr_handle_upcall_event(xlator_t *this, struct gf_upcall *upcall) +{ +    struct gf_upcall_cache_invalidation *up_ci = NULL; +    afr_private_t *priv = this->private; +    inode_t *inode = NULL; +    inode_table_t *itable = NULL; +    int i = 0; + +    switch (upcall->event_type) { +        case GF_UPCALL_INODELK_CONTENTION: +            afr_handle_inodelk_contention(this, upcall); +            break; +        case GF_UPCALL_CACHE_INVALIDATION: +            up_ci = (struct gf_upcall_cache_invalidation *)upcall->data; + +            /* Since md-cache will be aggressively filtering +             * lookups, the stale read issue will be more +             * pronounced. Hence when a pending xattr is set notify +             * all the md-cache clients to invalidate the existing +             * stat cache and send the lookup next time */ +            if (!up_ci->dict) +                break; +            for (i = 0; i < priv->child_count; i++) { +                if (!dict_get(up_ci->dict, priv->pending_key[i])) +                    continue; +                up_ci->flags |= UP_INVAL_ATTR; +                itable = ((xlator_t *)this->graph->top)->itable; +                /*Internal processes may not have itable for +                 *top xlator*/ +                if (itable) +                    inode = inode_find(itable, upcall->gfid); +                if (inode) +                    afr_inode_need_refresh_set(inode, this); +                break; +            } +            break; +        default: +            break; +    } +} +  int32_t  afr_notify(xlator_t *this, int32_t event, void *data, void *data2)  { @@ -5324,10 +5433,6 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)      dict_t *output = NULL;      gf_boolean_t had_quorum = _gf_false;      gf_boolean_t has_quorum = _gf_false; -    struct gf_upcall *up_data = NULL; -    struct gf_upcall_cache_invalidation *up_ci = NULL; -    inode_table_t *itable = NULL; -    inode_t *inode = NULL;      int64_t halo_max_latency_msec = 0;      int64_t child_latency_msec = -1; @@ -5355,7 +5460,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)       * subsequent revalidate lookup happens on all the dht's subvolumes       * which triggers afr self-heals if any.       */ -    idx = find_child_index(this, child_xlator); +    idx = afr_find_child_index(this, child_xlator);      if (idx < 0) {          gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP,                 "Received child_up from invalid subvolume"); @@ -5404,6 +5509,10 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)          goto out;      } +    if (event == GF_EVENT_UPCALL) { +        afr_handle_upcall_event(this, data); +    } +      LOCK(&priv->lock);      {          had_heard_from_all = __get_heard_from_all_status(this); @@ -5413,12 +5522,22 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)                  propagate = 1;                  break;              case GF_EVENT_CHILD_UP: +                if (priv->thin_arbiter_count && +                    (idx == AFR_CHILD_THIN_ARBITER)) { +                    priv->ta_child_up = 1; +                    break; +                }                  __afr_handle_child_up_event(this, child_xlator, idx,                                              child_latency_msec, &event,                                              &call_psh, &up_child);                  break;              case GF_EVENT_CHILD_DOWN: +                if (priv->thin_arbiter_count && +                    (idx == AFR_CHILD_THIN_ARBITER)) { +                    priv->ta_child_up = 0; +                    break; +                }                  __afr_handle_child_down_event(this, child_xlator, idx,                                                child_latency_msec, &event,                                                &call_psh, &up_child); @@ -5432,34 +5551,6 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)              case GF_EVENT_SOME_DESCENDENT_DOWN:                  priv->last_event[idx] = event;                  break; -            case GF_EVENT_UPCALL: -                up_data = (struct gf_upcall *)data; -                if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION) -                    break; -                up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; - -                /* Since md-cache will be aggressively filtering -                 * lookups, the stale read issue will be more -                 * pronounced. Hence when a pending xattr is set notify -                 * all the md-cache clients to invalidate the existing -                 * stat cache and send the lookup next time */ -                if (!up_ci->dict) -                    break; -                for (i = 0; i < priv->child_count; i++) { -                    if (dict_get(up_ci->dict, priv->pending_key[i])) { -                        up_ci->flags |= UP_INVAL_ATTR; -                        itable = ((xlator_t *)this->graph->top)->itable; -                        /*Internal processes may not have itable for top -                         * xlator*/ -                        if (itable) -                            inode = inode_find(itable, up_data->gfid); -                        if (inode) -                            afr_inode_need_refresh_set(inode, this); - -                        break; -                    } -                } -                break;              default:                  propagate = 1;                  break; @@ -5599,6 +5690,10 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno)      }      local->need_full_crawl = _gf_false; +    if (priv->thin_arbiter_count) { +        local->ta_child_up = priv->ta_child_up; +        local->ta_failed_subvol = AFR_CHILD_UNKNOWN; +    }      INIT_LIST_HEAD(&local->healer);      return 0; @@ -5712,6 +5807,8 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this)      ret = 0;      INIT_LIST_HEAD(&local->transaction.wait_list);      INIT_LIST_HEAD(&local->transaction.owner_list); +    INIT_LIST_HEAD(&local->ta_waitq); +    INIT_LIST_HEAD(&local->ta_onwireq);  out:      return ret;  } @@ -6700,9 +6797,6 @@ afr_ta_is_fop_called_from_synctask(xlator_t *this)  int  afr_ta_post_op_lock(xlator_t *this, loc_t *loc)  { -    /*Note: At any given time, only one instance of this function must -     * be in progress.*/ -      int ret = 0;      uuid_t gfid = {          0, @@ -6717,6 +6811,11 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)      };      int32_t cmd = 0; +    /* Clients must take AFR_TA_DOM_NOTIFY lock only when the previous lock +     * has been released in afr_notify due to upcall notification from shd. +     */ +    GF_ASSERT(priv->ta_notify_dom_lock_offset == 0); +      if (!priv->shd.iamshd)          GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));      flock1.l_type = F_WRLCK; @@ -6728,14 +6827,10 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)              flock1.l_len = 0;          } else {              cmd = F_SETLK; -            if (priv->ta_notify_dom_lock_offset) { -                flock1.l_start = priv->ta_notify_dom_lock_offset; -            } else { -                gf_uuid_generate(gfid); -                flock1.l_start = gfid_to_ino(gfid); -                if (flock1.l_start < 0) -                    flock1.l_start = -flock1.l_start; -            } +            gf_uuid_generate(gfid); +            flock1.l_start = gfid_to_ino(gfid); +            if (flock1.l_start < 0) +                flock1.l_start = -flock1.l_start;              flock1.l_len = 1;          }          ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], @@ -6761,7 +6856,7 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)                           AFR_TA_DOM_MODIFY, loc, F_SETLKW, &flock2, NULL, NULL);      if (ret) {          gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, -               "Failed to get AFR_TA_DOM_MODIFY lock."); +               "Failed to get AFR_TA_DOM_MODIFY lock on %s.", loc->name);          flock1.l_type = F_UNLCK;          ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],                               AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock1, NULL, @@ -6826,3 +6921,18 @@ afr_ta_frame_create(xlator_t *this)      afr_set_lk_owner(frame, this, lk_owner);      return frame;  } + +gf_boolean_t +afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local) +{ +    int data_count = 0; + +    data_count = AFR_COUNT(local->child_up, priv->child_count); +    if (data_count == 2) { +        return _gf_true; +    } else if (data_count == 1 && local->ta_child_up) { +        return _gf_true; +    } + +    return _gf_false; +} diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index 1df39c35fce..1cd5c2eee3b 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -261,6 +261,8 @@ afr_ta_read_txn_synctask(call_frame_t *frame, xlator_t *this)      if (!ta_frame) {          local->op_ret = -1;          local->op_errno = ENOMEM; +        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, +               "Failed to create ta_frame");          goto out;      }      ret = synctask_new(this->ctx->env, afr_ta_read_txn, afr_ta_read_txn_done, @@ -440,6 +442,12 @@ afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,          goto read;      } +    if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) { +        local->op_ret = -1; +        local->op_errno = -afr_quorum_errno(priv); +        goto read; +    } +      if (priv->thin_arbiter_count &&          AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {          afr_ta_read_txn_synctask(frame, this); diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index cf153dea9cc..fb78c198d9c 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -32,7 +32,7 @@ void  __afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared);  void -afr_changelog_post_op(call_frame_t *frame, xlator_t *this); +afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this);  int  afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this); @@ -53,6 +53,90 @@ afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr,                   afr_changelog_resume_t changelog_resume,                   afr_xattrop_type_t op); +static void +afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this); + +static int +afr_ta_post_op_do(void *opaque); + +static int +afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local); + +static int +afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this); + +static void +afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno); + +static void +afr_ta_process_waitq(xlator_t *this) +{ +    afr_local_t *entry = NULL; +    afr_private_t *priv = this->private; +    struct list_head waitq = { +        0, +    }; + +    INIT_LIST_HEAD(&waitq); +    LOCK(&priv->lock); +    list_splice_init(&priv->ta_waitq, &waitq); +    UNLOCK(&priv->lock); +    list_for_each_entry(entry, &waitq, ta_waitq) +    { +        afr_ta_decide_post_op_state(entry->transaction.frame, this); +    } +} + +int +afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque) +{ +    afr_ta_process_waitq(ta_frame->this); +    STACK_DESTROY(ta_frame->root); +    return 0; +} + +int +afr_release_notify_lock_for_ta(void *opaque) +{ +    xlator_t *this = NULL; +    afr_private_t *priv = NULL; +    loc_t loc = { +        0, +    }; +    struct gf_flock flock = { +        0, +    }; +    int ret = -1; + +    this = (xlator_t *)opaque; +    priv = this->private; +    ret = afr_fill_ta_loc(this, &loc); +    if (ret) { +        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, +               "Failed to populate loc for thin-arbiter."); +        goto out; +    } +    flock.l_type = F_UNLCK; +    flock.l_start = priv->ta_notify_dom_lock_offset; +    flock.l_len = 1; +    ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], +                         AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL); +    if (!ret) { +        LOCK(&priv->lock); +        priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; +        priv->release_ta_notify_dom_lock = _gf_false; +        priv->ta_notify_dom_lock_offset = 0; +        UNLOCK(&priv->lock); +    } else { +        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, +               "Failed to unlock AFR_TA_DOM_NOTIFY lock."); +    } + +out: +    loc_wipe(&loc); +    return ret; +} +  void  afr_zero_fill_stat(afr_local_t *local)  { @@ -576,18 +660,109 @@ afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending)      return ret;  } +static void +afr_ta_dom_lock_check_and_release(afr_local_t *local, xlator_t *this) +{ +    afr_private_t *priv = this->private; +    unsigned int inmem_count = 0; +    unsigned int onwire_count = 0; +    gf_boolean_t release = _gf_false; + +    LOCK(&priv->lock); +    { +        /*Once we get notify lock release upcall notification, +         if two fop states are non empty/non zero, we will +         not release lock. +         1 - If anything in memory txn +         2 - If anything in onwire or onwireq +         */ +        if (local->fop_state == TA_INFO_IN_MEMORY_SUCCESS) { +            inmem_count = --priv->ta_in_mem_txn_count; +        } else { +            inmem_count = priv->ta_in_mem_txn_count; +        } +        onwire_count = priv->ta_on_wire_txn_count; +        release = priv->release_ta_notify_dom_lock; +    } +    UNLOCK(&priv->lock); + +    if (inmem_count != 0 || release == _gf_false || onwire_count != 0) +        return; + +    afr_ta_lock_release_synctask(this); +} + +static void +afr_ta_process_onwireq(afr_local_t *local, xlator_t *this) +{ +    afr_private_t *priv = this->private; +    afr_local_t *entry = NULL; +    int bad_child = AFR_CHILD_UNKNOWN; + +    struct list_head onwireq = { +        0, +    }; +    INIT_LIST_HEAD(&onwireq); -/* {{{ pending */ +    LOCK(&priv->lock); +    { +        if (--priv->ta_on_wire_txn_count == 0) { +            UNLOCK(&priv->lock); +            /*Only one write fop came and after taking notify +             *lock and before doing xattrop, it has received +             *lock contention upcall, so this is the only place +             *to find this out and release the lock*/ +            afr_ta_dom_lock_check_and_release(local, this); +            return; +        } +        bad_child = priv->ta_bad_child_index; +        if (bad_child == AFR_CHILD_UNKNOWN) { +            /*The previous on-wire ta_post_op was a failure. Just dequeue +             *one element to wind on-wire again. */ +            entry = list_entry(priv->ta_onwireq.next, afr_local_t, ta_onwireq); +            list_del_init(&entry->ta_onwireq); +        } else { +            /* Prepare to process all fops based on bad_child_index. */ +            list_splice_init(&priv->ta_onwireq, &onwireq); +        } +    } +    UNLOCK(&priv->lock); + +    if (entry) { +        afr_ta_post_op_synctask(this, entry); +        return; +    } else { +        while (!list_empty(&onwireq)) { +            entry = list_entry(onwireq.next, afr_local_t, ta_onwireq); +            list_del_init(&entry->ta_onwireq); +            LOCK(&priv->lock); +            --priv->ta_on_wire_txn_count; +            UNLOCK(&priv->lock); +            if (entry->ta_failed_subvol == bad_child) { +                afr_changelog_post_op_do(entry->transaction.frame, this); +            } else { +                afr_changelog_post_op_fail(entry->transaction.frame, this, EIO); +            } +        } +    } +}  int  afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)  {      afr_local_t *local = NULL;      afr_internal_lock_t *int_lock = NULL; +    afr_private_t *priv = NULL;      local = frame->local; +    priv = this->private;      int_lock = &local->internal_lock; +    if (priv->thin_arbiter_count) { +        /*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */ +        afr_ta_dom_lock_check_and_release(frame->local, this); +    } +      /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */      if (!afr_changelog_has_quorum(local, this)) {          local->op_ret = -1; @@ -605,6 +780,20 @@ afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)      return 0;  } +static void +afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno) +{ +    afr_local_t *local = frame->local; +    local->op_ret = -1; +    local->op_errno = op_errno; + +    gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_THIN_ARB, +           "Failing %s for gfid %s. Fop state is:%d", gf_fop_list[local->op], +           uuid_utoa(local->inode->gfid), local->fop_state); + +    afr_changelog_post_op_done(frame, this); +} +  unsigned char *  afr_locked_nodes_get(afr_transaction_type type, afr_internal_lock_t *int_lock)  { @@ -983,8 +1172,240 @@ out:      return ret;  } -int -afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this) +static int +afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque) +{ +    xlator_t *this = NULL; +    afr_local_t *local = NULL; + +    local = (afr_local_t *)opaque; +    this = frame->this; + +    STACK_DESTROY(frame->root); +    afr_ta_process_onwireq(local, this); + +    return 0; +} + +static int +afr_ta_post_op_do(void *opaque) +{ +    afr_local_t *local = NULL; +    afr_private_t *priv = NULL; +    xlator_t *this = NULL; +    call_frame_t *txn_frame = NULL; +    dict_t *xattr = NULL; +    int **pending = NULL; +    int failed_subvol = -1; +    int success_subvol = -1; +    loc_t loc = { +        0, +    }; +    int idx = 0; +    int i = 0; +    int ret = 0; + +    local = (afr_local_t *)opaque; +    txn_frame = local->transaction.frame; +    this = txn_frame->this; +    priv = this->private; +    idx = afr_index_for_transaction_type(local->transaction.type); + +    ret = afr_fill_ta_loc(this, &loc); +    if (ret) { +        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, +               "Failed to populate loc for thin-arbiter."); +        goto out; +    } + +    xattr = dict_new(); +    if (!xattr) { +        ret = -ENOMEM; +        goto out; +    } + +    pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); +    if (!pending) { +        ret = -ENOMEM; +        goto out; +    } +    for (i = 0; i < priv->child_count; i++) { +        if (local->transaction.failed_subvols[i]) { +            pending[i][idx] = hton32(1); +            failed_subvol = i; +        } else { +            success_subvol = i; +        } +    } + +    ret = afr_set_pending_dict(priv, xattr, pending); +    if (ret < 0) +        goto out; + +    ret = afr_ta_post_op_lock(this, &loc); +    if (ret) +        goto out; + +    ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, +                         GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL); +    LOCK(&priv->lock); +    { +        if (ret == 0) { +            priv->ta_bad_child_index = failed_subvol; +        } else if (ret == -EINVAL) { +            priv->ta_bad_child_index = success_subvol; +        } +    } +    UNLOCK(&priv->lock); +    if (ret) { +        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, +               "Post-op on thin-arbiter id file %s failed for gfid %s.", +               priv->pending_key[THIN_ARBITER_BRICK_INDEX], +               uuid_utoa(local->inode->gfid)); +        if (ret == -EINVAL) +            ret = -EIO; /* TA failed the fop. Return EIO to application. */ +    } + +    afr_ta_post_op_unlock(this, &loc); +out: +    if (xattr) +        dict_unref(xattr); + +    if (pending) +        afr_matrix_cleanup(pending, priv->child_count); + +    loc_wipe(&loc); + +    if (ret == 0) { +        /*Mark pending xattrs on the up data brick.*/ +        afr_changelog_post_op_do(local->transaction.frame, this); +    } else { +        afr_changelog_post_op_fail(local->transaction.frame, this, -ret); +    } +    return ret; +} + +static int +afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local) +{ +    call_frame_t *ta_frame = NULL; +    int ret = 0; + +    ta_frame = afr_ta_frame_create(this); +    if (!ta_frame) { +        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, +               "Failed to create ta_frame"); +        goto err; +    } +    ret = synctask_new(this->ctx->env, afr_ta_post_op_do, afr_ta_post_op_done, +                       ta_frame, local); +    if (ret) { +        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, +               "Failed to launch post-op on thin arbiter for gfid %s", +               uuid_utoa(local->inode->gfid)); +        STACK_DESTROY(ta_frame->root); +        goto err; +    } + +    return ret; +err: +    afr_changelog_post_op_fail(local->transaction.frame, this, ENOMEM); +    return ret; +} + +static void +afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local, +                     int *on_wire_count) +{ +    LOCK(&priv->lock); +    { +        if (priv->release_ta_notify_dom_lock == _gf_true) { +            /* Put the fop in waitq until notify dom lock is released.*/ +            local->fop_state = TA_WAIT_FOR_NOTIFY_LOCK_REL; +            list_add_tail(&local->ta_waitq, &priv->ta_waitq); +        } else if (priv->ta_bad_child_index == AFR_CHILD_UNKNOWN) { +            /* Post-op on thin-arbiter to decide success/failure. */ +            local->fop_state = TA_GET_INFO_FROM_TA_FILE; +            *on_wire_count = ++priv->ta_on_wire_txn_count; +            if (*on_wire_count > 1) { +                /*Avoid sending multiple on-wire post-ops on TA*/ +                list_add_tail(&local->ta_onwireq, &priv->ta_onwireq); +            } +        } else if (local->ta_failed_subvol == priv->ta_bad_child_index) { +            /* Post-op on TA not needed as the fop failed on the in-memory bad +             * brick. Just mark pending xattrs on the good data brick.*/ +            local->fop_state = TA_INFO_IN_MEMORY_SUCCESS; +            priv->ta_in_mem_txn_count++; +        } else { +            /* Post-op on TA not needed as the fop succeeded only on the +             * in-memory bad data brick and not the good one. Fail the fop.*/ +            local->fop_state = TA_INFO_IN_MEMORY_FAILED; +        } +    } +    UNLOCK(&priv->lock); +} + +static void +afr_ta_fill_failed_subvol(afr_private_t *priv, afr_local_t *local) +{ +    int i = 0; + +    for (i = 0; i < priv->child_count; i++) { +        if (local->transaction.failed_subvols[i]) { +            local->ta_failed_subvol = i; +            break; +        } +    } +} + +static void +afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this) +{ +    afr_private_t *priv = NULL; +    afr_local_t *local = NULL; +    int on_wire_count = 0; + +    priv = this->private; +    local = frame->local; + +    afr_ta_set_fop_state(priv, local, &on_wire_count); + +    switch (local->fop_state) { +        case TA_GET_INFO_FROM_TA_FILE: +            if (on_wire_count == 1) +                afr_ta_post_op_synctask(this, local); +            /*else, fop is queued in ta_onwireq.*/ +            break; +        case TA_WAIT_FOR_NOTIFY_LOCK_REL: +            /*Post releasing the notify lock, we will act on this queue*/ +            break; +        case TA_INFO_IN_MEMORY_SUCCESS: +            afr_changelog_post_op_do(frame, this); +            break; +        case TA_INFO_IN_MEMORY_FAILED: +            afr_changelog_post_op_fail(frame, this, EIO); +            break; +    } +    return; +} + +static void +afr_handle_failure_using_thin_arbiter(call_frame_t *frame, xlator_t *this) +{ +    afr_private_t *priv = this->private; +    afr_local_t *local = frame->local; + +    afr_ta_fill_failed_subvol(priv, local); +    gf_msg_debug(this->name, 0, +                 "Fop failed on data brick (%s) for gfid=%s. " +                 "ta info needed to decide fop result.", +                 priv->children[local->ta_failed_subvol]->name, +                 uuid_utoa(local->inode->gfid)); +    afr_ta_decide_post_op_state(frame, this); +} + +void +afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this)  {      afr_private_t *priv = this->private;      afr_local_t *local = NULL; @@ -1001,9 +1422,7 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)      xattr = dict_new();      if (!xattr) { -        local->op_ret = -1; -        local->op_errno = ENOMEM; -        afr_changelog_post_op_done(frame, this); +        afr_changelog_post_op_fail(frame, this, ENOMEM);          goto out;      } @@ -1030,9 +1449,8 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)      }      if (local->transaction.in_flight_sb) { -        local->op_ret = -1; -        local->op_errno = local->transaction.in_flight_sb_errno; -        afr_changelog_post_op_done(frame, this); +        afr_changelog_post_op_fail(frame, this, +                                   local->transaction.in_flight_sb_errno);          goto out;      } @@ -1043,17 +1461,7 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)      ret = afr_set_pending_dict(priv, xattr, local->pending);      if (ret < 0) { -        local->op_ret = -1; -        local->op_errno = ENOMEM; -        afr_changelog_post_op_done(frame, this); -        goto out; -    } - -    ret = afr_changelog_thin_arbiter_post_op(this, local); -    if (ret < 0) { -        local->op_ret = -1; -        local->op_errno = -ret; -        afr_changelog_post_op_done(frame, this); +        afr_changelog_post_op_fail(frame, this, ENOMEM);          goto out;      } @@ -1066,9 +1474,7 @@ set_dirty:      ret = dict_set_static_bin(xattr, AFR_DIRTY, local->dirty,                                sizeof(int) * AFR_NUM_CHANGE_LOGS);      if (ret) { -        local->op_ret = -1; -        local->op_errno = ENOMEM; -        afr_changelog_post_op_done(frame, this); +        afr_changelog_post_op_fail(frame, this, ENOMEM);          goto out;      } @@ -1078,6 +1484,32 @@ out:      if (xattr)          dict_unref(xattr); +    return; +} + +static int +afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this) +{ +    afr_private_t *priv = NULL; +    afr_local_t *local = NULL; +    int failed_count = 0; + +    priv = this->private; +    local = frame->local; + +    if (priv->thin_arbiter_count) { +        failed_count = AFR_COUNT(local->transaction.failed_subvols, +                                 priv->child_count); +        if (failed_count == 1) { +            afr_handle_failure_using_thin_arbiter(frame, this); +            return 0; +        } else { +            /* Txn either succeeded or failed on both data bricks. Let +             * post_op_do handle it as the case might be. */ +        } +    } + +    afr_changelog_post_op_do(frame, this);      return 0;  } @@ -2457,6 +2889,11 @@ afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type)          goto out;      } +    if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) { +        ret = -afr_quorum_errno(priv); +        goto out; +    } +      ret = afr_transaction_local_init(local, this);      if (ret < 0)          goto out; diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index fff8c65e976..35a922544bc 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -67,4 +67,9 @@ afr_lock(call_frame_t *frame, xlator_t *this);  void  afr_delayed_changelog_wake_up_cbk(void *data); +int +afr_release_notify_lock_for_ta(void *opaque); + +int +afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque);  #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 26950fd7927..5d5e536ff60 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -336,6 +336,22 @@ out:      return ret;  } +void +afr_ta_init(afr_private_t *priv) +{ +    priv->thin_arbiter_count = 1; +    priv->child_count--; +    priv->ta_child_up = 0; +    priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; +    priv->ta_notify_dom_lock_offset = 0; +    priv->ta_in_mem_txn_count = 0; +    priv->ta_on_wire_txn_count = 0; +    priv->release_ta_notify_dom_lock = _gf_false; +    INIT_LIST_HEAD(&priv->ta_waitq); +    INIT_LIST_HEAD(&priv->ta_onwireq); +    *priv->ta_gfid = 0; +} +  int32_t  init(xlator_t *this)  { @@ -380,11 +396,7 @@ init(xlator_t *this)      GF_OPTION_INIT("arbiter-count", priv->arbiter_count, uint32, out);      GF_OPTION_INIT("thin-arbiter", thin_arbiter, str, out);      if (thin_arbiter && strlen(thin_arbiter) > 0) { -        priv->thin_arbiter_count = 1; -        priv->child_count--; -        priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; -        priv->ta_notify_dom_lock_offset = 0; -        *priv->ta_gfid = 0; +        afr_ta_init(priv);      }      INIT_LIST_HEAD(&priv->healing);      INIT_LIST_HEAD(&priv->heal_waiting); diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 3d2c1950571..6f8015380f0 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -107,8 +107,20 @@ typedef enum {      AFR_CHILD_UNKNOWN = -1,      AFR_CHILD_ZERO,      AFR_CHILD_ONE, +    AFR_CHILD_THIN_ARBITER,  } afr_child_index; +typedef enum { +    TA_WAIT_FOR_NOTIFY_LOCK_REL, /*FOP came after notify domain lock upcall +                                   notification and waiting for its release.*/ +    TA_GET_INFO_FROM_TA_FILE,    /*FOP needs post-op on ta file to get +                                  *info about which brick is bad.*/ +    TA_INFO_IN_MEMORY_SUCCESS,   /*Bad brick info is in memory and fop failed +                                  *on BAD brick - Success*/ +    TA_INFO_IN_MEMORY_FAILED,    /*Bad brick info is in memory and fop failed +                                  *on GOOD brick - Failed*/ +} afr_ta_fop_state_t; +  struct afr_nfsd {      gf_boolean_t iamnfsd;      uint32_t halo_max_latency_msec; @@ -127,8 +139,14 @@ typedef struct _afr_private {      /* For thin-arbiter. */      unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/      uuid_t ta_gfid; +    unsigned char ta_child_up;      int ta_bad_child_index;      off_t ta_notify_dom_lock_offset; +    gf_boolean_t release_ta_notify_dom_lock; +    unsigned int ta_in_mem_txn_count; +    unsigned int ta_on_wire_txn_count; +    struct list_head ta_waitq; +    struct list_head ta_onwireq;      unsigned char *child_up;      int64_t *child_latency; @@ -855,6 +873,13 @@ typedef struct _afr_local {      gf_boolean_t is_read_txn;      afr_inode_ctx_t *inode_ctx; + +    /*For thin-arbiter transactions.*/ +    unsigned char ta_child_up; +    struct list_head ta_waitq; +    struct list_head ta_onwireq; +    afr_ta_fop_state_t fop_state; +    int ta_failed_subvol;  } afr_local_t;  typedef struct afr_spbc_timeout { @@ -1289,4 +1314,10 @@ __afr_get_up_children_count(afr_private_t *priv);  call_frame_t *  afr_ta_frame_create(xlator_t *this); + +gf_boolean_t +afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local); + +void +afr_ta_lock_release_synctask(xlator_t *this);  #endif /* __AFR_H__ */  | 
