diff options
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 242 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-data.h | 25 | 
2 files changed, 192 insertions, 75 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index de0e597d124..58cfc732ced 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -711,8 +711,7 @@ void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags,      link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0;      link->base = base; -    lock->refs++; -    lock->inserted++; +    lock->refs_pending++;  }  void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc, @@ -1347,6 +1346,7 @@ ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list)          list_move_tail(&link->wait_list, list);          list_add_tail(&link->owner_list, &lock->owners); +        lock->refs_owners++;          ec_lock_update_fd(lock, fop);      } @@ -1478,6 +1478,8 @@ ec_lock_assign_owner(ec_lock_link_t *link)      ec_lock_link_t *timer_link = NULL;      gf_boolean_t assigned = _gf_false; +    /* The link cannot be in any list because we have just finished preparing +     * it. */      GF_ASSERT(list_empty(&link->wait_list));      fop = link->fop; @@ -1485,27 +1487,85 @@ ec_lock_assign_owner(ec_lock_link_t *link)      LOCK(&lock->loc.inode->lock); -    GF_ASSERT (lock->inserted > 0); -    lock->inserted--; +    /* Since the link has just been prepared but it's not active yet, the +     * refs_pending must be one at least (the ref owned by this link). */ +    GF_ASSERT (lock->refs_pending > 0); +    /* The link is not pending any more. It will be assigned to the owner, +     * waiting or frozen list. */ +    lock->refs_pending--;      if (lock->release) {          ec_trace("LOCK_QUEUE_FREEZE", fop, "lock=%p", lock); -        list_add_tail(&link->wait_list, &lock->frozen); +        /* When lock->release is set, we'll unlock the lock as soon as +         * possible, meaning that we won't use a timer. */ +        GF_ASSERT(lock->timer == NULL); -        /* The lock is frozen, so we move the current reference to refs_frozen. -         * After that, there should remain at least one ref belonging to the -         * lock that is processing the release. */ -        lock->refs--; -        GF_ASSERT(lock->refs > 0); -        lock->refs_frozen++; +        /* The lock is marked to be released. We can still have owners and fops +         * in the waiting ilist f they have been added before the lock has been +         * marked to be released. However new fops are put into the frozen list +         * to wait for the next unlock/lock cycle. */ +        list_add_tail(&link->wait_list, &lock->frozen);          goto unlock;      } +    /* The lock is not marked to be released, so the frozen list should be +     * empty. */ +    GF_ASSERT(list_empty(&lock->frozen)); + +    if (lock->timer != NULL) { +        /* We are trying to acquire a lock that has an unlock timer active. +         * This means that the lock must be idle, i.e. no fop can be in the +         * owner, waiting or frozen lists. It also means that the lock cannot +         * have been marked as being released (this is done without timers) +         * and it must not be exclusive. There should only be one owner +         * reference, but it's possible that some fops are being prepared to +         * use this lock. */ +        GF_ASSERT ((lock->exclusive == 0) && (lock->refs_owners == 1) && +                   list_empty(&lock->owners) && list_empty(&lock->waiting)); + +        /* We take the timer_link before cancelling the timer, since a +         * successful cancellation will destroy it. It must not be NULL +         * because it references the fop responsible for the delayed unlock +         * that we are currently trying to cancel. */ +        timer_link = lock->timer->data; +        GF_ASSERT(timer_link != NULL); + +        gf_timer_call_cancel(fop->xl->ctx, lock->timer); +        ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock); + +        /* We have two options here: +         * +         * 1. The timer has been successfully cancelled. +         * +         *    This is the easiest case and we can continue with the currently +         *    acquired lock. +         * +         * 2. The timer callback has already been fired. +         * +         *    In this case we have not been able to cancel the timer before +         *    the timer callback has been fired, but we also know that +         *    lock->timer != NULL. This means that the timer callback is still +         *    trying to acquire the inode mutex that we currently own. We are +         *    safe until we release it. In this case we can safely clear +         *    lock->timer. This will cause that the timer callback does nothing +         *    once it acquires the mutex. +         * +         * In both cases we must release the owner reference assigned to the +         * fop that was handling the unlock because ec_unlock_now() won't be +         * called for that fop. +         */ +        lock->timer = NULL; +        lock->refs_owners--; +    } +      lock->exclusive |= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;      if (!list_empty(&lock->owners)) { +        /* There are other owners of this lock. We can only take ownership if +         * the lock is already acquired and can be shared. Otherwise we need +         * to wait. */          if (!lock->acquired || (lock->exclusive != 0)) {              ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock); @@ -1513,36 +1573,24 @@ ec_lock_assign_owner(ec_lock_link_t *link)              goto unlock;          } -    } else if (lock->timer != NULL) { -        GF_ASSERT (lock->release == _gf_false); - -        timer_link = lock->timer->data; -        if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) == 0) { -            ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock); -            lock->timer = NULL; -            lock->refs--; -            /* There should remain at least 1 ref, the current one. */ -            GF_ASSERT(lock->refs > 0); -        } else { -            /* Timer expired and on the way to unlock. -             * Set lock->release to _gf_true, so that this -             * lock will be put in frozen list*/ -            timer_link = NULL; -            lock->release = _gf_true; -        }      }      list_add_tail(&link->owner_list, &lock->owners); +    lock->refs_owners++;      assigned = _gf_true;  unlock:      if (!assigned) { +        /* We have not been able to take ownership of this lock. The fop must +         * be put to sleep. */          ec_sleep(fop);      }      UNLOCK(&lock->loc.inode->lock); +    /* If we have cancelled the timer, we need to resume the fop that was +     * waiting for it. */      if (timer_link != NULL) {          ec_resume(timer_link->fop, 0);      } @@ -1566,8 +1614,14 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,      ec_trace("LOCK_DONE", fop, "lock=%p", lock); -    GF_ASSERT(!list_empty(&link->owner_list)); +    /* Current link must belong to the owner list of the lock. We don't +     * decrement lock->refs_owners here because the inode mutex is released +     * before ec_unlock() is called and we need to know when the last owner +     * unlocks the lock to do proper cleanup. lock->refs_owners is used for +     * this task. */ +    GF_ASSERT((lock->refs_owners > 0) && !list_empty(&link->owner_list));      list_del_init(&link->owner_list); +      lock->release |= release;      if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) { @@ -1625,6 +1679,7 @@ ec_lock_unfreeze(ec_lock_link_t *link)  {      struct list_head list;      ec_lock_t *lock; +    gf_boolean_t destroy = _gf_false;      lock = link->lock; @@ -1632,18 +1687,30 @@ ec_lock_unfreeze(ec_lock_link_t *link)      LOCK(&lock->loc.inode->lock); -    lock->acquired = _gf_false; +    /* The lock must be marked to be released here, since we have just released +     * it and any attempt to assign it to more fops must have added them to the +     * frozen list. We can only have one active reference here: the one that +     * is processing this unfreeze. */ +    GF_ASSERT(lock->release && (lock->refs_owners == 1));      lock->release = _gf_false; -    lock->refs--; +    lock->refs_owners = 0; -    GF_ASSERT (lock->refs == lock->inserted); -    GF_ASSERT(lock->exclusive == 0); -    GF_ASSERT(list_empty(&lock->waiting) && list_empty(&lock->owners)); +    lock->acquired = _gf_false; + +    /* We are unfreezing a lock. This means that the lock has already been +     * released. In this state it shouldn't be exclusive nor have a pending +     * timer nor have any owner, and the waiting list should be empty. Only +     * the frozen list can contain some fop. */ +    GF_ASSERT((lock->exclusive == 0) && (lock->timer == NULL) && +              list_empty(&lock->waiting) && list_empty(&lock->owners)); +    /* We move all frozen fops to the waiting list. */      list_splice_init(&lock->frozen, &lock->waiting); -    lock->refs += lock->refs_frozen; -    lock->refs_frozen = 0; -    if (lock->refs == 0) { + +    /* If we don't have any fop waiting nor there are any prepared fops using +     * this lock, we can finally dispose it. */ +    destroy = list_empty(&lock->waiting) && (lock->refs_pending == 0); +    if (destroy) {          ec_trace("LOCK_DESTROY", link->fop, "lock=%p", lock);          lock->ctx->inode_lock = NULL; @@ -1657,7 +1724,7 @@ ec_lock_unfreeze(ec_lock_link_t *link)      ec_lock_resume_shared(&list); -    if (lock->refs == 0) { +    if (destroy) {          ec_lock_destroy(lock);      }  } @@ -1770,9 +1837,6 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,      fop = link->fop; -    GF_ASSERT(version[0] < 0x100000000); -    GF_ASSERT(version[1] < 0x100000000); -      ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld",               version[0], version[1], size, dirty[0], dirty[1]); @@ -1814,7 +1878,7 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,          }      } -    /* If config information is not know, we request it now. */ +    /* If config information is not known, we request it now. */      if ((lock->loc.inode->ia_type == IA_IFREG) && !ctx->have_config) {          /* A failure requesting this xattr is ignored because it's not           * absolutely required right now. */ @@ -1850,6 +1914,13 @@ out:      gf_msg (fop->xl->name, GF_LOG_ERROR, -err, EC_MSG_SIZE_VERS_UPDATE_FAIL,              "Unable to update version and size"); + +    if ((fop->parent->id != GF_FOP_FLUSH) && +        (fop->parent->id != GF_FOP_FSYNC) && +        (fop->parent->id != GF_FOP_FSYNCDIR)) { +        ec_unlock_lock(fop->data); +    } +  }  gf_boolean_t @@ -1900,7 +1971,6 @@ ec_unlock_now(ec_lock_link_t *link)  void  ec_unlock_timer_del(ec_lock_link_t *link)  { -        int32_t before = 0;          ec_lock_t *lock;          inode_t *inode;          gf_boolean_t now = _gf_false; @@ -1922,22 +1992,24 @@ ec_unlock_timer_del(ec_lock_link_t *link)          if (lock->timer != NULL) {                  ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock); +                /* The unlock timer has expired without anyone cancelling it. +                 * This means that it shouldn't have any owner, and the +                 * waiting and frozen lists should be empty. It shouldn't have +                 * been marked as release nor be exclusive either. It must have +                 * only one owner reference, but there can be fops being +                 * prepared though. */ +                GF_ASSERT(!lock->release && (lock->exclusive == 0) && +                          (lock->refs_owners == 1) && +                          list_empty(&lock->owners) && +                          list_empty(&lock->waiting) && +                          list_empty(&lock->frozen)); +                  gf_timer_call_cancel(link->fop->xl->ctx, lock->timer);                  lock->timer = NULL; +                /* Any fop being processed from now on, will need to wait +                 * until the next unlock/lock cycle. */                  lock->release = now = _gf_true; - -                /* TODO: If the assertion is really true, following code is -                 *       not needed. */ -                GF_ASSERT(list_empty(&lock->waiting)); - -                before = lock->refs + lock->refs_frozen; -                list_splice_init(&lock->waiting, &lock->frozen); -                lock->refs_frozen += lock->refs - lock->inserted - 1; -                lock->refs = 1 + lock->inserted; -                /* We moved around the locks, so total number of locks shouldn't -                 * change by this operation*/ -                GF_ASSERT (before == (lock->refs + lock->refs_frozen));          }          UNLOCK(&inode->lock); @@ -1961,24 +2033,50 @@ void ec_unlock_timer_add(ec_lock_link_t *link)      LOCK(&lock->loc.inode->lock); -    GF_ASSERT(lock->timer == NULL); +    /* We are trying to unlock the lock. We can have multiple scenarios here, +     * but all of them need to have lock->timer == NULL: +     * +     * 1. There are other owners currently running that can call ec_unlock(). +     * +     *    None of them can have started the timer until the last one. But this +     *    call should be the consequence of this lastest one. +     * +     * 2. There are fops in the waiting or frozen lists. +     * +     *    These fops cannot call ec_unlock(). So we should be here. +     * +     * We must reach here with at least one owner reference. +     */ +    GF_ASSERT((lock->timer == NULL) && (lock->refs_owners > 0)); -    if ((lock->refs - lock->inserted) > 1) { +    /* If the fop detects that a heal is needed, we mark the lock to be +     * released as soon as possible. */ +    lock->release |= ec_fop_needs_heal(fop); + +    if (lock->refs_owners > 1) {          ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock); -        lock->refs--; +        /* If there are other owners we cannot do anything else with the lock. +         * Note that the current fop has already been removed from the owners +         * list in ec_lock_reuse(). */ +        lock->refs_owners--;          UNLOCK(&lock->loc.inode->lock);      } else if (lock->acquired) { -        ec_t *ec = fop->xl->private; +        /* There are no other owners and the lock is acquired. If there were +         * fops waiting, at least one of them should have been promoted to an +         * owner, so the waiting list should be empty. */ +        GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting)); -        GF_ASSERT(list_empty(&lock->owners)); +        ec_t *ec = fop->xl->private; +        /* If everything goes as expected this fop will be put to sleep until +         * the timer callback is executed. */          ec_sleep(fop); -        /* If healing is needed, the lock needs to be released due to -         * contention, or ec is shutting down, do not delay lock release. */ -        if (!lock->release && !ec_fop_needs_heal(fop) && !ec->shutdown) { +        /* If the lock needs to be released, or ec is shutting down, do not +         * delay lock release. */ +        if (!lock->release && !ec->shutdown) {              ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock,                       lock->release); @@ -1989,9 +2087,10 @@ void ec_unlock_timer_add(ec_lock_link_t *link)              if (lock->timer == NULL) {                  gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,                         EC_MSG_UNLOCK_DELAY_FAILED, -                       "Unable to delay an " -                       "unlock"); +                       "Unable to delay an unlock"); +                /* We are unable to create a new timer. We immediately release +                 * the lock. */                  lock->release = now = _gf_true;              }          } else { @@ -2006,10 +2105,17 @@ void ec_unlock_timer_add(ec_lock_link_t *link)              ec_unlock_now(link);          }      } else { +        /* There are no owners and the lock is not acquired. This can only +         * happen if a lock attempt has failed and we get to the unlock step +         * of the fop. As in the previous case, the waiting list must be +         * empty. */ +        GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting)); + +        /* We need to mark the lock to be released to correctly handle fops +         * that may get in after we release the inode mutex but before +         * ec_lock_unfreeze() is processed. */          lock->release = _gf_true; -        GF_ASSERT(list_empty(&lock->owners)); -          UNLOCK(&lock->loc.inode->lock);          ec_lock_unfreeze(link); @@ -2052,7 +2158,7 @@ void ec_lock_reuse(ec_fop_data_t *fop)              }          }      } else { -        /* If eager lock is disabled or If we haven't get +        /* If eager lock is disabled or if we haven't get           * an answer with enough quorum, we always release           * the lock. */          release = _gf_true; diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index 9107b4b156e..f4214ecfed7 100644 --- a/xlators/cluster/ec/src/ec-data.h +++ b/xlators/cluster/ec/src/ec-data.h @@ -139,17 +139,28 @@ struct _ec_lock  {      ec_inode_t        *ctx;      gf_timer_t        *timer; -    struct list_head   owners;  /* List of owners of this lock. */ -    struct list_head   waiting; /* Queue of requests being serviced. */ -    struct list_head   frozen;  /* Queue of requests that will be serviced in -                                   the next unlock/lock cycle. */ + +    /* List of owners of this lock. All fops added to this list are running +     * concurrently. */ +    struct list_head   owners; + +    /* List of fops waiting to be an owner of the lock. Fops are added to this +     * list when the current owner has an incompatible access (shared vs +     * exclusive) or the lock is not acquired yet. */ +    struct list_head   waiting; + +    /* List of fops that will wait until the next unlock/lock cycle. This +     * happens when the currently acquired lock is decided to be released as +     * soon as possible. In this case, all frozen fops will be continued only +     * after the lock is reacquired. */ +    struct list_head   frozen; +      int32_t            exclusive;      uintptr_t          mask;      uintptr_t          good_mask;      uintptr_t          healing; -    int32_t            refs; -    int32_t            refs_frozen; -    int32_t            inserted; +    uint32_t           refs_owners;  /* Refs for fops owning the lock */ +    uint32_t           refs_pending; /* Refs assigned to fops being prepared */      gf_boolean_t       acquired;      gf_boolean_t       getting_size;      gf_boolean_t       release;  | 
