diff options
| author | Ravishankar N <ravishankar@redhat.com> | 2019-04-09 09:44:33 +0530 | 
|---|---|---|
| committer | Ravishankar N <ravishankar@redhat.com> | 2019-05-15 04:16:52 +0000 | 
| commit | 9f225fa2c419b3ecd17ac49bef6d727e1fc55fde (patch) | |
| tree | 31adeaef9424012ce760b64773ba7288ece71e91 | |
| parent | a1fa0379b7ae059a9fbce737cd477407ab082c05 (diff) | |
afr: thin-arbiter lock release fixes
- pass fop state instead of afr local to
afr_ta_dom_lock_check_and_release()
- avoid afr_lock_release_synctask() being called simultaneosuly from
notify code path and transaction (post-op) code path due to races.
- Check if the post-op on TA is valid based on event_gen checks.
- Invalidate in-memory information when we get TA child down.
Note: Thi patch addresses some pending review comments of commit
053b1309dc8fbc05fcde5223e734da9f694cf5cc
(https://review.gluster.org/#/c/glusterfs/+/20095/)
fixes: bz#1709130
Change-Id: I2ccd7e1b53362f9f3fed8680aecb23b5011eb18c
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
(cherry picked from commit 9ab2747da78061882f6734df4b265bce11adaef1)
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 12 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 122 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 6 | 
3 files changed, 93 insertions, 47 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 8e8d7eb7473..5d56efca777 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -5366,6 +5366,11 @@ afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall)      }      LOCK(&priv->lock);      { +        if (priv->release_ta_notify_dom_lock == _gf_true) { +            /* Ignore multiple release requests from shds.*/ +            UNLOCK(&priv->lock); +            return; +        }          priv->release_ta_notify_dom_lock = _gf_true;          inmem_count = priv->ta_in_mem_txn_count;          onwire_count = priv->ta_on_wire_txn_count; @@ -5373,7 +5378,7 @@ afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall)      UNLOCK(&priv->lock);      if (inmem_count || onwire_count)          /* lock release will happen in txn code path after -         * inflight or on-wire txns are over.*/ +         * in-memory or on-wire txns are over.*/          return;      afr_ta_lock_release_synctask(this); @@ -5531,6 +5536,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)                  if (priv->thin_arbiter_count &&                      (idx == AFR_CHILD_THIN_ARBITER)) {                      priv->ta_child_up = 1; +                    priv->ta_event_gen++;                      break;                  }                  __afr_handle_child_up_event(this, child_xlator, idx, @@ -5542,6 +5548,8 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)                  if (priv->thin_arbiter_count &&                      (idx == AFR_CHILD_THIN_ARBITER)) {                      priv->ta_child_up = 0; +                    priv->ta_event_gen++; +                    afr_ta_locked_priv_invalidate(priv);                      break;                  }                  __afr_handle_child_down_event(this, child_xlator, idx, @@ -5702,6 +5710,8 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno)          local->ta_child_up = priv->ta_child_up;          local->ta_failed_subvol = AFR_CHILD_UNKNOWN;          local->read_txn_query_child = AFR_CHILD_UNKNOWN; +        local->ta_event_gen = priv->ta_event_gen; +        local->fop_state = TA_SUCCESS;      }      local->is_new_entry = _gf_false; diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 4a34d838596..bdc4bfc0b10 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -74,6 +74,14 @@ afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this);  static void  afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno); +void +afr_ta_locked_priv_invalidate(afr_private_t *priv) +{ +    priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; +    priv->release_ta_notify_dom_lock = _gf_false; +    priv->ta_notify_dom_lock_offset = 0; +} +  static void  afr_ta_process_waitq(xlator_t *this)  { @@ -127,17 +135,16 @@ afr_release_notify_lock_for_ta(void *opaque)      flock.l_len = 1;      ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],                           AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL); -    if (!ret) { -        LOCK(&priv->lock); -        priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; -        priv->release_ta_notify_dom_lock = _gf_false; -        priv->ta_notify_dom_lock_offset = 0; -        UNLOCK(&priv->lock); -    } else { +    if (ret) {          gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,                 "Failed to unlock AFR_TA_DOM_NOTIFY lock.");      } +    LOCK(&priv->lock); +    { +        afr_ta_locked_priv_invalidate(priv); +    } +    UNLOCK(&priv->lock);  out:      loc_wipe(&loc);      return ret; @@ -665,7 +672,7 @@ afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending)  }  static void -afr_ta_dom_lock_check_and_release(afr_local_t *local, xlator_t *this) +afr_ta_dom_lock_check_and_release(afr_ta_fop_state_t fop_state, xlator_t *this)  {      afr_private_t *priv = this->private;      unsigned int inmem_count = 0; @@ -675,17 +682,25 @@ afr_ta_dom_lock_check_and_release(afr_local_t *local, xlator_t *this)      LOCK(&priv->lock);      {          /*Once we get notify lock release upcall notification, -         if two fop states are non empty/non zero, we will -         not release lock. -         1 - If anything in memory txn -         2 - If anything in onwire or onwireq +         if any of the fop state counters are non-zero, we will +         not release the lock.           */ -        if (local->fop_state == TA_INFO_IN_MEMORY_SUCCESS) { -            inmem_count = --priv->ta_in_mem_txn_count; -        } else { -            inmem_count = priv->ta_in_mem_txn_count; -        }          onwire_count = priv->ta_on_wire_txn_count; +        inmem_count = priv->ta_in_mem_txn_count; +        switch (fop_state) { +            case TA_GET_INFO_FROM_TA_FILE: +                onwire_count = --priv->ta_on_wire_txn_count; +                break; +            case TA_INFO_IN_MEMORY_SUCCESS: +            case TA_INFO_IN_MEMORY_FAILED: +                inmem_count = --priv->ta_in_mem_txn_count; +                break; +            case TA_WAIT_FOR_NOTIFY_LOCK_REL: +                GF_ASSERT(0); +                break; +            case TA_SUCCESS: +                break; +        }          release = priv->release_ta_notify_dom_lock;      }      UNLOCK(&priv->lock); @@ -697,7 +712,7 @@ afr_ta_dom_lock_check_and_release(afr_local_t *local, xlator_t *this)  }  static void -afr_ta_process_onwireq(afr_local_t *local, xlator_t *this) +afr_ta_process_onwireq(afr_ta_fop_state_t fop_state, xlator_t *this)  {      afr_private_t *priv = this->private;      afr_local_t *entry = NULL; @@ -710,15 +725,6 @@ afr_ta_process_onwireq(afr_local_t *local, xlator_t *this)      LOCK(&priv->lock);      { -        if (--priv->ta_on_wire_txn_count == 0) { -            UNLOCK(&priv->lock); -            /*Only one write fop came and after taking notify -             *lock and before doing xattrop, it has received -             *lock contention upcall, so this is the only place -             *to find this out and release the lock*/ -            afr_ta_dom_lock_check_and_release(local, this); -            return; -        }          bad_child = priv->ta_bad_child_index;          if (bad_child == AFR_CHILD_UNKNOWN) {              /*The previous on-wire ta_post_op was a failure. Just dequeue @@ -739,9 +745,6 @@ afr_ta_process_onwireq(afr_local_t *local, xlator_t *this)          while (!list_empty(&onwireq)) {              entry = list_entry(onwireq.next, afr_local_t, ta_onwireq);              list_del_init(&entry->ta_onwireq); -            LOCK(&priv->lock); -            --priv->ta_on_wire_txn_count; -            UNLOCK(&priv->lock);              if (entry->ta_failed_subvol == bad_child) {                  afr_post_op_handle_success(entry->transaction.frame, this);              } else { @@ -764,7 +767,7 @@ afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)      if (priv->thin_arbiter_count) {          /*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */ -        afr_ta_dom_lock_check_and_release(frame->local, this); +        afr_ta_dom_lock_check_and_release(local->fop_state, this);      }      /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */ @@ -1172,12 +1175,23 @@ afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque)  {      xlator_t *this = NULL;      afr_local_t *local = NULL; +    call_frame_t *txn_frame = NULL; +    afr_ta_fop_state_t fop_state;      local = (afr_local_t *)opaque; +    fop_state = local->fop_state; +    txn_frame = local->transaction.frame;      this = frame->this; +    if (ret == 0) { +        /*Mark pending xattrs on the up data brick.*/ +        afr_post_op_handle_success(txn_frame, this); +    } else { +        afr_post_op_handle_failure(txn_frame, this, -ret); +    } +      STACK_DESTROY(frame->root); -    afr_ta_process_onwireq(local, this); +    afr_ta_process_onwireq(fop_state, this);      return 0;  } @@ -1210,13 +1224,25 @@ out:      return changelog;  } +static void +afr_ta_locked_xattrop_validate(afr_private_t *priv, afr_local_t *local, +                               gf_boolean_t *valid) +{ +    if (priv->ta_event_gen > local->ta_event_gen) { +        /* We can't trust the ta's response anymore.*/ +        afr_ta_locked_priv_invalidate(priv); +        *valid = _gf_false; +        return; +    } +    return; +} +  static int  afr_ta_post_op_do(void *opaque)  {      afr_local_t *local = NULL;      afr_private_t *priv = NULL;      xlator_t *this = NULL; -    call_frame_t *txn_frame = NULL;      dict_t *xattr = NULL;      unsigned char *pending = NULL;      int **changelog = NULL; @@ -1227,10 +1253,10 @@ afr_ta_post_op_do(void *opaque)      };      int i = 0;      int ret = 0; +    gf_boolean_t valid = _gf_true;      local = (afr_local_t *)opaque; -    txn_frame = local->transaction.frame; -    this = txn_frame->this; +    this = local->transaction.frame->this;      priv = this->private;      ret = afr_fill_ta_loc(this, &loc); @@ -1268,22 +1294,31 @@ afr_ta_post_op_do(void *opaque)      ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,                           GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL); +    if (ret) { +        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, +               "Post-op on thin-arbiter id file %s failed for gfid %s.", +               priv->pending_key[THIN_ARBITER_BRICK_INDEX], +               uuid_utoa(local->inode->gfid)); +    }      LOCK(&priv->lock);      {          if (ret == 0) {              priv->ta_bad_child_index = failed_subvol;          } else if (ret == -EINVAL) {              priv->ta_bad_child_index = success_subvol; +            ret = -EIO; /* TA failed the fop. Return EIO to application. */          } + +        afr_ta_locked_xattrop_validate(priv, local, &valid);      }      UNLOCK(&priv->lock); -    if (ret) { -        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, -               "Post-op on thin-arbiter id file %s failed for gfid %s.", +    if (valid == _gf_false) { +        gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB, +               "Post-op on thin-arbiter id file %s for gfid %s invalidated due " +               "to event-gen mismatch.",                 priv->pending_key[THIN_ARBITER_BRICK_INDEX],                 uuid_utoa(local->inode->gfid)); -        if (ret == -EINVAL) -            ret = -EIO; /* TA failed the fop. Return EIO to application. */ +        ret = -EIO;      }      afr_ta_post_op_unlock(this, &loc); @@ -1296,12 +1331,6 @@ out:      loc_wipe(&loc); -    if (ret == 0) { -        /*Mark pending xattrs on the up data brick.*/ -        afr_post_op_handle_success(local->transaction.frame, this); -    } else { -        afr_post_op_handle_failure(local->transaction.frame, this, -ret); -    }      return ret;  } @@ -1360,6 +1389,7 @@ afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local,              /* Post-op on TA not needed as the fop succeeded only on the               * in-memory bad data brick and not the good one. Fail the fop.*/              local->fop_state = TA_INFO_IN_MEMORY_FAILED; +            priv->ta_in_mem_txn_count++;          }      }      UNLOCK(&priv->lock); diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 6f91c9172f1..88e327bb00e 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -126,6 +126,7 @@ typedef enum {                                    *on BAD brick - Success*/      TA_INFO_IN_MEMORY_FAILED,    /*Bad brick info is in memory and fop failed                                    *on GOOD brick - Failed*/ +    TA_SUCCESS,                  /*FOP succeeded on both data bricks.*/  } afr_ta_fop_state_t;  struct afr_nfsd { @@ -148,6 +149,7 @@ typedef struct _afr_private {      uuid_t ta_gfid;      unsigned char ta_child_up;      int ta_bad_child_index; +    int ta_event_gen;      off_t ta_notify_dom_lock_offset;      gf_boolean_t release_ta_notify_dom_lock;      unsigned int ta_in_mem_txn_count; @@ -887,6 +889,7 @@ typedef struct _afr_local {      struct list_head ta_onwireq;      afr_ta_fop_state_t fop_state;      int ta_failed_subvol; +    int ta_event_gen;      gf_boolean_t is_new_entry;  } afr_local_t; @@ -1326,6 +1329,9 @@ afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local);  void  afr_ta_lock_release_synctask(xlator_t *this); +void +afr_ta_locked_priv_invalidate(afr_private_t *priv); +  gf_boolean_t  afr_lookup_has_quorum(call_frame_t *frame, xlator_t *this,                        unsigned char *subvols);  | 
