diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr-read-txn.c')
| -rw-r--r-- | xlators/cluster/afr/src/afr-read-txn.c | 249 | 
1 files changed, 230 insertions, 19 deletions
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index a66a6660b0d..945c050da03 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -30,6 +30,28 @@ afr_pending_read_decrement (afr_private_t *priv, int child_index)          GF_ATOMIC_DEC(priv->pending_reads[child_index]);  } +static gf_boolean_t +afr_ta_dict_contains_pending_xattr (dict_t *dict, afr_private_t *priv, +                                    int child) +{ +        int     *pending                     = NULL; +        int     ret                          = 0; +        int     i                            = 0; + +        ret = dict_get_ptr (dict, priv->pending_key[child], (void *)&pending); +        if (ret == 0) { +                for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { +                        /* Not doing a ntoh32(pending) as we just want to check +                         * if it is non-zero or not. */ +                        if (pending[i]) { +                                return _gf_true; +                        } +                } +        } + +        return _gf_false; +} +  void  afr_read_txn_wind (call_frame_t *frame, xlator_t *this, int subvol)  { @@ -81,21 +103,207 @@ afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)  	return 0;  } +static int +afr_ta_read_txn_done (int ret, call_frame_t *ta_frame, void *opaque) +{ +        STACK_DESTROY(ta_frame->root); +        return 0; +} + +static int +afr_ta_read_txn (void *opaque) +{ +        call_frame_t    *frame             = NULL; +        xlator_t        *this              = NULL; +        int             read_subvol        = -1; +        int             up_child           = AFR_CHILD_UNKNOWN; +        int             possible_bad_child = AFR_CHILD_UNKNOWN; +        int             ret                = 0; +        int             op_errno           = ENOMEM; +        afr_local_t     *local             = NULL; +        afr_private_t   *priv              = NULL; +        struct gf_flock flock              = {0, }; +        dict_t          *xdata_req         = NULL; +        dict_t          *xdata_rsp         = NULL; +        int             **pending          = NULL; +        loc_t           loc                = {0,}; + +        frame = (call_frame_t *)opaque; +        this = frame->this; +        local = frame->local; +        priv = this->private; + +        if (local->child_up[AFR_CHILD_ZERO]) { +                up_child = AFR_CHILD_ZERO; +                possible_bad_child = AFR_CHILD_ONE; +        } else if (local->child_up[AFR_CHILD_ONE]) { +                up_child = AFR_CHILD_ONE; +                possible_bad_child = AFR_CHILD_ZERO; +        } + +        GF_ASSERT (up_child != AFR_CHILD_UNKNOWN); + +        /* Query the up_child to see if it blames the down one. */ +        xdata_req = dict_new(); +        if (!xdata_req) +                goto out; + +        pending = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); +        if (!pending) +                goto out; + +        ret = afr_set_pending_dict (priv, xdata_req, pending); +        if (ret < 0) +                goto out; + +        if (local->fd) { +                ret = syncop_fxattrop (priv->children[up_child], local->fd, +                                       GF_XATTROP_ADD_ARRAY, xdata_req, NULL, +                                       &xdata_rsp, NULL); +        } else { +               ret = syncop_xattrop (priv->children[up_child], &local->loc, +                                     GF_XATTROP_ADD_ARRAY, xdata_req, NULL, +                                     &xdata_rsp, NULL); +        } +        if (ret || !xdata_rsp) { +                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, +                        "Failed xattrop for gfid %s on %s", +                        uuid_utoa (local->inode->gfid), +                        priv->children[up_child]->name); +                op_errno = -ret; +                goto out; +        } + +        if (afr_ta_dict_contains_pending_xattr (xdata_rsp, priv, +                                                possible_bad_child)) { +                read_subvol = up_child; +                goto out; +        } +        dict_unref (xdata_rsp); +        /* Query thin-arbiter to see if it blames any data brick. */ +        ret = afr_fill_ta_loc (this, &loc); +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, +                        "Failed to populate thin-arbiter loc for: %s.", +                        loc.name); +                goto out; +        } +        flock.l_type = F_WRLCK;/*start and length are already zero. */ +        ret = syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX], +                              AFR_TA_DOM_MODIFY, &loc, F_SETLKW, &flock, +                              NULL, NULL); +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, +                        "gfid:%s: Failed to get AFR_TA_DOM_MODIFY lock on %s.", +                        uuid_utoa (local->inode->gfid), +                        priv->pending_key[THIN_ARBITER_BRICK_INDEX]); +                op_errno = -ret; +                goto out; +        } + +        ret = syncop_xattrop (priv->children[THIN_ARBITER_BRICK_INDEX], &loc, +                              GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp, +                              NULL); +        if (ret || !xdata_rsp) { +                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, +                        "gfid:%s: Failed xattrop on %s.", +                        uuid_utoa (local->inode->gfid), +                        priv->pending_key[THIN_ARBITER_BRICK_INDEX]); +                op_errno = -ret; +                goto unlock; +        } + +        if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, up_child)) { +                read_subvol = up_child; +        } else { +                gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB, +                        "Failing read for gfid %s since good brick %s is down", +                        uuid_utoa (local->inode->gfid), +                        priv->children[possible_bad_child]->name); +                op_errno = EIO; +        } + +unlock: +        flock.l_type = F_UNLCK; +        ret = syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX], +                              AFR_TA_DOM_MODIFY, &loc, F_SETLK, &flock, +                              NULL, NULL); +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, +                        "gfid:%s: Failed to unlock AFR_TA_DOM_MODIFY lock on " +                        "%s.", uuid_utoa (local->inode->gfid), +                        priv->pending_key[THIN_ARBITER_BRICK_INDEX]); +        } +out: +        if (xdata_req) +                dict_unref(xdata_req); +        if (xdata_rsp) +                dict_unref(xdata_rsp); +        if (pending) +                afr_matrix_cleanup (pending, priv->child_count); +        loc_wipe (&loc); + +        if (read_subvol == -1) { +                local->op_ret = -1; +                local->op_errno = op_errno; +        } +        afr_read_txn_wind (frame, this, read_subvol); +        return ret; +} + +void +afr_ta_read_txn_synctask (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t *ta_frame  = NULL; +        afr_local_t  *local     = NULL; +        int          ret        = 0; + +        local = frame->local; +        ta_frame = afr_ta_frame_create(this); +        if (!ta_frame) { +                local->op_ret = -1; +                local->op_errno = ENOMEM; +                goto out; +        } +        ret = synctask_new (this->ctx->env, afr_ta_read_txn, +                            afr_ta_read_txn_done, ta_frame, frame); +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, ENOMEM, +                        AFR_MSG_THIN_ARB, "Failed to launch " +                        "afr_ta_read_txn synctask for gfid %s.", +                        uuid_utoa(local->inode->gfid)); +                local->op_ret = -1; +                local->op_errno = ENOMEM; +                STACK_DESTROY(ta_frame->root); +                goto out; +        } +        return; +out: +        afr_read_txn_wind (frame, this, -1); +} +  int  afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)  { -	afr_local_t *local = NULL; -	int read_subvol = 0; -	inode_t *inode = NULL; -	int ret = -1; -        int spb_choice = -1; +        afr_private_t   *priv           = NULL; +        afr_local_t     *local          = NULL; +        int             read_subvol     = -1; +        inode_t         *inode          = NULL; +        int             ret             = -1; +        int             spb_choice      = -1;  	local = frame->local;  	inode = local->inode; +        priv = this->private;          if (err) { -                read_subvol = -1; -                goto readfn; +                if (!priv->thin_arbiter_count) +                        goto readfn; +                if (err != EINVAL) +                        goto readfn; +                /* We need to query the good bricks and/or thin-arbiter.*/ +                afr_ta_read_txn_synctask (frame, this); +                return 0;          }  	read_subvol = afr_read_subvol_select_by_policy (inode, this, @@ -127,7 +335,6 @@ readfn:  	return 0;  } -  int  afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol)  { @@ -175,7 +382,6 @@ afr_read_txn_wipe (call_frame_t *frame, xlator_t *this)  	}  } -  /*    afr_read_txn: @@ -207,13 +413,13 @@ int  afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,  	      afr_read_txn_wind_t readfn, afr_transaction_type type)  { -	afr_local_t *local = NULL; -	afr_private_t *priv = NULL; -        unsigned char *data = NULL; -        unsigned char *metadata = NULL; -	int read_subvol = -1; -	int event_generation = 0; -	int ret = -1; +        afr_local_t     *local           = NULL; +        afr_private_t   *priv            = NULL; +        unsigned char   *data            = NULL; +        unsigned char   *metadata        = NULL; +        int             read_subvol      = -1; +        int             event_generation = 0; +        int             ret              = -1;  	priv = this->private;  	local = frame->local; @@ -225,21 +431,26 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,  	local->readfn = readfn;  	local->inode = inode_ref (inode);          local->is_read_txn = _gf_true; +        local->transaction.type = type;          if (priv->quorum_count && !afr_has_quorum (local->child_up, this)) {                  local->op_ret = -1;                  local->op_errno = afr_quorum_errno(priv); -                read_subvol = -1;                  goto read;          }          if (!afr_is_consistent_io_possible (local, priv, &local->op_errno)) {                  local->op_ret = -1; -                read_subvol = -1;                  goto read;          } -	local->transaction.type = type; +        if (priv->thin_arbiter_count && +            AFR_COUNT (local->child_up, priv->child_count) != +                       priv->child_count) { +                afr_ta_read_txn_synctask (frame, this); +                return 0; +        } +          ret = afr_inode_read_subvol_get (inode, this, data, metadata,                                           &event_generation);  	if (ret == -1)  | 
