22 files changed, 1203 insertions, 264 deletions
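Overview (editor's note): this change adds two new upcall events, GF_UPCALL_INODELK_CONTENTION and GF_UPCALL_ENTRYLK_CONTENTION. When features.locks-notify-contention is enabled, the locks xlator notifies the client that holds a granted inodelk/entrylk whenever another client requests a conflicting lock, rate-limited per lock by the configured delay (priv->notify_contention_delay). The disperse (EC) xlator handles the inodelk contention event by releasing its eager lock early instead of keeping it for the full eager-lock-timeout / other-eager-lock-timeout. The self-contained sketch below models only the rate-limiting decision made by __inodelk_needs_contention_notify() / __entrylk_needs_contention_notify() in the patch; the toy_lock type, the queued flag and the needs_contention_notify() helper are illustrative stand-ins and not part of the GlusterFS API (the real code keeps pending locks on a "contend" list instead of a boolean).

/* Simplified, self-contained model of the contention-notification throttle
 * added in inodelk.c/entrylk.c below. All names here are stand-ins for the
 * real pl_inode_lock_t/pl_entry_lock_t structures. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct toy_lock {
        struct timespec contention_time; /* last time a notification was queued */
        bool            queued;          /* already on the pending-notify list */
};

/* Returns true if a contention notification should be queued for this lock. */
static bool
needs_contention_notify(struct toy_lock *lock, const struct timespec *now,
                        int64_t notify_contention_delay)
{
        int64_t elapsed;

        /* Already queued for notification: nothing else to do. */
        if (lock->queued)
                return false;

        /* Whole seconds elapsed since the last notification, rounded down. */
        elapsed = now->tv_sec - lock->contention_time.tv_sec;
        if (now->tv_nsec < lock->contention_time.tv_nsec)
                elapsed--;

        /* Rate-limit: at most one notification per delay window per lock. */
        if (elapsed < notify_contention_delay)
                return false;

        lock->contention_time = *now;
        lock->queued = true;

        return true;
}

int
main(void)
{
        struct toy_lock lock = { { 0, 0 }, false };
        struct timespec now;

        clock_gettime(CLOCK_REALTIME, &now);

        /* First contention on an idle lock: a notification is queued. */
        printf("first check:  %d\n", needs_contention_notify(&lock, &now, 5));

        lock.queued = false; /* pretend the notification has been sent */

        /* A second conflict within the delay window is suppressed. */
        printf("second check: %d\n", needs_contention_notify(&lock, &now, 5));

        return 0;
}

In the actual xlators this check runs under the inode mutex while scanning granted locks for conflicts; matching locks are collected on a local contend list and the upcalls are sent only after the mutex is dropped, with extra inode and lock references taken so neither can be destroyed in the meantime. The diff for those code paths follows.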
diff --git a/libglusterfs/src/upcall-utils.h b/libglusterfs/src/upcall-utils.h index 3b5dce33e45..48d10382c10 100644 --- a/libglusterfs/src/upcall-utils.h +++ b/libglusterfs/src/upcall-utils.h @@ -63,6 +63,8 @@ typedef enum {          GF_UPCALL_EVENT_NULL,          GF_UPCALL_CACHE_INVALIDATION,          GF_UPCALL_RECALL_LEASE, +        GF_UPCALL_INODELK_CONTENTION, +        GF_UPCALL_ENTRYLK_CONTENTION,  } gf_upcall_event_t;  struct gf_upcall { @@ -88,4 +90,19 @@ struct gf_upcall_recall_lease {          dict_t   *dict;  }; +struct gf_upcall_inodelk_contention { +        struct gf_flock  flock; +        pid_t            pid; +        const char      *domain; +        dict_t          *xdata; +}; + +struct gf_upcall_entrylk_contention { +        uint32_t    type; +        pid_t       pid; +        const char *name; +        const char *domain; +        dict_t     *xdata; +}; +  #endif /* _UPCALL_UTILS_H */ diff --git a/rpc/rpc-lib/src/protocol-common.h b/rpc/rpc-lib/src/protocol-common.h index aee34302205..aafd94400c6 100644 --- a/rpc/rpc-lib/src/protocol-common.h +++ b/rpc/rpc-lib/src/protocol-common.h @@ -150,6 +150,8 @@ enum gf_cbk_procnum {          GF_CBK_CHILD_DOWN,          GF_CBK_RECALL_LEASE,          GF_CBK_STATEDUMP, +        GF_CBK_INODELK_CONTENTION, +        GF_CBK_ENTRYLK_CONTENTION,          GF_CBK_MAXVALUE,  }; diff --git a/rpc/xdr/src/glusterfs3.h b/rpc/xdr/src/glusterfs3.h index 2da5594a347..eef39416b5c 100644 --- a/rpc/xdr/src/glusterfs3.h +++ b/rpc/xdr/src/glusterfs3.h @@ -419,6 +419,162 @@ gf_proto_cache_invalidation_to_upcall (xlator_t *this,          return ret;  } +static inline int +gf_proto_inodelk_contention_to_upcall (struct gfs4_inodelk_contention_req *lc, +                                       struct gf_upcall *gf_up_data) +{ +        struct gf_upcall_inodelk_contention *tmp = NULL; +        xlator_t *this                           = NULL; +        int    ret                               = -1; +        int    op_errno                          = EINVAL; + +        this = THIS; + +        GF_VALIDATE_OR_GOTO(this->name, lc, out); +        GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out); + +        tmp = (struct gf_upcall_inodelk_contention *)gf_up_data->data; + +        gf_uuid_copy(gf_up_data->gfid, (unsigned char *)lc->gfid); + +        gf_proto_flock_to_flock(&lc->flock, &tmp->flock); +        tmp->pid = lc->pid; +        tmp->domain = lc->domain; +        if ((tmp->domain != NULL) && (*tmp->domain == 0)) { +                tmp->domain = NULL; +        } + +        GF_PROTOCOL_DICT_UNSERIALIZE (this, tmp->xdata, lc->xdata.xdata_val, +                                      lc->xdata.xdata_len, ret, op_errno, out); + +        ret = 0; + +out: +        if (ret < 0) { +                ret = -op_errno; +        } + +        return ret; +} + +static inline int +gf_proto_inodelk_contention_from_upcall (xlator_t *this, +                                         struct gfs4_inodelk_contention_req *lc, +                                         struct gf_upcall *gf_up_data) +{ +        struct gf_upcall_inodelk_contention *tmp = NULL; +        int    ret                               = -1; +        int    op_errno                          = EINVAL; + +        GF_VALIDATE_OR_GOTO(this->name, lc, out); +        GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out); + +        tmp = (struct gf_upcall_inodelk_contention *)gf_up_data->data; + +        gf_uuid_copy((unsigned char *)lc->gfid, gf_up_data->gfid); + +        gf_proto_flock_from_flock(&lc->flock, &tmp->flock); +        
lc->pid = tmp->pid; +        lc->domain = (char *)tmp->domain; +        if (lc->domain == NULL) { +                lc->domain = ""; +        } + +        GF_PROTOCOL_DICT_SERIALIZE (this, tmp->xdata, &lc->xdata.xdata_val, +                                    lc->xdata.xdata_len, op_errno, out); + +        ret = 0; + +out: +        if (ret < 0) { +                ret = -op_errno; +        } + +        return ret; +} + +static inline int +gf_proto_entrylk_contention_to_upcall (struct gfs4_entrylk_contention_req *lc, +                                       struct gf_upcall *gf_up_data) +{ +        struct gf_upcall_entrylk_contention *tmp = NULL; +        xlator_t *this                           = NULL; +        int    ret                               = -1; +        int    op_errno                          = EINVAL; + +        this = THIS; + +        GF_VALIDATE_OR_GOTO(this->name, lc, out); +        GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out); + +        tmp = (struct gf_upcall_entrylk_contention *)gf_up_data->data; + +        gf_uuid_copy(gf_up_data->gfid, (unsigned char *)lc->gfid); + +        tmp->type = lc->type; +        tmp->name = lc->name; +        if ((tmp->name != NULL) && (*tmp->name == 0)) { +                tmp->name = NULL; +        } +        tmp->pid = lc->pid; +        tmp->domain = lc->domain; +        if ((tmp->domain != NULL) && (*tmp->domain == 0)) { +                tmp->domain = NULL; +        } + +        GF_PROTOCOL_DICT_UNSERIALIZE (this, tmp->xdata, lc->xdata.xdata_val, +                                      lc->xdata.xdata_len, ret, op_errno, out); + +        ret = 0; + +out: +        if (ret < 0) { +                ret = -op_errno; +        } + +        return ret; +} + +static inline int +gf_proto_entrylk_contention_from_upcall (xlator_t *this, +                                         struct gfs4_entrylk_contention_req *lc, +                                         struct gf_upcall *gf_up_data) +{ +        struct gf_upcall_entrylk_contention *tmp = NULL; +        int    ret                               = -1; +        int    op_errno                          = EINVAL; + +        GF_VALIDATE_OR_GOTO(this->name, lc, out); +        GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out); + +        tmp = (struct gf_upcall_entrylk_contention *)gf_up_data->data; + +        gf_uuid_copy((unsigned char *)lc->gfid, gf_up_data->gfid); + +        lc->type = tmp->type; +        lc->name = (char *)tmp->name; +        if (lc->name == NULL) { +                lc->name = ""; +        } +        lc->pid = tmp->pid; +        lc->domain = (char *)tmp->domain; +        if (lc->domain == NULL) { +                lc->domain = ""; +        } + +        GF_PROTOCOL_DICT_SERIALIZE (this, tmp->xdata, &lc->xdata.xdata_val, +                                    lc->xdata.xdata_len, op_errno, out); + +        ret = 0; + +out: +        if (ret < 0) { +                ret = -op_errno; +        } + +        return ret; +} +  extern int dict_to_xdr (dict_t *this, gfx_dict *xdict);  extern int xdr_to_dict (gfx_dict *xdict, dict_t **to); diff --git a/rpc/xdr/src/glusterfs4-xdr.x b/rpc/xdr/src/glusterfs4-xdr.x index 7396b566fa7..ef0cfde0802 100644 --- a/rpc/xdr/src/glusterfs4-xdr.x +++ b/rpc/xdr/src/glusterfs4-xdr.x @@ -86,3 +86,20 @@ struct gfs4_namelink_req {         string bname<>;         opaque xdata<>;  }; + +struct gfs4_inodelk_contention_req { +        opaque                gfid[16]; +        struct gf_proto_flock flock; +        unsigned int          pid; +        string                
domain<>; +        opaque                xdata<>; +}; + +struct gfs4_entrylk_contention_req { +        opaque                gfid[16]; +        unsigned int          type; +        unsigned int          pid; +        string                name<>; +        string                domain<>; +        opaque                xdata<>; +}; diff --git a/rpc/xdr/src/libgfxdr.sym b/rpc/xdr/src/libgfxdr.sym index 8af956ef5a9..83f1efc732a 100644 --- a/rpc/xdr/src/libgfxdr.sym +++ b/rpc/xdr/src/libgfxdr.sym @@ -155,8 +155,12 @@ xdr_gfs3_xattrop_req  xdr_gfs3_xattrop_rsp  xdr_gfs3_zerofill_req  xdr_gfs3_zerofill_rsp +xdr_gfs4_entrylk_contention_req +xdr_gfs4_entrylk_contention_rsp  xdr_gfs4_icreate_req  xdr_gfs4_icreate_rsp +xdr_gfs4_inodelk_contention_req +xdr_gfs4_inodelk_contention_rsp  xdr_gfs4_namelink_req  xdr_gfs4_namelink_rsp  xdr_gf_set_lk_ver_req diff --git a/tests/basic/ec/lock-contention.t b/tests/basic/ec/lock-contention.t new file mode 100644 index 00000000000..8f86cee16ad --- /dev/null +++ b/tests/basic/ec/lock-contention.t @@ -0,0 +1,62 @@ +#!/bin/bash + +# This test verifies that when 'lock-notify-contention' option is enabled, +# locks xlator actually sends an upcall notification that causes the acquired +# lock from one client to be released before it's supposed to when another +# client accesses the file. + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +function elapsed_time() { +        local start="`date +%s`" + +        if [[ "test" == `cat "$1"` ]]; then +                echo "$((`date +%s` - ${start}))" +        fi +} + +cleanup + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2} +TEST $CLI volume set $V0 performance.stat-prefetch off +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 performance.quick-read off +TEST $CLI volume set $V0 performance.read-ahead off +TEST $CLI volume set $V0 performance.io-cache off +TEST $CLI volume set $V0 features.locks-notify-contention off +TEST $CLI volume set $V0 disperse.eager-lock on +TEST $CLI volume set $V0 disperse.eager-lock-timeout 6 +TEST $CLI volume set $V0 disperse.other-eager-lock on +TEST $CLI volume set $V0 disperse.other-eager-lock-timeout 6 +TEST $CLI volume start $V0 + +TEST $GFS --direct-io-mode=yes --volfile-id=/$V0 --volfile-server=$H0 $M0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M0 + +TEST $GFS --direct-io-mode=yes --volfile-id=/$V0 --volfile-server=$H0 $M1 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M1 + +TEST $(echo "test" >$M0/file) + +# With locks-notify-contention set to off, accessing the file from another +# client should take 6 seconds. Checking against 3 seconds to be safe. +elapsed="$(elapsed_time $M1/file)" +TEST [[ ${elapsed} -ge 3 ]] + +elapsed="$(elapsed_time $M0/file)" +TEST [[ ${elapsed} -ge 3 ]] + +TEST $CLI volume set $V0 features.locks-notify-contention on + +# With locks-notify-contention set to on, accessing the file from another +# client should be fast. Checking against 3 seconds to be safe. 
+elapsed="$(elapsed_time $M1/file)" +TEST [[ ${elapsed} -le 3 ]] + +elapsed="$(elapsed_time $M0/file)" +TEST [[ ${elapsed} -le 3 ]] + +cleanup diff --git a/tests/volume.rc b/tests/volume.rc index 3a48b43d2e0..3ee83624058 100644 --- a/tests/volume.rc +++ b/tests/volume.rc @@ -93,7 +93,8 @@ function remove_brick_status_completed_field {  function get_mount_process_pid {          local vol=$1 -        ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol " | awk '{print $2}' | head -1 +        local mnt=$2 +        ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol .*$mnt" | awk '{print $2}' | head -1  }  function get_nfs_pid () @@ -126,7 +127,8 @@ function generate_statedump {  function generate_mount_statedump {          local vol=$1 -        generate_statedump $(get_mount_process_pid $vol) +        local mnt=$2 +        generate_statedump $(get_mount_process_pid $vol $mnt)  }  function cleanup_mount_statedump { @@ -205,14 +207,16 @@ function ec_child_up_status {          local vol=$1          local dist_id=$2          local brick_id=$(($3 + 1)) -        local mask=$(ec_get_info $vol $dist_id "childs_up_mask" $(generate_mount_statedump $vol)) +        local mnt=$4 +        local mask=$(ec_get_info $vol $dist_id "childs_up_mask" $(generate_mount_statedump $vol $mnt))          echo "${mask: -$brick_id:1}"  }  function ec_child_up_count {          local vol=$1          local dist_id=$2 -        ec_get_info $vol $dist_id "childs_up" $(generate_mount_statedump $vol) +        local mnt=$3 +        ec_get_info $vol $dist_id "childs_up" $(generate_mount_statedump $vol $mnt)  }  function ec_child_up_status_shd { diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index cb627a92c9c..fbc7ac97aa0 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -1847,6 +1847,67 @@ gf_boolean_t ec_lock_acquire(ec_lock_link_t *link)      return _gf_true;  } +static ec_lock_link_t * +ec_lock_timer_cancel(xlator_t *xl, ec_lock_t *lock) +{ +        ec_lock_link_t *timer_link; + +        /* If we don't have any timer, there's nothing to cancel. */ +        if (lock->timer == NULL) { +                return NULL; +        } + +        /* We are trying to access a lock that has an unlock timer active. +         * This means that the lock must be idle, i.e. no fop can be in the +         * owner, waiting or frozen lists. It also means that the lock cannot +         * have been marked as being released (this is done without timers). +         * There should only be one owner reference, but it's possible that +         * some fops are being prepared to use this lock. */ +        GF_ASSERT ((lock->refs_owners == 1) && +                   list_empty(&lock->owners) && list_empty(&lock->waiting)); + +        /* We take the timer_link before cancelling the timer, since a +         * successful cancellation will destroy it. It must not be NULL +         * because it references the fop responsible for the delayed unlock +         * that we are currently trying to cancel. */ +        timer_link = lock->timer->data; +        GF_ASSERT(timer_link != NULL); + +        if (gf_timer_call_cancel(xl->ctx, lock->timer) < 0) { +                /* It's too late to avoid the execution of the timer callback. +                 * Since we need to be sure that the callback has access to all +                 * needed resources, we cannot resume the execution of the +                 * timer fop now. This will be done in the callback. 
*/ +                timer_link = NULL; +        } else { +                /* The timer has been cancelled. The fop referenced by +                 * timer_link holds the last reference. The caller is +                 * responsible to release it when not needed anymore. */ +                ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock); +        } + +        /* We have two options here: +         * +         * 1. The timer has been successfully cancelled. +         * +         *    This is the easiest case and we can continue with the currently +         *    acquired lock. +         * +         * 2. The timer callback has already been fired. +         * +         *    In this case we have not been able to cancel the timer before +         *    the timer callback has been fired, but we also know that +         *    lock->timer != NULL. This means that the timer callback is still +         *    trying to acquire the inode mutex that we currently own. We are +         *    safe until we release it. In this case we can safely clear +         *    lock->timer. This will cause that the timer callback does nothing +         *    once it acquires the mutex. +         */ +        lock->timer = NULL; + +        return timer_link; +} +  static gf_boolean_t  ec_lock_assign_owner(ec_lock_link_t *link)  { @@ -1891,61 +1952,7 @@ ec_lock_assign_owner(ec_lock_link_t *link)       * empty. */      GF_ASSERT(list_empty(&lock->frozen)); -    if (lock->timer != NULL) { -        /* We are trying to acquire a lock that has an unlock timer active. -         * This means that the lock must be idle, i.e. no fop can be in the -         * owner, waiting or frozen lists. It also means that the lock cannot -         * have been marked as being released (this is done without timers). -         * There should only be one owner reference, but it's possible that -         * some fops are being prepared to use this lock. -         */ -        GF_ASSERT ((lock->refs_owners == 1) && -                   list_empty(&lock->owners) && list_empty(&lock->waiting)); - -        /* We take the timer_link before cancelling the timer, since a -         * successful cancellation will destroy it. It must not be NULL -         * because it references the fop responsible for the delayed unlock -         * that we are currently trying to cancel. */ -        timer_link = lock->timer->data; -        GF_ASSERT(timer_link != NULL); - -        if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) < 0) { -            /* It's too late to avoid the execution of the timer callback. -             * Since we need to be sure that the callback has access to all -             * needed resources, we cannot resume the execution of the timer -             * fop now. This will be done in the callback. -             */ -            timer_link = NULL; -        } else { -            /* The timer has been cancelled, so we need to release the owner -             * reference that was held by the fop waiting for the timer. This -             * can be the last reference, but we'll immediately increment it -             * for the current fop, so no need to check it. -             */ -            lock->refs_owners--; - -            ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock); -        } - -        /* We have two options here: -         * -         * 1. The timer has been successfully cancelled. -         * -         *    This is the easiest case and we can continue with the currently -         *    acquired lock. 
-         * -         * 2. The timer callback has already been fired. -         * -         *    In this case we have not been able to cancel the timer before -         *    the timer callback has been fired, but we also know that -         *    lock->timer != NULL. This means that the timer callback is still -         *    trying to acquire the inode mutex that we currently own. We are -         *    safe until we release it. In this case we can safely clear -         *    lock->timer. This will cause that the timer callback does nothing -         *    once it acquires the mutex. -         */ -        lock->timer = NULL; -    } +    timer_link = ec_lock_timer_cancel(fop->xl, lock);      if (!list_empty(&lock->owners)) {          /* There are other owners of this lock. We can only take ownership if @@ -1965,7 +1972,13 @@ ec_lock_assign_owner(ec_lock_link_t *link)      }      list_add_tail(&link->owner_list, &lock->owners); -    lock->refs_owners++; + +    /* If timer_link is not NULL, it means that we have inherited the owner +     * reference assigned to the timer fop. In this case we simply reuse it. +     * Otherwise we need to increase the number of owners. */ +    if (timer_link == NULL) { +            lock->refs_owners++; +    }      assigned = _gf_true; @@ -2383,6 +2396,48 @@ ec_unlock_now(ec_lock_link_t *link)      ec_resume(link->fop, 0);  } +void +ec_lock_release(ec_t *ec, inode_t *inode) +{ +        ec_lock_t *lock; +        ec_inode_t *ctx; +        ec_lock_link_t *timer_link = NULL; + +        LOCK(&inode->lock); + +        ctx = __ec_inode_get(inode, ec->xl); +        if (ctx == NULL) { +                goto done; +        } +        lock = ctx->inode_lock; +        if ((lock == NULL) || !lock->acquired || lock->release) { +                goto done; +        } + +        gf_msg_debug(ec->xl->name, 0, +                     "Releasing inode %p due to lock contention", inode); + +        /* The lock is not marked to be released, so the frozen list should be +         * empty. */ +        GF_ASSERT(list_empty(&lock->frozen)); + +        timer_link = ec_lock_timer_cancel(ec->xl, lock); + +        /* We mark the lock to be released as soon as possible. */ +        lock->release = _gf_true; + +done: +        UNLOCK(&inode->lock); + +        /* If we have cancelled the timer, we need to start the unlock of the +         * inode. If there was a timer but we have been unable to cancel it +         * because it was just triggered, the timer callback will take care +         * of releasing the inode. */ +        if (timer_link != NULL) { +                ec_unlock_now(timer_link); +        } +} +  void ec_unlock_timer_add(ec_lock_link_t *link);  void @@ -2470,9 +2525,60 @@ void ec_unlock_timer_cbk(void *data)          ec_unlock_timer_del(data);  } +static gf_boolean_t +ec_eager_lock_used(ec_t *ec, ec_fop_data_t *fop) +{ +        /* Fops with no locks at this point mean that they are sent as sub-fops +         * of other higher level fops. In this case we simply assume that the +         * parent fop will take correct care of the eager lock. */ +        if (fop->lock_count == 0) { +                return _gf_true; +        } + +        /* We may have more than one lock, but this only happens in the rename +         * fop, and both locks will reference an inode of the same type (a +         * directory in this case), so we only need to check the first lock. 
*/ +        if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) { +                return ec->eager_lock; +        } + +        return ec->other_eager_lock; +} + +static uint32_t +ec_eager_lock_timeout(ec_t *ec, ec_lock_t *lock) +{ +        if (lock->loc.inode->ia_type == IA_IFREG) { +                return ec->eager_lock_timeout; +        } + +        return ec->other_eager_lock_timeout; +} + +static gf_boolean_t +ec_lock_delay_create(ec_lock_link_t *link) +{ +        struct timespec delay; +        ec_fop_data_t *fop = link->fop; +        ec_lock_t *lock = link->lock; + +        delay.tv_sec = ec_eager_lock_timeout(fop->xl->private, lock); +        delay.tv_nsec = 0; +        lock->timer = gf_timer_call_after(fop->xl->ctx, delay, +                                          ec_unlock_timer_cbk, link); +        if (lock->timer == NULL) { +                gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM, +                       EC_MSG_UNLOCK_DELAY_FAILED, +                       "Unable to delay an unlock"); + +                return _gf_false; +        } + +        return _gf_true; +} +  void ec_unlock_timer_add(ec_lock_link_t *link)  { -    struct timespec delay;      ec_fop_data_t *fop = link->fop;      ec_lock_t *lock = link->lock;      gf_boolean_t now = _gf_false; @@ -2526,19 +2632,12 @@ void ec_unlock_timer_add(ec_lock_link_t *link)              ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock,                       lock->release); -            delay.tv_sec = 1; -            delay.tv_nsec = 0; -            lock->timer = gf_timer_call_after(fop->xl->ctx, delay, -                                              ec_unlock_timer_cbk, link); -            if (lock->timer == NULL) { -                gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM, -                       EC_MSG_UNLOCK_DELAY_FAILED, -                       "Unable to delay an unlock"); - +            if (!ec_lock_delay_create(link)) {                  /* We are unable to create a new timer. We immediately release                   * the lock. */                  lock->release = now = _gf_true;              } +          } else {              ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock,                       lock->release); @@ -2583,26 +2682,6 @@ void ec_flush_size_version(ec_fop_data_t * fop)      ec_update_info(&fop->locks[0]);  } -static gf_boolean_t -ec_use_eager_lock(ec_t *ec, ec_fop_data_t *fop) -{ -        /* Fops with no locks at this point mean that they are sent as sub-fops -         * of other higher level fops. In this case we simply assume that the -         * parent fop will take correct care of the eager lock. */ -        if (fop->lock_count == 0) { -                return _gf_true; -        } - -        /* We may have more than one lock, but this only happens in the rename -         * fop, and both locks will reference an inode of the same type (a -         * directory in this case), so we only need to check the first lock. 
*/ -        if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) { -                return ec->eager_lock; -        } - -        return ec->other_eager_lock; -} -  static void  ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe,                   ec_fop_data_t *fop) @@ -2708,7 +2787,7 @@ void ec_lock_reuse(ec_fop_data_t *fop)      ec = fop->xl->private;      cbk = fop->answer; -    if (ec_use_eager_lock(ec, fop) && cbk != NULL) { +    if (ec_eager_lock_used(ec, fop) && cbk != NULL) {          if (cbk->xdata != NULL) {              if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT,                                  &count) == 0) && (count > 1)) { diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index c3e291585ef..99e2f0653be 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -102,6 +102,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags,  void ec_lock(ec_fop_data_t * fop);  void ec_lock_reuse(ec_fop_data_t *fop);  void ec_unlock(ec_fop_data_t * fop); +void ec_lock_release(ec_t *ec, inode_t *inode);  gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,                                 uint64_t *size); diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h index 23dc434bc42..15b4c77abfe 100644 --- a/xlators/cluster/ec/src/ec-types.h +++ b/xlators/cluster/ec/src/ec-types.h @@ -669,6 +669,8 @@ struct _ec {      uint32_t           background_heals;      uint32_t           heal_wait_qlen;      uint32_t           self_heal_window_size; /* max size of read/writes */ +    uint32_t           eager_lock_timeout; +    uint32_t           other_eager_lock_timeout;      struct list_head   pending_fops;      struct list_head   heal_waiting;      struct list_head   healing; diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index 4c80f1283f1..30b0bdcb29c 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -271,6 +271,11 @@ reconfigure (xlator_t *this, dict_t *options)                            bool, failed);          GF_OPTION_RECONF ("other-eager-lock", ec->other_eager_lock, options,                            bool, failed); +        GF_OPTION_RECONF ("eager-lock-timeout", ec->eager_lock_timeout, +                          options, uint32, failed); +        GF_OPTION_RECONF ("other-eager-lock-timeout", +                          ec->other_eager_lock_timeout, options, uint32, +                          failed);          GF_OPTION_RECONF ("background-heals", background_heals, options,                            uint32, failed);          GF_OPTION_RECONF ("heal-wait-qlength", heal_wait_qlen, options, @@ -453,6 +458,43 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)          }  } +static gf_boolean_t +ec_upcall(ec_t *ec, struct gf_upcall *upcall) +{ +        struct gf_upcall_cache_invalidation *ci = NULL; +        struct gf_upcall_inodelk_contention *lc = NULL; +        inode_t *inode; + +        switch (upcall->event_type) { +        case GF_UPCALL_CACHE_INVALIDATION: +                ci = upcall->data; +                ci->flags |= UP_INVAL_ATTR; +                return _gf_true; + +        case GF_UPCALL_INODELK_CONTENTION: +                lc = upcall->data; +                if (strcmp(lc->domain, ec->xl->name) != 0) { +                        /* The lock is not owned by EC, ignore it. 
*/ +                        return _gf_true; +                } +                inode = inode_find(((xlator_t *)ec->xl->graph->top)->itable, +                                   upcall->gfid); +                /* If inode is not found, it means that it's already released, +                 * so we can ignore it. Probably it has been released and +                 * destroyed while the contention notification was being sent. +                 */ +                if (inode != NULL) { +                        ec_lock_release(ec, inode); +                        inode_unref(inode); +                } + +                return _gf_false; + +        default: +                return _gf_true; +        } +} +  int32_t  ec_notify (xlator_t *this, int32_t event, void *data, void *data2)  { @@ -464,19 +506,13 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)          dict_t            *output   = NULL;          gf_boolean_t      propagate = _gf_true;          int32_t           orig_event = event; -        struct gf_upcall *up_data   = NULL; -        struct gf_upcall_cache_invalidation *up_ci = NULL;          uintptr_t mask = 0;          gf_msg_trace (this->name, 0, "NOTIFY(%d): %p, %p",                  event, data, data2);          if (event == GF_EVENT_UPCALL) { -                up_data = (struct gf_upcall *)data; -                if (up_data->event_type == GF_UPCALL_CACHE_INVALIDATION) { -                        up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; -                        up_ci->flags |= UP_INVAL_ATTR; -                } +                propagate = ec_upcall(ec, data);                  goto done;          } @@ -664,6 +700,10 @@ init (xlator_t *this)      GF_OPTION_INIT ("iam-self-heal-daemon", ec->shd.iamshd, bool, failed);      GF_OPTION_INIT ("eager-lock", ec->eager_lock, bool, failed);      GF_OPTION_INIT ("other-eager-lock", ec->other_eager_lock, bool, failed); +    GF_OPTION_INIT ("eager-lock-timeout", ec->eager_lock_timeout, uint32, +                    failed); +    GF_OPTION_INIT ("other-eager-lock-timeout", ec->other_eager_lock_timeout, +                    uint32, failed);      GF_OPTION_INIT ("background-heals", ec->background_heals, uint32, failed);      GF_OPTION_INIT ("heal-wait-qlength", ec->heal_wait_qlen, uint32, failed);      GF_OPTION_INIT ("self-heal-window-size", ec->self_heal_window_size, uint32, @@ -1456,6 +1496,29 @@ struct volume_options options[] =        .description = "It's equivalent to the eager-lock option but for non "                       "regular files."      }, +    { .key = {"eager-lock-timeout"}, +      .type = GF_OPTION_TYPE_INT, +      .min = 1, +      .max = 60, +      .default_value = "1", +      .op_version = { GD_OP_VERSION_4_0_0 }, +      .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, +      .tags = { "disperse", "locks", "timeout" }, +      .description = "Maximum time (in seconds) that a lock on an inode is " +                     "kept held if no new operations on the inode are " +                     "received." +    }, +    { .key = {"other-eager-lock-timeout"}, +      .type = GF_OPTION_TYPE_INT, +      .min = 1, +      .max = 60, +      .default_value = "1", +      .op_version = { GD_OP_VERSION_4_0_0 }, +      .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, +      .tags = { "disperse", "locks", "timeout" }, +      .description = "It's equivalent ot eager-lock-timeout option but for " +                     "non regular files." 
+    },      { .key = {"background-heals"},        .type = GF_OPTION_TYPE_INT,        .min = 0,/*Disabling background heals*/ diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c index a76d6beacb1..1609fc416d2 100644 --- a/xlators/features/locks/src/clear.c +++ b/xlators/features/locks/src/clear.c @@ -200,6 +200,7 @@ int  clrlk_clear_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom,                       clrlk_args *args, int *blkd, int *granted, int *op_errno)  { +        posix_locks_private_t   *priv;          pl_inode_lock_t         *ilock          = NULL;          pl_inode_lock_t         *tmp            = NULL;          struct gf_flock         ulock           = {0, }; @@ -207,9 +208,20 @@ clrlk_clear_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom,          int                     bcount          = 0;          int                     gcount          = 0;          gf_boolean_t            chk_range       = _gf_false; +        struct list_head        *pcontend       = NULL;          struct list_head        released; +        struct list_head        contend; +        struct timespec         now = { };          INIT_LIST_HEAD (&released); + +        priv = this->private; +        if (priv->notify_contention) { +                pcontend = &contend; +                INIT_LIST_HEAD (pcontend); +                timespec_now(&now); +        } +          if (clrlk_get_lock_range (args->opts, &ulock, &chk_range)) {                  *op_errno = EINVAL;                  goto out; @@ -283,7 +295,10 @@ granted:          ret = 0;  out: -        grant_blocked_inode_locks (this, pl_inode, dom); +        grant_blocked_inode_locks (this, pl_inode, dom, &now, pcontend); +        if (pcontend != NULL) { +                inodelk_contention_notify(this, pcontend); +        }          *blkd    = bcount;          *granted = gcount;          return ret; @@ -294,15 +309,27 @@ int  clrlk_clear_entrylk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom,                       clrlk_args *args, int *blkd, int *granted, int *op_errno)  { +        posix_locks_private_t   *priv;          pl_entry_lock_t         *elock          = NULL;          pl_entry_lock_t         *tmp            = NULL;          int                     bcount          = 0;          int                     gcount          = 0;          int                     ret             = -1; +        struct list_head        *pcontend       = NULL;          struct list_head        removed;          struct list_head        released; +        struct list_head        contend; +        struct timespec         now;          INIT_LIST_HEAD (&released); + +        priv = this->private; +        if (priv->notify_contention) { +                pcontend = &contend; +                INIT_LIST_HEAD (pcontend); +                timespec_now(&now); +        } +          if (args->kind & CLRLK_BLOCKED)                  goto blkd; @@ -361,12 +388,15 @@ granted:                          list_del_init (&elock->domain_list);                          list_add_tail (&elock->domain_list, &removed); -			__pl_entrylk_unref (elock); +                        __pl_entrylk_unref (elock);                  }          }          pthread_mutex_unlock (&pl_inode->mutex); -	grant_blocked_entry_locks (this, pl_inode, dom); +        grant_blocked_entry_locks (this, pl_inode, dom, &now, pcontend); +        if (pcontend != NULL) { +                entrylk_contention_notify(this, pcontend); +        }          ret = 0;  out: 
diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h index 3729ca24bed..50c156feb38 100644 --- a/xlators/features/locks/src/common.h +++ b/xlators/features/locks/src/common.h @@ -69,7 +69,11 @@ get_domain (pl_inode_t *pl_inode, const char *volume);  void  grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, -                           pl_dom_list_t *dom); +                           pl_dom_list_t *dom, struct timespec *now, +                           struct list_head *contend); + +void +inodelk_contention_notify (xlator_t *this, struct list_head *contend);  void  __delete_inode_lock (pl_inode_lock_t *lock); @@ -79,7 +83,11 @@ __pl_inodelk_unref (pl_inode_lock_t *lock);  void  grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, -                           pl_dom_list_t *dom); +                           pl_dom_list_t *dom, struct timespec *now, +                           struct list_head *contend); + +void +entrylk_contention_notify (xlator_t *this, struct list_head *contend);  void pl_update_refkeeper (xlator_t *this, inode_t *inode); diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c index 6698516fc83..008d05a34c4 100644 --- a/xlators/features/locks/src/entrylk.c +++ b/xlators/features/locks/src/entrylk.c @@ -13,17 +13,19 @@  #include "logging.h"  #include "common-utils.h"  #include "list.h" +#include "upcall-utils.h"  #include "locks.h"  #include "clear.h"  #include "common.h" +#include "pl-messages.h"  void  __pl_entrylk_unref (pl_entry_lock_t *lock)  {          lock->ref--;          if (!lock->ref) { -		GF_FREE ((char *)lock->basename); +                GF_FREE ((char *)lock->basename);                  GF_FREE (lock->connection_id);                  GF_FREE (lock);          } @@ -39,7 +41,7 @@ __pl_entrylk_ref (pl_entry_lock_t *lock)  static pl_entry_lock_t *  new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type, -		  const char *domain, call_frame_t *frame, char *conn_id) +                  const char *domain, call_frame_t *frame, char *conn_id)  {          pl_entry_lock_t *newlock = NULL; @@ -55,7 +57,7 @@ new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type,          newlock->client_pid = frame->root->pid;          newlock->volume     = domain;          newlock->owner      = frame->root->lk_owner; -	newlock->frame      = frame; +        newlock->frame      = frame;          newlock->this       = frame->this;          if (conn_id) { @@ -64,9 +66,9 @@ new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type,          INIT_LIST_HEAD (&newlock->domain_list);          INIT_LIST_HEAD (&newlock->blocked_locks); -	INIT_LIST_HEAD (&newlock->client_list); +        INIT_LIST_HEAD (&newlock->client_list); -	__pl_entrylk_ref (newlock); +        __pl_entrylk_ref (newlock);  out:          return newlock;  } @@ -201,6 +203,113 @@ out:          return revoke_lock;  } +static gf_boolean_t +__entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock, +                                  struct timespec *now) +{ +        posix_locks_private_t *priv; +        int64_t elapsed; + +        priv = this->private; + +        /* If this lock is in a list, it means that we are about to send a +         * notification for it, so no need to do anything else. 
*/ +        if (!list_empty(&lock->contend)) { +                return _gf_false; +        } + +        elapsed = now->tv_sec; +        elapsed -= lock->contention_time.tv_sec; +        if (now->tv_nsec < lock->contention_time.tv_nsec) { +                elapsed--; +        } +        if (elapsed < priv->notify_contention_delay) { +                return _gf_false; +        } + +        /* All contention notifications will be sent outside of the locked +         * region. This means that currently granted locks might have already +         * been unlocked by that time. To avoid the lock or the inode to be +         * destroyed before we process them, we take an additional reference +         * on both. */ +        inode_ref(lock->pinode->inode); +        __pl_entrylk_ref(lock); + +        lock->contention_time = *now; + +        return _gf_true; +} + +void +entrylk_contention_notify(xlator_t *this, struct list_head *contend) +{ +        struct gf_upcall up; +        struct gf_upcall_entrylk_contention lc; +        pl_entry_lock_t *lock; +        pl_inode_t *pl_inode; +        client_t *client; +        gf_boolean_t notify; + +        while (!list_empty(contend)) { +                lock = list_first_entry(contend, pl_entry_lock_t, contend); + +                pl_inode = lock->pinode; + +                pthread_mutex_lock(&pl_inode->mutex); + +                /* If the lock has already been released, no notification is +                 * sent. We clear the notification time in this case. */ +                notify = !list_empty(&lock->domain_list); +                if (!notify) { +                        lock->contention_time.tv_sec = 0; +                        lock->contention_time.tv_nsec = 0; +                } else { +                        lc.type = lock->type; +                        lc.name = lock->basename; +                        lc.pid = lock->client_pid; +                        lc.domain = lock->volume; +                        lc.xdata = NULL; + +                        gf_uuid_copy(up.gfid, lock->pinode->gfid); +                        client = (client_t *)lock->client; +                        if (client == NULL) { +                                /* A NULL client can be found if the entrylk +                                 * was issued by a server side xlator. 
*/ +                                up.client_uid = NULL; +                        } else { +                                up.client_uid = client->client_uid; +                        } +                } + +                pthread_mutex_unlock(&pl_inode->mutex); + +                if (notify) { +                        up.event_type = GF_UPCALL_ENTRYLK_CONTENTION; +                        up.data = &lc; + +                        if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) { +                                gf_msg_debug(this->name, 0, +                                             "Entrylk contention notification " +                                             "failed"); +                        } else { +                                gf_msg_debug(this->name, 0, +                                             "Entrylk contention notification " +                                             "sent"); +                        } +                } + +                pthread_mutex_lock(&pl_inode->mutex); + +                list_del_init(&lock->contend); +                __pl_entrylk_unref(lock); + +                pthread_mutex_unlock(&pl_inode->mutex); + +                inode_unref(pl_inode->inode); +        } +} + +  /**   * entrylk_grantable - is this lock grantable?   * @inode: inode in which to look @@ -208,19 +317,27 @@ out:   * @type: type of lock   */  static pl_entry_lock_t * -__entrylk_grantable (pl_dom_list_t *dom, pl_entry_lock_t *lock) +__entrylk_grantable (xlator_t *this, pl_dom_list_t *dom, pl_entry_lock_t *lock, +                     struct timespec *now, struct list_head *contend)  {          pl_entry_lock_t *tmp = NULL; - -        if (list_empty (&dom->entrylk_list)) -                return NULL; +        pl_entry_lock_t *ret = NULL;          list_for_each_entry (tmp, &dom->entrylk_list, domain_list) { -                if (__conflicting_entrylks (tmp, lock)) -                        return tmp; +                if (__conflicting_entrylks (tmp, lock)) { +                        if (ret == NULL) { +                                ret = tmp; +                                if (contend == NULL) { +                                        break; +                                } +                        } +                        if (__entrylk_needs_contention_notify(this, tmp, now)) { +                                list_add_tail(&tmp->contend, contend); +                        } +                }          } -        return NULL; +        return ret;  }  static pl_entry_lock_t * @@ -228,9 +345,6 @@ __blocked_entrylk_conflict (pl_dom_list_t *dom, pl_entry_lock_t *lock)  {          pl_entry_lock_t *tmp = NULL; -        if (list_empty (&dom->blocked_entrylks)) -                return NULL; -          list_for_each_entry (tmp, &dom->blocked_entrylks, blocked_locks) {                  if (names_conflict (tmp->basename, lock->basename))                          return lock; @@ -426,6 +540,27 @@ __find_matching_lock (pl_dom_list_t *dom, pl_entry_lock_t *lock)          return NULL;  } +static int +__lock_blocked_add(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, +                   pl_entry_lock_t *lock, int nonblock) +{ +        struct timeval now; + +        gettimeofday(&now, NULL); + +        if (nonblock) +                goto out; + +        lock->blkd_time = now; +        list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks); + +        gf_msg_trace (this->name, 0, "Blocking lock: {pinode=%p, basename=%s}", +                      pinode, lock->basename); 
+ +out: +        return -EAGAIN; +} +  /**   * __lock_entrylk - lock a name in a directory   * @inode: inode for the directory in which to lock @@ -439,24 +574,15 @@ __find_matching_lock (pl_dom_list_t *dom, pl_entry_lock_t *lock)  int  __lock_entrylk (xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock, -		int nonblock, pl_dom_list_t *dom) +                int nonblock, pl_dom_list_t *dom, struct timespec *now, +                struct list_head *contend)  {          pl_entry_lock_t *conf = NULL;          int              ret  = -EAGAIN; -        conf = __entrylk_grantable (dom, lock); +        conf = __entrylk_grantable (this, dom, lock, now, contend);          if (conf) { -                ret = -EAGAIN; -                if (nonblock) -                        goto out; - -                gettimeofday (&lock->blkd_time, NULL); -                list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks); - -                gf_log (this->name, GF_LOG_TRACE, -                        "Blocking lock: {pinode=%p, basename=%s}", -                        pinode, lock->basename); - +                ret = __lock_blocked_add(this, pinode, dom, lock, nonblock);                  goto out;          } @@ -471,20 +597,15 @@ __lock_entrylk (xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock,           * granted, without which self-heal can't progress.           * TODO: Find why 'owner_has_lock' is checked even for blocked locks.           */ -        if (__blocked_entrylk_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) { -                ret = -EAGAIN; -                if (nonblock) -                        goto out; - -                gettimeofday (&lock->blkd_time, NULL); -                list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks); - -                gf_log (this->name, GF_LOG_DEBUG, -                        "Lock is grantable, but blocking to prevent starvation"); -                gf_log (this->name, GF_LOG_TRACE, -                        "Blocking lock: {pinode=%p, basename=%s}", -                        pinode, lock->basename); +        if (__blocked_entrylk_conflict (dom, lock) && +            !(__owner_has_lock (dom, lock))) { +                if (nonblock == 0) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Lock is grantable, but blocking to prevent " +                                "starvation"); +                } +                ret = __lock_blocked_add(this, pinode, dom, lock, nonblock);                  goto out;          } @@ -551,7 +672,8 @@ out:  void  __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, -                             pl_dom_list_t *dom, struct list_head *granted) +                             pl_dom_list_t *dom, struct list_head *granted, +                             struct timespec *now, struct list_head *contend)  {          int              bl_ret = 0;          pl_entry_lock_t *bl   = NULL; @@ -566,7 +688,8 @@ __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,                  list_del_init (&bl->blocked_locks); -                bl_ret = __lock_entrylk (bl->this, pl_inode, bl, 0, dom); +                bl_ret = __lock_entrylk (bl->this, pl_inode, bl, 0, dom, now, +                                         contend);                  if (bl_ret == 0) {                          list_add (&bl->blocked_locks, granted); @@ -578,7 +701,8 @@ __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,  /* Grants locks if possible which are blocked on a 
lock */  void  grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, -			   pl_dom_list_t *dom) +                           pl_dom_list_t *dom, struct timespec *now, +                           struct list_head *contend)  {          struct list_head  granted_list;          pl_entry_lock_t  *tmp = NULL; @@ -589,7 +713,7 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,          pthread_mutex_lock (&pl_inode->mutex);          {                  __grant_blocked_entry_locks (this, pl_inode, dom, -                                             &granted_list); +                                             &granted_list, now, contend);          }          pthread_mutex_unlock (&pl_inode->mutex); @@ -610,8 +734,6 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,  		}  	}          pthread_mutex_unlock (&pl_inode->mutex); - -        return;  } @@ -637,9 +759,18 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,  	int              nonblock         =  0;          gf_boolean_t     need_inode_unref =  _gf_false;          posix_locks_private_t  *priv = NULL; +        struct list_head *pcontend = NULL; +        struct list_head contend; +        struct timespec  now = { };          priv = this->private; +        if (priv->notify_contention) { +                pcontend = &contend; +                INIT_LIST_HEAD(pcontend); +                timespec_now(&now); +        } +          if (xdata)                  dict_ret = dict_get_str (xdata, "connection-id", &conn_id); @@ -722,7 +853,8 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,                  {  			reqlock->pinode = pinode; -                        ret = __lock_entrylk (this, pinode, reqlock, nonblock, dom); +                        ret = __lock_entrylk (this, pinode, reqlock, nonblock, +                                              dom, &now, pcontend);  			if (ret == 0) {  				reqlock->frame = NULL;  				op_ret = 0; @@ -778,7 +910,7 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,  		if (ctx)  			pthread_mutex_unlock (&ctx->lock); -		grant_blocked_entry_locks (this, pinode, dom); +		grant_blocked_entry_locks (this, pinode, dom, &now, pcontend);                  break; @@ -810,6 +942,10 @@ unwind:                                       cmd, type);          } +        if (pcontend != NULL) { +                entrylk_contention_notify(this, pcontend); +        } +          return 0;  } @@ -868,27 +1004,37 @@ pl_entrylk_log_cleanup (pl_entry_lock_t *lock)  int  pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)  { +        posix_locks_private_t *priv;          pl_entry_lock_t *tmp = NULL;          pl_entry_lock_t *l = NULL; -	pl_dom_list_t *dom = NULL; +        pl_dom_list_t *dom = NULL;          pl_inode_t *pinode = NULL; - +        struct list_head *pcontend = NULL;          struct list_head released;          struct list_head unwind; +        struct list_head contend; +        struct timespec now = { };          INIT_LIST_HEAD (&released);          INIT_LIST_HEAD (&unwind); -	pthread_mutex_lock (&ctx->lock); +        priv = this->private; +        if (priv->notify_contention) { +                pcontend = &contend; +                INIT_LIST_HEAD (pcontend); +                timespec_now(&now); +        } + +        pthread_mutex_lock (&ctx->lock);          {                  list_for_each_entry_safe (l, tmp, &ctx->entrylk_lockers, -					  client_list) { -			pl_entrylk_log_cleanup (l); +                                          client_list) { +                        
pl_entrylk_log_cleanup (l); -			pinode = l->pinode; +                        pinode = l->pinode; -			pthread_mutex_lock (&pinode->mutex); -			{ +                        pthread_mutex_lock (&pinode->mutex); +                        {                          /* If the entrylk object is part of granted list but not                           * blocked list, then perform the following actions:                           * i.   delete the object from granted list; @@ -931,38 +1077,42 @@ pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)                                                         &unwind);                                  }                          } -			pthread_mutex_unlock (&pinode->mutex); +                        pthread_mutex_unlock (&pinode->mutex);                  } -	} +        }          pthread_mutex_unlock (&ctx->lock);          list_for_each_entry_safe (l, tmp, &unwind, client_list) {                  list_del_init (&l->client_list); -		if (l->frame) -			STACK_UNWIND_STRICT (entrylk, l->frame, -1, EAGAIN, -					     NULL); +                if (l->frame) +                        STACK_UNWIND_STRICT (entrylk, l->frame, -1, EAGAIN, +                                             NULL);                  list_add_tail (&l->client_list, &released);          }          list_for_each_entry_safe (l, tmp, &released, client_list) {                  list_del_init (&l->client_list); -		pinode = l->pinode; +                pinode = l->pinode; -		dom = get_domain (pinode, l->volume); +                dom = get_domain (pinode, l->volume); -		grant_blocked_entry_locks (this, pinode, dom); +                grant_blocked_entry_locks (this, pinode, dom, &now, pcontend); -		pthread_mutex_lock (&pinode->mutex); -		{ -			__pl_entrylk_unref (l); -		} -		pthread_mutex_unlock (&pinode->mutex); +                pthread_mutex_lock (&pinode->mutex); +                { +                        __pl_entrylk_unref (l); +                } +                pthread_mutex_unlock (&pinode->mutex);                  inode_unref (pinode->inode);          } +        if (pcontend != NULL) { +                entrylk_contention_notify(this, pcontend); +        } +          return 0;  } diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c index 64ffb00c18c..890ac8b6d00 100644 --- a/xlators/features/locks/src/inodelk.c +++ b/xlators/features/locks/src/inodelk.c @@ -13,10 +13,12 @@  #include "logging.h"  #include "common-utils.h"  #include "list.h" +#include "upcall-utils.h"  #include "locks.h"  #include "clear.h"  #include "common.h" +#include "pl-messages.h"  void  __delete_inode_lock (pl_inode_lock_t *lock) @@ -229,22 +231,134 @@ out:          return revoke_lock;  } +static gf_boolean_t +__inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock, +                                  struct timespec *now) +{ +        posix_locks_private_t *priv; +        int64_t elapsed; + +        priv = this->private; + +        /* If this lock is in a list, it means that we are about to send a +         * notification for it, so no need to do anything else. 
*/ +        if (!list_empty(&lock->contend)) { +                return _gf_false; +        } + +        elapsed = now->tv_sec; +        elapsed -= lock->contention_time.tv_sec; +        if (now->tv_nsec < lock->contention_time.tv_nsec) { +                elapsed--; +        } +        if (elapsed < priv->notify_contention_delay) { +                return _gf_false; +        } + +        /* All contention notifications will be sent outside of the locked +         * region. This means that currently granted locks might have already +         * been unlocked by that time. To avoid the lock or the inode to be +         * destroyed before we process them, we take an additional reference +         * on both. */ +        inode_ref(lock->pl_inode->inode); +        __pl_inodelk_ref(lock); + +        lock->contention_time = *now; + +        return _gf_true; +} + +void +inodelk_contention_notify(xlator_t *this, struct list_head *contend) +{ +        struct gf_upcall up; +        struct gf_upcall_inodelk_contention lc; +        pl_inode_lock_t *lock; +        pl_inode_t *pl_inode; +        client_t *client; +        gf_boolean_t notify; + +        while (!list_empty(contend)) { +                lock = list_first_entry(contend, pl_inode_lock_t, contend); + +                pl_inode = lock->pl_inode; + +                pthread_mutex_lock(&pl_inode->mutex); + +                /* If the lock has already been released, no notification is +                 * sent. We clear the notification time in this case. */ +                notify = !list_empty(&lock->list); +                if (!notify) { +                        lock->contention_time.tv_sec = 0; +                        lock->contention_time.tv_nsec = 0; +                } else { +                        memcpy(&lc.flock, &lock->user_flock, sizeof(lc.flock)); +                        lc.pid = lock->client_pid; +                        lc.domain = lock->volume; +                        lc.xdata = NULL; + +                        gf_uuid_copy(up.gfid, lock->pl_inode->gfid); +                        client = (client_t *)lock->client; +                        if (client == NULL) { +                                /* A NULL client can be found if the inodelk +                                 * was issued by a server side xlator. 
*/ +                                up.client_uid = NULL; +                        } else { +                                up.client_uid = client->client_uid; +                        } +                } + +                pthread_mutex_unlock(&pl_inode->mutex); + +                if (notify) { +                        up.event_type = GF_UPCALL_INODELK_CONTENTION; +                        up.data = &lc; + +                        if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) { +                                gf_msg_debug(this->name, 0, +                                             "Inodelk contention notification " +                                             "failed"); +                        } else { +                                gf_msg_debug(this->name, 0, +                                             "Inodelk contention notification " +                                             "sent"); +                        } +                } + +                pthread_mutex_lock(&pl_inode->mutex); + +                list_del_init(&lock->contend); +                __pl_inodelk_unref(lock); + +                pthread_mutex_unlock(&pl_inode->mutex); + +                inode_unref(pl_inode->inode); +        } +} +  /* Determine if lock is grantable or not */  static pl_inode_lock_t * -__inodelk_grantable (pl_dom_list_t *dom, pl_inode_lock_t *lock) +__inodelk_grantable (xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock, +                     struct timespec *now, struct list_head *contend)  {          pl_inode_lock_t *l = NULL;          pl_inode_lock_t *ret = NULL; -        if (list_empty (&dom->inodelk_list)) -                goto out; +          list_for_each_entry (l, &dom->inodelk_list, list){                  if (inodelk_conflict (lock, l) &&                      !same_inodelk_owner (lock, l)) { -                        ret = l; -                        goto out; +                        if (ret == NULL) { +                                ret = l; +                                if (contend == NULL) { +                                        break; +                                } +                        } +                        if (__inodelk_needs_contention_notify(this, l, now)) { +                                list_add_tail(&l->contend, contend); +                        }                  }          } -out: +          return ret;  } @@ -252,20 +366,14 @@ static pl_inode_lock_t *  __blocked_lock_conflict (pl_dom_list_t *dom, pl_inode_lock_t *lock)  {          pl_inode_lock_t *l   = NULL; -        pl_inode_lock_t *ret = NULL; - -        if (list_empty (&dom->blocked_inodelks)) -                return NULL;          list_for_each_entry (l, &dom->blocked_inodelks, blocked_locks) {                  if (inodelk_conflict (lock, l)) { -                        ret = l; -                        goto out; +                        return l;                  }          } -out: -        return ret; +        return NULL;  }  static int @@ -286,35 +394,45 @@ __owner_has_lock (pl_dom_list_t *dom, pl_inode_lock_t *newlock)          return 0;  } +static int +__lock_blocked_add(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock, +                   int can_block) +{ +        struct timeval now; + +        gettimeofday(&now, NULL); + +        if (can_block == 0) { +                goto out; +        } + +        lock->blkd_time = now; +        list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks); + +        gf_msg_trace (this->name, 0, "%s (pid=%d) (lk-owner=%s) 
%"PRId64" - " +                                     "%"PRId64" => Blocked", +                      lock->fl_type == F_UNLCK ? "Unlock" : "Lock", +                      lock->client_pid, lkowner_utoa (&lock->owner), +                      lock->user_flock.l_start, lock->user_flock.l_len); + +out: +        return -EAGAIN; +}  /* Determines if lock can be granted and adds the lock. If the lock   * is blocking, adds it to the blocked_inodelks list of the domain.   */  static int  __lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock, -                int can_block,  pl_dom_list_t *dom) +                int can_block,  pl_dom_list_t *dom, struct timespec *now, +                struct list_head *contend)  {          pl_inode_lock_t *conf = NULL;          int ret = -EINVAL; -        conf = __inodelk_grantable (dom, lock); +        conf = __inodelk_grantable (this, dom, lock, now, contend);          if (conf) { -                ret = -EAGAIN; -                if (can_block == 0) -                        goto out; - -                gettimeofday (&lock->blkd_time, NULL); -                list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks); - -                gf_log (this->name, GF_LOG_TRACE, -                        "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => Blocked", -                        lock->fl_type == F_UNLCK ? "Unlock" : "Lock", -                        lock->client_pid, -                        lkowner_utoa (&lock->owner), -                        lock->user_flock.l_start, -                        lock->user_flock.l_len); - - +                ret = __lock_blocked_add(this, dom, lock, can_block);                  goto out;          } @@ -330,25 +448,15 @@ __lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,           * will not be unlocked by SHD from Machine1.           * TODO: Find why 'owner_has_lock' is checked even for blocked locks.           */ -        if (__blocked_lock_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) { -                ret = -EAGAIN; -                if (can_block == 0) -                        goto out; - -                gettimeofday (&lock->blkd_time, NULL); -                list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks); - -                gf_log (this->name, GF_LOG_DEBUG, -                        "Lock is grantable, but blocking to prevent starvation"); -                gf_log (this->name, GF_LOG_TRACE, -                        "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => Blocked", -                        lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", -                        lock->client_pid, -                        lkowner_utoa (&lock->owner), -                        lock->user_flock.l_start, -                        lock->user_flock.l_len); - +        if (__blocked_lock_conflict (dom, lock) && +            !(__owner_has_lock (dom, lock))) { +                if (can_block != 0) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Lock is grantable, but blocking to prevent " +                                "starvation"); +                } +                ret = __lock_blocked_add(this, dom, lock, can_block);                  goto out;          }          __pl_inodelk_ref (lock); @@ -417,7 +525,8 @@ out:  static void  __grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, -                             struct list_head *granted, pl_dom_list_t *dom) +                             struct list_head *granted, pl_dom_list_t *dom, +                             struct timespec *now, struct list_head *contend)  {          int              bl_ret = 0;          pl_inode_lock_t *bl = NULL; @@ -432,7 +541,8 @@ __grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,                  list_del_init (&bl->blocked_locks); -                bl_ret = __lock_inodelk (this, pl_inode, bl, 1, dom); +                bl_ret = __lock_inodelk (this, pl_inode, bl, 1, dom, now, +                                         contend);                  if (bl_ret == 0) {                          list_add (&bl->blocked_locks, granted); @@ -444,7 +554,8 @@ __grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,  /* Grant all inodelks blocked on a lock */  void  grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, -                           pl_dom_list_t *dom) +                           pl_dom_list_t *dom, struct timespec *now, +                           struct list_head *contend)  {          struct list_head granted;          pl_inode_lock_t *lock; @@ -454,7 +565,8 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,          pthread_mutex_lock (&pl_inode->mutex);          { -                __grant_blocked_inode_locks (this, pl_inode, &granted, dom); +                __grant_blocked_inode_locks (this, pl_inode, &granted, dom, +                                             now, contend);          }          pthread_mutex_unlock (&pl_inode->mutex); @@ -471,7 +583,7 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,                                &lock->user_flock, 0, 0, lock->volume);                  STACK_UNWIND_STRICT (inodelk, lock->frame, 0, 0, NULL); -		lock->frame = NULL; +                lock->frame = NULL;          }          pthread_mutex_lock (&pl_inode->mutex); @@ -488,9 +600,9 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,  static void  pl_inodelk_log_cleanup (pl_inode_lock_t *lock)  { -	pl_inode_t *pl_inode = NULL; +        pl_inode_t *pl_inode = NULL; -	pl_inode = lock->pl_inode; +        pl_inode = lock->pl_inode;          gf_log (THIS->name, GF_LOG_WARNING, "releasing lock on %s held by "                  "{client=%p, pid=%"PRId64" lk-owner=%s}", @@ -503,27 +615,38 @@ pl_inodelk_log_cleanup (pl_inode_lock_t *lock)  int  pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)  { +        posix_locks_private_t *priv;          pl_inode_lock_t *tmp = NULL;          pl_inode_lock_t *l = NULL; -	pl_dom_list_t *dom = NULL; +        pl_dom_list_t *dom = NULL;          pl_inode_t *pl_inode = NULL; - +     
   struct list_head *pcontend = NULL;          struct list_head released;          struct list_head unwind; +        struct list_head contend; +        struct timespec now = { }; + +        priv = this->private;          INIT_LIST_HEAD (&released);          INIT_LIST_HEAD (&unwind); -	pthread_mutex_lock (&ctx->lock); +        if (priv->notify_contention) { +                pcontend = &contend; +                INIT_LIST_HEAD (pcontend); +                timespec_now(&now); +        } + +        pthread_mutex_lock (&ctx->lock);          {                  list_for_each_entry_safe (l, tmp, &ctx->inodelk_lockers, -					  client_list) { -			pl_inodelk_log_cleanup (l); +                                          client_list) { +                        pl_inodelk_log_cleanup (l); -			pl_inode = l->pl_inode; +                        pl_inode = l->pl_inode; -			pthread_mutex_lock (&pl_inode->mutex); -			{ +                        pthread_mutex_lock (&pl_inode->mutex); +                        {                          /* If the inodelk object is part of granted list but not                           * blocked list, then perform the following actions:                           * i.   delete the object from granted list; @@ -567,45 +690,49 @@ pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)                                                         &unwind);                                  }                          } -			pthread_mutex_unlock (&pl_inode->mutex); +                        pthread_mutex_unlock (&pl_inode->mutex);                  } -	} +        }          pthread_mutex_unlock (&ctx->lock);          list_for_each_entry_safe (l, tmp, &unwind, client_list) {                  list_del_init (&l->client_list);                  if (l->frame) -			STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN, -					     NULL); +                        STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN, +                                             NULL);                  list_add_tail (&l->client_list, &released); -          }          list_for_each_entry_safe (l, tmp, &released, client_list) {                  list_del_init (&l->client_list); -		pl_inode = l->pl_inode; +                pl_inode = l->pl_inode; -		dom = get_domain (pl_inode, l->volume); +                dom = get_domain (pl_inode, l->volume); -		grant_blocked_inode_locks (this, pl_inode, dom); +                grant_blocked_inode_locks (this, pl_inode, dom, &now, +                                           pcontend); -		pthread_mutex_lock (&pl_inode->mutex); -		{ -			__pl_inodelk_unref (l); -		} -		pthread_mutex_unlock (&pl_inode->mutex); +                pthread_mutex_lock (&pl_inode->mutex); +                { +                        __pl_inodelk_unref (l); +                } +                pthread_mutex_unlock (&pl_inode->mutex);                  inode_unref (pl_inode->inode);          } +        if (pcontend != NULL) { +                inodelk_contention_notify(this, pcontend); +        } +          return 0;  }  static int  pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, -		pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom, +                pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom,                  inode_t *inode)  {          posix_locks_private_t  *priv = NULL; @@ -613,9 +740,12 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,          pl_inode_lock_t  *retlock          =  NULL;          gf_boolean_t      unref            =  _gf_true;          gf_boolean_t   
   need_inode_unref =  _gf_false; +        struct list_head *pcontend = NULL; +        struct list_head  contend; +        struct timespec   now = { };          short             fl_type; -	lock->pl_inode = pl_inode; +        lock->pl_inode = pl_inode;          fl_type = lock->fl_type;          priv = this->private; @@ -657,12 +787,19 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,                  }          } +        if (priv->notify_contention) { +                pcontend = &contend; +                INIT_LIST_HEAD(pcontend); +                timespec_now(&now); +        } +  	if (ctx)  		pthread_mutex_lock (&ctx->lock);          pthread_mutex_lock (&pl_inode->mutex);          {                  if (lock->fl_type != F_UNLCK) { -                        ret = __lock_inodelk (this, pl_inode, lock, can_block, dom); +                        ret = __lock_inodelk (this, pl_inode, lock, can_block, +                                              dom, &now, pcontend);                          if (ret == 0) {  				lock->frame = NULL;                                  gf_log (this->name, GF_LOG_TRACE, @@ -725,13 +862,18 @@ out:           */          if ((fl_type == F_UNLCK) && (ret == 0)) {                  inode_unref (pl_inode->inode); -                grant_blocked_inode_locks (this, pl_inode, dom); +                grant_blocked_inode_locks (this, pl_inode, dom, &now, +                                           pcontend);          }          if (need_inode_unref) {                  inode_unref (pl_inode->inode);          } +        if (pcontend != NULL) { +                inodelk_contention_notify(this, pcontend); +        } +          return ret;  } @@ -771,7 +913,8 @@ new_inode_lock (struct gf_flock *flock, client_t *client, pid_t client_pid,          INIT_LIST_HEAD (&lock->list);          INIT_LIST_HEAD (&lock->blocked_locks); -	INIT_LIST_HEAD (&lock->client_list); +        INIT_LIST_HEAD (&lock->client_list); +        INIT_LIST_HEAD (&lock->contend);          __pl_inodelk_ref (lock);          return lock; diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h index 3d3b327f56c..c2edfff8f00 100644 --- a/xlators/features/locks/src/locks.h +++ b/xlators/features/locks/src/locks.h @@ -70,6 +70,7 @@ typedef struct __posix_lock posix_lock_t;  struct __pl_inode_lock {          struct list_head   list;          struct list_head   blocked_locks; /* list_head pointing to blocked_inodelks */ +        struct list_head   contend; /* list of contending locks */          int                ref;          short              fl_type; @@ -86,6 +87,8 @@ struct __pl_inode_lock {          struct timeval     blkd_time;   /*time at which lock was queued into blkd list*/          struct timeval     granted_time; /*time at which lock was queued into active list*/ +        /*last time at which lock contention was detected and notified*/ +        struct timespec    contention_time;          /* These two together serve to uniquely identify each process             across nodes */ @@ -120,6 +123,7 @@ typedef struct _pl_dom_list pl_dom_list_t;  struct __entry_lock {          struct list_head  domain_list;    /* list_head back to pl_dom_list_t */          struct list_head  blocked_locks; /* list_head back to blocked_entrylks */ +        struct list_head  contend;       /* list of contending locks */  	int ref;          call_frame_t     *frame; @@ -133,6 +137,8 @@ struct __entry_lock {          struct timeval     blkd_time;   /*time at which lock was queued into blkd 
 list*/          struct timeval     granted_time; /*time at which lock was queued into active list*/ +        /*last time at which lock contention was detected and notified*/ +        struct timespec    contention_time;          void             *client;          gf_lkowner_t      owner; @@ -194,6 +200,8 @@ typedef struct {          uint32_t        revocation_secs;          gf_boolean_t    revocation_clear_all;          uint32_t        revocation_max_blocked; +        gf_boolean_t    notify_contention; +        uint32_t        notify_contention_delay;  } posix_locks_private_t; diff --git a/xlators/features/locks/src/pl-messages.h b/xlators/features/locks/src/pl-messages.h index 7a1e3f488e7..e5a276f35b5 100644 --- a/xlators/features/locks/src/pl-messages.h +++ b/xlators/features/locks/src/pl-messages.h @@ -24,7 +24,9 @@   */  GLFS_MSGID(PL, -        PL_MSG_LOCK_NUMBER +        PL_MSG_LOCK_NUMBER, +        PL_MSG_INODELK_CONTENTION_FAILED, +        PL_MSG_ENTRYLK_CONTENTION_FAILED  );  #endif /* !_PL_MESSAGES_H_ */ diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c index 78bf160058c..82d77db1164 100644 --- a/xlators/features/locks/src/posix.c +++ b/xlators/features/locks/src/posix.c @@ -3641,6 +3641,13 @@ reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("revocation-max-blocked",                            priv->revocation_max_blocked, options,                            uint32, out); + +        GF_OPTION_RECONF ("notify-contention", priv->notify_contention, +                          options, bool, out); + +        GF_OPTION_RECONF ("notify-contention-delay", +                          priv->notify_contention_delay, options, uint32, out); +          ret = 0;  out: @@ -3705,6 +3712,12 @@ init (xlator_t *this)          GF_OPTION_INIT ("revocation-max-blocked", priv->revocation_max_blocked,                          uint32, out); +        GF_OPTION_INIT ("notify-contention", priv->notify_contention, bool, +                        out); + +        GF_OPTION_INIT ("notify-contention-delay", +                        priv->notify_contention_delay, uint32, out); +          this->local_pool = mem_pool_new (pl_local_t, 32);          if (!this->local_pool) {                  ret = -1; @@ -4461,5 +4474,32 @@ struct volume_options options[] = {                           "will be revoked to allow the others to proceed.  Can "                           "be used in conjunction w/ revocation-clear-all."          }, +        { .key = {"notify-contention"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "no", +          .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, +          .op_version = { GD_OP_VERSION_4_0_0 }, +          .tags = { "locks", "contention" }, +          .description = "When this option is enabled and a lock request " +                         "conflicts with a currently granted lock, an upcall " +                         "notification will be sent to the current owner of " +                         "the lock to request it to be released as soon as " +                         "possible." +        }, +        { .key = {"notify-contention-delay"}, +          .type = GF_OPTION_TYPE_INT, +          .min = 0, /* An upcall notification is sent every time a conflict is +                     * detected. 
*/ +          .max = 60, +          .default_value = "5", +          .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, +          .op_version = { GD_OP_VERSION_4_0_0 }, +          .tags = { "locks", "contention", "timeout" }, +          .description = "This value determines the minimum amount of time " +                         "(in seconds) between upcall contention notifications " +                         "on the same inode. If multiple lock requests are " +                         "received during this period, only one upcall will " +                         "be sent." +        },          { .key = {NULL} },  }; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index da85cb5297f..e0ec3368ca7 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1484,6 +1484,16 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .op_version = GD_OP_VERSION_3_13_0,            .flags      = VOLOPT_FLAG_CLIENT_OPT          }, +        { .key        = "disperse.eager-lock-timeout", +          .voltype    = "cluster/disperse", +          .op_version = GD_OP_VERSION_4_0_0, +          .flags      = VOLOPT_FLAG_CLIENT_OPT +        }, +        { .key        = "disperse.other-eager-lock-timeout", +          .voltype    = "cluster/disperse", +          .op_version = GD_OP_VERSION_4_0_0, +          .flags      = VOLOPT_FLAG_CLIENT_OPT +        },          { .key        = "cluster.quorum-type",            .voltype    = "cluster/replicate",            .option     = "quorum-type", @@ -3489,6 +3499,16 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .op_version  = GD_OP_VERSION_3_9_0,            .type        = NO_DOC,          }, +        { .option      = "notify-contention", +          .key         = "features.locks-notify-contention", +          .voltype     = "features/locks", +          .op_version  = GD_OP_VERSION_4_0_0, +        }, +        { .option      = "notify-contention-delay", +          .key         = "features.locks-notify-contention-delay", +          .voltype     = "features/locks", +          .op_version  = GD_OP_VERSION_4_0_0, +        },          { .key        = "disperse.shd-max-threads",            .voltype    = "cluster/disperse",            .op_version = GD_OP_VERSION_3_9_0, diff --git a/xlators/protocol/client/src/client-callback.c b/xlators/protocol/client/src/client-callback.c index 51164e57230..b2f9a225887 100644 --- a/xlators/protocol/client/src/client-callback.c +++ b/xlators/protocol/client/src/client-callback.c @@ -173,6 +173,101 @@ out:          return 0;  } +int +client_cbk_inodelk_contention (struct rpc_clnt *rpc, void *mydata, void *data) +{ +        int                                  ret         = -1; +        struct iovec                        *iov         = NULL; +        struct gf_upcall                     upcall_data = {0,}; +        struct gf_upcall_inodelk_contention  lc          = {{0,},}; +        gfs4_inodelk_contention_req          proto_lc    = {{0,},}; + +        GF_VALIDATE_OR_GOTO ("client-callback", rpc, out); +        GF_VALIDATE_OR_GOTO ("client-callback", mydata, out); +        GF_VALIDATE_OR_GOTO ("client-callback", data, out); + +        iov = (struct iovec *)data; +        ret = xdr_to_generic (*iov, &proto_lc, +                              (xdrproc_t)xdr_gfs4_inodelk_contention_req); + +        if (ret < 0) { +                gf_msg (THIS->name, GF_LOG_WARNING, -ret, +                        
PC_MSG_INODELK_CONTENTION_FAIL, +                        "XDR decode of inodelk contention failed."); +                goto out; +        } + +        upcall_data.data = &lc; +        ret = gf_proto_inodelk_contention_to_upcall (&proto_lc, &upcall_data); +        if (ret < 0) +                goto out; + +        upcall_data.event_type = GF_UPCALL_INODELK_CONTENTION; + +        default_notify (THIS, GF_EVENT_UPCALL, &upcall_data); + +out: +        if (proto_lc.domain) +                free (proto_lc.domain); + +        if (proto_lc.xdata.xdata_val) +                free (proto_lc.xdata.xdata_val); + +        if (lc.xdata) +                dict_unref (lc.xdata); + +        return ret; +} + +int +client_cbk_entrylk_contention (struct rpc_clnt *rpc, void *mydata, void *data) +{ +        int                                  ret         = -1; +        struct iovec                        *iov         = NULL; +        struct gf_upcall                     upcall_data = {0,}; +        struct gf_upcall_entrylk_contention  lc          = {0,}; +        gfs4_entrylk_contention_req          proto_lc    = {{0,},}; + +        GF_VALIDATE_OR_GOTO ("client-callback", rpc, out); +        GF_VALIDATE_OR_GOTO ("client-callback", mydata, out); +        GF_VALIDATE_OR_GOTO ("client-callback", data, out); + +        iov = (struct iovec *)data; +        ret = xdr_to_generic (*iov, &proto_lc, +                              (xdrproc_t)xdr_gfs4_entrylk_contention_req); + +        if (ret < 0) { +                gf_msg (THIS->name, GF_LOG_WARNING, -ret, +                        PC_MSG_ENTRYLK_CONTENTION_FAIL, +                        "XDR decode of entrylk contention failed."); +                goto out; +        } + +        upcall_data.data = &lc; +        ret = gf_proto_entrylk_contention_to_upcall (&proto_lc, &upcall_data); +        if (ret < 0) +                goto out; + +        upcall_data.event_type = GF_UPCALL_ENTRYLK_CONTENTION; + +        default_notify (THIS, GF_EVENT_UPCALL, &upcall_data); + +out: +        if (proto_lc.name) +                free (proto_lc.name); + +        if (proto_lc.domain) +                free (proto_lc.domain); + +        if (proto_lc.xdata.xdata_val) +                free (proto_lc.xdata.xdata_val); + +        if (lc.xdata) +                dict_unref (lc.xdata); + +        return ret; +} +  rpcclnt_cb_actor_t gluster_cbk_actors[GF_CBK_MAXVALUE] = {          [GF_CBK_NULL]               = {"NULL",               GF_CBK_NULL,               client_cbk_null },          [GF_CBK_FETCHSPEC]          = {"FETCHSPEC",          GF_CBK_FETCHSPEC,          client_cbk_fetchspec }, @@ -181,6 +276,8 @@ rpcclnt_cb_actor_t gluster_cbk_actors[GF_CBK_MAXVALUE] = {          [GF_CBK_CHILD_UP]           = {"CHILD_UP",           GF_CBK_CHILD_UP,           client_cbk_child_up },          [GF_CBK_CHILD_DOWN]         = {"CHILD_DOWN",         GF_CBK_CHILD_DOWN,         client_cbk_child_down },          [GF_CBK_RECALL_LEASE]       = {"RECALL_LEASE",       GF_CBK_RECALL_LEASE,       client_cbk_recall_lease }, +        [GF_CBK_INODELK_CONTENTION] = {"INODELK_CONTENTION", GF_CBK_INODELK_CONTENTION, client_cbk_inodelk_contention }, +        [GF_CBK_ENTRYLK_CONTENTION] = {"ENTRYLK_CONTENTION", GF_CBK_ENTRYLK_CONTENTION, client_cbk_entrylk_contention },  }; diff --git a/xlators/protocol/client/src/client-messages.h b/xlators/protocol/client/src/client-messages.h index 86b721bd593..5f146c67efe 100644 --- a/xlators/protocol/client/src/client-messages.h +++ b/xlators/protocol/client/src/client-messages.h @@ -89,7 
+89,9 @@ GLFS_MSGID(PC,          PC_MSG_CACHE_INVALIDATION_FAIL,          PC_MSG_CHILD_STATUS,          PC_MSG_GFID_NULL, -        PC_MSG_RECALL_LEASE_FAIL +        PC_MSG_RECALL_LEASE_FAIL, +        PC_MSG_INODELK_CONTENTION_FAIL, +        PC_MSG_ENTRYLK_CONTENTION_FAIL  );  #endif /* !_PC_MESSAGES_H__ */ diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c index 7154355e690..66122318c79 100644 --- a/xlators/protocol/server/src/server.c +++ b/xlators/protocol/server/src/server.c @@ -1349,6 +1349,8 @@ server_process_event_upcall (xlator_t *this, void *data)          enum gf_cbk_procnum             cbk_procnum     = GF_CBK_NULL;          gfs3_cbk_cache_invalidation_req gf_c_req        = {0,};          gfs3_recall_lease_req           gf_recall_lease = {{0,},}; +        gfs4_inodelk_contention_req     gf_inodelk_contention = {{0},}; +        gfs4_entrylk_contention_req     gf_entrylk_contention = {{0},};          xdrproc_t                       xdrproc;          GF_VALIDATE_OR_GOTO(this->name, data, out); @@ -1358,7 +1360,16 @@ server_process_event_upcall (xlator_t *this, void *data)          upcall_data = (struct gf_upcall *)data;          client_uid = upcall_data->client_uid; -        GF_VALIDATE_OR_GOTO(this->name, client_uid, out); +        /* client_uid can be NULL if the upcall was intended for one of the +         * server's child xlators (so no client_uid is available) but none of +         * them handled the notification. In that case we ignore the upcall +         * request, logging only a debug message, and still return -1. +         */ +        if (client_uid == NULL) { +                gf_msg_debug(this->name, 0, +                             "NULL client_uid for an upcall request"); +                goto out; +        }          switch (upcall_data->event_type) {          case GF_UPCALL_CACHE_INVALIDATION: @@ -1381,6 +1392,28 @@ server_process_event_upcall (xlator_t *this, void *data)                  cbk_procnum = GF_CBK_RECALL_LEASE;                  xdrproc = (xdrproc_t)xdr_gfs3_recall_lease_req;                  break; +        case GF_UPCALL_INODELK_CONTENTION: +                ret = gf_proto_inodelk_contention_from_upcall (this, +                                                        &gf_inodelk_contention, +                                                         upcall_data); +                if (ret < 0) +                        goto out; + +                up_req = &gf_inodelk_contention; +                cbk_procnum = GF_CBK_INODELK_CONTENTION; +                xdrproc = (xdrproc_t)xdr_gfs4_inodelk_contention_req; +                break; +        case GF_UPCALL_ENTRYLK_CONTENTION: +                ret = gf_proto_entrylk_contention_from_upcall (this, +                                                        &gf_entrylk_contention, +                                                         upcall_data); +                if (ret < 0) +                        goto out; + +                up_req = &gf_entrylk_contention; +                cbk_procnum = GF_CBK_ENTRYLK_CONTENTION; +                xdrproc = (xdrproc_t)xdr_gfs4_entrylk_contention_req; +                break;          default:                  gf_msg (this->name, GF_LOG_WARNING, EINVAL,                          PS_MSG_INVALID_ENTRY, @@ -1417,11 +1450,10 @@ server_process_event_upcall (xlator_t *this, void *data)          pthread_mutex_unlock (&conf->mutex);          ret = 0;  out: -        if ((gf_c_req.xdata).xdata_val) -                GF_FREE ((gf_c_req.xdata).xdata_val); - -    
    if ((gf_recall_lease.xdata).xdata_val) -                GF_FREE ((gf_recall_lease.xdata).xdata_val); +        GF_FREE ((gf_c_req.xdata).xdata_val); +        GF_FREE ((gf_recall_lease.xdata).xdata_val); +        GF_FREE ((gf_inodelk_contention.xdata).xdata_val); +        GF_FREE ((gf_entrylk_contention.xdata).xdata_val);          return ret;  }
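
The patch above throttles contention upcalls with the new contention_time timestamp kept in each granted lock: __inodelk_needs_contention_notify() only queues a notification when at least notify-contention-delay seconds (default 5, exposed as features.locks-notify-contention-delay) have elapsed since the previous notification for that lock. The standalone C program below is a minimal sketch of just that throttling check, assuming only standard POSIX APIs; demo_lock_t and should_notify() are hypothetical names used for this illustration and are not part of the patch or of the GlusterFS tree.

/* Sketch of the notify-contention-delay throttling idea: remember when the
 * last notification for a lock was sent and suppress further ones until the
 * configured delay (in seconds) has elapsed.  demo_lock_t merely mirrors the
 * contention_time field added by the patch. */

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

typedef struct {
        struct timespec last_notified;  /* plays the role of contention_time */
} demo_lock_t;

static bool
should_notify(demo_lock_t *lock, const struct timespec *now, uint32_t delay)
{
        time_t elapsed = now->tv_sec - lock->last_notified.tv_sec;

        /* Round down: the last partial second has not fully elapsed yet. */
        if (now->tv_nsec < lock->last_notified.tv_nsec)
                elapsed--;

        if (elapsed < (time_t)delay)
                return false;           /* within the delay window: suppress */

        lock->last_notified = *now;     /* remember this notification */
        return true;
}

int
main(void)
{
        demo_lock_t lock = { { 0, 0 } };
        struct timespec now;
        int i;

        for (i = 0; i < 4; i++) {
                clock_gettime(CLOCK_MONOTONIC, &now);
                printf("conflict %d: %s\n", i,
                       should_notify(&lock, &now, 2) ? "notify" : "suppressed");
                sleep(1);
        }

        return 0;
}

As in the patch, a zeroed timestamp means the very first conflict is always reported; with features.locks-notify-contention enabled, repeated conflicting requests against the same granted lock within the delay window then collapse into a single upcall to the lock owner.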
