From 7febb66a26f01c94f8e76bb90cf4edd7c6cc1421 Mon Sep 17 00:00:00 2001
From: Mohammed Rafi KC
Date: Tue, 17 Feb 2015 20:17:58 +0530
Subject: rdma: pre-register iobuf_pool with rdma devices.

Backport of: http://review.gluster.org/9506

Registering buffers with an rdma device is a time-consuming operation,
so performing the registration in the I/O code path degrades
performance. Using pre-registered memory gives better performance,
i.e., register the iobuf_pool during rdma initialization. A dynamically
created arena is registered with all the devices.

Change-Id: Ic79183e2efd014c43faf5911fdb6d5cfbcee64ca
BUG: 1202212
Signed-off-by: Mohammed Rafi KC
Reviewed-on: http://review.gluster.org/9506
Tested-by: Gluster Build System
Reviewed-by: Raghavendra G
Tested-by: Raghavendra G
Reviewed-on: http://review.gluster.org/9889
Reviewed-by: Raghavendra Bhat
---
 libglusterfs/src/iobuf.c          |  38 ++++++--
 libglusterfs/src/iobuf.h          |  12 ++-
 libglusterfs/src/mem-types.h      |   1 +
 rpc/rpc-transport/rdma/src/rdma.c | 200 +++++++++++++++++++++++++++++++++++---
 rpc/rpc-transport/rdma/src/rdma.h |  10 ++
 5 files changed, 240 insertions(+), 21 deletions(-)

diff --git a/libglusterfs/src/iobuf.c b/libglusterfs/src/iobuf.c
index 82ffe2dd8fd..f8f1860889b 100644
--- a/libglusterfs/src/iobuf.c
+++ b/libglusterfs/src/iobuf.c
@@ -50,6 +50,7 @@ gf_iobuf_get_arena_index (size_t page_size)
         return i;
 }
 
+
 size_t
 gf_iobuf_get_pagesize (size_t page_size)
 {
@@ -138,10 +139,15 @@ out:
 
 
 void
-__iobuf_arena_destroy (struct iobuf_arena *iobuf_arena)
+__iobuf_arena_destroy (struct iobuf_pool *iobuf_pool,
+                       struct iobuf_arena *iobuf_arena)
 {
         GF_VALIDATE_OR_GOTO ("iobuf", iobuf_arena, out);
 
+        if (iobuf_pool->rdma_deregistration)
+                iobuf_pool->rdma_deregistration (iobuf_pool->mr_list,
+                                                 iobuf_arena);
+
         __iobuf_arena_destroy_iobufs (iobuf_arena);
 
         if (iobuf_arena->mem_base
@@ -169,6 +175,7 @@ __iobuf_arena_alloc (struct iobuf_pool *iobuf_pool, size_t page_size,
                 goto err;
 
         INIT_LIST_HEAD (&iobuf_arena->list);
+        INIT_LIST_HEAD (&iobuf_arena->all_list);
         INIT_LIST_HEAD (&iobuf_arena->active.list);
         INIT_LIST_HEAD (&iobuf_arena->passive.list);
         iobuf_arena->iobuf_pool = iobuf_pool;
@@ -188,6 +195,13 @@ __iobuf_arena_alloc (struct iobuf_pool *iobuf_pool, size_t page_size,
                 goto err;
         }
 
+        if (iobuf_pool->rdma_registration) {
+                iobuf_pool->rdma_registration (iobuf_pool->device,
+                                               iobuf_arena);
+        }
+
+        list_add_tail (&iobuf_arena->all_list, &iobuf_pool->all_arenas);
+
         __iobuf_arena_init_iobufs (iobuf_arena);
         if (!iobuf_arena->iobufs) {
                 gf_log (THIS->name, GF_LOG_ERROR, "init failed");
@@ -199,7 +213,7 @@ __iobuf_arena_alloc (struct iobuf_pool *iobuf_pool, size_t page_size,
         return iobuf_arena;
 
 err:
-        __iobuf_arena_destroy (iobuf_arena);
+        __iobuf_arena_destroy (iobuf_pool, iobuf_arena);
 
 out:
         return NULL;
@@ -258,8 +272,8 @@ __iobuf_pool_add_arena (struct iobuf_pool *iobuf_pool, size_t page_size,
                 gf_log (THIS->name, GF_LOG_WARNING, "arena not found");
                 return NULL;
         }
+        list_add (&iobuf_arena->list, &iobuf_pool->arenas[index]);
 
-        list_add_tail (&iobuf_arena->list, &iobuf_pool->arenas[index]);
         return iobuf_arena;
 }
 
@@ -299,7 +313,8 @@ iobuf_pool_destroy (struct iobuf_pool *iobuf_pool)
                                   &iobuf_pool->arenas[i], list) {
                         list_del_init (&iobuf_arena->list);
                         iobuf_pool->arena_cnt--;
-                        __iobuf_arena_destroy (iobuf_arena);
+
+                        __iobuf_arena_destroy (iobuf_pool, iobuf_arena);
                 }
         }
 
@@ -347,7 +362,7 @@ iobuf_pool_new (void)
                                 gf_common_mt_iobuf_pool);
         if (!iobuf_pool)
                 goto out;
-
+        INIT_LIST_HEAD (&iobuf_pool->all_arenas);
         pthread_mutex_init (&iobuf_pool->mutex, NULL);
         for (i = 0; i <= IOBUF_ARENA_MAX_INDEX; i++) {
                 INIT_LIST_HEAD (&iobuf_pool->arenas[i]);
@@ -357,6 +372,16 @@ iobuf_pool_new (void)
 
         iobuf_pool->default_page_size  = 128 * GF_UNIT_KB;
 
+        iobuf_pool->rdma_registration = NULL;
+        iobuf_pool->rdma_deregistration = NULL;
+
+        for (i = 0; i < GF_RDMA_DEVICE_COUNT; i++) {
+
+                iobuf_pool->device[i] = NULL;
+                iobuf_pool->mr_list[i] = NULL;
+
+        }
+
         arena_size = 0;
         for (i = 0; i < IOBUF_ARENA_MAX_INDEX; i++) {
                 page_size = gf_iobuf_init_config[i].pagesize;
@@ -393,9 +418,10 @@ __iobuf_arena_prune (struct iobuf_pool *iobuf_pool,
 
         /* All cases matched, destroy */
         list_del_init (&iobuf_arena->list);
+        list_del_init (&iobuf_arena->all_list);
         iobuf_pool->arena_cnt--;
 
-        __iobuf_arena_destroy (iobuf_arena);
+        __iobuf_arena_destroy (iobuf_pool, iobuf_arena);
 
 out:
         return;
diff --git a/libglusterfs/src/iobuf.h b/libglusterfs/src/iobuf.h
index 4e07910d722..7e5cfe37a28 100644
--- a/libglusterfs/src/iobuf.h
+++ b/libglusterfs/src/iobuf.h
@@ -19,6 +19,8 @@
 
 #define GF_VARIABLE_IOBUF_COUNT 32
 
+#define GF_RDMA_DEVICE_COUNT 8
+
 /* Lets try to define the new anonymous mapping
  * flag, in case the system is still using the
  * now deprecated MAP_ANON flag.
@@ -81,6 +83,7 @@ struct iobuf_arena {
                 };
         };
 
+        struct list_head    all_list;
         size_t              page_size;  /* size of all iobufs in this arena */
         size_t              arena_size; /* this is equal to
                                            (iobuf_pool->arena_size / page_size)
@@ -110,6 +113,7 @@ struct iobuf_pool {
         size_t              default_page_size; /* default size of iobuf */
 
         int                 arena_cnt;
+        struct list_head    all_arenas;
         struct list_head    arenas[GF_VARIABLE_IOBUF_COUNT];
         /* array of arenas. Each element of the array is a list of arenas
            holding iobufs of particular page_size */
@@ -121,7 +125,13 @@ struct iobuf_pool {
         /* array of of arenas which can be purged */
 
         uint64_t            request_misses; /* mostly the requests for higher
-                                               value of iobufs */
+                                               value of iobufs */
+        int                 rdma_device_count;
+        struct list_head   *mr_list[GF_RDMA_DEVICE_COUNT];
+        void               *device[GF_RDMA_DEVICE_COUNT];
+        int (*rdma_registration)(void **, void*);
+        int (*rdma_deregistration)(struct list_head**, struct iobuf_arena *);
+
 };
 
 
diff --git a/libglusterfs/src/mem-types.h b/libglusterfs/src/mem-types.h
index 4f566f9ec57..4359488c5f9 100644
--- a/libglusterfs/src/mem-types.h
+++ b/libglusterfs/src/mem-types.h
@@ -126,6 +126,7 @@ enum gf_common_mem_types_ {
         gf_common_mt_strfd_data_t = 110,
         gf_common_mt_regex_t  = 111,
         gf_common_mt_wr       = 112,
+        gf_common_mt_rdma_arena_mr = 113,
         gf_common_mt_end
 };
 #endif
diff --git a/rpc/rpc-transport/rdma/src/rdma.c b/rpc/rpc-transport/rdma/src/rdma.c
index 92d5da258f2..cb5ce77291e 100644
--- a/rpc/rpc-transport/rdma/src/rdma.c
+++ b/rpc/rpc-transport/rdma/src/rdma.c
@@ -15,6 +15,7 @@
 
 #include "dict.h"
 #include "glusterfs.h"
+#include "iobuf.h"
 #include "logging.h"
 #include "rdma.h"
 #include "name.h"
@@ -361,6 +362,135 @@ gf_rdma_post_recv (struct ibv_srq *srq,
         return ibv_post_srq_recv (srq, &wr, &bad_wr);
 }
 
+int
+gf_rdma_deregister_arena (struct list_head **mr_list,
+                          struct iobuf_arena *iobuf_arena)
+{
+        gf_rdma_arena_mr *tmp = NULL;
+        int count = 0, i = 0;
+
+        count = iobuf_arena->iobuf_pool->rdma_device_count;
+        for (i = 0; i < count; i++) {
+                list_for_each_entry(tmp, mr_list[i], list) {
+                        if (tmp->iobuf_arena == iobuf_arena) {
+                                if (ibv_dereg_mr(tmp->mr)) {
+                                        gf_log("rdma", GF_LOG_WARNING,
+                                        "deallocation of memory region "
+                                        "failed");
+                                        return -1;
+                                }
+                                list_del(&tmp->list);
+                                GF_FREE(tmp);
+                                break;
+                        }
+                }
+        }
+
+        return 0;
+}
+
+
+int
+gf_rdma_register_arena (void **arg1, void *arg2)
+{
+        struct ibv_mr *mr = NULL;
+        gf_rdma_arena_mr *new = NULL;
+        struct iobuf_pool *iobuf_pool = NULL;
+        gf_rdma_device_t **device = (gf_rdma_device_t **)arg1;
+        struct iobuf_arena *iobuf_arena = arg2;
+        int count = 0, i = 0;
+
+        iobuf_pool = iobuf_arena->iobuf_pool;
+        count = iobuf_pool->rdma_device_count;
+        for (i = 0; i < count; i++) {
+                new = GF_CALLOC(1, sizeof(gf_rdma_arena_mr),
+                                gf_common_mt_rdma_arena_mr);
+                INIT_LIST_HEAD (&new->list);
+                new->iobuf_arena = iobuf_arena;
+
+                mr = ibv_reg_mr(device[i]->pd, iobuf_arena->mem_base,
+                                iobuf_arena->arena_size,
+                                IBV_ACCESS_REMOTE_READ |
+                                IBV_ACCESS_LOCAL_WRITE |
+                                IBV_ACCESS_REMOTE_WRITE
+                                );
+                if (!mr)
+                        gf_log("rdma", GF_LOG_WARNING,
+                               "allocation of mr failed");
+
+                new->mr = mr;
+                list_add (&new->list, &device[i]->all_mr);
+                new = NULL;
+        }
+
+        return 0;
+
+}
+
+static void
+gf_rdma_register_iobuf_pool (rpc_transport_t *this)
+{
+        struct iobuf_pool *iobuf_pool = NULL;
+        struct iobuf_arena *tmp = NULL;
+        gf_rdma_private_t *priv = NULL;
+        gf_rdma_device_t *device = NULL;
+        struct ibv_mr *mr = NULL;
+        gf_rdma_arena_mr *new = NULL;
+
+        priv = this->private;
+        device = priv->device;
+        iobuf_pool = this->ctx->iobuf_pool;
+
+        if (!list_empty(&iobuf_pool->all_arenas)) {
+
+                list_for_each_entry (tmp, &iobuf_pool->all_arenas, all_list) {
+                        new = GF_CALLOC(1, sizeof(gf_rdma_arena_mr),
+                                        gf_common_mt_rdma_arena_mr);
+                        INIT_LIST_HEAD (&new->list);
+                        new->iobuf_arena = tmp;
+
+                        mr = ibv_reg_mr(device->pd, tmp->mem_base,
+                                        tmp->arena_size,
+                                        IBV_ACCESS_REMOTE_READ |
+                                        IBV_ACCESS_LOCAL_WRITE |
+                                        IBV_ACCESS_REMOTE_WRITE);
+                        if (!mr) {
+                                gf_log ("rdma", GF_LOG_WARNING, "failed to pre"
+                                        " register buffers with rdma "
+                                        "devices.");
+
+                        }
+                        new->mr = mr;
+                        list_add (&new->list, &device->all_mr);
+
+                        new = NULL;
+                }
+        }
+
+        return;
+}
+
+static struct ibv_mr*
+gf_rdma_get_pre_registred_mr(rpc_transport_t *this, void *ptr, int size)
+{
+        gf_rdma_arena_mr *tmp = NULL;
+        gf_rdma_private_t *priv = NULL;
+        gf_rdma_device_t *device = NULL;
+
+        priv = this->private;
+        device = priv->device;
+
+        if (!list_empty(&device->all_mr)) {
+                list_for_each_entry (tmp, &device->all_mr, list) {
+                        if (tmp->iobuf_arena->mem_base <= ptr &&
+                            ptr < tmp->iobuf_arena->mem_base +
+                            tmp->iobuf_arena->arena_size)
+                                return tmp->mr;
+                }
+        }
+
+        return NULL;
+}
 
 static int32_t
 gf_rdma_create_posts (rpc_transport_t *this)
@@ -510,11 +640,13 @@ gf_rdma_get_device (rpc_transport_t *this, struct ibv_context *ibctx,
         int32_t             i = 0;
         gf_rdma_device_t   *trav = NULL, *device = NULL;
         gf_rdma_ctx_t      *rdma_ctx = NULL;
+        struct iobuf_pool  *iobuf_pool = NULL;
 
         priv = this->private;
         options = &priv->options;
         ctx = this->ctx;
         rdma_ctx = ctx->ib;
+        iobuf_pool = ctx->iobuf_pool;
 
         trav = rdma_ctx->device;
@@ -530,10 +662,10 @@ gf_rdma_get_device (rpc_transport_t *this, struct ibv_context *ibctx,
 
                 if (trav == NULL) {
                         goto out;
                 }
-
                 priv->device = trav;
                 trav->context = ibctx;
-
+                iobuf_pool->device[iobuf_pool->rdma_device_count] = trav;
+                iobuf_pool->mr_list[iobuf_pool->rdma_device_count++] = &trav->all_mr;
                 trav->request_ctx_pool = mem_pool_new (gf_rdma_request_context_t,
                                                        GF_RDMA_POOL_SIZE);
@@ -613,6 +745,9 @@ gf_rdma_get_device (rpc_transport_t *this, struct ibv_context *ibctx,
                 gf_rdma_queue_init (&trav->sendq);
                 gf_rdma_queue_init (&trav->recvq);
 
+                INIT_LIST_HEAD (&trav->all_mr);
+                gf_rdma_register_iobuf_pool(this);
+
                 if (gf_rdma_create_posts (this) < 0) {
                         gf_log (this->name, GF_LOG_ERROR,
                                 "could not allocate posts for device (%s)",
@@ -1239,9 +1374,13 @@ __gf_rdma_create_read_chunks_from_vector (gf_rdma_peer_t *peer,
                 readch->rc_discrim = hton32 (1);
                 readch->rc_position = hton32 (*pos);
 
+                mr = gf_rdma_get_pre_registred_mr(peer->trans,
+                        (void *)vector[i].iov_base, vector[i].iov_len);
+                if (!mr) {
                 mr = ibv_reg_mr (device->pd, vector[i].iov_base,
                                  vector[i].iov_len,
                                  IBV_ACCESS_REMOTE_READ);
+                }
                 if (!mr) {
                         gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
                                 "memory registration failed (%s) (peer:%s)",
@@ -1374,10 +1513,16 @@ __gf_rdma_create_write_chunks_from_vector (gf_rdma_peer_t *peer,
         device = priv->device;
 
         for (i = 0; i < count; i++) {
+
+                mr = gf_rdma_get_pre_registred_mr(peer->trans,
+                        (void *)vector[i].iov_base, vector[i].iov_len);
+                if (!mr) {
                 mr = ibv_reg_mr (device->pd, vector[i].iov_base,
                                  vector[i].iov_len,
                                  IBV_ACCESS_REMOTE_WRITE
                                  | IBV_ACCESS_LOCAL_WRITE);
+                }
+
                 if (!mr) {
                         gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
                                 "memory registration failed (%s) (peer:%s)",
@@ -1504,16 +1649,30 @@ out:
 
 
 static inline void
-__gf_rdma_deregister_mr (struct ibv_mr **mr, int count)
+__gf_rdma_deregister_mr (gf_rdma_device_t *device,
+                         struct ibv_mr **mr, int count)
 {
-        int i = 0;
+        gf_rdma_arena_mr *tmp = NULL;
+        int i = 0;
+        int found = 0;
 
-       if (mr == NULL) {
+        if (mr == NULL) {
                 goto out;
         }
 
         for (i = 0; i < count; i++) {
-                ibv_dereg_mr (mr[i]);
+                found = 0;
+                if (!list_empty(&device->all_mr)) {
+                        list_for_each_entry(tmp, &device->all_mr, list) {
+                                if (tmp->mr == mr[i]) {
+                                        found = 1;
+                                        break;
+                                }
+                        }
+                }
+                if (!found)
+                        ibv_dereg_mr (mr[i]);
+
         }
 
 out:
@@ -1558,9 +1717,10 @@ gf_rdma_quota_put (gf_rdma_peer_t *peer)
 void
 __gf_rdma_request_context_destroy (gf_rdma_request_context_t *context)
 {
-        gf_rdma_peer_t     *peer = NULL;
-        gf_rdma_private_t  *priv = NULL;
-        int32_t             ret  = 0;
+        gf_rdma_peer_t     *peer   = NULL;
+        gf_rdma_private_t  *priv   = NULL;
+        gf_rdma_device_t   *device = NULL;
+        int32_t             ret    = 0;
 
         if (context == NULL) {
                 goto out;
@@ -1568,9 +1728,10 @@ __gf_rdma_request_context_destroy (gf_rdma_request_context_t *context)
 
         peer = context->peer;
 
-        __gf_rdma_deregister_mr (context->mr, context->mr_count);
-
         priv = peer->trans->private;
+        device = priv->device;
+        __gf_rdma_deregister_mr (device, context->mr, context->mr_count);
+
         if (priv->connected) {
                 ret = __gf_rdma_quota_put (peer);
@@ -1602,13 +1763,14 @@ out:
 
 
 void
-gf_rdma_post_context_destroy (gf_rdma_post_context_t *ctx)
+gf_rdma_post_context_destroy (gf_rdma_device_t *device,
+                              gf_rdma_post_context_t *ctx)
 {
         if (ctx == NULL) {
                 goto out;
         }
 
-        __gf_rdma_deregister_mr (ctx->mr, ctx->mr_count);
+        __gf_rdma_deregister_mr (device, ctx->mr, ctx->mr_count);
 
         if (ctx->iobref != NULL) {
                 iobref_unref (ctx->iobref);
@@ -1640,7 +1802,7 @@ gf_rdma_post_unref (gf_rdma_post_t *post)
         pthread_mutex_unlock (&post->lock);
 
         if (refcount == 0) {
-                gf_rdma_post_context_destroy (&post->ctx);
+                gf_rdma_post_context_destroy (post->device, &post->ctx);
                 if (post->type == GF_RDMA_SEND_POST) {
                         gf_rdma_put_post (&post->device->sendq, post);
                 } else {
@@ -2060,10 +2222,16 @@ __gf_rdma_register_local_mr_for_rdma (gf_rdma_peer_t *peer,
                  * Infiniband Architecture Specification Volume 1
                  * (Release 1.2.1)
                  */
+                ctx->mr[ctx->mr_count] = gf_rdma_get_pre_registred_mr(
+                                peer->trans, (void *)vector[i].iov_base,
+                                vector[i].iov_len);
+
+                if (!ctx->mr[ctx->mr_count]) {
                 ctx->mr[ctx->mr_count] = ibv_reg_mr (device->pd,
                                                      vector[i].iov_base,
                                                      vector[i].iov_len,
                                                      IBV_ACCESS_LOCAL_WRITE);
+                }
                 if (ctx->mr[ctx->mr_count] == NULL) {
                         gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
                                 "registering memory for IBV_ACCESS_LOCAL_WRITE "
@@ -4553,6 +4721,7 @@ int32_t
 init (rpc_transport_t *this)
 {
         gf_rdma_private_t *priv = NULL;
+        struct iobuf_pool *iobuf_pool = NULL;
 
         priv = GF_CALLOC (1, sizeof (*priv), gf_common_mt_rdma_private_t);
         if (!priv)
@@ -4565,6 +4734,9 @@ init (rpc_transport_t *this)
                         "Failed to initialize IB Device");
                 return -1;
         }
+        iobuf_pool = this->ctx->iobuf_pool;
+        iobuf_pool->rdma_registration = gf_rdma_register_arena;
+        iobuf_pool->rdma_deregistration = gf_rdma_deregister_arena;
 
         return 0;
 }
diff --git a/rpc/rpc-transport/rdma/src/rdma.h b/rpc/rpc-transport/rdma/src/rdma.h
index 7f76244f071..fda01aa53ef 100644
--- a/rpc/rpc-transport/rdma/src/rdma.h
+++ b/rpc/rpc-transport/rdma/src/rdma.h
@@ -34,6 +34,7 @@
 /* FIXME: give appropriate values to these macros */
 #define GF_DEFAULT_RDMA_LISTEN_PORT (GF_DEFAULT_BASE_PORT + 1)
 
+
 /* If you are changing GF_RDMA_MAX_SEGMENTS, please make sure to update
  * GLUSTERFS_GF_RDMA_MAX_HEADER_SIZE defined in glusterfs.h .
  */
@@ -328,9 +329,18 @@ struct __gf_rdma_device {
         struct mem_pool           *request_ctx_pool;
         struct mem_pool           *ioq_pool;
         struct mem_pool           *reply_info_pool;
+        struct list_head           all_mr;
 };
 typedef struct __gf_rdma_device gf_rdma_device_t;
 
+
+struct __gf_rdma_arena_mr {
+        struct list_head           list;
+        struct iobuf_arena        *iobuf_arena;
+        struct ibv_mr             *mr;
+};
+
+typedef struct __gf_rdma_arena_mr gf_rdma_arena_mr;
 struct __gf_rdma_ctx {
         gf_rdma_device_t          *device;
         struct rdma_event_channel *rdma_cm_event_channel;
-- 
cgit
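
For reference, the core mechanism this patch adds can be summarized in a small
standalone sketch (hypothetical, simplified types; the real code is
gf_rdma_register_arena() and gf_rdma_get_pre_registred_mr() above): each arena
registration made at arena-allocation time is remembered on a per-device list,
and the send/receive paths resolve a buffer pointer to its arena's MR with a
range check, falling back to a per-request ibv_reg_mr() only on a miss.

#include <stddef.h>
#include <stdio.h>

/* Hypothetical, simplified stand-ins for iobuf_arena and gf_rdma_arena_mr:
 * one entry per arena registered with a device at arena-allocation time. */
struct arena_mr {
        char            *mem_base;   /* iobuf_arena->mem_base */
        size_t           arena_size; /* iobuf_arena->arena_size */
        void            *mr;         /* struct ibv_mr * in the real code */
        struct arena_mr *next;       /* stands in for the all_mr list */
};

/* Range-check lookup mirroring gf_rdma_get_pre_registred_mr(): map a
 * buffer pointer to the MR of the arena that contains it.  NULL means
 * the caller must fall back to a per-request ibv_reg_mr(). */
static void *
lookup_pre_registered (struct arena_mr *all_mr, void *ptr)
{
        struct arena_mr *tmp = NULL;

        for (tmp = all_mr; tmp != NULL; tmp = tmp->next) {
                if ((char *)ptr >= tmp->mem_base &&
                    (char *)ptr < tmp->mem_base + tmp->arena_size)
                        return tmp->mr;
        }

        return NULL;
}

int
main (void)
{
        static char arena[4096];
        int         fake_mr = 0;
        struct arena_mr reg = { arena, sizeof (arena), &fake_mr, NULL };

        /* A buffer inside the arena resolves to the pre-registered MR. */
        printf ("inside:  %p\n", lookup_pre_registered (&reg, arena + 100));

        /* A buffer outside the arena misses, forcing the fallback path. */
        printf ("outside: %p\n", lookup_pre_registered (&reg, &fake_mr));

        return 0;
}

The lookup walks a short list of arenas, which is cheap next to the cost of
registering memory in the I/O path; that trade-off is the point of the patch.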