diff options
Diffstat (limited to 'rpc/rpc-transport/rdma/src/rdma.h')
| -rw-r--r-- | rpc/rpc-transport/rdma/src/rdma.h | 358 |
1 files changed, 172 insertions, 186 deletions
diff --git a/rpc/rpc-transport/rdma/src/rdma.h b/rpc/rpc-transport/rdma/src/rdma.h index a408ef494..7f76244f0 100644 --- a/rpc/rpc-transport/rdma/src/rdma.h +++ b/rpc/rpc-transport/rdma/src/rdma.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2006-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _XPORT_RDMA_H @@ -38,49 +29,59 @@ #include <list.h> #include <arpa/inet.h> #include <infiniband/verbs.h> +#include <rdma/rdma_cma.h> /* FIXME: give appropriate values to these macros */ #define GF_DEFAULT_RDMA_LISTEN_PORT (GF_DEFAULT_BASE_PORT + 1) -#define RDMA_MAX_SEGMENTS 8 -#define RDMA_MAX_HEADER_SIZE (sizeof (rdma_header_t) \ - + RDMA_MAX_SEGMENTS \ - * sizeof (rdma_read_chunk_t)) -#define RDMA_INLINE_THRESHOLD (1024 * 128) -#define RDMA_VERSION 1 -#define RDMA_POOL_SIZE 512 - -typedef enum rdma_errcode { + +/* If you are changing GF_RDMA_MAX_SEGMENTS, please make sure to update + * GLUSTERFS_GF_RDMA_MAX_HEADER_SIZE defined in glusterfs.h . + */ +#define GF_RDMA_MAX_SEGMENTS 8 + +#define GF_RDMA_VERSION 1 +#define GF_RDMA_POOL_SIZE 512 + +/* Additional attributes */ +#define GF_RDMA_TIMEOUT 14 +#define GF_RDMA_RETRY_CNT 7 +#define GF_RDMA_RNR_RETRY 7 + +typedef enum gf_rdma_errcode { ERR_VERS = 1, ERR_CHUNK = 2 -}rdma_errcode_t; +}gf_rdma_errcode_t; -struct rdma_err_vers { - uint32_t rdma_vers_low; /* Version range supported by peer */ - uint32_t rdma_vers_high; +struct gf_rdma_err_vers { + uint32_t gf_rdma_vers_low; /* Version range supported by peer */ + uint32_t gf_rdma_vers_high; }__attribute__ ((packed)); -typedef struct rdma_err_vers rdma_err_vers_t; - -typedef enum rdma_proc { - RDMA_MSG = 0, /* An RPC call or reply msg */ - RDMA_NOMSG = 1, /* An RPC call or reply msg - separate body */ - RDMA_MSGP = 2, /* An RPC call or reply msg with padding */ - RDMA_DONE = 3, /* Client signals reply completion */ - RDMA_ERROR = 4 /* An RPC RDMA encoding error */ -}rdma_proc_t; - -typedef enum rdma_chunktype { - rdma_noch = 0, /* no chunk */ - rdma_readch, /* some argument through rdma read */ - rdma_areadch, /* entire request through rdma read */ - rdma_writech, /* some result through rdma write */ - rdma_replych /* entire reply through rdma write */ -}rdma_chunktype_t; - -struct __rdma_header { +typedef struct gf_rdma_err_vers gf_rdma_err_vers_t; + +typedef enum gf_rdma_proc { + GF_RDMA_MSG = 0, /* An RPC call or reply msg */ + GF_RDMA_NOMSG = 1, /* An RPC call or reply msg - separate body */ + GF_RDMA_MSGP = 2, /* An RPC call or reply msg with padding */ + GF_RDMA_DONE = 3, /* Client signals reply completion */ + GF_RDMA_ERROR = 4 /* An RPC RDMA encoding error */ +}gf_rdma_proc_t; + +typedef enum gf_rdma_chunktype { + gf_rdma_noch = 0, /* no chunk */ + gf_rdma_readch, /* some argument through rdma read */ + gf_rdma_areadch, /* entire request through rdma read */ + gf_rdma_writech, /* some result through rdma write */ + gf_rdma_replych /* entire reply through rdma write */ +}gf_rdma_chunktype_t; + +/* If you are modifying __gf_rdma_header, please make sure to change + * GLUSTERFS_GF_RDMA_MAX_HEADER_SIZE defined in glusterfs.h to reflect your changes + */ +struct __gf_rdma_header { uint32_t rm_xid; /* Mirrors the RPC header xid */ uint32_t rm_vers; /* Version of this protocol */ uint32_t rm_credit; /* Buffers requested/granted */ - uint32_t rm_type; /* Type of message (enum rdma_proc) */ + uint32_t rm_type; /* Type of message (enum gf_rdma_proc) */ union { struct { /* no chunks */ uint32_t rm_empty[3]; /* 3 empty chunk lists */ @@ -94,45 +95,49 @@ struct __rdma_header { struct { uint32_t rm_type; - rdma_err_vers_t rm_version; + gf_rdma_err_vers_t rm_version; }__attribute__ ((packed)) rm_error; uint32_t rm_chunks[0]; /* read, write and reply chunks */ }__attribute__ ((packed)) rm_body; } __attribute__((packed)); -typedef struct __rdma_header rdma_header_t; +typedef struct __gf_rdma_header gf_rdma_header_t; -struct __rdma_segment { +/* If you are modifying __gf_rdma_segment or __gf_rdma_read_chunk, please make sure + * to change GLUSTERFS_GF_RDMA_MAX_HEADER_SIZE defined in glusterfs.h to reflect + * your changes. + */ +struct __gf_rdma_segment { uint32_t rs_handle; /* Registered memory handle */ uint32_t rs_length; /* Length of the chunk in bytes */ uint64_t rs_offset; /* Chunk virtual address or offset */ } __attribute__((packed)); -typedef struct __rdma_segment rdma_segment_t; +typedef struct __gf_rdma_segment gf_rdma_segment_t; /* read chunk(s), encoded as a linked list. */ -struct __rdma_read_chunk { +struct __gf_rdma_read_chunk { uint32_t rc_discrim; /* 1 indicates presence */ uint32_t rc_position; /* Position in XDR stream */ - rdma_segment_t rc_target; + gf_rdma_segment_t rc_target; } __attribute__((packed)); -typedef struct __rdma_read_chunk rdma_read_chunk_t; +typedef struct __gf_rdma_read_chunk gf_rdma_read_chunk_t; /* write chunk, and reply chunk. */ -struct __rdma_write_chunk { - rdma_segment_t wc_target; +struct __gf_rdma_write_chunk { + gf_rdma_segment_t wc_target; } __attribute__((packed)); -typedef struct __rdma_write_chunk rdma_write_chunk_t; +typedef struct __gf_rdma_write_chunk gf_rdma_write_chunk_t; /* write chunk(s), encoded as a counted array. */ -struct __rdma_write_array { +struct __gf_rdma_write_array { uint32_t wc_discrim; /* 1 indicates presence */ uint32_t wc_nchunks; /* Array count */ - struct __rdma_write_chunk wc_array[0]; + struct __gf_rdma_write_chunk wc_array[0]; } __attribute__((packed)); -typedef struct __rdma_write_array rdma_write_array_t; +typedef struct __gf_rdma_write_array gf_rdma_write_array_t; /* options per transport end point */ -struct __rdma_options { +struct __gf_rdma_options { int32_t port; char *device_name; enum ibv_mtu mtu; @@ -140,26 +145,29 @@ struct __rdma_options { int32_t recv_count; uint64_t recv_size; uint64_t send_size; + uint8_t attr_timeout; + uint8_t attr_retry_cnt; + uint8_t attr_rnr_retry; }; -typedef struct __rdma_options rdma_options_t; +typedef struct __gf_rdma_options gf_rdma_options_t; -struct __rdma_reply_info { +struct __gf_rdma_reply_info { uint32_t rm_xid; /* xid in network endian */ - rdma_chunktype_t type; /* - * can be either rdma_replych - * or rdma_writech. + gf_rdma_chunktype_t type; /* + * can be either gf_rdma_replych + * or gf_rdma_writech. */ - rdma_write_array_t *wc_array; + gf_rdma_write_array_t *wc_array; struct mem_pool *pool; }; -typedef struct __rdma_reply_info rdma_reply_info_t; +typedef struct __gf_rdma_reply_info gf_rdma_reply_info_t; -struct __rdma_ioq { +struct __gf_rdma_ioq { union { struct list_head list; struct { - struct __rdma_ioq *next; - struct __rdma_ioq *prev; + struct __gf_rdma_ioq *next; + struct __gf_rdma_ioq *prev; }; }; @@ -174,8 +182,8 @@ struct __rdma_ioq { struct iobref *iobref; union { - struct __rdma_ioq_request { - /* used to build reply_chunk for RDMA_NOMSG type msgs */ + struct __gf_rdma_ioq_request { + /* used to build reply_chunk for GF_RDMA_NOMSG type msgs */ struct iovec rsphdr_vec[MAX_IOVEC]; int rsphdr_count; @@ -192,205 +200,183 @@ struct __rdma_ioq { struct iobref *rsp_iobref; }request; - rdma_reply_info_t *reply_info; + gf_rdma_reply_info_t *reply_info; }msg; struct mem_pool *pool; }; -typedef struct __rdma_ioq rdma_ioq_t; +typedef struct __gf_rdma_ioq gf_rdma_ioq_t; -typedef enum __rdma_send_post_type { - RDMA_SEND_POST_NO_CHUNKLIST, /* post which is sent using rdma-send +typedef enum __gf_rdma_send_post_type { + GF_RDMA_SEND_POST_NO_CHUNKLIST, /* post which is sent using rdma-send * and the msg carries no * chunklists. */ - RDMA_SEND_POST_READ_CHUNKLIST, /* post which is sent using rdma-send + GF_RDMA_SEND_POST_READ_CHUNKLIST, /* post which is sent using rdma-send * and the msg carries only read * chunklist. */ - RDMA_SEND_POST_WRITE_CHUNKLIST, /* post which is sent using + GF_RDMA_SEND_POST_WRITE_CHUNKLIST, /* post which is sent using * rdma-send and the msg carries * only write chunklist. */ - RDMA_SEND_POST_READ_WRITE_CHUNKLIST, /* post which is sent using + GF_RDMA_SEND_POST_READ_WRITE_CHUNKLIST, /* post which is sent using * rdma-send and the msg * carries both read and * write chunklists. */ - RDMA_SEND_POST_RDMA_READ, /* RDMA read */ - RDMA_SEND_POST_RDMA_WRITE, /* RDMA write */ -}rdma_send_post_type_t; - + GF_RDMA_SEND_POST_GF_RDMA_READ, /* RDMA read */ + GF_RDMA_SEND_POST_GF_RDMA_WRITE, /* RDMA write */ +}gf_rdma_send_post_type_t; + /* represents one communication peer, two per transport_t */ -struct __rdma_peer { - rpc_transport_t *trans; - struct ibv_qp *qp; +struct __gf_rdma_peer { + rpc_transport_t *trans; + struct rdma_cm_id *cm_id; + struct ibv_qp *qp; + pthread_t rdma_event_thread; + char quota_set; int32_t recv_count; int32_t send_count; int32_t recv_size; int32_t send_size; - int32_t quota; + int32_t quota; union { - struct list_head ioq; + struct list_head ioq; struct { - rdma_ioq_t *ioq_next; - rdma_ioq_t *ioq_prev; + gf_rdma_ioq_t *ioq_next; + gf_rdma_ioq_t *ioq_prev; }; }; /* QP attributes, needed to connect with remote QP */ - int32_t local_lid; - int32_t local_psn; - int32_t local_qpn; - int32_t remote_lid; - int32_t remote_psn; - int32_t remote_qpn; + int32_t local_lid; + int32_t local_psn; + int32_t local_qpn; + int32_t remote_lid; + int32_t remote_psn; + int32_t remote_qpn; }; -typedef struct __rdma_peer rdma_peer_t; +typedef struct __gf_rdma_peer gf_rdma_peer_t; -struct __rdma_post_context { - struct ibv_mr *mr[RDMA_MAX_SEGMENTS]; +struct __gf_rdma_post_context { + struct ibv_mr *mr[GF_RDMA_MAX_SEGMENTS]; int mr_count; struct iovec vector[MAX_IOVEC]; int count; struct iobref *iobref; + struct iobuf *hdr_iobuf; char is_request; - int rdma_reads; - rdma_reply_info_t *reply_info; + int gf_rdma_reads; + gf_rdma_reply_info_t *reply_info; }; -typedef struct __rdma_post_context rdma_post_context_t; +typedef struct __gf_rdma_post_context gf_rdma_post_context_t; typedef enum { - RDMA_SEND_POST, - RDMA_RECV_POST -} rdma_post_type_t; + GF_RDMA_SEND_POST, + GF_RDMA_RECV_POST +} gf_rdma_post_type_t; -struct __rdma_post { - struct __rdma_post *next, *prev; +struct __gf_rdma_post { + struct __gf_rdma_post *next, *prev; struct ibv_mr *mr; char *buf; int32_t buf_size; char aux; int32_t reused; - struct __rdma_device *device; - rdma_post_type_t type; - rdma_post_context_t ctx; + struct __gf_rdma_device *device; + gf_rdma_post_type_t type; + gf_rdma_post_context_t ctx; int refcount; pthread_mutex_t lock; }; -typedef struct __rdma_post rdma_post_t; +typedef struct __gf_rdma_post gf_rdma_post_t; -struct __rdma_queue { - rdma_post_t active_posts, passive_posts; +struct __gf_rdma_queue { + gf_rdma_post_t active_posts, passive_posts; int32_t active_count, passive_count; pthread_mutex_t lock; }; -typedef struct __rdma_queue rdma_queue_t; +typedef struct __gf_rdma_queue gf_rdma_queue_t; -struct __rdma_qpreg { +struct __gf_rdma_qpreg { pthread_mutex_t lock; int32_t count; struct _qpent { struct _qpent *next, *prev; int32_t qp_num; - rdma_peer_t *peer; + gf_rdma_peer_t *peer; } ents[42]; }; -typedef struct __rdma_qpreg rdma_qpreg_t; +typedef struct __gf_rdma_qpreg gf_rdma_qpreg_t; /* context per device, stored in global glusterfs_ctx_t->ib */ -struct __rdma_device { - struct __rdma_device *next; +struct __gf_rdma_device { + struct __gf_rdma_device *next; const char *device_name; struct ibv_context *context; int32_t port; struct ibv_pd *pd; struct ibv_srq *srq; - rdma_qpreg_t qpreg; + gf_rdma_qpreg_t qpreg; struct ibv_comp_channel *send_chan, *recv_chan; struct ibv_cq *send_cq, *recv_cq; - rdma_queue_t sendq, recvq; - pthread_t send_thread, recv_thread; + gf_rdma_queue_t sendq, recvq; + pthread_t send_thread, recv_thread, async_event_thread; + struct mem_pool *request_ctx_pool; + struct mem_pool *ioq_pool; + struct mem_pool *reply_info_pool; }; -typedef struct __rdma_device rdma_device_t; +typedef struct __gf_rdma_device gf_rdma_device_t; -typedef enum { - RDMA_HANDSHAKE_START = 0, - RDMA_HANDSHAKE_SENDING_DATA, - RDMA_HANDSHAKE_RECEIVING_DATA, - RDMA_HANDSHAKE_SENT_DATA, - RDMA_HANDSHAKE_RECEIVED_DATA, - RDMA_HANDSHAKE_SENDING_ACK, - RDMA_HANDSHAKE_RECEIVING_ACK, - RDMA_HANDSHAKE_RECEIVED_ACK, - RDMA_HANDSHAKE_COMPLETE, -} rdma_handshake_state_t; - -struct rdma_nbio { - int state; - char *buf; - int count; - struct iovec vector; - struct iovec *pending_vector; - int pending_count; +struct __gf_rdma_ctx { + gf_rdma_device_t *device; + struct rdma_event_channel *rdma_cm_event_channel; + pthread_t rdma_cm_thread; }; +typedef struct __gf_rdma_ctx gf_rdma_ctx_t; -struct __rdma_request_context { - struct ibv_mr *mr[RDMA_MAX_SEGMENTS]; +struct __gf_rdma_request_context { + struct ibv_mr *mr[GF_RDMA_MAX_SEGMENTS]; int mr_count; struct mem_pool *pool; - rdma_peer_t *peer; + gf_rdma_peer_t *peer; struct iobref *iobref; struct iobref *rsp_iobref; }; -typedef struct __rdma_request_context rdma_request_context_t; - -struct __rdma_private { - int32_t sock; - int32_t idx; - unsigned char connected; - unsigned char tcp_connected; - unsigned char ib_connected; - in_addr_t addr; +typedef struct __gf_rdma_request_context gf_rdma_request_context_t; + +typedef enum { + GF_RDMA_SERVER_LISTENER, + GF_RDMA_SERVER, + GF_RDMA_CLIENT, +} gf_rdma_transport_entity_t; + +struct __gf_rdma_private { + int32_t idx; + unsigned char connected; + in_addr_t addr; unsigned short port; /* IB Verbs Driver specific variables, pointers */ - rdma_peer_t peer; - struct __rdma_device *device; - rdma_options_t options; + gf_rdma_peer_t peer; + struct __gf_rdma_device *device; + gf_rdma_options_t options; /* Used by trans->op->receive */ - char *data_ptr; - int32_t data_offset; - int32_t data_len; + char *data_ptr; + int32_t data_offset; + int32_t data_len; /* Mutex */ - pthread_mutex_t read_mutex; - pthread_mutex_t write_mutex; - pthread_barrier_t handshake_barrier; - char handshake_ret; - char is_server; - rpc_transport_t *listener; - - pthread_mutex_t recv_mutex; - pthread_cond_t recv_cond; - - struct mem_pool *request_ctx_pool; - struct mem_pool *ioq_pool; - struct mem_pool *reply_info_pool; - - /* used during rdma_handshake */ - struct { - struct rdma_nbio incoming; - struct rdma_nbio outgoing; - int state; - rdma_header_t header; - char *buf; - size_t size; - } handshake; + pthread_mutex_t write_mutex; + rpc_transport_t *listener; + pthread_mutex_t recv_mutex; + pthread_cond_t recv_cond; + gf_rdma_transport_entity_t entity; }; -typedef struct __rdma_private rdma_private_t; +typedef struct __gf_rdma_private gf_rdma_private_t; -#endif /* _XPORT_RDMA_H */ +#endif /* _XPORT_GF_RDMA_H */ |
