31 #include "ompi_config.h"
32 #include <sys/types.h>
34 #include <infiniband/verbs.h>
37 #include "ompi/class/ompi_free_list.h"
41 #include "opal/mca/event/event.h"
42 #include "opal/threads/threads.h"
45 #include "ompi/mca/btl/base/btl_base_error.h"
50 #include "connect/connect.h"
54 #define HAVE_XRC (1 == OMPI_HAVE_CONNECTX_XRC)
55 #define ENABLE_DYNAMIC_SL (1 == OMPI_ENABLE_DYNAMIC_SL)
57 #define MCA_BTL_IB_LEAVE_PINNED 1
58 #define IB_DEFAULT_GID_PREFIX 0xfe80000000000000ll
59 #define MCA_BTL_IB_PKEY_MASK 0x7fff
65 #define ATTACH() do { \
67 opal_output(0, "WAITING TO DEBUG ATTACH"); \
68 while (i == 0) sleep(5); \
81 MCA_BTL_OPENIB_TRANSPORT_IB,
82 MCA_BTL_OPENIB_TRANSPORT_IWARP,
83 MCA_BTL_OPENIB_TRANSPORT_RDMAOE,
84 MCA_BTL_OPENIB_TRANSPORT_UNKNOWN,
85 MCA_BTL_OPENIB_TRANSPORT_SIZE
86 } mca_btl_openib_transport_type_t;
90 MCA_BTL_OPENIB_SRQ_QP,
92 } mca_btl_openib_qp_type_t;
110 mca_btl_openib_qp_type_t type;
120 #define BTL_OPENIB_QP_TYPE(Q) (mca_btl_openib_component.qp_infos[(Q)].type)
121 #define BTL_OPENIB_QP_TYPE_PP(Q) \
122 (BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_PP_QP)
123 #define BTL_OPENIB_QP_TYPE_SRQ(Q) \
124 (BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_SRQ_QP)
125 #define BTL_OPENIB_QP_TYPE_XRC(Q) \
126 (BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_XRC_QP)
129 BTL_OPENIB_RQ_SOURCE_DEFAULT,
130 BTL_OPENIB_RQ_SOURCE_MCA,
131 BTL_OPENIB_RQ_SOURCE_DEVICE_INI,
132 BTL_OPENIB_RQ_SOURCE_MAX
133 } btl_openib_receive_queues_source_t;
139 } btl_openib_device_type_t;
141 #if OPAL_HAVE_THREADS
143 typedef struct mca_btl_openib_srq_manager_t {
149 } mca_btl_openib_srq_manager_t;
209 uint32_t ib_pkey_val;
211 uint32_t ib_qp_ous_rd_atom;
213 uint32_t ib_min_rnr_timer;
215 uint32_t ib_retry_count;
216 uint32_t ib_rnr_retry;
217 uint32_t ib_max_rdma_dst_ops;
218 uint32_t ib_service_level;
219 #if (ENABLE_DYNAMIC_SL)
220 uint32_t ib_path_record_service_level;
222 int32_t use_eager_rdma;
224 int32_t eager_rdma_num;
225 int32_t max_eager_rdma;
226 uint32_t btls_per_lid;
231 #if OPAL_HAVE_THREADS
232 int32_t error_counter;
234 int async_comp_pipe[2];
235 pthread_t async_thread;
236 uint32_t use_async_event_thread;
237 mca_btl_openib_srq_manager_t srq_manager;
238 #if BTL_OPENIB_FAILOVER_ENABLED
239 uint32_t port_error_failover;
242 btl_openib_device_type_t device_type;
244 char **if_include_list;
246 char **if_exclude_list;
247 char *ipaddr_include;
248 char *ipaddr_exclude;
251 char *receive_queues;
253 btl_openib_receive_queues_source_t receive_queues_source;
274 bool use_message_coalescing;
275 uint32_t cq_poll_ratio;
276 uint32_t cq_poll_progress;
277 uint32_t eager_rdma_poll_ratio;
278 #ifdef HAVE_IBV_FORK_INIT
280 int want_fork_support;
297 #if BTL_OPENIB_FAILOVER_ENABLED
298 int verbose_failover;
328 #define MCA_BTL_OPENIB_MODEX_MSG_NTOH(hdr) \
330 (hdr).subnet_id = ntoh64((hdr).subnet_id); \
331 (hdr).lid = ntohs((hdr).lid); \
333 #define MCA_BTL_OPENIB_MODEX_MSG_HTON(hdr) \
335 (hdr).subnet_id = hton64((hdr).subnet_id); \
336 (hdr).lid = htons((hdr).lid); \
348 struct ibv_device *ib_dev;
349 #if OMPI_ENABLE_PROGRESS_THREADS == 1
350 struct ibv_comp_channel *ib_channel;
352 volatile bool progress;
355 struct ibv_context *ib_dev_context;
356 struct ibv_device_attr ib_dev_attr;
357 struct ibv_pd *ib_pd;
358 struct ibv_cq *ib_cq[2];
364 uint8_t use_eager_rdma;
368 uint16_t hp_cq_polls;
369 uint16_t eager_rdma_polls;
371 #if OPAL_HAVE_THREADS
372 volatile bool got_fatal_event;
373 volatile bool got_port_event;
376 struct ibv_xrc_domain *xrc_domain;
379 int32_t non_eager_rdma_endpoints;
380 int32_t eager_rdma_buffers_count;
387 uint32_t max_inline_data;
450 struct ibv_port_attr ib_port_attr;
476 #if OMPI_ENABLE_PROGRESS_THREADS == 1
488 int mca_btl_openib_register_error_cb(
501 extern int mca_btl_openib_finalize(
518 extern int mca_btl_openib_add_procs(
536 extern int mca_btl_openib_del_procs(
552 extern int mca_btl_openib_send(
556 mca_btl_base_tag_t tag
581 mca_btl_base_tag_t tag,
592 extern int mca_btl_openib_put(
605 extern int mca_btl_openib_get(
632 extern int mca_btl_openib_free(
671 extern void mca_btl_openib_frag_progress_pending_put_get(
680 extern int mca_btl_openib_ft_event(
int state);
687 void mca_btl_openib_show_init_error(
const char *file,
int line,
688 const char *func,
const char *dev);
690 #define BTL_OPENIB_HP_CQ 0
691 #define BTL_OPENIB_LP_CQ 1
709 const char* btl_openib_get_transport_name(mca_btl_openib_transport_type_t transport_type);
717 static inline int qp_cq_prio(
const int qp)
720 return BTL_OPENIB_HP_CQ;
724 return (mca_btl_openib_component.qp_infos[qp].size <=
726 BTL_OPENIB_HP_CQ : BTL_OPENIB_LP_CQ;
729 #define BTL_OPENIB_RDMA_QP(QP) \
730 ((QP) == mca_btl_openib_component.rdma_qp)
uint32_t use_srq
Use the Shared Receive Queue (SRQ mode)
Definition: btl_openib.h:204
struct mca_btl_openib_module_t ** openib_btls
array of available BTLs
Definition: btl_openib.h:161
opal_list_t pending_frags[2]
list of high/low prio frags
Definition: btl_openib.h:400
Definition: opal_hash_table.h:42
Definition: btl_openib.h:152
Definition: btl_openib.h:470
A descriptor that holds the parameters to a send/put/get operation along w/ a callback routine that i...
Definition: btl.h:275
OPAL output stream facility.
bool srq_limit_event_flag
The flag indicates whether we want to get the IBV_EVENT_SRQ_LIMIT_REACHED events for dynamically resizing SRQ...
Definition: btl_openib.h:410
ompi_free_list_t recv_free
free lists of receive buffer descriptors
Definition: btl_openib.h:341
Definition: btl_openib.h:99
dynamic pointer array
Definition: opal_pointer_array.h:45
Definition: btl_openib.h:346
Definition: opal_bitmap.h:53
Definition: btl_openib.h:94
Definition: btl_openib.h:395
mca_btl_base_component_2_0_0_t super
base BTL component
Definition: btl_openib.h:153
bool enable_srq_resize
Whether we want a dynamically resizing srq, enabled by default.
Definition: btl_openib.h:296
Definition: btl_openib.h:339
opal_event_t ib_send_event
event structure for sends
Definition: btl_openib.h:179
Structure to represent a single event.
Definition: event_struct.h:87
bool cpc_explicitly_defined
free list of frags only; used for pinning user memory
Definition: btl_openib.h:284
mca_btl_openib_modex_message_t port_info
Common information about all ports.
Definition: btl_openib.h:439
bool warn_no_device_params_found
Whether we want a warning if no device-specific parameters are found in INI files.
Definition: btl_openib.h:263
uint8_t num_cpcs
Number of elements in the cpcs array.
Definition: btl_openib.h:445
void(* mca_btl_base_module_error_cb_fn_t)(struct mca_btl_base_module_t *btl, int32_t flags, struct ompi_proc_t *errproc, char *btlinfo)
Callback function that is called asynchronously on receipt of an error from the transport layer...
Definition: btl.h:538
Definition: mutex_unix.h:53
Definition: btl_openib.h:422
int gid_index
GID index to use.
Definition: btl_openib.h:294
int ib_num_btls
number of devices available to the openib component
Definition: btl_openib.h:158
uint8_t num_srq_qps
number of srq qp's
Definition: btl_openib.h:192
char * default_recv_qps
Default receive queues.
Definition: btl_openib.h:292
char * device_params_file_names
Colon-delimited list of filenames for device parameters.
Definition: btl_openib.h:256
uint8_t end
Dummy field used to calculate the real length.
Definition: btl_openib.h:325
IB BTL Interface.
Definition: btl_openib.h:432
See opal_bitmap.h for an explanation of why there is a split between OPAL and ORTE for this generic c...
Remote Open MPI process structure.
Definition: proc.h:56
uint8_t num_qps
total number of qp's
Definition: btl_openib.h:194
opal_list_t ib_procs
list of ib proc structures
Definition: btl_openib.h:176
uint8_t num_xrc_qps
number of xrc qp's
Definition: btl_openib.h:193
opal_pointer_array_t devices
array of available devices
Definition: btl_openib.h:164
int32_t ib_max_inline_data
Max size of inline data.
Definition: btl_openib.h:208
int ib_free_list_num
initial size of free lists
Definition: btl_openib.h:167
bool verbose
Whether we're in verbose mode or not.
Definition: btl_openib.h:259
ompi_free_list_t send_free
free lists of send buffer descriptors
Definition: btl_openib.h:340
uint16_t lid
LID of this port.
Definition: btl_openib.h:313
Common information for all ports that is sent in the modex message.
Definition: btl_openib.h:309
size_t max_send_size
Maximum send size, in Bytes.
Definition: btl_openib.h:201
int apm_port
Alternative port that may be used for APM.
Definition: btl_openib.h:452
char * ib_mpool_name
name of ib memory pool
Definition: btl_openib.h:188
ompi_free_list_t send_user_free
free list of frags only; used for pinning user memory
Definition: btl_openib.h:286
Byte Transfer Layer (BTL)
opal_hash_table_t ib_addr_table
used only for XRC: hash table that keeps a table of all lids/subnets
Definition: btl_openib.h:196
uint32_t reg_mru_len
Length of the registration cache most recently used list.
Definition: btl_openib.h:203
Struct for holding CPC module and associated meta data.
Definition: connect.h:328
uint8_t transport_type
Transport type of remote port.
Definition: btl_openib.h:323
Definition: ompi_free_list.h:39
A hash table that may be indexed with either fixed length (e.g.
ompi_free_list_t recv_user_free
frags for coalesced messages
Definition: btl_openib.h:288
uint32_t vendor_part_id
vendor part id defines device type and tuning
Definition: btl_openib.h:321
mca_btl_base_module_error_cb_fn_t error_cb
error handler
Definition: btl_openib.h:462
volatile int32_t eager_rdma_channels
number of open RDMA channels
Definition: btl_openib.h:460
State of ELAN endpoint connection.
Definition: btl_elan_endpoint.h:33
bool warn_default_gid_prefix
Whether we want a warning if a non-default GID prefix is not configured on a multiport setup...
Definition: btl_openib.h:266
int ib_free_list_max
maximum size of free lists
Definition: btl_openib.h:170
BTL component descriptor.
Definition: btl.h:411
Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana University Research and Techno...
Base object.
Definition: opal_object.h:182
Definition: opal_convertor.h:90
uint8_t port_num
ID of the PORT.
Definition: btl_openib.h:448
size_t eager_rdma_frag_size
length of eager frag
Definition: btl_openib.h:459
opal_mutex_t ib_lock
module level lock
Definition: btl_openib.h:457
uint32_t buffer_alignment
Preferred communication buffer alignment in Bytes (must be power of two)
Definition: btl_openib.h:230
Definition: opal_list.h:147
opal_mutex_t ib_lock
lock for accessing module state
Definition: btl_openib.h:185
struct mca_btl_base_endpoint_t ** eager_rdma_buffers
frags for control messages
Definition: btl_openib.h:381
Definition: btl_openib.h:391
char ** if_list
Dummy argv-style list; a copy of names from the if_[in|ex]clude list that we use for error checking (...
Definition: btl_openib.h:273
int ib_max_btls
maximum number of devices available to openib component
Definition: btl_openib.h:155
uint32_t max_hw_msg_size
Maximum message size for RDMA protocols in Bytes.
Definition: btl_openib.h:202
int32_t eager_rdma_threshold
After this number of msg, use RDMA for short messages, always.
Definition: btl_openib.h:223
uint8_t src_path_bits
offset from base lid (for LMC)
Definition: btl_openib.h:453
size_t eager_limit
Eager send limit of first fragment, in Bytes.
Definition: btl_openib.h:200
uint16_t lid
lid that is actually used (for LMC)
Definition: btl_openib.h:451
uint64_t subnet_id
The subnet ID of this port.
Definition: btl_openib.h:311
opal_event_t ib_recv_event
event structure for recvs
Definition: btl_openib.h:182
int ib_free_list_inc
number of elements to alloc when growing free lists
Definition: btl_openib.h:173
opal_pointer_array_t * endpoints
number of btls using this device
Definition: btl_openib.h:366
uint8_t num_pp_qps
number of pp qp's
Definition: btl_openib.h:191
BTL module interface functions and attributes.
Definition: btl.h:786
bool warn_nonexistent_if
Whether we want a warning if the user specifies a non-existent device and/or port via btl_openib_if_[...
Definition: btl_openib.h:269
Definition: btl_openib.h:109
int32_t rd_curr_num
The number of receive buffers that can be posted at the current time.
Definition: btl_openib.h:404
uint16_t apm_lid
APM LID for this port.
Definition: btl_openib.h:315
int32_t rd_low_local
We post additional WQEs only if the number of WQEs (in a specific SRQ) is less than this value...
Definition: btl_openib.h:407
ompi_btl_openib_connect_base_module_t ** cpcs
Array of CPCs on this port.
Definition: btl_openib.h:442
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236
uint32_t ib_cq_size[2]
Max outstanding CQE on the CQ.
Definition: btl_openib.h:206
uint8_t mtu
The MTU used by this port.
Definition: btl_openib.h:317
uint32_t vendor_id
vendor id defines device type and tuning
Definition: btl_openib.h:319
mpool module descriptor.
Definition: mpool.h:174