OpenMPI  0.1.1
btl_openib.h
1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
2 /*
3  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
4  * University Research and Technology
5  * Corporation. All rights reserved.
6  * Copyright (c) 2004-2009 The University of Tennessee and The University
7  * of Tennessee Research Foundation. All rights
8  * reserved.
9  * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
10  * University of Stuttgart. All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  * All rights reserved.
13  * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
14  * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
15  * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
16  * reserved.
17  * Copyright (c) 2006-2007 Voltaire All rights reserved.
18  * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
19  * $COPYRIGHT$
20  *
21  * Additional copyrights may follow
22  *
23  * $HEADER$
24  *
25  * @file
26  */
27 
28 #ifndef MCA_BTL_IB_H
29 #define MCA_BTL_IB_H
30 
31 #include "ompi_config.h"
32 #include <sys/types.h>
33 #include <string.h>
34 #include <infiniband/verbs.h>
35 
36 /* Open MPI includes */
37 #include "ompi/class/ompi_free_list.h"
40 #include "opal/util/output.h"
41 #include "opal/mca/event/event.h"
42 #include "opal/threads/threads.h"
43 #include "ompi/mca/btl/btl.h"
44 #include "ompi/mca/mpool/mpool.h"
45 #include "ompi/mca/btl/base/btl_base_error.h"
46 
47 #include "ompi/mca/btl/btl.h"
48 #include "ompi/mca/btl/base/base.h"
49 
50 #include "connect/connect.h"
51 
52 BEGIN_C_DECLS
53 
/* Feature switches: each evaluates to 1 only when the corresponding
   OMPI_* configuration macro is exactly 1. HAVE_XRC and ENABLE_DYNAMIC_SL
   are tested with #if further down in this header. */
54 #define HAVE_XRC (1 == OMPI_HAVE_CONNECTX_XRC)
55 #define ENABLE_DYNAMIC_SL (1 == OMPI_ENABLE_DYNAMIC_SL)
56 
/* NOTE(review): the three constants below are not used in this header;
   the descriptions are inferred from their names and values -- confirm. */
57 #define MCA_BTL_IB_LEAVE_PINNED 1
/* Presumably the default (link-local) GID prefix -- TODO confirm */
58 #define IB_DEFAULT_GID_PREFIX 0xfe80000000000000ll
/* Mask that keeps the low 15 bits of a value (presumably of a partition key) */
59 #define MCA_BTL_IB_PKEY_MASK 0x7fff
60 
61 
62 /*--------------------------------------------------------------------*/
63 
/*
 * ATTACH(): debugging aid.
 *
 * In debug builds (OPAL_ENABLE_DEBUG non-zero) it prints a notice through
 * opal_output() and then loops, sleeping 5 seconds per pass, until the
 * local flag `i` becomes non-zero. Nothing in the macro ever sets the
 * flag, so it has to be changed from outside, e.g. from an attached
 * debugger. In non-debug builds the macro expands to nothing.
 *
 * - The flag is volatile so that every pass of the loop re-reads it;
 *   without the qualifier the compiler may assume it never changes.
 * - The expansion deliberately does NOT end in a semicolon: the caller
 *   writes `ATTACH();`, which is then exactly one statement and can be
 *   used safely in an unbraced if/else.
 *
 * NOTE(review): the debug expansion calls sleep(), so the including file
 * needs <unistd.h> in scope.
 */
#if OPAL_ENABLE_DEBUG
#define ATTACH() do { \
        volatile int i = 0; \
        opal_output(0, "WAITING TO DEBUG ATTACH"); \
        while (i == 0) sleep(5); \
    } while(0)
#else
#define ATTACH()
#endif
73 
74 /*--------------------------------------------------------------------*/
75 
76 /**
77  * Infiniband (IB) BTL component.
78  */
79 
/**
 * Transport type of a port. Values of this type are accepted by
 * btl_openib_get_transport_name() and returned by
 * mca_btl_openib_get_transport_type(), both declared later in this header.
 */
80 typedef enum {
81  MCA_BTL_OPENIB_TRANSPORT_IB, /**< InfiniBand */
82  MCA_BTL_OPENIB_TRANSPORT_IWARP, /**< iWARP */
83  MCA_BTL_OPENIB_TRANSPORT_RDMAOE, /**< presumably RDMA over Ethernet -- TODO confirm */
84  MCA_BTL_OPENIB_TRANSPORT_UNKNOWN, /**< presumably: transport could not be determined -- confirm */
85  MCA_BTL_OPENIB_TRANSPORT_SIZE /**< last enumerator; its value equals the number of entries above */
86 } mca_btl_openib_transport_type_t;
87 
/**
 * Kind of a queue pair (QP). The kind of QP number Q is read from
 * mca_btl_openib_component.qp_infos[Q].type, see the
 * BTL_OPENIB_QP_TYPE*() macros in this header.
 */
88 typedef enum {
89  MCA_BTL_OPENIB_PP_QP, /**< "pp" QP (presumably per-peer -- TODO confirm) */
90  MCA_BTL_OPENIB_SRQ_QP, /**< "srq" QP (Shared Receive Queue) */
91  MCA_BTL_OPENIB_XRC_QP /**< "xrc" QP */
92 } mca_btl_openib_qp_type_t;
93 
95  int32_t rd_win;
96  int32_t rd_rsv;
98 
100  int32_t sd_max;
101  /* The init value for rd_curr_num variables of all SRQs */
102  int32_t rd_init;
103  /* The watermark (threshold): if the number of WQEs in the SRQ is less than this value,
104  the SRQ limit event (IBV_EVENT_SRQ_LIMIT_REACHED) will be generated on the corresponding SRQ.
105  As a result, the maximal number of pre-posted WQEs on the SRQ will be increased */
106  int32_t srq_limit;
108 
110  mca_btl_openib_qp_type_t type;
111  size_t size;
112  int32_t rd_num;
113  int32_t rd_low;
114  union {
117  } u;
119 
/* Type (mca_btl_openib_qp_type_t) of QP number Q, read from the
   component-wide qp_infos table. Q is used as an array index and is not
   range-checked here. */
120 #define BTL_OPENIB_QP_TYPE(Q) (mca_btl_openib_component.qp_infos[(Q)].type)
/* Non-zero when QP number Q is of type MCA_BTL_OPENIB_PP_QP */
121 #define BTL_OPENIB_QP_TYPE_PP(Q) \
122  (BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_PP_QP)
/* Non-zero when QP number Q is of type MCA_BTL_OPENIB_SRQ_QP */
123 #define BTL_OPENIB_QP_TYPE_SRQ(Q) \
124  (BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_SRQ_QP)
/* Non-zero when QP number Q is of type MCA_BTL_OPENIB_XRC_QP */
125 #define BTL_OPENIB_QP_TYPE_XRC(Q) \
126  (BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_XRC_QP)
127 
/**
 * Origin of the receive_queues setting; this is the type of the
 * component's receive_queues_source field.
 */
128 typedef enum {
129  BTL_OPENIB_RQ_SOURCE_DEFAULT, /**< presumably: the built-in default value is in use -- confirm */
130  BTL_OPENIB_RQ_SOURCE_MCA, /**< presumably: set via the btl_openib_receive_queues MCA param -- confirm */
131  BTL_OPENIB_RQ_SOURCE_DEVICE_INI, /**< presumably: taken from a device-parameters INI file -- confirm */
132  BTL_OPENIB_RQ_SOURCE_MAX /**< last enumerator; its value equals the number of entries above */
133 } btl_openib_receive_queues_source_t;
134 
/**
 * Device-type selector; this is the type of the component's device_type
 * field. NOTE(review): how the value is applied is not visible in this
 * header.
 */
135 typedef enum {
136  BTL_OPENIB_DT_IB, /**< InfiniBand devices */
137  BTL_OPENIB_DT_IWARP, /**< iWARP devices */
138  BTL_OPENIB_DT_ALL /**< presumably: no restriction on device type -- confirm */
139 } btl_openib_device_type_t;
140 
141 #if OPAL_HAVE_THREADS
142 /* Structure used to manage all BTL SRQs (only declared when OPAL_HAVE_THREADS is set) */
143 typedef struct mca_btl_openib_srq_manager_t {
144  opal_mutex_t lock; /* NOTE(review): presumably protects srq_addr_table -- confirm */
145  /* The keys of this hash table are the addresses of
146  SRQ structures, and the values are pointers to the
147  BTL modules that are associated with these SRQs */
148  opal_hash_table_t srq_addr_table;
149 } mca_btl_openib_srq_manager_t;
150 #endif
151 
153  mca_btl_base_component_2_0_0_t super; /**< base BTL component */
154 
156  /**< maximum number of devices available to openib component */
157 
159  /**< number of devices available to the openib component */
160 
162  /**< array of available BTLs */
163 
164  opal_pointer_array_t devices; /**< array of available devices */
165  int devices_count;
166 
168  /**< initial size of free lists */
169 
171  /**< maximum size of free lists */
172 
174  /**< number of elements to alloc when growing free lists */
175 
177  /**< list of ib proc structures */
178 
180  /**< event structure for sends */
181 
183  /**< event structure for recvs */
184 
186  /**< lock for accessing module state */
187 
189  /**< name of ib memory pool */
190 
191  uint8_t num_pp_qps; /**< number of pp qp's */
192  uint8_t num_srq_qps; /**< number of srq qp's */
193  uint8_t num_xrc_qps; /**< number of xrc qp's */
194  uint8_t num_qps; /**< total number of qp's */
195 
196  opal_hash_table_t ib_addr_table; /**< used only for xrc.hash-table that
197  keeps table of all lids/subnets */
198  mca_btl_openib_qp_info_t* qp_infos;
199 
200  size_t eager_limit; /**< Eager send limit of first fragment, in Bytes */
201  size_t max_send_size; /**< Maximum send size, in Bytes */
202  uint32_t max_hw_msg_size;/**< Maximum message size for RDMA protocols in Bytes */
203  uint32_t reg_mru_len; /**< Length of the registration cache most recently used list */
204  uint32_t use_srq; /**< Use the Shared Receive Queue (SRQ mode) */
205 
206  uint32_t ib_cq_size[2]; /**< Max outstanding CQE on the CQ */
207 
208  int32_t ib_max_inline_data; /**< Max size of inline data */
209  uint32_t ib_pkey_val;
210  uint32_t ib_psn;
211  uint32_t ib_qp_ous_rd_atom;
212  uint32_t ib_mtu;
213  uint32_t ib_min_rnr_timer;
214  uint32_t ib_timeout;
215  uint32_t ib_retry_count;
216  uint32_t ib_rnr_retry;
217  uint32_t ib_max_rdma_dst_ops;
218  uint32_t ib_service_level;
219 #if (ENABLE_DYNAMIC_SL)
220  uint32_t ib_path_record_service_level;
221 #endif
222  int32_t use_eager_rdma;
223  int32_t eager_rdma_threshold; /**< After this number of msg, use RDMA for short messages, always */
224  int32_t eager_rdma_num;
225  int32_t max_eager_rdma;
226  uint32_t btls_per_lid;
227  uint32_t max_lmc;
228  int32_t apm_lmc;
229  int32_t apm_ports;
230  uint32_t buffer_alignment; /**< Preferred communication buffer alignment in Bytes (must be power of two) */
231 #if OPAL_HAVE_THREADS
232  int32_t error_counter; /**< Counts number on error events that we got on all devices */
233  int async_pipe[2]; /**< Pipe for comunication with async event thread */
234  int async_comp_pipe[2]; /**< Pipe for async thread comunication with main thread */
235  pthread_t async_thread; /**< Async thread that will handle fatal errors */
236  uint32_t use_async_event_thread; /**< Use the async event handler */
237  mca_btl_openib_srq_manager_t srq_manager; /**< Hash table for all BTL SRQs */
238 #if BTL_OPENIB_FAILOVER_ENABLED
239  uint32_t port_error_failover; /**< Report port errors to speed up failover */
240 #endif
241 #endif
242  btl_openib_device_type_t device_type;
243  char *if_include;
244  char **if_include_list;
245  char *if_exclude;
246  char **if_exclude_list;
247  char *ipaddr_include;
248  char *ipaddr_exclude;
249 
250  /* MCA param btl_openib_receive_queues */
251  char *receive_queues;
252  /* Whether we got a non-default value of btl_openib_receive_queues */
253  btl_openib_receive_queues_source_t receive_queues_source;
254 
255  /** Colon-delimited list of filenames for device parameters */
257 
258  /** Whether we're in verbose mode or not */
259  bool verbose;
260 
261  /** Whether we want a warning if no device-specific parameters are
262  found in INI files */
264  /** Whether we want a warning if non default GID prefix is not configured
265  on multiport setup */
267  /** Whether we want a warning if the user specifies a non-existent
268  device and/or port via btl_openib_if_[in|ex]clude MCA params */
270  /** Dummy argv-style list; a copy of names from the
271  if_[in|ex]clude list that we use for error checking (to ensure
272  that they all exist) */
273  char **if_list;
274  bool use_message_coalescing;
275  uint32_t cq_poll_ratio;
276  uint32_t cq_poll_progress;
277  uint32_t eager_rdma_poll_ratio;
278 #ifdef HAVE_IBV_FORK_INIT
279  /** Whether we want fork support or not */
280  int want_fork_support;
281 #endif
282  int rdma_qp;
283  int credits_qp; /* qp used for software flow control */
285  /**< free list of frags only; used for pinning user memory */
287  /**< free list of frags only; used for pinning user memory */
289  /**< frags for coalesced messages */
290  ompi_free_list_t send_free_coalesced;
291  /** Default receive queues */
293  /** GID index to use */
295  /** Whether we want a dynamically resizing srq, enabled by default */
297 #if BTL_OPENIB_FAILOVER_ENABLED
298  int verbose_failover;
299 #endif
301 
/* The openib component object, defined elsewhere. The BTL_OPENIB_QP_TYPE*()
   and BTL_OPENIB_RDMA_QP() macros and qp_cq_prio() in this header read
   their configuration from it. */
302 OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
303 
305 
306 /**
307  * Common information for all ports that is sent in the modex message
308  */
310  /** The subnet ID of this port */
311  uint64_t subnet_id;
312  /** LID of this port */
313  uint16_t lid;
314  /** APM LID for this port */
315  uint16_t apm_lid;
316  /** The MTU used by this port */
317  uint8_t mtu;
318  /** vendor id define device type and tuning */
319  uint32_t vendor_id;
320  /** vendor part id define device type and tuning */
321  uint32_t vendor_part_id;
322  /** Transport type of remote port */
323  uint8_t transport_type;
324  /** Dummy field used to calculate the real length */
325  uint8_t end;
327 
/* Convert a modex message from network to host byte order, in place.
   Only subnet_id (via ntoh64) and lid (via ntohs) are converted.
   hdr is expanded several times, so pass a plain lvalue without side effects.
   NOTE(review): no other member of hdr is byte-swapped here -- confirm
   that this is intended for any remaining multi-byte members. */
328 #define MCA_BTL_OPENIB_MODEX_MSG_NTOH(hdr) \
329  do { \
330  (hdr).subnet_id = ntoh64((hdr).subnet_id); \
331  (hdr).lid = ntohs((hdr).lid); \
332  } while (0)
/* Inverse of MCA_BTL_OPENIB_MODEX_MSG_NTOH: convert subnet_id (hton64) and
   lid (htons) from host to network byte order, in place. The same
   multiple-evaluation caveat applies. */
333 #define MCA_BTL_OPENIB_MODEX_MSG_HTON(hdr) \
334  do { \
335  (hdr).subnet_id = hton64((hdr).subnet_id); \
336  (hdr).lid = htons((hdr).lid); \
337  } while (0)
338 
340  ompi_free_list_t send_free; /**< free lists of send buffer descriptors */
341  ompi_free_list_t recv_free; /**< free lists of receive buffer descriptors */
343 
345 
346 typedef struct mca_btl_openib_device_t {
347  opal_object_t super;
348  struct ibv_device *ib_dev; /* the ib device */
349 #if OMPI_ENABLE_PROGRESS_THREADS == 1
350  struct ibv_comp_channel *ib_channel; /* Channel event for the device */
351  opal_thread_t thread; /* Progress thread */
352  volatile bool progress; /* Progress status */
353 #endif
354  opal_mutex_t device_lock; /* device level lock */
355  struct ibv_context *ib_dev_context;
356  struct ibv_device_attr ib_dev_attr;
357  struct ibv_pd *ib_pd;
358  struct ibv_cq *ib_cq[2];
359  uint32_t cq_size[2];
361  /* MTU for this device */
362  uint32_t mtu;
363  /* Whether this device supports eager RDMA */
364  uint8_t use_eager_rdma;
365  uint8_t btls; /** < number of btls using this device */
367  opal_pointer_array_t *device_btls;
368  uint16_t hp_cq_polls;
369  uint16_t eager_rdma_polls;
370  bool pollme;
371 #if OPAL_HAVE_THREADS
372  volatile bool got_fatal_event;
373  volatile bool got_port_event;
374 #endif
375 #if HAVE_XRC
376  struct ibv_xrc_domain *xrc_domain;
377  int xrc_fd;
378 #endif
379  int32_t non_eager_rdma_endpoints;
380  int32_t eager_rdma_buffers_count;
382  /**< frags for control messages */
383  ompi_free_list_t send_free_control;
384  /* QP types and attributes that will be used on this device */
386  /* Maximum value supported by this device for max_inline_data */
387  uint32_t max_inline_data;
390 
392  int32_t dummy;
394 
396  struct ibv_srq *srq;
397  int32_t rd_posted;
398  int32_t sd_credits; /* the max number of outstanding sends on a QP when using SRQ */
399  /* i.e. the number of frags that can be outstanding (down counter) */
400  opal_list_t pending_frags[2]; /**< list of high/low prio frags */
401  /** The number of receive buffers that may currently be posted.
402  The value may be increased in the IBV_EVENT_SRQ_LIMIT_REACHED
403  event handler. The value starts from (rd_num / 4) and is increased up to rd_num */
404  int32_t rd_curr_num;
405  /** We post additional WQEs only if the number of WQEs (in the specific SRQ) is less than this value.
406  The value is increased together with rd_curr_num. The value is unique for every SRQ. */
407  int32_t rd_low_local;
408  /** The flag points if we want to get the
409  IBV_EVENT_SRQ_LIMIT_REACHED events for dynamically resizing SRQ */
411  /**< Unlike the "--mca enable_srq_resize" parameter, which says whether we want
412  to start with a small number of pre-posted receive buffers (rd_curr_num) and to increase this number as needed
413  (the maximum of this value is rd_num, the whole size of the SRQ), the "srq_limit_event_flag" says whether we want to get the limit event
414  from the device when the defined SRQ limit is reached (a signal to the main thread); we clear this flag once rd_curr_num
415  has been increased up to rd_num.
416  In order to avoid lock/unlock operations in the critical path, we only ever set
417  the srq_limit_event_flag in the asynchronous thread, because this way we post receive buffers
418  in the main thread only, and only after posting do we set (if srq_limit_event_flag is true)
419  the limit for the IBV_EVENT_SRQ_LIMIT_REACHED event. */
421 
423  union {
426  } u;
428 
429 /**
430  * IB BTL Interface
431  */
433  /* Base BTL module */
434  mca_btl_base_module_t super;
435 
436  bool btl_inited;
437 
438  /** Common information about all ports */
440 
441  /** Array of CPCs on this port */
443 
444  /** Number of elements in the cpcs array */
445  uint8_t num_cpcs;
446 
447  mca_btl_openib_device_t *device;
448  uint8_t port_num; /**< ID of the PORT */
449  uint16_t pkey_index;
450  struct ibv_port_attr ib_port_attr;
451  uint16_t lid; /**< lid that is actually used (for LMC) */
452  int apm_port; /**< Alternative port that may be used for APM */
453  uint8_t src_path_bits; /**< offset from base lid (for LMC) */
454 
455  int32_t num_peers;
456 
457  opal_mutex_t ib_lock; /**< module level lock */
458 
459  size_t eager_rdma_frag_size; /**< length of eager frag */
460  volatile int32_t eager_rdma_channels; /**< number of open RDMA channels */
461 
463 
465 };
467 
468 extern mca_btl_openib_module_t mca_btl_openib_module;
469 
472  struct ibv_mr *mr;
473 };
475 
/* Declared only when progress threads are enabled. NOTE(review): presumably
   the function run by the per-device progress thread -- confirm against
   the definition. */
476 #if OMPI_ENABLE_PROGRESS_THREADS == 1
477 extern void* mca_btl_openib_progress_thread(opal_object_t*);
478 #endif
479 
480 
481 /**
482  * Register a callback function that is called on error.
483  *
484  * @param btl (IN) BTL module
485  * @return Status indicating if cleanup was successful
486  */
487 
488 int mca_btl_openib_register_error_cb(
489  struct mca_btl_base_module_t* btl,
491 );
492 
493 
494 /**
495  * Cleanup any resources held by the BTL.
496  *
497  * @param btl BTL instance.
498  * @return OMPI_SUCCESS or error status on failure.
499  */
500 
501 extern int mca_btl_openib_finalize(
502  struct mca_btl_base_module_t* btl
503 );
504 
505 
506 /**
507  * PML->BTL notification of change in the process list.
508  *
509  * @param btl (IN) BTL module
510  * @param nprocs (IN) Number of processes
511  * @param procs (IN) Set of processes
512  * @param peers (OUT) Set of (optional) peer addressing info.
513  * @param reachable (IN/OUT) Set of processes that are reachable via this BTL.
514  * @return OMPI_SUCCESS or error status on failure.
515  *
516  */
517 
518 extern int mca_btl_openib_add_procs(
519  struct mca_btl_base_module_t* btl,
520  size_t nprocs,
521  struct ompi_proc_t **procs,
522  struct mca_btl_base_endpoint_t** peers,
523  opal_bitmap_t* reachable
524 );
525 
526 /**
527  * PML->BTL notification of change in the process list (judging by the name, the removal counterpart of mca_btl_openib_add_procs).
528  *
529  * @param btl (IN) BTL instance
530  * @param nprocs (IN) Number of processes.
531  * @param procs (IN) Set of processes.
532  * @param peers (IN) Set of peer data structures.
533  * @return Status indicating if cleanup was successful
534  *
535  */
536 extern int mca_btl_openib_del_procs(
537  struct mca_btl_base_module_t* btl,
538  size_t nprocs,
539  struct ompi_proc_t **procs,
540  struct mca_btl_base_endpoint_t** peers
541 );
542 
543 
544 /**
545  * PML->BTL Initiate a send of the specified size.
546  *
547  * @param btl (IN) BTL instance
548  * @param btl_peer (IN) BTL peer addressing
549  * @param descriptor (IN) Descriptor of data to be transmitted.
550  * @param tag (IN) Tag.
 * @return int status; the meaning of the individual values is not stated
 *         in this header (presumably OMPI_SUCCESS or an error code -- confirm).
551  */
552 extern int mca_btl_openib_send(
553  struct mca_btl_base_module_t* btl,
554  struct mca_btl_base_endpoint_t* btl_peer,
555  struct mca_btl_base_descriptor_t* descriptor,
556  mca_btl_base_tag_t tag
557 );
558 
559 /**
560  * PML->BTL Initiate an immediate send of the specified size.
561  *
562  * @param btl (IN) BTL instance
563  * @param ep (IN) Endpoint
564  * @param convertor (IN) Datatype convertor
565  * @param header (IN) PML header
566  * @param header_size (IN) PML header size
567  * @param payload_size (IN) Payload size
568  * @param order (IN) Order
569  * @param flags (IN) Flags
570  * @param tag (IN) Tag
571  * @param descriptor (OUT) Message descriptor
 * @return int status; the meaning of the individual values is not stated
 *         in this header.
572  */
573 extern int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
574  struct mca_btl_base_endpoint_t* ep,
575  struct opal_convertor_t* convertor,
576  void* header,
577  size_t header_size,
578  size_t payload_size,
579  uint8_t order,
580  uint32_t flags,
581  mca_btl_base_tag_t tag,
582  mca_btl_base_descriptor_t** descriptor
583 );
584 
585 /**
586  * PML->BTL Initiate a put of the specified size.
587  *
588  * @param btl (IN) BTL instance
589  * @param btl_peer (IN) BTL peer addressing
590  * @param descriptor (IN) Descriptor of data to be transmitted.
 * @return int status; the meaning of the individual values is not stated
 *         in this header.
591  */
592 extern int mca_btl_openib_put(
593  struct mca_btl_base_module_t* btl,
594  struct mca_btl_base_endpoint_t* btl_peer,
595  struct mca_btl_base_descriptor_t* descriptor
596  );
597 
598 /**
599  * PML->BTL Initiate a get of the specified size.
600  *
601  * @param btl (IN) BTL instance
602  * @param btl_peer (IN) BTL peer addressing
603  * @param descriptor (IN) Descriptor of data to be transmitted.
 * @return int status; the meaning of the individual values is not stated
 *         in this header.
604  */
605 extern int mca_btl_openib_get(
606  struct mca_btl_base_module_t* btl,
607  struct mca_btl_base_endpoint_t* btl_peer,
608  struct mca_btl_base_descriptor_t* descriptor
609  );
610 
611 
612 /**
613  * Allocate a descriptor.
614  *
615  * @param btl (IN) BTL module
 * @param endpoint (IN) BTL endpoint (role not described in this header)
 * @param order (IN) Ordering value (semantics not described in this
 *        header; see the BTL interface in ompi/mca/btl/btl.h)
616  * @param size (IN) Requested descriptor size.
 * @param flags (IN) Flags (semantics not described in this header)
 * @return Pointer to a descriptor. NOTE(review): presumably NULL on
 *         failure -- confirm against the implementation.
617  */
618 extern mca_btl_base_descriptor_t* mca_btl_openib_alloc(
619  struct mca_btl_base_module_t* btl,
620  struct mca_btl_base_endpoint_t* endpoint,
621  uint8_t order,
622  size_t size,
623  uint32_t flags);
624 
625 
626 /**
627  * Return a segment allocated by this BTL.
628  *
629  * @param btl (IN) BTL module
630  * @param descriptor (IN) Allocated descriptor.
631  */
632 extern int mca_btl_openib_free(
633  struct mca_btl_base_module_t* btl,
635 
636 
637 /**
638  * Pack data and return a descriptor that can be
639  * used for send/put.
640  *
641  * @param btl (IN) BTL module
642  * @param peer (IN) BTL peer addressing
 *
 * The remaining parameters (registration, convertor, order, reserve, size,
 * flags) are not described in this header; see the BTL interface in
 * ompi/mca/btl/btl.h. NOTE(review): size is passed by pointer, so it is
 * presumably an IN/OUT value -- confirm.
 *
 * @return Pointer to a descriptor (presumably NULL on failure -- confirm).
643  */
644 mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
645  struct mca_btl_base_module_t* btl,
646  struct mca_btl_base_endpoint_t* peer,
647  mca_mpool_base_registration_t* registration,
648  struct opal_convertor_t* convertor,
649  uint8_t order,
650  size_t reserve,
651  size_t* size,
652  uint32_t flags
653  );
654 
655 /**
656  * Allocate a descriptor initialized for RDMA write.
657  *
658  * @param btl (IN) BTL module
659  * @param peer (IN) BTL peer addressing
 *
 * The remaining parameters (registration, convertor, order, reserve, size,
 * flags) are not described in this header; see the BTL interface in
 * ompi/mca/btl/btl.h. NOTE(review): size is passed by pointer, so it is
 * presumably an IN/OUT value -- confirm.
 *
 * @return Pointer to a descriptor (presumably NULL on failure -- confirm).
660  */
661 extern mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
662  struct mca_btl_base_module_t* btl,
663  struct mca_btl_base_endpoint_t* peer,
664  mca_mpool_base_registration_t* registration,
665  struct opal_convertor_t* convertor,
666  uint8_t order,
667  size_t reserve,
668  size_t* size,
669  uint32_t flags);
670 
/* NOTE(review): undocumented in this header. Judging by the name it
   progresses pending put/get fragments of the given endpoint; the meaning
   of the unnamed int argument is not visible here -- confirm against the
   definition. */
671 extern void mca_btl_openib_frag_progress_pending_put_get(
672  struct mca_btl_base_endpoint_t*, const int);
673 
674 /**
675  * Fault Tolerance Event Notification Function
676  *
677  * @param state (IN) Checkpoint State
678  * @return OMPI_SUCCESS or failure status
679  */
680 extern int mca_btl_openib_ft_event(int state);
681 
682 
683 /**
684  * Show an error during init, particularly when running out of
685  * registered memory.
 *
 * NOTE(review): the parameters are not described in this header. file,
 * line and func presumably identify the reporting source location and dev
 * the device concerned -- confirm against the implementation.
686  */
687 void mca_btl_openib_show_init_error(const char *file, int line,
688  const char *func, const char *dev);
689 
/* Completion-queue priority classes returned by qp_cq_prio():
   HP = high priority, LP = low priority. NOTE(review): the values 0/1
   presumably index the two-element CQ arrays (e.g. ib_cq[2]) -- confirm. */
690 #define BTL_OPENIB_HP_CQ 0
691 #define BTL_OPENIB_LP_CQ 1
692 
693 
694 /**
695  * Post to Shared Receive Queue
696  *
697  * @param openib_btl (IN) BTL module
698  * @param qp (IN) QP selector (presumably the QP index, as used with qp_infos[] -- TODO confirm)
699  * NOTE(review): older text documented "additional" and "prio" parameters; the prototype has neither.
700  * @return OMPI_SUCCESS or failure status
701  */
702 
703 int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp);
704 
705 /**
706  * Get a transport name of btl by its transport type.
 *
 * @param transport_type (IN) One of the mca_btl_openib_transport_type_t values
 * @return Name of the transport as a constant string. NOTE(review): the
 *         string's storage is presumably static (caller must not free
 *         it) -- confirm.
707  */
708 
709 const char* btl_openib_get_transport_name(mca_btl_openib_transport_type_t transport_type);
710 
711 /**
712  * Get a transport type of btl.
 *
 * @param openib_btl (IN) BTL module
 * @return One of the mca_btl_openib_transport_type_t values
713  */
714 
715 mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib_module_t* openib_btl);
716 
717 static inline int qp_cq_prio(const int qp)
718 {
719  if(0 == qp)
720  return BTL_OPENIB_HP_CQ; /* smallest qp is always HP */
721 
722  /* If the size for this qp is <= the eager limit, make it a
723  high priority QP. Otherwise, make it a low priority QP. */
724  return (mca_btl_openib_component.qp_infos[qp].size <=
725  mca_btl_openib_component.eager_limit) ?
726  BTL_OPENIB_HP_CQ : BTL_OPENIB_LP_CQ;
727 }
728 
/* Non-zero when QP equals the component's rdma_qp setting. */
729 #define BTL_OPENIB_RDMA_QP(QP) \
730  ((QP) == mca_btl_openib_component.rdma_qp)
731 
732 END_C_DECLS
733 
734 #endif /* MCA_BTL_IB_H */
uint32_t use_srq
Use the Shared Receive Queue (SRQ mode)
Definition: btl_openib.h:204
struct mca_btl_openib_module_t ** openib_btls
array of available BTLs
Definition: btl_openib.h:161
opal_list_t pending_frags[2]
list of high/low prio frags
Definition: btl_openib.h:400
Definition: opal_hash_table.h:42
Definition: btl_openib.h:152
Definition: btl_openib.h:470
A descriptor that holds the parameters to a send/put/get operation along w/ a callback routine that i...
Definition: btl.h:275
OPAL output stream facility.
bool srq_limit_event_flag
The flag points if we want to get the IBV_EVENT_SRQ_LIMIT_REACHED events for dynamically resizing SRQ...
Definition: btl_openib.h:410
ompi_free_list_t recv_free
free lists of receive buffer descriptors
Definition: btl_openib.h:341
Definition: btl_openib.h:99
dynamic pointer array
Definition: opal_pointer_array.h:45
Definition: btl_openib.h:346
Definition: opal_bitmap.h:53
Definition: base.h:44
Definition: btl_openib.h:94
Definition: btl_openib.h:395
mca_btl_base_component_2_0_0_t super
base BTL component
Definition: btl_openib.h:153
bool enable_srq_resize
Whether we want a dynamically resizing srq, enabled by default.
Definition: btl_openib.h:296
Definition: btl_openib.h:339
opal_event_t ib_send_event
event structure for sends
Definition: btl_openib.h:179
Structure to represent a single event.
Definition: event_struct.h:87
bool cpc_explicitly_defined
free list of frags only; used for pining user memory
Definition: btl_openib.h:284
mca_btl_openib_modex_message_t port_info
Common information about all ports.
Definition: btl_openib.h:439
bool warn_no_device_params_found
Whether we want a warning if no device-specific parameters are found in INI files.
Definition: btl_openib.h:263
uint8_t num_cpcs
Number of elements in the cpcs array.
Definition: btl_openib.h:445
void(* mca_btl_base_module_error_cb_fn_t)(struct mca_btl_base_module_t *btl, int32_t flags, struct ompi_proc_t *errproc, char *btlinfo)
Callback function that is called asynchronously on receipt of an error from the transport layer...
Definition: btl.h:538
Definition: mutex_unix.h:53
Definition: btl_openib.h:422
int gid_index
GID index to use.
Definition: btl_openib.h:294
int ib_num_btls
number of devices available to the openib component
Definition: btl_openib.h:158
uint8_t num_srq_qps
number of srq qp's
Definition: btl_openib.h:192
char * default_recv_qps
Default receive queues.
Definition: btl_openib.h:292
char * device_params_file_names
Colon-delimited list of filenames for device parameters.
Definition: btl_openib.h:256
uint8_t end
Dummy field used to calculate the real length.
Definition: btl_openib.h:325
IB BTL Interface.
Definition: btl_openib.h:432
Definition: mpool.h:44
See opal_bitmap.h for an explanation of why there is a split between OPAL and ORTE for this generic c...
Remote Open MPI process structure.
Definition: proc.h:56
uint8_t num_qps
total number of qp's
Definition: btl_openib.h:194
opal_list_t ib_procs
list of ib proc structures
Definition: btl_openib.h:176
uint8_t num_xrc_qps
number of xrc qp's
Definition: btl_openib.h:193
opal_pointer_array_t devices
array of available devices
Definition: btl_openib.h:164
int32_t ib_max_inline_data
Max size of inline data.
Definition: btl_openib.h:208
int ib_free_list_num
initial size of free lists
Definition: btl_openib.h:167
bool verbose
Whether we're in verbose mode or not.
Definition: btl_openib.h:259
ompi_free_list_t send_free
free lists of send buffer descriptors
Definition: btl_openib.h:340
uint16_t lid
LID of this port.
Definition: btl_openib.h:313
Common information for all ports that is sent in the modex message.
Definition: btl_openib.h:309
size_t max_send_size
Maximum send size, in Bytes.
Definition: btl_openib.h:201
int apm_port
Alternative port that may be used for APM.
Definition: btl_openib.h:452
char * ib_mpool_name
name of ib memory pool
Definition: btl_openib.h:188
ompi_free_list_t send_user_free
free list of frags only; used for pining user memory
Definition: btl_openib.h:286
Byte Transfer Layer (BTL)
opal_hash_table_t ib_addr_table
used only for xrc.hash-table that keeps table of all lids/subnets
Definition: btl_openib.h:196
uint32_t reg_mru_len
Length of the registration cache most recently used list.
Definition: btl_openib.h:203
Struct for holding CPC module and associated meta data.
Definition: connect.h:328
uint8_t transport_type
Transport type of remote port.
Definition: btl_openib.h:323
Definition: ompi_free_list.h:39
A hash table that may be indexed with either fixed length (e.g.
ompi_free_list_t recv_user_free
frags for coalesced massages
Definition: btl_openib.h:288
uint32_t vendor_part_id
vendor part id define device type and tuning
Definition: btl_openib.h:321
mca_btl_base_module_error_cb_fn_t error_cb
error handler
Definition: btl_openib.h:462
volatile int32_t eager_rdma_channels
number of open RDMA channels
Definition: btl_openib.h:460
State of ELAN endpoint connection.
Definition: btl_elan_endpoint.h:33
bool warn_default_gid_prefix
Whether we want a warning if non default GID prefix is not configured on multiport setup...
Definition: btl_openib.h:266
int ib_free_list_max
maximum size of free lists
Definition: btl_openib.h:170
BTL component descriptor.
Definition: btl.h:411
Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana University Research and Techno...
Base object.
Definition: opal_object.h:182
Definition: opal_convertor.h:90
uint8_t port_num
ID of the PORT.
Definition: btl_openib.h:448
size_t eager_rdma_frag_size
length of eager frag
Definition: btl_openib.h:459
opal_mutex_t ib_lock
module level lock
Definition: btl_openib.h:457
uint32_t buffer_alignment
Preferred communication buffer alignment in Bytes (must be power of two)
Definition: btl_openib.h:230
Definition: opal_list.h:147
opal_mutex_t ib_lock
lock for accessing module state
Definition: btl_openib.h:185
struct mca_btl_base_endpoint_t ** eager_rdma_buffers
frags for control massages
Definition: btl_openib.h:381
Definition: btl_openib.h:391
char ** if_list
Dummy argv-style list; a copy of names from the if_[in|ex]clude list that we use for error checking (...
Definition: btl_openib.h:273
int ib_max_btls
maximum number of devices available to openib component
Definition: btl_openib.h:155
uint32_t max_hw_msg_size
Maximum message size for RDMA protocols in Bytes.
Definition: btl_openib.h:202
int32_t eager_rdma_threshold
After this number of msg, use RDMA for short messages, always.
Definition: btl_openib.h:223
uint8_t src_path_bits
offset from base lid (for LMC)
Definition: btl_openib.h:453
Definition: threads.h:46
size_t eager_limit
Eager send limit of first fragment, in Bytes.
Definition: btl_openib.h:200
uint16_t lid
lid that is actually used (for LMC)
Definition: btl_openib.h:451
uint64_t subnet_id
The subnet ID of this port.
Definition: btl_openib.h:311
opal_event_t ib_recv_event
event structure for recvs
Definition: btl_openib.h:182
int ib_free_list_inc
number of elements to alloc when growing free lists
Definition: btl_openib.h:173
opal_pointer_array_t * endpoints
< number of btls using this device
Definition: btl_openib.h:366
uint8_t num_pp_qps
number of pp qp's
Definition: btl_openib.h:191
BTL module interface functions and attributes.
Definition: btl.h:786
bool warn_nonexistent_if
Whether we want a warning if the user specifies a non-existent device and/or port via btl_openib_if_[...
Definition: btl_openib.h:269
Definition: btl_openib.h:109
int32_t rd_curr_num
The number of receive buffers that can be post in the current time.
Definition: btl_openib.h:404
uint16_t apm_lid
APM LID for this port.
Definition: btl_openib.h:315
int32_t rd_low_local
We post additional WQEs only if a number of WQEs (in specific SRQ) is less of this value...
Definition: btl_openib.h:407
ompi_btl_openib_connect_base_module_t ** cpcs
Array of CPCs on this port.
Definition: btl_openib.h:442
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236
uint32_t ib_cq_size[2]
Max outstanding CQE on the CQ.
Definition: btl_openib.h:206
uint8_t mtu
The MTU used by this port.
Definition: btl_openib.h:317
uint32_t vendor_id
vendor id define device type and tuning
Definition: btl_openib.h:319
mpool module descriptor.
Definition: mpool.h:174