OpenMPI  0.1.1
btl_ofud.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2011 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2006 Sandia National Laboratories. All rights
13  * reserved.
14  * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
15  * $COPYRIGHT$
16  *
17  * Additional copyrights may follow
18  *
19  * $HEADER$
20  */
21 /**
22  * @file
23  */
24 #ifndef MCA_BTL_UD_H
25 #define MCA_BTL_UD_H
26 
27 /* Number of QPs to stripe sends over - keep this a power of 2 */
28 /* AWF - This is intentionally NOT an MCA parameter so that I can do fast
29  modular arithmetic with it. */
30 #define MCA_BTL_UD_NUM_QP 4
31 
32 #include "ompi_config.h"
33 #include <sys/types.h>
34 #include <infiniband/verbs.h>
35 
36 /* Open MPI includes */
38 #include "ompi/class/ompi_free_list.h"
39 #include "ompi/mca/btl/btl.h"
40 #include "ompi/mca/btl/base/btl_base_error.h"
41 #include "ompi/mca/btl/base/base.h"
42 #include "ompi/mca/mpool/mpool.h"
43 
44 /* TODO - If I want this to go away, addr_t has to come over here */
45 #include "btl_ofud_endpoint.h"
46 
47 BEGIN_C_DECLS
48 
49 
50 /**
51  * UD Infiniband (IB) BTL component.
52  */
53 
55  mca_btl_base_component_2_0_0_t super; /**< base BTL component */
56 
57  uint32_t max_btls; /**< Maximum number of BTL modules */
58  uint32_t num_btls; /**< Number of available/initialized BTL modules */
59 
60  char* if_include;
61  char** if_include_list;
62  char* if_exclude;
63  char** if_exclude_list;
64  char** if_list;
65 
66  struct mca_btl_ud_module_t* ud_btls; /**< array of available BTLs */
67 
68  opal_list_t ud_procs; /**< list of ib proc structures */
69  opal_mutex_t ud_lock; /**< lock for accessing component state */
70 
71  char* ud_mpool_name; /**< name of memory pool */
72 
73  int32_t sd_num; /**< max send descriptors to post per BTL */
74 
75  int32_t rd_num; /**< number of receive descriptors per BTL */
76 #if 0
77  int32_t rd_num_init; /**< initial recv descriptors to post per BTL */
78  int32_t rd_num_max;
79  int32_t rd_num_inc;
80 #endif
81 
82  uint32_t ib_pkey_ix;
83  uint32_t ib_qkey;
84  uint32_t ib_service_level;
85  uint32_t ib_src_path_bits;
86 
88 
89 OMPI_MODULE_DECLSPEC extern mca_btl_ud_component_t mca_btl_ofud_component;
90 
92 
93 
94 /**
95  * Profiling variables
96  */
97 
98 #if OPAL_ENABLE_DEBUG
99 #define MCA_BTL_UD_ENABLE_PROFILE 0
100 #else
101 #define MCA_BTL_UD_ENABLE_PROFILE 0
102 #endif
103 
104 #if MCA_BTL_UD_ENABLE_PROFILE
105 
106 #define MCA_BTL_UD_PROFILE_VAR(var) \
107  opal_timer_t avg_ ## var; \
108  opal_timer_t cnt_ ## var; \
109  opal_timer_t tmp_ ## var
110 
111 struct mca_btl_ud_profile_t {
112  MCA_BTL_UD_PROFILE_VAR(post_send);
113  MCA_BTL_UD_PROFILE_VAR(ibv_post_send);
114 };
115 
116 typedef struct mca_btl_ud_profile_t mca_btl_ud_profile_t;
117 extern mca_btl_ud_profile_t mca_btl_ud_profile;
118 
119 #endif
120 
121 
122 /**
123  * UD/IB BTL Interface
124  */
125 
127  mca_btl_base_module_t super;
128 
129  uint8_t ib_port_num;
130  struct ibv_device* ib_dev;
131  struct ibv_context* ib_dev_context;
132  struct ibv_pd* ib_pd;
133  struct ibv_cq* ib_cq;
134 
135  struct mca_btl_ud_addr_t addr; /**< local address information */
136 
137  ompi_free_list_t send_frags; /**< send fragments & buffers */
138  ompi_free_list_t user_frags; /**< user data fragments */
139  ompi_free_list_t recv_frags; /**< receive fragments & buffers */
140 
141  opal_list_t pending_frags; /**< list of pending send frags */
142 
143  opal_mutex_t ud_lock; /**< lock for pending_frags */
144 
145  size_t ib_inline_max; /**< max size of IB inline send */
146 
147  /*int32_t rd_posted;*/ /* number of receives currently posted */
148 
149  int32_t sd_wqe; /**< available send WQ entries */
150  /* No lock needed, these are incremented/decremented atomically */
151 
152  /*opal_hash_table_t* ep_lookup;*/
153  /* hash table for fast lookup of endpoint structures in recv path */
154  /* lid:qpnum is key, value is mca_btl_ud_endpoint_t* */
155 
156  struct ibv_qp* ib_qp[MCA_BTL_UD_NUM_QP];
157  uint32_t ib_qp_next;
158  /**< Local QPs and stripe counters */
159  /* No lock needed - counters only ever increase by 1 */
160 }; typedef struct mca_btl_ud_module_t mca_btl_ud_module_t;
161 
162 struct mca_btl_ud_frag_t;
163 extern mca_btl_ud_module_t mca_btl_ofud_module;
164 
165 
166 /**
167  * IB component initialization.
168  *
169  * @param num_btl_modules (OUT)
170  * Number of BTLs returned in BTL array.
171  * @param allow_multi_user_threads (IN)
172  * Flag indicating whether BTL supports user threads (TRUE)
173  * @param have_hidden_threads (IN)
174  * Flag indicating whether BTL uses threads (TRUE)
175  *
176  * (1) read interface list from verbs and compare against component parameters
177  * then create a BTL instance for selected interfaces
178  * (2) publish BTL addressing info
179  */
180 
182  int *num_btl_modules,
183  bool allow_multi_user_threads,
184  bool have_hidden_threads);
185 
186 
187 /**
188  * UD/IB component progress.
189  */
190 extern int mca_btl_ud_component_progress(void);
191 
192 
193 /**
194  * Cleanup any resources held by the BTL.
195  *
196  * @param btl BTL instance.
197  * @return OMPI_SUCCESS or error status on failure.
198  */
199 
200 extern int mca_btl_ud_finalize(struct mca_btl_base_module_t* btl);
201 
202 
203 /**
204  * PML->BTL notification of change in the process list.
205  *
206  * @param btl (IN)
207  * @param nprocs (IN) Number of processes
208  * @param procs (IN) Set of processes
209  * @param peers (OUT) Set of (optional) peer addressing info.
210  * @param reachable (IN/OUT) Set of processes that are reachable via this BTL.
211  * @return OMPI_SUCCESS or error status on failure.
212  */
213 
214 extern int mca_btl_ud_add_procs(struct mca_btl_base_module_t* btl,
215  size_t nprocs,
216  struct ompi_proc_t **procs,
217  struct mca_btl_base_endpoint_t** peers,
218  opal_bitmap_t* reachable);
219 
220 
221 /**
222  * PML->BTL notification of change in the process list.
223  *
224  * @param btl (IN) BTL instance
225  * @param nprocs (IN) Number of processes.
226  * @param procs (IN) Set of processes.
227  * @param peers (IN) Set of peer data structures.
228  * @return Status indicating if cleanup was successful
229  *
230  */
231 
232 extern int mca_btl_ud_del_procs(struct mca_btl_base_module_t* btl,
233  size_t nprocs,
234  struct ompi_proc_t **procs,
235  struct mca_btl_base_endpoint_t** peers);
236 
237 
238 /**
239  * PML->BTL Initiate a send of the specified size.
240  *
241  * @param btl (IN)
242  * BTL instance
243  * @param btl_peer (IN)
244  * BTL peer addressing
245  * @param descriptor (IN/OUT)
246  * Descriptor holding the parameters of the send operation
247  * (see mca_btl_base_descriptor_t)
248  * @param tag (IN)
249  * Tag value for this send
250  * @return
251  * OMPI_SUCCESS if the BTL was able to queue one or more
252  * fragments
253  */
254 
255 extern int mca_btl_ud_send(struct mca_btl_base_module_t* btl,
256  struct mca_btl_base_endpoint_t* btl_peer,
257  struct mca_btl_base_descriptor_t* descriptor,
258  mca_btl_base_tag_t tag);
259 
260 
261 /**
262  * Allocate a descriptor.
263  *
264  * @param btl (IN) BTL module
265  * @param size (IN) Requested descriptor size.
266  */
267 
269  struct mca_btl_base_module_t* btl,
270  struct mca_btl_base_endpoint_t* endpoint,
271  uint8_t order,
272  size_t size,
273  uint32_t flags);
274 
275 
276 /**
277  * Return a segment allocated by this BTL.
278  *
279  * @param btl (IN) BTL module
280  * @param descriptor (IN) Allocated descriptor.
281  */
282 
283 extern int mca_btl_ud_free(struct mca_btl_base_module_t* btl,
285 
286 
287 /**
288  * Pack data and return a descriptor that can be
289  * used for send/put.
290  *
291  * @param btl (IN) BTL module
292  * @param peer (IN) BTL peer addressing
293  */
294 
296  struct mca_btl_base_module_t* btl,
297  struct mca_btl_base_endpoint_t* peer,
298  mca_mpool_base_registration_t* registration,
299  struct opal_convertor_t* convertor,
300  uint8_t order,
301  size_t reserve,
302  size_t* size,
303  uint32_t flags);
304 
305 
306 
307 int mca_btl_ud_module_init(mca_btl_ud_module_t* ud_btl);
308 
309 /**
310  * Fault Tolerance Event Notification Function
311  * @param state Checkpoint State
312  * @return OMPI_SUCCESS or failure status
313  */
314 
315 extern int mca_btl_ud_ft_event(int state);
316 
317 
318 
319 /*
320  * Profiling stuff
321  */
322 
323 #if MCA_BTL_UD_ENABLE_PROFILE
324 
325 #define MCA_BTL_UD_START_TIME(var) \
326  ((mca_btl_ud_profile.tmp_ ## var) = opal_timer_base_get_cycles())
327 
328 #define MCA_BTL_UD_END_TIME(var) \
329 do { \
330  mca_btl_ud_profile.avg_ ## var += \
331  opal_timer_base_get_cycles() - mca_btl_ud_profile.tmp_ ## var; \
332  mca_btl_ud_profile.cnt_ ## var++; \
333 } while(0)
334 
335 #define MCA_BTL_UD_SHOW_TIME(var) \
336  OPAL_OUTPUT((0, " " #var " avg %lu cnt %lu", \
337  (mca_btl_ud_profile.avg_ ## var) / (mca_btl_ud_profile.cnt_ ## var), \
338  mca_btl_ud_profile.cnt_ ## var));
339 
340 #else
341 #define MCA_BTL_UD_START_TIME(var)
342 #define MCA_BTL_UD_END_TIME(var)
343 #define MCA_BTL_UD_SHOW_TIME(var)
344 #endif
345 
346 END_C_DECLS
347 #endif
opal_list_t ud_procs
list of ib proc structures
Definition: btl_ofud.h:68
int32_t sd_wqe
available send WQ entries
Definition: btl_ofud.h:149
int mca_btl_ud_ft_event(int state)
Fault Tolerance Event Notification Function.
Definition: btl_ofud.c:689
A descriptor that holds the parameters to a send/put/get operation along w/ a callback routine that i...
Definition: btl.h:275
mca_btl_base_component_2_0_0_t super
base BTL component
Definition: btl_ofud.h:55
Definition: btl_ofud_endpoint.h:34
ompi_free_list_t send_frags
send fragments & buffers
Definition: btl_ofud.h:137
opal_list_t pending_frags
list of pending send frags
Definition: btl_ofud.h:141
Definition: opal_bitmap.h:53
Definition: base.h:44
uint32_t ib_qp_next
Local QPs and stripe counters.
Definition: btl_ofud.h:157
size_t ib_inline_max
max size of IB inline send
Definition: btl_ofud.h:145
int mca_btl_ud_add_procs(struct mca_btl_base_module_t *btl, size_t nprocs, struct ompi_proc_t **procs, struct mca_btl_base_endpoint_t **peers, opal_bitmap_t *reachable)
PML->BTL notification of change in the process list.
Definition: btl_ofud.c:80
opal_mutex_t ud_lock
lock for accessing component state
Definition: btl_ofud.h:69
mca_btl_base_descriptor_t * mca_btl_ud_alloc(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, uint8_t order, size_t size, uint32_t flags)
Allocate a descriptor.
Definition: btl_ofud.c:202
UD/IB BTL Interface.
Definition: btl_ofud.h:126
Definition: mutex_unix.h:53
struct mca_btl_ud_addr_t addr
local address information
Definition: btl_ofud.h:135
int mca_btl_ud_send(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *btl_peer, struct mca_btl_base_descriptor_t *descriptor, mca_btl_base_tag_t tag)
PML->BTL Initiate a send of the specified size.
Definition: btl_ofud.c:394
int32_t sd_num
max send descriptors to post per BTL
Definition: btl_ofud.h:73
struct mca_btl_ud_module_t * ud_btls
array of available BTLs
Definition: btl_ofud.h:66
Definition: mpool.h:44
Remote Open MPI process structure.
Definition: proc.h:56
ompi_free_list_t recv_frags
receive fragments & buffers
Definition: btl_ofud.h:139
int mca_btl_ud_del_procs(struct mca_btl_base_module_t *btl, size_t nprocs, struct ompi_proc_t **procs, struct mca_btl_base_endpoint_t **peers)
PML->BTL notification of change in the process list.
Definition: btl_ofud.c:166
struct ibv_qp * ib_qp[MCA_BTL_UD_NUM_QP]
Local QPs over which sends are striped
Definition: btl_ofud.h:156
uint32_t num_btls
Number of available/initialized BTL modules.
Definition: btl_ofud.h:58
mca_btl_base_module_t ** mca_btl_ud_component_init(int *num_btl_modules, bool allow_multi_user_threads, bool have_hidden_threads)
IB component initialization.
Definition: btl_ofud_component.c:390
Byte Transfer Layer (BTL)
Definition: ompi_free_list.h:39
A hash table that may be indexed with either fixed length (e.g. uint32_t/uint64_t) or arbitrary size binary key values.
opal_mutex_t ud_lock
lock for pending_frags
Definition: btl_ofud.h:143
State of ELAN endpoint connection.
Definition: btl_elan_endpoint.h:33
BTL component descriptor.
Definition: btl.h:411
Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana University Research and Techno...
Definition: opal_convertor.h:90
IB send fragment derived type.
Definition: btl_ofud_frag.h:73
ompi_free_list_t user_frags
user data fragments
Definition: btl_ofud.h:138
char * ud_mpool_name
name of memory pool
Definition: btl_ofud.h:71
Definition: opal_list.h:147
uint32_t max_btls
Maximum number of BTL modules.
Definition: btl_ofud.h:57
int32_t rd_num
number of receive descriptors per BTL
Definition: btl_ofud.h:75
int mca_btl_ud_free(struct mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des)
Return a segment allocated by this BTL.
Definition: btl_ofud.c:233
int mca_btl_ud_component_progress(void)
UD/IB component progress.
Definition: btl_ofud_component.c:626
BTL module interface functions and attributes.
Definition: btl.h:786
int mca_btl_ud_finalize(struct mca_btl_base_module_t *btl)
Cleanup any resources held by the BTL.
Definition: btl_ofud.c:369
mca_btl_base_descriptor_t * mca_btl_ud_prepare_src(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *peer, mca_mpool_base_registration_t *registration, struct opal_convertor_t *convertor, uint8_t order, size_t reserve, size_t *size, uint32_t flags)
Pack data and return a descriptor that can be used for send/put.
Definition: btl_ofud.c:277
UD Infiniband (IB) BTL component.
Definition: btl_ofud.h:54