OpenMPI  0.1.1
btl_openib_frag.h
1 /*
2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2006 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2009 IBM Corporation. All rights reserved.
13  * Copyright (c) 2006-2009 Los Alamos National Security, LLC. All rights
14  * reserved.
15  * Copyright (c) 2006-2007 Voltaire All rights reserved.
16  * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
17  * $COPYRIGHT$
18  *
19  * Additional copyrights may follow
20  *
21  * $HEADER$
22  */
23 
24 #ifndef MCA_BTL_IB_FRAG_H
25 #define MCA_BTL_IB_FRAG_H
26 
27 #include "ompi_config.h"
28 #include "opal/align.h"
29 
30 #include <infiniband/verbs.h>
31 #include "ompi/mca/btl/btl.h"
32 
33 BEGIN_C_DECLS
34 
36 
38  mca_btl_base_tag_t tag;
39  uint8_t cm_seen;
40  uint16_t credits;
41 #if OMPI_OPENIB_PAD_HDR
42  uint8_t padding[4];
43 #endif
44 };
/* Bit 15 of a credits value marks it as carrying RDMA credits. */
#define BTL_OPENIB_RDMA_CREDITS_FLAG    (1 << 15)
/* Non-zero when the RDMA-credits flag bit is set in I. */
#define BTL_OPENIB_IS_RDMA_CREDITS(I)   ((I) & BTL_OPENIB_RDMA_CREDITS_FLAG)
/* I with the RDMA-credits flag bit masked off. */
#define BTL_OPENIB_CREDITS(I)           ((I) & ~BTL_OPENIB_RDMA_CREDITS_FLAG)
49 
/* Convert the credits field of header h to network byte order, in place.
 * No other field of h is touched. */
#define BTL_OPENIB_HEADER_HTON(h)                   \
    do {                                            \
        (h).credits = htons((h).credits);           \
    } while (0)

/* Convert the credits field of header h back to host byte order, in place. */
#define BTL_OPENIB_HEADER_NTOH(h)                   \
    do {                                            \
        (h).credits = ntohs((h).credits);           \
    } while (0)
59 
61  mca_btl_base_tag_t tag;
62  uint32_t size;
63  uint32_t alloc_size;
64 #if OMPI_OPENIB_PAD_HDR
65  uint8_t padding[4];
66 #endif
68 
/* Convert the size and alloc_size fields of coalesced header h from network
 * to host byte order, in place. */
#define BTL_OPENIB_HEADER_COALESCED_NTOH(h)             \
    do {                                                \
        (h).size       = ntohl((h).size);               \
        (h).alloc_size = ntohl((h).alloc_size);         \
    } while (0)

/* Convert the size and alloc_size fields of coalesced header h from host
 * to network byte order, in place. */
#define BTL_OPENIB_HEADER_COALESCED_HTON(h)             \
    do {                                                \
        (h).size       = htonl((h).size);               \
        (h).alloc_size = htonl((h).alloc_size);         \
    } while (0)
80 
81 #if OMPI_OPENIB_PAD_HDR
82 /* BTL_OPENIB_FTR_PADDING
83  * This macro is used to keep the pointer to openib footers aligned for
84  * systems like SPARC64 that take a big performance hit when addresses
85  * are not aligned (and by default sigbus instead of coercing the type on
86  * an unaligned address).
87  *
88  * We assure alignment of a packet's structures when OMPI_OPENIB_PAD_HDR
89  * is set to 1. When this is the case then several structures are padded
90  * to assure alignment and the mca_btl_openib_footer_t structure itself
91  * will use the BTL_OPENIB_FTR_PADDING macro to shift the location of the
92  * pointer to assure proper alignment after the PML Header and data.
93  * For example sending a 1 byte data packet the memory layout without
94  * footer alignment would look something like the following:
95  *
96  * 0x00 : mca_btl_openib_coalesced_header_t (12 bytes + 4 byte pad)
97  * 0x10 : mca_btl_openib_control_header_t (1 byte + 7 byte pad)
98  * 0x18 : mca_btl_openib_header_t (4 bytes + 4 byte pad)
99  * 0x20 : PML Header and data (16 bytes PML + 1 byte data)
100  * 0x29 : mca_btl_openib_footer_t (4 bytes + 4 byte pad)
101  * 0x31 : end of packet
102  *
103  * By applying the BTL_OPENIB_FTR_PADDING() in the progress_one_device
104  * and post_send routines we adjust the pointer to mca_btl_openib_footer_t
105  * from 0x29 to 0x2C thus correctly aligning the start of the
106  * footer pointer. This adjustment will cause the padding field of
107  * mca_btl_openib_footer_t to overlap with the neighboring memory but since
108  * we never use the padding we do not end up inadvertently overwriting
109  * memory that does not belong to the fragment.
110  */
111 #define BTL_OPENIB_FTR_PADDING(size) \
112  OPAL_ALIGN_PAD_AMOUNT(size, sizeof(uint64_t))
113 
114 /* BTL_OPENIB_ALIGN_COALESCE_HDR
115  * This macro is used in btl_openib.c, while creating a coalesce fragment,
116  * to align the coalesce headers.
117  */
118 #define BTL_OPENIB_ALIGN_COALESCE_HDR(ptr) \
119  OPAL_ALIGN_PTR(ptr, sizeof(uint32_t), unsigned char*)
120 
121 /* BTL_OPENIB_COALESCE_HDR_PADDING
122  * This macro is used in btl_openib_component.c, while parsing an incoming
123  * coalesce fragment, to determine the padding amount used to align the
124  * mca_btl_openib_coalesced_header_t.
125  */
126 #define BTL_OPENIB_COALESCE_HDR_PADDING(ptr) \
127  OPAL_ALIGN_PAD_AMOUNT(ptr, sizeof(uint32_t))
128 #else
/* Fallbacks used when OMPI_OPENIB_PAD_HDR is not enabled: no footer padding,
 * the coalesce header pointer is used unchanged, and no coalesce header
 * padding.  The pointer pass-through is parenthesized so that an argument
 * such as `p + 1` still binds as a single operand at the call site. */
#define BTL_OPENIB_FTR_PADDING(size) 0
#define BTL_OPENIB_ALIGN_COALESCE_HDR(ptr) (ptr)
#define BTL_OPENIB_COALESCE_HDR_PADDING(ptr) 0
132 #endif
133 
135 #if OPAL_ENABLE_DEBUG
136  uint32_t seq;
137 #endif
138  union {
139  uint32_t size;
140  uint8_t buf[4];
141  } u;
142 #if OMPI_OPENIB_PAD_HDR
143 #if OPAL_ENABLE_DEBUG
144  /* this footer needs to be of a 8-byte multiple so by adding the
145  * seq field you throw this off and you cannot just remove the
146  * padding because the padding is needed in order to adjust the alignment
147  * and not overwrite other packets.
148  */
149  uint8_t padding[12];
150 #else
151  uint8_t padding[8];
152 #endif
153 #endif
154 };
156 
/*
 * MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr)
 * When WORDS_BIGENDIAN is not defined, exchange bytes 0 and 2 of ftr.u.buf
 * in place; bytes 1 and 3 are left where they are.  When WORDS_BIGENDIAN is
 * defined the macro expands to nothing.
 */
#ifdef WORDS_BIGENDIAN
#define MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr)
#else
#define MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr)                \
    do {                                                    \
        uint8_t ftr_saved_byte0_ = (ftr).u.buf[0];          \
        (ftr).u.buf[0] = (ftr).u.buf[2];                    \
        (ftr).u.buf[2] = ftr_saved_byte0_;                  \
    } while (0)
#endif

/* The seq field is only converted when OPAL_ENABLE_DEBUG is set; otherwise
 * these expand to nothing. */
#if OPAL_ENABLE_DEBUG
#define BTL_OPENIB_FOOTER_SEQ_HTON(h) ((h).seq = htonl((h).seq))
#define BTL_OPENIB_FOOTER_SEQ_NTOH(h) ((h).seq = ntohl((h).seq))
#else
#define BTL_OPENIB_FOOTER_SEQ_HTON(h)
#define BTL_OPENIB_FOOTER_SEQ_NTOH(h)
#endif

/* Footer host-to-network conversion: seq (debug builds), then the size
 * byte exchange. */
#define BTL_OPENIB_FOOTER_HTON(h)                           \
    do {                                                    \
        BTL_OPENIB_FOOTER_SEQ_HTON(h);                      \
        MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h);                 \
    } while (0)

/* Footer network-to-host conversion: seq (debug builds), then the size
 * byte exchange. */
#define BTL_OPENIB_FOOTER_NTOH(h)                           \
    do {                                                    \
        BTL_OPENIB_FOOTER_SEQ_NTOH(h);                      \
        MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h);                 \
    } while (0)
187 
188 #define MCA_BTL_OPENIB_CONTROL_CREDITS 0
189 #define MCA_BTL_OPENIB_CONTROL_RDMA 1
190 #define MCA_BTL_OPENIB_CONTROL_COALESCED 2
191 #define MCA_BTL_OPENIB_CONTROL_CTS 3
192 #if BTL_OPENIB_FAILOVER_ENABLED
193 #define MCA_BTL_OPENIB_CONTROL_EP_BROKEN 4
194 #define MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR 5
195 #endif
196 
198  uint8_t type;
199 #if OMPI_OPENIB_PAD_HDR
200  uint8_t padding[7];
201 #endif
202 };
204 
207  uint32_t rkey;
208  ompi_ptr_t rdma_start;
209 };
211 
/* Convert rkey (32-bit) and rdma_start.lval (64-bit, via hton64) of header h
 * from host to network byte order, in place. */
#define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON(h)            \
    do {                                                        \
        (h).rkey            = htonl((h).rkey);                  \
        (h).rdma_start.lval = hton64((h).rdma_start.lval);      \
    } while (0)

/* Convert rkey (32-bit) and rdma_start.lval (64-bit, via ntoh64) of header h
 * from network to host byte order, in place. */
#define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(h)            \
    do {                                                        \
        (h).rkey            = ntohl((h).rkey);                  \
        (h).rdma_start.lval = ntoh64((h).rdma_start.lval);      \
    } while (0)
223 
224 
227 #if OMPI_OPENIB_PAD_HDR
228  uint8_t padding[1];
229 #endif
230  uint8_t qpn;
231  uint16_t rdma_credits;
232 };
234 
/* Convert the rdma_credits field of header h to network byte order, in
 * place. */
#define BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(h)              \
    do {                                                    \
        (h).rdma_credits = htons((h).rdma_credits);         \
    } while (0)

/* Convert the rdma_credits field of header h back to host byte order, in
 * place. */
#define BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(h)              \
    do {                                                    \
        (h).rdma_credits = ntohs((h).rdma_credits);         \
    } while (0)
244 
245 #if BTL_OPENIB_FAILOVER_ENABLED
246 struct mca_btl_openib_broken_connection_header_t {
248  uint32_t lid;
249  uint64_t subnet_id;
250  uint32_t vpid;
251  uint32_t index; /* for eager RDMA only */
252 };
253 typedef struct mca_btl_openib_broken_connection_header_t mca_btl_openib_broken_connection_header_t;
254 
/* Convert lid, vpid and index (32-bit) and subnet_id (64-bit, via hton64)
 * of header h from host to network byte order, in place. */
#define BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON(h)         \
    do {                                                    \
        (h).lid       = htonl((h).lid);                     \
        (h).subnet_id = hton64((h).subnet_id);              \
        (h).vpid      = htonl((h).vpid);                    \
        (h).index     = htonl((h).index);                   \
    } while (0)

/* Convert lid, vpid and index (32-bit) and subnet_id (64-bit, via ntoh64)
 * of header h from network to host byte order, in place. */
#define BTL_OPENIB_BROKEN_CONNECTION_HEADER_NTOH(h)         \
    do {                                                    \
        (h).lid       = ntohl((h).lid);                     \
        (h).subnet_id = ntoh64((h).subnet_id);              \
        (h).vpid      = ntohl((h).vpid);                    \
        (h).index     = ntohl((h).index);                   \
    } while (0)
270 #endif
/* Kind tag for an openib fragment; it is the value read back through
 * openib_frag_type().  Enumerators are numbered consecutively from 0 in
 * declaration order. */
typedef enum mca_btl_openib_frag_type_t {
    MCA_BTL_OPENIB_FRAG_RECV       = 0,
    MCA_BTL_OPENIB_FRAG_RECV_USER  = 1,
    MCA_BTL_OPENIB_FRAG_SEND       = 2,
    MCA_BTL_OPENIB_FRAG_SEND_USER  = 3,
    MCA_BTL_OPENIB_FRAG_EAGER_RDMA = 4,
    MCA_BTL_OPENIB_FRAG_CONTROL    = 5,
    MCA_BTL_OPENIB_FRAG_COALESCED  = 6
} mca_btl_openib_frag_type_t;
281 
282 #define openib_frag_type(f) (to_base_frag(f)->type)
283 /**
284  * IB fragment derived type.
285  */
286 
287 /* base openib frag */
288 typedef struct mca_btl_openib_frag_t {
290  mca_btl_base_segment_t segment;
291  mca_btl_openib_frag_type_t type;
292  ompi_free_list_t* list;
295 
296 #define to_base_frag(f) ((mca_btl_openib_frag_t*)(f))
297 
298 /* frag used for communication */
300  mca_btl_openib_frag_t super;
301  struct ibv_sge sg_entry;
302  struct mca_btl_openib_reg_t *registration;
303  struct mca_btl_base_endpoint_t *endpoint;
306 
307 #define to_com_frag(f) ((mca_btl_openib_com_frag_t*)(f))
308 
311  struct ibv_send_wr sr_desc;
314 
315 #define to_out_frag(f) ((mca_btl_openib_out_frag_t*)(f))
316 
319 
320 #define to_in_frag(f) ((mca_btl_openib_in_frag_t*)(f))
321 
324  mca_btl_openib_header_t *hdr, *chdr;
326  uint8_t qp_idx;
327  uint32_t coalesced_length;
328  opal_list_t coalesced_frags;
331 
332 #define to_send_frag(f) ((mca_btl_openib_send_frag_t*)(f))
333 
338  struct ibv_recv_wr rd_desc;
339  uint8_t qp_idx;
342 
343 #define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f))
344 
347 
348 #define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f))
349 
352  struct ibv_send_wr sr_desc;
355 
356 #define to_get_frag(f) ((mca_btl_openib_get_frag_t*)(f))
357 
360 
361 #define to_send_control_frag(f) ((mca_btl_openib_send_control_frag_t*)(f))
362 
364  mca_btl_openib_frag_t super;
365  mca_btl_openib_send_frag_t *send_frag;
369 
370 #define to_coalesced_frag(f) ((mca_btl_openib_coalesced_frag_t*)(f))
371 
372 /*
373  * Allocate an IB send descriptor
374  *
375  */
376 
378 alloc_control_frag(mca_btl_openib_module_t *btl)
379 {
380  int rc;
381  ompi_free_list_item_t *item;
382 
383  OMPI_FREE_LIST_WAIT(&btl->device->send_free_control, item, rc);
384 
385  return to_send_control_frag(item);
386 }
387 
388 static inline uint8_t frag_size_to_order(mca_btl_openib_module_t* btl,
389  size_t size)
390 {
391  int qp;
392  for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++)
393  if(mca_btl_openib_component.qp_infos[qp].size >= size)
394  return qp;
395 
396  return MCA_BTL_NO_ORDER;
397 }
398 
399 static inline mca_btl_openib_com_frag_t *alloc_send_user_frag(void)
400 {
401  int rc;
402  ompi_free_list_item_t *item;
403 
404  OMPI_FREE_LIST_GET(&mca_btl_openib_component.send_user_free, item, rc);
405 
406  return to_com_frag(item);
407 }
408 
409 static inline mca_btl_openib_com_frag_t *alloc_recv_user_frag(void)
410 {
411  int rc;
412  ompi_free_list_item_t *item;
413 
414  OMPI_FREE_LIST_GET(&mca_btl_openib_component.recv_user_free, item, rc);
415 
416  return to_com_frag(item);
417 }
418 
419 static inline mca_btl_openib_coalesced_frag_t *alloc_coalesced_frag(void)
420 {
421  int rc;
422  ompi_free_list_item_t *item;
423 
424  OMPI_FREE_LIST_GET(&mca_btl_openib_component.send_free_coalesced, item, rc);
425 
426  return to_coalesced_frag(item);
427 }
428 
/*
 * Hand frag back to the free list recorded in its base fragment's `list`
 * field, using OMPI_FREE_LIST_RETURN.
 *
 * The expansion deliberately does NOT end in a semicolon: the caller
 * supplies it, so the macro behaves as a single statement and can be used
 * as the body of an if/else without breaking the else branch.
 */
#define MCA_BTL_IB_FRAG_RETURN(frag)                                    \
    do {                                                                \
        OMPI_FREE_LIST_RETURN(to_base_frag(frag)->list,                 \
                              (ompi_free_list_item_t*)(frag));          \
    } while (0)
434 
/*
 * Drain `list`: while it is not empty, remove its first item and give that
 * item back through MCA_BTL_IB_FRAG_RETURN.
 *
 * The definition ends on the closing brace with no trailing line
 * continuation, so the line following the macro is never spliced into it.
 * The expansion is a plain while-loop (not do/while(0)), so avoid using it
 * as the unbraced body of an if that has an else.
 */
#define MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(list)                        \
    while (!opal_list_is_empty(list)) {                                 \
        opal_list_item_t *frag_item;                                    \
        frag_item = opal_list_remove_first(list);                       \
        MCA_BTL_IB_FRAG_RETURN(frag_item);                              \
    }
441 
443 
445  uint8_t order;
446  ompi_free_list_t* list;
447 };
449 
450 void mca_btl_openib_frag_init(ompi_free_list_item_t* item, void* ctx);
451 
452 
453 END_C_DECLS
454 #endif
Definition: btl_openib_frag.h:205
Definition: btl_openib.h:470
A descriptor that holds the parameters to a send/put/get operation along w/ a callback routine that i...
Definition: btl.h:275
Definition: btl_openib_frag.h:322
Definition: btl_openib_frag.h:60
Definition: btl_openib_frag.h:334
Definition: types.h:52
IB BTL Interface.
Definition: btl_openib.h:432
uint8_t num_qps
total number of qp's
Definition: btl_openib.h:194
Definition: btl_openib_frag.h:363
ompi_free_list_t send_user_free
free list of frags only; used for pinning user memory
Definition: btl_openib.h:286
Definition: btl_openib_frag.h:37
IB fragment derived type.
Definition: btl_openib_frag.h:288
Byte Transfer Layer (BTL)
Definition: btl_openib_frag.h:444
Definition: ompi_free_list.h:39
Definition: btl_openib_frag.h:350
ompi_free_list_t recv_user_free
frags for coalesced messages
Definition: btl_openib.h:288
State of ELAN endpoint connection.
Definition: btl_elan_endpoint.h:33
Definition: ompi_free_list.h:62
Definition: btl_openib_frag.h:225
Definition: btl_openib_frag.h:309
Definition: opal_list.h:147
Definition: btl_openib_frag.h:197
Definition: btl_openib_frag.h:299
Describes a region/segment of memory that is addressable by an BTL.
Definition: btl.h:236
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236