OpenMPI  0.1.1
bml.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2009 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2006 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
13  * $COPYRIGHT$
14  *
15  * Additional copyrights may follow
16  *
17  * $HEADER$
18  */
19 /**
20  * @file
21  *
22  * BML Management Layer (BML)
23  *
24  */
25 
26 #ifndef MCA_BML_H
27 #define MCA_BML_H
28 
29 #include "ompi_config.h"
30 #include "opal/mca/mca.h"
31 #include "opal/datatype/opal_convertor.h"
32 #include "opal/mca/crs/crs.h"
33 #include "opal/mca/crs/base/base.h"
34 #include "opal/util/opal_sos.h"
35 
36 #include "ompi/mca/btl/btl.h"
37 
38 #include "ompi/mca/bml/base/bml_base_btl.h"
39 #include "ompi/types.h"
40 
41 #include "ompi/constants.h"
42 
43 #define OPAL_ENABLE_DEBUG_RELIABILITY 0
44 
45 /*
46  * BML types
47  */
48 
49 struct ompi_proc_t;
53 
54 /*
55  * Cached set of information for each btl
56  */
57 
59  uint32_t btl_flags; /**< support for put/get? */
60  float btl_weight; /**< BTL weight for scheduling */
61  struct mca_btl_base_module_t *btl; /**< BTL module */
62  struct mca_btl_base_endpoint_t* btl_endpoint; /**< BTL addressing info */
63 };
65 
66 
67 
68 /**
69  * A dynamically growable array of mca_bml_base_btl_t instances.
70  * Maintains an index into the array that is used for round-robin
71  * scheduling across contents.
72  */
74  opal_object_t super;
75  size_t arr_size; /**< number available */
76  size_t arr_reserve; /**< size of allocated btl_proc array */
77  size_t arr_index; /**< last used index*/
78  mca_bml_base_btl_t* bml_btls; /**< array of bml btl's */
79 };
81 
83 
84 
85 /**
86  * If required, reallocate (grow) the array to the indicate size.
87  *
88  * @param array (IN)
89  * @param size (IN)
90  */
92 {
93  return array->arr_size;
94 }
95 
96 /**
97  * Grow the array if required, and set the size.
98  *
99  * @param array (IN)
100  * @param size (IN)
101  */
102 static inline void mca_bml_base_btl_array_set_size(mca_bml_base_btl_array_t* array, size_t size)
103 {
104  if(array->arr_size > array->arr_reserve)
105  mca_bml_base_btl_array_reserve(array, size);
106  array->arr_size = size;
107 }
108 
109 /**
110  * Grow the array size by one and return the item at that index.
111  *
112  * @param array (IN)
113  */
115 {
116 #if OPAL_ENABLE_DEBUG
117  if(array->arr_size >= array->arr_reserve) {
118  opal_output(0, "mca_bml_base_btl_array_insert: invalid array index %lu >= %lu",
119  (unsigned long)array->arr_size, (unsigned long)array->arr_reserve);
120  return 0;
121  }
122 #endif
123  return &array->bml_btls[array->arr_size++];
124 }
125 
126 /**
127  * Remove a btl from a bml_btl
128  *
129  * @param array (IN)
130  * @param btl (IN)
131  */
133  struct mca_btl_base_module_t* btl )
134 {
135  size_t i = 0;
136  /* find the btl */
137  for( i = 0; i < array->arr_size; i++ ) {
138  if( array->bml_btls[i].btl == btl ) {
139  /* make sure not to go out of bounds */
140  for( ; i < array->arr_size-1; i++ ) {
141  /* move all btl's back by 1, so the found
142  btl is "removed" */
143  array->bml_btls[i] = array->bml_btls[(i+1)];
144  }
145  array->arr_size--;
146  array->arr_index = 0;
147  return true;
148  }
149  }
150  return false;
151 }
152 
153 
154 /**
155  * Return an array item at the specified index.
156  *
157  * @param array (IN)
158  * @param item_index (IN)
159  */
161 {
162 #if OPAL_ENABLE_DEBUG
163  if(item_index >= array->arr_size) {
164  opal_output(0, "mca_bml_base_btl_array_get_index: invalid array index %lu >= %lu",
165  (unsigned long)item_index, (unsigned long)array->arr_size);
166  return 0;
167  }
168 #endif
169  return &array->bml_btls[item_index];
170 }
171 
172 /**
173  * Return the next LRU index in the array.
174  *
175  * @param array (IN)
176  *
177  * @param index (OUT)
178  */
180 {
181 #if OPAL_ENABLE_DEBUG
182  if(array->arr_size == 0) {
183  opal_output(0, "mca_bml_base_btl_array_get_next: invalid array size");
184  return 0;
185  }
186 #endif
187  if( 1 == array->arr_size ) {
188  return &array->bml_btls[0]; /* force the return to avoid a jump */
189  } else {
190  size_t current_position = array->arr_index; /* force to always start from zero */
191  if( (current_position + 1) == array->arr_size ) {
192  array->arr_index = 0; /* next time serve from the beginning */
193  } else {
194  array->arr_index = current_position + 1; /* continue */
195  }
196  return &array->bml_btls[current_position];
197  }
198 }
199 
200 /**
201  * Locate an element in the array
202  *
203  * @param array (IN)
204  * @param index (IN)
205  */
208 {
209  size_t i=0;
210  for(i=0; i<array->arr_size; i++) {
211  if(array->bml_btls[i].btl == btl) {
212  return &array->bml_btls[i];
213  }
214  }
215  return NULL;
216 }
217 
218 /**
219  * Structure associated w/ ompi_proc_t that contains the set
220  * of BTLs used to reach a destination
221  */
223  opal_list_item_t super; /**< base_endpoint is a list item */
224  struct ompi_proc_t* btl_proc; /**< backpointer to target ompi_proc_t */
225  size_t btl_pipeline_send_length; /**< max of pipeline send_length of available BTLs */
226  size_t btl_send_limit; /**< max of min rdma pipeline for available rdma btls */
227  size_t btl_max_send_size; /**< min of max send size for available send btls */
228  mca_bml_base_btl_array_t btl_eager; /**< array of btls to use for first fragments */
229  mca_bml_base_btl_array_t btl_send; /**< array of btls to use for remaining fragments */
230  mca_bml_base_btl_array_t btl_rdma; /**< array of btls that support (prefer) rdma */
231  size_t btl_rdma_index; /**< index of last used BTL for RDMA */
232  uint32_t btl_flags_or; /**< the bitwise OR of the btl flags */
233 };
235 
236 
238 
239 static inline void mca_bml_base_alloc( mca_bml_base_btl_t* bml_btl,
241  uint8_t order, size_t size, uint32_t flags )
242 {
243  mca_btl_base_module_t* btl = bml_btl->btl;
244  *des = btl->btl_alloc(btl, bml_btl->btl_endpoint, order, size, flags);
245 }
246 
247 static inline void mca_bml_base_free( mca_bml_base_btl_t* bml_btl,
249 {
250  mca_btl_base_module_t* btl = bml_btl->btl;
251 
252  btl->btl_free( btl, des );
253  /* The previous function is supposed to release the des object
254  * so we should not touch it anymore.
255  */
256 }
257 
258 #if OPAL_ENABLE_DEBUG_RELIABILITY
259 
260 int mca_bml_base_send( mca_bml_base_btl_t* bml_btl,
262  mca_btl_base_tag_t tag );
263 
264 
265 #else
266 
267 static inline int mca_bml_base_send( mca_bml_base_btl_t* bml_btl,
269  mca_btl_base_tag_t tag )
270 {
271  int rc;
272  mca_btl_base_module_t* btl = bml_btl->btl;
273 
274  des->des_context = (void*) bml_btl;
275  rc = btl->btl_send(btl, bml_btl->btl_endpoint, des, tag);
276  if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_RESOURCE_BUSY)
277  rc = OMPI_SUCCESS;
278 
279  return rc;
280 }
281 
282 #endif
283 
284 static inline int mca_bml_base_send_status( mca_bml_base_btl_t* bml_btl,
286  mca_btl_base_tag_t tag )
287 {
288  mca_btl_base_module_t* btl = bml_btl->btl;
289 
290  des->des_context = (void*) bml_btl;
291  return btl->btl_send(btl, bml_btl->btl_endpoint, des, tag);
292 }
293 
294 static inline int mca_bml_base_sendi( mca_bml_base_btl_t* bml_btl,
295  struct opal_convertor_t* convertor,
296  void* header,
297  size_t header_size,
298  size_t payload_size,
299  uint8_t order,
300  uint32_t flags,
301  mca_btl_base_tag_t tag,
302  mca_btl_base_descriptor_t** descriptor )
303 {
304  mca_btl_base_module_t* btl = bml_btl->btl;
305  return btl->btl_sendi(btl, bml_btl->btl_endpoint,
306  convertor, header, header_size,
307  payload_size, order, flags, tag, descriptor);
308 }
309 
310 static inline int mca_bml_base_put( mca_bml_base_btl_t* bml_btl,
312 {
313  mca_btl_base_module_t* btl = bml_btl->btl;
314 
315  des->des_context = (void*) bml_btl;
316  return btl->btl_put( btl, bml_btl->btl_endpoint, des );
317 }
318 
319 static inline int mca_bml_base_get( mca_bml_base_btl_t* bml_btl,
321 {
322  mca_btl_base_module_t* btl = bml_btl->btl;
323 
324  des->des_context = (void*) bml_btl;
325  return btl->btl_get( btl, bml_btl->btl_endpoint, des );
326 }
327 
328 
329 static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl,
331  struct opal_convertor_t* conv,
332  uint8_t order,
333  size_t reserve,
334  size_t *size,
335  uint32_t flags,
337 {
338  mca_btl_base_module_t* btl = bml_btl->btl;
339 
340  *des = btl->btl_prepare_src( btl, bml_btl->btl_endpoint, reg, conv,
341  order, reserve, size, flags );
342  if( OPAL_LIKELY((*des) != NULL) ) {
343  (*des)->des_context = (void*) bml_btl;
344  }
345 }
346 
347 static inline void mca_bml_base_prepare_dst(mca_bml_base_btl_t* bml_btl,
349  struct opal_convertor_t* conv,
350  uint8_t order,
351  size_t reserve,
352  size_t *size,
353  uint32_t flags,
355 {
356  mca_btl_base_module_t* btl = bml_btl->btl;
357 
358  *des = btl->btl_prepare_dst( btl, bml_btl->btl_endpoint, reg, conv,
359  order, reserve, size, flags );
360  if( OPAL_LIKELY((*des) != NULL) ) {
361  (*des)->des_context = (void*) bml_btl;
362  }
363 }
364 
365 /*
366  * BML component interface functions and datatype.
367  */
368 
/**
 * MCA->BML Initializes the BML component and creates specific BML
 * module(s).
 *
 * @param priority (OUT) Integer the component reports through this
 * pointer. NOTE(review): presumably the component's selection
 * priority -- confirm against the BML base selection code.
 *
 * @param enable_progress_threads (IN) Whether this component is
 * allowed to run a hidden/progress thread or not.
 *
 * @param enable_mpi_threads (IN) Whether support for multiple MPI
 * threads is enabled or not (i.e., MPI_THREAD_MULTIPLE), which
 * indicates whether multiple threads may invoke this component
 * simultaneously or not.
 *
 * @return Pointer to a BML module, or NULL if the transport
 * is not available.
 *
 * During component initialization, the BML component should discover
 * the physical devices that are available for the given transport,
 * and create a BML module to represent each device. Any addressing
 * information required by peers to reach the device should be published
 * during this function via the mca_base_modex_send() interface.
 */
typedef struct mca_bml_base_module_t*
(*mca_bml_base_component_init_fn_t)(int* priority,
                                    bool enable_progress_threads,
                                    bool enable_mpi_threads);
400 
401 /**
402  * BML component descriptor. Contains component version information
403  * and component open/close/init functions.
404  */
405 
407  mca_base_component_t bml_version;
408  mca_base_component_data_t bml_data;
410 };
413 
414 
415 /*
416  * BML module interface functions and datatype.
417  */
418 
/**
 * MCA->BML Clean up any resources held by BML module
 * before the module is unloaded.
 *
 * The function takes no arguments.
 *
 * Prior to unloading a BML module, the MCA framework will call
 * the BML finalize method of the module. Any resources held by
 * the BML should be released and if required the memory corresponding
 * to the BML module freed.
 *
 * @return Integer status. NOTE(review): presumably OMPI_SUCCESS or an
 * error code -- confirm against the callers.
 */
typedef int (*mca_bml_base_module_finalize_fn_t)(void);
432 
433 /**
434  * PML->BML notification of change in the process list.
435  *
436  * @param nprocs (IN) Number of processes
437  * @param procs (IN) Set of processes
438  * @param reachable (OUT) Bitmask indicating set of peer processes that are reachable by this BML.
439  * @return OMPI_SUCCESS or error status on failure.
440  *
441  * The mca_bml_base_module_add_procs_fn_t() is called by the PML to
442  * determine the set of BMLs that should be used to reach each process.
443  * Any addressing information exported by the peer via the mca_base_modex_send()
444  * function should be available during this call via the corresponding
445  * mca_base_modex_recv() function. The BML may utilize this information to
446  * determine reachability of each peer process.
447  *
448  * For each process that is reachable by the BML, the bit corresponding to the index
449  * into the proc array (nprocs) should be set in the reachable bitmask. The PML
450  * provides the BML the option to return a pointer to a data structure defined
451  * by the BML that is returned to the BML on subsequent calls to the BML data
452  * transfer functions (e.g bml_send). This may be used by the BML to cache any addressing
453  * or connection information (e.g. TCP socket, IP queue pair).
454  *
455  * \note This function will return OMPI_ERR_UNREACH if one or more
456  * processes can not be reached by the currently active BTLs. This is
457  * not a fatal error, and the calling layer is free to continue using
458  * the BML interface.
459  */
461  size_t nprocs,
462  struct ompi_proc_t** procs,
463  struct opal_bitmap_t* reachable
464  );
465 
466 /**
467  * Notification of change to the process list.
468  *
469  * @param nprocs (IN) Number of processes
470  * @param proc (IN) Set of processes
471  * @return Status indicating if cleanup was successful
472  *
473  * When the process list changes, the PML notifies the BML of the
474  * change, to provide the opportunity to cleanup or release any
475  * resources associated with the peer.
476  */
478  size_t nprocs,
479  struct ompi_proc_t** procs
480  );
481 
482 /**
483  * Notification of change to the btl list.
484  *
485  * @param bml (IN) BTL module
486  * @return Status indicating if cleanup was successful
487  *
488  * On recovery of a btl, add it to the set of forwarding
489  * entries used by the BML.
490  */
492 
493 /**
494  * Notification of change to the btl list.
495  *
496  * @param bml (IN) BTL module
497  * @return Status indicating if cleanup was successful
498  *
499  * On failure of a btl, remove it from the set of forwarding
500  * entries used by the BML.
501  */
503 
504 /**
505  * Notification of change to the btl list.
506  *
507  * @param bml (IN) BTL module
508  * @return Status indicating if cleanup was successful
509  *
510  * On failure of a btl, remove it from the set of forwarding
511  * entries used by the BML.
512  */
514  struct ompi_proc_t*,
515  struct mca_btl_base_module_t* );
516 
517 /**
518  * Register a callback function that is called on receipt
519  * of a fragment.
520  *
521  * @param bml (IN) BML module
522  * @return Status indicating if cleanup was successful
523  *
524  * When the process list changes, the PML notifies the BML of the
525  * change, to provide the opportunity to cleanup or release any
526  * resources associated with the peer.
527  */
529  mca_btl_base_tag_t tag,
531  void* cbdata
532  );
533 /**
534  * Register a callback function that is called on error.
535  *
536  * @param bml (IN) BML module
537  * @return Status indicating if cleanup was successful
538  *
539  */
542 );
543 
/**
 * Fault Tolerance Event Notification Function.
 *
 * @param status Checkpoint status
 * @return OMPI_SUCCESS or failure status
 */
typedef int (*mca_bml_base_module_ft_event_fn_t)(int status);
550 
551 
552 /**
553  * BML module interface functions and attributes.
554  */
556  /* BML common attributes */
557  mca_bml_base_component_t* bml_component; /**< pointer back to the BML component structure */
558 
559  /* BML function table */
564  mca_bml_base_module_del_proc_btl_fn_t bml_del_proc_btl;
567 
569 
571 };
573 
/*
 * Macro for use in modules that are of type bml.
 *
 * Expands to MCA_BASE_VERSION_2_0_0 followed by the string "bml" and the
 * integers 2, 0, 0.
 */
#define MCA_BML_BASE_VERSION_2_0_0 MCA_BASE_VERSION_2_0_0, "bml", 2, 0, 0
580 
581 #endif /* MCA_BML_H */
uint32_t btl_flags_or
the bitwise OR of the btl flags
Definition: bml.h:232
float btl_weight
BTL weight for scheduling.
Definition: bml.h:60
size_t arr_reserve
size of allocated btl_proc array
Definition: bml.h:76
Common type for all MCA components.
Definition: mca.h:250
A descriptor that holds the parameters to a send/put/get operation along w/ a callback routine that i...
Definition: btl.h:275
static mca_bml_base_btl_t * mca_bml_base_btl_array_get_index(mca_bml_base_btl_array_t *array, size_t item_index)
Return an array item at the specified index.
Definition: bml.h:160
BML module interface functions and attributes.
Definition: bml.h:555
uint32_t btl_flags
support for put/get?
Definition: bml.h:59
void(* mca_btl_base_module_recv_cb_fn_t)(struct mca_btl_base_module_t *btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t *descriptor, void *cbdata)
Callback function that is called asynchronously on receipt of data by the transport layer...
Definition: btl.h:391
Definition: opal_bitmap.h:53
int(* mca_bml_base_module_finalize_fn_t)(void)
MCA->BML Clean up any resources held by BML module before the module is unloaded. ...
Definition: bml.h:431
size_t btl_pipeline_send_length
max of pipeline send_length of available BTLs
Definition: bml.h:225
size_t arr_index
last used index
Definition: bml.h:77
size_t btl_rdma_index
index of last used BTL for RDMA
Definition: bml.h:231
size_t btl_max_send_size
min of max send size for available send btls
Definition: bml.h:227
void(* mca_btl_base_module_error_cb_fn_t)(struct mca_btl_base_module_t *btl, int32_t flags, struct ompi_proc_t *errproc, char *btlinfo)
Callback function that is called asynchronously on receipt of an error from the transport layer...
Definition: btl.h:538
int(* mca_bml_base_module_del_btl_fn_t)(struct mca_btl_base_module_t *)
Notification of change to the btl list.
Definition: bml.h:502
static void mca_bml_base_btl_array_set_size(mca_bml_base_btl_array_t *array, size_t size)
Grow the array if required, and set the size.
Definition: bml.h:102
int(* mca_bml_base_module_ft_event_fn_t)(int status)
Fault Tolerance Event Notification Function.
Definition: bml.h:549
Definition: mpool.h:44
Remote Open MPI process structure.
Definition: proc.h:56
int(* mca_bml_base_module_add_procs_fn_t)(size_t nprocs, struct ompi_proc_t **procs, struct opal_bitmap_t *reachable)
PML->BML notification of change in the process list.
Definition: bml.h:460
mca_bml_base_btl_array_t btl_rdma
array of btls that support (prefer) rdma
Definition: bml.h:230
struct mca_btl_base_endpoint_t * btl_endpoint
BTL addressing info.
Definition: bml.h:62
Top-level interface for all MCA components.
Definition: opal_list.h:98
struct mca_bml_base_module_t *(* mca_bml_base_component_init_fn_t)(int *priority, bool enable_progress_threads, bool enable_mpi_threads)
MCA->BML Initializes the BML component and creates specific BML module(s).
Definition: bml.h:395
void * des_context
more opaque callback data
Definition: btl.h:283
mca_bml_base_btl_t * bml_btls
array of bml btl's
Definition: bml.h:78
int(* mca_bml_base_module_register_error_cb_fn_t)(mca_btl_base_module_error_cb_fn_t cbfunc)
Register a callback function that is called of error.
Definition: bml.h:540
struct ompi_proc_t * btl_proc
backpointer to target ompi_proc_t
Definition: bml.h:224
OPAL_DECLSPEC void opal_output(int output_id, const char *format,...) __opal_attribute_format__(__printf__
Main function to send output to a stream.
Byte Transfer Layer (BTL)
size_t btl_send_limit
max of min rdma pipeline for available rmda btls
Definition: bml.h:226
BML component descriptor.
Definition: bml.h:406
mca_bml_base_btl_array_t btl_eager
array of btls to use for first fragments
Definition: bml.h:228
static mca_bml_base_btl_t * mca_bml_base_btl_array_insert(mca_bml_base_btl_array_t *array)
Grow the array size by one and return the item at that index.
Definition: bml.h:114
State of ELAN endpoint connection.
Definition: btl_elan_endpoint.h:33
Structure associated w/ ompi_proc_t that contains the set of BTLs used to reach a destination...
Definition: bml.h:222
Base object.
Definition: opal_object.h:182
Definition: opal_convertor.h:90
mca_bml_base_component_t * bml_component
pointer back to the BML component structure
Definition: bml.h:557
Definition: bml.h:58
Meta data for MCA v2.0.0 components.
Definition: mca.h:309
struct mca_btl_base_module_t * btl
BTL module.
Definition: bml.h:61
mca_bml_base_btl_array_t btl_send
array of btls to use for remaining fragments
Definition: bml.h:229
static mca_bml_base_btl_t * mca_bml_base_btl_array_find(mca_bml_base_btl_array_t *array, struct mca_btl_base_module_t *btl)
Locate an element in the array.
Definition: bml.h:206
size_t arr_size
number available
Definition: bml.h:75
int(* mca_bml_base_module_add_btl_fn_t)(struct mca_btl_base_module_t *)
Notification of change to the btl list.
Definition: bml.h:491
static bool mca_bml_base_btl_array_remove(mca_bml_base_btl_array_t *array, struct mca_btl_base_module_t *btl)
Remove a btl from a bml_btl.
Definition: bml.h:132
static size_t mca_bml_base_btl_array_get_size(mca_bml_base_btl_array_t *array)
If required, reallocate (grow) the array to the indicate size.
Definition: bml.h:91
Checkpoint and Restart Service (CRS) Interface.
BTL module interface functions and attributes.
Definition: btl.h:786
opal_list_item_t super
base_endpoint is a list item
Definition: bml.h:223
int(* mca_bml_base_module_del_proc_btl_fn_t)(struct ompi_proc_t *, struct mca_btl_base_module_t *)
Notification of change to the btl list.
Definition: bml.h:513
A dynamically growable array of mca_bml_base_btl_t instances.
Definition: bml.h:73
int(* mca_bml_base_module_register_fn_t)(mca_btl_base_tag_t tag, mca_btl_base_module_recv_cb_fn_t cbfunc, void *cbdata)
Register a callback function that is called on receipt of a fragment.
Definition: bml.h:528
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236
int(* mca_bml_base_module_del_procs_fn_t)(size_t nprocs, struct ompi_proc_t **procs)
Notification of change to the process list.
Definition: bml.h:477
static mca_bml_base_btl_t * mca_bml_base_btl_array_get_next(mca_bml_base_btl_array_t *array)
Return the next LRU index in the array.
Definition: bml.h:179
Definition: mpool_fake.h:38