OpenMPI  0.1.1
coll_sm.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2006 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
13  * $COPYRIGHT$
14  *
15  * Additional copyrights may follow
16  *
17  * $HEADER$
18  */
19 /** @file */
20 
21 #ifndef MCA_COLL_SM_EXPORT_H
22 #define MCA_COLL_SM_EXPORT_H
23 
24 #include "ompi_config.h"
25 
26 #include "mpi.h"
27 #include "opal/mca/mca.h"
28 #include "opal/datatype/opal_convertor.h"
29 #include "orte/types.h"
30 #include "ompi/mca/coll/coll.h"
31 #include "ompi/mca/common/sm/common_sm.h"
32 
33 BEGIN_C_DECLS
34 
/* Attempt to give some sort of progress / fairness if we're blocked
   in an sm collective for a long time: poll the condition up to
   SPIN_CONDITION_MAX times, then call opal_progress() once and start
   polling again.  A "goto" label is used for expediency to exit the
   loops: control continues at exit_label as soon as cond is true.

   cond is evaluated repeatedly, so it must not have side effects.
   exit_label must be unique within the enclosing function, and the
   invocation must be followed by a statement (normally just the
   terminating ';') because the expansion ends with the label itself.

   The loop counter has a macro-specific name so that a cond
   expression mentioning a caller variable called "i" is not silently
   captured by the macro's own counter. */
#define SPIN_CONDITION_MAX 100000
#define SPIN_CONDITION(cond, exit_label)                                  \
    do {                                                                  \
        int spin_condition_iter;                                          \
        if (cond) goto exit_label;                                        \
        for (spin_condition_iter = 0;                                     \
             spin_condition_iter < SPIN_CONDITION_MAX;                    \
             ++spin_condition_iter) {                                     \
            if (cond) { goto exit_label; }                                \
        }                                                                 \
        opal_progress();                                                  \
    } while (1);                                                          \
    exit_label:
48 
49  /**
50  * Structure to hold the sm coll component. First it holds the
51  * base coll component, and then holds a bunch of
52  * sm-coll-component-specific stuff (e.g., current MCA param
53  * values).
54  */
55  typedef struct mca_coll_sm_component_t {
56  /** Base coll component */
58 
59  /** MCA parameter: Priority of this component */
61 
62  /** MCA parameter: Length of a cache line or page (in bytes) */
64 
65  /** MCA parameter: Number of "in use" flags in each
66  communicator's area in the data mpool */
68 
69  /** MCA parameter: Number of segments for each communicator in
70  the data mpool */
72 
73  /** MCA parameter: Fragment size for data */
75 
76  /** MCA parameter: Degree of tree for tree-based collectives */
78 
79  /** MCA parameter: Number of processes to use in the
80  calculation of the "info" MCA parameter */
82 
83  /******* end of MCA params ********/
84 
85  /** How many fragment segments are protected by a single
86  in-use flags. This is solely so that we can only perform
87  the division once and then just use the value without
88  having to re-calculate. */
91 
/**
 * Structure for representing a node in the tree
 */
typedef struct mca_coll_sm_tree_node_t {
    /** Arbitrary ID number, starting from 0 */
    int mcstn_id;
    /** Pointer to parent, or NULL if root */
    struct mca_coll_sm_tree_node_t *mcstn_parent;
    /** Number of children, or 0 if a leaf */
    int mcstn_num_children;
    /** Pointer to an array of children, or NULL if 0 ==
        mcstn_num_children */
    struct mca_coll_sm_tree_node_t **mcstn_children;
} mca_coll_sm_tree_node_t;
106 
/**
 * Simple structure comprising the "in use" flags.  Contains two
 * members: the number of processes that are currently using this
 * set of segments and the operation number of the current
 * operation.
 */
typedef struct mca_coll_sm_in_use_flag_t {
    /** Number of processes currently using this set of
        segments */
    volatile uint32_t mcsiuf_num_procs_using;
    /** Must match data->mcb_count (presumably the communicator's
        operation count; no member named mcb_count is visible in this
        header -- TODO confirm) */
    volatile uint32_t mcsiuf_operation_count;
} mca_coll_sm_in_use_flag_t;
120 
/**
 * Structure containing pointers to various arrays of data in the
 * per-communicator shmem data segment (one of these indexes a
 * single segment in the per-communicator shmem data segment).
 * Nothing is hard-coded because all the array lengths and
 * displacements of the pointers depend on how many processes
 * are in the communicator.
 */
typedef struct mca_coll_sm_data_index_t {
    /** Pointer to beginning of control data */
    uint32_t volatile *mcbmi_control;
    /** Pointer to beginning of message fragment data */
    char *mcbmi_data;
} mca_coll_sm_data_index_t;
135 
136  /**
137  * Structure for the sm coll module to hang off the communicator.
138  * Contains communicator-specific information, including pointers
139  * into the per-communicator shmem data data segment for this
140  * comm's sm collective operations area.
141  */
142  typedef struct mca_coll_sm_comm_t {
143  /* Meta data that we get back from the common mmap allocation
144  function */
145  mca_common_sm_module_t *sm_bootstrap_meta;
146 
147  /** Pointer to my barrier control pages (odd index pages are
148  "in", even index pages are "out") */
150 
151  /** Pointer to my parent's barrier control pages (will be NULL
152  for communicator rank 0; odd index pages are "in", even
153  index pages are "out") */
155 
156  /** Pointers to my childrens' barrier control pages (they're
157  contiguous in memory, so we only point to the base -- the
158  number of children is in my entry in the mcb_tree); will
159  be NULL if this process has no children (odd index pages
160  are "in", even index pages are "out") */
162 
163  /** Number of barriers that we have executed (i.e., which set
164  of barrier buffers to use). */
166 
167  /** "In use" flags indicating which segments are available */
169 
170  /** Array of indexes into the per-communicator shmem data
171  segment for control and data fragment passing (containing
172  pointers to each segments control and data areas). */
174 
175  /** Array of graph nodes representing the tree used for
176  communications */
178 
179  /** Operation number (i.e., which segment number to use) */
182 
183  /** Coll sm module */
184  typedef struct mca_coll_sm_module_t {
185  /** Base module */
187 
188  /* Whether this module has been lazily initialized or not yet */
189  bool enabled;
190 
191  /* Data that hangs off the communicator */
192  mca_coll_sm_comm_t *sm_comm_data;
193 
194  /* Underlying reduce function and module */
195  mca_coll_base_module_reduce_fn_t previous_reduce;
196  mca_coll_base_module_t *previous_reduce_module;
199 
200  /**
201  * Global component instance
202  */
203  OMPI_MODULE_DECLSPEC extern mca_coll_sm_component_t mca_coll_sm_component;
204 
205  /*
206  * coll module functions
207  */
208  int mca_coll_sm_init_query(bool enable_progress_threads,
209  bool enable_mpi_threads);
210 
212  mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority);
213 
214  /* Lazily enable a module (since it involves expensive/slow mmap
215  allocation, etc.) */
216  int ompi_coll_sm_lazy_enable(mca_coll_base_module_t *module,
217  struct ompi_communicator_t *comm);
218 
219  int mca_coll_sm_allgather_intra(void *sbuf, int scount,
220  struct ompi_datatype_t *sdtype,
221  void *rbuf, int rcount,
222  struct ompi_datatype_t *rdtype,
223  struct ompi_communicator_t *comm,
224  mca_coll_base_module_t *module);
225 
226  int mca_coll_sm_allgatherv_intra(void *sbuf, int scount,
227  struct ompi_datatype_t *sdtype,
228  void * rbuf, int *rcounts, int *disps,
229  struct ompi_datatype_t *rdtype,
230  struct ompi_communicator_t *comm,
231  mca_coll_base_module_t *module);
232  int mca_coll_sm_allreduce_intra(void *sbuf, void *rbuf, int count,
233  struct ompi_datatype_t *dtype,
234  struct ompi_op_t *op,
235  struct ompi_communicator_t *comm,
236  mca_coll_base_module_t *module);
237  int mca_coll_sm_alltoall_intra(void *sbuf, int scount,
238  struct ompi_datatype_t *sdtype,
239  void* rbuf, int rcount,
240  struct ompi_datatype_t *rdtype,
241  struct ompi_communicator_t *comm,
242  mca_coll_base_module_t *module);
243  int mca_coll_sm_alltoallv_intra(void *sbuf, int *scounts, int *sdisps,
244  struct ompi_datatype_t *sdtype,
245  void *rbuf, int *rcounts, int *rdisps,
246  struct ompi_datatype_t *rdtype,
247  struct ompi_communicator_t *comm,
248  mca_coll_base_module_t *module);
249  int mca_coll_sm_alltoallw_intra(void *sbuf, int *scounts, int *sdisps,
250  struct ompi_datatype_t **sdtypes,
251  void *rbuf, int *rcounts, int *rdisps,
252  struct ompi_datatype_t **rdtypes,
253  struct ompi_communicator_t *comm,
254  mca_coll_base_module_t *module);
256  mca_coll_base_module_t *module);
257  int mca_coll_sm_bcast_intra(void *buff, int count,
258  struct ompi_datatype_t *datatype,
259  int root,
260  struct ompi_communicator_t *comm,
261  mca_coll_base_module_t *module);
262  int mca_coll_sm_bcast_log_intra(void *buff, int count,
263  struct ompi_datatype_t *datatype,
264  int root,
265  struct ompi_communicator_t *comm,
266  mca_coll_base_module_t *module);
267  int mca_coll_sm_exscan_intra(void *sbuf, void *rbuf, int count,
268  struct ompi_datatype_t *dtype,
269  struct ompi_op_t *op,
270  struct ompi_communicator_t *comm,
271  mca_coll_base_module_t *module);
272  int mca_coll_sm_gather_intra(void *sbuf, int scount,
273  struct ompi_datatype_t *sdtype, void *rbuf,
274  int rcount, struct ompi_datatype_t *rdtype,
275  int root, struct ompi_communicator_t *comm,
276  mca_coll_base_module_t *module);
277  int mca_coll_sm_gatherv_intra(void *sbuf, int scount,
278  struct ompi_datatype_t *sdtype, void *rbuf,
279  int *rcounts, int *disps,
280  struct ompi_datatype_t *rdtype, int root,
281  struct ompi_communicator_t *comm,
282  mca_coll_base_module_t *module);
283  int mca_coll_sm_reduce_intra(void *sbuf, void* rbuf, int count,
284  struct ompi_datatype_t *dtype,
285  struct ompi_op_t *op,
286  int root,
287  struct ompi_communicator_t *comm,
288  mca_coll_base_module_t *module);
289  int mca_coll_sm_reduce_log_intra(void *sbuf, void* rbuf, int count,
290  struct ompi_datatype_t *dtype,
291  struct ompi_op_t *op,
292  int root,
293  struct ompi_communicator_t *comm,
294  mca_coll_base_module_t *module);
295  int mca_coll_sm_reduce_scatter_intra(void *sbuf, void *rbuf,
296  int *rcounts,
297  struct ompi_datatype_t *dtype,
298  struct ompi_op_t *op,
299  struct ompi_communicator_t *comm,
300  mca_coll_base_module_t *module);
301  int mca_coll_sm_scan_intra(void *sbuf, void *rbuf, int count,
302  struct ompi_datatype_t *dtype,
303  struct ompi_op_t *op,
304  struct ompi_communicator_t *comm,
305  mca_coll_base_module_t *module);
306  int mca_coll_sm_scatter_intra(void *sbuf, int scount,
307  struct ompi_datatype_t *sdtype, void *rbuf,
308  int rcount, struct ompi_datatype_t *rdtype,
309  int root, struct ompi_communicator_t *comm,
310  mca_coll_base_module_t *module);
311  int mca_coll_sm_scatterv_intra(void *sbuf, int *scounts, int *disps,
312  struct ompi_datatype_t *sdtype,
313  void* rbuf, int rcount,
314  struct ompi_datatype_t *rdtype, int root,
315  struct ompi_communicator_t *comm,
316  mca_coll_base_module_t *module);
317 
318  int mca_coll_sm_ft_event(int state);
319 
320 /**
321  * Global variables used in the macros (essentially constants, so
322  * these are thread safe)
323  */
324 extern uint32_t mca_coll_sm_one;
325 
326 
/**
 * Point (flag) at in-use flag number (flag_num) of a communicator.
 *
 * The flags start at (data)->mcb_in_use_flags and consecutive flags
 * are mca_coll_sm_component.sm_control_size bytes apart, so the
 * address is computed with byte arithmetic rather than by indexing an
 * array of mca_coll_sm_in_use_flag_t.
 */
#define FLAG_SETUP(flag_num, flag, data)                                  \
    (flag) = (mca_coll_sm_in_use_flag_t *)                                \
        (((char *) (data)->mcb_in_use_flags) +                            \
         (mca_coll_sm_component.sm_control_size * (flag_num)))
334 
/**
 * Spin (via SPIN_CONDITION) until no process is using the segments
 * guarded by (flag), i.e. until its mcsiuf_num_procs_using reads 0.
 * Used by the root.  (label) is the exit label handed to
 * SPIN_CONDITION and must be unique in the calling function.
 */
#define FLAG_WAIT_FOR_IDLE(flag, label)                                   \
    SPIN_CONDITION((flag)->mcsiuf_num_procs_using == 0, label)
340 
/**
 * Macro to wait for a flag to indicate that it's ready for this
 * operation (used by non-root processes to know when FLAG_RETAIN()
 * has been called): spins via SPIN_CONDITION until the flag's
 * mcsiuf_operation_count equals (op).
 *
 * (flag) is parenthesized in the expansion so that any pointer
 * expression (e.g. "flags + 1") can be passed.  (label) is the exit
 * label handed to SPIN_CONDITION and must be unique in the calling
 * function.
 */
#define FLAG_WAIT_FOR_OP(flag, op, label)                                 \
    SPIN_CONDITION((op) == (flag)->mcsiuf_operation_count, label)
348 
/**
 * Macro to set an in-use flag with relevant data to claim it.
 *
 * Stores (num_procs) into mcsiuf_num_procs_using and then (op_count)
 * into mcsiuf_operation_count.  The operation count is written last
 * because FLAG_WAIT_FOR_OP spins on that member.
 *
 * The two stores are wrapped in do { } while (0) so that the macro
 * behaves as a single statement, e.g. as the body of an unbraced
 * "if".
 */
#define FLAG_RETAIN(flag, num_procs, op_count)                            \
    do {                                                                  \
        (flag)->mcsiuf_num_procs_using = (num_procs);                     \
        (flag)->mcsiuf_operation_count = (op_count);                      \
    } while (0)
355 
/**
 * Drop this process's claim on an in-use flag: passes the address of
 * the flag's mcsiuf_num_procs_using counter to opal_atomic_add() with
 * a delta of -1.
 */
#define FLAG_RELEASE(flag)                                                \
    opal_atomic_add(&((flag)->mcsiuf_num_procs_using), -1)
361 
/**
 * Macro to copy a single segment in from a user buffer to a shared
 * segment.
 *
 * Points (iov) at the fragment belonging to (rank) inside
 * (index)->mcbmi_data (fragments are
 * mca_coll_sm_component.sm_fragment_size bytes apart), sets its
 * length to (max_data), and calls opal_convertor_pack() on it.
 * (iov) and (max_data) must be lvalues: both are assigned to or have
 * their address taken.  The return value of opal_convertor_pack() is
 * not checked.
 *
 * The statements are wrapped in do { } while (0) so that the macro
 * behaves as a single statement, e.g. as the body of an unbraced
 * "if".
 */
#define COPY_FRAGMENT_IN(convertor, index, rank, iov, max_data)           \
    do {                                                                  \
        (iov).iov_base =                                                  \
            (index)->mcbmi_data +                                         \
            ((rank) * mca_coll_sm_component.sm_fragment_size);            \
        (iov).iov_len = (max_data);                                       \
        opal_convertor_pack(&(convertor), &(iov), &mca_coll_sm_one,       \
                            &(max_data));                                 \
    } while (0)
373 
/**
 * Macro to copy a single segment out from a shared segment to a user
 * buffer.
 *
 * Points (iov) at the fragment belonging to (src_rank) inside
 * (index)->mcbmi_data (fragments are
 * mca_coll_sm_component.sm_fragment_size bytes apart), sets its
 * length to (max_data), and calls opal_convertor_unpack() on it.
 * (iov) and (max_data) must be lvalues: both are assigned to or have
 * their address taken.  The return value of opal_convertor_unpack()
 * is not checked.
 *
 * The statements are wrapped in do { } while (0) so that the macro
 * behaves as a single statement, e.g. as the body of an unbraced
 * "if".
 */
#define COPY_FRAGMENT_OUT(convertor, src_rank, index, iov, max_data)      \
    do {                                                                  \
        (iov).iov_base = (((char *) (index)->mcbmi_data) +                \
            ((src_rank) * (mca_coll_sm_component.sm_fragment_size)));     \
        (iov).iov_len = (max_data);                                       \
        opal_convertor_unpack(&(convertor), &(iov), &mca_coll_sm_one,     \
                              &(max_data));                               \
    } while (0)
384 
/**
 * Copy (len) bytes from the fragment of (src_rank) to the fragment of
 * (dest_rank) within the same shared segment (index)->mcbmi_data.
 * Fragments are mca_coll_sm_component.sm_fragment_size bytes apart.
 * The copy is done with memcpy(), so the two regions must not
 * overlap.
 */
#define COPY_FRAGMENT_BETWEEN(src_rank, dest_rank, index, len)            \
    memcpy(&(index)->mcbmi_data[(dest_rank) *                             \
                                mca_coll_sm_component.sm_fragment_size],  \
           &(index)->mcbmi_data[(src_rank) *                              \
                                mca_coll_sm_component.sm_fragment_size],  \
           (len))
395 
/**
 * Macro to tell children that a segment is ready (normalize
 * the child's ID based on the shift used to calculate the "me" node
 * in the tree).  Used in fan out operations.
 *
 * For each of the (num_children) entries of (children), stores
 * (value) into the control slot of rank
 * ((children)[i]->mcstn_id + root) % size, where slots are
 * mca_coll_sm_component.sm_control_size bytes apart starting at
 * (index)->mcbmi_control.
 *
 * The expansion relies on three variables of the calling scope that
 * are not macro parameters: "i" (used, and overwritten, as the loop
 * counter), "root" and "size".
 *
 * NOTE(review): the store goes through a size_t pointer, while
 * CHILD_WAIT_FOR_NOTIFY reads and clears the same slot through a
 * uint32_t pointer.  Confirm this width mismatch is intended; it
 * looks like it would matter where size_t is wider than 32 bits and
 * the byte order is big-endian.
 */
#define PARENT_NOTIFY_CHILDREN(children, num_children, index, value)      \
    do {                                                                  \
        for (i = 0; i < (num_children); ++i) {                            \
            *((size_t*)                                                   \
              (((char*) (index)->mcbmi_control) +                         \
               (mca_coll_sm_component.sm_control_size *                   \
                (((children)[i]->mcstn_id + root) % size)))) = (value);   \
        }                                                                 \
    } while (0)
410 
/**
 * Macro for children to wait for parent notification (use real rank).
 * Save the value passed and then reset it when done.  Used in fan out
 * operations.
 *
 * Spins (via SPIN_CONDITION) until the 32-bit control word of (rank)
 * becomes non-zero, copies that word into (value), then writes 0 back
 * to the slot.  Slots are mca_coll_sm_component.sm_control_size bytes
 * apart starting at (index)->mcbmi_control.  (label) is the exit
 * label handed to SPIN_CONDITION and must be unique in the calling
 * function.
 *
 * The local pointer has a macro-specific name so that it cannot
 * capture a caller expression that happens to mention "ptr".
 */
#define CHILD_WAIT_FOR_NOTIFY(rank, index, value, label)                  \
    do {                                                                  \
        uint32_t volatile *cwfn_slot = ((uint32_t volatile *)             \
            (((char*) (index)->mcbmi_control) +                           \
             ((rank) * mca_coll_sm_component.sm_control_size)));          \
        SPIN_CONDITION(0 != *cwfn_slot, label);                           \
        (value) = *cwfn_slot;                                             \
        *cwfn_slot = 0;                                                   \
    } while (0)
425 
/**
 * Fan-in notification from a child to its parent: stores (value) in
 * entry (child_rank) of the size_t array that begins at the parent's
 * control slot.  The parent's slot lies
 * (parent_rank) * mca_coll_sm_component.sm_control_size bytes past
 * (index)->mcbmi_control.
 */
#define CHILD_NOTIFY_PARENT(child_rank, parent_rank, index, value)        \
    *(((size_t volatile *)                                                \
       (((char *) (index)->mcbmi_control) +                               \
        ((parent_rank) * mca_coll_sm_component.sm_control_size))) +       \
      (child_rank)) = (value)
435 
/**
 * Macro for parent to wait for a specific child to tell it that the
 * data is in the child's segment.  Save the value when done.  Used
 * for fan in operations.
 *
 * Spins (via SPIN_CONDITION) until entry (child_rank) of the size_t
 * array at the parent's control slot becomes non-zero, copies it into
 * (value), then writes 0 back.  The parent's slot lies
 * (parent_rank) * mca_coll_sm_component.sm_control_size bytes past
 * (index)->mcbmi_control.  (label) is the exit label handed to
 * SPIN_CONDITION and must be unique in the calling function.
 *
 * All parameters are parenthesized in the expansion, and the local
 * pointer has a macro-specific name, so arbitrary argument
 * expressions expand as written.
 */
#define PARENT_WAIT_FOR_NOTIFY_SPECIFIC(child_rank, parent_rank, index, value, label) \
    do {                                                                  \
        size_t volatile *pwfns_slot = ((size_t volatile *)                \
            (((char*) (index)->mcbmi_control) +                           \
             (mca_coll_sm_component.sm_control_size *                     \
              (parent_rank)))) + (child_rank);                            \
        SPIN_CONDITION(0 != *pwfns_slot, label);                          \
        (value) = *pwfns_slot;                                            \
        *pwfns_slot = 0;                                                  \
    } while (0)
451 
452 END_C_DECLS
453 
454 #endif /* MCA_COLL_SM_EXPORT_H */
char * mcbmi_data
Pointer to beginning of message fragment data.
Definition: coll_sm.h:133
uint32_t mca_coll_sm_one
Global variables used in the macros (essentially constants, so these are thread safe) ...
Definition: coll_sm_module.c:70
struct mca_coll_sm_in_use_flag_t mca_coll_sm_in_use_flag_t
Simple structure comprising the "in use" flags.
int mca_coll_sm_allreduce_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
Shared memory allreduce.
Definition: coll_sm_allreduce.c:34
volatile uint32_t mcsiuf_num_procs_using
Number of processes currently using this set of segments.
Definition: coll_sm.h:116
Collective module interface.
Definition: coll.h:316
Definition: common_sm.h:60
uint32_t volatile * mcbmi_control
Pointer to beginning of control data.
Definition: coll_sm.h:131
Definition: ompi_datatype.h:68
mca_coll_base_component_2_0_0_t super
Base coll component.
Definition: coll_sm.h:57
Structure for representing a node in the tree.
Definition: coll_sm.h:95
int mca_coll_sm_barrier_intra(struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
Shared memory barrier.
Definition: coll_sm_barrier.c:52
int sm_priority
MCA parameter: Priority of this component.
Definition: coll_sm.h:60
int mcstn_num_children
Number of children, or 0 if a leaf.
Definition: coll_sm.h:101
Collective component interface.
Definition: coll.h:283
int sm_comm_num_segments
MCA parameter: Number of segments for each communicator in the data mpool.
Definition: coll_sm.h:71
int mca_coll_sm_reduce_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
Shared memory reduction.
Definition: coll_sm_reduce.c:67
uint32_t mcb_operation_count
Operation number (i.e., which segment number to use)
Definition: coll_sm.h:180
int sm_segs_per_inuse_flag
How many fragment segments are protected by a single in-use flags.
Definition: coll_sm.h:89
struct mca_coll_sm_component_t mca_coll_sm_component_t
Structure to hold the sm coll component.
mca_coll_base_module_t super
Base module.
Definition: coll_sm.h:186
volatile uint32_t mcsiuf_operation_count
Must match data->mcb_count.
Definition: coll_sm.h:118
Structure for the sm coll module to hang off the communicator.
Definition: coll_sm.h:142
Top-level interface for all MCA components.
struct mca_coll_sm_module_t mca_coll_sm_module_t
Coll sm module.
int sm_fragment_size
MCA parameter: Fragment size for data.
Definition: coll_sm.h:74
Collective Communication Interface.
int sm_tree_degree
MCA parameter: Degree of tree for tree-based collectives.
Definition: coll_sm.h:77
struct mca_coll_sm_comm_t mca_coll_sm_comm_t
Structure for the sm coll module to hang off the communicator.
mca_coll_sm_in_use_flag_t * mcb_in_use_flags
"In use" flags indicating which segments are available
Definition: coll_sm.h:168
int sm_comm_num_in_use_flags
MCA parameter: Number of "in use" flags in each communicator's area in the data mpool.
Definition: coll_sm.h:67
int sm_control_size
MCA parameter: Length of a cache line or page (in bytes)
Definition: coll_sm.h:63
struct mca_coll_sm_data_index_t mca_coll_sm_data_index_t
Structure containing pointers to various arrays of data in the per-communicator shmem data segment (o...
int mcb_barrier_count
Number of barriers that we have executed (i.e., which set of barrier buffers to use).
Definition: coll_sm.h:165
mca_coll_sm_tree_node_t * mcb_tree
Array of graph nodes representing the tree used for communications.
Definition: coll_sm.h:177
Structure to hold the sm coll component.
Definition: coll_sm.h:55
struct mca_coll_sm_tree_node_t ** mcstn_children
Pointer to an array of children, or NULL if 0 == mcstn_num_children.
Definition: coll_sm.h:104
int sm_info_comm_size
MCA parameter: Number of processes to use in the calculation of the "info" MCA parameter.
Definition: coll_sm.h:81
int mcstn_id
Arbitrary ID number, starting from 0.
Definition: coll_sm.h:97
uint32_t * mcb_barrier_control_children
Pointers to my childrens' barrier control pages (they're contiguous in memory, so we only point to th...
Definition: coll_sm.h:161
uint32_t * mcb_barrier_control_parent
Pointer to my parent's barrier control pages (will be NULL for communicator rank 0; odd index pages a...
Definition: coll_sm.h:154
int mca_coll_sm_bcast_intra(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
Shared memory broadcast.
Definition: coll_sm_bcast.c:58
OMPI_MODULE_DECLSPEC mca_coll_sm_component_t mca_coll_sm_component
Global component instance.
Definition: coll_sm_component.c:55
Simple structure comprising the "in use" flags.
Definition: coll_sm.h:113
struct mca_coll_sm_tree_node_t * mcstn_parent
Pointer to parent, or NULL if root.
Definition: coll_sm.h:99
mca_coll_sm_data_index_t * mcb_data_index
Array of indexes into the per-communicator shmem data segment for control and data fragment passing (...
Definition: coll_sm.h:173
Definition: communicator.h:118
Back-end type of MPI_Op.
Definition: op.h:100
uint32_t * mcb_barrier_control_me
Pointer to my barrier control pages (odd index pages are "in", even index pages are "out") ...
Definition: coll_sm.h:149
Coll sm module.
Definition: coll_sm.h:184
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236
struct mca_coll_sm_tree_node_t mca_coll_sm_tree_node_t
Structure for representing a node in the tree.
Structure containing pointers to various arrays of data in the per-communicator shmem data segment (o...
Definition: coll_sm.h:129