OpenMPI  0.1.1
coll_sm.h File Reference
#include "ompi_config.h"
#include "mpi.h"
#include "opal/mca/mca.h"
#include "opal/datatype/opal_convertor.h"
#include "orte/types.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/common/sm/common_sm.h"

Go to the source code of this file.

Data Structures

struct  mca_coll_sm_component_t
 Structure to hold the sm coll component. More...
 
struct  mca_coll_sm_tree_node_t
 Structure for representing a node in the tree. More...
 
struct  mca_coll_sm_in_use_flag_t
 Simple structure comprising the "in use" flags. More...
 
struct  mca_coll_sm_data_index_t
 Structure containing pointers to various arrays of data in the per-communicator shmem data segment (one of these indexes a single segment in the per-communicator shmem data segment). More...
 
struct  mca_coll_sm_comm_t
 Structure for the sm coll module to hang off the communicator. More...
 
struct  mca_coll_sm_module_t
 Coll sm module. More...
 

Macros

#define SPIN_CONDITION_MAX   100000
 
#define SPIN_CONDITION(cond, exit_label)
 
#define FLAG_SETUP(flag_num, flag, data)
 Macro to setup flag usage. More...
 
#define FLAG_WAIT_FOR_IDLE(flag, label)   SPIN_CONDITION(0 == (flag)->mcsiuf_num_procs_using, label)
 Macro to wait for the in-use flag to become idle (used by the root)
 
#define FLAG_WAIT_FOR_OP(flag, op, label)   SPIN_CONDITION((op) == flag->mcsiuf_operation_count, label)
 Macro to wait for a flag to indicate that it's ready for this operation (used by non-root processes to know when FLAG_RETAIN() has been called)
 
#define FLAG_RETAIN(flag, num_procs, op_count)
 Macro to set an in-use flag with relevant data to claim it. More...
 
#define FLAG_RELEASE(flag)   opal_atomic_add(&(flag)->mcsiuf_num_procs_using, -1)
 Macro to release an in-use flag from this process.
 
#define COPY_FRAGMENT_IN(convertor, index, rank, iov, max_data)
 Macro to copy a single segment in from a user buffer to a shared segment. More...
 
#define COPY_FRAGMENT_OUT(convertor, src_rank, index, iov, max_data)
 Macro to copy a single segment out from a shared segment to a user buffer. More...
 
#define COPY_FRAGMENT_BETWEEN(src_rank, dest_rank, index, len)
 Macro to memcpy a fragment between one shared segment and another. More...
 
#define PARENT_NOTIFY_CHILDREN(children, num_children, index, value)
 Macro to tell children that a segment is ready (normalize the child's ID based on the shift used to calculate the "me" node in the tree). More...
 
#define CHILD_WAIT_FOR_NOTIFY(rank, index, value, label)
 Macro for children to wait for parent notification (use real rank). More...
 
#define CHILD_NOTIFY_PARENT(child_rank, parent_rank, index, value)
 Macro for children to tell parent that the data is ready in their segment. More...
 
#define PARENT_WAIT_FOR_NOTIFY_SPECIFIC(child_rank, parent_rank, index, value, label)
 Macro for parent to wait for a specific child to tell it that the data is in the child's segment. More...
 

Typedefs

typedef struct
mca_coll_sm_component_t 
mca_coll_sm_component_t
 Structure to hold the sm coll component. More...
 
typedef struct
mca_coll_sm_tree_node_t 
mca_coll_sm_tree_node_t
 Structure for representing a node in the tree.
 
typedef struct
mca_coll_sm_in_use_flag_t 
mca_coll_sm_in_use_flag_t
 Simple structure comprising the "in use" flags. More...
 
typedef struct
mca_coll_sm_data_index_t 
mca_coll_sm_data_index_t
 Structure containing pointers to various arrays of data in the per-communicator shmem data segment (one of these indexes a single segment in the per-communicator shmem data segment). More...
 
typedef struct mca_coll_sm_comm_t mca_coll_sm_comm_t
 Structure for the sm coll module to hang off the communicator. More...
 
typedef struct mca_coll_sm_module_t mca_coll_sm_module_t
 Coll sm module.
 

Functions

 OBJ_CLASS_DECLARATION (mca_coll_sm_module_t)
 
int mca_coll_sm_init_query (bool enable_progress_threads, bool enable_mpi_threads)
 
mca_coll_base_module_t * mca_coll_sm_comm_query (struct ompi_communicator_t *comm, int *priority)
 
int ompi_coll_sm_lazy_enable (mca_coll_base_module_t *module, struct ompi_communicator_t *comm)
 
int mca_coll_sm_allgather_intra (void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 
int mca_coll_sm_allgatherv_intra (void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 
int mca_coll_sm_allreduce_intra (void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 Shared memory allreduce. More...
 
int mca_coll_sm_alltoall_intra (void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 
int mca_coll_sm_alltoallv_intra (void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 
int mca_coll_sm_alltoallw_intra (void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t **sdtypes, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t **rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 
int mca_coll_sm_barrier_intra (struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 Shared memory barrier. More...
 
int mca_coll_sm_bcast_intra (void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 Shared memory broadcast. More...
 
int mca_coll_sm_bcast_log_intra (void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 
int mca_coll_sm_exscan_intra (void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 
int mca_coll_sm_gather_intra (void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 
int mca_coll_sm_gatherv_intra (void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 
int mca_coll_sm_reduce_intra (void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 Shared memory reduction. More...
 
int mca_coll_sm_reduce_log_intra (void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 
int mca_coll_sm_reduce_scatter_intra (void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 
int mca_coll_sm_scan_intra (void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 
int mca_coll_sm_scatter_intra (void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 
int mca_coll_sm_scatterv_intra (void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
 
int mca_coll_sm_ft_event (int state)
 

Variables

OMPI_MODULE_DECLSPEC
mca_coll_sm_component_t 
mca_coll_sm_component
 Global component instance.
 
uint32_t mca_coll_sm_one
 Global variables used in the macros (essentially constants, so these are thread safe)
 

Macro Definition Documentation

#define CHILD_NOTIFY_PARENT (   child_rank,
  parent_rank,
  index,
  value 
)
Value:
((size_t volatile *) \
(((char*) (index)->mcbmi_control) + \
(parent_rank))))[(child_rank)] = (value)
int sm_control_size
MCA parameter: Length of a cache line or page (in bytes)
Definition: coll_sm.h:63
OMPI_MODULE_DECLSPEC mca_coll_sm_component_t mca_coll_sm_component
Global component instance.
Definition: coll_sm_component.c:55

Macro for children to tell parent that the data is ready in their segment.

Used for fan in operations.

#define CHILD_WAIT_FOR_NOTIFY (   rank,
  index,
  value,
  label 
)
Value:
do { \
uint32_t volatile *ptr = ((uint32_t*) \
(((char*) index->mcbmi_control) + \
SPIN_CONDITION(0 != *ptr, label); \
(value) = *ptr; \
*ptr = 0; \
} while (0)
int sm_control_size
MCA parameter: Length of a cache line or page (in bytes)
Definition: coll_sm.h:63
OMPI_MODULE_DECLSPEC mca_coll_sm_component_t mca_coll_sm_component
Global component instance.
Definition: coll_sm_component.c:55

Macro for children to wait for parent notification (use real rank).

Save the value passed and then reset it when done. Used in fan out operations.

#define COPY_FRAGMENT_BETWEEN (   src_rank,
  dest_rank,
  index,
  len 
)
Value:
memcpy(((index)->mcbmi_data + \
((index)->mcbmi_data + \
((src_rank) * \
(len))
int sm_fragment_size
MCA parameter: Fragment size for data.
Definition: coll_sm.h:74
OMPI_MODULE_DECLSPEC mca_coll_sm_component_t mca_coll_sm_component
Global component instance.
Definition: coll_sm_component.c:55

Macro to memcpy a fragment between one shared segment and another.

#define COPY_FRAGMENT_IN (   convertor,
  index,
  rank,
  iov,
  max_data 
)
Value:
(iov).iov_base = \
(index)->mcbmi_data + \
(iov).iov_len = (max_data); \
opal_convertor_pack(&(convertor), &(iov), &mca_coll_sm_one, \
&(max_data) )
uint32_t mca_coll_sm_one
Global variables used in the macros (essentially constants, so these are thread safe) ...
Definition: coll_sm_module.c:70
int sm_fragment_size
MCA parameter: Fragment size for data.
Definition: coll_sm.h:74
OMPI_MODULE_DECLSPEC mca_coll_sm_component_t mca_coll_sm_component
Global component instance.
Definition: coll_sm_component.c:55

Macro to copy a single segment in from a user buffer to a shared segment.

Referenced by mca_coll_sm_bcast_intra().

#define COPY_FRAGMENT_OUT (   convertor,
  src_rank,
  index,
  iov,
  max_data 
)
Value:
(iov).iov_base = (((char*) (index)->mcbmi_data) + \
(iov).iov_len = (max_data); \
opal_convertor_unpack(&(convertor), &(iov), &mca_coll_sm_one, \
&(max_data) )
uint32_t mca_coll_sm_one
Global variables used in the macros (essentially constants, so these are thread safe) ...
Definition: coll_sm_module.c:70
int sm_fragment_size
MCA parameter: Fragment size for data.
Definition: coll_sm.h:74
OMPI_MODULE_DECLSPEC mca_coll_sm_component_t mca_coll_sm_component
Global component instance.
Definition: coll_sm_component.c:55

Macro to copy a single segment out from a shared segment to a user buffer.

#define FLAG_RETAIN (   flag,
  num_procs,
  op_count 
)
Value:
(flag)->mcsiuf_num_procs_using = (num_procs); \
(flag)->mcsiuf_operation_count = (op_count)

Macro to set an in-use flag with relevant data to claim it.

Referenced by mca_coll_sm_bcast_intra().

#define FLAG_SETUP (   flag_num,
  flag,
  data 
)
Value:
(((char *) (data)->mcb_in_use_flags) + \
int sm_control_size
MCA parameter: Length of a cache line or page (in bytes)
Definition: coll_sm.h:63
OMPI_MODULE_DECLSPEC mca_coll_sm_component_t mca_coll_sm_component
Global component instance.
Definition: coll_sm_component.c:55
Simple structure comprising the "in use" flags.
Definition: coll_sm.h:113

Macro to setup flag usage.

Referenced by mca_coll_sm_bcast_intra().

#define PARENT_NOTIFY_CHILDREN (   children,
  num_children,
  index,
  value 
)
Value:
do { \
for (i = 0; i < (num_children); ++i) { \
*((size_t*) \
(((char*) index->mcbmi_control) + \
(((children)[i]->mcstn_id + root) % size)))) = (value); \
} \
} while (0)
int sm_control_size
MCA parameter: Length of a cache line or page (in bytes)
Definition: coll_sm.h:63
OMPI_MODULE_DECLSPEC mca_coll_sm_component_t mca_coll_sm_component
Global component instance.
Definition: coll_sm_component.c:55

Macro to tell children that a segment is ready (normalize the child's ID based on the shift used to calculate the "me" node in the tree).

Used in fan out operations.

Referenced by mca_coll_sm_bcast_intra().

#define PARENT_WAIT_FOR_NOTIFY_SPECIFIC (   child_rank,
  parent_rank,
  index,
  value,
  label 
)
Value:
do { \
size_t volatile *ptr = ((size_t volatile *) \
(((char*) index->mcbmi_control) + \
(parent_rank)))) + child_rank; \
SPIN_CONDITION(0 != *ptr, label); \
(value) = *ptr; \
*ptr = 0; \
} while (0)
int sm_control_size
MCA parameter: Length of a cache line or page (in bytes)
Definition: coll_sm.h:63
OMPI_MODULE_DECLSPEC mca_coll_sm_component_t mca_coll_sm_component
Global component instance.
Definition: coll_sm_component.c:55

Macro for parent to wait for a specific child to tell it that the data is in the child's segment.

Save the value when done. Used for fan in operations.

#define SPIN_CONDITION (   cond,
  exit_label 
)
Value:
do { int i; \
if (cond) goto exit_label; \
for (i = 0; i < SPIN_CONDITION_MAX; ++i) { \
if (cond) { goto exit_label; } \
} \
} while (1); \
exit_label:
OPAL_DECLSPEC void opal_progress(void)
Progress all pending events.
Definition: opal_progress.c:165

Typedef Documentation

Structure for the sm coll module to hang off the communicator.

Contains communicator-specific information, including pointers into the per-communicator shmem data segment for this comm's sm collective operations area.

Structure to hold the sm coll component.

First it holds the base coll component, and then holds a bunch of sm-coll-component-specific stuff (e.g., current MCA param values).

Structure containing pointers to various arrays of data in the per-communicator shmem data segment (one of these indexes a single segment in the per-communicator shmem data segment).

Nothing is hard-coded because all the array lengths and displacements of the pointers all depend on how many processes are in the communicator.

Simple structure comprising the "in use" flags.

Contains two members: the number of processes that are currently using this set of segments and the operation number of the current operation.

Function Documentation

int mca_coll_sm_allreduce_intra ( void *  sbuf,
void *  rbuf,
int  count,
struct ompi_datatype_t *  dtype,
struct ompi_op_t *  op,
struct ompi_communicator_t *  comm,
mca_coll_base_module_t *  module 
)

Shared memory allreduce.

For the moment, all we're doing is a reduce to root==0 and then a broadcast. It is possible that we'll do something better someday.

References mca_coll_sm_bcast_intra(), and mca_coll_sm_reduce_intra().

int mca_coll_sm_barrier_intra ( struct ompi_communicator_t *  comm,
mca_coll_base_module_t *  module 
)

Shared memory barrier.

Tree-based algorithm for a barrier: a fan in to rank 0 followed by a fan out using the barrier segments in the shared memory area.

There are 2 sets of barrier buffers – since there can only be, at most, 2 outstanding barriers at any time, there is no need for more than this. The generalized in-use flags, control, and data segments are not used.

The general algorithm is for a given process to wait for its N children to fan in by monitoring a uint32_t in its barrier "in" buffer. When this value reaches N (i.e., each of the children has atomically incremented the value), then the process atomically increases the uint32_t in its parent's "in" buffer. Then the process waits for the parent to set a "1" in the process' "out" buffer. Once this happens, the process writes a "1" in each of its children's "out" buffers, and returns.

There are corner cases, of course, such as the root that has no parent, and the leaves that have no children. But that's the general idea.

References mca_coll_sm_component, mca_coll_sm_comm_t::mcb_barrier_control_children, mca_coll_sm_comm_t::mcb_barrier_control_me, mca_coll_sm_comm_t::mcb_barrier_control_parent, mca_coll_sm_comm_t::mcb_barrier_count, mca_coll_sm_comm_t::mcb_tree, mca_coll_sm_tree_node_t::mcstn_num_children, opal_atomic_add, and mca_coll_sm_component_t::sm_control_size.

int mca_coll_sm_bcast_intra ( void *  buff,
int  count,
struct ompi_datatype_t *  datatype,
int  root,
struct ompi_communicator_t *  comm,
mca_coll_base_module_t *  module 
)

Shared memory broadcast.

For the root, the general algorithm is to wait for a set of segments to become available. Once a set is available, the root claims it by writing the current operation number and the number of processes using the set to the flag. The root then loops over the set of segments; for each segment, it copies a fragment of the user's buffer into the shared data segment and then writes the data size into its children's control buffers. The process is repeated until all fragments have been written.

For non-roots, for each set of buffers, they wait until the current operation number appears in the in-use flag (i.e., written by the root). Then for each segment, they wait for a nonzero value to appear in their control buffers. If they have children, they copy the data from their parent's shared data segment into their shared data segment, and write the data size into each of their children's control buffers. They then copy the data from their shared [local] data segment into the user's output buffer. The process is repeated until all fragments have been received. If they do not have children, they copy the data directly from the parent's shared data segment into the user's output buffer.

References COPY_FRAGMENT_IN, FLAG_RETAIN, FLAG_SETUP, FLAG_WAIT_FOR_IDLE, mca_coll_sm_component, mca_coll_sm_comm_t::mcb_data_index, mca_coll_sm_comm_t::mcb_operation_count, mca_coll_sm_comm_t::mcb_tree, mca_coll_sm_tree_node_t::mcstn_children, mca_coll_sm_tree_node_t::mcstn_num_children, mca_coll_sm_tree_node_t::mcstn_parent, OBJ_CONSTRUCT, opal_atomic_wmb(), PARENT_NOTIFY_CHILDREN, mca_coll_sm_component_t::sm_comm_num_in_use_flags, mca_coll_sm_component_t::sm_fragment_size, mca_coll_sm_component_t::sm_segs_per_inuse_flag, and ompi_datatype_t::super.

Referenced by mca_coll_sm_allreduce_intra().

int mca_coll_sm_reduce_intra ( void *  sbuf,
void *  rbuf,
int  count,
struct ompi_datatype_t *  dtype,
struct ompi_op_t *  op,
int  root,
struct ompi_communicator_t *  comm,
mca_coll_base_module_t *  module 
)

Shared memory reduction.

Simply farms out to the associative or non-associative functions.

References mca_coll_sm_component, ompi_op_is_float_assoc(), ompi_op_is_intrinsic(), and mca_coll_sm_component_t::sm_control_size.

Referenced by mca_coll_sm_allreduce_intra().