OpenMPI
0.1.1
|
#include "ompi_config.h"
#include "mpi.h"
#include "opal/mca/mca.h"
#include "opal/datatype/opal_convertor.h"
#include "orte/types.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/common/sm/common_sm.h"
Go to the source code of this file.
Data Structures | |
struct | mca_coll_sm_component_t |
Structure to hold the sm coll component. More... | |
struct | mca_coll_sm_tree_node_t |
Structure for representing a node in the tree. More... | |
struct | mca_coll_sm_in_use_flag_t |
Simple structure comprising the "in use" flags. More... | |
struct | mca_coll_sm_data_index_t |
Structure containing pointers to various arrays of data in the per-communicator shmem data segment (one of these indexes a single segment in the per-communicator shmem data segment). More... | |
struct | mca_coll_sm_comm_t |
Structure for the sm coll module to hang off the communicator. More... | |
struct | mca_coll_sm_module_t |
Coll sm module. More... | |
Macros | |
#define | SPIN_CONDITION_MAX 100000 |
#define | SPIN_CONDITION(cond, exit_label) |
#define | FLAG_SETUP(flag_num, flag, data) |
Macro to setup flag usage. More... | |
#define | FLAG_WAIT_FOR_IDLE(flag, label) SPIN_CONDITION(0 == (flag)->mcsiuf_num_procs_using, label) |
Macro to wait for the in-use flag to become idle (used by the root) | |
#define | FLAG_WAIT_FOR_OP(flag, op, label) SPIN_CONDITION((op) == flag->mcsiuf_operation_count, label) |
Macro to wait for a flag to indicate that it's ready for this operation (used by non-root processes to know when FLAG_RETAIN() has been called) | |
#define | FLAG_RETAIN(flag, num_procs, op_count) |
Macro to set an in-use flag with relevant data to claim it. More... | |
#define | FLAG_RELEASE(flag) opal_atomic_add(&(flag)->mcsiuf_num_procs_using, -1) |
Macro to release an in-use flag from this process. | |
#define | COPY_FRAGMENT_IN(convertor, index, rank, iov, max_data) |
Macro to copy a single segment in from a user buffer to a shared segment. More... | |
#define | COPY_FRAGMENT_OUT(convertor, src_rank, index, iov, max_data) |
Macro to copy a single segment out from a shared segment to a user buffer. More... | |
#define | COPY_FRAGMENT_BETWEEN(src_rank, dest_rank, index, len) |
Macro to memcpy a fragment between one shared segment and another. More... | |
#define | PARENT_NOTIFY_CHILDREN(children, num_children, index, value) |
Macro to tell children that a segment is ready (normalize the child's ID based on the shift used to calculate the "me" node in the tree). More... | |
#define | CHILD_WAIT_FOR_NOTIFY(rank, index, value, label) |
Macro for children to wait for parent notification (use real rank). More... | |
#define | CHILD_NOTIFY_PARENT(child_rank, parent_rank, index, value) |
Macro for children to tell parent that the data is ready in their segment. More... | |
#define | PARENT_WAIT_FOR_NOTIFY_SPECIFIC(child_rank, parent_rank, index, value, label) |
Macro for parent to wait for a specific child to tell it that the data is in the child's segment. More... | |
Typedefs | |
typedef struct mca_coll_sm_component_t | mca_coll_sm_component_t |
Structure to hold the sm coll component. More... | |
typedef struct mca_coll_sm_tree_node_t | mca_coll_sm_tree_node_t |
Structure for representing a node in the tree. | |
typedef struct mca_coll_sm_in_use_flag_t | mca_coll_sm_in_use_flag_t |
Simple structure comprising the "in use" flags. More... | |
typedef struct mca_coll_sm_data_index_t | mca_coll_sm_data_index_t |
Structure containing pointers to various arrays of data in the per-communicator shmem data segment (one of these indexes a single segment in the per-communicator shmem data segment). More... | |
typedef struct mca_coll_sm_comm_t | mca_coll_sm_comm_t |
Structure for the sm coll module to hang off the communicator. More... | |
typedef struct mca_coll_sm_module_t | mca_coll_sm_module_t |
Coll sm module. | |
Functions | |
OBJ_CLASS_DECLARATION (mca_coll_sm_module_t) | |
int | mca_coll_sm_init_query (bool enable_progress_threads, bool enable_mpi_threads) |
mca_coll_base_module_t * | mca_coll_sm_comm_query (struct ompi_communicator_t *comm, int *priority) |
int | ompi_coll_sm_lazy_enable (mca_coll_base_module_t *module, struct ompi_communicator_t *comm) |
int | mca_coll_sm_allgather_intra (void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
int | mca_coll_sm_allgatherv_intra (void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
int | mca_coll_sm_allreduce_intra (void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
Shared memory allreduce. More... | |
int | mca_coll_sm_alltoall_intra (void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
int | mca_coll_sm_alltoallv_intra (void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
int | mca_coll_sm_alltoallw_intra (void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t **sdtypes, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t **rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
int | mca_coll_sm_barrier_intra (struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
Shared memory barrier. More... | |
int | mca_coll_sm_bcast_intra (void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
Shared memory broadcast. More... | |
int | mca_coll_sm_bcast_log_intra (void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
int | mca_coll_sm_exscan_intra (void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
int | mca_coll_sm_gather_intra (void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
int | mca_coll_sm_gatherv_intra (void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
int | mca_coll_sm_reduce_intra (void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
Shared memory reduction. More... | |
int | mca_coll_sm_reduce_log_intra (void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
int | mca_coll_sm_reduce_scatter_intra (void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
int | mca_coll_sm_scan_intra (void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
int | mca_coll_sm_scatter_intra (void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
int | mca_coll_sm_scatterv_intra (void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) |
int | mca_coll_sm_ft_event (int state) |
Variables | |
OMPI_MODULE_DECLSPEC mca_coll_sm_component_t | mca_coll_sm_component |
Global component instance. | |
uint32_t | mca_coll_sm_one |
Global variables used in the macros (essentially constants, so these are thread safe) | |
#define CHILD_NOTIFY_PARENT | ( | child_rank, | |
parent_rank, | |||
index, | |||
value | |||
) |
Macro for children to tell parent that the data is ready in their segment.
Used for fan in operations.
#define CHILD_WAIT_FOR_NOTIFY | ( | rank, | |
index, | |||
value, | |||
label | |||
) |
Macro for children to wait for parent notification (use real rank).
Save the value passed and then reset it when done. Used in fan out operations.
#define COPY_FRAGMENT_BETWEEN | ( | src_rank, | |
dest_rank, | |||
index, | |||
len | |||
) |
Macro to memcpy a fragment between one shared segment and another.
#define COPY_FRAGMENT_IN | ( | convertor, | |
index, | |||
rank, | |||
iov, | |||
max_data | |||
) |
Macro to copy a single segment in from a user buffer to a shared segment.
Referenced by mca_coll_sm_bcast_intra().
#define COPY_FRAGMENT_OUT | ( | convertor, | |
src_rank, | |||
index, | |||
iov, | |||
max_data | |||
) |
Macro to copy a single segment out from a shared segment to a user buffer.
#define FLAG_RETAIN | ( | flag, | |
num_procs, | |||
op_count | |||
) |
Macro to set an in-use flag with relevant data to claim it.
Referenced by mca_coll_sm_bcast_intra().
#define FLAG_SETUP | ( | flag_num, | |
flag, | |||
data | |||
) |
Macro to setup flag usage.
Referenced by mca_coll_sm_bcast_intra().
#define PARENT_NOTIFY_CHILDREN | ( | children, | |
num_children, | |||
index, | |||
value | |||
) |
Macro to tell children that a segment is ready (normalize the child's ID based on the shift used to calculate the "me" node in the tree).
Used in fan out operations.
Referenced by mca_coll_sm_bcast_intra().
#define PARENT_WAIT_FOR_NOTIFY_SPECIFIC | ( | child_rank, | |
parent_rank, | |||
index, | |||
value, | |||
label | |||
) |
Macro for parent to wait for a specific child to tell it that the data is in the child's segment.
Save the value when done. Used for fan in operations.
#define SPIN_CONDITION | ( | cond, | |
exit_label | |||
) |
typedef struct mca_coll_sm_comm_t mca_coll_sm_comm_t |
Structure for the sm coll module to hang off the communicator.
Contains communicator-specific information, including pointers into the per-communicator shmem data segment for this comm's sm collective operations area.
typedef struct mca_coll_sm_component_t mca_coll_sm_component_t |
Structure to hold the sm coll component.
First it holds the base coll component, and then holds a bunch of sm-coll-component-specific stuff (e.g., current MCA param values).
typedef struct mca_coll_sm_data_index_t mca_coll_sm_data_index_t |
Structure containing pointers to various arrays of data in the per-communicator shmem data segment (one of these indexes a single segment in the per-communicator shmem data segment).
Nothing is hard-coded because all the array lengths and displacements of the pointers all depend on how many processes are in the communicator.
typedef struct mca_coll_sm_in_use_flag_t mca_coll_sm_in_use_flag_t |
Simple structure comprising the "in use" flags.
Contains two members: the number of processes that are currently using this set of segments and the operation number of the current operation.
int mca_coll_sm_allreduce_intra | ( | void * | sbuf, |
void * | rbuf, | ||
int | count, | ||
struct ompi_datatype_t * | dtype, | ||
struct ompi_op_t * | op, | ||
struct ompi_communicator_t * | comm, | ||
mca_coll_base_module_t * | module | ||
) |
Shared memory allreduce.
For the moment, all we're doing is a reduce to root==0 and then a broadcast. It is possible that we'll do something better someday.
References mca_coll_sm_bcast_intra(), and mca_coll_sm_reduce_intra().
int mca_coll_sm_barrier_intra | ( | struct ompi_communicator_t * | comm, |
mca_coll_base_module_t * | module | ||
) |
Shared memory barrier.
Tree-based algorithm for a barrier: a fan in to rank 0 followed by a fan out using the barrier segments in the shared memory area.
There are 2 sets of barrier buffers – since there can only be, at most, 2 outstanding barriers at any time, there is no need for more than this. The generalized in-use flags, control, and data segments are not used.
The general algorithm is for a given process to wait for its N children to fan in by monitoring a uint32_t in its barrier "in" buffer. When this value reaches N (i.e., each of the children has atomically incremented the value), then the process atomically increases the uint32_t in its parent's "in" buffer. Then the process waits for the parent to set a "1" in the process' "out" buffer. Once this happens, the process writes a "1" in each of its children's "out" buffers, and returns.
There are corner cases, of course, such as the root that has no parent, and the leaves that have no children. But that's the general idea.
References mca_coll_sm_component, mca_coll_sm_comm_t::mcb_barrier_control_children, mca_coll_sm_comm_t::mcb_barrier_control_me, mca_coll_sm_comm_t::mcb_barrier_control_parent, mca_coll_sm_comm_t::mcb_barrier_count, mca_coll_sm_comm_t::mcb_tree, mca_coll_sm_tree_node_t::mcstn_num_children, opal_atomic_add, and mca_coll_sm_component_t::sm_control_size.
int mca_coll_sm_bcast_intra | ( | void * | buff, |
int | count, | ||
struct ompi_datatype_t * | datatype, | ||
int | root, | ||
struct ompi_communicator_t * | comm, | ||
mca_coll_base_module_t * | module | ||
) |
Shared memory broadcast.
For the root, the general algorithm is to wait for a set of segments to become available. Once it is, the root claims the set by writing the current operation number and the number of processes using the set to the flag. The root then loops over the set of segments; for each segment, it copies a fragment of the user's buffer into the shared data segment and then writes the data size into its children's control buffers. The process is repeated until all fragments have been written.
For non-roots, for each set of buffers, they wait until the current operation number appears in the in-use flag (i.e., written by the root). Then for each segment, they wait for a nonzero value to appear in their control buffers. If they have children, they copy the data from their parent's shared data segment into their shared data segment, and write the data size into each of their children's control buffers. They then copy the data from their shared [local] data segment into the user's output buffer. The process is repeated until all fragments have been received. If they do not have children, they copy the data directly from the parent's shared data segment into the user's output buffer.
References COPY_FRAGMENT_IN, FLAG_RETAIN, FLAG_SETUP, FLAG_WAIT_FOR_IDLE, mca_coll_sm_component, mca_coll_sm_comm_t::mcb_data_index, mca_coll_sm_comm_t::mcb_operation_count, mca_coll_sm_comm_t::mcb_tree, mca_coll_sm_tree_node_t::mcstn_children, mca_coll_sm_tree_node_t::mcstn_num_children, mca_coll_sm_tree_node_t::mcstn_parent, OBJ_CONSTRUCT, opal_atomic_wmb(), PARENT_NOTIFY_CHILDREN, mca_coll_sm_component_t::sm_comm_num_in_use_flags, mca_coll_sm_component_t::sm_fragment_size, mca_coll_sm_component_t::sm_segs_per_inuse_flag, and ompi_datatype_t::super.
Referenced by mca_coll_sm_allreduce_intra().
int mca_coll_sm_reduce_intra | ( | void * | sbuf, |
void * | rbuf, | ||
int | count, | ||
struct ompi_datatype_t * | dtype, | ||
struct ompi_op_t * | op, | ||
int | root, | ||
struct ompi_communicator_t * | comm, | ||
mca_coll_base_module_t * | module | ||
) |
Shared memory reduction.
Simply farms out to the associative or non-associative functions.
References mca_coll_sm_component, ompi_op_is_float_assoc(), ompi_op_is_intrinsic(), and mca_coll_sm_component_t::sm_control_size.
Referenced by mca_coll_sm_allreduce_intra().