/* OpenMPI 0.1.1 -- coll_tuned.h (documentation-extraction banner, not code) */
1 /*
2  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2009 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
13  * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
14  * $COPYRIGHT$
15  *
16  * Additional copyrights may follow
17  *
18  * $HEADER$
19  */
20 
21 #ifndef MCA_COLL_TUNED_EXPORT_H
22 #define MCA_COLL_TUNED_EXPORT_H
23 
24 #include "ompi_config.h"
25 
26 #include "mpi.h"
27 #include "opal/mca/mca.h"
28 #include "ompi/mca/coll/coll.h"
29 #include "ompi/request/request.h"
30 
31 /* need to include our own topo prototypes so we can malloc data on the comm correctly */
32 #include "coll_tuned_topo.h"
33 
34 /* also need the dynamic rule structures */
35 #include "coll_tuned_dynamic_rules.h"
36 
37 /* some fixed value index vars to simplify certain operations */
/* Index values naming each MPI collective.  Used to address the
 * per-collective tables in this component (forced-algorithm params,
 * communicator rules), so COLLCOUNT must stay the last entry. */
typedef enum COLLTYPE {
    ALLGATHER     =  0,
    ALLGATHERV    =  1,
    ALLREDUCE     =  2,
    ALLTOALL      =  3,
    ALLTOALLV     =  4,
    ALLTOALLW     =  5,
    BARRIER       =  6,
    BCAST         =  7,
    EXSCAN        =  8,
    GATHER        =  9,
    GATHERV       = 10,
    REDUCE        = 11,
    REDUCESCATTER = 12,
    SCAN          = 13,
    SCATTER       = 14,
    SCATTERV      = 15,
    COLLCOUNT     = 16   /* end counter -- keep it as the last element */
} COLLTYPE_T;
57 
/* defined arg lists to simplify auto inclusion of user overriding decision functions */
/* NOTE: each list mirrors the corresponding mca_coll module signature (see
 * ompi/mca/coll/coll.h) so the prototypes below stay in sync in one place. */
#define ALLGATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLGATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void * rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLREDUCE_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALL_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALLV_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALLW_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t **sdtypes, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t **rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define BARRIER_ARGS struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define BCAST_ARGS void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define EXSCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define GATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define GATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define REDUCE_ARGS void *sbuf, void* rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define REDUCESCATTER_ARGS void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCATTER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCATTERV_ARGS void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
/* end defined arg lists to simplify auto inclusion of user overriding decision functions */
76 
77 BEGIN_C_DECLS
78 
/* these are the same across all modules and are loaded at component query time */
extern int ompi_coll_tuned_stream;          /* presumably an opal output stream id for verbose messages -- confirm in component open */
extern int ompi_coll_tuned_priority;        /* component priority (see tuned_priority in the component struct) */
extern int ompi_coll_tuned_preallocate_memory_comm_size_limit;  /* comm-size limit for preallocating mcct_reqs space -- see comment on mca_coll_tuned_comm_t */
extern int ompi_coll_tuned_use_dynamic_rules;   /* non-zero: use the dynamic decision rules instead of the fixed ones */
extern char* ompi_coll_tuned_dynamic_rules_filename;  /* filename for the dynamic rules (see coll_tuned_dynamic_rules.h) */
extern int ompi_coll_tuned_init_tree_fanout;    /* default tree fanout used when building cached topologies */
extern int ompi_coll_tuned_init_chain_fanout;   /* default chain fanout used when building cached topologies */
extern int ompi_coll_tuned_init_max_requests;   /* default cap on outstanding send/recv requests */
88 
/* forced algorithm choices */
/* this structure is for storing the indexes to the forced algorithm mca params... */
/* we get these at component query (so that registered values appear in ompi_info) */
/* NOTE(review): the extraction dropped the struct opener and trailing typedef;
 * reconstructed here -- L115 below uses the typedef name. */
struct coll_tuned_force_algorithm_mca_param_indices_t {
    int algorithm_param_index;      /* which algorithm you want to force */
    int segsize_param_index;        /* segsize to use (if supported), 0 = no segmentation */
    int tree_fanout_param_index;    /* tree fanout/in to use */
    int chain_fanout_param_index;   /* K-chain fanout/in to use */
    int max_requests_param_index;   /* Maximum number of outstanding send or recv requests */
};
typedef struct coll_tuned_force_algorithm_mca_param_indices_t coll_tuned_force_algorithm_mca_param_indices_t;
100 
101 
/* the following type is for storing actual value obtained from the MCA on each tuned module */
/* via their mca param indices lookup in the component */
/* this structure is stored once per collective type per communicator... */
/* NOTE(review): struct opener and trailing typedef dropped by the extraction;
 * reconstructed -- user_forced[] in mca_coll_tuned_comm_t uses the typedef name. */
struct coll_tuned_force_algorithm_params_t {
    int algorithm;      /* which algorithm you want to force */
    int segsize;        /* segsize to use (if supported), 0 = no segmentation */
    int tree_fanout;    /* tree fanout/in to use */
    int chain_fanout;   /* K-chain fanout/in to use */
    int max_requests;   /* Maximum number of outstanding send or recv requests */
};
typedef struct coll_tuned_force_algorithm_params_t coll_tuned_force_algorithm_params_t;
113 
114 /* the indices to the MCA params so that modules can look them up at open / comm create time */
115 extern coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT];
116 /* the actual max algorithm values (readonly), loaded at component open */
117 extern int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT];
118 
119 /*
120  * coll API functions
121  */
122 
123 /* API functions */
124 
125 int ompi_coll_tuned_init_query(bool enable_progress_threads,
126  bool enable_mpi_threads);
127 
129 ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority);
130 
131 /* API functions of decision functions and any implementations */
132 
133 /*
134  * Note this gets long as we have to have a prototype for each
135  * MPI collective 4 times.. 2 for the comm type and 2 for each decision
136  * type.
137  * we might cut down the decision prototypes by conditional compiling
138  */
139 
140 /* All Gather */
141 int ompi_coll_tuned_allgather_intra_dec_fixed(ALLGATHER_ARGS);
142 int ompi_coll_tuned_allgather_intra_dec_dynamic(ALLGATHER_ARGS);
143 int ompi_coll_tuned_allgather_intra_do_forced(ALLGATHER_ARGS);
144 int ompi_coll_tuned_allgather_intra_do_this(ALLGATHER_ARGS, int algorithm, int faninout, int segsize);
145 int ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
146 int ompi_coll_tuned_allgather_intra_bruck(ALLGATHER_ARGS);
147 int ompi_coll_tuned_allgather_intra_recursivedoubling(ALLGATHER_ARGS);
148 int ompi_coll_tuned_allgather_intra_ring(ALLGATHER_ARGS);
149 int ompi_coll_tuned_allgather_intra_neighborexchange(ALLGATHER_ARGS);
150 int ompi_coll_tuned_allgather_intra_basic_linear(ALLGATHER_ARGS);
151 int ompi_coll_tuned_allgather_intra_two_procs(ALLGATHER_ARGS);
152 int ompi_coll_tuned_allgather_inter_dec_fixed(ALLGATHER_ARGS);
153 int ompi_coll_tuned_allgather_inter_dec_dynamic(ALLGATHER_ARGS);
154 
155 /* All GatherV */
156 int ompi_coll_tuned_allgatherv_intra_dec_fixed(ALLGATHERV_ARGS);
157 int ompi_coll_tuned_allgatherv_intra_dec_dynamic(ALLGATHERV_ARGS);
158 int ompi_coll_tuned_allgatherv_intra_do_forced(ALLGATHERV_ARGS);
159 int ompi_coll_tuned_allgatherv_intra_do_this(ALLGATHERV_ARGS, int algorithm, int faninout, int segsize);
160 int ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
161 int ompi_coll_tuned_allgatherv_intra_bruck(ALLGATHERV_ARGS);
162 int ompi_coll_tuned_allgatherv_intra_ring(ALLGATHERV_ARGS);
163 int ompi_coll_tuned_allgatherv_intra_neighborexchange(ALLGATHERV_ARGS);
164 int ompi_coll_tuned_allgatherv_intra_basic_default(ALLGATHERV_ARGS);
165 int ompi_coll_tuned_allgatherv_intra_two_procs(ALLGATHERV_ARGS);
166 int ompi_coll_tuned_allgatherv_inter_dec_fixed(ALLGATHERV_ARGS);
167 int ompi_coll_tuned_allgatherv_inter_dec_dynamic(ALLGATHERV_ARGS);
168 
169 /* All Reduce */
170 int ompi_coll_tuned_allreduce_intra_dec_fixed(ALLREDUCE_ARGS);
171 int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS);
172 int ompi_coll_tuned_allreduce_intra_do_forced(ALLREDUCE_ARGS);
173 int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize);
174 int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
175 int ompi_coll_tuned_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
176 int ompi_coll_tuned_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS);
177 int ompi_coll_tuned_allreduce_intra_ring(ALLREDUCE_ARGS);
178 int ompi_coll_tuned_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize);
179 int ompi_coll_tuned_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
180 int ompi_coll_tuned_allreduce_inter_dec_fixed(ALLREDUCE_ARGS);
181 int ompi_coll_tuned_allreduce_inter_dec_dynamic(ALLREDUCE_ARGS);
182 
183 /* AlltoAll */
184 int ompi_coll_tuned_alltoall_intra_dec_fixed(ALLTOALL_ARGS);
185 int ompi_coll_tuned_alltoall_intra_dec_dynamic(ALLTOALL_ARGS);
186 int ompi_coll_tuned_alltoall_intra_do_forced(ALLTOALL_ARGS);
187 int ompi_coll_tuned_alltoall_intra_do_this(ALLTOALL_ARGS, int algorithm, int faninout, int segsize, int max_requests);
188 int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
189 int ompi_coll_tuned_alltoall_intra_pairwise(ALLTOALL_ARGS);
190 int ompi_coll_tuned_alltoall_intra_bruck(ALLTOALL_ARGS);
191 int ompi_coll_tuned_alltoall_intra_basic_linear(ALLTOALL_ARGS);
192 int ompi_coll_tuned_alltoall_intra_linear_sync(ALLTOALL_ARGS, int max_requests);
193 int ompi_coll_tuned_alltoall_intra_two_procs(ALLTOALL_ARGS);
194 int ompi_coll_tuned_alltoall_inter_dec_fixed(ALLTOALL_ARGS);
195 int ompi_coll_tuned_alltoall_inter_dec_dynamic(ALLTOALL_ARGS);
196 
197 /* AlltoAllV */
198 int ompi_coll_tuned_alltoallv_intra_dec_fixed(ALLTOALLV_ARGS);
199 int ompi_coll_tuned_alltoallv_intra_dec_dynamic(ALLTOALLV_ARGS);
200 int ompi_coll_tuned_alltoallv_intra_do_forced(ALLTOALLV_ARGS);
201 int ompi_coll_tuned_alltoallv_intra_do_this(ALLTOALLV_ARGS, int algorithm);
202 int ompi_coll_tuned_alltoallv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
203 int ompi_coll_tuned_alltoallv_intra_pairwise(ALLTOALLV_ARGS);
204 int ompi_coll_tuned_alltoallv_intra_basic_linear(ALLTOALLV_ARGS);
205 int ompi_coll_tuned_alltoallv_inter_dec_fixed(ALLTOALLV_ARGS);
206 int ompi_coll_tuned_alltoallv_inter_dec_dynamic(ALLTOALLV_ARGS);
207 
208 /* AlltoAllW */
209 int ompi_coll_tuned_alltoallw_intra_dec_fixed(ALLTOALLW_ARGS);
210 int ompi_coll_tuned_alltoallw_intra_dec_dynamic(ALLTOALLW_ARGS);
211 int ompi_coll_tuned_alltoallw_inter_dec_fixed(ALLTOALLW_ARGS);
212 int ompi_coll_tuned_alltoallw_inter_dec_dynamic(ALLTOALLW_ARGS);
213 
214 /* Barrier */
215 int ompi_coll_tuned_barrier_intra_dec_fixed(BARRIER_ARGS);
216 int ompi_coll_tuned_barrier_intra_dec_dynamic(BARRIER_ARGS);
217 int ompi_coll_tuned_barrier_intra_do_forced(BARRIER_ARGS);
218 int ompi_coll_tuned_barrier_intra_do_this(BARRIER_ARGS, int algorithm, int faninout, int segsize);
219 int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
220 int ompi_coll_tuned_barrier_inter_dec_fixed(BARRIER_ARGS);
221 int ompi_coll_tuned_barrier_inter_dec_dynamic(BARRIER_ARGS);
222 int ompi_coll_tuned_barrier_intra_doublering(BARRIER_ARGS);
223 int ompi_coll_tuned_barrier_intra_recursivedoubling(BARRIER_ARGS);
224 int ompi_coll_tuned_barrier_intra_bruck(BARRIER_ARGS);
225 int ompi_coll_tuned_barrier_intra_two_procs(BARRIER_ARGS);
226 int ompi_coll_tuned_barrier_intra_linear(BARRIER_ARGS);
227 int ompi_coll_tuned_barrier_intra_tree(BARRIER_ARGS);
228 
229 /* Bcast */
230 int ompi_coll_tuned_bcast_intra_generic( BCAST_ARGS, uint32_t count_by_segment, ompi_coll_tree_t* tree );
231 int ompi_coll_tuned_bcast_intra_dec_fixed(BCAST_ARGS);
232 int ompi_coll_tuned_bcast_intra_dec_dynamic(BCAST_ARGS);
233 int ompi_coll_tuned_bcast_intra_do_forced(BCAST_ARGS);
234 int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int algorithm, int faninout, int segsize);
235 int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
236 int ompi_coll_tuned_bcast_intra_basic_linear(BCAST_ARGS);
237 int ompi_coll_tuned_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains);
238 int ompi_coll_tuned_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize);
239 int ompi_coll_tuned_bcast_intra_binomial(BCAST_ARGS, uint32_t segsize);
240 int ompi_coll_tuned_bcast_intra_bintree(BCAST_ARGS, uint32_t segsize);
241 int ompi_coll_tuned_bcast_intra_split_bintree(BCAST_ARGS, uint32_t segsize);
242 int ompi_coll_tuned_bcast_inter_dec_fixed(BCAST_ARGS);
243 int ompi_coll_tuned_bcast_inter_dec_dynamic(BCAST_ARGS);
244 
245 /* Exscan */
246 int ompi_coll_tuned_exscan_intra_dec_fixed(EXSCAN_ARGS);
247 int ompi_coll_tuned_exscan_intra_dec_dynamic(EXSCAN_ARGS);
248 int ompi_coll_tuned_exscan_inter_dec_fixed(EXSCAN_ARGS);
249 int ompi_coll_tuned_exscan_inter_dec_dynamic(EXSCAN_ARGS);
250 
251 /* Gather */
252 int ompi_coll_tuned_gather_intra_dec_fixed(GATHER_ARGS);
253 int ompi_coll_tuned_gather_intra_dec_dynamic(GATHER_ARGS);
254 int ompi_coll_tuned_gather_intra_do_forced(GATHER_ARGS);
255 int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize);
256 int ompi_coll_tuned_gather_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
257 int ompi_coll_tuned_gather_intra_basic_linear(GATHER_ARGS);
258 int ompi_coll_tuned_gather_intra_binomial(GATHER_ARGS);
259 int ompi_coll_tuned_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size);
260 int ompi_coll_tuned_gather_inter_dec_fixed(GATHER_ARGS);
261 int ompi_coll_tuned_gather_inter_dec_dynamic(GATHER_ARGS);
262 
/* GatherV */
/* NOTE(review): three of these prototypes used GATHER_ARGS; a gatherv takes
 * rcounts/disps arrays, so GATHERV_ARGS is the matching list.  Verify against
 * the definitions before relying on them. */
int ompi_coll_tuned_gatherv_intra_dec_fixed(GATHERV_ARGS);
int ompi_coll_tuned_gatherv_intra_dec_dynamic(GATHERV_ARGS);
int ompi_coll_tuned_gatherv_inter_dec_fixed(GATHERV_ARGS);
int ompi_coll_tuned_gatherv_inter_dec_dynamic(GATHERV_ARGS);
268 
269 /* Reduce */
270 int ompi_coll_tuned_reduce_generic( REDUCE_ARGS, ompi_coll_tree_t* tree, int count_by_segment, int max_outstanding_reqs );
271 int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS);
272 int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS);
273 int ompi_coll_tuned_reduce_intra_do_forced(REDUCE_ARGS);
274 int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_oustanding_reqs);
275 int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
276 int ompi_coll_tuned_reduce_intra_basic_linear(REDUCE_ARGS);
277 int ompi_coll_tuned_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs );
278 int ompi_coll_tuned_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
279 int ompi_coll_tuned_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
280 int ompi_coll_tuned_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
281 int ompi_coll_tuned_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
282 int ompi_coll_tuned_reduce_inter_dec_fixed(REDUCE_ARGS);
283 int ompi_coll_tuned_reduce_inter_dec_dynamic(REDUCE_ARGS);
284 
285 /* Reduce_scatter */
286 int ompi_coll_tuned_reduce_scatter_intra_dec_fixed(REDUCESCATTER_ARGS);
287 int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(REDUCESCATTER_ARGS);
288 int ompi_coll_tuned_reduce_scatter_intra_do_forced(REDUCESCATTER_ARGS);
289 int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize);
290 int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
291 int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
292 int ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS);
293 int ompi_coll_tuned_reduce_scatter_intra_ring(REDUCESCATTER_ARGS);
294 
295 int ompi_coll_tuned_reduce_scatter_inter_dec_fixed(REDUCESCATTER_ARGS);
296 int ompi_coll_tuned_reduce_scatter_inter_dec_dynamic(REDUCESCATTER_ARGS);
297 
298 /* Scan */
299 int ompi_coll_tuned_scan_intra_dec_fixed(SCAN_ARGS);
300 int ompi_coll_tuned_scan_intra_dec_dynamic(SCAN_ARGS);
301 int ompi_coll_tuned_scan_inter_dec_fixed(SCAN_ARGS);
302 int ompi_coll_tuned_scan_inter_dec_dynamic(SCAN_ARGS);
303 
304 /* Scatter */
305 int ompi_coll_tuned_scatter_intra_dec_fixed(SCATTER_ARGS);
306 int ompi_coll_tuned_scatter_intra_dec_dynamic(SCATTER_ARGS);
307 int ompi_coll_tuned_scatter_intra_do_forced(SCATTER_ARGS);
308 int ompi_coll_tuned_scatter_intra_do_this(SCATTER_ARGS, int algorithm, int faninout, int segsize);
309 int ompi_coll_tuned_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
310 int ompi_coll_tuned_scatter_intra_basic_linear(SCATTER_ARGS);
311 int ompi_coll_tuned_scatter_intra_binomial(SCATTER_ARGS);
312 int ompi_coll_tuned_scatter_inter_dec_fixed(SCATTER_ARGS);
313 int ompi_coll_tuned_scatter_inter_dec_dynamic(SCATTER_ARGS);
314 
315 /* ScatterV */
316 int ompi_coll_tuned_scatterv_intra_dec_fixed(SCATTERV_ARGS);
317 int ompi_coll_tuned_scatterv_intra_dec_dynamic(SCATTERV_ARGS);
318 int ompi_coll_tuned_scatterv_inter_dec_fixed(SCATTERV_ARGS);
319 int ompi_coll_tuned_scatterv_inter_dec_dynamic(SCATTERV_ARGS);
320 
321 int mca_coll_tuned_ft_event(int state);
322 
323 
324 /* Utility functions */
325 
326 static inline void ompi_coll_tuned_free_reqs(ompi_request_t **reqs, int count)
327 {
328  int i;
329  for (i = 0; i < count; ++i)
330  ompi_request_free(&reqs[i]);
331 }
332 
334  /** Base coll component */
336 
337  /** MCA parameter: Priority of this component */
339 
340  /** global stuff that I need the component to store */
341 
342  /* MCA parameters first */
343 
344  /* cached decision table stuff (moved from MCW module) */
346 };
347 /**
348  * Convenience typedef
349  */
351 
352 /**
353  * Global component instance
354  */
355 OMPI_MODULE_DECLSPEC extern mca_coll_tuned_component_t mca_coll_tuned_component;
356 
357 /*
358  * Data structure for hanging data off the communicator
359  * i.e. per module instance
360  */
362  /* standard data for requests and PML usage */
363 
364  /* Precreate space for requests
365  * Note this does not effect basic,
366  * but if in wrong context can confuse a debugger
367  * this is controlled by an MCA param
368  */
369 
370  ompi_request_t **mcct_reqs;
371  int mcct_num_reqs;
372 
373  /*
374  * tuned topo information caching per communicator
375  *
376  * for each communicator we cache the topo information so we can
377  * reuse without regenerating if we change the root, [or fanout]
378  * then regenerate and recache this information
379  */
380 
381  /* general tree with n fan out */
382  ompi_coll_tree_t *cached_ntree;
383  int cached_ntree_root;
384  int cached_ntree_fanout;
385 
386  /* binary tree */
387  ompi_coll_tree_t *cached_bintree;
388  int cached_bintree_root;
389 
390  /* binomial tree */
391  ompi_coll_tree_t *cached_bmtree;
392  int cached_bmtree_root;
393 
394  /* binomial tree */
395  ompi_coll_tree_t *cached_in_order_bmtree;
396  int cached_in_order_bmtree_root;
397 
398  /* chained tree (fanout followed by pipelines) */
399  ompi_coll_tree_t *cached_chain;
400  int cached_chain_root;
401  int cached_chain_fanout;
402 
403  /* pipeline */
404  ompi_coll_tree_t *cached_pipeline;
405  int cached_pipeline_root;
406 
407  /* in-order binary tree (root of the in-order binary tree is rank 0) */
408  ompi_coll_tree_t *cached_in_order_bintree;
409 
410  /* moving to the component */
411  ompi_coll_com_rule_t *com_rules[COLLCOUNT]; /* the communicator rules for each MPI collective for ONLY my comsize */
412 
413  /* for forced algorithms we store the information on the module */
414  /* previously we only had one shared copy, ops, it really is per comm/module */
415  coll_tuned_force_algorithm_params_t user_forced[COLLCOUNT];
416 };
418 
421 
422  mca_coll_tuned_comm_t *tuned_data;
423 };
426 
427 END_C_DECLS
428 
/*
 * (Re)build the cached binary tree for (TUNED_MODULE) on (OMPI_COMM).
 * The tree is only rebuilt when none is cached yet or when the cached
 * one was built for a different root.
 */
#define COLL_TUNED_UPDATE_BINTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \
do { \
    mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
    if( !( (coll_comm->cached_bintree) \
           && (coll_comm->cached_bintree_root == (ROOT)) ) ) { \
        if( coll_comm->cached_bintree ) { /* destroy previous binary tree if defined */ \
            ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_bintree) ); \
        } \
        coll_comm->cached_bintree = ompi_coll_tuned_topo_build_tree(2,(OMPI_COMM),(ROOT)); \
        coll_comm->cached_bintree_root = (ROOT); \
    } \
} while (0)
441 
/*
 * (Re)build the cached binomial tree for (TUNED_MODULE) on (OMPI_COMM).
 * Rebuilt only when missing or cached for a different root.
 */
#define COLL_TUNED_UPDATE_BMTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \
do { \
    mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
    if( !( (coll_comm->cached_bmtree) \
           && (coll_comm->cached_bmtree_root == (ROOT)) ) ) { \
        if( coll_comm->cached_bmtree ) { /* destroy previous binomial if defined */ \
            ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_bmtree) ); \
        } \
        coll_comm->cached_bmtree = ompi_coll_tuned_topo_build_bmtree( (OMPI_COMM), (ROOT) ); \
        coll_comm->cached_bmtree_root = (ROOT); \
    } \
} while (0)
454 
/*
 * (Re)build the cached in-order binomial tree for (TUNED_MODULE) on
 * (OMPI_COMM).  Rebuilt only when missing or cached for a different root.
 */
#define COLL_TUNED_UPDATE_IN_ORDER_BMTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \
do { \
    mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
    if( !( (coll_comm->cached_in_order_bmtree) \
           && (coll_comm->cached_in_order_bmtree_root == (ROOT)) ) ) { \
        if( coll_comm->cached_in_order_bmtree ) { /* destroy previous in-order binomial if defined */ \
            ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_in_order_bmtree) ); \
        } \
        coll_comm->cached_in_order_bmtree = ompi_coll_tuned_topo_build_in_order_bmtree( (OMPI_COMM), (ROOT) ); \
        coll_comm->cached_in_order_bmtree_root = (ROOT); \
    } \
} while (0)
467 
/*
 * (Re)build the cached pipeline (a chain topology with fanout 1) for
 * (TUNED_MODULE) on (OMPI_COMM).  Rebuilt only when missing or cached
 * for a different root.
 */
#define COLL_TUNED_UPDATE_PIPELINE( OMPI_COMM, TUNED_MODULE, ROOT ) \
do { \
    mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
    if( !( (coll_comm->cached_pipeline) \
           && (coll_comm->cached_pipeline_root == (ROOT)) ) ) { \
        if (coll_comm->cached_pipeline) { /* destroy previous pipeline if defined */ \
            ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_pipeline) ); \
        } \
        coll_comm->cached_pipeline = ompi_coll_tuned_topo_build_chain( 1, (OMPI_COMM), (ROOT) ); \
        coll_comm->cached_pipeline_root = (ROOT); \
    } \
} while (0)
480 
/*
 * (Re)build the cached chain topology for (TUNED_MODULE) on (OMPI_COMM).
 * Rebuilt when no chain is cached or when the cached root OR fanout
 * differ from those requested.
 */
#define COLL_TUNED_UPDATE_CHAIN( OMPI_COMM, TUNED_MODULE, ROOT, FANOUT ) \
do { \
    mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
    if( !( (coll_comm->cached_chain) \
           && (coll_comm->cached_chain_root == (ROOT)) \
           && (coll_comm->cached_chain_fanout == (FANOUT)) ) ) { \
        if( coll_comm->cached_chain) { /* destroy previous chain if defined */ \
            ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_chain) ); \
        } \
        coll_comm->cached_chain = ompi_coll_tuned_topo_build_chain((FANOUT), (OMPI_COMM), (ROOT)); \
        coll_comm->cached_chain_root = (ROOT); \
        coll_comm->cached_chain_fanout = (FANOUT); \
    } \
} while (0)
495 
/*
 * Build the cached in-order binary tree once per communicator.  Unlike the
 * other cached topologies it has no root parameter: its shape depends only
 * on the communicator size, so it is never destroyed/rebuilt here.
 */
#define COLL_TUNED_UPDATE_IN_ORDER_BINTREE( OMPI_COMM, TUNED_MODULE ) \
do { \
    mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
    if( !(coll_comm->cached_in_order_bintree) ) { \
        /* In-order binary tree topology is defined by communicator size */ \
        /* Thus, there is no need to destroy anything */ \
        coll_comm->cached_in_order_bintree = \
            ompi_coll_tuned_topo_build_in_order_bintree((OMPI_COMM)); \
    } \
} while (0)
506 
/**
 * This macro gives a generic way to compute the best count of
 * the segment (i.e. the number of complete datatypes that
 * can fit in the specified SEGSIZE). Beware, when this macro
 * is called, the SEGCOUNT should be initialized to the count as
 * expected by the collective call.
 *
 * Fixed: the macro was a bare multi-statement if ending in a trailing
 * backslash, which spliced the following source line into the macro and
 * exposed callers to the dangling-else hazard; now wrapped in do/while(0).
 */
#define COLL_TUNED_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT)        \
    do {                                                                \
        if( ((SEGSIZE) >= (TYPELNG)) &&                                 \
            ((SEGSIZE) < ((TYPELNG) * (SEGCOUNT))) ) {                  \
            size_t residual;                                            \
            (SEGCOUNT) = (int)((SEGSIZE) / (TYPELNG));                  \
            residual = (SEGSIZE) - (SEGCOUNT) * (TYPELNG);              \
            /* round up when more than half a datatype is left over */  \
            if( residual > ((TYPELNG) >> 1) )                           \
                (SEGCOUNT)++;                                           \
        }                                                               \
    } while (0)
523 
/**
 * This macro gives a generic way to compute the well distributed block counts
 * when the count and number of blocks are fixed.
 * Macro returns "early-block" count, "late-block" count, and "split-index"
 * which is the block at which we switch from "early-block" count to
 * the "late-block" count.
 * count = split_index * early_block_count +
 *         (block_count - split_index) * late_block_count
 * We do not perform ANY error checks - make sure that the input values
 * make sense (eg. count > num_blocks).
 *
 * Fixed: was a bare multi-statement macro ending in a trailing backslash
 * (spliced the next source line, unsafe in unbraced if/else); now wrapped
 * in do/while(0).
 */
#define COLL_TUNED_COMPUTE_BLOCKCOUNT( COUNT, NUM_BLOCKS, SPLIT_INDEX,       \
                                       EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT ) \
    do {                                                                     \
        EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT = COUNT / NUM_BLOCKS;           \
        SPLIT_INDEX = COUNT % NUM_BLOCKS;                                    \
        if (0 != SPLIT_INDEX) {                                              \
            EARLY_BLOCK_COUNT = EARLY_BLOCK_COUNT + 1;                       \
        }                                                                    \
    } while (0)
542 
543 
544 #endif /* MCA_COLL_TUNED_EXPORT_H */
545 
/*
 * NOTE(review): the lines below are Doxygen cross-reference residue from the
 * documentation extraction, not part of the original header.  Preserved here
 * as a comment because they record where the stripped declarations live:
 *
 *   Definition: coll_tuned_dynamic_rules.h:64
 *   Definition: coll_tuned_dynamic_rules.h:49
 *   ompi_coll_alg_rule_t * all_base_rules
 *     "global stuff that I need the component to store" -- coll_tuned.h:345
 *   Collective module interface.    Definition: coll.h:316
 *   Definition: coll_tuned_topo.h:28
 *   Collective component interface. Definition: coll.h:283
 *   mca_coll_tuned_module_t         Definition: coll_tuned.h:419
 *   Top-level interface for all MCA components.
 *   Collective Communication Interface.
 *   int tuned_priority
 *     "MCA parameter: Priority of this component." -- coll_tuned.h:338
 *   Top-level description of requests.
 *   mca_coll_base_component_2_0_0_t super
 *     "Base coll component." -- coll_tuned.h:335
 *   mca_coll_tuned_component_t      Definition: coll_tuned.h:333
 *   static int ompi_request_free(ompi_request_t **request)
 *     "Free a request." -- request.h:371
 *   coll_tuned_force_algorithm_params_t  Definition: coll_tuned.h:105
 *   ompi_communicator_t             Definition: communicator.h:118
 *   Main top-level request struct definition.  Definition: request.h:100
 *   mca_coll_tuned_comm_t           Definition: coll_tuned.h:361
 *   #define OBJ_CLASS_DECLARATION(NAME)
 *     "Declaration for class descriptor." -- opal_object.h:236
 */