OpenMPI  0.1.1
pml_bfo.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2007 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
13  * $COPYRIGHT$
14  *
15  * Additional copyrights may follow
16  *
17  * $HEADER$
18  */
19 /**
20  * @file
21  */
22 
23 #ifndef MCA_PML_BFO_H
24 #define MCA_PML_BFO_H
25 
26 #include "ompi_config.h"
27 #include "ompi/class/ompi_free_list.h"
28 #include "ompi/request/request.h"
29 #include "ompi/mca/pml/pml.h"
31 #include "ompi/mca/pml/base/pml_base_bsend.h"
33 #include "ompi/datatype/ompi_datatype.h"
34 #include "pml_bfo_hdr.h"
35 #include "ompi/mca/bml/base/base.h"
36 #include "ompi/proc/proc.h"
38 
39 BEGIN_C_DECLS
40 
41 /**
42  * BFO PML module
43  */
44 
45 struct mca_pml_bfo_t {
46  mca_pml_base_module_t super;
47 
48  int priority;
49  int free_list_num; /* initial size of free list */
50  int free_list_max; /* maximum size of free list */
51  int free_list_inc; /* number of elements to grow free list */
52  size_t send_pipeline_depth;
53  size_t recv_pipeline_depth;
54  size_t rdma_put_retries_limit;
55  int max_rdma_per_request;
56  int max_send_per_range;
57  bool leave_pinned;
58  int leave_pinned_pipeline;
59 
60  /* lock queue access */
61  opal_mutex_t lock;
62 
63  /* free lists */
64  ompi_free_list_t rdma_frags;
65  ompi_free_list_t recv_frags;
66  ompi_free_list_t pending_pckts;
67  ompi_free_list_t buffers;
68  ompi_free_list_t send_ranges;
69 
70  /* list of pending operations */
71  opal_list_t pckt_pending;
72  opal_list_t send_pending;
73  opal_list_t recv_pending;
74  opal_list_t rdma_pending;
75  /* List of pending fragments without a matching communicator */
76  opal_list_t non_existing_communicator_pending;
77  bool enabled;
78  char* allocator_name;
79  mca_allocator_base_module_t* allocator;
80  uint32_t unexpected_limit;
81 };
82 typedef struct mca_pml_bfo_t mca_pml_bfo_t;
83 
84 extern mca_pml_bfo_t mca_pml_bfo;
85 extern int mca_pml_bfo_output;
86 
/*
 * PML interface functions.
 *
 * Communicator and process add/remove calls plus the enable and progress
 * hooks.  Every function reports an int status; the meaning of the values is
 * defined by the implementation, which is not part of this header.
 */
int mca_pml_bfo_add_comm(struct ompi_communicator_t *comm);
int mca_pml_bfo_del_comm(struct ompi_communicator_t *comm);

int mca_pml_bfo_add_procs(struct ompi_proc_t **procs, size_t nprocs);
int mca_pml_bfo_del_procs(struct ompi_proc_t **procs, size_t nprocs);

int mca_pml_bfo_enable(bool enable);

int mca_pml_bfo_progress(void);
112 
113 extern int mca_pml_bfo_iprobe( int dst,
114  int tag,
115  struct ompi_communicator_t* comm,
116  int *matched,
117  ompi_status_public_t* status );
118 
119 extern int mca_pml_bfo_probe( int dst,
120  int tag,
121  struct ompi_communicator_t* comm,
122  ompi_status_public_t* status );
123 
124 extern int mca_pml_bfo_improbe( int dst,
125  int tag,
126  struct ompi_communicator_t* comm,
127  int *matched,
128  struct ompi_message_t **message,
129  ompi_status_public_t* status );
130 
131 extern int mca_pml_bfo_mprobe( int dst,
132  int tag,
133  struct ompi_communicator_t* comm,
134  struct ompi_message_t **message,
135  ompi_status_public_t* status );
136 
137 extern int mca_pml_bfo_isend_init( void *buf,
138  size_t count,
139  ompi_datatype_t *datatype,
140  int dst,
141  int tag,
142  mca_pml_base_send_mode_t mode,
143  struct ompi_communicator_t* comm,
144  struct ompi_request_t **request );
145 
146 extern int mca_pml_bfo_isend( void *buf,
147  size_t count,
148  ompi_datatype_t *datatype,
149  int dst,
150  int tag,
151  mca_pml_base_send_mode_t mode,
152  struct ompi_communicator_t* comm,
153  struct ompi_request_t **request );
154 
155 extern int mca_pml_bfo_send( void *buf,
156  size_t count,
157  ompi_datatype_t *datatype,
158  int dst,
159  int tag,
160  mca_pml_base_send_mode_t mode,
161  struct ompi_communicator_t* comm );
162 
163 extern int mca_pml_bfo_irecv_init( void *buf,
164  size_t count,
165  ompi_datatype_t *datatype,
166  int src,
167  int tag,
168  struct ompi_communicator_t* comm,
169  struct ompi_request_t **request );
170 
171 extern int mca_pml_bfo_irecv( void *buf,
172  size_t count,
173  ompi_datatype_t *datatype,
174  int src,
175  int tag,
176  struct ompi_communicator_t* comm,
177  struct ompi_request_t **request );
178 
179 extern int mca_pml_bfo_recv( void *buf,
180  size_t count,
181  ompi_datatype_t *datatype,
182  int src,
183  int tag,
184  struct ompi_communicator_t* comm,
185  ompi_status_public_t* status );
186 
187 extern int mca_pml_bfo_imrecv( void *buf,
188  size_t count,
189  ompi_datatype_t *datatype,
190  struct ompi_message_t **message,
191  struct ompi_request_t **request );
192 
193 extern int mca_pml_bfo_mrecv( void *buf,
194  size_t count,
195  ompi_datatype_t *datatype,
196  struct ompi_message_t **message,
197  ompi_status_public_t* status );
198 
199 extern int mca_pml_bfo_dump( struct ompi_communicator_t* comm,
200  int verbose );
201 
202 extern int mca_pml_bfo_start( size_t count,
203  ompi_request_t** requests );
204 
205 extern int mca_pml_bfo_ft_event( int state );
206 
207 END_C_DECLS
208 
210  ompi_free_list_item_t super;
211  ompi_proc_t* proc;
212  mca_pml_bfo_hdr_t hdr;
213  struct mca_bml_base_btl_t *bml_btl;
214  uint8_t order;
215 };
218 
/*
 * Obtain a pending-packet descriptor from mca_pml_bfo.pending_pckts.
 *
 * pckt  lvalue that receives the descriptor, cast to
 *       mca_pml_bfo_pckt_pending_t*
 * rc    lvalue handed unchanged to OMPI_FREE_LIST_WAIT as its status argument
 *
 * The temporary uses a macro-private name (_item) so that a caller whose own
 * variable is called `item` still gets its variable assigned instead of the
 * macro's temporary.
 */
#define MCA_PML_BFO_PCKT_PENDING_ALLOC(pckt,rc)                         \
do {                                                                    \
    ompi_free_list_item_t* _item;                                       \
    OMPI_FREE_LIST_WAIT(&mca_pml_bfo.pending_pckts, _item, rc);         \
    (pckt) = (mca_pml_bfo_pckt_pending_t*)_item;                        \
} while (0)
225 
/*
 * Hand a pending-packet descriptor back to mca_pml_bfo.pending_pckts.
 *
 * pckt is parenthesized before the cast so that a pointer expression such as
 * `base + 1` is evaluated in the packet's own type first and only the result
 * is converted to ompi_free_list_item_t*.
 */
#define MCA_PML_BFO_PCKT_PENDING_RETURN(pckt)                           \
do {                                                                    \
    /* return packet */                                                 \
    OMPI_FREE_LIST_RETURN(&mca_pml_bfo.pending_pckts,                   \
                          (ompi_free_list_item_t*)(pckt));              \
} while(0)
232 
/*
 * Queue a FIN packet on mca_pml_bfo.pckt_pending.
 *
 * Takes a descriptor via MCA_PML_BFO_PCKT_PENDING_ALLOC, fills in a FIN
 * header (hdr_des = D, hdr_fail = S), records the process P, the BTL B and
 * the order value O, then appends the descriptor to the list while holding
 * mca_pml_bfo.lock.
 *
 * NOTE(review): the status written by the allocation macro is stored but
 * never inspected — confirm the allocation cannot fail here.
 */
#define MCA_PML_BFO_ADD_FIN_TO_PENDING(P, D, B, O, S)                       \
    do {                                                                    \
        mca_pml_bfo_pckt_pending_t *_fin_pckt;                              \
        int _fin_rc;                                                        \
                                                                            \
        MCA_PML_BFO_PCKT_PENDING_ALLOC(_fin_pckt, _fin_rc);                 \
        _fin_pckt->hdr.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN;      \
        _fin_pckt->hdr.hdr_fin.hdr_des = (D);                               \
        _fin_pckt->hdr.hdr_fin.hdr_fail = (S);                              \
        _fin_pckt->proc = (P);                                              \
        _fin_pckt->bml_btl = (B);                                           \
        _fin_pckt->order = (O);                                             \
        OPAL_THREAD_LOCK(&mca_pml_bfo.lock);                                \
        opal_list_append(&mca_pml_bfo.pckt_pending,                         \
                         (opal_list_item_t*)_fin_pckt);                     \
        OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);                              \
    } while(0)
250 
251 
253 #if PML_BFO
254  ompi_ptr_t hdr_des, uint8_t order, uint32_t status,
255  uint16_t seq, uint8_t reqseq, uint16_t ctx, uint32_t src);
256 #else /* PML_BFO */
257  ompi_ptr_t hdr_des, uint8_t order, uint32_t status);
258 #endif /* PML_BFO */
259 
260 /* This function tries to resend FIN/ACK packets from the pckt_pending queue.
261  * Packets are added to the queue when sending a FIN or ACK fails due to
262  * resource unavailability. The bml_btl passed to the function doesn't represent
263  * the packet's destination; it represents the BTL on which a resource was freed,
264  * so only this BTL should be considered for resending packets. */
265 void mca_pml_bfo_process_pending_packets(mca_bml_base_btl_t* bml_btl);
266 
267 /* This function retries failed PUT/GET operations on a frag. When an RDMA
268  * operation cannot be accomplished for some reason, the frag is put on the
269  * rdma_pending list. Later the operation is retried. The destination of the
270  * RDMA operation is stored inside the frag structure. */
271 void mca_pml_bfo_process_pending_rdma(void);
272 
/*
 * Run the retry routine of every pending list that currently has entries.
 *
 * pckt_pending, recv_pending, send_pending and rdma_pending are checked in
 * that order; a list's routine is called only when opal_list_get_size()
 * reports a non-zero size.  bml_btl is forwarded to the packet and
 * send-request routines, so the argument may be evaluated up to twice.
 */
#define MCA_PML_BFO_PROGRESS_PENDING(bml_btl)                               \
    do {                                                                    \
        if( 0 != opal_list_get_size(&mca_pml_bfo.pckt_pending) ) {          \
            mca_pml_bfo_process_pending_packets(bml_btl);                   \
        }                                                                   \
        if( 0 != opal_list_get_size(&mca_pml_bfo.recv_pending) ) {          \
            mca_pml_bfo_recv_request_process_pending();                     \
        }                                                                   \
        if( 0 != opal_list_get_size(&mca_pml_bfo.send_pending) ) {          \
            mca_pml_bfo_send_request_process_pending(bml_btl);              \
        }                                                                   \
        if( 0 != opal_list_get_size(&mca_pml_bfo.rdma_pending) ) {          \
            mca_pml_bfo_process_pending_rdma();                             \
        }                                                                   \
    } while (0)
284 
/*
 * Compute the total number of bytes on supplied descriptor
 *
 * Adds the seg_len of each of the first `count` entries of `segments` to
 * `length`, then subtracts `hdrlen`.  `length` is accumulated into, not
 * assigned, so the caller must initialize it (normally to 0) beforehand.
 *
 * Every argument is parenthesized and the loop index has a macro-private
 * name, so expression arguments (`base + 1`, `cond ? a : b`) and callers that
 * pass their own variable `i` expand as intended.  `count` is re-evaluated on
 * every iteration.
 */
#define MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH(segments, count, hdrlen, length)  \
do {                                                                         \
    size_t _seg_idx;                                                         \
                                                                             \
    for( _seg_idx = 0; _seg_idx < (count); _seg_idx++ ) {                    \
        (length) += (segments)[_seg_idx].seg_len;                            \
    }                                                                        \
    (length) -= (hdrlen);                                                    \
} while(0)
297 
298 /* represent BTL chosen for sending request */
300  mca_bml_base_btl_t *bml_btl;
301  struct mca_mpool_base_registration_t* btl_reg;
302  size_t length;
303 };
305 
306 int mca_pml_bfo_com_btl_comp(const void *v1, const void *v2);
307 
308 /* Calculate what percentage of a message to send through each BTL according to
309  * relative weight */
310 static inline void
311 mca_pml_bfo_calc_weighted_length( mca_pml_bfo_com_btl_t *btls, int num_btls, size_t size,
312  double weight_total )
313 {
314  int i;
315  size_t length_left;
316 
317  /* shortcut for common case for only one BTL */
318  if( OPAL_LIKELY(1 == num_btls) ) {
319  btls[0].length = size;
320  return;
321  }
322 
323  /* sort BTLs according of their weights so BTLs with smaller weight will
324  * not hijack all of the traffic */
325  qsort( btls, num_btls, sizeof(mca_pml_bfo_com_btl_t),
326  mca_pml_bfo_com_btl_comp );
327 
328  for(length_left = size, i = 0; i < num_btls; i++) {
329  mca_bml_base_btl_t* bml_btl = btls[i].bml_btl;
330  size_t length = 0;
331  if( OPAL_UNLIKELY(0 != length_left) ) {
332  length = (length_left > bml_btl->btl->btl_eager_limit)?
333  ((size_t)(size * (bml_btl->btl_weight / weight_total))) :
334  length_left;
335 
336  if(length > length_left)
337  length = length_left;
338  length_left -= length;
339  }
340  btls[i].length = length;
341  }
342 
343  /* account for rounding errors */
344  btls[0].length += length_left;
345 }
346 
347 #endif
int mca_pml_bfo_start(size_t count, ompi_request_t **requests)
Definition: pml_bfo_start.c:29
float btl_weight
BTL weight for scheduling.
Definition: bml.h:60
Definition: pml_bfo.h:299
P2P Management Layer (PML)
Definition: ompi_datatype.h:68
Definition: types.h:52
Definition: mutex_unix.h:53
Union of defined hdr types.
Definition: pml_bfo_hdr.h:441
BFO PML module.
Definition: pml_bfo.h:45
Definition: pml_bfo.h:209
Definition: mpool.h:44
Process identification structure interface.
Remote Open MPI process structure.
Definition: proc.h:56
Top-level description of requests.
PML instance.
Definition: pml.h:512
Definition: ompi_free_list.h:39
Definition: ompi_free_list.h:62
Definition: bml.h:58
Definition: opal_list.h:147
Definition: mpi.h:337
struct mca_btl_base_module_t * btl
BTL module.
Definition: bml.h:61
int mca_pml_bfo_enable(bool enable)
Definition: pml_bfo.c:89
Definition: evdns.c:158
int mca_pml_bfo_send_fin(ompi_proc_t *proc, mca_bml_base_btl_t *bml_btl, ompi_ptr_t hdr_des, uint8_t order, uint32_t status)
Send an FIN to the peer.
Definition: pml_bfo.c:496
Definition: communicator.h:118
Main top-level request struct definition.
Definition: request.h:100
size_t btl_eager_limit
maximum size of first fragment – eager send
Definition: btl.h:790
The data structure for each component.
Definition: allocator.h:78
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236
Definition: message.h:22