Open MPI 0.1.1
pml_ob1.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2007 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved
13  * Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
14  * $COPYRIGHT$
15  *
16  * Additional copyrights may follow
17  *
18  * $HEADER$
19  */
20 /**
21  * @file
22  */
23 
24 #ifndef MCA_PML_OB1_H
25 #define MCA_PML_OB1_H
26 
27 #include "ompi_config.h"
28 #include "ompi/class/ompi_free_list.h"
29 #include "ompi/request/request.h"
30 #include "ompi/mca/pml/pml.h"
32 #include "ompi/mca/pml/base/pml_base_bsend.h"
34 #include "ompi/datatype/ompi_datatype.h"
35 #include "pml_ob1_hdr.h"
36 #include "ompi/mca/bml/base/base.h"
37 #include "ompi/proc/proc.h"
39 
40 BEGIN_C_DECLS
41 
/**
 * OB1 PML module
 *
 * Global state for the OB1 point-to-point messaging layer: MCA-driven
 * tuning parameters, free lists of frequently allocated descriptors,
 * and queues of operations pending on resource availability.
 */

struct mca_pml_ob1_t {
    mca_pml_base_module_t super;      /**< base PML module interface */

    int priority;                     /**< component selection priority */
    int free_list_num;                /* initial size of free list */
    int free_list_max;                /* maximum size of free list */
    int free_list_inc;                /* number of elements to grow free list */
    size_t send_pipeline_depth;       /**< max outstanding send fragments */
    size_t recv_pipeline_depth;       /**< max outstanding recv fragments */
    size_t rdma_put_retries_limit;    /**< retry cap for failed RDMA PUTs */
    int max_rdma_per_request;
    int max_send_per_range;
    bool leave_pinned;                /**< keep registrations cached after use */
    int leave_pinned_pipeline;

    /* lock queue access -- guards the pending lists below */
    opal_mutex_t lock;

    /* free lists */
    ompi_free_list_t rdma_frags;
    ompi_free_list_t recv_frags;
    ompi_free_list_t pending_pckts;
    ompi_free_list_t buffers;
    ompi_free_list_t send_ranges;

    /* list of pending operations (deferred until resources free up) */
    opal_list_t pckt_pending;
    opal_list_t send_pending;
    opal_list_t recv_pending;
    opal_list_t rdma_pending;
    /* List of pending fragments without a matching communicator */
    opal_list_t non_existing_communicator_pending;
    bool enabled;                     /**< set once the PML has been enabled */
    char* allocator_name;             /**< MCA-selected allocator component name */
    mca_allocator_base_module_t* allocator;
    uint32_t unexpected_limit;
};
typedef struct mca_pml_ob1_t mca_pml_ob1_t;
84 
extern mca_pml_ob1_t mca_pml_ob1;   /**< the single OB1 module instance */
extern int mca_pml_ob1_output;      /**< NOTE(review): presumably an opal_output
                                         stream id -- confirm against pml_ob1.c */

/*
 * PML interface functions.  These implement the PML module entry points;
 * see ompi/mca/pml/pml.h for the authoritative interface contract.
 */

/** Communicator-creation hook for the OB1 PML. */
extern int mca_pml_ob1_add_comm(
    struct ompi_communicator_t* comm
);

/** Communicator-destruction hook for the OB1 PML. */
extern int mca_pml_ob1_del_comm(
    struct ompi_communicator_t* comm
);

/** Notify the PML of nprocs newly reachable processes. */
extern int mca_pml_ob1_add_procs(
    struct ompi_proc_t **procs,
    size_t nprocs
);

/** Notify the PML that nprocs processes are being removed. */
extern int mca_pml_ob1_del_procs(
    struct ompi_proc_t **procs,
    size_t nprocs
);

/** Enable or disable the PML. */
extern int mca_pml_ob1_enable( bool enable );

/** Progress outstanding point-to-point communication. */
extern int mca_pml_ob1_progress(void);

/** Non-blocking probe; *matched is set when a matching message is queued. */
extern int mca_pml_ob1_iprobe( int dst,
                               int tag,
                               struct ompi_communicator_t* comm,
                               int *matched,
                               ompi_status_public_t* status );

/** Blocking probe for a matching message. */
extern int mca_pml_ob1_probe( int dst,
                              int tag,
                              struct ompi_communicator_t* comm,
                              ompi_status_public_t* status );

/** Non-blocking matched probe; on a match, *message receives a handle
 *  usable with mca_pml_ob1_imrecv/mrecv. */
extern int mca_pml_ob1_improbe( int dst,
                                int tag,
                                struct ompi_communicator_t* comm,
                                int *matched,
                                struct ompi_message_t **message,
                                ompi_status_public_t* status );

/** Blocking matched probe (see mca_pml_ob1_improbe). */
extern int mca_pml_ob1_mprobe( int dst,
                               int tag,
                               struct ompi_communicator_t* comm,
                               struct ompi_message_t **message,
                               ompi_status_public_t* status );

/** Build a persistent send request (started later via mca_pml_ob1_start). */
extern int mca_pml_ob1_isend_init( void *buf,
                                   size_t count,
                                   ompi_datatype_t *datatype,
                                   int dst,
                                   int tag,
                                   mca_pml_base_send_mode_t mode,
                                   struct ompi_communicator_t* comm,
                                   struct ompi_request_t **request );

/** Non-blocking send. */
extern int mca_pml_ob1_isend( void *buf,
                              size_t count,
                              ompi_datatype_t *datatype,
                              int dst,
                              int tag,
                              mca_pml_base_send_mode_t mode,
                              struct ompi_communicator_t* comm,
                              struct ompi_request_t **request );

/** Blocking send. */
extern int mca_pml_ob1_send( void *buf,
                             size_t count,
                             ompi_datatype_t *datatype,
                             int dst,
                             int tag,
                             mca_pml_base_send_mode_t mode,
                             struct ompi_communicator_t* comm );

/** Build a persistent receive request (started later via mca_pml_ob1_start). */
extern int mca_pml_ob1_irecv_init( void *buf,
                                   size_t count,
                                   ompi_datatype_t *datatype,
                                   int src,
                                   int tag,
                                   struct ompi_communicator_t* comm,
                                   struct ompi_request_t **request );

/** Non-blocking receive. */
extern int mca_pml_ob1_irecv( void *buf,
                              size_t count,
                              ompi_datatype_t *datatype,
                              int src,
                              int tag,
                              struct ompi_communicator_t* comm,
                              struct ompi_request_t **request );

/** Blocking receive. */
extern int mca_pml_ob1_recv( void *buf,
                             size_t count,
                             ompi_datatype_t *datatype,
                             int src,
                             int tag,
                             struct ompi_communicator_t* comm,
                             ompi_status_public_t* status );

/** Non-blocking receive of a message handle obtained from improbe/mprobe. */
extern int mca_pml_ob1_imrecv( void *buf,
                               size_t count,
                               ompi_datatype_t *datatype,
                               struct ompi_message_t **message,
                               struct ompi_request_t **request );

/** Blocking receive of a message handle obtained from improbe/mprobe. */
extern int mca_pml_ob1_mrecv( void *buf,
                              size_t count,
                              ompi_datatype_t *datatype,
                              struct ompi_message_t **message,
                              ompi_status_public_t* status );

/** Dump diagnostic state for the given communicator. */
extern int mca_pml_ob1_dump( struct ompi_communicator_t* comm,
                             int verbose );

/** Start count persistent requests built by the *_init functions. */
extern int mca_pml_ob1_start( size_t count,
                              ompi_request_t** requests );

/** Fault-tolerance event notification. */
extern int mca_pml_ob1_ft_event( int state );
208 END_C_DECLS
209 
211  ompi_free_list_item_t super;
212  ompi_proc_t* proc;
213  mca_pml_ob1_hdr_t hdr;
214  struct mca_bml_base_btl_t *bml_btl;
215  uint8_t order;
216 };
219 
/**
 * Allocate a pending-packet descriptor from the global free list.
 * Waits (OMPI_FREE_LIST_WAIT) until an element is available; the list's
 * status is stored into rc and the descriptor into pckt.
 */
#define MCA_PML_OB1_PCKT_PENDING_ALLOC(pckt,rc)                 \
do {                                                            \
    ompi_free_list_item_t* item;                                \
    OMPI_FREE_LIST_WAIT(&mca_pml_ob1.pending_pckts, item, rc);  \
    pckt = (mca_pml_ob1_pckt_pending_t*)item;                   \
} while (0)
226 
/**
 * Return a pending-packet descriptor to the global free list.
 * Counterpart of MCA_PML_OB1_PCKT_PENDING_ALLOC.
 */
#define MCA_PML_OB1_PCKT_PENDING_RETURN(pckt)                   \
do {                                                            \
    /* return packet */                                         \
    OMPI_FREE_LIST_RETURN(&mca_pml_ob1.pending_pckts,           \
        (ompi_free_list_item_t*)pckt);                          \
} while(0)
233 
/**
 * Queue a FIN control packet for later (re)transmission.
 *
 * Fills a pending-packet descriptor with a FIN header and appends it to
 * mca_pml_ob1.pckt_pending under the module lock.  Arguments:
 *   P - destination ompi_proc_t*
 *   D - descriptor pointer stored in hdr_fin.hdr_des
 *   B - bml_btl the FIN should eventually go out on
 *   O - ordering tag (uint8_t)
 *   S - value stored in hdr_fin.hdr_fail (failure/status flag)
 *
 * The allocation status _rc is deliberately discarded: ALLOC waits until
 * a descriptor is available.
 */
#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, B, O, S)                   \
    do {                                                                \
        mca_pml_ob1_pckt_pending_t *_pckt;                              \
        int _rc;                                                        \
                                                                        \
        MCA_PML_OB1_PCKT_PENDING_ALLOC(_pckt,_rc);                      \
        _pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN;      \
        _pckt->hdr.hdr_fin.hdr_des = (D);                               \
        _pckt->hdr.hdr_fin.hdr_fail = (S);                              \
        _pckt->proc = (P);                                              \
        _pckt->bml_btl = (B);                                           \
        _pckt->order = (O);                                             \
        OPAL_THREAD_LOCK(&mca_pml_ob1.lock);                            \
        opal_list_append(&mca_pml_ob1.pckt_pending,                     \
            (opal_list_item_t*)_pckt);                                  \
        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);                          \
        (void)_rc;                                                      \
    } while(0)
252 
253 
255  ompi_ptr_t hdr_des, uint8_t order, uint32_t status);
256 
/* This function tries to resend FIN/ACK packets from the pckt_pending queue.
 * Packets are added to the queue when sending a FIN or ACK fails due to
 * resource unavailability. The bml_btl passed to the function doesn't
 * represent the packet's destination; it represents the BTL on which a
 * resource was freed, so only this BTL should be considered for resending
 * packets. */
262 void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl);
263 
264 /* This function retries failed PUT/GET operations on frag. When RDMA operation
265  * cannot be accomplished for some reason, frag is put on the rdma_pending list.
266  * Later the operation is retried. The destination of RDMA operation is stored
267  * inside the frag structure */
268 void mca_pml_ob1_process_pending_rdma(void);
269 
/**
 * Drain the four pending-operation queues after a resource was freed on
 * bml_btl: queued control packets, receives, sends, and RDMA retries.
 * Each queue is processed only if it is non-empty.
 */
#define MCA_PML_OB1_PROGRESS_PENDING(bml_btl)                   \
    do {                                                        \
        if(opal_list_get_size(&mca_pml_ob1.pckt_pending))       \
            mca_pml_ob1_process_pending_packets(bml_btl);       \
        if(opal_list_get_size(&mca_pml_ob1.recv_pending))       \
            mca_pml_ob1_recv_request_process_pending();         \
        if(opal_list_get_size(&mca_pml_ob1.send_pending))       \
            mca_pml_ob1_send_request_process_pending(bml_btl);  \
        if(opal_list_get_size(&mca_pml_ob1.rdma_pending))       \
            mca_pml_ob1_process_pending_rdma();                 \
    } while (0)
281 
/*
 * Compute the total number of bytes on supplied descriptor.
 *
 * Sums seg_len over the first (count) entries of (segments) and adds the
 * total, minus (hdrlen), into (length).  NOTE: the caller must initialize
 * (length) before invoking this macro -- it accumulates, it does not
 * assign.  All arguments are parenthesized in the expansion so callers
 * may safely pass arbitrary expressions.
 */
#define MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH(segments, count, hdrlen, length) \
do {                                                                        \
    size_t _i;                                                              \
                                                                            \
    for( _i = 0; _i < (count); _i++ ) {                                     \
        (length) += (segments)[_i].seg_len;                                 \
    }                                                                       \
    (length) -= (hdrlen);                                                   \
} while(0)
294 
295 /* represent BTL chosen for sending request */
297  mca_bml_base_btl_t *bml_btl;
298  struct mca_mpool_base_registration_t* btl_reg;
299  size_t length;
300 };
302 
303 int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2);
304 
305 /* Calculate what percentage of a message to send through each BTL according to
306  * relative weight */
307 static inline void
308 mca_pml_ob1_calc_weighted_length( mca_pml_ob1_com_btl_t *btls, int num_btls, size_t size,
309  double weight_total )
310 {
311  int i;
312  size_t length_left;
313 
314  /* shortcut for common case for only one BTL */
315  if( OPAL_LIKELY(1 == num_btls) ) {
316  btls[0].length = size;
317  return;
318  }
319 
320  /* sort BTLs according of their weights so BTLs with smaller weight will
321  * not hijack all of the traffic */
322  qsort( btls, num_btls, sizeof(mca_pml_ob1_com_btl_t),
323  mca_pml_ob1_com_btl_comp );
324 
325  for(length_left = size, i = 0; i < num_btls; i++) {
326  mca_bml_base_btl_t* bml_btl = btls[i].bml_btl;
327  size_t length = 0;
328  if( OPAL_UNLIKELY(0 != length_left) ) {
329  length = (length_left > bml_btl->btl->btl_eager_limit)?
330  ((size_t)(size * (bml_btl->btl_weight / weight_total))) :
331  length_left;
332 
333  if(length > length_left)
334  length = length_left;
335  length_left -= length;
336  }
337  btls[i].length = length;
338  }
339 
340  /* account for rounding errors */
341  btls[0].length += length_left;
342 }
343 
344 #endif
float btl_weight
BTL weight for scheduling.
Definition: bml.h:60
P2P Management Layer (PML)
Definition: ompi_datatype.h:68
Definition: types.h:52
Definition: mutex_unix.h:53
Definition: pml_ob1.h:296
Definition: mpool.h:44
Process identification structure interface.
Remote Open MPI process structure.
Definition: proc.h:56
Union of defined hdr types.
Definition: pml_ob1_hdr.h:359
Top-level description of requests.
OB1 PML module.
Definition: pml_ob1.h:46
PML instance.
Definition: pml.h:512
Definition: ompi_free_list.h:39
Definition: ompi_free_list.h:62
Definition: bml.h:58
Definition: opal_list.h:147
Definition: mpi.h:337
Definition: pml_ob1.h:210
struct mca_btl_base_module_t * btl
BTL module.
Definition: bml.h:61
Definition: evdns.c:158
int mca_pml_ob1_start(size_t count, ompi_request_t **requests)
Definition: pml_ob1_start.c:29
Definition: communicator.h:118
Main top-level request struct definition.
Definition: request.h:100
int mca_pml_ob1_enable(bool enable)
Definition: pml_ob1.c:86
size_t btl_eager_limit
maximum size of first fragment – eager send
Definition: btl.h:790
The data structure for each component.
Definition: allocator.h:78
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236
int mca_pml_ob1_send_fin(ompi_proc_t *proc, mca_bml_base_btl_t *bml_btl, ompi_ptr_t hdr_des, uint8_t order, uint32_t status)
Send an FIN to the peer.
Definition: pml_ob1.c:598
Definition: message.h:22