pml_csum.h
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2007 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2009      IBM Corporation.  All rights reserved.
 * Copyright (c) 2009      Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/**
 * @file
 */

#ifndef MCA_PML_CSUM_H
#define MCA_PML_CSUM_H

#include "ompi_config.h"
#include "ompi/class/ompi_free_list.h"
#include "ompi/request/request.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/pml/base/pml_base_request.h"
#include "ompi/mca/pml/base/pml_base_bsend.h"
#include "ompi/mca/pml/base/pml_base_sendreq.h"
#include "ompi/datatype/ompi_datatype.h"
#include "pml_csum_hdr.h"
#include "ompi/mca/bml/base/base.h"
#include "ompi/proc/proc.h"
#include "ompi/mca/allocator/base/base.h"

BEGIN_C_DECLS

/**
 * CSUM PML module
 */
struct mca_pml_csum_t {
    mca_pml_base_module_t super;

    int priority;
    int free_list_num;      /* initial size of free list */
    int free_list_max;      /* maximum size of free list */
    int free_list_inc;      /* number of elements to grow free list */
    size_t send_pipeline_depth;
    size_t recv_pipeline_depth;
    size_t rdma_put_retries_limit;
    int max_rdma_per_request;
    int max_send_per_range;
    bool leave_pinned;
    int leave_pinned_pipeline;

    /* lock queue access */
    opal_mutex_t lock;

    /* free lists */
    ompi_free_list_t rdma_frags;
    ompi_free_list_t recv_frags;
    ompi_free_list_t pending_pckts;
    ompi_free_list_t buffers;
    ompi_free_list_t send_ranges;

    /* lists of pending operations */
    opal_list_t pckt_pending;
    opal_list_t send_pending;
    opal_list_t recv_pending;
    opal_list_t rdma_pending;
    /* list of pending fragments without a matching communicator */
    opal_list_t non_existing_communicator_pending;

    bool enabled;
    char* allocator_name;
    mca_allocator_base_module_t* allocator;
    uint32_t unexpected_limit;
};
typedef struct mca_pml_csum_t mca_pml_csum_t;
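
/*
 * Illustrative sketch (not part of the original header): the free-list
 * knobs above are plain element counts; the lists start at free_list_num
 * entries, grow free_list_inc entries at a time, and are capped at
 * free_list_max.  A hypothetical helper computing the next allowed list
 * size from these fields might look like:
 *
 *   static inline int pml_csum_example_next_list_size(int current)
 *   {
 *       int next = current + mca_pml_csum.free_list_inc;
 *       if (mca_pml_csum.free_list_max > 0 &&
 *           next > mca_pml_csum.free_list_max)
 *           next = mca_pml_csum.free_list_max;
 *       return next;
 *   }
 */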

extern mca_pml_csum_t mca_pml_csum;
extern int mca_pml_csum_output;

/*
 * PML interface functions.
 */

extern int mca_pml_csum_add_comm(
    struct ompi_communicator_t* comm
);

extern int mca_pml_csum_del_comm(
    struct ompi_communicator_t* comm
);

extern int mca_pml_csum_add_procs(
    struct ompi_proc_t **procs,
    size_t nprocs
);

extern int mca_pml_csum_del_procs(
    struct ompi_proc_t **procs,
    size_t nprocs
);

extern int mca_pml_csum_enable( bool enable );

extern int mca_pml_csum_progress(void);

extern int mca_pml_csum_iprobe( int dst,
                                int tag,
                                struct ompi_communicator_t* comm,
                                int *matched,
                                ompi_status_public_t* status );

extern int mca_pml_csum_probe( int dst,
                               int tag,
                               struct ompi_communicator_t* comm,
                               ompi_status_public_t* status );

extern int mca_pml_csum_improbe( int dst,
                                 int tag,
                                 struct ompi_communicator_t* comm,
                                 int *matched,
                                 struct ompi_message_t **message,
                                 ompi_status_public_t* status );

extern int mca_pml_csum_mprobe( int dst,
                                int tag,
                                struct ompi_communicator_t* comm,
                                struct ompi_message_t **message,
                                ompi_status_public_t* status );

extern int mca_pml_csum_isend_init( void *buf,
                                    size_t count,
                                    ompi_datatype_t *datatype,
                                    int dst,
                                    int tag,
                                    mca_pml_base_send_mode_t mode,
                                    struct ompi_communicator_t* comm,
                                    struct ompi_request_t **request );

extern int mca_pml_csum_isend( void *buf,
                               size_t count,
                               ompi_datatype_t *datatype,
                               int dst,
                               int tag,
                               mca_pml_base_send_mode_t mode,
                               struct ompi_communicator_t* comm,
                               struct ompi_request_t **request );

extern int mca_pml_csum_send( void *buf,
                              size_t count,
                              ompi_datatype_t *datatype,
                              int dst,
                              int tag,
                              mca_pml_base_send_mode_t mode,
                              struct ompi_communicator_t* comm );

extern int mca_pml_csum_irecv_init( void *buf,
                                    size_t count,
                                    ompi_datatype_t *datatype,
                                    int src,
                                    int tag,
                                    struct ompi_communicator_t* comm,
                                    struct ompi_request_t **request );

extern int mca_pml_csum_irecv( void *buf,
                               size_t count,
                               ompi_datatype_t *datatype,
                               int src,
                               int tag,
                               struct ompi_communicator_t* comm,
                               struct ompi_request_t **request );

extern int mca_pml_csum_recv( void *buf,
                              size_t count,
                              ompi_datatype_t *datatype,
                              int src,
                              int tag,
                              struct ompi_communicator_t* comm,
                              ompi_status_public_t* status );

extern int mca_pml_csum_imrecv( void *buf,
                                size_t count,
                                ompi_datatype_t *datatype,
                                struct ompi_message_t **message,
                                struct ompi_request_t **request );

extern int mca_pml_csum_mrecv( void *buf,
                               size_t count,
                               ompi_datatype_t *datatype,
                               struct ompi_message_t **message,
                               ompi_status_public_t* status );

extern int mca_pml_csum_dump( struct ompi_communicator_t* comm,
                              int verbose );

extern int mca_pml_csum_start( size_t count,
                               ompi_request_t** requests );

extern int mca_pml_csum_ft_event( int state );

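/*
 * Illustrative note (an assumption about framework wiring, not stated in
 * this header): the MPI layer reaches these entry points through the
 * function table held in mca_pml_csum.super, so an application call such
 * as MPI_Send(buf, 4, MPI_INT, 1, 42, MPI_COMM_WORLD) arrives here
 * roughly as:
 *
 *   mca_pml_csum_send(buf, 4, &ompi_mpi_int.dt, 1, 42,
 *                     MCA_PML_BASE_SEND_STANDARD,
 *                     &ompi_mpi_comm_world.comm);
 */
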
END_C_DECLS

struct mca_pml_csum_pckt_pending_t {
    ompi_free_list_item_t super;
    ompi_proc_t* proc;
    mca_pml_csum_hdr_t hdr;
    struct mca_bml_base_btl_t *bml_btl;
    uint8_t order;
};
typedef struct mca_pml_csum_pckt_pending_t mca_pml_csum_pckt_pending_t;
OBJ_CLASS_DECLARATION(mca_pml_csum_pckt_pending_t);

#define MCA_PML_CSUM_PCKT_PENDING_ALLOC(pckt,rc)                 \
do {                                                             \
    ompi_free_list_item_t* item;                                 \
    OMPI_FREE_LIST_WAIT(&mca_pml_csum.pending_pckts, item, rc);  \
    pckt = (mca_pml_csum_pckt_pending_t*)item;                   \
} while (0)

#define MCA_PML_CSUM_PCKT_PENDING_RETURN(pckt)                   \
do {                                                             \
    /* return packet */                                          \
    OMPI_FREE_LIST_RETURN(&mca_pml_csum.pending_pckts,           \
                          (ompi_free_list_item_t*)pckt);         \
} while(0)
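
/*
 * Illustrative usage sketch (hypothetical caller, not from the original
 * header): take a pending-packet slot from the free list, fill it in, and
 * hand it back when done.
 *
 *   mca_pml_csum_pckt_pending_t *pckt;
 *   int rc;
 *
 *   MCA_PML_CSUM_PCKT_PENDING_ALLOC(pckt, rc);   // waits for a free slot
 *   if (OMPI_SUCCESS == rc) {
 *       pckt->proc = proc;                       // describe the deferred packet
 *       MCA_PML_CSUM_PCKT_PENDING_RETURN(pckt);  // give the slot back
 *   }
 */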

#define MCA_PML_CSUM_ADD_FIN_TO_PENDING(P, D, B, O, S)                  \
    do {                                                                \
        mca_pml_csum_pckt_pending_t *_pckt;                             \
        int _rc;                                                        \
                                                                        \
        MCA_PML_CSUM_PCKT_PENDING_ALLOC(_pckt,_rc);                     \
        _pckt->hdr.hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_FIN;     \
        _pckt->hdr.hdr_fin.hdr_des = (D);                               \
        _pckt->hdr.hdr_fin.hdr_fail = (S);                              \
        _pckt->proc = (P);                                              \
        _pckt->bml_btl = (B);                                           \
        _pckt->order = (O);                                             \
        OPAL_THREAD_LOCK(&mca_pml_csum.lock);                           \
        opal_list_append(&mca_pml_csum.pckt_pending,                    \
                         (opal_list_item_t*)_pckt);                     \
        OPAL_THREAD_UNLOCK(&mca_pml_csum.lock);                         \
    } while(0)

/**
 * Send a FIN to the peer.
 */
int mca_pml_csum_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
                          ompi_ptr_t hdr_des, uint8_t order, uint32_t status);
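
/*
 * Illustrative sketch (a hypothetical caller-side pattern; this header does
 * not prescribe it): if the FIN cannot be handed to the BTL because its
 * resources are exhausted, the packet is parked on pckt_pending and resent
 * once the BTL frees resources.
 *
 *   if (OMPI_ERR_OUT_OF_RESOURCE ==
 *           mca_pml_csum_send_fin(proc, bml_btl, hdr_des, order, status)) {
 *       MCA_PML_CSUM_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
 *   }
 */
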
/* This function tries to resend FIN/ACK packets from the pckt_pending queue.
 * Packets are added to the queue when sending a FIN or ACK fails due to
 * resource unavailability.  The bml_btl passed to the function does not
 * represent the packet's destination; it represents the BTL on which a
 * resource was freed, so only this BTL should be considered for resending
 * packets. */
void mca_pml_csum_process_pending_packets(mca_bml_base_btl_t* bml_btl);

/* This function retries failed PUT/GET operations on a frag.  When an RDMA
 * operation cannot be completed for some reason, the frag is put on the
 * rdma_pending list and the operation is retried later.  The destination of
 * the RDMA operation is stored inside the frag structure. */
void mca_pml_csum_process_pending_rdma(void);

#define MCA_PML_CSUM_PROGRESS_PENDING(bml_btl)                      \
    do {                                                            \
        if(opal_list_get_size(&mca_pml_csum.pckt_pending))          \
            mca_pml_csum_process_pending_packets(bml_btl);          \
        if(opal_list_get_size(&mca_pml_csum.recv_pending))          \
            mca_pml_csum_recv_request_process_pending();            \
        if(opal_list_get_size(&mca_pml_csum.send_pending))          \
            mca_pml_csum_send_request_process_pending(bml_btl);     \
        if(opal_list_get_size(&mca_pml_csum.rdma_pending))          \
            mca_pml_csum_process_pending_rdma();                    \
    } while (0)
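
/*
 * Illustrative sketch (hypothetical completion callback, not part of the
 * original header): once a BTL descriptor completes and its resources are
 * returned, work that previously failed for lack of resources gets another
 * chance on that BTL.
 *
 *   static void pml_csum_example_completion(mca_bml_base_btl_t* bml_btl)
 *   {
 *       // resources were just freed on bml_btl, so drain the queues
 *       MCA_PML_CSUM_PROGRESS_PENDING(bml_btl);
 *   }
 */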

/*
 * Compute the total number of bytes in the supplied descriptor's segments,
 * excluding the header.
 */
#define MCA_PML_CSUM_COMPUTE_SEGMENT_LENGTH(segments, count, hdrlen, length) \
do {                                                                    \
    size_t i;                                                           \
                                                                        \
    for( i = 0; i < count; i++ ) {                                      \
        length += segments[i].seg_len;                                  \
    }                                                                   \
    length -= hdrlen;                                                   \
} while(0)
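
/*
 * Illustrative usage sketch (hypothetical variable names): the macro only
 * accumulates into `length`, so the caller must initialize it to zero.
 *
 *   size_t length = 0;
 *   MCA_PML_CSUM_COMPUTE_SEGMENT_LENGTH(segments, num_segments,
 *                                       hdr_size, length);
 *   // length now holds the payload bytes: the sum of the segment
 *   // lengths minus the hdr_size bytes of header
 */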

/* Represents the BTL chosen for sending a request. */
struct mca_pml_csum_com_btl_t {
    mca_bml_base_btl_t *bml_btl;
    struct mca_mpool_base_registration_t* btl_reg;
    size_t length;
};
typedef struct mca_pml_csum_com_btl_t mca_pml_csum_com_btl_t;

int mca_pml_csum_com_btl_comp(const void *v1, const void *v2);

/* Calculate what percentage of a message to send through each BTL according
 * to its relative weight. */
static inline void
mca_pml_csum_calc_weighted_length( mca_pml_csum_com_btl_t *btls, int num_btls, size_t size,
                                   double weight_total )
{
    int i;
    size_t length_left;

    /* shortcut for the common case of only one BTL */
    if( OPAL_LIKELY(1 == num_btls) ) {
        btls[0].length = size;
        return;
    }

    /* sort BTLs according to their weights so that BTLs with smaller weights
     * will not hijack all of the traffic */
    qsort( btls, num_btls, sizeof(mca_pml_csum_com_btl_t),
           mca_pml_csum_com_btl_comp );

    for(length_left = size, i = 0; i < num_btls; i++) {
        mca_bml_base_btl_t* bml_btl = btls[i].bml_btl;
        size_t length = 0;
        if( OPAL_UNLIKELY(0 != length_left) ) {
            length = (length_left > bml_btl->btl->btl_eager_limit)?
                ((size_t)(size * (bml_btl->btl_weight / weight_total))) :
                length_left;

            if(length > length_left)
                length = length_left;
            length_left -= length;
        }
        btls[i].length = length;
    }

    /* account for rounding errors */
    btls[0].length += length_left;
}
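
/*
 * Worked example (illustrative numbers): for size = 1048576 bytes spread
 * over two BTLs with weights 0.75 and 0.25 (weight_total = 1.0), and with
 * length_left above each BTL's eager limit when it is visited, the loop
 * assigns roughly 0.75 * 1048576 = 786432 bytes to one BTL and the
 * remaining 262144 bytes to the other; any bytes lost to rounding are
 * folded back into btls[0].length at the end.
 */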

#endif