OpenMPI  0.1.1
pml_csum_sendreq.h
1 /*
2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2010 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2009 IBM Corporation. All rights reserved.
13  * Copyright (c) 2009 Los Alamos National Security, LLC. All rights
14  * reserved.
15  * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
16  * $COPYRIGHT$
17  *
18  * Additional copyrights may follow
19  *
20  * $HEADER$
21  */
22 
23 #ifndef OMPI_PML_CSUM_SEND_REQUEST_H
24 #define OMPI_PML_CSUM_SEND_REQUEST_H
25 
26 #include "ompi/mca/btl/btl.h"
29 #include "pml_csum_comm.h"
30 #include "pml_csum_hdr.h"
31 #include "pml_csum_rdma.h"
32 #include "pml_csum_rdmafrag.h"
33 #include "opal/datatype/opal_convertor.h"
34 #include "ompi/mca/bml/bml.h"
35 
36 BEGIN_C_DECLS
37 
/**
 * Reason a send request is sitting on the PML send_pending list.
 * The value is stored in the request's req_pending field and handed back
 * to the caller when the request is taken off the list again.
 */
typedef enum {
    /* request is not queued on the pending list */
    MCA_PML_CSUM_SEND_PENDING_NONE = 0,
    /* queued because there was no resource to schedule data for sending */
    MCA_PML_CSUM_SEND_PENDING_SCHEDULE = 1,
    /* queued because no eager BTL had resources for the initial packet */
    MCA_PML_CSUM_SEND_PENDING_START = 2
} mca_pml_csum_send_pending_t;
43 
46  mca_bml_base_endpoint_t* req_endpoint;
47  ompi_ptr_t req_recv;
48  int32_t req_state;
49  int32_t req_lock;
50  bool req_throttle_sends;
51  size_t req_pipeline_depth;
52  size_t req_bytes_delivered;
53  uint32_t req_rdma_cnt;
54  mca_pml_csum_send_pending_t req_pending;
55  opal_mutex_t req_send_range_lock;
56  opal_list_t req_send_ranges;
57  mca_pml_csum_com_btl_t req_rdma[1];
58 };
60 
62 
65  uint64_t range_send_offset;
66  uint64_t range_send_length;
67  int range_btl_idx;
68  int range_btl_cnt;
69  mca_pml_csum_com_btl_t range_btls[1];
70 };
73 
74 static inline bool lock_send_request(mca_pml_csum_send_request_t *sendreq)
75 {
76  return OPAL_THREAD_ADD32(&sendreq->req_lock, 1) == 1;
77 }
78 
79 static inline bool unlock_send_request(mca_pml_csum_send_request_t *sendreq)
80 {
81  return OPAL_THREAD_ADD32(&sendreq->req_lock, -1) == 0;
82 }
83 
84 static inline void
85 add_request_to_send_pending(mca_pml_csum_send_request_t* sendreq,
86  const mca_pml_csum_send_pending_t type,
87  const bool append)
88 {
89  opal_list_item_t *item = (opal_list_item_t*)sendreq;
90 
91  OPAL_THREAD_LOCK(&mca_pml_csum.lock);
92  sendreq->req_pending = type;
93  if(append)
94  opal_list_append(&mca_pml_csum.send_pending, item);
95  else
96  opal_list_prepend(&mca_pml_csum.send_pending, item);
97 
98  OPAL_THREAD_UNLOCK(&mca_pml_csum.lock);
99 }
100 
101 static inline mca_pml_csum_send_request_t*
102 get_request_from_send_pending(mca_pml_csum_send_pending_t *type)
103 {
105 
106  OPAL_THREAD_LOCK(&mca_pml_csum.lock);
107  sendreq = (mca_pml_csum_send_request_t*)
108  opal_list_remove_first(&mca_pml_csum.send_pending);
109  if(sendreq) {
110  *type = sendreq->req_pending;
111  sendreq->req_pending = MCA_PML_CSUM_SEND_PENDING_NONE;
112  }
113  OPAL_THREAD_UNLOCK(&mca_pml_csum.lock);
114 
115  return sendreq;
116 }
117 
/*
 * Obtain a send request from the mca_pml_base_send_requests free list.
 *
 *  comm     communicator used to look up the peer
 *  dst      rank of the peer in comm
 *  sendreq  (out) lvalue receiving the request; only assigned when the peer
 *           lookup succeeded
 *  rc       (out) lvalue receiving the status: OMPI_ERR_OUT_OF_RESOURCE when
 *           ompi_comm_peer_lookup() returned NULL, otherwise whatever
 *           OMPI_FREE_LIST_WAIT leaves in it (initialised to OMPI_SUCCESS)
 *
 * The sendreq and rc arguments are parenthesised so that any lvalue
 * expression (e.g. *ptr) can be passed. The expansion declares locals named
 * proc and item, so do not pass expressions that use those names.
 */
#define MCA_PML_CSUM_SEND_REQUEST_ALLOC( comm,                          \
                                         dst,                           \
                                         sendreq,                       \
                                         rc)                            \
    {                                                                   \
        ompi_proc_t *proc = ompi_comm_peer_lookup( comm, dst );         \
        ompi_free_list_item_t* item;                                    \
                                                                        \
        (rc) = OMPI_ERR_OUT_OF_RESOURCE;                                \
        if( OPAL_LIKELY(NULL != proc) ) {                               \
            (rc) = OMPI_SUCCESS;                                        \
            OMPI_FREE_LIST_WAIT(&mca_pml_base_send_requests, item, rc); \
            (sendreq) = (mca_pml_csum_send_request_t*)item;             \
            (sendreq)->req_send.req_base.req_proc = proc;               \
        }                                                               \
    }
134 
135 
/*
 * Initialise a CSUM send request.
 *
 * Forwards buf, count, datatype, dst, tag, comm, sendmode and persistent to
 * MCA_PML_BASE_SEND_REQUEST_INIT for the embedded base request (with
 * convertor flags 0) and then clears req_recv.pval.
 *
 * sendreq is evaluated more than once and is parenthesised so that any
 * pointer expression (e.g. *ptr) can be passed.
 */
#define MCA_PML_CSUM_SEND_REQUEST_INIT(sendreq,                         \
                                       buf,                             \
                                       count,                           \
                                       datatype,                        \
                                       dst,                             \
                                       tag,                             \
                                       comm,                            \
                                       sendmode,                        \
                                       persistent)                      \
    {                                                                   \
        MCA_PML_BASE_SEND_REQUEST_INIT(&(sendreq)->req_send,            \
                                       buf,                             \
                                       count,                           \
                                       datatype,                        \
                                       dst,                             \
                                       tag,                             \
                                       comm,                            \
                                       sendmode,                        \
                                       persistent,                      \
                                       0); /* convertor_flags */        \
        (sendreq)->req_recv.pval = NULL;                                \
    }
158 
159 
160 static inline void mca_pml_csum_free_rdma_resources(mca_pml_csum_send_request_t* sendreq)
161 {
162  size_t r;
163 
164  /* return mpool resources */
165  for(r = 0; r < sendreq->req_rdma_cnt; r++) {
166  mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg;
167  if( NULL != reg && reg->mpool != NULL ) {
168  reg->mpool->mpool_deregister(reg->mpool, reg);
169  }
170  }
171  sendreq->req_rdma_cnt = 0;
172 }
173 
174 
/**
 * Start a send request: calls mca_pml_csum_send_request_start(sendreq)
 * and stores its return value in rc.
 */
#define MCA_PML_CSUM_SEND_REQUEST_START(sendreq, rc)        \
    do {                                                    \
        (rc) = mca_pml_csum_send_request_start(sendreq);    \
    } while (0)
183 
184 
/*
 * Mark a send request as completed at the MPI level.
 *
 * Fills the request status (source = own rank in the request's communicator,
 * tag = request tag, error = OMPI_SUCCESS, count = packed byte count), calls
 * ompi_request_complete() with with_signal, and emits the PERUSE
 * request-complete event.
 *
 * sendreq is evaluated several times and is parenthesised at every use so
 * that any pointer expression can be passed.
 */

#define MCA_PML_CSUM_SEND_REQUEST_MPI_COMPLETE(sendreq, with_signal)                   \
do {                                                                                   \
   (sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE =                       \
       (sendreq)->req_send.req_base.req_comm->c_my_rank;                               \
   (sendreq)->req_send.req_base.req_ompi.req_status.MPI_TAG =                          \
        (sendreq)->req_send.req_base.req_tag;                                          \
   (sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS;          \
   (sendreq)->req_send.req_base.req_ompi.req_status._ucount =                          \
        (sendreq)->req_send.req_bytes_packed;                                          \
   ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi), (with_signal) );   \
                                                                                       \
   PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE,                                  \
                            &((sendreq)->req_send.req_base), PERUSE_SEND);             \
} while(0)
203 
/*
 * Release resources associated with a request: finalise the embedded base
 * send request, then hand the request back to the
 * mca_pml_base_send_requests free list.
 *
 * sendreq is evaluated twice and is parenthesised at every use; in
 * particular the free-list cast applies to the whole argument expression.
 */

#define MCA_PML_CSUM_SEND_REQUEST_RETURN(sendreq)                           \
    do {                                                                    \
        /*  Let the base handle the reference counts */                     \
        MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send));             \
        OMPI_FREE_LIST_RETURN( &mca_pml_base_send_requests,                 \
                               (ompi_free_list_item_t*)(sendreq));          \
    } while(0)
215 
216 
217 /*
218  * The PML has completed a send request. Note that this request
219  * may have been orphaned by the user or have already completed
220  * at the MPI level.
221  * This function will never be called directly from the upper level, as it
222  * should only be an internal call to the PML.
223  *
224  */
225 static inline void
226 send_request_pml_complete(mca_pml_csum_send_request_t *sendreq)
227 {
228  assert(false == sendreq->req_send.req_base.req_pml_complete);
229 
230  if(sendreq->req_send.req_bytes_packed > 0) {
231  PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
232  &(sendreq->req_send.req_base), PERUSE_SEND);
233  }
234 
235  /* return mpool resources */
236  mca_pml_csum_free_rdma_resources(sendreq);
237 
238  if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED &&
239  sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) {
240  mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq);
241  }
242 
243  OPAL_THREAD_LOCK(&ompi_request_lock);
244  if(false == sendreq->req_send.req_base.req_ompi.req_complete) {
245  /* Should only be called for long messages (maybe synchronous) */
246  MCA_PML_CSUM_SEND_REQUEST_MPI_COMPLETE(sendreq, true);
247  }
248  sendreq->req_send.req_base.req_pml_complete = true;
249 
250  if(sendreq->req_send.req_base.req_free_called) {
251  MCA_PML_CSUM_SEND_REQUEST_RETURN(sendreq);
252  }
253  OPAL_THREAD_UNLOCK(&ompi_request_lock);
254 }
255 
256 /* returns true if request was completed on PML level */
257 static inline bool
258 send_request_pml_complete_check(mca_pml_csum_send_request_t *sendreq)
259 {
260 #if OPAL_ENABLE_MULTI_THREADS
261  opal_atomic_rmb();
262 #endif
263  /* if no more events are expected for the request and the whole message is
264  * already sent and send fragment scheduling isn't running in another
265  * thread then complete the request on PML level. From now on, if user
266  * called free on this request, the request structure can be reused for
267  * another request or if the request is persistent it can be restarted */
268  if(sendreq->req_state == 0 &&
269  sendreq->req_bytes_delivered >= sendreq->req_send.req_bytes_packed
270  && lock_send_request(sendreq)) {
271  send_request_pml_complete(sendreq);
272  return true;
273  }
274 
275  return false;
276 }
277 
278 /**
279  * Schedule additional fragments
280  */
281 int
282 mca_pml_csum_send_request_schedule_once(mca_pml_csum_send_request_t*);
283 
284 static inline int
285 mca_pml_csum_send_request_schedule_exclusive(mca_pml_csum_send_request_t* sendreq)
286 {
287  int rc;
288  do {
289  rc = mca_pml_csum_send_request_schedule_once(sendreq);
290  if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE)
291  break;
292  } while(!unlock_send_request(sendreq));
293 
294  if(OMPI_SUCCESS == rc)
295  send_request_pml_complete_check(sendreq);
296 
297  return rc;
298 }
299 
300 static inline void
301 mca_pml_csum_send_request_schedule(mca_pml_csum_send_request_t* sendreq)
302 {
303  /*
304  * Only allow one thread in this routine for a given request.
305  * However, we cannot block callers on a mutex, so simply keep track
306  * of the number of times the routine has been called and run through
307  * the scheduling logic once for every call.
308  */
309 
310  if(!lock_send_request(sendreq))
311  return;
312 
313  mca_pml_csum_send_request_schedule_exclusive(sendreq);
314 }
315 
316 /**
317  * Start the specified request
318  */
319 
320 int mca_pml_csum_send_request_start_buffered(
322  mca_bml_base_btl_t* bml_btl,
323  size_t size);
324 
325 int mca_pml_csum_send_request_start_copy(
327  mca_bml_base_btl_t* bml_btl,
328  size_t size);
329 
330 int mca_pml_csum_send_request_start_prepare(
332  mca_bml_base_btl_t* bml_btl,
333  size_t size);
334 
335 int mca_pml_csum_send_request_start_rdma(
337  mca_bml_base_btl_t* bml_btl,
338  size_t size);
339 
340 int mca_pml_csum_send_request_start_rndv(
342  mca_bml_base_btl_t* bml_btl,
343  size_t size,
344  int flags);
345 
346 static inline int
347 mca_pml_csum_send_request_start_btl( mca_pml_csum_send_request_t* sendreq,
348  mca_bml_base_btl_t* bml_btl )
349 {
350  size_t size = sendreq->req_send.req_bytes_packed;
351  mca_btl_base_module_t* btl = bml_btl->btl;
352  size_t eager_limit = btl->btl_eager_limit - sizeof(mca_pml_csum_hdr_t);
353  int rc;
354 
355  if( OPAL_LIKELY(size <= eager_limit) ) {
356  switch(sendreq->req_send.req_send_mode) {
357  case MCA_PML_BASE_SEND_SYNCHRONOUS:
358  rc = mca_pml_csum_send_request_start_rndv(sendreq, bml_btl, size, 0);
359  break;
360  case MCA_PML_BASE_SEND_BUFFERED:
361  rc = mca_pml_csum_send_request_start_copy(sendreq, bml_btl, size);
362  break;
363  case MCA_PML_BASE_SEND_COMPLETE:
364  rc = mca_pml_csum_send_request_start_prepare(sendreq, bml_btl, size);
365  break;
366  default:
367  if (size != 0 && bml_btl->btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) {
368  rc = mca_pml_csum_send_request_start_prepare(sendreq, bml_btl, size);
369  } else {
370  rc = mca_pml_csum_send_request_start_copy(sendreq, bml_btl, size);
371  }
372  break;
373  }
374  } else {
375  size = eager_limit;
376  if(OPAL_UNLIKELY(btl->btl_rndv_eager_limit < eager_limit))
377  size = btl->btl_rndv_eager_limit;
378  if(sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) {
379  rc = mca_pml_csum_send_request_start_buffered(sendreq, bml_btl, size);
380  } else if
381  (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
382  unsigned char *base;
383  opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
384 
385  if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_csum_rdma_btls(
386  sendreq->req_endpoint,
387  base,
388  sendreq->req_send.req_bytes_packed,
389  sendreq->req_rdma))) {
390  rc = mca_pml_csum_send_request_start_rdma(sendreq, bml_btl,
391  sendreq->req_send.req_bytes_packed);
392  if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
393  mca_pml_csum_free_rdma_resources(sendreq);
394  }
395  } else {
396  rc = mca_pml_csum_send_request_start_rndv(sendreq, bml_btl, size,
397  MCA_PML_CSUM_HDR_FLAGS_CONTIG);
398  }
399  } else {
400  rc = mca_pml_csum_send_request_start_rndv(sendreq, bml_btl, size, 0);
401  }
402  }
403 
404  return rc;
405 }
406 
407 static inline int
408 mca_pml_csum_send_request_start( mca_pml_csum_send_request_t* sendreq )
409 {
410  mca_pml_csum_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm;
412  sendreq->req_send.req_base.req_proc->proc_bml;
413  size_t i;
414 
415  if( OPAL_UNLIKELY(endpoint == NULL) ) {
416  return OMPI_ERR_UNREACH;
417  }
418 
419  sendreq->req_endpoint = endpoint;
420  sendreq->req_state = 0;
421  sendreq->req_lock = 0;
422  sendreq->req_pipeline_depth = 0;
423  sendreq->req_bytes_delivered = 0;
424  sendreq->req_pending = MCA_PML_CSUM_SEND_PENDING_NONE;
425  sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32(
426  &comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1);
427 
428  MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );
429 
430  for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
431  mca_bml_base_btl_t* bml_btl;
432  int rc;
433 
434  /* select a btl */
435  bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
436  rc = mca_pml_csum_send_request_start_btl(sendreq, bml_btl);
437  if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc)) )
438  return rc;
439  }
440  add_request_to_send_pending(sendreq, MCA_PML_CSUM_SEND_PENDING_START, true);
441 
442  return OMPI_SUCCESS;
443 }
444 
445 /**
446  * Initiate a put scheduled by the receiver.
447  */
448 
449 void mca_pml_csum_send_request_put( mca_pml_csum_send_request_t* sendreq,
452 
453 int mca_pml_csum_send_request_put_frag(mca_pml_csum_rdma_frag_t* frag);
454 
455 /* This function tries to continue sendreq that was stuck because of resource
456  * unavailability. A sendreq may be added to send_pending list if there is no
457  * resource to send initial packet or there is not resource to schedule data
458  * for sending. The reason the sendreq was added to the list is stored inside
459  * sendreq struct and appropriate operation is retried when resource became
460  * available. bml_btl passed to the function doesn't represents sendreq
461  * destination, it represents BTL on which resource was freed, so only this BTL
462  * should be considered for sending packets */
463 void mca_pml_csum_send_request_process_pending(mca_bml_base_btl_t *bml_btl);
464 
465 void mca_pml_csum_send_request_copy_in_out(mca_pml_csum_send_request_t *sendreq,
466  uint64_t send_offset, uint64_t send_length);
467 
468 END_C_DECLS
469 
470 #endif /* OMPI_PML_CSUM_SEND_REQUEST_H */
Header used to initiate an RDMA operation.
Definition: pml_csum_hdr.h:245
#define OPAL_THREAD_ADD32(x, y)
Use an atomic operation for increment/decrement if opal_using_threads() indicates that threads are in...
Definition: mutex.h:367
struct mca_bml_base_endpoint_t * proc_bml
BML specific proc data.
Definition: proc.h:64
uint32_t btl_flags
support for put/get?
Definition: bml.h:59
void opal_atomic_rmb(void)
Read memory barrier.
Definition: pml_csum_sendreq.h:63
Definition: pml_csum.h:297
struct ompi_communicator_t * req_comm
communicator pointer
Definition: pml_base_request.h:63
Union of defined hdr types.
Definition: pml_csum_hdr.h:298
Definition: types.h:52
Cached on ompi_communicator_t to hold queues/state used by the PML<->PTL interface for matching logic...
Definition: pml_bfo_comm.h:51
Definition: mutex_unix.h:53
size_t btl_rndv_eager_limit
the size of a data sent in a first fragment of rendezvous protocol
Definition: btl.h:791
int32_t send_sequence
send side sequence number
Definition: pml_bfo_comm.h:38
int32_t req_peer
peer process - rank w/in this communicator
Definition: pml_base_request.h:71
Definition: mpool.h:44
BML Management Layer (BML)
Definition: opal_list.h:98
#define OPAL_THREAD_LOCK(mutex)
Lock a mutex if opal_using_threads() says that multiple threads may be active in the process...
Definition: mutex.h:223
void * req_addr
pointer to send buffer - may not be application buffer
Definition: pml_base_sendreq.h:39
#define OPAL_THREAD_UNLOCK(mutex)
Unlock a mutex if opal_using_threads() says that multiple threads may be active in the process...
Definition: mutex.h:309
opal_convertor_t req_convertor
always need the convertor
Definition: pml_base_request.h:66
Definition: pml_csum_sendreq.h:44
mca_pml_base_request_t req_base
base request type - common data structure for use by wait/test
Definition: pml_base_sendreq.h:38
#define opal_list_append(l, i)
Append an item to the end of the list.
Definition: opal_list.h:410
volatile bool req_pml_complete
flag indicating if the pt-2-pt layer is done with this request
Definition: pml_base_request.h:61
Byte Transfer Layer (BTL)
struct ompi_proc_t * req_proc
peer process
Definition: pml_base_request.h:73
mca_bml_base_btl_array_t btl_eager
array of btls to use for first fragments
Definition: bml.h:228
Definition: pml_csum_rdmafrag.h:35
volatile bool req_complete
Flag indicating wether request has completed.
Definition: request.h:104
mca_mpool_base_module_deregister_fn_t mpool_deregister
deregister memory
Definition: mpool.h:181
Base type for send requests.
Definition: pml_base_sendreq.h:37
Definition: ompi_free_list.h:62
Structure associated w/ ompi_proc_t that contains the set of BTLs used to reach a destination...
Definition: bml.h:222
void * req_addr
pointer to application buffer
Definition: pml_base_request.h:69
#define MCA_PML_BASE_SEND_START(request)
Mark the request as started from the PML base point of view.
Definition: pml_base_sendreq.h:120
Definition: bml.h:58
Definition: opal_list.h:147
static opal_list_item_t * opal_list_remove_first(opal_list_t *list)
Remove the first item from the list and return it.
Definition: opal_list.h:522
static void opal_list_prepend(opal_list_t *list, opal_list_item_t *item)
Prepend an item to the beginning of the list.
Definition: opal_list.h:469
struct mca_btl_base_module_t * btl
BTL module.
Definition: bml.h:61
ompi_request_t req_ompi
base request
Definition: pml_base_request.h:60
static size_t mca_bml_base_btl_array_get_size(mca_bml_base_btl_array_t *array)
If required, reallocate (grow) the array to the indicate size.
Definition: bml.h:91
BTL module interface functions and attributes.
Definition: btl.h:786
size_t req_bytes_packed
packed size of a message given the datatype and count
Definition: pml_base_sendreq.h:40
mca_pml_base_send_mode_t req_send_mode
type of send
Definition: pml_base_sendreq.h:41
Main top-level request struct definition.
Definition: request.h:100
size_t btl_eager_limit
maximum size of first fragment – eager send
Definition: btl.h:790
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236
uint64_t req_sequence
sequence number for MPI pt-2-pt ordering
Definition: pml_base_request.h:74
volatile bool req_free_called
flag indicating if the user has freed this request
Definition: pml_base_request.h:65
static mca_bml_base_btl_t * mca_bml_base_btl_array_get_next(mca_bml_base_btl_array_t *array)
Return the next LRU index in the array.
Definition: bml.h:179