OpenMPI  0.1.1
pml_bfo_sendreq.h
1 /*
2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2010 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
13  * $COPYRIGHT$
14  *
15  * Additional copyrights may follow
16  *
17  * $HEADER$
18  */
19 
20 #ifndef OMPI_PML_BFO_SEND_REQUEST_H
21 #define OMPI_PML_BFO_SEND_REQUEST_H
22 
23 #include "ompi/mca/btl/btl.h"
26 #include "pml_bfo_comm.h"
27 #include "pml_bfo_hdr.h"
28 #include "pml_bfo_rdma.h"
29 #include "pml_bfo_rdmafrag.h"
30 #include "opal/datatype/opal_convertor.h"
31 #include "ompi/mca/bml/bml.h"
32 
33 BEGIN_C_DECLS
34 
/**
 * Reason a send request is sitting on the PML-wide send_pending list.
 * The value is stored in the request's req_pending field while it is
 * queued and reset to NONE when it is dequeued.
 */
typedef enum {
    MCA_PML_BFO_SEND_PENDING_NONE = 0,  /* request is not queued */
    MCA_PML_BFO_SEND_PENDING_SCHEDULE,  /* presumably: fragment scheduling must be retried */
    MCA_PML_BFO_SEND_PENDING_START      /* starting the request must be retried */
} mca_pml_bfo_send_pending_t;
40 
43  mca_bml_base_endpoint_t* req_endpoint;
44  ompi_ptr_t req_recv;
45 #if PML_BFO
46  int32_t req_events; /* number of outstanding events on request */
47  int32_t req_restartseq; /* sequence number of restarted request */
48  int32_t req_restart; /* state of restarted request */
49  int32_t req_error; /* non-zero when error has occurred on request */
50 #endif /* PML_BFO */
51  int32_t req_state;
52  int32_t req_lock;
53  bool req_throttle_sends;
54  size_t req_pipeline_depth;
55  size_t req_bytes_delivered;
56  uint32_t req_rdma_cnt;
57  mca_pml_bfo_send_pending_t req_pending;
58  opal_mutex_t req_send_range_lock;
59  opal_list_t req_send_ranges;
60  mca_pml_bfo_com_btl_t req_rdma[1];
61 };
63 
65 
68  uint64_t range_send_offset;
69  uint64_t range_send_length;
70  int range_btl_idx;
71  int range_btl_cnt;
72  mca_pml_bfo_com_btl_t range_btls[1];
73 };
76 
77 static inline bool lock_send_request(mca_pml_bfo_send_request_t *sendreq)
78 {
79  return OPAL_THREAD_ADD32(&sendreq->req_lock, 1) == 1;
80 }
81 
82 static inline bool unlock_send_request(mca_pml_bfo_send_request_t *sendreq)
83 {
84  return OPAL_THREAD_ADD32(&sendreq->req_lock, -1) == 0;
85 }
86 
87 static inline void
88 add_request_to_send_pending(mca_pml_bfo_send_request_t* sendreq,
89  const mca_pml_bfo_send_pending_t type,
90  const bool append)
91 {
92  opal_list_item_t *item = (opal_list_item_t*)sendreq;
93 
94  OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
95  sendreq->req_pending = type;
96  if(append)
97  opal_list_append(&mca_pml_bfo.send_pending, item);
98  else
99  opal_list_prepend(&mca_pml_bfo.send_pending, item);
100 
101  OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
102 }
103 
104 static inline mca_pml_bfo_send_request_t*
105 get_request_from_send_pending(mca_pml_bfo_send_pending_t *type)
106 {
108 
109  OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
110  sendreq = (mca_pml_bfo_send_request_t*)
111  opal_list_remove_first(&mca_pml_bfo.send_pending);
112  if(sendreq) {
113  *type = sendreq->req_pending;
114  sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE;
115  }
116  OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
117 
118  return sendreq;
119 }
120 
/*
 * Obtain a send request from mca_pml_base_send_requests for peer `dst`
 * of communicator `comm`.
 *
 *   comm     communicator used for the peer lookup
 *   dst      peer rank passed to ompi_comm_peer_lookup()
 *   sendreq  [out] lvalue receiving the request; only assigned when the
 *            peer lookup succeeded
 *   rc       [out] lvalue; OMPI_ERR_OUT_OF_RESOURCE when the peer lookup
 *            returned NULL, otherwise whatever OMPI_FREE_LIST_WAIT leaves
 *            in it (it is preset to OMPI_SUCCESS first)
 *
 * Every macro argument is parenthesized so expression arguments expand
 * safely.  The block uses the local names `proc` and `item`; callers must
 * not pass expressions that refer to variables with those names.
 */
#define MCA_PML_BFO_SEND_REQUEST_ALLOC( comm,                               \
                                        dst,                                \
                                        sendreq,                            \
                                        rc)                                 \
    {                                                                       \
        ompi_proc_t *proc = ompi_comm_peer_lookup( (comm), (dst) );         \
        ompi_free_list_item_t* item;                                        \
                                                                            \
        (rc) = OMPI_ERR_OUT_OF_RESOURCE;                                    \
        if( OPAL_LIKELY(NULL != proc) ) {                                   \
            (rc) = OMPI_SUCCESS;                                            \
            OMPI_FREE_LIST_WAIT(&mca_pml_base_send_requests, item, (rc));   \
            (sendreq) = (mca_pml_bfo_send_request_t*)item;                  \
            (sendreq)->req_send.req_base.req_proc = proc;                   \
        }                                                                   \
    }
137 
138 
/*
 * Initialize a BFO send request: forwards all arguments to
 * MCA_PML_BASE_SEND_REQUEST_INIT (with convertor flags 0) on the embedded
 * base request, then clears req_recv.pval.
 *
 * `sendreq` is parenthesized at every use so that an expression argument
 * expands correctly.
 */
#define MCA_PML_BFO_SEND_REQUEST_INIT( sendreq,                         \
                                       buf,                             \
                                       count,                           \
                                       datatype,                        \
                                       dst,                             \
                                       tag,                             \
                                       comm,                            \
                                       sendmode,                        \
                                       persistent)                      \
    {                                                                   \
        MCA_PML_BASE_SEND_REQUEST_INIT(&(sendreq)->req_send,            \
                                       buf,                             \
                                       count,                           \
                                       datatype,                        \
                                       dst,                             \
                                       tag,                             \
                                       comm,                            \
                                       sendmode,                        \
                                       persistent,                      \
                                       0); /* convertor_flags */        \
        (sendreq)->req_recv.pval = NULL;                                \
    }
161 
162 
163 static inline void mca_pml_bfo_free_rdma_resources(mca_pml_bfo_send_request_t* sendreq)
164 {
165  size_t r;
166 
167  /* return mpool resources */
168  for(r = 0; r < sendreq->req_rdma_cnt; r++) {
169  mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg;
170  if( NULL != reg && reg->mpool != NULL ) {
171  reg->mpool->mpool_deregister(reg->mpool, reg);
172  }
173  }
174  sendreq->req_rdma_cnt = 0;
175 }
176 
177 
/**
 * Start a send request.
 *
 * Calls mca_pml_bfo_send_request_start(sendreq) and stores its return
 * value in the lvalue `rc`.  Both arguments are parenthesized so that
 * expression arguments expand safely.
 */

#define MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc)             \
    do {                                                        \
        (rc) = mca_pml_bfo_send_request_start((sendreq));       \
    } while (0)
186 
187 
/*
 * Mark a send request as completed at the MPI level.
 *
 * Fills the request status (MPI_SOURCE from the communicator's own rank,
 * MPI_TAG from the request tag, MPI_ERROR = OMPI_SUCCESS, _ucount from
 * req_bytes_packed), calls ompi_request_complete() with `with_signal`,
 * and then emits the PERUSE_COMM_REQ_COMPLETE trace event.
 *
 * `sendreq` is parenthesized at every use, including the PERUSE trace.
 */

#define MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, with_signal)                  \
do {                                                                                 \
    (sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE =                    \
        (sendreq)->req_send.req_base.req_comm->c_my_rank;                            \
    (sendreq)->req_send.req_base.req_ompi.req_status.MPI_TAG =                       \
        (sendreq)->req_send.req_base.req_tag;                                        \
    (sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS;       \
    (sendreq)->req_send.req_base.req_ompi.req_status._ucount =                       \
        (sendreq)->req_send.req_bytes_packed;                                        \
    ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi), (with_signal) ); \
                                                                                     \
    PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE,                               \
                             &((sendreq)->req_send.req_base), PERUSE_SEND);          \
} while(0)
206 
/*
 * Release resources associated with a request.
 *
 * Runs MCA_PML_BASE_SEND_REQUEST_FINI on the embedded base request and
 * then hands the request back to mca_pml_base_send_requests.
 *
 * The cast to ompi_free_list_item_t* is applied to the parenthesized
 * argument, so it covers the whole expression the caller passed.
 */

#define MCA_PML_BFO_SEND_REQUEST_RETURN(sendreq)                        \
    do {                                                                \
        /* Let the base handle the reference counts */                  \
        MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send));         \
        OMPI_FREE_LIST_RETURN( &mca_pml_base_send_requests,             \
                               (ompi_free_list_item_t*)(sendreq));      \
    } while(0)
218 
219 
220 /*
221  * The PML has completed a send request. Note that this request
222  * may have been orphaned by the user or have already completed
223  * at the MPI level.
224  * This function will never be called directly from the upper level, as it
225  * should only be an internal call to the PML.
226  *
227  */
228 static inline void
229 send_request_pml_complete(mca_pml_bfo_send_request_t *sendreq)
230 {
231  assert(false == sendreq->req_send.req_base.req_pml_complete);
232 
233  if(sendreq->req_send.req_bytes_packed > 0) {
234  PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
235  &(sendreq->req_send.req_base), PERUSE_SEND);
236  }
237 
238  /* return mpool resources */
239  mca_pml_bfo_free_rdma_resources(sendreq);
240 
241  if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED &&
242  sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) {
243  mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq);
244  }
245 
246  OPAL_THREAD_LOCK(&ompi_request_lock);
247  if(false == sendreq->req_send.req_base.req_ompi.req_complete) {
248  /* Should only be called for long messages (maybe synchronous) */
249  MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, true);
250  }
251  sendreq->req_send.req_base.req_pml_complete = true;
252 #if PML_BFO
253  sendreq->req_send.req_base.req_sequence -= 100;
254 #endif /* PML_BFO */
255 
256  if(sendreq->req_send.req_base.req_free_called) {
257  MCA_PML_BFO_SEND_REQUEST_RETURN(sendreq);
258  }
259  OPAL_THREAD_UNLOCK(&ompi_request_lock);
260 }
261 
262 /* returns true if request was completed on PML level */
263 static inline bool
264 send_request_pml_complete_check(mca_pml_bfo_send_request_t *sendreq)
265 {
266 #if OPAL_ENABLE_MULTI_THREADS
267  opal_atomic_rmb();
268 #endif
269  /* if no more events are expected for the request and the whole message is
270  * already sent and send fragment scheduling isn't running in another
271  * thread then complete the request on PML level. From now on, if user
272  * called free on this request, the request structure can be reused for
273  * another request or if the request is persistent it can be restarted */
274  if(sendreq->req_state == 0 &&
275  sendreq->req_bytes_delivered >= sendreq->req_send.req_bytes_packed
276  && lock_send_request(sendreq)) {
277  send_request_pml_complete(sendreq);
278  return true;
279  }
280 
281  return false;
282 }
283 
284 /**
285  * Schedule additional fragments
286  */
287 int
288 mca_pml_bfo_send_request_schedule_once(mca_pml_bfo_send_request_t*);
289 
290 static inline int
291 mca_pml_bfo_send_request_schedule_exclusive(mca_pml_bfo_send_request_t* sendreq)
292 {
293  int rc;
294  do {
295  rc = mca_pml_bfo_send_request_schedule_once(sendreq);
296  if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE)
297  break;
298  } while(!unlock_send_request(sendreq));
299 
300  if(OMPI_SUCCESS == rc)
301  send_request_pml_complete_check(sendreq);
302 
303  return rc;
304 }
305 
306 static inline void
307 mca_pml_bfo_send_request_schedule(mca_pml_bfo_send_request_t* sendreq)
308 {
309  /*
310  * Only allow one thread in this routine for a given request.
311  * However, we cannot block callers on a mutex, so simply keep track
312  * of the number of times the routine has been called and run through
313  * the scheduling logic once for every call.
314  */
315 
316  if(!lock_send_request(sendreq))
317  return;
318 
319  mca_pml_bfo_send_request_schedule_exclusive(sendreq);
320 }
321 
322 #if OMPI_CUDA_SUPPORT
323 int mca_pml_bfo_send_request_start_cuda(
324  mca_pml_bfo_send_request_t* sendreq,
325  mca_bml_base_btl_t* bml_btl,
326  size_t size);
327 #endif /* OMPI_CUDA_SUPPORT */
328 
329 /**
330  * Start the specified request
331  */
332 
333 int mca_pml_bfo_send_request_start_buffered(
335  mca_bml_base_btl_t* bml_btl,
336  size_t size);
337 
338 int mca_pml_bfo_send_request_start_copy(
340  mca_bml_base_btl_t* bml_btl,
341  size_t size);
342 
343 int mca_pml_bfo_send_request_start_prepare(
345  mca_bml_base_btl_t* bml_btl,
346  size_t size);
347 
348 int mca_pml_bfo_send_request_start_rdma(
350  mca_bml_base_btl_t* bml_btl,
351  size_t size);
352 
353 int mca_pml_bfo_send_request_start_rndv(
355  mca_bml_base_btl_t* bml_btl,
356  size_t size,
357  int flags);
358 
359 static inline int
360 mca_pml_bfo_send_request_start_btl( mca_pml_bfo_send_request_t* sendreq,
361  mca_bml_base_btl_t* bml_btl )
362 {
363  size_t size = sendreq->req_send.req_bytes_packed;
364  mca_btl_base_module_t* btl = bml_btl->btl;
365  size_t eager_limit = btl->btl_eager_limit - sizeof(mca_pml_bfo_hdr_t);
366  int rc;
367 
368  if( OPAL_LIKELY(size <= eager_limit) ) {
369  switch(sendreq->req_send.req_send_mode) {
370  case MCA_PML_BASE_SEND_SYNCHRONOUS:
371  rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, 0);
372  break;
373  case MCA_PML_BASE_SEND_BUFFERED:
374  rc = mca_pml_bfo_send_request_start_copy(sendreq, bml_btl, size);
375  break;
376  case MCA_PML_BASE_SEND_COMPLETE:
377  rc = mca_pml_bfo_send_request_start_prepare(sendreq, bml_btl, size);
378  break;
379  default:
380  if (size != 0 && bml_btl->btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) {
381  rc = mca_pml_bfo_send_request_start_prepare(sendreq, bml_btl, size);
382  } else {
383  rc = mca_pml_bfo_send_request_start_copy(sendreq, bml_btl, size);
384  }
385  break;
386  }
387  } else {
388  size = eager_limit;
389  if(OPAL_UNLIKELY(btl->btl_rndv_eager_limit < eager_limit))
390  size = btl->btl_rndv_eager_limit;
391  if(sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) {
392  rc = mca_pml_bfo_send_request_start_buffered(sendreq, bml_btl, size);
393  } else if
394  (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
395  unsigned char *base;
396  opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
397 
398  if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_bfo_rdma_btls(
399  sendreq->req_endpoint,
400  base,
401  sendreq->req_send.req_bytes_packed,
402  sendreq->req_rdma))) {
403  rc = mca_pml_bfo_send_request_start_rdma(sendreq, bml_btl,
404  sendreq->req_send.req_bytes_packed);
405  if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
406  mca_pml_bfo_free_rdma_resources(sendreq);
407  }
408  } else {
409  rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size,
410  MCA_PML_BFO_HDR_FLAGS_CONTIG);
411  }
412  } else {
413 #if OMPI_CUDA_SUPPORT
414  if (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) {
415  return mca_pml_bfo_send_request_start_cuda(sendreq, bml_btl, size);
416  }
417 #endif /* OMPI_CUDA_SUPPORT */
418  rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, 0);
419  }
420  }
421 
422  return rc;
423 }
424 
425 static inline int
426 mca_pml_bfo_send_request_start( mca_pml_bfo_send_request_t* sendreq )
427 {
428  mca_pml_bfo_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm;
430  sendreq->req_send.req_base.req_proc->proc_bml;
431  size_t i;
432 
433  if( OPAL_UNLIKELY(endpoint == NULL) ) {
434  return OMPI_ERR_UNREACH;
435  }
436 
437  sendreq->req_endpoint = endpoint;
438  sendreq->req_state = 0;
439  sendreq->req_lock = 0;
440  sendreq->req_pipeline_depth = 0;
441  sendreq->req_bytes_delivered = 0;
442  sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE;
443  sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32(
444  &comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1);
445 #if PML_BFO
446  sendreq->req_restartseq = 0; /* counts up restarts */
447  sendreq->req_restart = 0; /* reset in case we restart again */
448  sendreq->req_error = 0; /* clear error state */
449  sendreq->req_events = 0; /* clear events, probably 0 anyways */
450 #endif /* PML_BFO */
451 
452  MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );
453 
454  for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
455  mca_bml_base_btl_t* bml_btl;
456  int rc;
457 
458  /* select a btl */
459  bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
460  rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
461  if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc)) )
462  return rc;
463  }
464  add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
465 
466  return OMPI_SUCCESS;
467 }
468 
469 /**
470  * Initiate a put scheduled by the receiver.
471  */
472 
473 void mca_pml_bfo_send_request_put( mca_pml_bfo_send_request_t* sendreq,
475  mca_pml_bfo_rdma_hdr_t* hdr );
476 
477 int mca_pml_bfo_send_request_put_frag(mca_pml_bfo_rdma_frag_t* frag);
478 
/* This function tries to continue a sendreq that was stuck because of
 * resource unavailability. A sendreq may be added to the send_pending list
 * if there is no resource to send the initial packet or there is no resource
 * to schedule data for sending. The reason the sendreq was added to the list
 * is stored inside the sendreq struct, and the appropriate operation is
 * retried when a resource becomes available. The bml_btl passed to the
 * function does not represent the sendreq destination; it represents the BTL
 * on which a resource was freed, so only this BTL should be considered for
 * sending packets. */
487 void mca_pml_bfo_send_request_process_pending(mca_bml_base_btl_t *bml_btl);
488 
489 void mca_pml_bfo_send_request_copy_in_out(mca_pml_bfo_send_request_t *sendreq,
490  uint64_t send_offset, uint64_t send_length);
491 
492 END_C_DECLS
493 
494 #endif /* OMPI_PML_BFO_SEND_REQUEST_H */
#define OPAL_THREAD_ADD32(x, y)
Use an atomic operation for increment/decrement if opal_using_threads() indicates that threads are in...
Definition: mutex.h:367
struct mca_bml_base_endpoint_t * proc_bml
BML specific proc data.
Definition: proc.h:64
Definition: pml_bfo.h:299
uint32_t btl_flags
support for put/get?
Definition: bml.h:59
void opal_atomic_rmb(void)
Read memory barrier.
struct ompi_communicator_t * req_comm
communicator pointer
Definition: pml_base_request.h:63
Definition: types.h:52
Cached on ompi_communicator_t to hold queues/state used by the PML<->PTL interface for matching logic...
Definition: pml_bfo_comm.h:51
Definition: mutex_unix.h:53
Union of defined hdr types.
Definition: pml_bfo_hdr.h:441
size_t btl_rndv_eager_limit
the size of a data sent in a first fragment of rendezvous protocol
Definition: btl.h:791
int32_t send_sequence
send side sequence number
Definition: pml_bfo_comm.h:38
int32_t req_peer
peer process - rank w/in this communicator
Definition: pml_base_request.h:71
Definition: mpool.h:44
BML Management Layer (BML)
Definition: opal_list.h:98
#define OPAL_THREAD_LOCK(mutex)
Lock a mutex if opal_using_threads() says that multiple threads may be active in the process...
Definition: mutex.h:223
Header used to initiate an RDMA operation.
Definition: pml_bfo_hdr.h:294
void * req_addr
pointer to send buffer - may not be application buffer
Definition: pml_base_sendreq.h:39
#define OPAL_THREAD_UNLOCK(mutex)
Unlock a mutex if opal_using_threads() says that multiple threads may be active in the process...
Definition: mutex.h:309
opal_convertor_t req_convertor
always need the convertor
Definition: pml_base_request.h:66
mca_pml_base_request_t req_base
base request type - common data structure for use by wait/test
Definition: pml_base_sendreq.h:38
#define opal_list_append(l, i)
Append an item to the end of the list.
Definition: opal_list.h:410
volatile bool req_pml_complete
flag indicating if the pt-2-pt layer is done with this request
Definition: pml_base_request.h:61
Byte Transfer Layer (BTL)
struct ompi_proc_t * req_proc
peer process
Definition: pml_base_request.h:73
mca_bml_base_btl_array_t btl_eager
array of btls to use for first fragments
Definition: bml.h:228
volatile bool req_complete
Flag indicating whether request has completed.
Definition: request.h:104
mca_mpool_base_module_deregister_fn_t mpool_deregister
deregister memory
Definition: mpool.h:181
Base type for send requests.
Definition: pml_base_sendreq.h:37
Definition: ompi_free_list.h:62
Definition: pml_bfo_sendreq.h:66
Structure associated w/ ompi_proc_t that contains the set of BTLs used to reach a destination...
Definition: bml.h:222
void * req_addr
pointer to application buffer
Definition: pml_base_request.h:69
Definition: pml_bfo_rdmafrag.h:36
#define MCA_PML_BASE_SEND_START(request)
Mark the request as started from the PML base point of view.
Definition: pml_base_sendreq.h:120
Definition: bml.h:58
Definition: opal_list.h:147
static opal_list_item_t * opal_list_remove_first(opal_list_t *list)
Remove the first item from the list and return it.
Definition: opal_list.h:522
static void opal_list_prepend(opal_list_t *list, opal_list_item_t *item)
Prepend an item to the beginning of the list.
Definition: opal_list.h:469
struct mca_btl_base_module_t * btl
BTL module.
Definition: bml.h:61
Definition: pml_bfo_sendreq.h:41
ompi_request_t req_ompi
base request
Definition: pml_base_request.h:60
static size_t mca_bml_base_btl_array_get_size(mca_bml_base_btl_array_t *array)
If required, reallocate (grow) the array to the indicate size.
Definition: bml.h:91
BTL module interface functions and attributes.
Definition: btl.h:786
size_t req_bytes_packed
packed size of a message given the datatype and count
Definition: pml_base_sendreq.h:40
uint32_t flags
the properties of this convertor
Definition: opal_convertor.h:93
mca_pml_base_send_mode_t req_send_mode
type of send
Definition: pml_base_sendreq.h:41
Main top-level request struct definition.
Definition: request.h:100
size_t btl_eager_limit
maximum size of first fragment – eager send
Definition: btl.h:790
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236
uint64_t req_sequence
sequence number for MPI pt-2-pt ordering
Definition: pml_base_request.h:74
volatile bool req_free_called
flag indicating if the user has freed this request
Definition: pml_base_request.h:65
static mca_bml_base_btl_t * mca_bml_base_btl_array_get_next(mca_bml_base_btl_array_t *array)
Return the next LRU index in the array.
Definition: bml.h:179