OpenMPI  0.1.1
pml_bfo_recvreq.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2010 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
13  * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
14  * $COPYRIGHT$
15  *
16  * Additional copyrights may follow
17  *
18  * $HEADER$
19  */
20 /**
21  * @file
22  */
23 #ifndef OMPI_PML_BFO_RECV_REQUEST_H
24 #define OMPI_PML_BFO_RECV_REQUEST_H
25 
26 #include "pml_bfo.h"
27 #include "pml_bfo_rdma.h"
28 #include "pml_bfo_rdmafrag.h"
29 #include "ompi/proc/proc.h"
33 #if PML_BFO
34 #define RECVREQ_RECVERRSENT 0x01
35 #define RECVREQ_RNDVRESTART_RECVED 0x02
36 #define RECVREQ_RNDVRESTART_ACKED 0x04
37 #endif /* PML_BFO */
38 
39 BEGIN_C_DECLS
40 
43  ompi_ptr_t remote_req_send;
44 #if PML_BFO
45  int32_t req_msgseq; /* PML sequence number */
46  int32_t req_events; /* number of outstanding events on request */
47  int32_t req_restartseq; /* sequence number of restarted request */
48  int32_t req_errstate; /* state of request if in error */
49 #endif /* PML_BFO */
50  int32_t req_lock;
51  size_t req_pipeline_depth;
52  size_t req_bytes_received; /**< amount of data transferred into the user buffer */
53  size_t req_bytes_expected; /**< local size of the data as suggested by the user */
54  size_t req_rdma_offset;
55  size_t req_send_offset;
56  uint32_t req_rdma_cnt;
57  uint32_t req_rdma_idx;
58  bool req_pending;
59  bool req_ack_sent; /**< whether ack was sent to the sender */
60  bool req_match_received; /**< Prevent request to be completed prematurely */
61  opal_mutex_t lock;
62  mca_pml_bfo_com_btl_t req_rdma[1];
63 };
65 
67 
68 static inline bool lock_recv_request(mca_pml_bfo_recv_request_t *recvreq)
69 {
70  return OPAL_THREAD_ADD32(&recvreq->req_lock, 1) == 1;
71 }
72 
73 static inline bool unlock_recv_request(mca_pml_bfo_recv_request_t *recvreq)
74 {
75  return OPAL_THREAD_ADD32(&recvreq->req_lock, -1) == 0;
76 }
77 
78 /**
79  * Allocate a recv request from the modules free list.
80  *
81  * @param rc (OUT) OMPI_SUCCESS or error status on failure.
82  * @return Receive request.
83  */
#define MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq, rc)                 \
do {                                                                \
    ompi_free_list_item_t* item;                                    \
    rc = OMPI_SUCCESS;                                              \
    /* on failure OMPI_FREE_LIST_GET overwrites rc with an error    \
     * code and item may be NULL -- callers must check rc */        \
    OMPI_FREE_LIST_GET(&mca_pml_base_recv_requests, item, rc);      \
    recvreq = (mca_pml_bfo_recv_request_t*)item;                    \
} while(0)
91 
92 
93 /**
94  * Initialize a receive request with call parameters.
95  *
96  * @param request (IN) Receive request.
97  * @param addr (IN) User buffer.
98  * @param count (IN) Number of elements of indicated datatype.
99  * @param datatype (IN) User defined datatype.
100  * @param src (IN) Source rank w/in the communicator.
101  * @param tag (IN) User defined tag.
102  * @param comm (IN) Communicator.
 * @param persistent (IN) Is this a persistent request.
104  */
/* Thin pass-through: initializes the embedded base receive request with
 * the caller's parameters (see the base macro for the actual work). */
#define MCA_PML_BFO_RECV_REQUEST_INIT( request, addr, count, datatype,  \
                                       src, tag, comm, persistent)      \
do {                                                                    \
    MCA_PML_BASE_RECV_REQUEST_INIT( &(request)->req_recv, addr, count,  \
                                    datatype, src, tag, comm,           \
                                    persistent);                        \
} while(0)
123 
124 /**
125  * Mark the request as completed at MPI level for internal purposes.
126  *
127  * @param recvreq (IN) Receive request.
128  */
/**
 * Mark the request as completed at MPI level for internal purposes.
 * Emits the PERUSE completion trace, then completes the OMPI request.
 * Fix: parenthesize every use of the macro argument (it was expanded
 * bare, which breaks for non-trivial argument expressions).
 *
 * @param recvreq (IN) Receive request.
 */
#define MCA_PML_BFO_RECV_REQUEST_MPI_COMPLETE( recvreq )                          \
    do {                                                                          \
        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE,                        \
                                 &((recvreq)->req_recv.req_base), PERUSE_RECV );  \
        ompi_request_complete( &((recvreq)->req_recv.req_base.req_ompi), true );  \
    } while (0)
135 
136 /*
137  * Free the PML receive request
138  */
/*
 * Free the PML receive request: finalize the base request and hand the
 * storage back to the global free list.
 * Fix: wrap the multi-statement body in do { } while (0) -- the previous
 * bare-brace form mis-parses when used as `if (x) MACRO(r); else ...`.
 */
#define MCA_PML_BFO_RECV_REQUEST_RETURN(recvreq)                    \
    do {                                                            \
        MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv);       \
        OMPI_FREE_LIST_RETURN( &mca_pml_base_recv_requests,         \
                               (ompi_free_list_item_t*)(recvreq));  \
    } while (0)
145 
146 /**
147  * Complete receive request. Request structure cannot be accessed after calling
148  * this function any more.
149  *
150  * @param recvreq (IN) Receive request.
151  */
152 static inline void
154 {
155  size_t i;
156 
157  assert(false == recvreq->req_recv.req_base.req_pml_complete);
158 
159  if(recvreq->req_recv.req_bytes_packed > 0) {
160  PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
161  &recvreq->req_recv.req_base, PERUSE_RECV );
162  }
163 
164  for(i = 0; i < recvreq->req_rdma_cnt; i++) {
165  mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg;
166  if( NULL != btl_reg && btl_reg->mpool != NULL) {
167  btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
168  }
169  }
170  recvreq->req_rdma_cnt = 0;
171 #if PML_BFO
172  recvreq->req_msgseq -= 100;
173 #endif /* PML_BFO */
174 
175  OPAL_THREAD_LOCK(&ompi_request_lock);
176  if(true == recvreq->req_recv.req_base.req_free_called) {
177  MCA_PML_BFO_RECV_REQUEST_RETURN(recvreq);
178  } else {
179  /* initialize request status */
180  recvreq->req_recv.req_base.req_pml_complete = true;
181  recvreq->req_recv.req_base.req_ompi.req_status._ucount =
182  recvreq->req_bytes_received;
183  if (recvreq->req_recv.req_bytes_packed > recvreq->req_bytes_expected) {
184  recvreq->req_recv.req_base.req_ompi.req_status._ucount =
185  recvreq->req_recv.req_bytes_packed;
186  recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =
187  MPI_ERR_TRUNCATE;
188  }
190  }
191  OPAL_THREAD_UNLOCK(&ompi_request_lock);
192 }
193 
/* Check whether the request is finished and, if so, complete it.
 * Completion requires: the match has been received, all packed bytes have
 * arrived, (under PML_BFO) no BTL events are still outstanding, and this
 * thread wins the scheduling lock so only one thread performs completion.
 * Returns true when completion was performed; after that the request must
 * not be touched. */
static inline bool
recv_request_pml_complete_check(mca_pml_bfo_recv_request_t *recvreq)
{
#if OPAL_ENABLE_MULTI_THREADS
    opal_atomic_rmb();   /* observe flag updates made by other threads */
#endif
    if(recvreq->req_match_received &&
       recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed &&
#if PML_BFO
       (0 == recvreq->req_events) && lock_recv_request(recvreq)) {
#else /* PML_BFO */
       lock_recv_request(recvreq)) {
#endif /* PML_BFO */
        recv_request_pml_complete(recvreq);
        return true;
    }

    return false;
}
213 
/* Start a posted receive request: delegates to mca_pml_bfo_recv_req_start(). */
#define MCA_PML_BFO_RECV_REQUEST_START(r) mca_pml_bfo_recv_req_start(r)
216 
/* Prepare the request's convertor for receiving into the user buffer and
 * compute req_bytes_expected from the datatype/count.
 * NOTE: the bitwise `|` below only tests "either is non-zero" -- presumably
 * a deliberate branch-avoidance idiom rather than a typo for `||`; both
 * give identical truth value here. Empty (0-size or 0-count) receives skip
 * convertor preparation entirely. */
static inline void prepare_recv_req_converter(mca_pml_bfo_recv_request_t *req)
{
    if( req->req_recv.req_base.req_datatype->super.size | req->req_recv.req_base.req_count ) {
        opal_convertor_copy_and_prepare_for_recv(
            req->req_recv.req_base.req_proc->proc_convertor,
            &(req->req_recv.req_base.req_datatype->super),
            req->req_recv.req_base.req_count,
            req->req_recv.req_base.req_addr,
            0,
            &req->req_recv.req_base.req_convertor);
        opal_convertor_get_unpacked_size(&req->req_recv.req_base.req_convertor,
                                         &req->req_bytes_expected);
    }
}
231 
/* Record match information (source/tag/sequence) from an incoming fragment
 * header onto the receive request; see recv_req_matched(). */
#define MCA_PML_BFO_RECV_REQUEST_MATCHED(request, hdr) \
        recv_req_matched(request, hdr)
234 
235 static inline void recv_req_matched(mca_pml_bfo_recv_request_t *req,
237 {
238  req->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_src;
239  req->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_tag;
240  req->req_match_received = true;
241 #if PML_BFO
242  req->req_msgseq = hdr->hdr_seq;
243 #endif /* PML_BFO */
244 #if OPAL_ENABLE_MULTI_THREADS
245  opal_atomic_wmb();
246 #endif
247  if(req->req_recv.req_bytes_packed > 0) {
248 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
249  if(MPI_ANY_SOURCE == req->req_recv.req_base.req_peer) {
250  /* non wildcard prepared during post recv */
251  prepare_recv_req_converter(req);
252  }
253 #endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT */
254  PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_XFER_BEGIN,
255  &req->req_recv.req_base, PERUSE_RECV);
256  }
257 }
258 
259 
/**
 * Unpack (a portion of) the received segment data into the user buffer.
 *
 * Builds an iovec over the segment list starting seg_offset bytes in,
 * positions the convertor at data_offset, and unpacks up to
 * bytes_received bytes; bytes_delivered receives the amount actually
 * unpacked. No-op for zero-length messages (bytes_delivered = 0).
 *
 * Fix: the PERUSE trace referenced `recvreq`, which is NOT a parameter of
 * this macro -- it only compiled when the caller's variable happened to be
 * named recvreq. It now uses the `request` parameter.
 */
#define MCA_PML_BFO_RECV_REQUEST_UNPACK( request,                                 \
                                         segments,                                \
                                         num_segments,                            \
                                         seg_offset,                              \
                                         data_offset,                             \
                                         bytes_received,                          \
                                         bytes_delivered)                         \
do {                                                                              \
    bytes_delivered = 0;                                                          \
    if(request->req_recv.req_bytes_packed > 0) {                                  \
        struct iovec iov[MCA_BTL_DES_MAX_SEGMENTS];                               \
        uint32_t iov_count = 0;                                                   \
        size_t max_data = bytes_received;                                         \
        size_t n, offset = seg_offset;                                            \
        mca_btl_base_segment_t* segment = segments;                               \
                                                                                  \
        OPAL_THREAD_LOCK(&request->lock);                                         \
        for( n = 0; n < num_segments; n++, segment++ ) {                          \
            if(offset >= segment->seg_len) {                                      \
                offset -= segment->seg_len;                                       \
            } else {                                                              \
                iov[iov_count].iov_len = segment->seg_len - offset;               \
                iov[iov_count].iov_base = (IOVBASE_TYPE*)                         \
                    ((unsigned char*)segment->seg_addr.pval + offset);            \
                iov_count++;                                                      \
                offset = 0;                                                       \
            }                                                                     \
        }                                                                         \
        PERUSE_TRACE_COMM_OMPI_EVENT (PERUSE_COMM_REQ_XFER_CONTINUE,              \
                                      &(request->req_recv.req_base), max_data,    \
                                      PERUSE_RECV);                               \
        opal_convertor_set_position( &(request->req_recv.req_base.req_convertor), \
                                     &data_offset );                              \
        opal_convertor_unpack( &(request)->req_recv.req_base.req_convertor,       \
                               iov,                                               \
                               &iov_count,                                        \
                               &max_data );                                       \
        bytes_delivered = max_data;                                               \
        OPAL_THREAD_UNLOCK(&request->lock);                                       \
    }                                                                             \
} while (0)
305 
306 
307 /**
308  *
309  */
310 
311 void mca_pml_bfo_recv_request_progress_match(
313  struct mca_btl_base_module_t* btl,
314  mca_btl_base_segment_t* segments,
315  size_t num_segments);
316 
317 /**
318  *
319  */
320 
321 void mca_pml_bfo_recv_request_progress_frag(
323  struct mca_btl_base_module_t* btl,
324  mca_btl_base_segment_t* segments,
325  size_t num_segments);
326 
327 /**
328  *
329  */
330 
333  struct mca_btl_base_module_t* btl,
334  mca_btl_base_segment_t* segments,
335  size_t num_segments);
336 
337 /**
338  *
339  */
340 
341 void mca_pml_bfo_recv_request_progress_rget(
343  struct mca_btl_base_module_t* btl,
344  mca_btl_base_segment_t* segments,
345  size_t num_segments);
346 
347 /**
348  *
349  */
350 
353  struct mca_btl_base_module_t* btl,
354  mca_btl_base_segment_t* segments,
355  size_t num_segments);
356 
357 /**
358  *
359  */
360 
361 int mca_pml_bfo_recv_request_schedule_once(
362  mca_pml_bfo_recv_request_t* req, mca_bml_base_btl_t* start_bml_btl);
363 
364 static inline int mca_pml_bfo_recv_request_schedule_exclusive(
366  mca_bml_base_btl_t* start_bml_btl)
367 {
368  int rc;
369 
370  do {
371  rc = mca_pml_bfo_recv_request_schedule_once(req, start_bml_btl);
372  if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE)
373  break;
374  } while(!unlock_recv_request(req));
375 
376  if(OMPI_SUCCESS == rc)
377  recv_request_pml_complete_check(req);
378 
379  return rc;
380 }
381 
382 static inline void mca_pml_bfo_recv_request_schedule(
384  mca_bml_base_btl_t* start_bml_btl)
385 {
386  if(!lock_recv_request(req))
387  return;
388 
389  (void)mca_pml_bfo_recv_request_schedule_exclusive(req, start_bml_btl);
390 }
391 
/* Queue an ACK packet for later transmission when no BTL could send it now.
 * P = peer proc, S = sender's request handle, D = receiver's request
 * pointer, O = send offset being acknowledged.
 * NOTE(review): _rc from MCA_PML_BFO_PCKT_PENDING_ALLOC is never checked,
 * so a failed allocation would dereference an invalid _pckt -- confirm the
 * allocator cannot fail (or aborts) before relying on this. */
#define MCA_PML_BFO_ADD_ACK_TO_PENDING(P, S, D, O)                  \
    do {                                                            \
        mca_pml_bfo_pckt_pending_t *_pckt;                          \
        int _rc;                                                    \
                                                                    \
        MCA_PML_BFO_PCKT_PENDING_ALLOC(_pckt,_rc);                  \
        _pckt->hdr.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_ACK;  \
        _pckt->hdr.hdr_ack.hdr_src_req.lval = (S);                  \
        _pckt->hdr.hdr_ack.hdr_dst_req.pval = (D);                  \
        _pckt->hdr.hdr_ack.hdr_send_offset = (O);                   \
        _pckt->proc = (P);                                          \
        _pckt->bml_btl = NULL;                                      \
        OPAL_THREAD_LOCK(&mca_pml_bfo.lock);                        \
        opal_list_append(&mca_pml_bfo.pckt_pending,                 \
                         (opal_list_item_t*)_pckt);                 \
        OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);                      \
    } while(0)
409 
410 int mca_pml_bfo_recv_request_ack_send_btl(ompi_proc_t* proc,
411  mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req,
412  uint64_t hdr_rdma_offset, bool nordma);
413 
414 static inline int mca_pml_bfo_recv_request_ack_send(ompi_proc_t* proc,
415  uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
416  bool nordma)
417 {
418  size_t i;
419  mca_bml_base_btl_t* bml_btl;
420  mca_bml_base_endpoint_t* endpoint =
422 
423  for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
424  bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
425  if(mca_pml_bfo_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req,
426  hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS)
427  return OMPI_SUCCESS;
428  }
429 
430  MCA_PML_BFO_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,
431  hdr_send_offset);
432 
433  return OMPI_ERR_OUT_OF_RESOURCE;
434 }
435 
436 int mca_pml_bfo_recv_request_get_frag(mca_pml_bfo_rdma_frag_t* frag);
437 
/* This function tries to continue a recvreq that is stuck due to resource
 * unavailability. The recvreq is added to the recv_pending list if scheduling
 * of the put operation cannot be accomplished for some reason. */
441 void mca_pml_bfo_recv_request_process_pending(void);
442 
443 END_C_DECLS
444 
445 #endif
446 
#define OPAL_THREAD_ADD32(x, y)
Use an atomic operation for increment/decrement if opal_using_threads() indicates that threads are in...
Definition: mutex.h:367
void mca_pml_bfo_recv_request_matched_probe(mca_pml_bfo_recv_request_t *req, struct mca_btl_base_module_t *btl, mca_btl_base_segment_t *segments, size_t num_segments)
Handle completion of a probe request.
Definition: pml_bfo_recvreq.c:734
struct mca_bml_base_endpoint_t * proc_bml
BML specific proc data.
Definition: proc.h:64
Definition: pml_bfo.h:299
Definition: pml_bfo_recvreq.h:41
ompi_status_public_t req_status
Completion status.
Definition: request.h:103
void opal_atomic_rmb(void)
Read memory barrier.
Header definition for the first fragment, contains the attributes required to match the corresponding...
Definition: pml_bfo_hdr.h:77
size_t req_count
count of user datatype elements
Definition: pml_base_request.h:70
int32_t hdr_tag
user tag
Definition: pml_bfo_hdr.h:81
bool req_match_received
Prevent request to be completed prematurely.
Definition: pml_bfo_recvreq.h:60
mca_pml_base_request_t req_base
base request
Definition: pml_base_recvreq.h:37
Definition: types.h:52
Definition: mutex_unix.h:53
int32_t req_peer
peer process - rank w/in this communicator
Definition: pml_base_request.h:71
Definition: mpool.h:44
Process identification structure interface.
Remote Open MPI process structure.
Definition: proc.h:56
#define MCA_PML_BFO_RECV_REQUEST_MPI_COMPLETE(recvreq)
Mark the request as completed at MPI level for internal purposes.
Definition: pml_bfo_recvreq.h:129
#define OPAL_THREAD_LOCK(mutex)
Lock a mutex if opal_using_threads() says that multiple threads may be active in the process...
Definition: mutex.h:223
#define OPAL_THREAD_UNLOCK(mutex)
Unlock a mutex if opal_using_threads() says that multiple threads may be active in the process...
Definition: mutex.h:309
opal_convertor_t req_convertor
always need the convertor
Definition: pml_base_request.h:66
static void recv_request_pml_complete(mca_pml_bfo_recv_request_t *recvreq)
Complete receive request.
Definition: pml_bfo_recvreq.h:153
volatile bool req_pml_complete
flag indicating if the pt-2-pt layer is done with this request
Definition: pml_base_request.h:61
opal_datatype_t super
Base opal_datatype_t superclass.
Definition: ompi_datatype.h:69
struct ompi_proc_t * req_proc
peer process
Definition: pml_base_request.h:73
size_t size
total size in bytes of the memory used by the data if the data is put on a contiguous buffer ...
Definition: opal_datatype.h:108
int32_t hdr_src
source rank
Definition: pml_bfo_hdr.h:80
mca_bml_base_btl_array_t btl_eager
array of btls to use for first fragments
Definition: bml.h:228
mca_mpool_base_module_deregister_fn_t mpool_deregister
deregister memory
Definition: mpool.h:181
struct ompi_datatype_t * req_datatype
pointer to data type
Definition: pml_base_request.h:64
Structure associated w/ ompi_proc_t that contains the set of BTLs used to reach a destination...
Definition: bml.h:222
void * req_addr
pointer to application buffer
Definition: pml_base_request.h:69
Definition: pml_bfo_rdmafrag.h:36
Definition: bml.h:58
void opal_atomic_wmb(void)
Write memory barrier.
struct opal_convertor_t * proc_convertor
Base convertor for the proc described by this process.
Definition: proc.h:70
Base type for receive requests.
Definition: pml_base_recvreq.h:36
size_t req_bytes_received
amount of data transferred into the user buffer
Definition: pml_bfo_recvreq.h:52
void mca_pml_bfo_recv_request_progress_rndv(mca_pml_bfo_recv_request_t *req, struct mca_btl_base_module_t *btl, mca_btl_base_segment_t *segments, size_t num_segments)
Definition: pml_bfo_recvreq.c:622
ompi_request_t req_ompi
base request
Definition: pml_base_request.h:60
bool req_ack_sent
whether ack was sent to the sender
Definition: pml_bfo_recvreq.h:59
static size_t mca_bml_base_btl_array_get_size(mca_bml_base_btl_array_t *array)
If required, reallocate (grow) the array to the indicate size.
Definition: bml.h:91
BTL module interface functions and attributes.
Definition: btl.h:786
uint16_t hdr_seq
message sequence number
Definition: pml_bfo_hdr.h:82
size_t req_bytes_expected
local size of the data as suggested by the user
Definition: pml_bfo_recvreq.h:53
void mca_pml_bfo_recv_req_start(mca_pml_bfo_recv_request_t *req)
Definition: pml_bfo_recvreq.c:1046
size_t req_bytes_packed
size of message being received
Definition: pml_base_recvreq.h:38
Describes a region/segment of memory that is addressable by an BTL.
Definition: btl.h:236
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236
volatile bool req_free_called
flag indicating if the user has freed this request
Definition: pml_base_request.h:65
static mca_bml_base_btl_t * mca_bml_base_btl_array_get_next(mca_bml_base_btl_array_t *array)
Return the next LRU index in the array.
Definition: bml.h:179