Open MPI 0.1.1
pml_ob1_recvreq.h
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2010 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2008      UT-Battelle, LLC.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/**
 * @file
 */
#ifndef OMPI_PML_OB1_RECV_REQUEST_H
#define OMPI_PML_OB1_RECV_REQUEST_H

#include "pml_ob1.h"
#include "pml_ob1_rdma.h"
#include "pml_ob1_rdmafrag.h"
#include "ompi/proc/proc.h"

BEGIN_C_DECLS

struct mca_pml_ob1_recv_request_t {
    mca_pml_base_recv_request_t req_recv;
    ompi_ptr_t remote_req_send;
    int32_t  req_lock;
    size_t   req_pipeline_depth;
    size_t   req_bytes_received;   /**< amount of data transferred into the user buffer */
    size_t   req_bytes_expected;   /**< local size of the data as suggested by the user */
    size_t   req_rdma_offset;
    size_t   req_send_offset;
    uint32_t req_rdma_cnt;
    uint32_t req_rdma_idx;
    bool req_pending;
    bool req_ack_sent;             /**< whether an ack was sent to the sender */
    bool req_match_received;       /**< prevents the request from being completed prematurely */
    opal_mutex_t lock;
    mca_pml_ob1_com_btl_t req_rdma[1];
};
typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t;

OBJ_CLASS_DECLARATION(mca_pml_ob1_recv_request_t);

/* Take a reference on the request; returns true only for the caller that
 * transitions the counter from 0 to 1 (i.e. acquires exclusive access). */
static inline bool lock_recv_request(mca_pml_ob1_recv_request_t *recvreq)
{
    return OPAL_THREAD_ADD32(&recvreq->req_lock, 1) == 1;
}

/* Drop a reference; returns true only for the caller that brings the counter
 * back to 0 (i.e. no more scheduling rounds are outstanding). */
static inline bool unlock_recv_request(mca_pml_ob1_recv_request_t *recvreq)
{
    return OPAL_THREAD_ADD32(&recvreq->req_lock, -1) == 0;
}
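
/**
 * The two helpers above implement a small counting lock over req_lock:
 * lock_recv_request() returns true only for the caller that takes the
 * counter from 0 to 1, and unlock_recv_request() returns true only for
 * the caller that drops it back to 0. A minimal, self-contained sketch
 * of the same pattern using C11 atomics (illustrative names only, not
 * part of the OMPI API):
 *
 * @code
 * #include <stdatomic.h>
 * #include <stdbool.h>
 * #include <stdio.h>
 *
 * typedef struct { atomic_int lock; } demo_req_t;
 *
 * // true only for the caller that takes the count from 0 to 1
 * static bool demo_lock(demo_req_t *r)   { return atomic_fetch_add(&r->lock, 1) + 1 == 1; }
 * // true only for the caller that drops the count back to 0
 * static bool demo_unlock(demo_req_t *r) { return atomic_fetch_sub(&r->lock, 1) - 1 == 0; }
 *
 * int main(void)
 * {
 *     demo_req_t req = { 0 };
 *     if (demo_lock(&req)) {          // first locker wins exclusive access
 *         do {
 *             // one round of work; a concurrent demo_lock() here bumps
 *             // the count and forces us around the loop once more
 *         } while (!demo_unlock(&req));
 *     }
 *     printf("count back to %d\n", atomic_load(&req.lock));
 *     return 0;
 * }
 * @endcode
 *
 * This is exactly the shape used by mca_pml_ob1_recv_request_schedule()
 * and mca_pml_ob1_recv_request_schedule_exclusive() further down.
 */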

/**
 * Allocate a recv request from the module's free list.
 *
 * @param recvreq (OUT) Receive request.
 * @param rc (OUT)      OMPI_SUCCESS or error status on failure.
 */
#define MCA_PML_OB1_RECV_REQUEST_ALLOC(recvreq, rc)                 \
do {                                                                \
    ompi_free_list_item_t* item;                                    \
    rc = OMPI_SUCCESS;                                              \
    OMPI_FREE_LIST_GET(&mca_pml_base_recv_requests, item, rc);      \
    recvreq = (mca_pml_ob1_recv_request_t*)item;                    \
} while(0)


/**
 * Initialize a receive request with call parameters.
 *
 * @param request (IN)    Receive request.
 * @param addr (IN)       User buffer.
 * @param count (IN)      Number of elements of indicated datatype.
 * @param datatype (IN)   User defined datatype.
 * @param src (IN)        Source rank within the communicator.
 * @param tag (IN)        User defined tag.
 * @param comm (IN)       Communicator.
 * @param persistent (IN) Is this a persistent request.
 */
#define MCA_PML_OB1_RECV_REQUEST_INIT( request,                     \
                                       addr,                        \
                                       count,                       \
                                       datatype,                    \
                                       src,                         \
                                       tag,                         \
                                       comm,                        \
                                       persistent)                  \
do {                                                                \
    MCA_PML_BASE_RECV_REQUEST_INIT( &(request)->req_recv,           \
                                    addr,                           \
                                    count,                          \
                                    datatype,                       \
                                    src,                            \
                                    tag,                            \
                                    comm,                           \
                                    persistent);                    \
} while(0)
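
/**
 * Taken together, the two macros above are how the PML entry points
 * bring a receive request to life. A hedged sketch of the usual call
 * sequence (modelled on the pattern used by the ob1 irecv path; error
 * handling abbreviated):
 *
 * @code
 * mca_pml_ob1_recv_request_t *recvreq;
 * int rc;
 *
 * MCA_PML_OB1_RECV_REQUEST_ALLOC(recvreq, rc);   // grab from the free list
 * if (NULL == recvreq)
 *     return rc;
 *
 * MCA_PML_OB1_RECV_REQUEST_INIT(recvreq, addr, count, datatype,
 *                               src, tag, comm, false);
 *
 * MCA_PML_OB1_RECV_REQUEST_START(recvreq);       // post it for matching
 * @endcode
 */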

/**
 * Mark the request as completed at the MPI level for internal purposes.
 *
 * @param recvreq (IN) Receive request.
 */
#define MCA_PML_OB1_RECV_REQUEST_MPI_COMPLETE( recvreq )                        \
    do {                                                                        \
        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE,                      \
                                 &(recvreq->req_recv.req_base), PERUSE_RECV );  \
        ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi), true );  \
    } while (0)

/*
 * Free the PML receive request.
 */
#define MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq)                      \
    {                                                                 \
        MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv);         \
        OMPI_FREE_LIST_RETURN( &mca_pml_base_recv_requests,           \
                               (ompi_free_list_item_t*)(recvreq));    \
    }

/**
 * Complete a receive request. The request structure cannot be accessed
 * once this function returns.
 *
 * @param recvreq (IN) Receive request.
 */
static inline void
recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
{
    size_t i;

    assert(false == recvreq->req_recv.req_base.req_pml_complete);

    if(recvreq->req_recv.req_bytes_packed > 0) {
        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
                                 &recvreq->req_recv.req_base, PERUSE_RECV );
    }

    /* release any memory registrations pinned for RDMA */
    for(i = 0; i < recvreq->req_rdma_cnt; i++) {
        mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg;
        if( NULL != btl_reg && btl_reg->mpool != NULL ) {
            btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
        }
    }
    recvreq->req_rdma_cnt = 0;

    OPAL_THREAD_LOCK(&ompi_request_lock);
    if(true == recvreq->req_recv.req_base.req_free_called) {
        /* the user already freed the request: recycle it now */
        MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq);
    } else {
        /* initialize request status */
        recvreq->req_recv.req_base.req_pml_complete = true;
        recvreq->req_recv.req_base.req_ompi.req_status._ucount =
            recvreq->req_bytes_received;
        if (recvreq->req_recv.req_bytes_packed > recvreq->req_bytes_expected) {
            recvreq->req_recv.req_base.req_ompi.req_status._ucount =
                recvreq->req_recv.req_bytes_packed;
            recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =
                MPI_ERR_TRUNCATE;
        }
        MCA_PML_OB1_RECV_REQUEST_MPI_COMPLETE(recvreq);
    }
    OPAL_THREAD_UNLOCK(&ompi_request_lock);
}

static inline bool
recv_request_pml_complete_check(mca_pml_ob1_recv_request_t *recvreq)
{
#if OPAL_ENABLE_MULTI_THREADS
    opal_atomic_rmb();
#endif
    if(recvreq->req_match_received &&
       recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed &&
       lock_recv_request(recvreq)) {
        recv_request_pml_complete(recvreq);
        return true;
    }

    return false;
}

void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req);
#define MCA_PML_OB1_RECV_REQUEST_START(r) mca_pml_ob1_recv_req_start(r)

static inline void prepare_recv_req_converter(mca_pml_ob1_recv_request_t *req)
{
    /* bitwise OR: cheaper than two compares; true when either the datatype
     * size or the element count is non-zero */
    if( req->req_recv.req_base.req_datatype->super.size | req->req_recv.req_base.req_count ) {
        opal_convertor_copy_and_prepare_for_recv(
                req->req_recv.req_base.req_proc->proc_convertor,
                &(req->req_recv.req_base.req_datatype->super),
                req->req_recv.req_base.req_count,
                req->req_recv.req_base.req_addr,
                0,
                &req->req_recv.req_base.req_convertor);
        opal_convertor_get_unpacked_size(&req->req_recv.req_base.req_convertor,
                                         &req->req_bytes_expected);
    }
}

#define MCA_PML_OB1_RECV_REQUEST_MATCHED(request, hdr) \
    recv_req_matched(request, hdr)

static inline void recv_req_matched(mca_pml_ob1_recv_request_t *req,
                                    mca_pml_ob1_match_hdr_t *hdr)
{
    req->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_src;
    req->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_tag;
    req->req_match_received = true;
#if OPAL_ENABLE_MULTI_THREADS
    opal_atomic_wmb();
#endif
    if(req->req_recv.req_bytes_packed > 0) {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
        if(MPI_ANY_SOURCE == req->req_recv.req_base.req_peer) {
            /* non-wildcard requests are prepared when the recv is posted */
            prepare_recv_req_converter(req);
        }
#endif  /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT */
        PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_XFER_BEGIN,
                                &req->req_recv.req_base, PERUSE_RECV);
    }
}


/**
 * Unpack the data from the BTL segments into the user buffer via the
 * request's convertor.
 */
#define MCA_PML_OB1_RECV_REQUEST_UNPACK( request,                                 \
                                         segments,                                \
                                         num_segments,                            \
                                         seg_offset,                              \
                                         data_offset,                             \
                                         bytes_received,                          \
                                         bytes_delivered)                         \
do {                                                                              \
    bytes_delivered = 0;                                                          \
    if(request->req_recv.req_bytes_packed > 0) {                                  \
        struct iovec iov[MCA_BTL_DES_MAX_SEGMENTS];                               \
        uint32_t iov_count = 0;                                                   \
        size_t max_data = bytes_received;                                         \
        size_t n, offset = seg_offset;                                            \
        mca_btl_base_segment_t* segment = segments;                               \
                                                                                  \
        OPAL_THREAD_LOCK(&request->lock);                                         \
        /* build an iovec over the segments, skipping the first                   \
         * seg_offset bytes */                                                    \
        for( n = 0; n < num_segments; n++, segment++ ) {                          \
            if(offset >= segment->seg_len) {                                      \
                offset -= segment->seg_len;                                       \
            } else {                                                              \
                iov[iov_count].iov_len = segment->seg_len - offset;               \
                iov[iov_count].iov_base = (IOVBASE_TYPE*)                         \
                    ((unsigned char*)segment->seg_addr.pval + offset);            \
                iov_count++;                                                      \
                offset = 0;                                                       \
            }                                                                     \
        }                                                                         \
        PERUSE_TRACE_COMM_OMPI_EVENT (PERUSE_COMM_REQ_XFER_CONTINUE,              \
                                      &(request->req_recv.req_base), max_data,    \
                                      PERUSE_RECV);                               \
        opal_convertor_set_position( &(request->req_recv.req_base.req_convertor), \
                                     &data_offset );                              \
        opal_convertor_unpack( &(request)->req_recv.req_base.req_convertor,       \
                               iov,                                               \
                               &iov_count,                                        \
                               &max_data );                                       \
        bytes_delivered = max_data;                                               \
        OPAL_THREAD_UNLOCK(&request->lock);                                       \
    }                                                                             \
} while (0)
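
/**
 * The loop inside the macro above turns a list of BTL segments into an
 * iovec while skipping the first seg_offset bytes. The same skip logic
 * in a self-contained, runnable form (plain C, illustrative names only):
 *
 * @code
 * #include <stdio.h>
 * #include <sys/uio.h>
 *
 * // Build an iovec over nseg buffers, skipping the first `offset` bytes.
 * static int build_iov(char **bufs, size_t *lens, size_t nseg,
 *                      size_t offset, struct iovec *iov)
 * {
 *     int iov_count = 0;
 *     for (size_t n = 0; n < nseg; n++) {
 *         if (offset >= lens[n]) {
 *             offset -= lens[n];        // whole segment skipped
 *         } else {
 *             iov[iov_count].iov_base = bufs[n] + offset;
 *             iov[iov_count].iov_len  = lens[n] - offset;
 *             iov_count++;
 *             offset = 0;               // only the first kept segment is trimmed
 *         }
 *     }
 *     return iov_count;
 * }
 *
 * int main(void)
 * {
 *     char a[] = "0123", b[] = "4567";
 *     char  *bufs[] = { a, b };
 *     size_t lens[] = { 4, 4 };
 *     struct iovec iov[2];
 *     int n = build_iov(bufs, lens, 2, 6, iov); // skip 6 bytes: lands at "67"
 *     printf("%d entries, first = %.*s\n", n,
 *            (int)iov[0].iov_len, (char *)iov[0].iov_base);
 *     return 0;
 * }
 * @endcode
 */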


/**
 * Progress the request when its data arrives via the eager match protocol.
 */
void mca_pml_ob1_recv_request_progress_match(
    mca_pml_ob1_recv_request_t* req,
    struct mca_btl_base_module_t* btl,
    mca_btl_base_segment_t* segments,
    size_t num_segments);

/**
 * Progress the request as additional data fragments arrive.
 */
void mca_pml_ob1_recv_request_progress_frag(
    mca_pml_ob1_recv_request_t* req,
    struct mca_btl_base_module_t* btl,
    mca_btl_base_segment_t* segments,
    size_t num_segments);

/**
 * Progress the request via the rendezvous (RNDV) protocol.
 */
void mca_pml_ob1_recv_request_progress_rndv(
    mca_pml_ob1_recv_request_t* req,
    struct mca_btl_base_module_t* btl,
    mca_btl_base_segment_t* segments,
    size_t num_segments);

/**
 * Progress the request via the RDMA get (RGET) protocol.
 */
void mca_pml_ob1_recv_request_progress_rget(
    mca_pml_ob1_recv_request_t* req,
    struct mca_btl_base_module_t* btl,
    mca_btl_base_segment_t* segments,
    size_t num_segments);

/**
 * Handle completion of a probe request.
 */
void mca_pml_ob1_recv_request_matched_probe(
    mca_pml_ob1_recv_request_t* req,
    struct mca_btl_base_module_t* btl,
    mca_btl_base_segment_t* segments,
    size_t num_segments);

/**
 * Schedule one round of the pipelined RDMA protocol for this request.
 */
int mca_pml_ob1_recv_request_schedule_once(
    mca_pml_ob1_recv_request_t* req, mca_bml_base_btl_t* start_bml_btl);

static inline int mca_pml_ob1_recv_request_schedule_exclusive(
    mca_pml_ob1_recv_request_t* req,
    mca_bml_base_btl_t* start_bml_btl)
{
    int rc;

    /* keep scheduling until no other thread has requested a round
     * (see lock_recv_request()/unlock_recv_request() above) */
    do {
        rc = mca_pml_ob1_recv_request_schedule_once(req, start_bml_btl);
        if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE)
            break;
    } while(!unlock_recv_request(req));

    if(OMPI_SUCCESS == rc)
        recv_request_pml_complete_check(req);

    return rc;
}

static inline void mca_pml_ob1_recv_request_schedule(
    mca_pml_ob1_recv_request_t* req,
    mca_bml_base_btl_t* start_bml_btl)
{
    if(!lock_recv_request(req))
        return;

    (void)mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl);
}

#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O)                      \
    do {                                                                \
        mca_pml_ob1_pckt_pending_t *_pckt;                              \
        int _rc;                                                        \
                                                                        \
        MCA_PML_OB1_PCKT_PENDING_ALLOC(_pckt,_rc);                      \
        _pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_ACK;      \
        _pckt->hdr.hdr_ack.hdr_src_req.lval = (S);                      \
        _pckt->hdr.hdr_ack.hdr_dst_req.pval = (D);                      \
        _pckt->hdr.hdr_ack.hdr_send_offset = (O);                       \
        _pckt->proc = (P);                                              \
        _pckt->bml_btl = NULL;                                          \
        OPAL_THREAD_LOCK(&mca_pml_ob1.lock);                            \
        opal_list_append(&mca_pml_ob1.pckt_pending,                     \
                         (opal_list_item_t*)_pckt);                     \
        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);                          \
        (void)_rc;                                                      \
    } while(0)

int mca_pml_ob1_recv_request_ack_send_btl(ompi_proc_t* proc,
    mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req,
    uint64_t hdr_rdma_offset, bool nordma);

static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
    uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
    bool nordma)
{
    size_t i;
    mca_bml_base_btl_t* bml_btl;
    mca_bml_base_endpoint_t* endpoint =
        (mca_bml_base_endpoint_t*)proc->proc_bml;

    /* try each eager BTL in turn until one accepts the ack */
    for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
        bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
        if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req,
                    hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS)
            return OMPI_SUCCESS;
    }

    /* no BTL had resources available: queue the ack for a later retry */
    MCA_PML_OB1_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,
                                   hdr_send_offset);

    return OMPI_ERR_OUT_OF_RESOURCE;
}
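
/**
 * mca_pml_ob1_recv_request_ack_send() above is the PML's standard
 * resource-exhaustion idiom: try every eager BTL in turn and, if none
 * can take the packet, park it on a pending list so the progress engine
 * can retry later (see mca_pml_ob1_recv_request_process_pending()
 * below). A minimal, self-contained sketch of the idiom (illustrative
 * names only, not OMPI API):
 *
 * @code
 * #include <stdbool.h>
 * #include <stdio.h>
 *
 * enum { NTRANSPORTS = 3, MAXPENDING = 8 };
 *
 * // hypothetical transport send: returns false when out of resources
 * static bool try_send(int t, int pkt) { (void)t; (void)pkt; return false; }
 *
 * static int pending[MAXPENDING];
 * static int npending;
 *
 * static int send_or_queue(int pkt)
 * {
 *     for (int t = 0; t < NTRANSPORTS; t++)
 *         if (try_send(t, pkt))
 *             return 0;                 // success on some transport
 *     pending[npending++] = pkt;        // park for the progress loop
 *     return -1;                        // caller sees OUT_OF_RESOURCE
 * }
 *
 * int main(void)
 * {
 *     send_or_queue(42);
 *     printf("%d packet(s) queued for retry\n", npending);
 *     return 0;
 * }
 * @endcode
 */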

int mca_pml_ob1_recv_request_get_frag(mca_pml_ob1_rdma_frag_t* frag);

/* This function tries to continue a recvreq that is stuck due to resource
 * unavailability. The recvreq is added to the recv_pending list if the
 * put operation cannot be scheduled for some reason. */
void mca_pml_ob1_recv_request_process_pending(void);

END_C_DECLS

#endif