OpenMPI  0.1.1
pml_csum_recvreq.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2010 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
13  * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
14  * $COPYRIGHT$
15  *
16  * Additional copyrights may follow
17  *
18  * $HEADER$
19  */
20 /**
21  * @file
22  */
23 #ifndef OMPI_PML_CSUM_RECV_REQUEST_H
24 #define OMPI_PML_CSUM_RECV_REQUEST_H
25 
26 #include "pml_csum.h"
27 #include "pml_csum_rdma.h"
28 #include "pml_csum_rdmafrag.h"
29 #include "ompi/proc/proc.h"
33 
34 BEGIN_C_DECLS
35 
38  ompi_ptr_t remote_req_send;
39  int32_t req_lock;
40  size_t req_pipeline_depth;
41  size_t req_bytes_received; /**< amount of data transferred into the user buffer */
42  size_t req_bytes_expected; /**< local size of the data as suggested by the user */
43  size_t req_rdma_offset;
44  size_t req_send_offset;
45  uint32_t req_rdma_cnt;
46  uint32_t req_rdma_idx;
47  bool req_pending;
48  bool req_ack_sent; /**< whether ack was sent to the sender */
49  bool req_match_received; /**< Prevent request to be completed prematurely */
50  opal_mutex_t lock;
51  mca_pml_csum_com_btl_t req_rdma[1];
52 };
54 
56 
57 static inline bool lock_recv_request(mca_pml_csum_recv_request_t *recvreq)
58 {
59  return OPAL_THREAD_ADD32(&recvreq->req_lock, 1) == 1;
60 }
61 
62 static inline bool unlock_recv_request(mca_pml_csum_recv_request_t *recvreq)
63 {
64  return OPAL_THREAD_ADD32(&recvreq->req_lock, -1) == 0;
65 }
66 
67 /**
68  * Allocate a recv request from the modules free list.
69  *
70  * @param rc (OUT) OMPI_SUCCESS or error status on failure.
71  * @return Receive request.
72  */
#define MCA_PML_CSUM_RECV_REQUEST_ALLOC(recvreq, rc)                    \
do {                                                                    \
    ompi_free_list_item_t *_item;                                       \
    rc = OMPI_SUCCESS;                                                  \
    /* pull a pre-allocated request from the base PML free list */      \
    OMPI_FREE_LIST_GET(&mca_pml_base_recv_requests, _item, rc);         \
    recvreq = (mca_pml_csum_recv_request_t*)_item;                      \
} while(0)
80 
81 
82 /**
83  * Initialize a receive request with call parameters.
84  *
85  * @param request (IN) Receive request.
86  * @param addr (IN) User buffer.
87  * @param count (IN) Number of elements of indicated datatype.
88  * @param datatype (IN) User defined datatype.
89  * @param src (IN) Source rank w/in the communicator.
90  * @param tag (IN) User defined tag.
91  * @param comm (IN) Communicator.
92  * @param persistent (IN) Is this a ersistent request.
93  */
#define MCA_PML_CSUM_RECV_REQUEST_INIT( request, addr, count, datatype, \
                                        src, tag, comm, persistent)     \
do {                                                                    \
    /* delegate all initialization to the base PML recv request */      \
    MCA_PML_BASE_RECV_REQUEST_INIT( &(request)->req_recv, addr, count,  \
                                    datatype, src, tag, comm,           \
                                    persistent);                        \
} while(0)
112 
113 /**
114  * Mark the request as completed at MPI level for internal purposes.
115  *
116  * @param recvreq (IN) Receive request.
117  */
#define MCA_PML_CSUM_RECV_REQUEST_MPI_COMPLETE( recvreq )                  \
    do {                                                                   \
        /* emit the PERUSE completion event before signalling MPI */       \
        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE,                 \
                                 &(recvreq->req_recv.req_base),            \
                                 PERUSE_RECV );                            \
        ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi),     \
                               true );                                     \
    } while (0)
124 
/*
 * Free the PML receive request: finalize the embedded base request and
 * return the storage to the global free list.
 *
 * Fixed: the body was a bare `{ ... }` block, which breaks when the macro
 * is used as the sole statement of an unbraced if/else; wrapped in
 * do { } while(0) (CERT PRE10-C). Call sites are unaffected.
 */
#define MCA_PML_CSUM_RECV_REQUEST_RETURN(recvreq)                       \
    do {                                                                \
        MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv);           \
        OMPI_FREE_LIST_RETURN( &mca_pml_base_recv_requests,             \
                               (ompi_free_list_item_t*)(recvreq));      \
    } while(0)
134 
135 /**
136  * Complete receive request. Request structure cannot be accessed after calling
137  * this function any more.
138  *
139  * @param recvreq (IN) Receive request.
140  */
141 static inline void
143 {
144  size_t i;
145 
146  assert(false == recvreq->req_recv.req_base.req_pml_complete);
147 
148  if(recvreq->req_recv.req_bytes_packed > 0) {
149  PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
150  &recvreq->req_recv.req_base, PERUSE_RECV );
151  }
152 
153  for(i = 0; i < recvreq->req_rdma_cnt; i++) {
154  mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg;
155  if( NULL != btl_reg && btl_reg->mpool != NULL) {
156  btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
157  }
158  }
159  recvreq->req_rdma_cnt = 0;
160 
161  OPAL_THREAD_LOCK(&ompi_request_lock);
162  if(true == recvreq->req_recv.req_base.req_free_called) {
163  MCA_PML_CSUM_RECV_REQUEST_RETURN(recvreq);
164  } else {
165  /* initialize request status */
166  recvreq->req_recv.req_base.req_pml_complete = true;
167  recvreq->req_recv.req_base.req_ompi.req_status._ucount =
168  recvreq->req_bytes_received;
169  if (recvreq->req_recv.req_bytes_packed > recvreq->req_bytes_expected) {
170  recvreq->req_recv.req_base.req_ompi.req_status._ucount =
171  recvreq->req_recv.req_bytes_packed;
172  recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =
173  MPI_ERR_TRUNCATE;
174  }
176  }
177  OPAL_THREAD_UNLOCK(&ompi_request_lock);
178 }
179 
180 static inline bool
181 recv_request_pml_complete_check(mca_pml_csum_recv_request_t *recvreq)
182 {
183 #if OPAL_ENABLE_MULTI_THREADS
184  opal_atomic_rmb();
185 #endif
186  if(recvreq->req_match_received &&
187  recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed &&
188  lock_recv_request(recvreq)) {
189  recv_request_pml_complete(recvreq);
190  return true;
191  }
192 
193  return false;
194 }
195 
/* Delegates request start to mca_pml_csum_recv_req_start() (pml_csum_recvreq.c). */
197 #define MCA_PML_CSUM_RECV_REQUEST_START(r) mca_pml_csum_recv_req_start(r)
198 
199 static inline void prepare_recv_req_converter(mca_pml_csum_recv_request_t *req)
200 {
201  if( req->req_recv.req_base.req_datatype->super.size | req->req_recv.req_base.req_count ) {
202  opal_convertor_copy_and_prepare_for_recv(
203  req->req_recv.req_base.req_proc->proc_convertor,
204  &(req->req_recv.req_base.req_datatype->super),
205  req->req_recv.req_base.req_count,
206  req->req_recv.req_base.req_addr,
207  0,
208  &req->req_recv.req_base.req_convertor);
209  opal_convertor_get_unpacked_size(&req->req_recv.req_base.req_convertor,
210  &req->req_bytes_expected);
211  }
212 }
213 
/* Record match information (peer source/tag from hdr) into the request. */
214 #define MCA_PML_CSUM_RECV_REQUEST_MATCHED(request, hdr) \
215  recv_req_matched(request, hdr)
216 
217 static inline void recv_req_matched(mca_pml_csum_recv_request_t *req,
219 {
220  req->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_src;
221  req->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_tag;
222  req->req_match_received = true;
223 #if OPAL_ENABLE_MULTI_THREADS
224  opal_atomic_wmb();
225 #endif
226  if(req->req_recv.req_bytes_packed > 0) {
227  if(MPI_ANY_SOURCE == req->req_recv.req_base.req_peer) {
228  /* non wildcard prepared during post recv */
229  prepare_recv_req_converter(req);
230  }
231  PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_XFER_BEGIN,
232  &req->req_recv.req_base, PERUSE_RECV);
233  }
234 }
235 
236 
/**
 * Unpack received segment data into the user buffer.
 *
 * Builds an iovec over the BTL segments starting at seg_offset, positions
 * the convertor at data_offset, and unpacks up to bytes_received bytes;
 * bytes_delivered is set to the number of bytes actually unpacked.
 *
 * Fixed: the PERUSE trace referenced `recvreq`, which is not a parameter
 * of this macro (everything else uses `request`); the macro only compiled
 * when the caller happened to have a variable named `recvreq` in scope.
 */
#define MCA_PML_CSUM_RECV_REQUEST_UNPACK( request,                       \
                                          segments,                     \
                                          num_segments,                 \
                                          seg_offset,                   \
                                          data_offset,                  \
                                          bytes_received,               \
                                          bytes_delivered)              \
do {                                                                    \
    bytes_delivered = 0;                                                \
    if(request->req_recv.req_bytes_packed > 0) {                        \
        struct iovec iov[MCA_BTL_DES_MAX_SEGMENTS];                     \
        uint32_t iov_count = 0;                                         \
        size_t max_data = bytes_received;                               \
        size_t n, offset = seg_offset;                                  \
        mca_btl_base_segment_t* segment = segments;                     \
                                                                        \
        OPAL_THREAD_LOCK(&request->lock);                               \
        /* skip seg_offset bytes, then collect the remaining ranges */  \
        for( n = 0; n < num_segments; n++, segment++ ) {                \
            if(offset >= segment->seg_len) {                            \
                offset -= segment->seg_len;                             \
            } else {                                                    \
                iov[iov_count].iov_len = segment->seg_len - offset;     \
                iov[iov_count].iov_base = (IOVBASE_TYPE*)               \
                    ((unsigned char*)segment->seg_addr.pval + offset);  \
                iov_count++;                                            \
                offset = 0;                                             \
            }                                                           \
        }                                                               \
        PERUSE_TRACE_COMM_OMPI_EVENT (PERUSE_COMM_REQ_XFER_CONTINUE,    \
                                      &(request->req_recv.req_base),    \
                                      max_data, PERUSE_RECV);           \
        opal_convertor_set_position( &(request->req_recv.req_base.req_convertor), \
                                     &data_offset );                    \
        opal_convertor_unpack( &(request)->req_recv.req_base.req_convertor, \
                               iov,                                     \
                               &iov_count,                              \
                               &max_data );                             \
        bytes_delivered = max_data;                                     \
        OPAL_THREAD_UNLOCK(&request->lock);                             \
    }                                                                   \
} while (0)
282 
283 
284 /**
285  *
286  */
287 
288 void mca_pml_csum_recv_request_progress_match(
290  struct mca_btl_base_module_t* btl,
291  mca_btl_base_segment_t* segments,
292  size_t num_segments);
293 
294 /**
295  *
296  */
297 
298 void mca_pml_csum_recv_request_progress_frag(
300  struct mca_btl_base_module_t* btl,
301  mca_btl_base_segment_t* segments,
302  size_t num_segments);
303 
304 /**
305  *
306  */
307 
310  struct mca_btl_base_module_t* btl,
311  mca_btl_base_segment_t* segments,
312  size_t num_segments);
313 
314 /**
315  *
316  */
317 
318 void mca_pml_csum_recv_request_progress_rget(
320  struct mca_btl_base_module_t* btl,
321  mca_btl_base_segment_t* segments,
322  size_t num_segments);
323 
324 /**
325  *
326  */
327 
330  struct mca_btl_base_module_t* btl,
331  mca_btl_base_segment_t* segments,
332  size_t num_segments);
333 
334 /**
335  *
336  */
337 
338 int mca_pml_csum_recv_request_schedule_once(
339  mca_pml_csum_recv_request_t* req, mca_bml_base_btl_t* start_bml_btl);
340 
341 static inline int mca_pml_csum_recv_request_schedule_exclusive(
343  mca_bml_base_btl_t* start_bml_btl)
344 {
345  int rc;
346 
347  do {
348  rc = mca_pml_csum_recv_request_schedule_once(req, start_bml_btl);
349  if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE)
350  break;
351  } while(!unlock_recv_request(req));
352 
353  if(OMPI_SUCCESS == rc)
354  recv_request_pml_complete_check(req);
355 
356  return rc;
357 }
358 
359 static inline void mca_pml_csum_recv_request_schedule(
361  mca_bml_base_btl_t* start_bml_btl)
362 {
363  if(!lock_recv_request(req))
364  return;
365 
366  (void)mca_pml_csum_recv_request_schedule_exclusive(req, start_bml_btl);
367 }
368 
/* Allocate a pending-packet descriptor, fill in the ACK header fields
 * (P=proc, S=source req handle, D=dest req pointer, O=send offset) and
 * append it to the PML's pckt_pending list for later delivery. */
#define MCA_PML_CSUM_ADD_ACK_TO_PENDING(P, S, D, O)                     \
    do {                                                                \
        mca_pml_csum_pckt_pending_t *_pckt;                             \
        int _rc;                                                        \
                                                                        \
        MCA_PML_CSUM_PCKT_PENDING_ALLOC(_pckt,_rc);                     \
        _pckt->hdr.hdr_common.hdr_type = MCA_PML_CSUM_HDR_TYPE_ACK;     \
        _pckt->hdr.hdr_ack.hdr_src_req.lval = (S);                      \
        _pckt->hdr.hdr_ack.hdr_dst_req.pval = (D);                      \
        _pckt->hdr.hdr_ack.hdr_send_offset = (O);                       \
        _pckt->proc = (P);                                              \
        _pckt->bml_btl = NULL;                                          \
        OPAL_THREAD_LOCK(&mca_pml_csum.lock);                           \
        opal_list_append(&mca_pml_csum.pckt_pending,                    \
                         (opal_list_item_t*)_pckt);                     \
        OPAL_THREAD_UNLOCK(&mca_pml_csum.lock);                         \
    } while(0)
386 
387 int mca_pml_csum_recv_request_ack_send_btl(ompi_proc_t* proc,
388  mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req,
389  uint64_t hdr_rdma_offset, bool nordma);
390 
391 static inline int mca_pml_csum_recv_request_ack_send(ompi_proc_t* proc,
392  uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
393  bool nordma)
394 {
395  size_t i;
396  mca_bml_base_btl_t* bml_btl;
397  mca_bml_base_endpoint_t* endpoint =
399 
400  for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
401  bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
402  if(mca_pml_csum_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req,
403  hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS)
404  return OMPI_SUCCESS;
405  }
406 
407  MCA_PML_CSUM_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,
408  hdr_send_offset);
409 
410  return OMPI_ERR_OUT_OF_RESOURCE;
411 }
412 
413 int mca_pml_csum_recv_request_get_frag(mca_pml_csum_rdma_frag_t* frag);
414 
415 /* This function tries to continue recvreq that stuck due to resource
416  * unavailability. Recvreq is added to recv_pending list if scheduling of put
417  * operation cannot be accomplished for some reason. */
418 void mca_pml_csum_recv_request_process_pending(void);
419 
420 END_C_DECLS
421 
422 #endif
423 
void mca_pml_csum_recv_request_progress_rndv(mca_pml_csum_recv_request_t *req, struct mca_btl_base_module_t *btl, mca_btl_base_segment_t *segments, size_t num_segments)
Definition: pml_csum_recvreq.c:593
#define OPAL_THREAD_ADD32(x, y)
Use an atomic operation for increment/decrement if opal_using_threads() indicates that threads are in...
Definition: mutex.h:367
struct mca_bml_base_endpoint_t * proc_bml
BML specific proc data.
Definition: proc.h:64
void mca_pml_csum_recv_request_matched_probe(mca_pml_csum_recv_request_t *req, struct mca_btl_base_module_t *btl, mca_btl_base_segment_t *segments, size_t num_segments)
Handle completion of a probe request.
Definition: pml_csum_recvreq.c:739
#define MCA_PML_CSUM_RECV_REQUEST_MPI_COMPLETE(recvreq)
Mark the request as completed at MPI level for internal purposes.
Definition: pml_csum_recvreq.h:118
size_t req_bytes_expected
local size of the data as suggested by the user
Definition: pml_csum_recvreq.h:42
int32_t hdr_tag
user tag
Definition: pml_csum_hdr.h:78
ompi_status_public_t req_status
Completion status.
Definition: request.h:103
void opal_atomic_rmb(void)
Read memory barrier.
size_t req_count
count of user datatype elements
Definition: pml_base_request.h:70
bool req_match_received
Prevent request to be completed prematurely.
Definition: pml_csum_recvreq.h:49
Header definition for the first fragment, contains the attributes required to match the corresponding...
Definition: pml_csum_hdr.h:73
Definition: pml_csum_recvreq.h:36
Definition: pml_csum.h:297
mca_pml_base_request_t req_base
base request
Definition: pml_base_recvreq.h:37
Definition: types.h:52
Definition: mutex_unix.h:53
size_t req_bytes_received
amount of data transferred into the user buffer
Definition: pml_csum_recvreq.h:41
int32_t req_peer
peer process - rank w/in this communicator
Definition: pml_base_request.h:71
Definition: mpool.h:44
Process identification structure interface.
Remote Open MPI process structure.
Definition: proc.h:56
void mca_pml_csum_recv_req_start(mca_pml_csum_recv_request_t *req)
Definition: pml_csum_recvreq.c:1024
#define OPAL_THREAD_LOCK(mutex)
Lock a mutex if opal_using_threads() says that multiple threads may be active in the process...
Definition: mutex.h:223
#define OPAL_THREAD_UNLOCK(mutex)
Unlock a mutex if opal_using_threads() says that multiple threads may be active in the process...
Definition: mutex.h:309
opal_convertor_t req_convertor
always need the convertor
Definition: pml_base_request.h:66
int32_t hdr_src
source rank
Definition: pml_csum_hdr.h:77
volatile bool req_pml_complete
flag indicating if the pt-2-pt layer is done with this request
Definition: pml_base_request.h:61
opal_datatype_t super
Base opal_datatype_t superclass.
Definition: ompi_datatype.h:69
static void recv_request_pml_complete(mca_pml_csum_recv_request_t *recvreq)
Complete receive request.
Definition: pml_csum_recvreq.h:142
struct ompi_proc_t * req_proc
peer process
Definition: pml_base_request.h:73
size_t size
total size in bytes of the memory used by the data if the data is put on a contiguous buffer ...
Definition: opal_datatype.h:108
mca_bml_base_btl_array_t btl_eager
array of btls to use for first fragments
Definition: bml.h:228
Definition: pml_csum_rdmafrag.h:35
mca_mpool_base_module_deregister_fn_t mpool_deregister
deregister memory
Definition: mpool.h:181
struct ompi_datatype_t * req_datatype
pointer to data type
Definition: pml_base_request.h:64
Structure associated w/ ompi_proc_t that contains the set of BTLs used to reach a destination...
Definition: bml.h:222
void * req_addr
pointer to application buffer
Definition: pml_base_request.h:69
Definition: bml.h:58
void opal_atomic_wmb(void)
Write memory barrier.
struct opal_convertor_t * proc_convertor
Base convertor for the proc described by this process.
Definition: proc.h:70
Base type for receive requests.
Definition: pml_base_recvreq.h:36
ompi_request_t req_ompi
base request
Definition: pml_base_request.h:60
static size_t mca_bml_base_btl_array_get_size(mca_bml_base_btl_array_t *array)
If required, reallocate (grow) the array to the indicate size.
Definition: bml.h:91
BTL module interface functions and attributes.
Definition: btl.h:786
bool req_ack_sent
whether ack was sent to the sender
Definition: pml_csum_recvreq.h:48
size_t req_bytes_packed
size of message being received
Definition: pml_base_recvreq.h:38
Describes a region/segment of memory that is addressable by an BTL.
Definition: btl.h:236
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236
volatile bool req_free_called
flag indicating if the user has freed this request
Definition: pml_base_request.h:65
static mca_bml_base_btl_t * mca_bml_base_btl_array_get_next(mca_bml_base_btl_array_t *array)
Return the next LRU index in the array.
Definition: bml.h:179