OpenMPI  0.1.1
pml_bfo_failover.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
3  * $COPYRIGHT$
4  *
5  * Additional copyrights may follow
6  *
7  * $HEADER$
8  */
9 
10 /**
11  * @file
12  * Functions that implement failover capabilities.
13  */
14 
15 #ifndef MCA_PML_BFO_FAILOVER_H
16 #define MCA_PML_BFO_FAILOVER_H
17 
18 #include "ompi/mca/btl/btl.h"
19 #include "pml_bfo_hdr.h"
20 
21 BEGIN_C_DECLS
22 
27 
29 
31  bool repost, mca_btl_base_tag_t tag);
33  bool repost, mca_btl_base_tag_t tag, int status,
35 
36 void
38  struct mca_btl_base_endpoint_t* ep,
39  struct mca_btl_base_descriptor_t* des,
40  int status);
41 void
43  struct mca_btl_base_descriptor_t* des,
44  int status);
45 
46 /* Reset a receive request to the beginning */
48 /* Notify sender that receiver detected an error */
50  mca_btl_base_tag_t tag, int status);
51 /* Ack the RNDVRESTARTNOTIFY message */
53  mca_btl_base_tag_t tag, int status,
55 /* Nack the RNDVRESTARTNOTIFY message */
57  ompi_proc_t* ompi_proc, bool repost);
58 
59 void mca_pml_bfo_recv_restart_completion(mca_btl_base_module_t* btl,
60  struct mca_btl_base_endpoint_t* ep,
61  struct mca_btl_base_descriptor_t* des,
62  int status);
63 void mca_pml_bfo_failover_error_handler(struct mca_btl_base_module_t* btl,
64  int32_t flags, ompi_proc_t *errproc, char *btlname);
67 
68 void mca_pml_bfo_map_out_btl(struct mca_btl_base_module_t* btl,
69  ompi_proc_t *errproc, char *btlname);
70 
71 extern void mca_pml_bfo_map_out( mca_btl_base_module_t *btl,
72  mca_btl_base_tag_t tag,
73  mca_btl_base_descriptor_t* descriptor,
74  void* cbdata );
75 
77 
79  mca_pml_bfo_send_request_t*, char *type);
80 
81 void mca_pml_bfo_update_bml_btl(mca_bml_base_btl_t** bml_btl, mca_btl_base_module_t* btl,
82  struct mca_btl_base_descriptor_t* des);
83 
84 void mca_pml_bfo_find_recvreq_eager_bml_btl(mca_bml_base_btl_t** bml_btl,
87  char* type);
88 
89 void mca_pml_bfo_find_sendreq_eager_bml_btl(mca_bml_base_btl_t** bml_btl,
92  char* type);
93 
94 void mca_pml_bfo_find_sendreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
97  char* type);
98 
101  struct mca_btl_base_descriptor_t* des);
102 void mca_pml_bfo_find_recvreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
105  char* type);
106 
108  mca_pml_bfo_send_request_t* sendreq);
109 void mca_pml_bfo_send_ctl_completion_status_error(struct mca_btl_base_descriptor_t* des);
110 
111 
113  int status,
115  int type,
116  char *description);
117 /**
118  * Four new callbacks for the four new message types.
119  */
121  mca_btl_base_tag_t tag,
122  mca_btl_base_descriptor_t* descriptor,
123  void* cbdata );
124 
126  mca_btl_base_tag_t tag,
127  mca_btl_base_descriptor_t* descriptor,
128  void* cbdata );
129 
131  mca_btl_base_tag_t tag,
132  mca_btl_base_descriptor_t* descriptor,
133  void* cbdata );
134 
136  mca_btl_base_tag_t tag,
137  mca_btl_base_descriptor_t* descriptor,
138  void* cbdata );
139 
140 /**
141  * A bunch of macros to help isolate failover code from regular ob1 code.
142  */
143 
/* Guard used on receipt of an ACK: when the send request is already
 * flagged in error the ACK fragment is dropped, so that no further
 * activity is started on the request.
 *
 * sendreq - send request the ACK refers to. It is expanded several
 *           times, so pass an expression without side effects.
 *
 * Expands to a bare `if` statement that logs at verbosity 20 on
 * mca_pml_bfo_output and then executes `return;`.  It is therefore meant
 * for functions returning void, and an `else` written directly after the
 * macro would bind to this `if`. */
#define MCA_PML_BFO_ERROR_CHECK_ON_ACK_CALLBACK(sendreq) \
    if (OPAL_UNLIKELY((sendreq)->req_error)) { \
        opal_output_verbose( \
            20, mca_pml_bfo_output, \
            "ACK: received: dropping because request in error, " \
            "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
            (uint16_t)(sendreq)->req_send.req_base.req_sequence, \
            (sendreq)->req_restartseq, \
            (void *)(sendreq), \
            (sendreq)->req_recv.pval, \
            (sendreq)->req_send.req_base.req_peer); \
        return; \
    }
157 
/* Guard used on receipt of a FRAG: when the receive request is already
 * flagged in error the fragment is dropped, so that no further activity
 * is started on the request.
 *
 * recvreq - receive request the fragment belongs to. It is expanded
 *           several times, so pass an expression without side effects.
 *
 * NOTE: besides its parameter the macro reads a variable named `hdr`
 * from the calling scope (hdr->hdr_frag.hdr_frag_offset is logged, cast
 * to int).
 *
 * Expands to a bare `if` statement that logs at verbosity 20 on
 * mca_pml_bfo_output and then executes `return;`, so it is meant for
 * functions returning void. */
#define MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq) \
    if (OPAL_UNLIKELY((recvreq)->req_errstate)) { \
        opal_output_verbose( \
            20, mca_pml_bfo_output, \
            "FRAG: received: dropping because request in error, " \
            "PML=%d, src_req=%p, dst_req=%p, peer=%d, offset=%d", \
            (uint16_t)(recvreq)->req_msgseq, \
            (recvreq)->remote_req_send.pval, \
            (void *)(recvreq), \
            (recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, \
            (int)hdr->hdr_frag.hdr_frag_offset); \
        return; \
    }
172 
/* Guard used on receipt of a PUT: when the send request is already
 * flagged in error the PUT fragment is dropped, so that no further
 * activity is started on the request.
 *
 * sendreq - send request the PUT refers to. It is expanded several
 *           times, so pass an expression without side effects.
 *
 * Expands to a bare `if` statement that logs at verbosity 20 on
 * mca_pml_bfo_output and then executes `return;`, so it is meant for
 * functions returning void. */
#define MCA_PML_BFO_ERROR_CHECK_ON_PUT_CALLBACK(sendreq) \
    if (OPAL_UNLIKELY((sendreq)->req_error)) { \
        opal_output_verbose( \
            20, mca_pml_bfo_output, \
            "PUT: received: dropping because request in error, " \
            "PML=%d, src_req=%p, dst_req=%p, peer=%d", \
            (uint16_t)(sendreq)->req_send.req_base.req_sequence, \
            (void *)(sendreq), \
            (sendreq)->req_recv.pval, \
            (sendreq)->req_send.req_base.req_peer); \
        return; \
    }
185 
186 /**
187  * Macros for pml_bfo_recvreq.c file.
188  */
189 
/* Handles a FIN message that arrives after the receive request was
 * marked in error: the message is simply dropped.
 *
 * The status field is deliberately not examined here, because it is the
 * value returned in the FIN hdr.hdr_fail field and may be used for other
 * things.  The caller is expected to have updated the request fields
 * before invoking this macro, in case the FIN actually completes the
 * request and the sending side thinks it is done; that is why
 * recv_request_pml_complete_check() is still called on the error path.
 *
 * recvreq - receive request the FIN refers to. It is expanded several
 *           times, so pass an expression without side effects.
 *
 * Expands to a bare `if` statement that logs at verbosity 20 on
 * mca_pml_bfo_output, runs the completion check and then executes
 * `return;`, so it is meant for functions returning void. */
#define MCA_PML_BFO_ERROR_CHECK_ON_FIN_FOR_PUT(recvreq) \
    if (OPAL_UNLIKELY((recvreq)->req_errstate)) { \
        opal_output_verbose( \
            20, mca_pml_bfo_output, \
            "FIN: received on broken request, skipping, " \
            "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
            (recvreq)->req_msgseq, \
            (recvreq)->req_restartseq, \
            (recvreq)->remote_req_send.pval, \
            (void *)(recvreq), \
            (recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
        /* Even though in error, it still might complete. */ \
        recv_request_pml_complete_check(recvreq); \
        return; \
    }
209 
/* Error branch of the RDMA read completion handling on the receive side.
 *
 * - If the receive request is already flagged in error, the failure is
 *   only logged (verbosity 30) and the enclosing function returns.
 * - Otherwise the failure is logged and a RECVERRNOTIFY is sent to the
 *   sender through mca_pml_bfo_recv_request_recverrnotify() with the
 *   MCA_PML_BFO_HDR_TYPE_RGET tag; control then continues after the
 *   macro.
 *
 * recvreq - receive request whose RDMA read failed. It is expanded
 *           several times, so pass an expression without side effects.
 *
 * NOTE: besides its parameter the macro reads a variable named `status`
 * from the calling scope and forwards it to the notify call.
 *
 * Expands to a bare `if`/`else` statement; the first branch executes
 * `return;`, so the macro is meant for functions returning void. */
#define MCA_PML_BFO_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq) \
    if ((recvreq)->req_errstate) { \
        opal_output_verbose( \
            30, mca_pml_bfo_output, \
            "RDMA read: completion failed, error already seen, " \
            "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", \
            (recvreq)->req_msgseq, \
            (recvreq)->req_restartseq, \
            (unsigned long)(recvreq)->remote_req_send.pval, \
            (unsigned long)(recvreq), \
            (recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
        return; \
    } else { \
        opal_output_verbose( \
            30, mca_pml_bfo_output, \
            "RDMA read: completion failed, sending RECVERRNOTIFY to " \
            "sender, PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", \
            (recvreq)->req_msgseq, \
            (recvreq)->req_restartseq, \
            (unsigned long)(recvreq)->remote_req_send.pval, \
            (unsigned long)(recvreq), \
            (recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
        mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, status); \
    }
230 
/* Second check made on RDMA read completion: if the receive request is
 * in an error state, stop processing the completion.
 *
 * When the error state has RECVREQ_RNDVRESTART_RECVED set (the request
 * has received a RNDVRESTARTNOTIFY), the situation is logged at
 * verbosity 30 and, once no events are outstanding on the request
 * (req_events == 0), mca_pml_bfo_recv_request_rndvrestartack() is called
 * with the MCA_PML_BFO_HDR_TYPE_RGET tag.  In every error case the RDMA
 * fragment is returned and the enclosing function returns.
 *
 * recvreq - receive request being completed
 * status  - completion status, logged and forwarded to the ACK call
 * btl     - BTL module forwarded to the ACK call
 *
 * All parameters are parenthesized in the expansion so that expression
 * arguments (e.g. `reqs + i`) bind correctly; they are still expanded
 * more than once, so avoid arguments with side effects.
 *
 * NOTE: the macro also uses a variable named `frag` from the calling
 * scope (handed to MCA_PML_BFO_RDMA_FRAG_RETURN).
 *
 * Expands to a bare `if` statement ending in `return;`, so it is meant
 * for functions returning void. */
#define MCA_PML_BFO_SECOND_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq, status, btl) \
    /* See if the request has received a RNDVRESTARTNOTIFY */ \
    if (OPAL_UNLIKELY((recvreq)->req_errstate)) { \
        if ((recvreq)->req_errstate & RECVREQ_RNDVRESTART_RECVED) { \
            opal_output_verbose(30, mca_pml_bfo_output, \
                "RDMA read: completion: recvreq has error, outstanding events=%d " \
                "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, status=%d, peer=%d", \
                (recvreq)->req_events, (recvreq)->req_msgseq, \
                (recvreq)->req_restartseq, \
                (unsigned long)(recvreq)->remote_req_send.pval, \
                (unsigned long)(recvreq), (status), \
                (recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
            if (0 == (recvreq)->req_events) { \
                mca_pml_bfo_recv_request_rndvrestartack((recvreq), \
                                                        MCA_PML_BFO_HDR_TYPE_RGET, \
                                                        (status), (btl)); \
            } \
        } \
        MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
        return; \
    }
250 
251 /**
252  * Macros for pml_bfo_sendreq.c file.
253  */
254 
/* Called on the sending side after a PUT message has been received.
 *
 * The PUT may show up and attempt to modify req_state although req_state
 * is no longer being tracked, because the RNDV message was turned into
 * an RGET message after an error on the RNDV completion.  When req_state
 * is found at -1 it is bumped by one through OPAL_THREAD_ADD32; any
 * other value is left untouched.
 *
 * sendreq - pointer to the send request.  The parameter is parenthesized
 *           in the expansion so that expression arguments such as
 *           `reqs + i` or `&req` work; it is expanded twice, so avoid
 *           arguments with side effects.
 *
 * Expands to a bare `if` statement: an `else` written directly after the
 * macro would bind to this `if`. */
#define MCA_PML_BFO_VERIFY_SENDREQ_REQ_STATE_VALUE(sendreq) \
    if ((sendreq)->req_state == -1) { \
        OPAL_THREAD_ADD32(&(sendreq)->req_state, 1); \
    }
266 
/* Error-state check made on RNDV completion.
 *
 * The send request can be in error when the RNDV message made it over
 * but the receiver hit an error while sending the ACK back and therefore
 * sent a RECVERRNOTIFY message.  The receiver has already matched the
 * message in that case, so the restart dance has to be started; a
 * restart only happens if there are no outstanding events on the send
 * request.  The handling is delegated to
 * mca_pml_bfo_completion_sendreq_has_error().
 *
 * sendreq     - send request being completed
 * status      - completion status
 * btl         - BTL module the completion arrived on
 * type        - int message type forwarded to the helper
 * description - char* text forwarded to the helper
 *
 * Expands to a bare `if` statement ending in `return;`, so it is meant
 * for functions returning void. */
#define MCA_PML_BFO_RNDV_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, type, description) \
    if (OPAL_UNLIKELY((sendreq)->req_error)) { \
        mca_pml_bfo_completion_sendreq_has_error(sendreq, status, btl, \
                                                 type, description); \
        return; \
    }
279 
/**
 * Error-state check used in two places within the frag completion
 * function.
 *
 * It is first invoked to see whether an error occurred prior to the
 * completion event on the frag.  It is invoked a second time after the
 * scheduling routine has run, because the scheduling routine may have
 * detected that a BTL cached on the request had been removed and
 * therefore marked the request in error.  In that case the scheduling of
 * fragments can no longer proceed properly and, if there are no
 * outstanding events, the restart dance is initiated.  The handling is
 * delegated to mca_pml_bfo_completion_sendreq_has_error().
 *
 * Expands to a bare `if` statement ending in `return;`, so it is meant
 * for functions returning void.
 */
#define MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, type, description) \
    if (OPAL_UNLIKELY((sendreq)->req_error)) { \
        mca_pml_bfo_completion_sendreq_has_error(sendreq, status, btl, \
                                                 type, description); \
        return; \
    }
296 
/* Handles a FIN message that arrives after the send request was marked
 * in error: the message is dropped.  The status field is not checked
 * here, because it is the value returned in the FIN hdr.hdr_fail field
 * and may be used for other things.
 *
 * sendreq - send request the FIN refers to
 * btl     - BTL module; its btl_free entry is used to release `des`
 * des     - descriptor to release before returning
 *
 * All parameters are parenthesized in the expansion so that expression
 * arguments bind correctly (in particular under the casts and the `->`
 * operators); they are still expanded more than once, so avoid arguments
 * with side effects.
 *
 * Expands to a bare `if` statement that logs at verbosity 30 on
 * mca_pml_bfo_output, frees the descriptor and executes `return;`, so it
 * is meant for functions returning void. */
#define MCA_PML_BFO_RGET_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, btl, des) \
    if (OPAL_UNLIKELY((sendreq)->req_error)) { \
        opal_output_verbose(30, mca_pml_bfo_output, \
            "FIN: received on broken request, skipping, " \
            "PML=%d, src_req=%lx, dst_req=%lx, peer=%d", \
            (uint16_t)(sendreq)->req_send.req_base.req_sequence, \
            (unsigned long)(sendreq), \
            (unsigned long)(sendreq)->req_recv.pval, \
            (sendreq)->req_send.req_base.req_peer); \
        (btl)->btl_free((btl), (des)); \
        return; \
    }
313 
314 
/* Checks whether the send request is in error when a completion event
 * for an RDMA write is received.  If so, the error is handed to
 * mca_pml_bfo_completion_sendreq_has_error() (message type
 * MCA_PML_BFO_HDR_TYPE_PUT, description "RDMA write"), the RDMA fragment
 * is returned and the enclosing function returns.
 *
 * sendreq - send request being completed
 * status  - completion status forwarded to the helper
 * btl     - BTL module forwarded to the helper
 *
 * Parameters are parenthesized in the expansion so that expression
 * arguments bind correctly; `sendreq` is expanded twice, so avoid
 * arguments with side effects.
 *
 * NOTE: the macro also uses a variable named `frag` from the calling
 * scope (handed to MCA_PML_BFO_RDMA_FRAG_RETURN).
 *
 * Expands to a bare `if` statement ending in `return;`, so it is meant
 * for functions returning void. */
#define MCA_PML_BFO_PUT_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl) \
    if (OPAL_UNLIKELY((sendreq)->req_error)) { \
        mca_pml_bfo_completion_sendreq_has_error((sendreq), (status), (btl), \
                                                 MCA_PML_BFO_HDR_TYPE_PUT, \
                                                 "RDMA write"); \
        MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
        return; \
    }
324 
/* If the send request has been restarted at least once
 * (req_restartseq > 0), let mca_pml_bfo_update_rndv_fields() update the
 * header for the restarted RNDV or RGET message; otherwise do nothing.
 *
 * hdr     - header passed on to mca_pml_bfo_update_rndv_fields()
 * sendreq - pointer to the send request
 * type    - char* passed on to mca_pml_bfo_update_rndv_fields()
 *
 * Parameters are parenthesized in the expansion so that expression
 * arguments such as `reqs + i` work; `sendreq` is expanded twice, so
 * avoid arguments with side effects.
 *
 * Expands to a bare `if` statement: an `else` written directly after the
 * macro would bind to this `if`. */
#define MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, type) \
    if (0 < (sendreq)->req_restartseq) { \
        mca_pml_bfo_update_rndv_fields((hdr), (sendreq), (type)); \
    }
329 
/* If a bml_btl gets mapped out, then we need to adjust it based on the
 * btl from the callback function.  This macro and the
 * MCA_PML_BFO_CHECK_*_BML_BTL macros are called on every callback to
 * make sure the cached bml_btl still matches.
 *
 * FIN completion variant: when the cached bml_btl no longer refers to
 * the BTL the callback arrived on, the ompi_proc_t stored in the
 * descriptor's des_cbdata is used to reach the peer's BML endpoint
 * (proc_bml), and bml_btl is replaced by the entry of that endpoint's
 * btl_eager array found for the callback BTL.
 *
 * bml_btl - lvalue holding the cached bml_btl pointer; may be reassigned
 * cb_btl  - BTL module handed to the callback
 * des     - descriptor whose des_cbdata holds the ompi_proc_t
 *
 * The BTL parameter is intentionally not spelled `btl`: a macro
 * parameter with that name is also substituted into the member access
 * `->btl`, which made the macro compile only when the caller's variable
 * was literally named `btl`.  Parameters are parenthesized and the
 * temporaries carry a suffix so they do not shadow caller variables.
 *
 * Expands to a bare `if` statement: an `else` written directly after the
 * macro would bind to this `if`. */
#define MCA_PML_BFO_CHECK_EAGER_BML_BTL_ON_FIN_COMPLETION(bml_btl, cb_btl, des) \
    if ((bml_btl)->btl != (cb_btl)) { \
        ompi_proc_t *bfo_fin_proc_ = (ompi_proc_t*) (des)->des_cbdata; \
        mca_bml_base_endpoint_t *bfo_fin_endpoint_ = \
            (mca_bml_base_endpoint_t*) bfo_fin_proc_->proc_bml; \
        (bml_btl) = mca_bml_base_btl_array_find(&bfo_fin_endpoint_->btl_eager, \
                                                (cb_btl)); \
    }
/* When the cached bml_btl no longer refers to the BTL the callback
 * arrived on, ask mca_pml_bfo_find_sendreq_eager_bml_btl() to update it.
 *
 * bml_btl - lvalue holding the cached bml_btl pointer; its address is
 *           passed to the lookup, which may change it
 * cb_btl  - BTL module handed to the callback
 * sendreq - send request forwarded to the lookup
 * type    - char* forwarded to the lookup
 *
 * The BTL parameter is intentionally not spelled `btl`: a macro
 * parameter with that name is also substituted into the member access
 * `->btl`, which made the macro compile only when the caller's variable
 * was literally named `btl`.  All parameters are parenthesized.
 *
 * Expands to a bare `if` statement: an `else` written directly after the
 * macro would bind to this `if`. */
#define MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, cb_btl, sendreq, type) \
    if ((bml_btl)->btl != (cb_btl)) { \
        mca_pml_bfo_find_sendreq_eager_bml_btl(&(bml_btl), (cb_btl), \
                                               (sendreq), (type)); \
    }
/* When the cached bml_btl no longer refers to the BTL the callback
 * arrived on, ask mca_pml_bfo_find_sendreq_rdma_bml_btl() to update it.
 *
 * bml_btl - lvalue holding the cached bml_btl pointer; its address is
 *           passed to the lookup, which may change it
 * cb_btl  - BTL module handed to the callback
 * sendreq - send request forwarded to the lookup
 * type    - char* forwarded to the lookup
 *
 * The BTL parameter is intentionally not spelled `btl`: a macro
 * parameter with that name is also substituted into the member access
 * `->btl`, which made the macro compile only when the caller's variable
 * was literally named `btl`.  All parameters are parenthesized.
 *
 * Expands to a bare `if` statement: an `else` written directly after the
 * macro would bind to this `if`. */
#define MCA_PML_BFO_CHECK_SENDREQ_RDMA_BML_BTL(bml_btl, cb_btl, sendreq, type) \
    if ((bml_btl)->btl != (cb_btl)) { \
        mca_pml_bfo_find_sendreq_rdma_bml_btl(&(bml_btl), (cb_btl), \
                                              (sendreq), (type)); \
    }
348 
/* When the cached bml_btl no longer refers to the BTL the callback
 * arrived on, ask mca_pml_bfo_find_recvreq_eager_bml_btl() to update it.
 *
 * bml_btl - lvalue holding the cached bml_btl pointer; its address is
 *           passed to the lookup, which may change it
 * cb_btl  - BTL module handed to the callback
 * recvreq - receive request forwarded to the lookup
 * type    - char* forwarded to the lookup
 *
 * The BTL parameter is intentionally not spelled `btl`: a macro
 * parameter with that name is also substituted into the member access
 * `->btl`, which made the macro compile only when the caller's variable
 * was literally named `btl`.  All parameters are parenthesized.
 *
 * Expands to a bare `if` statement: an `else` written directly after the
 * macro would bind to this `if`. */
#define MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL(bml_btl, cb_btl, recvreq, type) \
    if ((bml_btl)->btl != (cb_btl)) { \
        mca_pml_bfo_find_recvreq_eager_bml_btl(&(bml_btl), (cb_btl), \
                                               (recvreq), (type)); \
    }
353 
/* When the cached bml_btl no longer refers to the BTL the callback
 * arrived on, ask mca_pml_bfo_find_recvreq_rdma_bml_btl() to update it.
 *
 * bml_btl - lvalue holding the cached bml_btl pointer; its address is
 *           passed to the lookup, which may change it
 * cb_btl  - BTL module handed to the callback
 * recvreq - receive request forwarded to the lookup
 * type    - char* forwarded to the lookup
 *
 * The BTL parameter is intentionally not spelled `btl`: a macro
 * parameter with that name is also substituted into the member access
 * `->btl`, which made the macro compile only when the caller's variable
 * was literally named `btl`.  All parameters are parenthesized.
 *
 * Expands to a bare `if` statement: an `else` written directly after the
 * macro would bind to this `if`. */
#define MCA_PML_BFO_CHECK_RECVREQ_RDMA_BML_BTL(bml_btl, cb_btl, recvreq, type) \
    if ((bml_btl)->btl != (cb_btl)) { \
        mca_pml_bfo_find_recvreq_rdma_bml_btl(&(bml_btl), (cb_btl), \
                                              (recvreq), (type)); \
    }
358 
/* When the cached bml_btl no longer refers to the BTL the callback
 * arrived on, ask mca_pml_bfo_update_eager_bml_btl_recv_ctl() to update
 * it.
 *
 * bml_btl - lvalue holding the cached bml_btl pointer; its address is
 *           passed to the helper, which may change it
 * cb_btl  - BTL module handed to the callback
 * des     - descriptor forwarded to the helper
 *
 * The BTL parameter is intentionally not spelled `btl`: a macro
 * parameter with that name is also substituted into the member access
 * `->btl`, which made the macro compile only when the caller's variable
 * was literally named `btl`.  All parameters are parenthesized.
 *
 * Expands to a bare `if` statement: an `else` written directly after the
 * macro would bind to this `if`. */
#define MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL_RECV_CTL(bml_btl, cb_btl, des) \
    if ((bml_btl)->btl != (cb_btl)) { \
        mca_pml_bfo_update_eager_bml_btl_recv_ctl(&(bml_btl), (cb_btl), (des)); \
    }
363 
/* Handles a PUT for which no BTL is left to perform the RDMA write
 * (frag->rdma_bml is NULL).
 *
 * In that case the situation is logged at verbosity 30, the RDMA
 * fragment is returned, the send request's req_error counter is
 * incremented and, if no events are outstanding on the request
 * (req_events == 0), mca_pml_bfo_send_request_rndvrestartnotify() is
 * called (repost = false, tag MCA_PML_BFO_HDR_TYPE_PUT, status
 * OMPI_ERROR).  The enclosing function then returns.
 *
 * sendreq - send request the PUT refers to
 * frag    - RDMA fragment whose rdma_bml is inspected
 * btl     - BTL module forwarded to the restart notification
 *
 * All parameters are parenthesized in the expansion so that expression
 * arguments bind correctly; they are still expanded more than once, so
 * avoid arguments with side effects.
 *
 * Expands to a bare `if` statement ending in `return;`, so it is meant
 * for functions returning void. */
#define MCA_PML_BFO_CHECK_FOR_REMOVED_BML(sendreq, frag, btl) \
    if (OPAL_UNLIKELY(NULL == (frag)->rdma_bml)) { \
        opal_output_verbose(30, mca_pml_bfo_output, \
            "PUT received: no matching BTL to RDMA write to, oustanding " \
            "events=%d, PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
            (sendreq)->req_events, \
            (uint16_t)(sendreq)->req_send.req_base.req_sequence, \
            (sendreq)->req_restartseq, (void *)(sendreq), \
            (sendreq)->req_recv.pval, \
            (sendreq)->req_send.req_base.req_peer); \
        MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
        (sendreq)->req_error++; \
        if (0 == (sendreq)->req_events) { \
            mca_pml_bfo_send_request_rndvrestartnotify((sendreq), false, \
                                                       MCA_PML_BFO_HDR_TYPE_PUT, \
                                                       OMPI_ERROR, (btl)); \
        } \
        return; \
    }
382 
/* Checks whether the number of BTLs cached in the range
 * (range->range_btl_cnt) still matches the current size of the
 * endpoint's btl_send array.  If it does not, a BTL was removed from the
 * available list: the send request's req_error counter is incremented
 * and the enclosing function returns OMPI_ERROR so the request can be
 * started over.
 *
 * sendreq - pointer to the send request (req_endpoint, req_error used)
 * range   - pointer to the range holding the cached BTL count
 *
 * Both parameters are parenthesized in the expansion so that expression
 * arguments such as `reqs + i` work; `sendreq` is expanded twice, so
 * avoid arguments with side effects.
 *
 * Expands to a bare `if` statement containing `return OMPI_ERROR;`, so
 * it is meant for functions returning an int-compatible status. */
#define MCA_PML_BFO_CHECK_FOR_REMOVED_BTL(sendreq, range) \
    if ((int)mca_bml_base_btl_array_get_size(&(sendreq)->req_endpoint->btl_send) \
        != (range)->range_btl_cnt) { \
        (sendreq)->req_error++; \
        return OMPI_ERROR; \
    }
394 
395 
396 END_C_DECLS
397 
398 #endif
void mca_pml_bfo_recv_request_reset(mca_pml_bfo_recv_request_t *recvreq)
Reset all the receive request fields to match what a request looks like when it is first started...
Definition: pml_bfo_failover.c:1266
void mca_pml_bfo_recv_frag_callback_rndvrestartnack(mca_btl_base_module_t *btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t *descriptor, void *cbdata)
Callback for when a RNDVRESTARTNACK message is received.
Definition: pml_bfo_failover.c:604
Definition: pml_bfo_recvreq.h:41
A descriptor that holds the parameters to a send/put/get operation along w/ a callback routine that i...
Definition: btl.h:275
void mca_pml_bfo_repost_fin(struct mca_btl_base_descriptor_t *des)
Repost a FIN message if we get an error on the completion event.
Definition: pml_bfo_failover.c:264
void mca_pml_bfo_recv_frag_callback_rndvrestartack(mca_btl_base_module_t *btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t *descriptor, void *cbdata)
Callback for when a RNDVRESTARTACK message is received.
Definition: pml_bfo_failover.c:459
BEGIN_C_DECLS bool mca_pml_bfo_is_duplicate_msg(mca_pml_bfo_comm_proc_t *proc, mca_pml_bfo_match_hdr_t *hdr)
When running with failover enabled, check the PML sequence numbers to see if we have received a dupli...
Definition: pml_bfo_failover.c:71
Header definition for the first fragment, contains the attributes required to match the corresponding...
Definition: pml_bfo_hdr.h:77
void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t *olddes, ompi_proc_t *ompi_proc, bool repost)
Called after the receipt of a RNDVRESTARTNOTIFY message to a request that no longer matches...
Definition: pml_bfo_failover.c:1192
void mca_pml_bfo_update_eager_bml_btl_recv_ctl(mca_bml_base_btl_t **bml_btl, mca_btl_base_module_t *btl, struct mca_btl_base_descriptor_t *des)
The following set of functions are all called when it is determined that the cached bml_btl->btl does...
Definition: pml_bfo_failover.c:1953
Union of defined hdr types.
Definition: pml_bfo_hdr.h:441
bool mca_pml_bfo_rndv_completion_status_error(struct mca_btl_base_descriptor_t *des, mca_pml_bfo_send_request_t *sendreq)
The completion event for the RNDV message has returned with an error.
Definition: pml_bfo_failover.c:2114
void mca_pml_bfo_rndvrestartnotify_completion(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, struct mca_btl_base_descriptor_t *des, int status)
Completion callback for rndvrestartnotify completion event.
Definition: pml_bfo_failover.c:966
Remote Open MPI process structure.
Definition: proc.h:56
void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t *sendreq, bool repost, mca_btl_base_tag_t tag, int status, mca_btl_base_module_t *btl)
This function gets called when failover is enabled and an error occurs during the rendezvous protocol...
Definition: pml_bfo_failover.c:660
void mca_pml_bfo_update_rndv_fields(mca_pml_bfo_hdr_t *hdr, mca_pml_bfo_send_request_t *, char *type)
Update a few fields when we are restarting either a RNDV or RGET type message.
Definition: pml_bfo_failover.c:1929
Byte Transfer Layer (BTL)
void mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t *btl, struct mca_btl_base_descriptor_t *des, int status)
Call each time we get a completion event on ACK or PUT message.
Definition: pml_bfo_failover.c:1766
void mca_pml_bfo_send_request_restart(mca_pml_bfo_send_request_t *sendreq, bool repost, mca_btl_base_tag_t tag)
This function restarts a RNDV send request.
Definition: pml_bfo_failover.c:744
bool mca_pml_bfo_is_duplicate_fin(mca_pml_bfo_hdr_t *hdr, mca_btl_base_descriptor_t *rdma, mca_btl_base_module_t *btl)
This function checks to see if we have received a duplicate FIN message.
Definition: pml_bfo_failover.c:139
void mca_pml_bfo_recv_request_recverrnotify(mca_pml_bfo_recv_request_t *recvreq, mca_btl_base_tag_t tag, int status)
This function is called when an error is detected on a completion event on the receiving side...
Definition: pml_bfo_failover.c:1038
State of ELAN endpoint connection.
Definition: btl_elan_endpoint.h:33
void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t *sendreq, int status, mca_btl_base_module_t *btl, int type, char *description)
Check to see if an error has occurred on this send request.
Definition: pml_bfo_failover.c:2136
mca_pml_bfo_recv_request_t * mca_pml_bfo_get_request(mca_pml_bfo_match_hdr_t *hdr)
This function is called when a RNDV or RGET is received with the FLAGS_RESTART flag set...
Definition: pml_bfo_failover.c:309
Definition: bml.h:58
void mca_pml_bfo_recv_request_rndvrestartack(mca_pml_bfo_recv_request_t *recvreq, mca_btl_base_tag_t tag, int status, mca_btl_base_module_t *btl)
This function is called when it may be time to send a RNDVRESTARTACK message back to the sending side...
Definition: pml_bfo_failover.c:1109
void mca_pml_bfo_recv_frag_callback_recverrnotify(mca_btl_base_module_t *btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t *descriptor, void *cbdata)
Callback for when a RECVERRNOTIFY message is received.
Definition: pml_bfo_failover.c:520
int mca_pml_bfo_register_callbacks(void)
Register four functions to handle extra PML message types that are utilized when a failover occurs...
Definition: pml_bfo_failover.c:1894
Definition: pml_bfo_comm.h:31
Definition: pml_bfo_sendreq.h:41
BTL module interface functions and attributes.
Definition: btl.h:786
void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t *des)
This function will repost a match fragment.
Definition: pml_bfo_failover.c:854
void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t *btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t *descriptor, void *cbdata)
Four new callbacks for the four new message types.
Definition: pml_bfo_failover.c:374