OpenMPI  0.1.1
pml_bfo_failover.h File Reference

Functions that implement failover capabilities. More...

#include "ompi/mca/btl/btl.h"
#include "pml_bfo_hdr.h"

Go to the source code of this file.

Macros

#define MCA_PML_BFO_ERROR_CHECK_ON_ACK_CALLBACK(sendreq)
 A bunch of macros to help isolate failover code from regular ob1 code. More...
 
#define MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq)
 
#define MCA_PML_BFO_ERROR_CHECK_ON_PUT_CALLBACK(sendreq)
 
#define MCA_PML_BFO_ERROR_CHECK_ON_FIN_FOR_PUT(recvreq)
 Macros for pml_bfo_recvreq.c file. More...
 
#define MCA_PML_BFO_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq)
 
#define MCA_PML_BFO_SECOND_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq, status, btl)
 
#define MCA_PML_BFO_VERIFY_SENDREQ_REQ_STATE_VALUE(sendreq)
 Macros for pml_bfo_sendreq.c file. More...
 
#define MCA_PML_BFO_RNDV_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, type, description)
 
#define MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, type, description)
 This macro is called within the frag completion function in two places. More...
 
#define MCA_PML_BFO_RGET_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, btl, des)
 
#define MCA_PML_BFO_PUT_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl)
 
#define MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, type)
 
#define MCA_PML_BFO_CHECK_EAGER_BML_BTL_ON_FIN_COMPLETION(bml_btl, btl, des)
 
#define MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, type)
 
#define MCA_PML_BFO_CHECK_SENDREQ_RDMA_BML_BTL(bml_btl, btl, sendreq, type)
 
#define MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL(bml_btl, btl, recvreq, type)
 
#define MCA_PML_BFO_CHECK_RECVREQ_RDMA_BML_BTL(bml_btl, btl, recvreq, type)
 
#define MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL_RECV_CTL(bml_btl, btl, des)
 
#define MCA_PML_BFO_CHECK_FOR_REMOVED_BML(sendreq, frag, btl)
 
#define MCA_PML_BFO_CHECK_FOR_REMOVED_BTL(sendreq, range)
 

Functions

BEGIN_C_DECLS bool mca_pml_bfo_is_duplicate_msg (mca_pml_bfo_comm_proc_t *proc, mca_pml_bfo_match_hdr_t *hdr)
 When running with failover enabled, check the PML sequence numbers to see if we have received a duplicate message. More...
 
bool mca_pml_bfo_is_duplicate_fin (mca_pml_bfo_hdr_t *hdr, mca_btl_base_descriptor_t *rdma, mca_btl_base_module_t *btl)
 This function checks to see if we have received a duplicate FIN message. More...
 
mca_pml_bfo_recv_request_t * mca_pml_bfo_get_request (mca_pml_bfo_match_hdr_t *hdr)
 This function is called when a RNDV or RGET is received with the FLAGS_RESTART flag set. More...
 
void mca_pml_bfo_send_request_restart (mca_pml_bfo_send_request_t *sendreq, bool repost, mca_btl_base_tag_t tag)
 This function restarts a RNDV send request. More...
 
void mca_pml_bfo_send_request_rndvrestartnotify (mca_pml_bfo_send_request_t *sendreq, bool repost, mca_btl_base_tag_t tag, int status, mca_btl_base_module_t *btl)
 This function gets called when failover is enabled and an error occurs during the rendezvous protocol. More...
 
void mca_pml_bfo_rndvrestartnotify_completion (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, struct mca_btl_base_descriptor_t *des, int status)
 Completion callback for rndvrestartnotify completion event. More...
 
void mca_pml_bfo_check_recv_ctl_completion_status (mca_btl_base_module_t *btl, struct mca_btl_base_descriptor_t *des, int status)
 Call each time we get a completion event on ACK or PUT message. More...
 
void mca_pml_bfo_recv_request_reset (mca_pml_bfo_recv_request_t *recvreq)
 Reset all the receive request fields to match what a request looks like when it is first started. More...
 
void mca_pml_bfo_recv_request_recverrnotify (mca_pml_bfo_recv_request_t *recvreq, mca_btl_base_tag_t tag, int status)
 This function is called when an error is detected on a completion event on the receiving side. More...
 
void mca_pml_bfo_recv_request_rndvrestartack (mca_pml_bfo_recv_request_t *recvreq, mca_btl_base_tag_t tag, int status, mca_btl_base_module_t *btl)
 This function is called when it may be time to send a RNDVRESTARTACK message back to the sending side. More...
 
void mca_pml_bfo_recv_request_rndvrestartnack (mca_btl_base_descriptor_t *olddes, ompi_proc_t *ompi_proc, bool repost)
 Called after the receipt of a RNDVRESTARTNOTIFY message to a request that no longer matches. More...
 
void mca_pml_bfo_recv_restart_completion (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, struct mca_btl_base_descriptor_t *des, int status)
 
void mca_pml_bfo_failover_error_handler (struct mca_btl_base_module_t *btl, int32_t flags, ompi_proc_t *errproc, char *btlname)
 
void mca_pml_bfo_repost_match_fragment (struct mca_btl_base_descriptor_t *des)
 This function will repost a match fragment. More...
 
void mca_pml_bfo_repost_fin (struct mca_btl_base_descriptor_t *des)
 Repost a FIN message if we get an error on the completion event.
 
void mca_pml_bfo_map_out_btl (struct mca_btl_base_module_t *btl, ompi_proc_t *errproc, char *btlname)
 
void mca_pml_bfo_map_out (mca_btl_base_module_t *btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t *descriptor, void *cbdata)
 
int mca_pml_bfo_register_callbacks (void)
 Register four functions to handle extra PML message types that are utilized when a failover occurs.
 
void mca_pml_bfo_update_rndv_fields (mca_pml_bfo_hdr_t *hdr, mca_pml_bfo_send_request_t *, char *type)
 Update a few fields when we are restarting either a RNDV or RGET type message.
 
void mca_pml_bfo_update_bml_btl (mca_bml_base_btl_t **bml_btl, mca_btl_base_module_t *btl, struct mca_btl_base_descriptor_t *des)
 
void mca_pml_bfo_find_recvreq_eager_bml_btl (mca_bml_base_btl_t **bml_btl, mca_btl_base_module_t *btl, mca_pml_bfo_recv_request_t *recvreq, char *type)
 
void mca_pml_bfo_find_sendreq_eager_bml_btl (mca_bml_base_btl_t **bml_btl, mca_btl_base_module_t *btl, mca_pml_bfo_send_request_t *sendreq, char *type)
 
void mca_pml_bfo_find_sendreq_rdma_bml_btl (mca_bml_base_btl_t **bml_btl, mca_btl_base_module_t *btl, mca_pml_bfo_send_request_t *sendreq, char *type)
 
void mca_pml_bfo_update_eager_bml_btl_recv_ctl (mca_bml_base_btl_t **bml_btl, mca_btl_base_module_t *btl, struct mca_btl_base_descriptor_t *des)
 The following set of functions are all called when it is determined that the cached bml_btl->btl does not match the btl handed back by the callback function. More...
 
void mca_pml_bfo_find_recvreq_rdma_bml_btl (mca_bml_base_btl_t **bml_btl, mca_btl_base_module_t *btl, mca_pml_bfo_recv_request_t *recvreq, char *type)
 
bool mca_pml_bfo_rndv_completion_status_error (struct mca_btl_base_descriptor_t *des, mca_pml_bfo_send_request_t *sendreq)
 The completion event for the RNDV message has returned with an error. More...
 
void mca_pml_bfo_send_ctl_completion_status_error (struct mca_btl_base_descriptor_t *des)
 
void mca_pml_bfo_completion_sendreq_has_error (mca_pml_bfo_send_request_t *sendreq, int status, mca_btl_base_module_t *btl, int type, char *description)
 Check to see if an error has occurred on this send request. More...
 
void mca_pml_bfo_recv_frag_callback_rndvrestartnotify (mca_btl_base_module_t *btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t *descriptor, void *cbdata)
 Four new callbacks for the four new message types. More...
 
void mca_pml_bfo_recv_frag_callback_rndvrestartack (mca_btl_base_module_t *btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t *descriptor, void *cbdata)
 Callback for when a RNDVRESTARTACK message is received. More...
 
void mca_pml_bfo_recv_frag_callback_rndvrestartnack (mca_btl_base_module_t *btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t *descriptor, void *cbdata)
 Callback for when a RNDVRESTARTNACK message is received. More...
 
void mca_pml_bfo_recv_frag_callback_recverrnotify (mca_btl_base_module_t *btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t *descriptor, void *cbdata)
 Callback for when a RECVERRNOTIFY message is received. More...
 

Detailed Description

Functions that implement failover capabilities.

Macro Definition Documentation

#define MCA_PML_BFO_CHECK_EAGER_BML_BTL_ON_FIN_COMPLETION (   bml_btl,
  btl,
  des 
)
Value:
if (bml_btl->btl != btl) { \
ompi_proc_t *proc = (ompi_proc_t*) des->des_cbdata; \
mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; \
bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_eager, btl); \
}
Remote Open MPI process structure.
Definition: proc.h:56
Structure associated w/ ompi_proc_t that contains the set of BTLs used to reach a destination...
Definition: bml.h:222
static mca_bml_base_btl_t * mca_bml_base_btl_array_find(mca_bml_base_btl_array_t *array, struct mca_btl_base_module_t *btl)
Locate an element in the array.
Definition: bml.h:206
#define MCA_PML_BFO_CHECK_FOR_REMOVED_BML (   sendreq,
  frag,
  btl 
)
Value:
if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"PUT received: no matching BTL to RDMA write to, oustanding " \
"events=%d, PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
sendreq->req_events, \
(uint16_t)sendreq->req_send.req_base.req_sequence, \
sendreq->req_restartseq, (void *)sendreq, \
sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); \
MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
sendreq->req_error++; \
if (0 == sendreq->req_events) { \
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, \
MCA_PML_BFO_HDR_TYPE_PUT, \
OMPI_ERROR, btl); \
} \
return; \
}
OPAL_DECLSPEC void OPAL_DECLSPEC void opal_output_verbose(int verbose_level, int output_id, const char *format,...) __opal_attribute_format__(__printf__
Send output to a stream only if the passed verbosity level is high enough.
void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t *sendreq, bool repost, mca_btl_base_tag_t tag, int status, mca_btl_base_module_t *btl)
This function gets called when failover is enabled and an error occurs during the rendezvous protocol...
Definition: pml_bfo_failover.c:660
#define MCA_PML_BFO_CHECK_FOR_REMOVED_BTL (   sendreq,
  range 
)
Value:
if ((int)mca_bml_base_btl_array_get_size(&sendreq->req_endpoint->btl_send) \
!= range->range_btl_cnt) { \
sendreq->req_error++; \
return OMPI_ERROR; \
}
static size_t mca_bml_base_btl_array_get_size(mca_bml_base_btl_array_t *array)
If required, reallocate (grow) the array to the indicated size.
Definition: bml.h:91
#define MCA_PML_BFO_CHECK_FOR_RNDV_RESTART (   hdr,
  sendreq,
  type 
)
Value:
if (0 < sendreq->req_restartseq) { \
mca_pml_bfo_update_rndv_fields(hdr, sendreq, type); \
}
void mca_pml_bfo_update_rndv_fields(mca_pml_bfo_hdr_t *hdr, mca_pml_bfo_send_request_t *, char *type)
Update a few fields when we are restarting either a RNDV or RGET type message.
Definition: pml_bfo_failover.c:1929
#define MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL (   bml_btl,
  btl,
  recvreq,
  type 
)
Value:
if (bml_btl->btl != btl) { \
mca_pml_bfo_find_recvreq_eager_bml_btl(&bml_btl, btl, recvreq, type); \
}
#define MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL_RECV_CTL (   bml_btl,
  btl,
  des 
)
Value:
if (bml_btl->btl != btl) { \
mca_pml_bfo_update_eager_bml_btl_recv_ctl(&bml_btl, btl, des); \
}
void mca_pml_bfo_update_eager_bml_btl_recv_ctl(mca_bml_base_btl_t **bml_btl, mca_btl_base_module_t *btl, struct mca_btl_base_descriptor_t *des)
The following set of functions are all called when it is determined that the cached bml_btl->btl does...
Definition: pml_bfo_failover.c:1953
#define MCA_PML_BFO_CHECK_RECVREQ_RDMA_BML_BTL (   bml_btl,
  btl,
  recvreq,
  type 
)
Value:
if (bml_btl->btl != btl) { \
mca_pml_bfo_find_recvreq_rdma_bml_btl(&bml_btl, btl, recvreq, type); \
}
#define MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL (   bml_btl,
  btl,
  sendreq,
  type 
)
Value:
if (bml_btl->btl != btl) { \
mca_pml_bfo_find_sendreq_eager_bml_btl(&bml_btl, btl, sendreq, type); \
}
#define MCA_PML_BFO_CHECK_SENDREQ_RDMA_BML_BTL (   bml_btl,
  btl,
  sendreq,
  type 
)
Value:
if (bml_btl->btl != btl) { \
mca_pml_bfo_find_sendreq_rdma_bml_btl(&bml_btl, btl, sendreq, type); \
}
#define MCA_PML_BFO_ERROR_CHECK_ON_ACK_CALLBACK (   sendreq)
Value:
if( OPAL_UNLIKELY((sendreq)->req_error)) { \
opal_output_verbose(20, mca_pml_bfo_output, \
"ACK: received: dropping because request in error, " \
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
(uint16_t)(sendreq)->req_send.req_base.req_sequence, \
(sendreq)->req_restartseq, \
(void *)(sendreq), (sendreq)->req_recv.pval, \
(sendreq)->req_send.req_base.req_peer); \
return; \
}
OPAL_DECLSPEC void OPAL_DECLSPEC void opal_output_verbose(int verbose_level, int output_id, const char *format,...) __opal_attribute_format__(__printf__
Send output to a stream only if the passed verbosity level is high enough.

A bunch of macros to help isolate failover code from regular ob1 code.

Referenced by mca_pml_bfo_recv_frag_callback_ack().

#define MCA_PML_BFO_ERROR_CHECK_ON_FIN_FOR_PUT (   recvreq)
Value:
if( OPAL_UNLIKELY((recvreq)->req_errstate)) { \
opal_output_verbose(20, mca_pml_bfo_output, \
"FIN: received on broken request, skipping, " \
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
(recvreq)->req_msgseq, (recvreq)->req_restartseq, \
(recvreq)->remote_req_send.pval, (void *)(recvreq), \
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
/* Even though in error, it still might complete. */ \
recv_request_pml_complete_check(recvreq); \
return; \
}
OPAL_DECLSPEC void OPAL_DECLSPEC void opal_output_verbose(int verbose_level, int output_id, const char *format,...) __opal_attribute_format__(__printf__
Send output to a stream only if the passed verbosity level is high enough.

Macros for pml_bfo_recvreq.c file.

#define MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK (   recvreq)
Value:
if( OPAL_UNLIKELY((recvreq)->req_errstate)) { \
opal_output_verbose(20, mca_pml_bfo_output, \
"FRAG: received: dropping because request in error, " \
"PML=%d, src_req=%p, dst_req=%p, peer=%d, offset=%d", \
(uint16_t)(recvreq)->req_msgseq, \
(recvreq)->remote_req_send.pval, \
(void *)(recvreq), \
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, \
(int)hdr->hdr_frag.hdr_frag_offset); \
return; \
}
OPAL_DECLSPEC void OPAL_DECLSPEC void opal_output_verbose(int verbose_level, int output_id, const char *format,...) __opal_attribute_format__(__printf__
Send output to a stream only if the passed verbosity level is high enough.
#define MCA_PML_BFO_ERROR_CHECK_ON_PUT_CALLBACK (   sendreq)
Value:
if( OPAL_UNLIKELY((sendreq)->req_error)) { \
opal_output_verbose(20, mca_pml_bfo_output, \
"PUT: received: dropping because request in error, " \
"PML=%d, src_req=%p, dst_req=%p, peer=%d", \
(uint16_t)(sendreq)->req_send.req_base.req_sequence, \
(void *)(sendreq), (sendreq)->req_recv.pval, \
(sendreq)->req_send.req_base.req_peer); \
return; \
}
OPAL_DECLSPEC void OPAL_DECLSPEC void opal_output_verbose(int verbose_level, int output_id, const char *format,...) __opal_attribute_format__(__printf__
Send output to a stream only if the passed verbosity level is high enough.
#define MCA_PML_BFO_ERROR_CHECK_ON_RDMA_READ_COMPLETION (   recvreq)
Value:
if ((recvreq)->req_errstate) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"RDMA read: completion failed, error already seen, " \
"PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", \
(recvreq)->req_msgseq, (recvreq)->req_restartseq, \
(unsigned long)(recvreq)->remote_req_send.pval, \
(unsigned long)(recvreq), \
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
return; \
} else { \
opal_output_verbose(30, mca_pml_bfo_output, \
"RDMA read: completion failed, sending RECVERRNOTIFY to " \
"sender, PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", \
(recvreq)->req_msgseq, (recvreq)->req_restartseq, \
(unsigned long)(recvreq)->remote_req_send.pval, \
(unsigned long)(recvreq), \
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, status); \
}
OPAL_DECLSPEC void OPAL_DECLSPEC void opal_output_verbose(int verbose_level, int output_id, const char *format,...) __opal_attribute_format__(__printf__
Send output to a stream only if the passed verbosity level is high enough.
void mca_pml_bfo_recv_request_recverrnotify(mca_pml_bfo_recv_request_t *recvreq, mca_btl_base_tag_t tag, int status)
This function is called when an error is detected on a completion event on the receiving side...
Definition: pml_bfo_failover.c:1038
#define MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK (   sendreq,
  status,
  btl,
  type,
  description 
)
Value:
if( OPAL_UNLIKELY((sendreq)->req_error)) { \
mca_pml_bfo_completion_sendreq_has_error(sendreq, status, \
btl, type, description); \
return; \
}
void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t *sendreq, int status, mca_btl_base_module_t *btl, int type, char *description)
Check to see if an error has occurred on this send request.
Definition: pml_bfo_failover.c:2136

This macro is called within the frag completion function in two places.

It is called to see if any errors occur prior to the completion event on the frag. It is then called a second time after the scheduling routine is called, as the scheduling routine may have detected that a BTL that was cached on the request had been removed and therefore marked the request in error. In that case, the scheduling of fragments can no longer proceed properly, and if there are no outstanding events, the restart dance is initiated.

#define MCA_PML_BFO_PUT_COMPLETION_SENDREQ_ERROR_CHECK (   sendreq,
  status,
  btl 
)
Value:
if ( OPAL_UNLIKELY(sendreq->req_error)) { \
mca_pml_bfo_completion_sendreq_has_error(sendreq, status, btl, \
MCA_PML_BFO_HDR_TYPE_PUT, "RDMA write"); \
MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
return; \
}
void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t *sendreq, int status, mca_btl_base_module_t *btl, int type, char *description)
Check to see if an error has occurred on this send request.
Definition: pml_bfo_failover.c:2136
#define MCA_PML_BFO_RGET_COMPLETION_SENDREQ_ERROR_CHECK (   sendreq,
  btl,
  des 
)
Value:
if( OPAL_UNLIKELY(sendreq->req_error)) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"FIN: received on broken request, skipping, " \
"PML=%d, src_req=%lx, dst_req=%lx, peer=%d", \
(uint16_t)sendreq->req_send.req_base.req_sequence, \
(unsigned long)sendreq, (unsigned long)sendreq->req_recv.pval, \
sendreq->req_send.req_base.req_peer); \
btl->btl_free(btl, des); \
return; \
}
OPAL_DECLSPEC void OPAL_DECLSPEC void opal_output_verbose(int verbose_level, int output_id, const char *format,...) __opal_attribute_format__(__printf__
Send output to a stream only if the passed verbosity level is high enough.
#define MCA_PML_BFO_RNDV_COMPLETION_SENDREQ_ERROR_CHECK (   sendreq,
  status,
  btl,
  type,
  description 
)
Value:
if( OPAL_UNLIKELY ((sendreq)->req_error)) { \
mca_pml_bfo_completion_sendreq_has_error(sendreq, status, \
btl, type, description); \
return; \
}
void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t *sendreq, int status, mca_btl_base_module_t *btl, int type, char *description)
Check to see if an error has occurred on this send request.
Definition: pml_bfo_failover.c:2136
#define MCA_PML_BFO_SECOND_ERROR_CHECK_ON_RDMA_READ_COMPLETION (   recvreq,
  status,
  btl 
)
Value:
/* See if the request has received a RNDVRESTARTNOTIFY */ \
if( OPAL_UNLIKELY(recvreq->req_errstate)) { \
if (recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"RDMA read: completion: recvreq has error, outstanding events=%d " \
"PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, status=%d, peer=%d", \
recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq, \
(unsigned long)recvreq->remote_req_send.pval, \
(unsigned long)recvreq, status, \
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
if (0 == recvreq->req_events) { \
mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, \
status, btl); \
} \
} \
MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
return; \
}
OPAL_DECLSPEC void OPAL_DECLSPEC void opal_output_verbose(int verbose_level, int output_id, const char *format,...) __opal_attribute_format__(__printf__
Send output to a stream only if the passed verbosity level is high enough.
void mca_pml_bfo_recv_request_rndvrestartack(mca_pml_bfo_recv_request_t *recvreq, mca_btl_base_tag_t tag, int status, mca_btl_base_module_t *btl)
This function is called when it may be time to send a RNDVRESTARTACK message back to the sending side...
Definition: pml_bfo_failover.c:1109
#define MCA_PML_BFO_VERIFY_SENDREQ_REQ_STATE_VALUE (   sendreq)
Value:
if (sendreq->req_state == -1) { \
OPAL_THREAD_ADD32(&sendreq->req_state, 1); \
}
#define OPAL_THREAD_ADD32(x, y)
Use an atomic operation for increment/decrement if opal_using_threads() indicates that threads are in...
Definition: mutex.h:367

Macros for pml_bfo_sendreq.c file.

Function Documentation

void mca_pml_bfo_check_recv_ctl_completion_status ( mca_btl_base_module_t * btl,
struct mca_btl_base_descriptor_t * des,
int  status 
)

Call each time we get a completion event on ACK or PUT message.

These types of messages are receive control type messages. This function is only called if the underlying BTL supports failover. Otherwise, there is no need for this check.

References mca_btl_base_descriptor_t::des_cbdata, mca_btl_base_descriptor_t::des_src, mca_pml_bfo_rdma_hdr_t::hdr_des, mca_pml_bfo_common_hdr_t::hdr_type, mca_pml_bfo_recv_request_recverrnotify(), mca_pml_bfo_recv_request_rndvrestartack(), opal_output_verbose(), mca_pml_base_recv_request_t::req_base, mca_pml_base_request_t::req_ompi, ompi_request_t::req_status, and mca_btl_base_segment_t::seg_addr.

void mca_pml_bfo_completion_sendreq_has_error ( mca_pml_bfo_send_request_t * sendreq,
int  status,
mca_btl_base_module_t * btl,
int  type,
char *  description 
)

Check to see if an error has occurred on this send request.

If it has and there are no outstanding events, then we can start the restart dance.

References mca_pml_bfo_send_request_rndvrestartnotify(), opal_output_verbose(), mca_pml_base_send_request_t::req_base, mca_pml_base_request_t::req_peer, and mca_pml_base_request_t::req_sequence.

bool mca_pml_bfo_is_duplicate_fin ( mca_pml_bfo_hdr_t * hdr,
mca_btl_base_descriptor_t * rdma,
mca_btl_base_module_t * btl 
)

This function checks to see if we have received a duplicate FIN message.

This is done by first pulling the pointer of the request that the FIN message is pointing to from the message. We then check the various fields in the request to the fields in the header and make sure they match. If they do not, then the request must have been recycled already and this is a duplicate FIN message. We have to do this check on every FIN message that we receive.

References mca_btl_base_module_t::btl_flags, mca_btl_base_descriptor_t::des_cbdata, mca_btl_base_descriptor_t::des_flags, mca_pml_bfo_fin_hdr_t::hdr_common, mca_pml_bfo_common_hdr_t::hdr_flags, opal_output_verbose(), mca_pml_base_recv_request_t::req_base, mca_pml_base_send_request_t::req_base, mca_pml_base_request_t::req_comm, mca_pml_base_request_t::req_ompi, mca_pml_base_request_t::req_peer, mca_pml_base_request_t::req_sequence, ompi_request_t::req_status, and mca_pml_base_request_t::req_type.

Referenced by mca_pml_bfo_recv_frag_callback_fin().

BEGIN_C_DECLS bool mca_pml_bfo_is_duplicate_msg ( mca_pml_bfo_comm_proc_t * proc,
mca_pml_bfo_match_hdr_t * hdr 
)

When running with failover enabled, check the PML sequence numbers to see if we have received a duplicate message.

This check is done for all MATCH fragments. It is also done for RNDV and RGET fragments that do not have the MCA_PML_BFO_HDR_FLAGS_RESTART flag set. We set the window size to half the total range of sequence numbers. We only enter this code when the seq_num is not the expected one. A few more notes on the algorithm used here. In normal operation, the expected value will either be equal to or less than the sequence number of the header. This is because we are using this sequence number to detect packets arriving prior to them being expected. If we determine that expected is less than header, then make sure this is not a rollover case. We do that by adding the maxnum to the expected.

Parameters
proc: Pointer to proc from where message came
hdr: Pointer to header of message

References mca_pml_bfo_comm_proc_t::expected_sequence, mca_pml_bfo_comm_proc_t::frags_cant_match, mca_pml_bfo_match_hdr_t::hdr_common, mca_pml_bfo_match_hdr_t::hdr_seq, mca_pml_bfo_common_hdr_t::hdr_type, opal_list_get_end(), opal_list_get_first(), opal_list_get_next, opal_list_get_size(), opal_output(), and opal_output_verbose().

Referenced by mca_pml_bfo_recv_frag_callback_match(), and mca_pml_bfo_recv_frag_match().

void mca_pml_bfo_recv_frag_callback_recverrnotify ( mca_btl_base_module_t * btl,
mca_btl_base_tag_t  tag,
mca_btl_base_descriptor_t * des,
void *  cbdata 
)

Callback for when a RECVERRNOTIFY message is received.

This message is sent from the receiver to the sender and tells the sender that the receiver has seen an error. This will trigger the sender to start the request restart sequence.

References mca_btl_base_descriptor_t::des_dst, mca_pml_bfo_match_hdr_t::hdr_ctx, mca_pml_bfo_match_hdr_t::hdr_seq, mca_pml_bfo_match_hdr_t::hdr_src, mca_pml_bfo_send_request_rndvrestartnotify(), opal_output_verbose(), and mca_btl_base_segment_t::seg_addr.

Referenced by mca_pml_bfo_register_callbacks().

void mca_pml_bfo_recv_frag_callback_rndvrestartack ( mca_btl_base_module_t * btl,
mca_btl_base_tag_t  tag,
mca_btl_base_descriptor_t * des,
void *  cbdata 
)

Callback for when a RNDVRESTARTACK message is received.

This message is sent from the receiver to the sender to acknowledge the receipt of the RNDVRESTARTNOTIFY message. At this point, the sender can reset the send request and restart the message.

References mca_btl_base_descriptor_t::des_dst, mca_pml_bfo_match_hdr_t::hdr_ctx, mca_pml_bfo_match_hdr_t::hdr_seq, mca_pml_bfo_match_hdr_t::hdr_src, mca_pml_bfo_send_request_restart(), opal_output_verbose(), and mca_btl_base_segment_t::seg_addr.

Referenced by mca_pml_bfo_register_callbacks().

void mca_pml_bfo_recv_frag_callback_rndvrestartnack ( mca_btl_base_module_t * btl,
mca_btl_base_tag_t  tag,
mca_btl_base_descriptor_t * des,
void *  cbdata 
)

Callback for when a RNDVRESTARTNACK message is received.

This message is sent from the receiver to the sender and tells the sender that the receiver has already completed the message and there is nothing else to be done. The sender should then just make the send request complete.

References mca_btl_base_descriptor_t::des_dst, mca_pml_bfo_match_hdr_t::hdr_ctx, mca_pml_bfo_match_hdr_t::hdr_seq, mca_pml_bfo_match_hdr_t::hdr_src, opal_output_verbose(), and mca_btl_base_segment_t::seg_addr.

Referenced by mca_pml_bfo_register_callbacks().

void mca_pml_bfo_recv_frag_callback_rndvrestartnotify ( mca_btl_base_module_t * btl,
mca_btl_base_tag_t  tag,
mca_btl_base_descriptor_t * des,
void *  cbdata 
)

Four new callbacks for the four new message types.

Callback for when a RNDVRESTARTNOTIFY message is received.

A RNDVRESTARTNOTIFY message is sent from the sender to the receiver telling the receiver that the message is going to be started over. The receiver first makes sure that the request being pointed to is still valid. If it is not, that means the receiver must have completed the request and therefore we need to send a NACK back to the sender. The receiver then makes sure this is not a duplicate message. If it is a duplicate, it will just drop it. Otherwise, it will then send a RNDVRESTARTACK message if there are no outstanding events on the receiver. Otherwise, it will just change the state of the request and wait for another event to send the RNDVRESTARTACK to the sender.

References mca_btl_base_descriptor_t::des_dst, mca_pml_bfo_match_hdr_t::hdr_ctx, mca_pml_bfo_match_hdr_t::hdr_seq, mca_pml_bfo_match_hdr_t::hdr_src, mca_pml_bfo_recv_request_rndvrestartack(), mca_pml_bfo_recv_request_rndvrestartnack(), ompi_proc_find(), opal_output_verbose(), and mca_btl_base_segment_t::seg_addr.

Referenced by mca_pml_bfo_register_callbacks().

void mca_pml_bfo_recv_request_recverrnotify ( mca_pml_bfo_recv_request_t * recvreq,
mca_btl_base_tag_t  tag,
int  status 
)

This function is called when an error is detected on a completion event on the receiving side.

This can come from an ACK, PUT, RDMA read (GET) or RECVERRNOTIFY completion event. When this happens, check the state of the request and decide if the sender needs to be notified that a problem was seen. If no RECVERRNOTIFY message has been sent and no RNDVRESTARTNOTIFY has been received from the sender, then send a message telling the sender an error was seen.

References mca_bml_base_btl_t::btl, mca_btl_base_descriptor_t::des_cbfunc, mca_btl_base_descriptor_t::des_src, mca_bml_base_btl_array_get_next(), opal_output(), opal_output_verbose(), ompi_proc_t::proc_bml, mca_pml_base_recv_request_t::req_base, mca_pml_base_request_t::req_comm, mca_pml_base_request_t::req_ompi, mca_pml_base_request_t::req_proc, ompi_request_t::req_status, and mca_btl_base_segment_t::seg_addr.

Referenced by mca_pml_bfo_check_recv_ctl_completion_status(), and mca_pml_bfo_error_pending_packets().

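void mca_pml_bfo_recv_request_reset ( mca_pml_bfo_recv_request_t * match)

Reset all the receive request fields to match what a request looks like when it is first started.
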
void mca_pml_bfo_recv_request_rndvrestartack ( mca_pml_bfo_recv_request_t * recvreq,
mca_btl_base_tag_t  tag,
int  status,
mca_btl_base_module_t * btl 
)

This function is called when it may be time to send a RNDVRESTARTACK message back to the sending side.

This can happen because we received a RNDVRESTARTNOTIFY message from the sender. This can also happen if we have noticed that the request has received the RNDVRESTARTNOTIFY message, but has not yet sent out the RNDVRESTARTACK because there were still some pending receive events on the request. That means we can enter this routine from a completion event on an ACK, PUT, or RDMA read as well as from the receipt of a RNDVRESTARTNOTIFY message. If all is good, we send the RNDVRESTARTACK message back to the sender. Then sometime later a message will arrive telling us to reset and restart the receive request.

References mca_bml_base_btl_t::btl, mca_btl_base_descriptor_t::des_cbdata, mca_btl_base_descriptor_t::des_cbfunc, mca_btl_base_descriptor_t::des_src, mca_bml_base_btl_array_get_next(), opal_output(), opal_output_verbose(), ompi_proc_t::proc_bml, mca_pml_base_recv_request_t::req_base, mca_pml_base_request_t::req_comm, mca_pml_base_request_t::req_ompi, mca_pml_base_request_t::req_proc, ompi_request_t::req_status, and mca_btl_base_segment_t::seg_addr.

Referenced by mca_pml_bfo_check_recv_ctl_completion_status(), mca_pml_bfo_error_pending_packets(), and mca_pml_bfo_recv_frag_callback_rndvrestartnotify().

void mca_pml_bfo_recv_request_rndvrestartnack ( mca_btl_base_descriptor_t * olddes,
ompi_proc_t * ompi_proc,
bool  repost 
)

Called after the receipt of a RNDVRESTARTNOTIFY message to a request that no longer matches.

This can happen if the sender detected an error, but the receiver actually received all the data. Therefore send a NACK back instead of the ACK so that the sender can complete its request. This happens very rarely. Note that we need to make use of the hdr_dst_rank that we received from the notify message. This is so the sending side can make sure the message matches a valid request on the sending side.

References mca_bml_base_btl_array_t::arr_size, mca_bml_base_endpoint_t::btl_eager, mca_btl_base_descriptor_t::des_cbdata, mca_btl_base_descriptor_t::des_cbfunc, mca_btl_base_descriptor_t::des_dst, mca_btl_base_descriptor_t::des_src, mca_bml_base_btl_array_get_next(), opal_output(), opal_output_verbose(), ompi_proc_t::proc_bml, ompi_proc_t::proc_name, mca_btl_base_segment_t::seg_addr, and orte_process_name_t::vpid.

Referenced by mca_pml_bfo_recv_frag_callback_rndvrestartnotify().

bool mca_pml_bfo_rndv_completion_status_error ( struct mca_btl_base_descriptor_t * des,
mca_pml_bfo_send_request_t * sendreq 
)

The completion event for the RNDV message has returned with an error.

We know that the send request we are looking at is valid because it cannot be completed until the sendreq->req_state value reaches 0. And for the sendreq->req_state to reach 0, the completion event on the RNDV message must occur. So, we do not bother checking whether the send request is valid, because we know it is, but we put a few asserts in for good measure. We then check a few fields in the request to decide what to do. If the sendreq->req_error is set, that means that something has happened already to the request and we do not want to restart it. Presumably, we may have received a RECVERRNOTIFY message from the receiver. We also check the sendreq->req_acked field to see if it has been acked. If it has, then again we do not restart everything because obviously the RNDV message has made it to the other side.

References mca_btl_base_descriptor_t::des_src, mca_pml_bfo_send_request_restart(), and mca_btl_base_segment_t::seg_addr.

void mca_pml_bfo_rndvrestartnotify_completion ( mca_btl_base_module_t * btl,
struct mca_btl_base_endpoint_t * ep,
struct mca_btl_base_descriptor_t * des,
int  status 
)

Completion callback for rndvrestartnotify completion event.

If the RNDVRESTARTACK has already been received, then reset and restart. Otherwise, just update the state and let the RNDVRESTARTACK trigger the reset and restart.

References mca_btl_base_descriptor_t::des_src, mca_pml_bfo_send_request_restart(), mca_pml_bfo_send_request_rndvrestartnotify(), opal_output_verbose(), mca_pml_base_send_request_t::req_base, mca_pml_base_request_t::req_comm, mca_pml_base_request_t::req_peer, mca_pml_base_request_t::req_sequence, and mca_btl_base_segment_t::seg_addr.

Referenced by mca_pml_bfo_send_request_rndvrestartnotify().

void mca_pml_bfo_send_request_rndvrestartnotify ( mca_pml_bfo_send_request_t * sendreq,
bool  repost,
mca_btl_base_tag_t  tag,
int  status,
mca_btl_base_module_t * btl 
)

This function gets called when failover is enabled and an error occurs during the rendezvous protocol.

A message is sent to the receiving side notifying the request that the communication is going to be starting over. However, none of the information in the send request is reset yet, so that any in flight fragments can still find a home. Information in the send request gets reset when the completion event for this send occurs AND an ACK has been received back from the receiver.

References mca_bml_base_btl_t::btl, mca_btl_base_descriptor_t::des_cbfunc, mca_btl_base_descriptor_t::des_src, mca_bml_base_btl_array_get_next(), mca_pml_bfo_rndvrestartnotify_completion(), opal_output(), opal_output_verbose(), ORTE_PROC_MY_NAME, ompi_proc_t::proc_bml, mca_pml_base_send_request_t::req_base, mca_pml_base_request_t::req_comm, mca_pml_base_request_t::req_peer, mca_pml_base_request_t::req_proc, mca_pml_base_request_t::req_sequence, and mca_btl_base_segment_t::seg_addr.

Referenced by mca_pml_bfo_completion_sendreq_has_error(), mca_pml_bfo_error_pending_packets(), mca_pml_bfo_recv_frag_callback_recverrnotify(), and mca_pml_bfo_rndvrestartnotify_completion().

void mca_pml_bfo_update_eager_bml_btl_recv_ctl ( mca_bml_base_btl_t **  bml_btl,
mca_btl_base_module_t * btl,
struct mca_btl_base_descriptor_t * des 
)

The following set of functions are all called when it is determined that the cached bml_btl->btl does not match the btl handed back by the callback function.

This means that the bml_btl array has been shuffled and the bml_btl matching the btl has to be found again. If it cannot be found, then just find a different one to use.

References mca_btl_base_descriptor_t::des_cbdata, mca_btl_base_descriptor_t::des_src, mca_pml_bfo_ack_hdr_t::hdr_dst_req, mca_pml_bfo_common_hdr_t::hdr_type, opal_output(), and mca_btl_base_segment_t::seg_addr.