OpenMPI
0.1.1
|
The Open RTE Error and Recovery Manager (ErrMgr) More...
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_object.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/util/output.h"
#include "opal/util/error.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/plm/plm_types.h"
Go to the source code of this file.
Data Structures | |
struct | orte_errmgr_predicted_proc_t |
struct | orte_errmgr_predicted_node_t |
struct | orte_errmgr_predicted_map_t |
struct | orte_errmgr_base_module_2_3_0_t |
struct | orte_errmgr_base_component_3_0_0_t |
Macros | |
#define | ORTE_ERROR_NAME(n) opal_strerror(n) |
#define | ORTE_ERROR_LOG(n) orte_errmgr.log(n, __FILE__, __LINE__); |
#define | ORTE_ERRMGR_BASE_VERSION_3_0_0 |
Typedefs | |
typedef struct orte_errmgr_predicted_proc_t | orte_errmgr_predicted_proc_t |
typedef struct orte_errmgr_predicted_node_t | orte_errmgr_predicted_node_t |
typedef void( | orte_errmgr_fault_callback_t )(opal_pointer_array_t *procs) |
typedef struct orte_errmgr_predicted_map_t | orte_errmgr_predicted_map_t |
typedef int(* | orte_errmgr_base_module_init_fn_t )(void) |
Module initialization function. More... | |
typedef int(* | orte_errmgr_base_module_finalize_fn_t )(void) |
Module finalization function. More... | |
typedef void(* | orte_errmgr_base_module_log_fn_t )(int error_code, char *filename, int line) |
This is not part of any module so it can be used at any time! | |
typedef void(* | orte_errmgr_base_module_abort_fn_t )(int error_code, char *fmt,...) __opal_attribute_format_funcptr__(__printf__ |
Alert - self aborting This function is called when a process is aborting due to some internal error. More... | |
typedef int(* | orte_errmgr_base_module_abort_peers_fn_t )(orte_process_name_t *procs, orte_std_cntr_t num_procs) |
Alert - abort peers This function is called when a process wants to abort one or more peer processes. More... | |
typedef int(* | orte_errmgr_base_module_update_state_fn_t )(orte_jobid_t job, orte_job_state_t jobstate, orte_process_name_t *proc_name, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code) |
Alert - process aborted This function is called by the PLM when a remote process aborts during execution. More... | |
typedef int(* | orte_errmgr_base_module_predicted_fault_fn_t )(opal_list_t *proc_list, opal_list_t *node_list, opal_list_t *suggested_map) |
Predicted process/node failure notification. More... | |
typedef int(* | orte_errmgr_base_module_suggest_map_targets_fn_t )(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list) |
Suggest a node to map a restarting process onto. More... | |
typedef int(* | orte_errmgr_base_module_ft_event_fn_t )(int state) |
Handle fault tolerance updates. More... | |
typedef void(* | orte_errmgr_base_module_register_migration_warning_fn_t )(struct timeval *tv) |
Function to perform actions that require the rest of the ORTE layer to be up and running. More... | |
typedef orte_errmgr_fault_callback_t *(* | orte_errmgr_base_module_set_fault_callback_t )(orte_errmgr_fault_callback_t *cbfunc) |
Set the callback function for faults. More... | |
typedef struct orte_errmgr_base_module_2_3_0_t | orte_errmgr_base_module_2_3_0_t |
typedef orte_errmgr_base_module_2_3_0_t | orte_errmgr_base_module_t |
typedef struct orte_errmgr_base_component_3_0_0_t | orte_errmgr_base_component_3_0_0_t |
typedef orte_errmgr_base_component_3_0_0_t | orte_errmgr_base_component_t |
Functions | |
OBJ_CLASS_DECLARATION (orte_errmgr_predicted_proc_t) | |
OBJ_CLASS_DECLARATION (orte_errmgr_predicted_node_t) | |
OBJ_CLASS_DECLARATION (orte_errmgr_predicted_map_t) | |
Variables | |
ORTE_DECLSPEC orte_errmgr_fault_callback_t * | fault_cbfunc |
ORTE_DECLSPEC orte_errmgr_base_module_t | orte_errmgr |
The Open RTE Error and Recovery Manager (ErrMgr)
This framework is the logically central clearing house for process/daemon state updates. In particular, when a process fails and another process detects it, that information is reported through this framework. The framework then (depending on the active component) decides how to handle the failure.
For example, if a process fails, this may activate an automatic recovery of the process from a previous checkpoint or from its initial state. Conversely, the active component could decide not to continue the job and request that it be terminated. The error and recovery policy is determined by individual components within this framework.
#define ORTE_ERRMGR_BASE_VERSION_3_0_0 |
typedef void(* orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt,...) __opal_attribute_format_funcptr__(__printf__ |
Alert - self aborting. This function is called when a process is aborting due to some internal error.
It will finalize the process itself, and then exit - it takes no other actions. The intent here is to provide a last-ditch exit procedure that attempts to clean up a little.
typedef int(* orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs, orte_std_cntr_t num_procs) |
Alert - abort peers. This function is called when a process wants to abort one or more peer processes.
For example, MPI_Abort(comm) will use this function to terminate peers in the communicator group before aborting itself.
typedef int(* orte_errmgr_base_module_finalize_fn_t)(void) |
Module finalization function.
ORTE_SUCCESS | The operation completed successfully |
ORTE_ERROR | An unspecified error occurred |
typedef int(* orte_errmgr_base_module_ft_event_fn_t)(int state) |
Handle fault tolerance updates.
[in] | state | Fault tolerance state update |
ORTE_SUCCESS | The operation completed successfully |
ORTE_ERROR | An unspecified error occurred |
typedef int(* orte_errmgr_base_module_init_fn_t)(void) |
Module initialization function.
ORTE_SUCCESS | The operation completed successfully |
ORTE_ERROR | An unspecified error occurred |
typedef int(* orte_errmgr_base_module_predicted_fault_fn_t)(opal_list_t *proc_list, opal_list_t *node_list, opal_list_t *suggested_map) |
Predicted process/node failure notification.
[in] | proc_list | List of processes (or NULL if none) |
[in] | node_list | List of nodes (or NULL if none) |
[in] | suggested_map | List of mapping suggestions to use on recovery (or NULL if none) |
ORTE_SUCCESS | The operation completed successfully |
ORTE_ERROR | An unspecified error occurred |
typedef void(* orte_errmgr_base_module_register_migration_warning_fn_t)(struct timeval *tv) |
Function to perform actions that require the rest of the ORTE layer to be up and running.
ORTE_SUCCESS | The operation completed successfully |
ORTE_ERROR | An unspecified error occurred |
typedef orte_errmgr_fault_callback_t*(* orte_errmgr_base_module_set_fault_callback_t)(orte_errmgr_fault_callback_t *cbfunc) |
Set the callback function for faults.
[in] | cbfunc | The callback function. |
Returns | The previous fault callback function. |
typedef int(* orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list) |
Suggest a node to map a restarting process onto.
[in] | proc | Process that is being mapped |
[in] | oldnode | Previous node where this process resided |
[in,out] | node_list | List of nodes to select from |
ORTE_SUCCESS | The operation completed successfully |
ORTE_ERROR | An unspecified error occurred |
typedef int(* orte_errmgr_base_module_update_state_fn_t)(orte_jobid_t job, orte_job_state_t jobstate, orte_process_name_t *proc_name, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code) |
Alert - process aborted. This function is called by the PLM when a remote process aborts during execution.
Actions taken in response to the abnormal termination of a remote application process will vary across the various errmgr components.
NOTE: Local process errors should always be reported through the error_detected interface and NOT here.
proc_name | Pointer to the name of the proc that aborted |
ORTE_SUCCESS | Whatever action was taken was successful |
ORTE_ERROR | Appropriate error code |