OpenMPI  0.1.1
errmgr.h File Reference

The Open RTE Error and Recovery Manager (ErrMgr) More...

#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_object.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/util/output.h"
#include "opal/util/error.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/plm/plm_types.h"

Go to the source code of this file.

Data Structures

struct  orte_errmgr_predicted_proc_t
 
struct  orte_errmgr_predicted_node_t
 
struct  orte_errmgr_predicted_map_t
 
struct  orte_errmgr_base_module_2_3_0_t
 
struct  orte_errmgr_base_component_3_0_0_t
 

Macros

#define ORTE_ERROR_NAME(n)   opal_strerror(n)
 
#define ORTE_ERROR_LOG(n)   orte_errmgr.log(n, __FILE__, __LINE__);
 
#define ORTE_ERRMGR_BASE_VERSION_3_0_0
 

Typedefs

typedef struct
orte_errmgr_predicted_proc_t 
orte_errmgr_predicted_proc_t
 
typedef struct
orte_errmgr_predicted_node_t 
orte_errmgr_predicted_node_t
 
typedef void( orte_errmgr_fault_callback_t )(opal_pointer_array_t *procs)
 
typedef struct
orte_errmgr_predicted_map_t 
orte_errmgr_predicted_map_t
 
typedef int(* orte_errmgr_base_module_init_fn_t )(void)
 Module initialization function. More...
 
typedef int(* orte_errmgr_base_module_finalize_fn_t )(void)
 Module finalization function. More...
 
typedef void(* orte_errmgr_base_module_log_fn_t )(int error_code, char *filename, int line)
 This is not part of any module so it can be used at any time!
 
typedef void(* orte_errmgr_base_module_abort_fn_t )(int error_code, char *fmt,...) __opal_attribute_format_funcptr__(__printf__, 2, 3)
 Alert - self aborting. This function is called when a process is aborting due to some internal error. More...
 
typedef int(* orte_errmgr_base_module_abort_peers_fn_t )(orte_process_name_t *procs, orte_std_cntr_t num_procs)
 Alert - abort peers. This function is called when a process wants to abort one or more peer processes. More...
 
typedef int(* orte_errmgr_base_module_update_state_fn_t )(orte_jobid_t job, orte_job_state_t jobstate, orte_process_name_t *proc_name, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code)
 Alert - process aborted. This function is called by the PLM when a remote process aborts during execution. More...
 
typedef int(* orte_errmgr_base_module_predicted_fault_fn_t )(opal_list_t *proc_list, opal_list_t *node_list, opal_list_t *suggested_map)
 Predicted process/node failure notification. More...
 
typedef int(* orte_errmgr_base_module_suggest_map_targets_fn_t )(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list)
 Suggest a node to map a restarting process onto. More...
 
typedef int(* orte_errmgr_base_module_ft_event_fn_t )(int state)
 Handle fault tolerance updates. More...
 
typedef void(* orte_errmgr_base_module_register_migration_warning_fn_t )(struct timeval *tv)
 Function to perform actions that require the rest of the ORTE layer to be up and running. More...
 
typedef
orte_errmgr_fault_callback_t *(* 
orte_errmgr_base_module_set_fault_callback_t )(orte_errmgr_fault_callback_t *cbfunc)
 Set the callback function for faults. More...
 
typedef struct
orte_errmgr_base_module_2_3_0_t 
orte_errmgr_base_module_2_3_0_t
 
typedef
orte_errmgr_base_module_2_3_0_t 
orte_errmgr_base_module_t
 
typedef struct
orte_errmgr_base_component_3_0_0_t 
orte_errmgr_base_component_3_0_0_t
 
typedef
orte_errmgr_base_component_3_0_0_t 
orte_errmgr_base_component_t
 

Functions

 OBJ_CLASS_DECLARATION (orte_errmgr_predicted_proc_t)
 
 OBJ_CLASS_DECLARATION (orte_errmgr_predicted_node_t)
 
 OBJ_CLASS_DECLARATION (orte_errmgr_predicted_map_t)
 

Variables

ORTE_DECLSPEC
orte_errmgr_fault_callback_t * 
fault_cbfunc
 
ORTE_DECLSPEC
orte_errmgr_base_module_t 
orte_errmgr
 

Detailed Description

The Open RTE Error and Recovery Manager (ErrMgr)

This framework is the logically central clearing house for process/daemon state updates. In particular, when a process fails and another process detects it, that information is reported through this framework. This framework then (depending on the active component) decides how to handle the failure.

For example, if a process fails, this may activate an automatic recovery of the process from a previous checkpoint or from its initial state. Conversely, the active component could decide not to continue the job, and request that it be terminated. The error and recovery policy is determined by individual components within this framework.

Macro Definition Documentation

#define ORTE_ERRMGR_BASE_VERSION_3_0_0
Value:
MCA_BASE_VERSION_2_0_0, \
"errmgr", 3, 0, 0

Typedef Documentation

typedef void(* orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt,...) __opal_attribute_format_funcptr__(__printf__, 2, 3)

Alert - self aborting. This function is called when a process is aborting due to some internal error.

It will finalize the process itself, and then exit - it takes no other actions. The intent here is to provide a last-ditch exit procedure that attempts to clean up a little.

typedef int(* orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs, orte_std_cntr_t num_procs)

Alert - abort peers. This function is called when a process wants to abort one or more peer processes.

For example, MPI_Abort(comm) will use this function to terminate peers in the communicator group before aborting itself.

typedef int(* orte_errmgr_base_module_finalize_fn_t)(void)

Module finalization function.

Return values
ORTE_SUCCESS  The operation completed successfully
ORTE_ERROR  An unspecified error occurred
typedef int(* orte_errmgr_base_module_ft_event_fn_t)(int state)

Handle fault tolerance updates.

Parameters
[in] state  Fault tolerance state update
Return values
ORTE_SUCCESS  The operation completed successfully
ORTE_ERROR  An unspecified error occurred
typedef int(* orte_errmgr_base_module_init_fn_t)(void)

Module initialization function.

Return values
ORTE_SUCCESS  The operation completed successfully
ORTE_ERROR  An unspecified error occurred
typedef int(* orte_errmgr_base_module_predicted_fault_fn_t)(opal_list_t *proc_list, opal_list_t *node_list, opal_list_t *suggested_map)

Predicted process/node failure notification.

Parameters
[in] proc_list  List of processes (or NULL if none)
[in] node_list  List of nodes (or NULL if none)
[in] suggested_map  List of mapping suggestions to use on recovery (or NULL if none)
Return values
ORTE_SUCCESS  The operation completed successfully
ORTE_ERROR  An unspecified error occurred
typedef void(* orte_errmgr_base_module_register_migration_warning_fn_t)(struct timeval *tv)

Function to perform actions that require the rest of the ORTE layer to be up and running.

Return values
ORTE_SUCCESS  The operation completed successfully
ORTE_ERROR  An unspecified error occurred
typedef orte_errmgr_fault_callback_t*(* orte_errmgr_base_module_set_fault_callback_t)(orte_errmgr_fault_callback_t *cbfunc)

Set the callback function for faults.

Parameters
[in] cbfunc  The callback function.
Return values
The previous fault callback function.
typedef int(* orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list)

Suggest a node to map a restarting process onto.

Parameters
[in] proc  Process that is being mapped
[in] oldnode  Previous node where this process resided
[in,out] node_list  List of nodes to select from
Return values
ORTE_SUCCESS  The operation completed successfully
ORTE_ERROR  An unspecified error occurred
typedef int(* orte_errmgr_base_module_update_state_fn_t)(orte_jobid_t job, orte_job_state_t jobstate, orte_process_name_t *proc_name, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code)

Alert - process aborted. This function is called by the PLM when a remote process aborts during execution.

Actions taken in response to the abnormal termination of a remote application process will vary across the various errmgr components.

NOTE: Local process errors should always be reported through the error_detected interface and NOT here.

Parameters
proc_name  Pointer to the name of the proc that aborted
Return values
ORTE_SUCCESS  Whatever action was taken was successful
ORTE_ERROR  Appropriate error code