37 #ifndef ORTE_MCA_ERRMGR_H
38 #define ORTE_MCA_ERRMGR_H
44 #include "orte_config.h"
45 #include "orte/constants.h"
49 #include "opal/mca/base/base.h"
54 #include "opal/util/error.h"
57 #include "orte/mca/plm/plm_types.h"
107 ORTE_DECLSPEC
extern orte_errmgr_fault_callback_t *fault_cbfunc;
149 #define ORTE_ERROR_NAME(n) opal_strerror(n)
150 #define ORTE_ERROR_LOG(n) \
151 orte_errmgr.log(n, __FILE__, __LINE__);
154 #define ORTE_PMI_ERROR(pmi_err, pmi_func) \
156 opal_output(0, "%s[%s:%d:%s] %s: %s\n", \
157 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
158 __FILE__, __LINE__, __func__, \
159 pmi_func, orte_errmgr_base_pmi_error(pmi_err)); \
161 OPAL_DECLSPEC
char* orte_errmgr_base_pmi_error(
int pmi_err);
196 __opal_attribute_format_funcptr__(__printf__, 2, 3);
222 orte_job_state_t jobstate,
224 orte_proc_state_t state,
226 orte_exit_code_t exit_code);
282 typedef orte_errmgr_fault_callback_t *(*orte_errmgr_base_module_set_fault_callback_t)(orte_errmgr_fault_callback_t *cbfunc);
339 #define ORTE_ERRMGR_BASE_VERSION_3_0_0 \
340 MCA_BASE_VERSION_2_0_0, \
char * node_name
Node Name.
Definition: errmgr.h:88
Common type for all MCA components.
Definition: mca.h:250
OPAL output stream facility.
orte_process_name_t proc_name
Process Name.
Definition: errmgr.h:72
dynamic pointer array
Definition: opal_pointer_array.h:45
orte_errmgr_fault_callback_t *(* orte_errmgr_base_module_set_fault_callback_t)(orte_errmgr_fault_callback_t *cbfunc)
Set the callback function for faults.
Definition: errmgr.h:282
int(* orte_errmgr_base_module_finalize_fn_t)(void)
Module finalization function.
Definition: errmgr.h:181
uint32_t orte_jobid_t
Set the allowed range for ids in each space.
Definition: types.h:76
int(* orte_errmgr_base_module_init_fn_t)(void)
Module initialization function.
Definition: errmgr.h:173
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets
Suggest a node to map a restarting process onto.
Definition: errmgr.h:302
mca_base_component_data_t base_data
MCA base data.
Definition: errmgr.h:324
int32_t orte_std_cntr_t
Supported datatypes for messaging and storage operations.
Definition: types.h:34
int(* orte_errmgr_base_module_predicted_fault_fn_t)(opal_list_t *proc_list, opal_list_t *node_list, opal_list_t *suggested_map)
Predicted process/node failure notification.
Definition: errmgr.h:238
opal_list_item_t super
This is an object, so must have a super.
Definition: errmgr.h:69
See opal_bitmap.h for an explanation of why there is a split between OPAL and ORTE for this generic c...
orte_errmgr_base_module_update_state_fn_t update_state
Actual process failure notification.
Definition: errmgr.h:298
int verbose
Verbosity Level.
Definition: errmgr.h:327
bool off_current_node
Just off current node.
Definition: errmgr.h:132
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault
Predicted process/node failure notification.
Definition: errmgr.h:300
Top-level interface for all MCA components.
Definition: opal_list.h:98
void(* orte_errmgr_base_module_register_migration_warning_fn_t)(struct timeval *tv)
Function to perform actions that require the rest of the ORTE layer to be up and running.
Definition: errmgr.h:273
orte_process_name_t map_proc_name
Process Name (Map to)
Definition: errmgr.h:126
orte_errmgr_base_module_init_fn_t init
Initialization Function.
Definition: errmgr.h:289
Definition: orte_globals.h:386
int(* orte_errmgr_base_module_update_state_fn_t)(orte_jobid_t job, orte_job_state_t jobstate, orte_process_name_t *proc_name, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code)
Alert - process aborted. This function is called by the PLM when a remote process aborts during execution...
Definition: errmgr.h:221
void(* orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt,...) __opal_attribute_format_funcptr__(__printf__, 2, 3)
Alert - self aborting. This function is called when a process is aborting due to some internal error...
Definition: errmgr.h:195
int(* orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list)
Suggest a node to map a restarting process onto.
Definition: errmgr.h:252
mca_base_component_t base_version
MCA base component.
Definition: errmgr.h:322
char * map_node_name
Node Name (Map to)
Definition: errmgr.h:129
orte_errmgr_base_module_ft_event_fn_t ft_event
Handle any FT Notifications.
Definition: errmgr.h:305
opal_list_item_t super
This is an object, so must have a super.
Definition: errmgr.h:117
void(* orte_errmgr_base_module_log_fn_t)(int error_code, char *filename, int line)
This is not part of any module so it can be used at any time!
Definition: errmgr.h:186
opal_list_item_t super
This is an object, so must have a super.
Definition: errmgr.h:85
char * node_name
Node Name (predicted to fail)
Definition: errmgr.h:123
char * pre_map_fixed_node
Pre-map fixed node assignment.
Definition: errmgr.h:135
Meta data for MCA v2.0.0 components.
Definition: mca.h:309
int priority
Default Priority.
Definition: errmgr.h:331
Definition: opal_list.h:147
int output_handle
Output Handle for opal_output.
Definition: errmgr.h:329
Global params for OpenRTE.
int(* orte_errmgr_base_module_ft_event_fn_t)(int state)
Handle fault tolerance updates.
Definition: errmgr.h:264
orte_errmgr_base_module_finalize_fn_t finalize
Finalization Function.
Definition: errmgr.h:291
A simple C-language object-oriented system with single inheritance and ownership-based memory management...
int(* orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs, orte_std_cntr_t num_procs)
Alert - abort peers. This function is called when a process wants to abort one or more peer processes...
Definition: errmgr.h:204
Definition: orte_globals.h:254
orte_process_name_t proc_name
Process Name (predicted to fail)
Definition: errmgr.h:120
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236