OpenMPI  0.1.1
errmgr_hnp.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
3  * Copyright (c) 2004-2011 The University of Tennessee and The University
4  * of Tennessee Research Foundation. All rights
5  * reserved.
6  *
7  * $COPYRIGHT$
8  *
9  * Additional copyrights may follow
10  *
11  * $HEADER$
12  */
13 
14 /**
15  * @file
16  * Component structure and function prototypes for the hnp errmgr component.
17  */
18 
19 #ifndef MCA_ERRMGR_hnp_EXPORT_H
20 #define MCA_ERRMGR_hnp_EXPORT_H
21 
22 #include "orte_config.h"
23 
24 #include "orte/mca/errmgr/errmgr.h"
25 
26 BEGIN_C_DECLS
27 
28 /*
29  * Local Component structures: the hnp component embeds the base errmgr component (super) followed by state/option flags
30  */
31 struct orte_errmgr_hnp_component_t {
32  orte_errmgr_base_component_t super; /**< Base Errmgr component */
33 
34  bool ignore_current_update; /* NOTE(review): meaning is not visible in this header -- confirm in the component source */
35  bool term_in_progress; /* NOTE(review): presumably set while termination is underway -- confirm */
36 
37 #if OPAL_ENABLE_FT_CR
38  /* State of the Recovery */
39  bool crmig_in_progress;
40  bool autor_in_progress;
41 
42  /* CRMig Options */
43  bool crmig_enabled;
44  bool crmig_timing_enabled;
45 
46  /* AutoR Options */
47  bool autor_enabled;
48  bool autor_timing_enabled;
49  int autor_recovery_delay; /* NOTE(review): unit is not stated in this header -- confirm */
50  bool autor_skip_oldnode;
51 #endif /* OPAL_ENABLE_FT_CR */
52 };
53 typedef struct orte_errmgr_hnp_component_t orte_errmgr_hnp_component_t;
54 ORTE_MODULE_DECLSPEC extern orte_errmgr_hnp_component_t mca_errmgr_hnp_component; /* extern declaration only; the object itself is defined outside this header */
55 /* Component query entry point. NOTE(review): module and priority look like out-parameters -- confirm against the definition. */
56 int orte_errmgr_hnp_component_query(mca_base_module_t **module, int *priority);
57 /* NOTE(review): judging by the name, applies state/pid/exit_code to proc within jdata -- verify in the implementation. */
58 void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
59  orte_process_name_t *proc,
60  orte_proc_state_t state,
61  pid_t pid,
62  orte_exit_code_t exit_code);
63 
64 /***************************
65  * Module functions: Global
66  ***************************/
67 int orte_errmgr_hnp_global_module_init(void); /* NOTE(review): int result is presumably an ORTE status code -- confirm */
68 int orte_errmgr_hnp_global_module_finalize(void);
69 /* Takes the same parameter types, in the same order, as the base, crmig and autor update_state prototypes later in this header. */
70 int orte_errmgr_hnp_global_update_state(orte_jobid_t job,
71  orte_job_state_t jobstate,
72  orte_process_name_t *proc_name,
73  orte_proc_state_t state,
74  pid_t pid,
75  orte_exit_code_t exit_code);
76 int orte_errmgr_hnp_global_predicted_fault(opal_list_t *proc_list,
77  opal_list_t *node_list,
78  opal_list_t *suggested_map); /* NOTE(review): which of the three lists are inputs vs. outputs is not visible here -- confirm */
79 int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc,
80  orte_node_t *oldnode,
81  opal_list_t *node_list);
82 int orte_errmgr_hnp_global_ft_event(int state);
83 int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs);
84 int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
85 int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc); /* note: the only name in this group without the "global" infix */
86 
87 /* hnp base versions (the orte_errmgr_hnp_base_global_* functions) */
88 int orte_errmgr_hnp_base_global_init(void);
89 int orte_errmgr_hnp_base_global_finalize(void);
90 int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
91  orte_job_state_t jobstate,
92  orte_process_name_t *proc,
93  orte_proc_state_t state,
94  pid_t pid,
95  orte_exit_code_t exit_code);
96 int orte_errmgr_hnp_base_global_ft_event(int state);
97 
98 #if OPAL_ENABLE_FT_CR
99 /* CRMig Versions (declared only when OPAL_ENABLE_FT_CR is non-zero) */
100 int orte_errmgr_hnp_crmig_global_module_init(void);
101 int orte_errmgr_hnp_crmig_global_module_finalize(void);
102 
103 int orte_errmgr_hnp_crmig_global_update_state(orte_jobid_t job,
104  orte_job_state_t jobstate,
105  orte_process_name_t *proc_name,
106  orte_proc_state_t state,
107  pid_t pid,
108  orte_exit_code_t exit_code);
109 int orte_errmgr_hnp_crmig_global_predicted_fault(opal_list_t *proc_list,
110  opal_list_t *node_list,
111  opal_list_t *suggested_map);
112 int orte_errmgr_hnp_crmig_global_suggest_map_targets(orte_proc_t *proc,
113  orte_node_t *oldnode,
114  opal_list_t *node_list);
115 int orte_errmgr_hnp_crmig_global_ft_event(int state);
116 
117 /* AutoR Versions (unlike CRMig, no predicted_fault variant is declared) */
118 int orte_errmgr_hnp_autor_global_module_init(void);
119 int orte_errmgr_hnp_autor_global_module_finalize(void);
120 
121 int orte_errmgr_hnp_autor_global_update_state(orte_jobid_t job,
122  orte_job_state_t jobstate,
123  orte_process_name_t *proc_name,
124  orte_proc_state_t state,
125  pid_t pid,
126  orte_exit_code_t exit_code);
127 int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
128  orte_node_t *oldnode,
129  opal_list_t *node_list);
130 int orte_errmgr_hnp_autor_global_ft_event(int state);
131 #endif /* OPAL_ENABLE_FT_CR */
132 
133 END_C_DECLS
134 
135 #endif /* MCA_ERRMGR_hnp_EXPORT_H */
Common type for all MCA modules.
Definition: mca.h:100
dynamic pointer array
Definition: opal_pointer_array.h:45
Definition: errmgr.h:320
Definition: errmgr_hnp.h:31
uint32_t orte_jobid_t
Set the allowed range for ids in each space.
Definition: types.h:76
Definition: types.h:146
Definition: orte_globals.h:386
The Open RTE Error and Recovery Manager (ErrMgr)
Definition: orte_globals.h:316
Definition: opal_list.h:147
bool ignore_current_update
Base Errmgr component.
Definition: errmgr_hnp.h:34
Structure for holding a buffer to be used with the RML or OOB subsystems.
Definition: dss_types.h:159
Definition: orte_globals.h:254