OpenMPI  0.1.1
errmgr.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2011 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
13  * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
14  * $COPYRIGHT$
15  *
16  * Additional copyrights may follow
17  *
18  * $HEADER$
19  */
20 /** @file:
21  *
22  * The Open RTE Error and Recovery Manager (ErrMgr)
23  *
24  * This framework is the logically central clearing house for process/daemon
25  * state updates. In particular when a process fails and another process detects
26  * it, then that information is reported through this framework. This framework
27  * then (depending on the active component) decides how to handle the failure.
28  *
29  * For example, if a process fails this may activate an automatic recovery
30  * of the process from a previous checkpoint, or initial state. Conversely,
31  * the active component could decide not to continue the job, and request that
32  * it be terminated. The error and recovery policy is determined by individual
33  * components within this framework.
34  *
35  */
36 
37 #ifndef ORTE_MCA_ERRMGR_H
38 #define ORTE_MCA_ERRMGR_H
39 
40 /*
41  * includes
42  */
43 
44 #include "orte_config.h"
45 #include "orte/constants.h"
46 #include "orte/types.h"
47 
48 #include "opal/mca/mca.h"
49 #include "opal/mca/base/base.h"
50 
51 #include "opal/class/opal_object.h"
53 #include "opal/util/output.h"
54 #include "opal/util/error.h"
55 
57 #include "orte/mca/plm/plm_types.h"
58 
59 BEGIN_C_DECLS
60 
61 /*
62  * Structure to describe a predicted process fault.
63  *
64  * This can be expanded in the future to support assurance levels, and
65  * additional information that may wish to be conveyed.
66  */
68  /** This is an object, so must have a super */
70 
71  /** Process Name */
73 };
76 
77 /*
78  * Structure to describe a predicted node fault.
79  *
80  * This can be expanded in the future to support assurance levels, and
81  * additional information that may wish to be conveyed.
82  */
84  /** This is an object, so must have a super */
86 
87  /** Node Name */
88  char * node_name;
89 };
92 
93 /*
94  * Callback function that should be called when there is a fault.
95  *
96  * This callback function will be used anytime (other than during finalize) the
97  * runtime detects and handles a process failure. The runtime will complete all
98  * its stabilization before alerting the callback function. The parameter to the
99  * callback function is a pointer array of the failed processes (presumably their orte_process_name_t).
100  * It will not alert the application to failures that are not in the same job as
101  * the alerted process, only failures within the same jobid.
102  *
103  * @param[in] procs The names of the processes that failed
104  */
105 typedef void (orte_errmgr_fault_callback_t)(opal_pointer_array_t *procs);
106 
107 ORTE_DECLSPEC extern orte_errmgr_fault_callback_t *fault_cbfunc;
108 
109 /*
110  * Structure to describe a suggested remapping element for a predicted fault.
111  *
112  * This can be expanded in the future to support weights, and
113  * additional information that may wish to be conveyed.
114  */
116  /** This is an object, so must have a super */
118 
119  /** Process Name (predicted to fail) */
121 
122  /** Node Name (predicted to fail) */
123  char * node_name;
124 
125  /** Process Name (Map to) */
127 
128  /** Node Name (Map to) */
130 
131  /** Just off current node */
133 
134  /** Pre-map fixed node assignment */
136 };
139 
140 
141 /*
142  * Macro definitions
143  */
144 /*
145  * These macros and associated error name array are used to output intelligible error
146  * messages. NOTE(review): ORTE_ERROR_LOG's expansion already ends in ';' - brace it when it is the body of an if that has an else.
147  */
148 
149 #define ORTE_ERROR_NAME(n) opal_strerror(n)
150 #define ORTE_ERROR_LOG(n) \
151  orte_errmgr.log(n, __FILE__, __LINE__);
152 
153 #if WANT_PMI_SUPPORT /* PMI error reporting: prints process name, file, line, function, pmi_func and the PMI error string */
154 #define ORTE_PMI_ERROR(pmi_err, pmi_func) \
155  do { \
156  opal_output(0, "%s[%s:%d:%s] %s: %s\n", \
157  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
158  __FILE__, __LINE__, __func__, \
159  pmi_func, orte_errmgr_base_pmi_error(pmi_err)); \
160  } while(0); /* NOTE(review): this ';' is part of the expansion, unlike the usual do/while(0) idiom */
161 OPAL_DECLSPEC char* orte_errmgr_base_pmi_error(int pmi_err);
162 #endif /* WANT_PMI_SUPPORT */
163 
164 /*
165  * Framework Interfaces
166  */
167 /**
168  * Module initialization function.
169  *
170  * @retval ORTE_SUCCESS The operation completed successfully
171  * @retval ORTE_ERROR An unspecified error occurred
172  */
174 
175 /**
176  * Module finalization function.
177  *
178  * @retval ORTE_SUCCESS The operation completed successfully
179  * @retval ORTE_ERROR An unspecified error occurred
180  */
182 
183 /**
184  * Error logging entry point: takes the error code plus the file name and line reporting it. This is not part of any module so it can be used at any time!
185  */
186 typedef void (*orte_errmgr_base_module_log_fn_t)(int error_code, char *filename, int line);
187 
188 /**
189  * Alert - self aborting
190  * This function is called when a process is aborting due to some internal error.
191  * It will finalize the process itself, and then exit - it takes no other actions.
192  * The intent here is to provide a last-ditch exit procedure that attempts to clean
193  * up a little. fmt and the trailing arguments form a printf-style message.
194  */
195 typedef void (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
196 __opal_attribute_format_funcptr__(__printf__, 2, 3);
197 
198 /**
199  * Alert - abort peers
200  * This function is called when a process wants to abort one or more peer processes.
201  * For example, MPI_Abort(comm) will use this function to terminate peers in the
202  * communicator group before aborting itself.
203  */
205  orte_std_cntr_t num_procs);
206 
207 /**
208  * Alert - process aborted
209  * This function is called by the PLM when a remote process aborts during execution. Actions taken
210  * in response to the abnormal termination of a remote application process will vary across
211  * the various errmgr components.
212  *
213  * NOTE: Local process errors should always be reported through the error_detected interface and
214  * NOT here.
215  *
216  * @param proc_name Pointer to the name of the proc that aborted
217  *
218  * @retval ORTE_SUCCESS Whatever action that was taken was successful
219  * @retval ORTE_ERROR Appropriate error code
220  */
222  orte_job_state_t jobstate,
224  orte_proc_state_t state,
225  pid_t pid,
226  orte_exit_code_t exit_code);
227 
228 /**
229  * Predicted process/node failure notification
230  *
231  * @param[in] proc_list List of processes (or NULL if none)
232  * @param[in] node_list List of nodes (or NULL if none)
233  * @param[in] suggested_map List of mapping suggestions to use on recovery (or NULL if none)
234  *
235  * @retval ORTE_SUCCESS The operation completed successfully
236  * @retval ORTE_ERROR An unspecified error occurred
237  */
239  opal_list_t *node_list,
240  opal_list_t *suggested_map);
241 
242 /**
243  * Suggest a node to map a restarting process onto
244  *
245  * @param[in] proc Process that is being mapped
246  * @param[in] oldnode Previous node where this process resided
247  * @param[in,out] node_list List of nodes to select from
248  *
249  * @retval ORTE_SUCCESS The operation completed successfully
250  * @retval ORTE_ERROR An unspecified error occurred
251  */
253  orte_node_t *oldnode,
254  opal_list_t *node_list);
255 
256 /**
257  * Handle fault tolerance updates
258  *
259  * @param[in] state Fault tolerance state update
260  *
261  * @retval ORTE_SUCCESS The operation completed successfully
262  * @retval ORTE_ERROR An unspecified error occurred
263  */
264 typedef int (*orte_errmgr_base_module_ft_event_fn_t)(int state);
265 
266 /**
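267  * Function to perform actions that require the rest of the ORTE layer to be up
268  * and running. NOTE(review): the typedef this comment is attached to (register_migration_warning, taking a struct timeval *) returns void, so this text looks stale.
269  *
270  * @retval ORTE_SUCCESS The operation completed successfully
271  * @retval ORTE_ERROR An unspecified error occurred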
272  */
274 
275 /**
276  * Set the callback function for faults.
277  *
278  * @param[in] cbfunc The callback function.
279  *
280  * @return The previous fault callback function.
281  */
282 typedef orte_errmgr_fault_callback_t *(*orte_errmgr_base_module_set_fault_callback_t)(orte_errmgr_fault_callback_t *cbfunc);
283 
284 /*
285  * Module Structure
286  */
288  /** Initialization Function */
290  /** Finalization Function */
292 
296 
297  /** Actual process failure notification */
299  /** Predicted process/node failure notification */
301  /** Suggest a node to map a restarting process onto */
303 
304  /** Handle any FT Notifications */
306 
307  /* Register to be warned of impending migration */
309 
310  /* Set the callback function */
312 };
315 ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr; /* global module instance; ORTE_ERROR_LOG calls its log member */
316 
317 /*
318  * ErrMgr Component
319  */
321  /** MCA base component */
323  /** MCA base data */
325 
326  /** Verbosity Level */
327  int verbose;
328  /** Output Handle for opal_output */
330  /** Default Priority */
331  int priority;
332 };
335 
336 /*
337  * Macro for use in components that are of type errmgr; expands to MCA_BASE_VERSION_2_0_0 followed by the framework name "errmgr" and version 3, 0, 0
338  */
339 #define ORTE_ERRMGR_BASE_VERSION_3_0_0 \
340  MCA_BASE_VERSION_2_0_0, \
341  "errmgr", 3, 0, 0
342 
343 END_C_DECLS
344 
345 #endif
char * node_name
Node Name.
Definition: errmgr.h:88
Definition: errmgr.h:67
Common type for all MCA components.
Definition: mca.h:250
OPAL output stream facility.
orte_process_name_t proc_name
Process Name.
Definition: errmgr.h:72
dynamic pointer array
Definition: opal_pointer_array.h:45
orte_errmgr_fault_callback_t *(* orte_errmgr_base_module_set_fault_callback_t)(orte_errmgr_fault_callback_t *cbfunc)
Set the callback function for faults.
Definition: errmgr.h:282
Definition: errmgr.h:320
int(* orte_errmgr_base_module_finalize_fn_t)(void)
Module finalization function.
Definition: errmgr.h:181
uint32_t orte_jobid_t
Set the allowed range for ids in each space.
Definition: types.h:76
Definition: types.h:146
int(* orte_errmgr_base_module_init_fn_t)(void)
Module initialization function.
Definition: errmgr.h:173
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets
Suggest a node to map a restarting process onto.
Definition: errmgr.h:302
mca_base_component_data_t base_data
MCA base data.
Definition: errmgr.h:324
int32_t orte_std_cntr_t
Supported datatypes for messaging and storage operations.
Definition: types.h:34
int(* orte_errmgr_base_module_predicted_fault_fn_t)(opal_list_t *proc_list, opal_list_t *node_list, opal_list_t *suggested_map)
Predicted process/node failure notification.
Definition: errmgr.h:238
opal_list_item_t super
This is an object, so must have a super.
Definition: errmgr.h:69
See opal_bitmap.h for an explanation of why there is a split between OPAL and ORTE for this generic c...
orte_errmgr_base_module_update_state_fn_t update_state
Actual process failure notification.
Definition: errmgr.h:298
int verbose
Verbosity Level.
Definition: errmgr.h:327
Definition: errmgr.h:287
bool off_current_node
Just off current node.
Definition: errmgr.h:132
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault
Predicted process/node failure notification.
Definition: errmgr.h:300
Top-level interface for all MCA components.
Definition: opal_list.h:98
void(* orte_errmgr_base_module_register_migration_warning_fn_t)(struct timeval *tv)
Function to perform actions that require the rest of the ORTE layer to be up and running.
Definition: errmgr.h:273
orte_process_name_t map_proc_name
Process Name (Map to)
Definition: errmgr.h:126
orte_errmgr_base_module_init_fn_t init
Initialization Function.
Definition: errmgr.h:289
Definition: orte_globals.h:386
int(* orte_errmgr_base_module_update_state_fn_t)(orte_jobid_t job, orte_job_state_t jobstate, orte_process_name_t *proc_name, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code)
Alert - process aborted This function is called by the PLM when a remote process aborts during execut...
Definition: errmgr.h:221
void(* orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt,...) __opal_attribute_format_funcptr__(__printf__
Alert - self aborting This function is called when a process is aborting due to some internal error...
Definition: errmgr.h:195
int(* orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list)
Suggest a node to map a restarting process onto.
Definition: errmgr.h:252
Definition: errmgr.h:115
mca_base_component_t base_version
MCA base component.
Definition: errmgr.h:322
char * map_node_name
Node Name (Map to)
Definition: errmgr.h:129
orte_errmgr_base_module_ft_event_fn_t ft_event
Handle any FT Notifications.
Definition: errmgr.h:305
opal_list_item_t super
This is an object, so must have a super.
Definition: errmgr.h:117
void(* orte_errmgr_base_module_log_fn_t)(int error_code, char *filename, int line)
This is not part of any module so it can be used at any time!
Definition: errmgr.h:186
opal_list_item_t super
This is an object, so must have a super.
Definition: errmgr.h:85
char * node_name
Node Name (predicted to fail)
Definition: errmgr.h:123
char * pre_map_fixed_node
Pre-map fixed node assignment.
Definition: errmgr.h:135
Meta data for MCA v2.0.0 components.
Definition: mca.h:309
int priority
Default Priority.
Definition: errmgr.h:331
Definition: opal_list.h:147
int output_handle
Output Handle for opal_output.
Definition: errmgr.h:329
Global params for OpenRTE.
Definition: errmgr.h:83
int(* orte_errmgr_base_module_ft_event_fn_t)(int state)
Handle fault tolerance updates.
Definition: errmgr.h:264
orte_errmgr_base_module_finalize_fn_t finalize
Finalization Function.
Definition: errmgr.h:291
A simple C-language object-oriented system with single inheritance and ownership-based memory managem...
void(*) typedef int(* orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs, orte_std_cntr_t num_procs)
Alert - abort peers This function is called when a process wants to abort one or more peer processes...
Definition: errmgr.h:204
Definition: orte_globals.h:254
orte_process_name_t proc_name
Process Name (predicted to fail)
Definition: errmgr.h:120
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236