26 #include "opal_config.h"
28 #include "opal/mca/event/event.h"
/*
 * Single-character codes: the values 0, 1 and 2, each cast to char.
 * NOTE(review): presumably exchanged as one-byte messages while a
 * checkpoint request is negotiated -- confirm against opal_cr.c.
 */
41 #define OPAL_CR_DONE ((char) 0)
42 #define OPAL_CR_ACK ((char) 1)
43 #define OPAL_CR_CHECKPOINT ((char) 2)
/*
 * Fixed name strings.  Each literal is wrapped in parentheses, so it cannot
 * take part in adjacent string-literal concatenation at the point of use.
 * NOTE(review): the names suggest the read/write named pipes and the base
 * name of a restart environment file -- confirm with the users of these
 * macros.
 */
44 #define OPAL_CR_NAMED_PROG_R ("opal_cr_prog_read")
45 #define OPAL_CR_NAMED_PROG_W ("opal_cr_prog_write")
46 #define OPAL_CR_BASE_ENV_NAME ("opal_cr_restart-env")
/*
 * Checkpoint command/status codes.  The enumerators have no explicit
 * values, so they are numbered consecutively from 0 in declaration order.
 * Two name families share this one type: OPAL_CHECKPOINT_CMD_* and
 * OPAL_CR_STATUS_*.  A value of this type is the sole argument of
 * opal_cr_notify_callback_fn_t.
 */
51 enum opal_cr_ckpt_cmd_state_t {
/* OPAL_CHECKPOINT_CMD_* family. */
52 OPAL_CHECKPOINT_CMD_START,
53 OPAL_CHECKPOINT_CMD_IN_PROGRESS,
54 OPAL_CHECKPOINT_CMD_NULL,
55 OPAL_CHECKPOINT_CMD_ERROR,
/* OPAL_CR_STATUS_* family. */
58 OPAL_CR_STATUS_REQUESTED,
59 OPAL_CR_STATUS_RUNNING,
62 OPAL_CR_STATUS_CONTINUE,
64 OPAL_CR_STATUS_RESTART_PRE,
65 OPAL_CR_STATUS_RESTART_POST
/* Lets callers write the type name without the `enum` keyword. */
67 typedef enum opal_cr_ckpt_cmd_state_t opal_cr_ckpt_cmd_state_t;
/*
 * Globals exported by the checkpoint/restart (CR) core.  They are only
 * declared here (extern); the definitions live outside this header.
 */
/* NOTE(review): presumably an output-stream handle for CR diagnostics -- confirm. */
71 OPAL_DECLSPEC
extern int opal_cr_output;
/* NOTE(review): presumably the directory that holds the CR named pipes -- confirm. */
75 OPAL_DECLSPEC
extern char * opal_cr_pipe_dir;
/* NOTE(review): presumably the signal number used to enter the checkpoint path -- confirm. */
79 OPAL_DECLSPEC
extern int opal_cr_entry_point_signal;
/*
 * Tested by OPAL_CR_TEST_CHECKPOINT_READY() and its _STALL variant before
 * they call opal_cr_test_if_checkpoint_ready().
 */
82 OPAL_DECLSPEC
extern bool opal_cr_is_enabled;
/* NOTE(review): presumably true when the process is a tool rather than an application -- confirm. */
86 OPAL_DECLSPEC
extern bool opal_cr_is_tool;
/* NOTE(review): the valid values are not visible here; possibly opal_cr_ckpt_cmd_state_t codes -- confirm. */
89 OPAL_DECLSPEC
extern int opal_cr_checkpoint_request;
/* NOTE(review): the valid values are not visible here; possibly opal_cr_ckpt_cmd_state_t codes -- confirm. */
92 OPAL_DECLSPEC
extern int opal_cr_checkpointing_state;
94 #if OPAL_ENABLE_CRDEBUG == 1
/*
 * Debugger-support declarations.  They follow the
 * `#if OPAL_ENABLE_CRDEBUG == 1` guard; the matching #endif is not visible
 * in this listing, so where the guarded region ends is unconfirmed.
 * NOTE(review): the MPIR_ prefix suggests these symbols are looked up by
 * name by an external debugger -- do not rename without checking.
 */
96 OPAL_DECLSPEC
extern int MPIR_debug_with_checkpoint;
/* Takes no arguments and returns an int; NOTE(review): the return convention is not visible here. */
101 OPAL_DECLSPEC
int opal_cr_debug_set_current_ckpt_thread_self(
void);
/* Takes no arguments and returns an int; NOTE(review): the return convention is not visible here. */
102 OPAL_DECLSPEC
int opal_cr_debug_clear_current_ckpt_thread(
void);
/* Takes no arguments and returns an int; NOTE(review): the return convention is not visible here. */
108 OPAL_DECLSPEC
int MPIR_checkpoint_debugger_detach(
void);
/* Returns void *; NOTE(review): the meaning of the returned pointer is not visible here. */
114 OPAL_DECLSPEC
void *MPIR_checkpoint_debugger_breakpoint(
void);
/* Returns void *; NOTE(review): the meaning of the returned pointer is not visible here. */
119 OPAL_DECLSPEC
void *MPIR_checkpoint_debugger_waitpoint(
void);
/* Has the void (int) shape of a signal handler; signo is the only argument. */
124 OPAL_DECLSPEC
void MPIR_checkpoint_debugger_signal_handler(
int signo);
/*
 * Takes prev_pid and returns an int.
 * NOTE(review): presumably reloads the process environment after a restart,
 * keyed by the pre-restart PID -- confirm against opal_cr.c.
 */
130 OPAL_DECLSPEC
int opal_cr_refresh_environ(
int prev_pid);
/*
 * Takes a single unnamed bool and returns an int.
 * NOTE(review): presumably sets opal_cr_is_enabled -- confirm.
 */
138 OPAL_DECLSPEC
int opal_cr_set_enabled(
bool);
/*
 * Called by OPAL_CR_TEST_CHECKPOINT_READY() and
 * OPAL_CR_TEST_CHECKPOINT_READY_STALL() once their guard condition holds.
 * Takes no arguments and returns nothing.
 */
169 OPAL_DECLSPEC
void opal_cr_test_if_checkpoint_ready(
void);
/*
 * Read by OPAL_CR_TEST_CHECKPOINT_READY_STALL(): that macro only calls
 * opal_cr_test_if_checkpoint_ready() while this flag is false.
 */
174 OPAL_DECLSPEC
extern bool opal_cr_stall_check;
/* NOTE(review): presumably true while the process is held in a stall -- confirm. */
175 OPAL_DECLSPEC
extern bool opal_cr_currently_stalled;
177 #if OPAL_ENABLE_FT_THREAD == 1
/*
 * Library entry/exit hooks.  They follow the
 * `#if OPAL_ENABLE_FT_THREAD == 1` guard; the matching #endif is not
 * visible in this listing.  When OPAL_ENABLE_FT_THREAD == 1, each
 * OPAL_CR_<X>_LIBRARY() / OPAL_CR_NOOP_PROGRESS() macro calls the
 * function of the corresponding name below.
 * All take no arguments and return nothing.
 */
179 OPAL_DECLSPEC
void opal_cr_thread_init_library(
void);
180 OPAL_DECLSPEC
void opal_cr_thread_finalize_library(
void);
181 OPAL_DECLSPEC
void opal_cr_thread_abort_library(
void);
182 OPAL_DECLSPEC
void opal_cr_thread_enter_library(
void);
183 OPAL_DECLSPEC
void opal_cr_thread_exit_library(
void);
184 OPAL_DECLSPEC
void opal_cr_thread_noop_progress(
void);
/*
 * No-op variants of the checkpoint hook macros, selected when
 * OPAL_ENABLE_FT or OPAL_ENABLE_FT_CR is 0.  Each one expands to a lone
 * `;`, so a call site that adds its own semicolon yields two empty
 * statements.
 * NOTE(review): that breaks an unbraced `if (c) MACRO(); else ...`; an
 * empty `do { } while (0)` body would avoid it, but must be checked
 * against every call site and the non-stub variants first.
 */
190 #if OPAL_ENABLE_FT == 0 || OPAL_ENABLE_FT_CR == 0
191 #define OPAL_CR_TEST_CHECKPOINT_READY() ;
192 #define OPAL_CR_TEST_CHECKPOINT_READY_STALL() ;
193 #define OPAL_CR_INIT_LIBRARY() ;
194 #define OPAL_CR_FINALIZE_LIBRARY() ;
195 #define OPAL_CR_ABORT_LIBRARY() ;
196 #define OPAL_CR_ENTER_LIBRARY() ;
197 #define OPAL_CR_EXIT_LIBRARY() ;
198 #define OPAL_CR_NOOP_PROGRESS() ;
204 #if OPAL_ENABLE_FT_CR == 1
205 #define OPAL_CR_TEST_CHECKPOINT_READY() \
207 if(OPAL_UNLIKELY(opal_cr_is_enabled) ) { \
208 opal_cr_test_if_checkpoint_ready(); \
212 #define OPAL_CR_TEST_CHECKPOINT_READY_STALL() \
214 if(OPAL_UNLIKELY(opal_cr_is_enabled && !opal_cr_stall_check)) { \
215 opal_cr_test_if_checkpoint_ready(); \
/*
 * Variants used when OPAL_ENABLE_FT_THREAD == 0: all six hooks expand to
 * the same OPAL_CR_TEST_CHECKPOINT_READY() poll.  The expansion already
 * ends in `;`, so a call site written `MACRO();` gets an extra empty
 * statement.
 */
220 #if OPAL_ENABLE_FT_THREAD == 0
221 #define OPAL_CR_INIT_LIBRARY() OPAL_CR_TEST_CHECKPOINT_READY();
222 #define OPAL_CR_FINALIZE_LIBRARY() OPAL_CR_TEST_CHECKPOINT_READY();
223 #define OPAL_CR_ABORT_LIBRARY() OPAL_CR_TEST_CHECKPOINT_READY();
224 #define OPAL_CR_ENTER_LIBRARY() OPAL_CR_TEST_CHECKPOINT_READY();
225 #define OPAL_CR_EXIT_LIBRARY() OPAL_CR_TEST_CHECKPOINT_READY();
226 #define OPAL_CR_NOOP_PROGRESS() OPAL_CR_TEST_CHECKPOINT_READY();
230 #if OPAL_ENABLE_FT_THREAD == 1
231 #define OPAL_CR_INIT_LIBRARY() \
233 opal_cr_thread_init_library(); \
235 #define OPAL_CR_FINALIZE_LIBRARY() \
237 opal_cr_thread_finalize_library(); \
239 #define OPAL_CR_ABORT_LIBRARY() \
241 opal_cr_thread_abort_library(); \
243 #define OPAL_CR_ENTER_LIBRARY() \
245 opal_cr_thread_enter_library(); \
247 #define OPAL_CR_EXIT_LIBRARY() \
249 opal_cr_thread_exit_library(); \
251 #define OPAL_CR_NOOP_PROGRESS() \
253 opal_cr_thread_noop_progress(); \
272 OPAL_DECLSPEC
int opal_cr_reg_notify_callback
/*
 * Takes no arguments and returns an int.
 * NOTE(review): by its name, the preparation step of the INC sequence that
 * opal_cr_inc_core() runs -- confirm the ordering and the return
 * convention in opal_cr.c.
 */
287 OPAL_DECLSPEC
int opal_cr_inc_core_prep(
void);
288 OPAL_DECLSPEC
int opal_cr_inc_core_ckpt(pid_t pid,
/*
 * Takes an int state and returns an int.
 * NOTE(review): by its name, the recovery step of the INC sequence that
 * opal_cr_inc_core() runs; the accepted state values and the return
 * convention are not visible here -- confirm in opal_cr.c.
 */
292 OPAL_DECLSPEC
int opal_cr_inc_core_recover(
int state);
299 OMPI_CR_INC_PRE_CRS_PRE_MPI = 0,
300 OMPI_CR_INC_PRE_CRS_POST_MPI = 1,
301 OMPI_CR_INC_CRS_PRE_CKPT = 2,
302 OMPI_CR_INC_CRS_POST_CKPT = 3,
303 OMPI_CR_INC_POST_CRS_PRE_MPI = 4,
304 OMPI_CR_INC_POST_CRS_POST_MPI = 5,
306 } opal_cr_user_inc_callback_event_t;
309 OMPI_CR_INC_STATE_PREPARE = 0,
310 OMPI_CR_INC_STATE_CONTINUE = 1,
311 OMPI_CR_INC_STATE_RESTART = 2,
312 OMPI_CR_INC_STATE_ERROR = 3
313 } opal_cr_user_inc_callback_state_t;
319 opal_cr_user_inc_callback_state_t state);
321 OPAL_DECLSPEC
int opal_cr_user_inc_register_callback
322 (opal_cr_user_inc_callback_event_t
event,
/*
 * Takes an INC event and an INC state, the same two arguments as the
 * opal_cr_user_inc_callback_fn_t signature, and returns an int.
 * NOTE(review): presumably invokes the user callback registered for
 * `event` -- confirm.  Unlike its neighbours, this exported symbol has no
 * opal_cr_ prefix.
 */
326 OPAL_DECLSPEC
int trigger_user_inc_callback(opal_cr_user_inc_callback_event_t
event,
327 opal_cr_user_inc_callback_state_t state);
/*
 * Timing support.  Called through OPAL_CR_DISPLAY_ALL_TIMERS(), which
 * first tests opal_cr_timing_enabled.
 */
355 OPAL_DECLSPEC
void opal_cr_display_all_timers(
void);
/* Called through OPAL_CR_CLEAR_TIMERS(), which first tests opal_cr_timing_enabled. */
356 OPAL_DECLSPEC
void opal_cr_clear_timers(
void);
/*
 * Gate for the OPAL_CR_*TIMER* macros.  Declared as bool here, although
 * those macros test it with `> 0`.
 */
358 OPAL_DECLSPEC
extern bool opal_cr_timing_enabled;
/* NOTE(review): presumably enables a barrier around timed sections -- confirm. */
359 OPAL_DECLSPEC
extern bool opal_cr_timing_barrier_enabled;
/* NOTE(review): presumably this process's rank and the rank that reports timings -- confirm. */
360 OPAL_DECLSPEC
extern int opal_cr_timing_my_rank;
361 OPAL_DECLSPEC
extern int opal_cr_timing_target_rank;
/*
 * Timer slot indices, numbered 0..20 in the order listed.
 * OPAL_CR_TIMER_MAX (21) is one past the last index.
 * NOTE(review): presumably these are the `idx` values passed to
 * OPAL_CR_SET_TIMER(idx) / opal_cr_set_time(idx), with OPAL_CR_TIMER_MAX
 * as the slot count -- confirm.  Keep the numbering dense when adding an
 * entry.
 */
364 #define OPAL_CR_TIMER_ENTRY0 0
365 #define OPAL_CR_TIMER_ENTRY1 1
366 #define OPAL_CR_TIMER_ENTRY2 2
367 #define OPAL_CR_TIMER_CRCPBR0 3
368 #define OPAL_CR_TIMER_CRCP0 4
369 #define OPAL_CR_TIMER_CRCPBR1 5
370 #define OPAL_CR_TIMER_P2P0 6
371 #define OPAL_CR_TIMER_P2P1 7
372 #define OPAL_CR_TIMER_P2PBR0 8
373 #define OPAL_CR_TIMER_CORE0 9
374 #define OPAL_CR_TIMER_CORE1 10
375 #define OPAL_CR_TIMER_COREBR0 11
376 #define OPAL_CR_TIMER_P2P2 12
377 #define OPAL_CR_TIMER_P2PBR1 13
378 #define OPAL_CR_TIMER_P2P3 14
379 #define OPAL_CR_TIMER_P2PBR2 15
380 #define OPAL_CR_TIMER_CRCP1 16
381 #define OPAL_CR_TIMER_COREBR1 17
382 #define OPAL_CR_TIMER_CORE2 18
383 #define OPAL_CR_TIMER_ENTRY3 19
384 #define OPAL_CR_TIMER_ENTRY4 20
385 #define OPAL_CR_TIMER_MAX 21
388 #define OPAL_CR_CLEAR_TIMERS() \
390 if(OPAL_UNLIKELY(opal_cr_timing_enabled > 0)) { \
391 opal_cr_clear_timers(); \
395 #define OPAL_CR_SET_TIMER(idx) \
397 if(OPAL_UNLIKELY(opal_cr_timing_enabled > 0)) { \
398 opal_cr_set_time(idx); \
402 #define OPAL_CR_DISPLAY_ALL_TIMERS() \
404 if(OPAL_UNLIKELY(opal_cr_timing_enabled > 0)) { \
405 opal_cr_display_all_timers(); \
OPAL_DECLSPEC void opal_cr_set_time(int idx)
Checkpoint life-cycle timing.
Definition: opal_cr.c:1195
OPAL_DECLSPEC int opal_cr_init(void)
Initialize the notification and coordination elements.
Definition: opal_cr.c:197
OPAL output stream facility.
OPAL_DECLSPEC int opal_cr_finalize(void)
Finalize the notification and coordination elements.
Definition: opal_cr.c:468
Structure to represent a single event.
Definition: event_struct.h:87
int(* opal_cr_notify_callback_fn_t)(opal_cr_ckpt_cmd_state_t)
A function to respond to the async checkpoint request; this is useful when figuring out who should res...
Definition: opal_cr.h:270
Structure for a single-process snapshot. Each component is assumed to have extended this definition in th...
Definition: crs.h:107
OPAL_DECLSPEC int opal_cr_coord(int state)
OPAL Checkpoint Coordination Routine.
Definition: opal_cr.c:774
int(* opal_cr_coord_callback_fn_t)(int)
Coordination callback routine signature.
Definition: opal_cr.h:336
Compiler-specific prefetch functions.
OPAL_DECLSPEC int opal_cr_inc_core(pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_base_ckpt_options_t *options, int *state)
Function to go through the INC.
Definition: opal_cr.c:736
int(* opal_cr_user_inc_callback_fn_t)(opal_cr_user_inc_callback_event_t event, opal_cr_user_inc_callback_state_t state)
User coordination callback routine.
Definition: opal_cr.h:318
Checkpoint and Restart Service (CRS) Interface.
OPAL_DECLSPEC int opal_cr_reg_coord_callback(opal_cr_coord_callback_fn_t new_func, opal_cr_coord_callback_fn_t *prev_func)
Register a checkpoint coordination routine for a higher level.
Definition: opal_cr.c:874