OpenMPI  0.1.1
crs.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2005 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2007 Evergrid, Inc. All rights reserved.
13  * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
14  *
15  * $COPYRIGHT$
16  *
17  * Additional copyrights may follow
18  *
19  * $HEADER$
20  */
21 /**
22  * @file
23  *
24  * Checkpoint and Restart Service (CRS) Interface
25  *
26  * General Description:
27  *
28  * The OPAL Checkpoint and Restart Service (CRS) was created to provide an
29  * abstract notion of a single process checkpointer for upper levels to
30  * incorporate checkpoint/restart calls generically into their code. This keeps
31  * the upper levels from becoming too tied to a specific checkpoint and restart
32  * implementation.
33  *
34  * This interface will change in the future to allow for some additional
35  * specialized functionality such as memory inclusion/exclusion, explicit
36  * restarting while running, and others.
37  *
38  * Words to the Wise:
39  *
40  * The CRS module must adhere to the API exactly in order to be fully supported.
41  * How the module goes about conforming to the API is an internal module issue
42  * and in no cases should the module impose restrictions upon the upper layers
43  * as this is an API violation.
44  *
45  */
46 
47 #ifndef MCA_CRS_H
48 #define MCA_CRS_H
49 
50 #include "opal_config.h"
51 #include "opal/mca/mca.h"
52 #include "opal/mca/base/base.h"
53 #include "opal/class/opal_object.h"
54 
55 BEGIN_C_DECLS
56 
57 /**
58  * States of the module
59  */
61  OPAL_CRS_NONE = 0,
62  OPAL_CRS_CHECKPOINT = 1,
63  OPAL_CRS_RESTART_PRE = 2,
64  OPAL_CRS_RESTART = 3, /* RESTART_POST */
65  OPAL_CRS_CONTINUE = 4,
66  OPAL_CRS_TERM = 5,
67  OPAL_CRS_RUNNING = 6,
68  OPAL_CRS_ERROR = 7,
69  OPAL_CRS_STATE_MAX = 8
70 };
72 
73 /*
74  * Possible checkpoint options
75  */
77  /** Parent is an object type */
79 
80  /** Terminate after checkpoint */
81  bool term;
82  /** Send SIGSTOP after checkpoint */
83  bool stop;
84 
85  /** INC Prep Only */
87 
88  /** INC Recover Only */
90 
91 #if OPAL_ENABLE_CRDEBUG == 1
92  /** Wait for debugger to attach after checkpoint */
93  bool attach_debugger;
94  /** Do not wait for debugger to reattach after checkpoint */
95  bool detach_debugger;
96 #endif
97 };
101 
102 /**
103  * Structure for Single process snapshot
104  * Each component is assumed to have extended this definition
105  * in the same way they extend the opal_crs_base_component_t below.
106  */
108  /** This is an object, so must have super */
110 
111  /** MCA Component name */
113 
114  /** Metadata filename */
116 
117  /** Metadata fd */
118  FILE * metadata;
119 
120  /** Absolute path to the snapshot directory */
122 
123  /** Cold Start:
124  * If we are restarting cold, then we need to recreate this structure.
125  * opal_restart would set this, and let the component do the heavy lifting
126  * of recreating the structure, since it doesn't know exactly how to.
127  */
129 };
132 
134 
135 /**
136  * Module initialization function.
137  * Returns OPAL_SUCCESS
138  */
139 typedef int (*opal_crs_base_module_init_fn_t)
140  (void);
141 
142 /**
143  * Module finalization function.
144  * Returns OPAL_SUCCESS
145  */
147  (void);
148 
149 /**
150  * Call the underlying checkpointer.
151  * Returns OPAL_SUCCESS upon success, and OPAL_ERROR otherwise.
152  *
153  * Arguments:
154  * pid = PID of the process to checkpoint, or 0 if checkpointing self.
155  * snapshot = the snapshot to which the checkpoint has been written.
156  * state = The state at which the checkpoint is exiting
157  * - OPAL_CRS_CONTINUE
158  * Continuing after a checkpoint has been taken
159  * - OPAL_CRS_RESTART
160  * Restarting from a checkpoint
161  * - OPAL_CRS_ERROR
162  * Checkpoint was not successful.
163  *
164  * The 'snapshot' object is owned by the caller: if appropriate, it must be eventually
165  * freed by the caller.
166  */
168  (pid_t pid,
169  opal_crs_base_snapshot_t *snapshot,
171  opal_crs_state_type_t *state);
172 
173 /**
174  * Call the underlying restart command for this process
175  * Returns OPAL_SUCCESS or OPAL_CRS_ERROR
176  *
177  * Arguments:
178  * snapshot = Snapshot (checkpoint) to restart from
179  * spawn_child = true if the restarted process should be forked as a new process,
180  * in which case 'child_pid' will be returned.
181  * false if the restarted process should overwrite the current
182  * process space.
183  * child_pid = PID of the child that was started, if applicable
184  *
185  */
188  bool spawn_child,
189  pid_t *child_pid);
190 
191 /**
192  * Disable the checkpointer
193  * Returns OPAL_SUCCESS or OPAL_CRS_ERROR
194  *
195  * This should set a flag/mutex to disallow checkpoints to occur.
196  * If a checkpoint were to occur while checkpoints are disabled,
197  * they should block until reenabled.
198  * A quality module implementation would notify the user that the
199  * checkpoint has been delayed until the program is out of this critical
200  * section of code.
201  */
203  (void);
204 
205 /**
206  * Enable the checkpointer
207  * Returns OPAL_SUCCESS or OPAL_CRS_ERROR
208  *
209  * This should set a flag/mutex to allow checkpoints to occur
210  */
212  (void);
213 
214 /**
215  * Prepare the CRS component for process launch.
216  * Some CRS components need to take action before the
217  * process is ever launched to do such things as:
218  * - seed the process environment
219  * - LD_PRELOAD
220  * - Analyze the binary before launch
221  *
222  * @param rank Rank of the process to be started
223  * @param app Absolute pathname of argv[0]
224  * @param argv Standard argv-style array, including a final NULL pointer
225  * @param env Standard environ-style array, including a final NULL pointer
226  */
228  (int32_t rank,
229  char *base_snapshot_dir,
230  char **app,
231  char **cwd,
232  char ***argv,
233  char ***env);
234 
235 /**
236  * Register another thread that may call this library.
237  * Some CR systems require that each thread that will call into their library
238  * register individually before doing so.
239  *
240  * Returns OPAL_SUCCESS or OPAL_ERROR
241  */
243  (void);
244 
245 /**
246  * Structure for CRS components.
247  */
249  /** MCA base component */
251  /** MCA base data */
253 
254  /** Verbosity Level */
255  int verbose;
256  /** Output Handle for opal_output */
258  /** Default Priority */
259  int priority;
260 };
263 
264 /**
265  * Structure for CRS modules
266  */
268  /** Initialization Function */
270  /** Finalization Function */
272 
273  /** Checkpoint interface */
275 
276  /** Restart Interface */
278 
279  /** Disable checkpoints */
281  /** Enable checkpoints */
283 
284  /** Pre Launch */
286 
287  /** Per thread registration */
289 };
292 
293 OPAL_DECLSPEC extern opal_crs_base_module_t opal_crs;
294 
295 /**
296  * Macro for use in components that are of type CRS
297  */
298 #define OPAL_CRS_BASE_VERSION_2_0_0 \
299  MCA_BASE_VERSION_2_0_0, \
300  "crs", 2, 0, 0
301 
302 END_C_DECLS
303 
304 #endif /* MCA_CRS_H */
305 
Common type for all MCA components.
Definition: mca.h:250
int verbose
Verbosity Level.
Definition: crs.h:255
int output_handle
Output Handle for opal_output.
Definition: crs.h:257
int(* opal_crs_base_module_disable_checkpoint_fn_t)(void)
Disable the checkpointer Returns OPAL_SUCCESS or OPAL_CRS_ERROR.
Definition: crs.h:203
char * component_name
MCA Component name.
Definition: crs.h:112
bool term
Terminate after checkpoint.
Definition: crs.h:81
opal_crs_base_module_prelaunch_fn_t crs_prelaunch
Pre Launch.
Definition: crs.h:285
int(* opal_crs_base_module_restart_fn_t)(opal_crs_base_snapshot_t *snapshot, bool spawn_child, pid_t *child_pid)
Call the underlying restart command for this process Returns OPAL_SUCCESS or OPAL_CRS_ERROR.
Definition: crs.h:187
int(* opal_crs_base_module_reg_thread_fn_t)(void)
Register another thread that may call this library.
Definition: crs.h:243
int(* opal_crs_base_module_init_fn_t)(void)
Module initialization function.
Definition: crs.h:140
int(* opal_crs_base_module_prelaunch_fn_t)(int32_t rank, char *base_snapshot_dir, char **app, char **cwd, char ***argv, char ***env)
Prepare the CRS component for process launch.
Definition: crs.h:228
Top-level interface for all MCA components.
Definition: opal_list.h:98
bool inc_prep_only
INC Prep Only.
Definition: crs.h:86
Structure for CRS modules.
Definition: crs.h:267
opal_crs_base_module_reg_thread_fn_t crs_reg_thread
Per thread registration.
Definition: crs.h:288
FILE * metadata
Metadata fd.
Definition: crs.h:118
bool cold_start
Cold Start: If we are restarting cold, then we need to recreate this structure opal_restart would set...
Definition: crs.h:128
Structure for Single process snapshot Each component is assumed to have extended this definition in th...
Definition: crs.h:107
mca_base_component_t base_version
MCA base component.
Definition: crs.h:250
int priority
Default Priority.
Definition: crs.h:259
Structure for CRS components.
Definition: crs.h:248
int(* opal_crs_base_module_checkpoint_fn_t)(pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_base_ckpt_options_t *options, opal_crs_state_type_t *state)
Call the underlying checkpointer.
Definition: crs.h:168
int(* opal_crs_base_module_enable_checkpoint_fn_t)(void)
Enable the checkpointer Returns OPAL_SUCCESS or OPAL_CRS_ERROR.
Definition: crs.h:212
char * metadata_filename
Metadata filename.
Definition: crs.h:115
opal_list_item_t super
This is an object, so must have super.
Definition: crs.h:109
Base object.
Definition: opal_object.h:182
opal_crs_base_module_disable_checkpoint_fn_t crs_disable_checkpoint
Disable checkpoints.
Definition: crs.h:280
opal_crs_base_module_finalize_fn_t crs_finalize
Finalization Function.
Definition: crs.h:271
Meta data for MCA v2.0.0 components.
Definition: mca.h:309
int(* opal_crs_base_module_finalize_fn_t)(void)
Module finalization function.
Definition: crs.h:147
opal_crs_base_module_restart_fn_t crs_restart
Restart Interface.
Definition: crs.h:277
bool stop
Send SIGSTOP after checkpoint.
Definition: crs.h:83
opal_crs_base_module_checkpoint_fn_t crs_checkpoint
Checkpoint interface.
Definition: crs.h:274
bool inc_recover_only
INC Recover Only.
Definition: crs.h:89
char * snapshot_directory
Absolute path to the snapshot directory.
Definition: crs.h:121
A simple C-language object-oriented system with single inheritance and ownership-based memory managem...
mca_base_component_data_t base_data
MCA base data.
Definition: crs.h:252
opal_object_t super
Parent is an object type.
Definition: crs.h:78
opal_crs_base_module_enable_checkpoint_fn_t crs_enable_checkpoint
Enable checkpoints.
Definition: crs.h:282
opal_crs_base_module_init_fn_t crs_init
Initialization Function.
Definition: crs.h:269
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236
opal_crs_state_type_t
States of the module.
Definition: crs.h:60