OpenMPI
0.1.1
|
Checkpoint and Restart Service (CRS) Interface. More...
#include "opal_config.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_object.h"
Go to the source code of this file.
Data Structures | |
struct | opal_crs_base_ckpt_options_1_0_0_t |
struct | opal_crs_base_snapshot_1_0_0_t |
Structure for a single-process snapshot. Each component is assumed to have extended this definition in the same way they extend the opal_crs_base_component_t below. More... | |
struct | opal_crs_base_component_2_0_0_t |
Structure for CRS components. More... | |
struct | opal_crs_base_module_1_0_0_t |
Structure for CRS modules. More... | |
Macros | |
#define | OPAL_CRS_BASE_VERSION_2_0_0 |
Macro for use in components that are of type CRS. More... | |
Typedefs | |
typedef enum opal_crs_state_type_t | opal_crs_state_type_t |
typedef struct opal_crs_base_ckpt_options_1_0_0_t | opal_crs_base_ckpt_options_1_0_0_t |
typedef struct opal_crs_base_ckpt_options_1_0_0_t | opal_crs_base_ckpt_options_t |
typedef struct opal_crs_base_snapshot_1_0_0_t | opal_crs_base_snapshot_1_0_0_t |
typedef struct opal_crs_base_snapshot_1_0_0_t | opal_crs_base_snapshot_t |
typedef int(* | opal_crs_base_module_init_fn_t )(void) |
Module initialization function. More... | |
typedef int(* | opal_crs_base_module_finalize_fn_t )(void) |
Module finalization function. More... | |
typedef int(* | opal_crs_base_module_checkpoint_fn_t )(pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_base_ckpt_options_t *options, opal_crs_state_type_t *state) |
Call the underlying checkpointer. More... | |
typedef int(* | opal_crs_base_module_restart_fn_t )(opal_crs_base_snapshot_t *snapshot, bool spawn_child, pid_t *child_pid) |
Call the underlying restart command for this process. Returns OPAL_SUCCESS or OPAL_CRS_ERROR. More... | |
typedef int(* | opal_crs_base_module_disable_checkpoint_fn_t )(void) |
Disable the checkpointer. Returns OPAL_SUCCESS or OPAL_CRS_ERROR. More... | |
typedef int(* | opal_crs_base_module_enable_checkpoint_fn_t )(void) |
Enable the checkpointer. Returns OPAL_SUCCESS or OPAL_CRS_ERROR. More... | |
typedef int(* | opal_crs_base_module_prelaunch_fn_t )(int32_t rank, char *base_snapshot_dir, char **app, char **cwd, char ***argv, char ***env) |
Prepare the CRS component for process launch. More... | |
typedef int(* | opal_crs_base_module_reg_thread_fn_t )(void) |
Register another thread that may call this library. More... | |
typedef struct opal_crs_base_component_2_0_0_t | opal_crs_base_component_2_0_0_t |
typedef struct opal_crs_base_component_2_0_0_t | opal_crs_base_component_t |
typedef struct opal_crs_base_module_1_0_0_t | opal_crs_base_module_1_0_0_t |
typedef struct opal_crs_base_module_1_0_0_t | opal_crs_base_module_t |
Enumerations | |
enum | opal_crs_state_type_t { OPAL_CRS_NONE = 0, OPAL_CRS_CHECKPOINT = 1, OPAL_CRS_RESTART_PRE = 2, OPAL_CRS_RESTART = 3, OPAL_CRS_CONTINUE = 4, OPAL_CRS_TERM = 5, OPAL_CRS_RUNNING = 6, OPAL_CRS_ERROR = 7, OPAL_CRS_STATE_MAX = 8 } |
States of the module. | |
Functions | |
OPAL_DECLSPEC | OBJ_CLASS_DECLARATION (opal_crs_base_ckpt_options_t) |
OPAL_DECLSPEC | OBJ_CLASS_DECLARATION (opal_crs_base_snapshot_t) |
Variables | |
OPAL_DECLSPEC opal_crs_base_module_t | opal_crs |
Checkpoint and Restart Service (CRS) Interface.
General Description:
The OPAL Checkpoint and Restart Service (CRS) was created to provide an abstract notion of a single-process checkpointer, so that upper levels can incorporate checkpoint/restart calls generically into their code. This keeps the upper levels from becoming too tied to a specific checkpoint and restart implementation.
This interface will change in the future to allow for some additional specialized functionality such as memory inclusion/exclusion, explicit restarting while running, and others.
Words to the Wise:
The CRS module must adhere to the API exactly in order to be fully supported. How the module goes about conforming to the API is an internal module issue, and in no case should the module impose restrictions upon the upper layers, as this is an API violation.
#define OPAL_CRS_BASE_VERSION_2_0_0 |
Macro for use in components that are of type CRS.
typedef int(* opal_crs_base_module_checkpoint_fn_t)(pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_base_ckpt_options_t *options, opal_crs_state_type_t *state) |
Call the underlying checkpointer.
Returns OPAL_SUCCESS upon success, and OPAL_ERROR otherwise.
Arguments: pid = PID of the process to checkpoint, or 0 if checkpointing self; fname = the filename where the checkpoint has been written; state = the state at which the checkpoint is exiting.
The 'fname' string is owned by the caller: if appropriate, it must be eventually freed by the caller.
typedef int(* opal_crs_base_module_disable_checkpoint_fn_t)(void) |
Disable the checkpointer. Returns OPAL_SUCCESS or OPAL_CRS_ERROR.
This should set a flag/mutex to disallow checkpoints from occurring. If a checkpoint were to occur while checkpoints are disabled, it should block until they are re-enabled. A quality module implementation would notify the user that the checkpoint has been delayed until the program is out of this critical section of code.
typedef int(* opal_crs_base_module_enable_checkpoint_fn_t)(void) |
Enable the checkpointer. Returns OPAL_SUCCESS or OPAL_CRS_ERROR.
This should set a flag/mutex to allow checkpoints to occur.
typedef int(* opal_crs_base_module_finalize_fn_t)(void) |
Module finalization function.
Returns OPAL_SUCCESS
typedef int(* opal_crs_base_module_init_fn_t)(void) |
Module initialization function.
Returns OPAL_SUCCESS
typedef int(* opal_crs_base_module_prelaunch_fn_t)(int32_t rank, char *base_snapshot_dir, char **app, char **cwd, char ***argv, char ***env) |
Prepare the CRS component for process launch.
Some CRS components need to take action before the process is ever launched to do such things as:
rank | Rank of the process to be started |
app | Absolute pathname of argv[0] |
argv | Standard argv-style array, including a final NULL pointer |
env | Standard environ-style array, including a final NULL pointer |
typedef int(* opal_crs_base_module_reg_thread_fn_t)(void) |
Register another thread that may call this library.
Some CR systems require that each thread that will call into their library register individually before doing so.
Returns OPAL_SUCCESS or OPAL_ERROR
typedef int(* opal_crs_base_module_restart_fn_t)(opal_crs_base_snapshot_t *snapshot, bool spawn_child, pid_t *child_pid) |
Call the underlying restart command for this process. Returns OPAL_SUCCESS or OPAL_CRS_ERROR.
Arguments: fname = checkpoint filename; spawn_child = true if the restarted process should be forked as a new process, in which case 'child_pid' will be returned, or false if the restarted process should overwrite the current process space; child_pid = PID of the child that was started, if applicable.