OpenMPI  0.1.1
snapc.h File Reference

Snapshot Coordination (SNAPC) Interface. More...

#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "opal/class/opal_object.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/util/output.h"
#include "orte/mca/sstore/sstore.h"

Go to the source code of this file.

Data Structures

struct  orte_snapc_base_local_snapshot_1_0_0_t
 Definition of an ORTE local snapshot. More...
 
struct  orte_snapc_base_global_snapshot_1_0_0_t
 Definition of the global snapshot. More...
 
struct  orte_snapc_base_quiesce_1_0_0_t
 
struct  orte_snapc_base_request_op_1_0_0_t
 
struct  orte_snapc_base_component_2_0_0_t
 Structure for SNAPC components. More...
 
struct  orte_snapc_base_module_1_0_0_t
 Structure for SNAPC modules. More...
 

Macros

#define ORTE_SNAPC_CKPT_STATE_ERROR   0
 States that a process can be in while checkpointing.
 
#define ORTE_SNAPC_CKPT_STATE_NONE   1
 
#define ORTE_SNAPC_CKPT_STATE_REQUEST   2
 
#define ORTE_SNAPC_CKPT_STATE_PENDING   3
 
#define ORTE_SNAPC_CKPT_STATE_RUNNING   4
 
#define ORTE_SNAPC_CKPT_STATE_INC_PREPED   5
 
#define ORTE_SNAPC_CKPT_STATE_STOPPED   6
 
#define ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL   7
 
#define ORTE_SNAPC_CKPT_STATE_MIGRATING   8
 
#define ORTE_SNAPC_CKPT_STATE_ESTABLISHED   9
 
#define ORTE_SNAPC_CKPT_STATE_RECOVERED   10
 
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT   11
 
#define ORTE_SNAPC_CKPT_STATE_NO_RESTART   12
 
#define ORTE_SNAPC_CKPT_MAX   13
 
#define ORTE_SNAPC_CKPT_SHIFT   131072
 Sufficiently high shift value to avoid colliding the process checkpointing states above with the ORTE process states.
 
#define ORTE_SNAPC_CKPT_NOTIFY(state)   (ORTE_SNAPC_CKPT_SHIFT + state)
 
#define ORTE_SNAPC_CKPT_STATE(state)   (state - ORTE_SNAPC_CKPT_SHIFT)
 
#define CHECK_ORTE_SNAPC_CKPT_STATE(state)   (state >= ORTE_SNAPC_CKPT_SHIFT)
 
#define ORTE_SNAPC_BASE_VERSION_2_0_0
 Macro for use in components that are of type SNAPC. More...
 

Typedefs

typedef struct
orte_snapc_base_local_snapshot_1_0_0_t 
orte_snapc_base_local_snapshot_1_0_0_t
 
typedef struct
orte_snapc_base_local_snapshot_1_0_0_t 
orte_snapc_base_local_snapshot_t
 
typedef struct
orte_snapc_base_global_snapshot_1_0_0_t 
orte_snapc_base_global_snapshot_1_0_0_t
 
typedef struct
orte_snapc_base_global_snapshot_1_0_0_t 
orte_snapc_base_global_snapshot_t
 
typedef struct
orte_snapc_base_quiesce_1_0_0_t 
orte_snapc_base_quiesce_1_0_0_t
 
typedef struct
orte_snapc_base_quiesce_1_0_0_t 
orte_snapc_base_quiesce_t
 
typedef struct
orte_snapc_base_request_op_1_0_0_t 
orte_snapc_base_request_op_1_0_0_t
 
typedef struct
orte_snapc_base_request_op_1_0_0_t 
orte_snapc_base_request_op_t
 
typedef int(* orte_snapc_base_module_init_fn_t )(bool seed, bool app)
 Module initialization function. More...
 
typedef int(* orte_snapc_base_module_finalize_fn_t )(void)
 Module finalization function. More...
 
typedef int(* orte_snapc_base_setup_job_fn_t )(orte_jobid_t jobid)
 Set up the necessary structures for this job. Returns ORTE_SUCCESS.
 
typedef int(* orte_snapc_base_release_job_fn_t )(orte_jobid_t jobid)
 Release the structures previously set up for this job. Returns ORTE_SUCCESS.
 
typedef int(* orte_snapc_base_ft_event_fn_t )(int state)
 Handle fault tolerance updates. More...
 
typedef int(* orte_snapc_base_start_checkpoint_fn_t )(orte_snapc_base_quiesce_t *datum)
 Start a checkpoint originating from an internal source. More...
 
typedef int(* orte_snapc_base_end_checkpoint_fn_t )(orte_snapc_base_quiesce_t *datum)
 Signal end of checkpoint epoch originating from an internal source. More...
 
typedef int(* orte_snapc_base_request_op_fn_t )(orte_snapc_base_request_op_t *datum)
 Request a checkpoint related operation to take place.
 
typedef struct
orte_snapc_base_component_2_0_0_t 
orte_snapc_base_component_2_0_0_t
 
typedef struct
orte_snapc_base_component_2_0_0_t 
orte_snapc_base_component_t
 
typedef struct
orte_snapc_base_module_1_0_0_t 
orte_snapc_base_module_1_0_0_t
 
typedef struct
orte_snapc_base_module_1_0_0_t 
orte_snapc_base_module_t
 

Enumerations

enum  orte_snapc_base_request_op_event_t {
  ORTE_SNAPC_OP_NONE = 0, ORTE_SNAPC_OP_INIT, ORTE_SNAPC_OP_FIN, ORTE_SNAPC_OP_FIN_ACK,
  ORTE_SNAPC_OP_CHECKPOINT, ORTE_SNAPC_OP_RESTART, ORTE_SNAPC_OP_MIGRATE, ORTE_SNAPC_OP_QUIESCE_START,
  ORTE_SNAPC_OP_QUIESCE_CHECKPOINT, ORTE_SNAPC_OP_QUIESCE_END
}
 Application request for a global checkpoint related operation.
 

Functions

ORTE_DECLSPEC OBJ_CLASS_DECLARATION (orte_snapc_base_local_snapshot_t)
 
ORTE_DECLSPEC OBJ_CLASS_DECLARATION (orte_snapc_base_global_snapshot_t)
 
ORTE_DECLSPEC OBJ_CLASS_DECLARATION (orte_snapc_base_quiesce_t)
 
ORTE_DECLSPEC OBJ_CLASS_DECLARATION (orte_snapc_base_request_op_t)
 

Variables

ORTE_DECLSPEC
orte_snapc_base_module_t 
orte_snapc
 
ORTE_DECLSPEC
orte_snapc_base_component_t 
orte_snapc_base_selected_component
 

Detailed Description

Snapshot Coordination (SNAPC) Interface.

Terminology:

  • Global Snapshot Coordinator: HNP(s) coordination function.
  • Local Snapshot Coordinator: VHNP(s) [e.g., orted] coordination function.
  • Application Snapshot Coordinator: Application-level coordination function.
  • Local Snapshot: Snapshot generated by a single process in the parallel job.
  • Local Snapshot Reference: A generic reference to the physical Local Snapshot.
  • Global Snapshot: Snapshot generated for the entire parallel job.
  • Global Snapshot Reference: A generic reference to the physical Global Snapshot.

General Description:

This framework is tasked with:

  • Initiating the checkpoint in the system
  • Physically moving the local snapshot files to a location. Initially, this location is the node on which the Head Node Process (HNP) is running, but later this will be a replicated checkpoint server or the like.
  • Generating a 'global snapshot handle' that the user can use to restart the parallel job.

Each component will have 3 tiers of behavior that must work in concert:

  • Global Snapshot Coordinator: These are the HNP's tasks: mostly distributing the notification of the checkpoint, and then compiling the physical and virtual nature of the global snapshot handle.
  • Local Snapshot Coordinator: These are the VHNP's (or orted's, if available) tasks. This will involve working with the Global Snapshot Coordinator to route the physical and virtual 'local snapshots' from the application to the desired location. This process must also notify the Global Snapshot Coordinator when its set of processes has completed the checkpoint.
  • Application Snapshot Coordinator: This is the application-level coordinator. It is very light: just a subscription to be triggered when it needs to checkpoint, and then, once finished with the checkpoint, a notification to the Local Snapshot Coordinator that it is complete. If there is no orted (so no bootproxy), then the application assumes the responsibility of the Local Snapshot Coordinator as well.

Macro Definition Documentation

#define ORTE_SNAPC_BASE_VERSION_2_0_0
Value:
MCA_BASE_VERSION_2_0_0, \
"snapc", 2, 0, 0

Macro for use in components that are of type SNAPC.

Typedef Documentation

typedef int(* orte_snapc_base_end_checkpoint_fn_t)(orte_snapc_base_quiesce_t *datum)

Signal end of checkpoint epoch originating from an internal source.

Parameters
[in] epoch: Epoch number to associate with this checkpoint operation. Returns ORTE_SUCCESS.
typedef int(* orte_snapc_base_ft_event_fn_t)(int state)

Handle fault tolerance updates.

Parameters
[in] state: Fault tolerance state update
Return values
ORTE_SUCCESS: The operation completed successfully
ORTE_ERROR: An unspecified error occurred
typedef int(* orte_snapc_base_module_finalize_fn_t)(void)

Module finalization function.

Returns ORTE_SUCCESS

typedef int(* orte_snapc_base_module_init_fn_t)(bool seed, bool app)

Module initialization function.

Returns ORTE_SUCCESS

typedef int(* orte_snapc_base_start_checkpoint_fn_t)(orte_snapc_base_quiesce_t *datum)

Start a checkpoint originating from an internal source.

This really only makes sense to call from an application, but in the future we may allow the checkpoint operation to use this function from the local coordinator.

Parameters
[out] epoch: Epoch number to associate with this checkpoint operation. Returns ORTE_SUCCESS.