OpenMPI  0.1.1
sstore.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2010 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * $COPYRIGHT$
6  *
7  * Additional copyrights may follow
8  *
9  * $HEADER$
10  */
11 /**
12  * @file
13  *
14  * Distributed Stable Storage (SStore) Interface
15  *
16  */
17 
18 #ifndef MCA_SSTORE_H
19 #define MCA_SSTORE_H
20 
21 #include "orte_config.h"
22 #include "orte/constants.h"
23 #include "orte/types.h"
25 
26 #include "opal/mca/mca.h"
27 #include "opal/mca/base/base.h"
28 
29 #include "opal/class/opal_object.h"
30 
31 BEGIN_C_DECLS
32 
33 /**
34  * Keys accepted as metadata
35  */
36 typedef uint32_t orte_sstore_base_key_t;
37 /** CRS Component */
38 #define SSTORE_METADATA_LOCAL_CRS_COMP 0
39 /** Compress Component */
40 #define SSTORE_METADATA_LOCAL_COMPRESS_COMP 1
41 /** Compress Component Postfix */
42 #define SSTORE_METADATA_LOCAL_COMPRESS_POSTFIX 2
43 /** Process PID */
44 #define SSTORE_METADATA_LOCAL_PID 3
45 /** Checkpoint Context File */
46 #define SSTORE_METADATA_LOCAL_CONTEXT 4
47 /** Directory to make on restart */
48 #define SSTORE_METADATA_LOCAL_MKDIR 5
49 /** File to touch on restart */
50 #define SSTORE_METADATA_LOCAL_TOUCH 6
51 
52 /** Local snapshot reference (e.g., opal_snapshot_0.ckpt) */
53 #define SSTORE_METADATA_LOCAL_SNAP_REF 7
54 /** Local snapshot reference format string (e.g., opal_snapshot_%d.ckpt) passed vpid */
55 #define SSTORE_METADATA_LOCAL_SNAP_REF_FMT 8
56 /** Local snapshot directory (Full Path excluding reference) */
57 #define SSTORE_METADATA_LOCAL_SNAP_LOC 9
58 /** Local snapshot reference directory (Full Path) */
59 #define SSTORE_METADATA_LOCAL_SNAP_REF_LOC_FMT 10
60 /** Local snapshot metadata file (Full Path) */
61 #define SSTORE_METADATA_LOCAL_SNAP_META 11
62 
63 /** Global snapshot reference (e.g., ompi_global_snapshot_1234.ckpt) */
64 #define SSTORE_METADATA_GLOBAL_SNAP_REF 12
65 /** Global snapshot location (Relative Path from base) */
66 #define SSTORE_METADATA_GLOBAL_SNAP_LOC 13
67 /** Global snapshot location (Full path) */
68 #define SSTORE_METADATA_GLOBAL_SNAP_LOC_ABS 14
69 /** Global snapshot metadata file (Full path) */
70 #define SSTORE_METADATA_GLOBAL_SNAP_META 15
71 /** Global snapshot sequence number */
72 #define SSTORE_METADATA_GLOBAL_SNAP_SEQ 16
73 /** AMCA Parameter to be preserved for ompi-restart */
74 #define SSTORE_METADATA_GLOBAL_AMCA_PARAM 17
75 
76 /** Total number of sequence numbers for this snapshot */
77 #define SSTORE_METADATA_GLOBAL_SNAP_NUM_SEQ 18
78 /** Comma separated list of all sequence numbers for this snapshot */
79 #define SSTORE_METADATA_GLOBAL_SNAP_ALL_SEQ 19
80 
81 /** Access the current default base directory (Full Path) */
82 #define SSTORE_METADATA_BASE_LOC 20
83 
84 /** The local process is skipping the checkpoint
85  * Usually this is because there is a migration, and it is not participating
86  */
87 #define SSTORE_METADATA_LOCAL_SKIP_CKPT 21
88 
89 /** A Migration checkpoint does not necessarily contain all of the processes
90  * in the job, so it is not a checkpoint that can be restarted from normally.
91  * Therefore, it needs to be marked specially. */
92 #define SSTORE_METADATA_GLOBAL_MIGRATING 22
93 
94 /** Number of metadata keys defined above (one greater than the largest key value) */
95 #define SSTORE_METADATA_MAX 23
96 
97 /**
98  * Storage handle
99  */
100 #define ORTE_SSTORE_HANDLE OPAL_UINT32
101 typedef uint32_t orte_sstore_base_handle_t;
102 ORTE_DECLSPEC extern orte_sstore_base_handle_t orte_sstore_handle_current;
103 ORTE_DECLSPEC extern orte_sstore_base_handle_t orte_sstore_handle_last_stable;
104 #define ORTE_SSTORE_HANDLE_INVALID 0
105 
106 /**
107  * Local and Global snapshot information structure
108  * Primarily used by orte-restart as an abstract way to handle metadata
109  */
111  /** List super object */
113 
114  /** Stable Storage Handle */
115  orte_sstore_base_handle_t ss_handle;
116 
117  /** ORTE Process name */
119 
120  /** CRS Component */
121  char *crs_comp;
122 
123  /** Compress Component */
125 
126  /** Compress Component Postfix */
128 
129  /** Start/End Timestamps */
130  char *start_time;
131  char *end_time;
132 };
135 
137 
139  /** List super object */
141 
142  /** A list of orte_sstore_base_local_snapshot_info_t's */
144 
145  /** Stable Storage Handle */
146  orte_sstore_base_handle_t ss_handle;
147 
148  /** Start Timestamp */
149  char * start_time;
150 
151  /** End Timestamp */
152  char * end_time;
153 
154  /** Sequence number */
155  int seq_num;
156 
157  /** Reference */
158  char *reference;
159 
160  /** AMCA parameter used */
161  char *amca_param;
162 
163  /** Internal use only: Cache some information on the structure */
164  int num_seqs;
165  char ** all_seqs;
166  char *basedir;
167  char *metadata_filename;
168 };
171 
173 
174 /**
175  * Module initialization function.
176  * Returns ORTE_SUCCESS
177  */
179  (void);
180 
181 /**
182  * Module finalization function.
183  * Returns ORTE_SUCCESS
184  */
186  (void);
187 
188 /**
189  * Request a checkpoint storage handle from stable storage
190  *
191  * @param handle Checkpoint storage handle
192  * @param seq Sequence number (int)
193  * @param jobid Job ID (orte_jobid_t)
194  *
195  * @return ORTE_SUCCESS on success
196  * @return ORTE_ERROR on failure
197  */
199  (orte_sstore_base_handle_t *handle, int seq, orte_jobid_t jobid);
200 
201 /**
202  * Request a restart storage handle from stable storage
203  * This function will fail if the key cannot be matched.
204  * If multiple matches exist, it will return the latest one.
205  * If the key is NULL, then the latest entry will be used.
206  *
207  * @param handle Restart storage handle
208  *
209  * @return ORTE_SUCCESS on success
210  * @return ORTE_ERROR on failure
211  */
213  (orte_sstore_base_handle_t *handle,
214  char *basedir, char *ref, int seq,
216 
217 /**
218  * Request snapshot info from a given handle.
219  * If the key is NULL, then the latest entry will be used.
220  *
221  * @param handle Restart storage handle
222  *
223  * @return ORTE_SUCCESS on success
224  * @return ORTE_ERROR on failure
225  */
227  (orte_sstore_base_handle_t *handle,
229 
230 /**
231  * Register access to a handle.
232  *
233  * @param handle Storage handle
234  *
235  * @return ORTE_SUCCESS on success
236  * @return ORTE_ERROR on failure
237  */
239  (orte_sstore_base_handle_t handle);
240 
241 /**
242  * Get attribute on the storage handle
243  *
244  * @param handle Storage handle
245  * @param key Key to access
246  * @param value Value of the key. NULL if not available
247  *
248  * @return ORTE_SUCCESS on success
249  * @return ORTE_ERROR on failure
250  */
252  (orte_sstore_base_handle_t handle, orte_sstore_base_key_t key, char **value);
253 
254 /**
255  * Set attribute on the storage handle
256  *
257  * @param handle Storage handle
258  * @param key Key to set
259  * @param value Value of the key.
260  *
261  * @return ORTE_SUCCESS on success
262  * @return ORTE_ERROR on failure
263  */
265  (orte_sstore_base_handle_t handle, orte_sstore_base_key_t key, char *value);
266 
267 /**
268  * Synchronize the handle.
269  *
270  * @param handle Storage handle to synchronize (passed by value)
271  *
272  * @return ORTE_SUCCESS on success
273  * @return ORTE_ERROR on failure
274  */
275 typedef int (*orte_sstore_base_sync_fn_t)
276  (orte_sstore_base_handle_t handle);
277 
278 /**
279  * Remove data associated with the handle.
280  *
281  * @param handle Storage handle whose associated data is removed (passed by value)
282  *
283  * @return ORTE_SUCCESS on success
284  * @return ORTE_ERROR on failure
285  */
286 typedef int (*orte_sstore_base_remove_fn_t)
287  (orte_sstore_base_handle_t handle);
288 
289 /**
290  * Pack a handle into a buffer.
291  * Only called between the HNP and ORTED (or Global and Local SnapC coordinators)
292  *
293  * @param peer Peer to which this is being sent (or NULL if to all peers)
294  * @param buffer Buffer to pack the data into
295  * @param handle Storage handle to pack (passed by value)
296  *
297  * @return ORTE_SUCCESS on success
298  * @return ORTE_ERROR on failure
299  */
300 typedef int (*orte_sstore_base_pack_fn_t)
301  (orte_process_name_t* peer, opal_buffer_t* buffer, orte_sstore_base_handle_t handle);
302 
303 /**
304  * Unpack a handle from a buffer.
305  * Only called between the HNP and ORTED (or Global and Local SnapC coordinators)
306  *
307  * @param peer Peer from which this was received
308  * @param buffer Buffer to unpack the data from
309  * @param handle Pointer to a storage handle (presumably receives the unpacked handle; confirm against the components)
310  *
311  * @return ORTE_SUCCESS on success
312  * @return ORTE_ERROR on failure
313  */
314 typedef int (*orte_sstore_base_unpack_fn_t)
315  (orte_process_name_t* peer, opal_buffer_t* buffer, orte_sstore_base_handle_t *handle);
316 
317 /**
318  * Fetch application context dependencies before local launch
319  *
320  * @param app Application context
321  *
322  * @return ORTE_SUCCESS on success
323  * @return ORTE_ERROR on failure
324  */
327 
328 /**
329  * Wait for all application context dependencies to be fetched
330  *
331  * @return ORTE_SUCCESS on success
332  * @return ORTE_ERROR on failure
333  */
335  (void);
336 
337 /**
338  * Structure for SSTORE components.
339  */
341  /** MCA base component */
343  /** MCA base data */
345 
346  /** Verbosity Level */
347  int verbose;
348  /** Output Handle for opal_output */
350  /** Default Priority */
351  int priority;
352 };
355 
356 /**
357  * Structure for SSTORE modules
358  */
360  /** Initialization Function */
362  /** Finalization Function */
364 
365  /** Request handle */
367  orte_sstore_base_request_restart_handle_fn_t request_restart_handle;
368  orte_sstore_base_request_global_snapshot_data_fn_t request_global_snapshot_data;
370 
371  /** Get/Set Attributes */
374 
375  /** Sync */
377 
378  /** Remove */
380 
381  /** Pack/Unpack Handle */
383  orte_sstore_base_unpack_fn_t unpack_handle;
384 
385  /** Launch Helpers */
388 };
391 
392 ORTE_DECLSPEC extern orte_sstore_base_module_t orte_sstore;
393 
394 /**
395  * Macro for use in components that are of type SSTORE
396  */
397 #define ORTE_SSTORE_BASE_VERSION_2_0_0 \
398  MCA_BASE_VERSION_2_0_0, \
399  "sstore", 2, 0, 0
400 
401 END_C_DECLS
402 
403 #endif /* MCA_SSTORE_H */
404 
int(* orte_sstore_base_register_handle_fn_t)(orte_sstore_base_handle_t handle)
Register access to a handle.
Definition: sstore.h:239
Information about a specific application to be launched in the RTE.
Definition: orte_globals.h:196
int(* orte_sstore_base_pack_fn_t)(orte_process_name_t *peer, opal_buffer_t *buffer, orte_sstore_base_handle_t handle)
Pack a handle into a buffer Only called between the HNP and ORTED (or Global and Local SnapC coordina...
Definition: sstore.h:301
Common type for all MCA components.
Definition: mca.h:250
int(* orte_sstore_base_remove_fn_t)(orte_sstore_base_handle_t handle)
Remove data associated with the handle.
Definition: sstore.h:287
orte_sstore_base_sync_fn_t sync
Sync.
Definition: sstore.h:376
opal_list_t local_snapshots
A list of orte_sstore_base_local_snapshot_info_t's.
Definition: sstore.h:143
int(* orte_sstore_base_wait_all_deps_fn_t)(void)
Wait for all application context dependencies to be fetched.
Definition: sstore.h:335
int(* orte_sstore_base_set_attribute_fn_t)(orte_sstore_base_handle_t handle, orte_sstore_base_key_t key, char *value)
Set attribute on the storage handle.
Definition: sstore.h:265
orte_sstore_base_module_finalize_fn_t sstore_finalize
Finalization Function.
Definition: sstore.h:363
uint32_t orte_jobid_t
Set the allowed range for ids in each space.
Definition: types.h:76
int priority
Default Priority.
Definition: sstore.h:351
Definition: types.h:146
char * end_time
End Timestamp.
Definition: sstore.h:152
orte_sstore_base_handle_t ss_handle
Stable Storage Handle.
Definition: sstore.h:115
orte_sstore_base_module_init_fn_t sstore_init
Initialization Function.
Definition: sstore.h:361
orte_process_name_t process_name
ORTE Process name.
Definition: sstore.h:118
Structure for SSTORE modules.
Definition: sstore.h:359
char * amca_param
AMCA parameter used.
Definition: sstore.h:161
Structure for SSTORE components.
Definition: sstore.h:340
Top-level interface for all MCA components.
Definition: opal_list.h:98
int(* orte_sstore_base_get_attribute_fn_t)(orte_sstore_base_handle_t handle, orte_sstore_base_key_t key, char **value)
Get attribute on the storage handle.
Definition: sstore.h:252
orte_sstore_base_pack_fn_t pack_handle
Pack/Unpack Handle.
Definition: sstore.h:382
int output_handle
Output Handle for opal_output.
Definition: sstore.h:349
char * start_time
Start/End Timestamps.
Definition: sstore.h:130
orte_sstore_base_request_checkpoint_handle_fn_t request_checkpoint_handle
Request handle.
Definition: sstore.h:366
char * crs_comp
CRS Component.
Definition: sstore.h:121
opal_list_item_t super
List super object.
Definition: sstore.h:112
char * compress_comp
Compress Component.
Definition: sstore.h:124
orte_sstore_base_fetch_app_deps_fn_t fetch_app_deps
Launch Helpers.
Definition: sstore.h:386
int num_seqs
Internal use only: Cache some information on the structure.
Definition: sstore.h:164
orte_sstore_base_get_attribute_fn_t get_attr
Get/Set Attributes.
Definition: sstore.h:372
char * compress_postfix
Compress Component Postfix.
Definition: sstore.h:127
int(* orte_sstore_base_request_restart_handle_fn_t)(orte_sstore_base_handle_t *handle, char *basedir, char *ref, int seq, orte_sstore_base_global_snapshot_info_t *snapshot)
Request a restart storage handle from stable storage This function will fail if the key cannot be mat...
Definition: sstore.h:213
int(* orte_sstore_base_fetch_app_deps_fn_t)(orte_app_context_t *app)
Fetch application context dependencies before local launch.
Definition: sstore.h:326
char * start_time
Start Timestamp.
Definition: sstore.h:149
BEGIN_C_DECLS typedef uint32_t orte_sstore_base_key_t
Keys accepted as metadata.
Definition: sstore.h:36
Local and Global snapshot information structure Primarily used by orte-restart as an abstract way to ...
Definition: sstore.h:110
mca_base_component_t base_version
MCA base component.
Definition: sstore.h:342
Meta data for MCA v2.0.0 components.
Definition: mca.h:309
Definition: opal_list.h:147
Global params for OpenRTE.
char * reference
Reference.
Definition: sstore.h:158
int(* orte_sstore_base_module_finalize_fn_t)(void)
Module finalization function.
Definition: sstore.h:186
int(* orte_sstore_base_request_checkpoint_handle_fn_t)(orte_sstore_base_handle_t *handle, int seq, orte_jobid_t jobid)
Request a checkpoint storage handle from stable storage.
Definition: sstore.h:199
int verbose
Verbosity Level.
Definition: sstore.h:347
int(* orte_sstore_base_unpack_fn_t)(orte_process_name_t *peer, opal_buffer_t *buffer, orte_sstore_base_handle_t *handle)
Unpack a handle from a buffer Only called between the HNP and ORTED (or Global and Local SnapC coordin...
Definition: sstore.h:315
Structure for holding a buffer to be used with the RML or OOB subsystems.
Definition: dss_types.h:159
opal_list_item_t super
List super object.
Definition: sstore.h:140
int(* orte_sstore_base_request_global_snapshot_data_fn_t)(orte_sstore_base_handle_t *handle, orte_sstore_base_global_snapshot_info_t *snapshot)
Request snapshot info from a given handle.
Definition: sstore.h:227
A simple C-language object-oriented system with single inheritance and ownership-based memory managem...
int seq_num
Sequence number.
Definition: sstore.h:155
orte_sstore_base_handle_t ss_handle
Stable Storage Handle.
Definition: sstore.h:146
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236
int(* orte_sstore_base_sync_fn_t)(orte_sstore_base_handle_t handle)
Synchronize the handle.
Definition: sstore.h:276
mca_base_component_data_t base_data
MCA base data.
Definition: sstore.h:344
int(* orte_sstore_base_module_init_fn_t)(void)
Module initialization function.
Definition: sstore.h:179