OpenMPI  0.1.1
orte_wait.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2011 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2008 Institut National de Recherche en Informatique
13  * et Automatique. All rights reserved.
14  * $COPYRIGHT$
15  *
16  * Additional copyrights may follow
17  *
18  * $HEADER$
19  */
20 
21 /**
22  * @file
23  *
24  * Interface for waitpid / async notification of child death with the
25  * libevent runtime system.
26  */
27 #ifndef ORTE_WAIT_H
28 #define ORTE_WAIT_H
29 
30 #include "orte_config.h"
31 
32 #ifdef HAVE_SYS_TYPES_H
33 #include <sys/types.h>
34 #endif
35 #ifdef HAVE_SYS_TIME_H
36 #include <sys/time.h>
37 #endif
38 
39 #include "opal/dss/dss.h"
40 #include "opal/util/output.h"
41 #include "opal/sys/atomic.h"
42 #include "opal/mca/event/event.h"
43 
44 #include "orte/types.h"
45 #include "orte/mca/rml/rml_types.h"
47 
48 BEGIN_C_DECLS
49 
50 typedef struct {
51  opal_object_t super;
52  char *name;
53  int channel;
54  opal_atomic_lock_t lock;
57 
58 /** typedef for callback function used in \c orte_wait_cb */
59 typedef void (*orte_wait_fn_t)(pid_t wpid, int status, void *data);
60 
61 /**
62  * Disable / re-Enable SIGCHLD handler
63  *
64  * These functions have to be used after orte_wait_init was called.
65  */
66 
67 ORTE_DECLSPEC void orte_wait_enable(void);
68 ORTE_DECLSPEC void orte_wait_disable(void);
69 
70 /**
71  * Wait for process termination
72  *
73  * Similar to \c waitpid, \c orte_waitpid utilizes the run-time
74  * event library for process termination notification. The \c
75  * WUNTRACED option is not supported, but the \c WNOHANG option is
76  * supported.
77  *
78  * \note A \c wpid value of \c -1 is not currently supported and will
79  * return an error.
80  */
81 ORTE_DECLSPEC pid_t orte_waitpid(pid_t wpid, int *status, int options);
82 
83 
84 /**
85  * Register a callback for process termination
86  *
87  * Register a callback for notification when \c wpid causes a SIGCHLD.
88  * \c waitpid() will have already been called on the process at this
89  * time.
90  *
91  * If a thread is already blocked in \c orte_waitpid for \c wpid,
92  * this function will return \c ORTE_ERR_EXISTS. It is illegal for
93  * multiple callbacks to be registered for a single \c wpid
94  * (\c ORTE_ERR_EXISTS will be returned in this case as well).
95  *
96  * \warning It is not legal for \c wpid to be -1 when registering a
97  * callback.
98  */
99 ORTE_DECLSPEC int orte_wait_cb(pid_t wpid, orte_wait_fn_t callback, void *data);
100 
101 ORTE_DECLSPEC int orte_wait_cb_cancel(pid_t wpid);
102 
103 ORTE_DECLSPEC int orte_wait_cb_disable(void);
104 
105 ORTE_DECLSPEC int orte_wait_cb_enable(void);
106 
107 /**
108  * Setup to wait for an event
109  *
110  * This function is used to setup a trigger event that can be used elsewhere
111  * in the code base where we want to wait for some event to
112  * happen. For example, orterun uses this function to setup an event
113  * that is used to notify orterun of abnormal and normal termination
114  * so it can wakeup and exit cleanly.
115  *
116  * The event will be defined so that firing the provided trigger
117  * will cause the event to trigger and callback to the provided
118  * function
119  */
120 ORTE_DECLSPEC int orte_wait_event(opal_event_t **event,
121  orte_trigger_event_t *trig,
122  char *trigger_name,
123  void (*cbfunc)(int, short, void*));
124 
125 /**
126  * In a number of places in the code, we need to wait for something
127  * to complete - for example, waiting for all launched procs to
128  * report into the HNP. In such cases, we want to just call progress
129  * so that any messages get processed, but otherwise "hold" the
130  * program at this spot until the counter achieves the specified
131  * value. We also want to provide a boolean flag, though, so that
132  * we break out of the loop should something go wrong.
133  */
/*
 * ORTE_PROGRESSED_WAIT(failed, counter, limit)
 *
 * Logs the call site at verbosity 1, then calls opal_progress()
 * repeatedly for as long as (failed) is false and (counter) is below
 * (limit).  All three arguments are re-evaluated on every pass, so
 * they must be expressions whose value can change while
 * opal_progress() runs.
 *
 * The expansion deliberately carries no trailing semicolon: the caller
 * supplies it, which keeps the macro usable as the body of an
 * if/else.
 */
#define ORTE_PROGRESSED_WAIT(failed, counter, limit)            \
    do {                                                        \
        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,              \
                             "progressed_wait: %s %d",          \
                             __FILE__, __LINE__));              \
        while (!(failed) && (counter) < (limit)) {              \
            opal_progress();                                    \
        }                                                       \
    } while (0)
143 
144 
145 /**
146  * Trigger a defined event
147  *
148  * This function will trigger a previously-defined event - as setup
149  * by orte_wait_event - by firing the provided trigger
150  */
151 ORTE_DECLSPEC void orte_trigger_event(orte_trigger_event_t *trig);
152 
153 /**
154  * Setup an event to process a message
155  *
156  * If we are in an OOB recv callback, we frequently cannot process
157  * the received message until after we return from the callback to
158  * avoid a potential loopback situation - i.e., where processing
159  * the message can result in a message being sent somewhere that
160  * subsequently causes the recv we are in to get called again.
161  * This is typically the problem facing the daemons and HNP.
162  *
163  * To resolve this problem, we place the message to be processed on
164  * a list, and create a zero-time event that calls the function
165  * that will process the received message. The event library kindly
166  * does not trigger this event until after we return from the recv
167  * since the recv itself is considered an "event"! Thus, we will
168  * always execute the specified event cb function -after- leaving
169  * the recv.
170  */
171 typedef struct {
172  opal_object_t super;
173  opal_event_t *ev;
174  orte_process_name_t sender;
175  opal_buffer_t *buffer;
176  orte_rml_tag_t tag;
177 #if OPAL_ENABLE_DEBUG
178  char *file;
179  int line;
180 #endif
183 
/*
 * ORTE_MESSAGE_EVENT_DELAY(delay, mev)
 *
 * Arm the timer event held in (mev)->ev so that it fires after
 * (delay) microseconds.  The delay is split into whole seconds and the
 * remaining microseconds, so tv_usec always stays below 1,000,000.
 *
 * Both arguments are fully parenthesized, so compound expressions such
 * as "base + extra" or "&msg" may be passed safely.  (delay) is
 * evaluated twice.  The caller supplies the terminating semicolon.
 */
#define ORTE_MESSAGE_EVENT_DELAY(delay, mev)                            \
    do {                                                                \
        struct timeval now;                                             \
        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                      \
                             "defining message event delay: %s %d",     \
                             __FILE__, __LINE__));                      \
        now.tv_sec = (delay) / 1000000;                                 \
        now.tv_usec = (delay) % 1000000;                                \
        opal_event_evtimer_add((mev)->ev, &now);                        \
    } while (0)
194 
/*
 * Debug-only bookkeeping for ORTE_MESSAGE_EVENT: remember the file and
 * line recorded in the source buffer's object header.  Expands to an
 * empty statement when OPAL_ENABLE_DEBUG is off.
 */
#if OPAL_ENABLE_DEBUG
#define ORTE_MESSAGE_EVENT_RECORD_ORIGIN(msg, buf)                      \
    do {                                                                \
        (msg)->file = strdup((buf)->parent.cls_init_file_name);         \
        (msg)->line = (buf)->parent.cls_init_lineno;                    \
    } while (0)
#else
#define ORTE_MESSAGE_EVENT_RECORD_ORIGIN(msg, buf)                      \
    do { } while (0)
#endif

/*
 * ORTE_MESSAGE_EVENT(sndr, buf, tg, cbfunc)
 *
 * Create an orte_message_event_t, fill it from the arguments and arm
 * its timer with a zero timeout:
 *   - sender  <- jobid/vpid/epoch of the name pointed to by (sndr)
 *   - buffer  <- payload of (buf), via opal_dss.copy_payload
 *   - tag     <- (tg)
 * The new object is passed as the final argument of
 * opal_event_evtimer_set() (presumably delivered to (cbfunc) as its
 * user data -- confirm).  Nothing in this macro releases it.
 *
 * NOTE(review): the return value of opal_dss.copy_payload is ignored,
 * and mev->ev / mev->buffer are assumed to be set up when the object
 * is constructed -- confirm against the class constructor.
 *
 * (sndr) is evaluated three times and (buf) up to three times.  The
 * caller supplies the terminating semicolon.
 */
#define ORTE_MESSAGE_EVENT(sndr, buf, tg, cbfunc)                       \
    do {                                                                \
        orte_message_event_t *mev;                                      \
        struct timeval now;                                             \
        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                      \
                             "defining message event: %s %d",           \
                             __FILE__, __LINE__));                      \
        mev = OBJ_NEW(orte_message_event_t);                            \
        mev->sender.jobid = (sndr)->jobid;                              \
        mev->sender.vpid = (sndr)->vpid;                                \
        ORTE_EPOCH_SET(mev->sender.epoch,(sndr)->epoch);                \
        opal_dss.copy_payload(mev->buffer, (buf));                      \
        mev->tag = (tg);                                                \
        ORTE_MESSAGE_EVENT_RECORD_ORIGIN(mev, (buf));                   \
        opal_event_evtimer_set(opal_event_base,                         \
                               mev->ev, (cbfunc), mev);                 \
        now.tv_sec = 0;                                                 \
        now.tv_usec = 0;                                                \
        opal_event_evtimer_add(mev->ev, &now);                          \
    } while (0)
242 
243 /* Sometimes, we just need to get out of the event library so
244  * we can progress - and we need to pass a little info. For those
245  * cases, we define a zero-time event that passes info to a cbfunc
246  */
247 typedef struct {
248  opal_object_t super;
249  opal_event_t *ev;
250  orte_process_name_t proc;
253 
/*
 * ORTE_NOTIFY_EVENT(cbfunc, data)
 *
 * Create an orte_notify_event_t, copy the jobid/vpid/epoch of the
 * process name pointed to by (data) into its proc field, and arm its
 * timer with a zero timeout.  The new object is passed as the final
 * argument of opal_event_evtimer_set() (presumably delivered to
 * (cbfunc) as its user data -- confirm).  Nothing in this macro
 * releases it.
 *
 * The timer is set up with opal_event_evtimer_set(), the same call
 * every other macro in this header uses.
 *
 * (data) is evaluated three times.  The caller supplies the
 * terminating semicolon.
 */
#define ORTE_NOTIFY_EVENT(cbfunc, data)                                 \
    do {                                                                \
        struct timeval now;                                             \
        orte_notify_event_t *tmp;                                       \
        tmp = OBJ_NEW(orte_notify_event_t);                             \
        tmp->proc.jobid = (data)->jobid;                                \
        tmp->proc.vpid = (data)->vpid;                                  \
        ORTE_EPOCH_SET(tmp->proc.epoch,(data)->epoch);                  \
        opal_event_evtimer_set(opal_event_base,                         \
                               tmp->ev, (cbfunc), tmp);                 \
        now.tv_sec = 0;                                                 \
        now.tv_usec = 0;                                                \
        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                      \
                             "defining notify event at %s:%d",          \
                             __FILE__, __LINE__));                      \
        opal_event_evtimer_add(tmp->ev, &now);                          \
    } while (0)
271 
272 /**
273  * In a number of places within the code, we want to setup a timer
274  * to detect when some procedure failed to complete. For example,
275  * when we launch the daemons, we frequently have no way to directly
276  * detect that a daemon failed to launch. Setting a timer allows us
277  * to automatically fail out of the launch if we don't hear from a
278  * daemon in some specified time window.
279  *
280  * Computing the amount of time to wait takes a few lines of code, but
281  * this macro encapsulates those lines along with the timer event
282  * definition just as a convenience. It also centralizes the
283  * necessary checks to ensure that the microsecond field is always
284  * less than 1M since some systems care about that, and to ensure
285  * that the computed wait time doesn't exceed the desired max
286  * wait
287  */
/*
 * ORTE_DETECT_TIMEOUT(event, n, deltat, maxwait, cbfunc)
 *
 *   event   - opal_event_t **; receives the newly allocated timer event
 *   n       - multiplier applied to deltat
 *   deltat  - per-unit wait, in microseconds
 *   maxwait - upper bound in microseconds; ignored unless > 0
 *   cbfunc  - callback registered on the timer
 *
 * The wait time is (deltat) * (n), capped at (maxwait) when that is
 * positive, and is split so that tv_usec stays below 1,000,000.  The
 * event itself is passed as the final argument of
 * opal_event_evtimer_set() (presumably the callback's user data --
 * confirm).  Nothing in this macro frees the malloc'ed event.
 *
 * If the allocation fails, no timer is defined and *(event) is set to
 * NULL, so callers can detect the failure instead of crashing inside
 * the event library.
 *
 * The malloc cast is kept because this header sits inside
 * BEGIN_C_DECLS and may be compiled as C++.  The product is computed
 * in int, exactly as before.  The caller supplies the terminating
 * semicolon.
 */
#define ORTE_DETECT_TIMEOUT(event, n, deltat, maxwait, cbfunc)          \
    do {                                                                \
        struct timeval now;                                             \
        opal_event_t *tmp;                                              \
        int timeout;                                                    \
        tmp = (opal_event_t *) malloc(sizeof(opal_event_t));            \
        if (NULL != tmp) {                                              \
            opal_event_evtimer_set(opal_event_base,                     \
                                   tmp, (cbfunc), tmp);                 \
            timeout = (deltat) * (n);                                   \
            if ((maxwait) > 0 && timeout > (maxwait)) {                 \
                timeout = (maxwait);                                    \
            }                                                           \
            now.tv_sec = timeout/1000000;                               \
            now.tv_usec = timeout%1000000;                              \
            OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                  \
                                 "defining timeout: %ld sec %ld usec at %s:%d", \
                                 (long)now.tv_sec, (long)now.tv_usec,   \
                                 __FILE__, __LINE__));                  \
            opal_event_evtimer_add(tmp, &now);                          \
        }                                                               \
        *(event) = tmp;                                                 \
    } while (0)
309 
310 
311 /**
312  * There are places in the code where we just want to periodically
313  * wakeup to do something, and then go back to sleep again. Setting
314  * a timer allows us to do this
315  */
/*
 * ORTE_TIMER_EVENT(sec, usec, cbfunc)
 *
 * Allocate a timer event, register (cbfunc) on it and arm it to fire
 * after (sec) seconds plus (usec) microseconds.  The event is passed
 * as the final argument of opal_event_evtimer_set() (presumably the
 * callback's user data -- confirm); the macro keeps no other handle
 * to it and never frees it.
 *
 * If the allocation fails, no timer is defined, rather than handing a
 * NULL event to the event library.
 *
 * The malloc cast is kept because this header sits inside
 * BEGIN_C_DECLS and may be compiled as C++.  The caller supplies the
 * terminating semicolon.
 */
#define ORTE_TIMER_EVENT(sec, usec, cbfunc)                             \
    do {                                                                \
        struct timeval now;                                             \
        opal_event_t *tmp;                                              \
        tmp = (opal_event_t *) malloc(sizeof(opal_event_t));            \
        if (NULL != tmp) {                                              \
            opal_event_evtimer_set(opal_event_base,                     \
                                   tmp, (cbfunc), tmp);                 \
            now.tv_sec = (sec);                                         \
            now.tv_usec = (usec);                                       \
            OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                  \
                                 "defining timer event: %ld sec %ld usec at %s:%d", \
                                 (long)now.tv_sec, (long)now.tv_usec,   \
                                 __FILE__, __LINE__));                  \
            opal_event_evtimer_add(tmp, &now);                          \
        }                                                               \
    } while (0)
331 
332 
333 /**
334  * \internal
335  *
336  * Initialize the wait system (allocate mutexes, etc.)
337  */
338 ORTE_DECLSPEC int orte_wait_init(void);
339 
340 /**
341  * Kill all processes we are waiting on.
342  */
343 ORTE_DECLSPEC int orte_wait_kill(int sig);
344 
345 /**
346  * \internal
347  *
348  * Finalize the wait system (deallocate mutexes, etc.)
349  */
350 ORTE_DECLSPEC int orte_wait_finalize(void);
351 
352 END_C_DECLS
353 
354 #endif /* #ifndef ORTE_WAIT_H */
Progress engine for Open MPI.
OPAL output stream facility.
Definition: types.h:146
Structure to represent a single event.
Definition: event_struct.h:87
ORTE_DECLSPEC int orte_wait_kill(int sig)
Kill all processes we are waiting on.
Definition: orte_wait.c:1172
Volatile lock object (with optional padding).
Definition: atomic.h:102
Setup an event to process a message.
Definition: orte_wait.h:171
ORTE_DECLSPEC pid_t orte_waitpid(pid_t wpid, int *status, int options)
Wait for process terminiation.
Definition: orte_wait.c:1130
void(* orte_wait_fn_t)(pid_t wpid, int status, void *data)
typedef for callback function used in ompi_rte_wait_cb
Definition: orte_wait.h:59
Base object.
Definition: opal_object.h:182
Definition: orte_wait.h:247
uint32_t orte_rml_tag_t
Message matching tag.
Definition: rml_types.h:220
ORTE_DECLSPEC void orte_trigger_event(orte_trigger_event_t *trig)
Trigger a defined event.
Definition: orte_wait.c:1159
ORTE_DECLSPEC int orte_wait_cb(pid_t wpid, orte_wait_fn_t callback, void *data)
Register a callback for process termination.
Definition: orte_wait.c:1136
Structure for holding a buffer to be used with the RML or OOB subsystems.
Definition: dss_types.h:159
Atomic operations.
Definition: orte_wait.h:50
Data packing subsystem.
Contains the typedefs for the use of the rml.
ORTE_DECLSPEC void orte_wait_enable(void)
Disable / re-Enable SIGCHLD handler.
ORTE_DECLSPEC int orte_wait_event(opal_event_t **event, orte_trigger_event_t *trig, char *trigger_name, void(*cbfunc)(int, short, void *))
Setup to wait for an event.
#define OBJ_CLASS_DECLARATION(NAME)
Declaration for class descriptor.
Definition: opal_object.h:236