OpenMPI  0.1.1
osc_rdma.h
1 /*
2  * Copyright (c) 2004-2005 The Trustees of Indiana University.
3  * All rights reserved.
4  * Copyright (c) 2004-2006 The Trustees of the University of Tennessee.
5  * All rights reserved.
6  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
7  * University of Stuttgart. All rights reserved.
8  * Copyright (c) 2004-2005 The Regents of the University of California.
9  * All rights reserved.
10  * Copyright (c) 2007 Los Alamos National Security, LLC. All rights
11  * reserved.
12  * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
13  * $COPYRIGHT$
14  *
15  * Additional copyrights may follow
16  *
17  * $HEADER$
18  */
19 
20 #ifndef OMPI_OSC_RDMA_H
21 #define OMPI_OSC_RDMA_H
22 
23 #include "ompi_config.h"
24 #include "opal/class/opal_list.h"
25 #include "opal/class/opal_free_list.h"
27 #include "opal/threads/threads.h"
28 
29 #include "ompi/win/win.h"
30 #include "ompi/communicator/communicator.h"
31 #include "ompi/request/request.h"
32 #include "ompi/mca/osc/osc.h"
33 #include "ompi/mca/btl/btl.h"
34 #include "ompi/mca/bml/bml.h"
35 
36 BEGIN_C_DECLS
37 
39  mca_btl_base_descriptor_t* descriptor;
40  size_t remain_len;
41  mca_bml_base_btl_t *bml_btl;
42 };
44 
46  /** Extend the basic osc component interface */
48 
49  /** lock access to data structures in the component structure */
51 
52  /** List of ompi_osc_rdma_module_ts currently in existence.
53  Needed so that received fragments can be dispatched to the
54  correct module */
56 
57  /** Lock for request management */
59 
60  /** Condition variable for request management */
62 
63  /** free list of ompi_osc_rdma_sendreq_t structures */
65  /** free list of ompi_osc_rdma_replyreq_t structures */
67  /** free list of ompi_osc_rdma_longreq_t structures */
69 
70  bool c_btl_registered;
71 
72  uint32_t c_sequence_number;
73 };
75 
76 
78  uint64_t peer_seg_key;
79  mca_bml_base_btl_t *bml_btl;
80  int rdma_order;
81  int32_t num_sent;
82 };
84 
85 
87  uint64_t peer_base;
88  uint64_t peer_len;
89 
90  int peer_num_btls;
91  volatile int peer_index_btls;
92  ompi_osc_rdma_btl_t *peer_btls;
93 
94  int local_num_btls;
95  mca_bml_base_btl_t **local_btls;
96  mca_mpool_base_registration_t **local_registrations;
97  mca_btl_base_descriptor_t **local_descriptors;
98 };
100 
101 
103  volatile int32_t num_btls_callin;
104  int32_t num_btls_expected;
105  volatile int32_t num_btls_outgoing;
106  opal_list_t *outstanding_btl_requests;
107 };
109 
110 
112  /** Extend the basic osc module interface */
114 
115  uint32_t m_sequence_number;
116 
117  /** lock access to data structures in the current module */
119 
120  /** condition variable for access to current module */
122 
123  /** lock for "atomic" window updates from reductions */
125 
126  /** pointer back to window */
128 
129  /** communicator created with this window */
131 
132  /** list of ompi_osc_rdma_sendreq_t structures, and includes all
133  requests for this access epoch that have not already been
134  started. m_lock must be held when modifying this field. */
136 
137  /** list of unsigned int counters for the number of requests to a
138  particular rank in m_comm for this access epoch. m_lock
139  must be held when modifying this field */
140  unsigned int *m_num_pending_sendreqs;
141 
142  /** For MPI_Fence synchronization, the number of messages to send
143  in epoch. For Start/Complete, the number of updates for this
144  Complete. For lock, the number of
145  messages waiting for completion on the origin side. Not
146  protected by m_lock - must use atomic counter operations. */
148 
149  /** For MPI_Fence synchronization, the number of expected incoming
150  messages. For Post/Wait, the number of expected updates from
151  complete. For lock, the number of messages on the passive side
152  we are waiting for. Not protected by m_lock - must use
153  atomic counter operations. */
155 
156  /** Number of "ping" messages from the remote post group we've
157  received */
159 
160  /** Number of "count" messages from the remote complete group
161  we've received */
163 
164  /** cyclic counter for a unique tag for long messages. Not
165  protected by the m_lock - must use create_send_tag() to
166  create a send tag */
167  volatile int32_t m_tag_counter;
168 
169  opal_list_t m_copy_pending_sendreqs;
170  unsigned int *m_copy_num_pending_sendreqs;
171 
172  opal_list_t m_queued_sendreqs;
173 
174  /** start sending data eagerly */
176  bool m_eager_send_ok;
177 
178  /* RDMA data */
179  bool m_use_rdma;
180  bool m_rdma_wait_completion;
181  ompi_osc_rdma_setup_info_t *m_setup_info;
182  ompi_osc_rdma_peer_info_t *m_peer_info;
183  int32_t m_rdma_num_pending;
184 
185  /*** buffering ***/
186  bool m_use_buffers;
187  ompi_osc_rdma_buffer_t *m_pending_buffers;
188 
189  /* ********************* FENCE data ************************ */
190  /* an array of <sizeof(m_comm)> ints, each containing the value
191  1. */
192  int *m_fence_coll_counts;
193 
194  /* ********************* PWSC data ************************ */
195  struct ompi_group_t *m_pw_group;
196  struct ompi_group_t *m_sc_group;
197  bool *m_sc_remote_active_ranks;
198  int *m_sc_remote_ranks;
199 
200  /* ********************* LOCK data ************************ */
201  int32_t m_lock_status; /* one of 0, MPI_LOCK_EXCLUSIVE, MPI_LOCK_SHARED */
202  int32_t m_shared_count;
203  opal_list_t m_locks_pending;
204  opal_list_t m_unlocks_pending;
205  int32_t m_lock_received_ack;
206 };
/* Global osc/rdma component object of type ompi_osc_rdma_component_t.
 * This header only declares it (extern); the definition is not in this file. */
208 OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component;
209 
210 
/**
 * Fetch the osc/rdma module pointer stored in a window.
 *
 * @param win  expression yielding a pointer to an object that has a
 *             w_osc_module member
 * @return     win->w_osc_module cast to ompi_osc_rdma_module_t*; the cast is
 *             unchecked
 *
 * The argument is parenthesized because `->` binds tighter than unary `&`
 * and binary `+`: without the parentheses, GET_MODULE(&w) or
 * GET_MODULE(wins + i) would apply `->` to the wrong operand.
 */
#define GET_MODULE(win) ((ompi_osc_rdma_module_t*) (win)->w_osc_module)
212 
213 /*
214  * Component functions
215  */
216 
/**
 * Component initialization entry point.
 *
 * @param enable_progress_threads  presumably whether progress threads are
 *                                 in use -- TODO confirm against the caller
 * @param enable_mpi_threads       presumably whether MPI thread support is
 *                                 enabled -- TODO confirm against the caller
 * @return int status; NOTE(review): presumably an OMPI_* return code, the
 *         implementation is not visible in this header
 */
217 int ompi_osc_rdma_component_init(bool enable_progress_threads,
218  bool enable_mpi_threads);
219 
/**
 * Component finalization entry point; takes no arguments.
 *
 * @return int status (presumably an OMPI_* return code -- confirm)
 */
220 int ompi_osc_rdma_component_finalize(void);
221 
/**
 * Component query entry point.
 *
 * Takes the same (win, info, comm) triple as
 * ompi_osc_rdma_component_select().
 *
 * @param win   window being considered
 * @param info  info object supplied with the window
 * @param comm  communicator supplied with the window
 * @return int; NOTE(review): may be a priority rather than a status code --
 *         verify against the osc framework's query contract
 */
222 int ompi_osc_rdma_component_query(struct ompi_win_t *win,
223  struct ompi_info_t *info,
224  struct ompi_communicator_t *comm);
225 
/**
 * Component select entry point.
 *
 * @param win   window to attach to (presumably this is where the window's
 *              module is created -- TODO confirm)
 * @param info  info object supplied with the window
 * @param comm  communicator supplied with the window
 * @return int status (presumably an OMPI_* return code -- confirm)
 */
226 int ompi_osc_rdma_component_select(struct ompi_win_t *win,
227  struct ompi_info_t *info,
228  struct ompi_communicator_t *comm);
229 
230 /* helper function that properly sets up request handling */
/**
 * Receive-side helper.
 *
 * @param buf       receive buffer
 * @param count     element count
 * @param datatype  datatype of the elements
 * @param src       source rank
 * @param tag       message tag
 * @param comm      communicator to receive on
 * @param request   out-parameter for the request handle
 * @param callback  completion callback of type ompi_request_complete_fn_t
 * @param data      opaque pointer; presumably handed back to @p callback
 *                  (TODO confirm)
 * @return int status (presumably an OMPI_* return code -- confirm)
 */
231 int ompi_osc_rdma_component_irecv(void *buf,
232  size_t count,
233  struct ompi_datatype_t *datatype,
234  int src,
235  int tag,
236  struct ompi_communicator_t *comm,
237  struct ompi_request_t **request,
238  ompi_request_complete_fn_t callback,
239  void *data);
240 
/**
 * Send-side counterpart of ompi_osc_rdma_component_irecv(); the parameter
 * list is identical except that the peer rank is named @p dest.
 *
 * @param buf       send buffer
 * @param count     element count
 * @param datatype  datatype of the elements
 * @param dest      destination rank
 * @param tag       message tag
 * @param comm      communicator to send on
 * @param request   out-parameter for the request handle
 * @param callback  completion callback of type ompi_request_complete_fn_t
 * @param data      opaque pointer; presumably handed back to @p callback
 *                  (TODO confirm)
 * @return int status (presumably an OMPI_* return code -- confirm)
 */
241 int ompi_osc_rdma_component_isend(void *buf,
242  size_t count,
243  struct ompi_datatype_t *datatype,
244  int dest,
245  int tag,
246  struct ompi_communicator_t *comm,
247  struct ompi_request_t **request,
248  ompi_request_complete_fn_t callback,
249  void *data);
250 
/**
 * Takes a pointer to an ompi_osc_rdma_peer_info_t.
 *
 * NOTE(review): the name suggests it releases the peer's resources; whether
 * the structure itself is freed as well cannot be told from this header --
 * verify before relying on it.
 *
 * @return int status (presumably an OMPI_* return code -- confirm)
 */
251 int ompi_osc_rdma_peer_info_free(ompi_osc_rdma_peer_info_t *peer_info);
252 
253 /*
254  * Module interface function types
255  */
/*
 * NOTE(review) for every prototype in this group: each returns int,
 * presumably an OMPI_* status code, and each presumably backs the MPI call
 * whose argument list it mirrors.  Neither can be confirmed from this
 * header -- check the implementation and the osc module function table.
 */

/** Takes only the window; by its name, the module teardown entry point. */
256 int ompi_osc_rdma_module_free(struct ompi_win_t *win);
257 
/**
 * Argument list mirrors MPI_Put: origin buffer/count/datatype, target rank,
 * target displacement/count/datatype, window.
 */
258 int ompi_osc_rdma_module_put(void *origin_addr,
259  int origin_count,
260  struct ompi_datatype_t *origin_dt,
261  int target,
262  OPAL_PTRDIFF_TYPE target_disp,
263  int target_count,
264  struct ompi_datatype_t *target_dt,
265  struct ompi_win_t *win);
266 
/**
 * Argument list mirrors MPI_Accumulate: same as the put prototype, plus the
 * reduction operation @p op before the window.
 */
267 int ompi_osc_rdma_module_accumulate(void *origin_addr,
268  int origin_count,
269  struct ompi_datatype_t *origin_dt,
270  int target,
271  OPAL_PTRDIFF_TYPE target_disp,
272  int target_count,
273  struct ompi_datatype_t *target_dt,
274  struct ompi_op_t *op,
275  struct ompi_win_t *win);
276 
/**
 * Argument list mirrors MPI_Get; identical in shape to the put prototype.
 */
277 int ompi_osc_rdma_module_get(void *origin_addr,
278  int origin_count,
279  struct ompi_datatype_t *origin_dt,
280  int target,
281  OPAL_PTRDIFF_TYPE target_disp,
282  int target_count,
283  struct ompi_datatype_t *target_dt,
284  struct ompi_win_t *win);
285 
/** Argument list mirrors MPI_Win_fence (assertion flags, window). */
286 int ompi_osc_rdma_module_fence(int assert, struct ompi_win_t *win);
287 
/** Argument list mirrors MPI_Win_start (group, assertion flags, window). */
288 int ompi_osc_rdma_module_start(struct ompi_group_t *group,
289  int assert,
290  struct ompi_win_t *win);
/** Argument list mirrors MPI_Win_complete (window only). */
291 int ompi_osc_rdma_module_complete(struct ompi_win_t *win);
292 
/** Argument list mirrors MPI_Win_post (group, assertion flags, window). */
293 int ompi_osc_rdma_module_post(struct ompi_group_t *group,
294  int assert,
295  struct ompi_win_t *win);
296 
/** Argument list mirrors MPI_Win_wait (window only). */
297 int ompi_osc_rdma_module_wait(struct ompi_win_t *win);
298 
/**
 * Argument list mirrors MPI_Win_test.
 *
 * @param flag  out-parameter; presumably set non-zero once the exposure
 *              epoch has completed (TODO confirm)
 */
299 int ompi_osc_rdma_module_test(struct ompi_win_t *win,
300  int *flag);
301 
/**
 * Argument list mirrors MPI_Win_lock (lock type, target rank, assertion
 * flags, window).
 */
302 int ompi_osc_rdma_module_lock(int lock_type,
303  int target,
304  int assert,
305  struct ompi_win_t *win);
306 
/** Argument list mirrors MPI_Win_unlock (target rank, window). */
307 int ompi_osc_rdma_module_unlock(int target,
308  struct ompi_win_t *win);
309 
310 /*
311  * passive side sync interface functions
312  */
/*
 * NOTE(review) for the three prototypes below: unlike the module interface
 * functions they take the ompi_osc_rdma_module_t directly rather than a
 * window.  Each returns int, presumably an OMPI_* status code -- confirm
 * against the implementation.
 */

/**
 * @param module     module the request applies to
 * @param origin     rank of the requesting process (assumed to be a rank in
 *                   the module's communicator -- TODO confirm)
 * @param lock_type  requested lock type; presumably MPI_LOCK_EXCLUSIVE or
 *                   MPI_LOCK_SHARED -- TODO confirm
 */
313 int ompi_osc_rdma_passive_lock(ompi_osc_rdma_module_t *module,
314  int32_t origin,
315  int32_t lock_type);
316 
/**
 * @param module  module the request applies to
 * @param origin  rank of the requesting process (same assumption as for
 *                ompi_osc_rdma_passive_lock)
 * @param count   NOTE(review): presumably the number of messages the origin
 *                issued during the lock epoch -- verify against the caller
 */
317 int ompi_osc_rdma_passive_unlock(ompi_osc_rdma_module_t *module,
318  int32_t origin,
319  int32_t count);
320 
/**
 * @param module  module whose pending unlock is to be completed; the exact
 *                completion condition is not visible from this header
 */
321 int ompi_osc_rdma_passive_unlock_complete(ompi_osc_rdma_module_t *module);
322 
323 
324 END_C_DECLS
325 
326 #endif /* OMPI_OSC_RDMA_H */
OSC module instance.
Definition: osc.h:269
opal_free_list_t c_sendreqs
free list of ompi_osc_rdma_sendreq_t structures
Definition: osc_rdma.h:64
Definition: opal_hash_table.h:42
Definition: win.h:53
A descriptor that holds the parameters to a send/put/get operation along w/ a callback routine that i...
Definition: btl.h:275
Definition: osc_rdma.h:111
opal_mutex_t c_request_lock
Lock for request management.
Definition: osc_rdma.h:58
Definition: condition.h:49
opal_condition_t c_request_cond
Condition variable for request management.
Definition: osc_rdma.h:61
Definition: ompi_datatype.h:68
unsigned int * m_num_pending_sendreqs
list of unsigned int counters for the number of requests to a particular rank in m_comm for this acce...
Definition: osc_rdma.h:140
Definition: mutex_unix.h:53
int32_t m_num_post_msgs
Number of "ping" messages from the remote post group we've received.
Definition: osc_rdma.h:158
opal_free_list_t c_longreqs
free list of ompi_osc_rdma_longreq_t structures
Definition: osc_rdma.h:68
Definition: osc_rdma.h:102
The opal_list_t interface is used to provide a generic doubly-linked list container for Open MPI...
Definition: mpool.h:44
ompi_osc_base_component_t super
Extend the basic osc component interface.
Definition: osc_rdma.h:47
opal_list_t m_pending_sendreqs
list of ompi_osc_rdma_sendreq_t structures, and includes all requests for this access epoch that have...
Definition: osc_rdma.h:135
opal_mutex_t m_lock
lock access to data structures in the current module
Definition: osc_rdma.h:118
BML Management Layer (BML)
Definition: info.h:38
opal_free_list_t c_replyreqs
free list of ompi_osc_rdma_replyreq_t structures
Definition: osc_rdma.h:66
opal_condition_t m_cond
condition variable for access to current module
Definition: osc_rdma.h:121
Top-level description of requests.
Definition: osc_rdma.h:45
One-sided Communication interface.
Definition: opal_free_list.h:31
OSC component interface.
Definition: osc.h:154
int32_t m_num_complete_msgs
Number of "count" messages from the remote complete group we've received.
Definition: osc_rdma.h:162
Byte Transfer Layer (BTL)
int32_t m_num_pending_out
For MPI_Fence synchronization, the number of messages to send in epoch.
Definition: osc_rdma.h:147
Definition: osc_rdma.h:86
A hash table that may be indexed with either fixed length (e.g.
Group structure Currently we have four formats for storing the process pointers that are members of t...
Definition: group.h:79
opal_hash_table_t c_modules
List of ompi_osc_rdma_module_ts currently in existence.
Definition: osc_rdma.h:55
opal_mutex_t c_lock
lock access to data structures in the component structure
Definition: osc_rdma.h:50
Definition: bml.h:58
Definition: opal_list.h:147
ompi_osc_base_module_t super
Extend the basic osc module interface.
Definition: osc_rdma.h:113
volatile int32_t m_tag_counter
cyclic counter for a unique tag for long messages.
Definition: osc_rdma.h:167
int32_t m_num_pending_in
For MPI_Fence synchronization, the number of expected incoming messages.
Definition: osc_rdma.h:154
Definition: osc_rdma.h:38
Definition: evdns.c:158
opal_mutex_t m_acc_lock
lock for "atomic" window updates from reductions
Definition: osc_rdma.h:124
bool m_eager_send_active
start sending data eagerly
Definition: osc_rdma.h:175
Definition: osc_rdma.h:77
Definition: communicator.h:118
Main top-level request struct definition.
Definition: request.h:100
ompi_communicator_t * m_comm
communicator created with this window
Definition: osc_rdma.h:130
Back-end type of MPI_Op.
Definition: op.h:100
ompi_win_t * m_win
pointer back to window
Definition: osc_rdma.h:127