OpenMPI  0.1.1
btl_vader.h
Go to the documentation of this file.
1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
4  * University Research and Technology
5  * Corporation. All rights reserved.
6  * Copyright (c) 2004-2009 The University of Tennessee and The University
7  * of Tennessee Research Foundation. All rights
8  * reserved.
9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10  * University of Stuttgart. All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  * All rights reserved.
13  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
14  * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
15  * Copyright (c) 2010-2012 Los Alamos National Security, LLC.
16  * All rights reserved.
17  * $COPYRIGHT$
18  *
19  * Additional copyrights may follow
20  *
21  * $HEADER$
22  */
23 /**
24  * @file
25  */
26 #ifndef MCA_BTL_VADER_H
27 #define MCA_BTL_VADER_H
28 
29 #include "ompi_config.h"
30 
31 #include <stddef.h>
32 #include <stdlib.h>
33 #include <string.h>
34 
35 #ifdef HAVE_STDINT_H
36 # include <stdint.h>
37 #endif /* HAVE_STDINT_H */
38 #ifdef HAVE_SCHED_H
39 # include <sched.h>
40 #endif /* HAVE_SCHED_H */
41 #ifdef HAVE_UNISTD_H
42 # include <unistd.h>
43 #endif /* HAVE_UNISTD_H */
44 
45 /* xpmem is required by vader atm */
46 #include <xpmem.h>
47 
48 #include "opal/include/opal/align.h"
49 #include "opal/class/opal_free_list.h"
50 #include "opal/sys/atomic.h"
51 #include "ompi/mca/btl/btl.h"
52 
53 #include "ompi/mca/mpool/mpool.h"
56 #include "ompi/mca/common/sm/common_sm.h"
57 
58 #include "ompi/mca/rcache/rcache.h"
60 
61 
62 BEGIN_C_DECLS
63 
/* Smaller of two values.  This is a function-like macro: whichever argument
   is selected is evaluated twice, so do not pass expressions with side
   effects. */
64 #define min(a,b) ((a) < (b) ? (a) : (b))
65 
/* Threshold that vader_memmove() compares the copy size against when
   choosing between memcpy() and memmove(). */
66 extern int mca_btl_vader_memcpy_limit;
/* Alignment argument handed to down_align_addr()/up_align_addr() when
   vader_get_registation() computes the range to attach. */
67 extern int mca_btl_vader_log_align;
/* NOTE(review): not referenced anywhere in this header; presumably the
   largest message eligible for the inline-send path -- confirm against the
   component source. */
68 extern int mca_btl_vader_max_inline_send;
69 
70 /* We can't use opal_cache_line_size here because we need a
71  compile-time constant for padding the struct. We can't really have
72  a compile-time constant that is portable, either (e.g., compile on
73  one machine and run on another). So just use a big enough cache
74  line that should hopefully be good in most places. */
75 #define VADER_CACHE_LINE_PAD 128
76 
77 /* largest address we can attach to using xpmem; vader_get_registation()
   clamps the upper bound of an attachment to this value */
78 #define VADER_MAX_ADDRESS ((uintptr_t)0x7ffffffff000)
79 
80 /*
81  * Shared Memory resource management
82  */
83 
84 struct vader_fifo_t;
85 
86 /**
87  * Shared Memory (VADER) BTL module.
88  */
90  mca_btl_base_component_2_0_0_t super; /**< base BTL component */
91  int vader_free_list_num; /**< initial size of free lists */
92  int vader_free_list_max; /**< maximum size of free lists */
93  int vader_free_list_inc; /**< number of elements to alloc
94  * when growing free lists */
95  char *vader_mpool_name; /**< name of shared memory pool module */
96  mca_mpool_base_module_t *vader_mpool; /**< mpool on local node */
97  void *vader_mpool_base; /**< base address of shared memory pool */
98  size_t eager_limit; /**< send fragment size */
99  mca_common_sm_module_t *vader_seg; /**< description of shared memory segment */
100  volatile struct vader_fifo_t **shm_fifo;/**< pointer to fifo 2D array in
101  * shared memory */
102  char **shm_bases; /**< pointer to base pointers in
103  * shared memory */
104  xpmem_segid_t my_seg_id; /* this rank's xpmem segment id */
105  xpmem_segid_t *shm_seg_ids; /* xpmem segment ids */
106  struct vader_fifo_t **fifo; /**< cached copy of the pointer to
107  * the 2D fifo array. */
108  struct mca_rcache_base_module_t **xpmem_rcaches;
109  xpmem_apid_t *apids; /* xpmem apids */
110  int32_t num_smp_procs; /**< current number of smp procs on this host */
111  int32_t my_smp_rank; /**< My SMP process rank. Used for accessing
112  * SMP specific data structures. */
113  ompi_free_list_t vader_frags_eager; /**< free list of vader send frags */
114  ompi_free_list_t vader_frags_user; /**< free list of vader put/get frags */
115 
116  opal_list_t active_sends; /**< list of outstanding fragments */
117 
118  unsigned char **vader_fboxes_in; /**< incoming fast boxes (memory belongs to this process) */
119  unsigned char **vader_fboxes_out; /**< outgoing fast boxes (memory belongs to remote peers) */
120 
121  unsigned char *vader_next_fbox_in; /**< indices of fast boxes to poll */
122  unsigned char *vader_next_fbox_out; /**< indices of fast boxes to write */
123 
124  struct mca_btl_base_endpoint_t **vader_peers;
125 };
127 OMPI_MODULE_DECLSPEC extern mca_btl_vader_component_t mca_btl_vader_component;
128 
129 /**
130  * VADER BTL Interface
131  */
133  mca_btl_base_module_t super; /**< base BTL interface */
134  bool btl_inited; /**< flag indicating if btl has been inited */
136 };
137 typedef struct mca_btl_vader_t mca_btl_vader_t;
138 OMPI_MODULE_DECLSPEC extern mca_btl_vader_t mca_btl_vader;
139 
140 /***
141  * One or more FIFO components may be a pointer that must be
142  * accessed by multiple processes. Since the shared region may
143  * be mmapped differently into each process's address space,
144  * these pointers will be relative to some base address. Here,
145  * we define macros to translate between relative addresses and
146  * virtual addresses.
     *
     * Both macros use the base pointer of this process's own entry,
     * mca_btl_vader_component.shm_bases[my_smp_rank], and both evaluate
     * to an intptr_t:
     *   VIRTUAL2RELATIVE(VADDR)  -> VADDR minus the base
     *   RELATIVE2VIRTUAL(OFFSET) -> OFFSET plus the base
     * The result of RELATIVE2VIRTUAL must be cast back to a pointer type
     * by the caller.
147  */
148 #define VIRTUAL2RELATIVE(VADDR ) ((intptr_t)(VADDR) - (intptr_t)mca_btl_vader_component.shm_bases[mca_btl_vader_component.my_smp_rank])
149 #define RELATIVE2VIRTUAL(OFFSET) ((intptr_t)(OFFSET) + (intptr_t)mca_btl_vader_component.shm_bases[mca_btl_vader_component.my_smp_rank])
150 
151 /* look up the remote pointer in the peer rcache and attach if
152  * necessary */
153 static inline mca_mpool_base_registration_t *vader_get_registation (int peer_smp_rank, void *rem_ptr,
154  size_t size, int flags)
155 {
156  struct mca_rcache_base_module_t *rcache = mca_btl_vader_component.xpmem_rcaches[peer_smp_rank];
157  mca_mpool_base_registration_t *regs[10], *reg = NULL;
158  struct xpmem_addr xpmem_addr;
159  uintptr_t base, bound;
160  int rc, i;
161 
162  if (OPAL_UNLIKELY(peer_smp_rank == mca_btl_vader_component.my_smp_rank)) {
163  return rem_ptr;
164  }
165 
166  base = (uintptr_t) down_align_addr(rem_ptr, mca_btl_vader_log_align);
167  bound = (uintptr_t) up_align_addr((void *)((uintptr_t) rem_ptr + size - 1),
168  mca_btl_vader_log_align) + 1;
169  if (OPAL_UNLIKELY(bound > VADER_MAX_ADDRESS)) {
170  bound = VADER_MAX_ADDRESS;
171  }
172 
173  /* several segments may match the base pointer */
174  rc = rcache->rcache_find_all (rcache, (void *) base, bound - base, regs, 10);
175  for (i = 0 ; i < rc ; ++i) {
176  if (bound <= (uintptr_t)regs[i]->bound && base >= (uintptr_t)regs[i]->base) {
177  opal_atomic_add (&regs[i]->ref_count, 1);
178  return regs[i];
179  }
180 
181  if (regs[i]->flags & MCA_MPOOL_FLAGS_PERSIST) {
182  continue;
183  }
184 
185  /* remove this pointer from the rcache and decrement its reference count
186  (so it is detached later) */
187  rc = rcache->rcache_delete (rcache, regs[i]);
188  if (OPAL_UNLIKELY(0 != rc)) {
189  /* someone beat us to it? */
190  break;
191  }
192 
193  /* start the new segment from the lower of the two bases */
194  base = (uintptr_t) regs[i]->base < base ? (uintptr_t) regs[i]->base : base;
195 
196  opal_atomic_add (&regs[i]->ref_count, -1);
197 
198  if (OPAL_LIKELY(0 == regs[i]->ref_count)) {
199  /* this pointer is not in use */
200  (void) xpmem_detach (regs[i]->alloc_base);
201  OBJ_RELEASE(regs[i]);
202  }
203 
204  break;
205  }
206 
207  reg = OBJ_NEW(mca_mpool_base_registration_t);
208  if (OPAL_LIKELY(NULL != reg)) {
209  /* stick around for awhile */
210  reg->ref_count = 2;
211  reg->base = (unsigned char *) base;
212  reg->bound = (unsigned char *) bound;
213  reg->flags = flags;
214 
215  xpmem_addr.apid = mca_btl_vader_component.apids[peer_smp_rank];
216  xpmem_addr.offset = base;
217 
218  reg->alloc_base = xpmem_attach (xpmem_addr, bound - base, NULL);
219  if (OPAL_UNLIKELY((void *)-1 == reg->alloc_base)) {
220  OBJ_RELEASE(reg);
221  reg = NULL;
222  } else {
223  rcache->rcache_insert (rcache, reg, 0);
224  }
225  }
226 
227  return reg;
228 }
229 
230 static inline void vader_return_registration (mca_mpool_base_registration_t *reg, int peer_smp_rank)
231 {
232  struct mca_rcache_base_module_t *rcache = mca_btl_vader_component.xpmem_rcaches[peer_smp_rank];
233 
234  opal_atomic_add (&reg->ref_count, -1);
235  if (OPAL_UNLIKELY(0 == reg->ref_count && !(reg->flags & MCA_MPOOL_FLAGS_PERSIST))) {
236  rcache->rcache_delete (rcache, reg);
237  (void)xpmem_detach (reg->alloc_base);
238  OBJ_RELEASE (reg);
239  }
240 }
241 
242 static inline void *vader_reg_to_ptr (mca_mpool_base_registration_t *reg, void *rem_ptr)
243 {
244  return (void *) ((uintptr_t) reg->alloc_base +
245  (ptrdiff_t)((uintptr_t) rem_ptr - (uintptr_t) reg->base));
246 }
247 
248 /* memcpy is faster at larger sizes but is undefined if the
249  pointers are aliased (TODO -- readd alias check) */
250 static inline void vader_memmove (void *dst, void *src, size_t size)
251 {
252  if (size >= (size_t) mca_btl_vader_memcpy_limit) {
253  memcpy (dst, src, size);
254  } else {
255  memmove (dst, src, size);
256  }
257 }
258 
259 /**
260  * Initiate a send to the peer.
261  *
262  * @param btl (IN) BTL module
263  * @param peer (IN) BTL peer addressing
264  */
266  struct mca_btl_base_endpoint_t *endpoint,
267  struct mca_btl_base_descriptor_t *descriptor,
268  mca_btl_base_tag_t tag);
269 
270 /**
271  * Initiate an inline send to the peer.
272  *
273  * @param btl (IN) BTL module
274  * @param peer (IN) BTL peer addressing
275  */
277  struct mca_btl_base_endpoint_t *endpoint,
278  struct opal_convertor_t *convertor,
279  void *header, size_t header_size,
280  size_t payload_size, uint8_t order,
281  uint32_t flags, mca_btl_base_tag_t tag,
282  mca_btl_base_descriptor_t **descriptor);
283 
284 /**
285  * Initiate a synchronous put.
286  *
287  * @param btl (IN) BTL module
288  * @param endpoint (IN) BTL addressing information
289  * @param descriptor (IN) Description of the data to be transferred
290  */
292  struct mca_btl_base_endpoint_t *endpoint,
293  struct mca_btl_base_descriptor_t *des);
294 
295 /**
296  * Initiate a synchronous get.
297  *
298  * @param btl (IN) BTL module
299  * @param endpoint (IN) BTL addressing information
300  * @param descriptor (IN) Description of the data to be transferred
301  */
303  struct mca_btl_base_endpoint_t *endpoint,
304  struct mca_btl_base_descriptor_t *des);
305 
306 /**
307  * Allocate a segment.
308  *
309  * @param btl (IN) BTL module
310  * @param size (IN) Request segment size.
311  */
313  struct mca_btl_base_endpoint_t* endpoint,
314  uint8_t order, size_t size, uint32_t flags);
315 
316 
317 END_C_DECLS
318 
319 #endif
A descriptor that holds the parameters to a send/put/get operation along w/ a callback routine that i...
Definition: btl.h:275
ompi_free_list_t vader_frags_eager
free list of vader send frags
Definition: btl_vader.h:113
size_t eager_limit
send fragment size
Definition: btl_vader.h:98
mca_mpool_base_module_t * vader_mpool
mpool on local node
Definition: btl_vader.h:96
bool btl_inited
flag indicating if btl has been inited
Definition: btl_vader.h:134
int32_t num_smp_procs
current number of smp procs on this host
Definition: btl_vader.h:110
rcache module descriptor
Definition: rcache.h:83
unsigned char * vader_next_fbox_out
indices of fast boxes to write
Definition: btl_vader.h:122
mca_btl_base_descriptor_t * mca_btl_vader_alloc(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, uint8_t order, size_t size, uint32_t flags)
Allocate a segment.
Definition: btl_vader.c:579
Definition: common_sm.h:60
void(* mca_btl_base_module_error_cb_fn_t)(struct mca_btl_base_module_t *btl, int32_t flags, struct ompi_proc_t *errproc, char *btlinfo)
Callback function that is called asynchronously on receipt of an error from the transport layer...
Definition: btl.h:538
char ** shm_bases
pointer to base pointers in shared memory
Definition: btl_vader.h:102
mca_common_sm_module_t * vader_seg
description of shared memory segment
Definition: btl_vader.h:99
int vader_free_list_max
maximum size of free lists
Definition: btl_vader.h:92
Definition: mpool.h:44
unsigned char * vader_next_fbox_in
indices of fast boxes to poll
Definition: btl_vader.h:121
mca_btl_base_module_t super
base BTL interface
Definition: btl_vader.h:133
#define OBJ_RELEASE(object)
Release an object (by decrementing its reference count).
Definition: opal_object.h:324
opal_list_t active_sends
list of outstanding fragments
Definition: btl_vader.h:116
int mca_btl_vader_put(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, struct mca_btl_base_descriptor_t *des)
Initiate a synchronous put.
Definition: btl_vader_put.c:25
int32_t my_smp_rank
My SMP process rank.
Definition: btl_vader.h:111
Shared Memory (VADER) BTL module.
Definition: btl_vader.h:89
int mca_btl_vader_send(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, struct mca_btl_base_descriptor_t *descriptor, mca_btl_base_tag_t tag)
Initiate a send to the peer.
Definition: btl_vader_send.c:37
Byte Transfer Layer (BTL)
#define opal_atomic_add(ADDR, VALUE)
Atomically increment the content depending on the type.
Definition: atomic.h:597
Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana University Research and Techno...
void * vader_mpool_base
base address of shared memory pool
Definition: btl_vader.h:97
volatile struct vader_fifo_t ** shm_fifo
pointer to fifo 2D array in shared memory
Definition: btl_vader.h:100
Definition: ompi_free_list.h:39
State of ELAN endpoint connection.
Definition: btl_elan_endpoint.h:33
unsigned char ** vader_fboxes_out
outgoing fast boxes (memory belongs to remote peers)
Definition: btl_vader.h:119
BTL component descriptor.
Definition: btl.h:411
Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana University Research and Techno...
Definition: opal_convertor.h:90
int vader_free_list_num
initial size of free lists
Definition: btl_vader.h:91
int mca_btl_vader_get(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, struct mca_btl_base_descriptor_t *des)
Initiate a synchronous get.
Definition: btl_vader_get.c:25
struct vader_fifo_t ** fifo
cached copy of the pointer to the 2D fifo array.
Definition: btl_vader.h:106
Definition: opal_list.h:147
char * vader_mpool_name
name of shared memory pool module
Definition: btl_vader.h:95
mca_btl_base_component_2_0_0_t super
base BTL component
Definition: btl_vader.h:90
Definition: btl_vader_fifo.h:50
int mca_btl_vader_sendi(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor, void *header, size_t header_size, size_t payload_size, uint8_t order, uint32_t flags, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t **descriptor)
Initiate an inline send to the peer.
Definition: btl_vader_sendi.c:38
VADER BTL Interface.
Definition: btl_vader.h:132
ompi_free_list_t vader_frags_user
free list of vader put/get frags
Definition: btl_vader.h:114
Atomic operations.
unsigned char ** vader_fboxes_in
incoming fast boxes (memory belongs to this process)
Definition: btl_vader.h:118
BTL module interface functions and attributes.
Definition: btl.h:786
int vader_free_list_inc
number of elements to alloc when growing free lists
Definition: btl_vader.h:93
mpool module descriptor.
Definition: mpool.h:174