1 //===-RTLs/nec-aurora/src/rtl.cpp - Target RTLs Implementation - C++ -*-======//
3 // The LLVM Compiler Infrastructure
5 // This file is dual licensed under the MIT and the University of Illinois Open
6 // Source Licenses. See LICENSE.txt for details.
8 //===----------------------------------------------------------------------===//
10 // RTL for NEC Aurora TSUBASA machines
12 //===----------------------------------------------------------------------===//
22 #include <ve_offload.h>
24 #include <veosinfo/veosinfo.h>
27 #include "omptargetplugin.h"
30 #define TARGET_NAME VE
33 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
36 #define TARGET_ELF_ID 0
39 #include "elf_common.h"
// File-scope handle variable (presumably a VEO library handle, going by its
// name).
// NOTE(review): nothing in the visible code reads or writes this global; the
// VeoLibHandle used inside buildOffloadTableFromHost() is that function's own
// parameter, which shadows it. Confirm whether this variable is still needed.
43 uint64_t VeoLibHandle
;
46 /// Keeps one offload entries table together with the storage backing it.
/// RTLDeviceInfoTy holds a list of these per device.
47 struct FuncOrGblEntryTy
{
// Table returned by getOffloadTable(); buildOffloadTableFromHost() points
// its EntriesBegin/EntriesEnd into Entries below.
48 __tgt_target_table Table
;
// Storage for the entries that Table refers to.
49 std::vector
<__tgt_offload_entry
> Entries
;
52 class RTLDeviceInfoTy
{
53 std::vector
<std::list
<FuncOrGblEntryTy
>> FuncOrGblEntry
;
56 std::vector
<struct veo_proc_handle
*> ProcHandles
;
57 std::vector
<struct veo_thr_ctxt
*> Contexts
;
58 std::vector
<uint64_t> LibraryHandles
;
59 std::list
<DynLibTy
> DynLibs
;
60 // Maps OpenMP device ids to VE node ids
61 std::vector
<int> NodeIds
;
63 void buildOffloadTableFromHost(int32_t device_id
, uint64_t VeoLibHandle
,
64 __tgt_offload_entry
*HostBegin
,
65 __tgt_offload_entry
*HostEnd
) {
66 FuncOrGblEntry
[device_id
].emplace_back();
67 std::vector
<__tgt_offload_entry
> &T
=
68 FuncOrGblEntry
[device_id
].back().Entries
;
70 for (__tgt_offload_entry
*i
= HostBegin
; i
!= HostEnd
; ++i
) {
71 char *SymbolName
= i
->name
;
72 // We do not have enough access to the target memory to conveniently parse
73 // the offload table there, so we need to look up every host entry's symbol.
75 DP("Looking up symbol: %s\n", SymbolName
);
76 uint64_t SymbolTargetAddr
=
77 veo_get_sym(ProcHandles
[device_id
], VeoLibHandle
, SymbolName
);
78 __tgt_offload_entry Entry
;
80 if (!SymbolTargetAddr
) {
81 DP("Symbol %s not found in target image\n", SymbolName
);
82 Entry
= {NULL
, NULL
, 0, 0, 0};
84 DP("Found symbol %s successfully in target image (addr: %p)\n",
85 SymbolName
, reinterpret_cast<void *>(SymbolTargetAddr
));
86 Entry
= {reinterpret_cast<void *>(SymbolTargetAddr
), i
->name
, i
->size
,
93 FuncOrGblEntry
[device_id
].back().Table
.EntriesBegin
= &T
.front();
94 FuncOrGblEntry
[device_id
].back().Table
.EntriesEnd
= &T
.back() + 1;
// Returns the table of the last element in device_id's entry list, i.e. the
// one most recently appended by buildOffloadTableFromHost().
// NOTE(review): calls back() unconditionally; presumably only reached after
// buildOffloadTableFromHost() ran for this device. Confirm against callers.
97 __tgt_target_table
*getOffloadTable(int32_t device_id
) {
98 return &FuncOrGblEntry
[device_id
].back().Table
;
103 struct ve_nodeinfo node_info
;
104 ve_node_info(&node_info
);
106 // Build a predictable mapping between VE node ids and OpenMP device ids.
107 // This is necessary, because nodes can be missing or offline and (active)
108 // node ids are thus not consecutive. The entries in ve_nodeinfo may also
109 // not be in the order of their node ids.
110 for (int i
= 0; i
< node_info
.total_node_count
; ++i
) {
111 if (node_info
.status
[i
] == 0) {
112 NodeIds
.push_back(node_info
.nodeid
[i
]);
116 // Because the entries in ve_nodeinfo may not be in the order of their node
117 // ids, we sort NodeIds to get a predictable mapping.
118 std::sort(NodeIds
.begin(), NodeIds
.end());
120 int NumDevices
= NodeIds
.size();
121 DP("Found %i VE devices\n", NumDevices
);
122 ProcHandles
.resize(NumDevices
, NULL
);
123 Contexts
.resize(NumDevices
, NULL
);
124 FuncOrGblEntry
.resize(NumDevices
);
125 LibraryHandles
.resize(NumDevices
);
129 for (auto &ctx
: Contexts
) {
131 if (veo_context_close(ctx
) != 0) {
132 DP("Failed to close VEO context.\n");
137 for (auto &hdl
: ProcHandles
) {
139 veo_proc_destroy(hdl
);
143 for (auto &lib
: DynLibs
) {
145 remove(lib
.FileName
);
// The single, file-local RTLDeviceInfoTy instance; the functions in this file
// access the per-device handles, contexts and node ids through it.
151 static RTLDeviceInfoTy DeviceInfo
;
153 static int target_run_function_wait(uint32_t DeviceID
, uint64_t FuncAddr
,
154 struct veo_args
*args
, uint64_t *RetVal
) {
155 DP("Running function with entry point %p\n",
156 reinterpret_cast<void *>(FuncAddr
));
157 uint64_t RequestHandle
=
158 veo_call_async(DeviceInfo
.Contexts
[DeviceID
], FuncAddr
, args
);
159 if (RequestHandle
== VEO_REQUEST_ID_INVALID
) {
160 DP("Execution of entry point %p failed\n",
161 reinterpret_cast<void *>(FuncAddr
));
165 DP("Function at address %p called (VEO request ID: %" PRIu64
")\n",
166 reinterpret_cast<void *>(FuncAddr
), RequestHandle
);
168 int ret
= veo_call_wait_result(DeviceInfo
.Contexts
[DeviceID
], RequestHandle
,
171 DP("Waiting for entry point %p failed (Error code %d)\n",
172 reinterpret_cast<void *>(FuncAddr
), ret
);
175 return OFFLOAD_SUCCESS
;
178 // Return the number of available devices of the type supported by the
// target RTL. Here this is the number of entries in DeviceInfo.NodeIds.
180 int32_t __tgt_rtl_number_of_devices(void) { return DeviceInfo
.NodeIds
.size(); }
182 // Return an integer different from zero if the provided device image can be
183 // supported by the runtime. The functionality is similar to comparing the
184 // result of __tgt_rtl_load_binary to NULL. However, this is meant to be a
185 // lightweight query to determine if the RTL is suitable for an image without
186 // having to load the library, which can be expensive.
187 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image
*Image
) {
188 #if TARGET_ELF_ID < 1
191 return elf_check_machine(Image
, TARGET_ELF_ID
);
195 // Initialize the specified device. In case of success return 0; otherwise
196 // return an error code.
// The visible body only logs the VEO API version and returns
// OFFLOAD_SUCCESS; ID is not used here.
197 int32_t __tgt_rtl_init_device(int32_t ID
) {
198 DP("Available VEO version: %i\n", veo_api_version());
200 // At the moment we do not really initialize (i.e. create a process or
201 // context on) the device here, but in "__tgt_rtl_load_binary".
202 // The reason is that, when we create a process for a statically linked
203 // binary, the VEO API needs us to supply the binary up front (whereas we
204 // can load a dynamically linked binary later, after creating the process).
205 // At this stage we cannot tell whether the binary is dynamically or
206 // statically linked, so we defer process creation until we know.
207 return OFFLOAD_SUCCESS
;
210 // Pass an executable image section described by image to the specified
211 // device and prepare an address table of target entities. In case of error,
212 // return NULL. Otherwise, return a pointer to the built address table.
213 // Individual entries in the table may also be NULL, when the corresponding
214 // offload region is not supported on the target device.
215 __tgt_target_table
*__tgt_rtl_load_binary(int32_t ID
,
216 __tgt_device_image
*Image
) {
217 DP("Dev %d: load binary from " DPxMOD
" image\n", ID
,
218 DPxPTR(Image
->ImageStart
));
220 assert(ID
>= 0 && "bad dev id");
222 size_t ImageSize
= (size_t)Image
->ImageEnd
- (size_t)Image
->ImageStart
;
223 size_t NumEntries
= (size_t)(Image
->EntriesEnd
- Image
->EntriesBegin
);
224 DP("Expecting to have %zd entries defined.\n", NumEntries
);
226 // Load the target image and get the entry points. The image is dumped to a
227 // temporary file first, because the VEO calls below that consume it
228 // (veo_load_library(), veo_proc_create_static()) are given a file name.
230 // 1) Create tmp file with the image contents.
231 // 2) Load the file through VEO; symbols are then looked up with veo_get_sym().
232 char tmp_name
[] = "/tmp/tmpfile_XXXXXX";
233 int tmp_fd
= mkstemp(tmp_name
);
239 FILE *ftmp
= fdopen(tmp_fd
, "wb");
242 DP("fdopen() for %s failed. Could not write target image\n", tmp_name
);
246 fwrite(Image
->ImageStart
, ImageSize
, 1, ftmp
);
248 // at least for the static case we need to change the permissions
249 chmod(tmp_name
, 0700);
251 DP("Wrote target image to %s. ImageSize=%zu\n", tmp_name
, ImageSize
);
255 // See comment in "__tgt_rtl_init_device"
257 if (DeviceInfo
.ProcHandles
[ID
] == NULL
) {
258 struct veo_proc_handle
*proc_handle
;
259 is_dyn
= elf_is_dynamic(Image
);
260 // If we have a dynamically linked image, we create the process handle, then
261 // the thread, and then load the image.
262 // If we have a statically linked image, we need to create the process
263 // handle and load the image at the same time with veo_proc_create_static().
265 proc_handle
= veo_proc_create(DeviceInfo
.NodeIds
[ID
]);
267 DP("veo_proc_create() failed for device %d\n", ID
);
271 proc_handle
= veo_proc_create_static(DeviceInfo
.NodeIds
[ID
], tmp_name
);
273 DP("veo_proc_create_static() failed for device %d, image=%s\n", ID
,
278 DeviceInfo
.ProcHandles
[ID
] = proc_handle
;
281 if (DeviceInfo
.Contexts
[ID
] == NULL
) {
282 struct veo_thr_ctxt
*ctx
= veo_context_open(DeviceInfo
.ProcHandles
[ID
]);
285 DP("veo_context_open() failed: %s\n", std::strerror(errno
));
289 DeviceInfo
.Contexts
[ID
] = ctx
;
292 DP("Aurora device successfully initialized with loaded binary: "
293 "proc_handle=%p, ctx=%p\n",
294 DeviceInfo
.ProcHandles
[ID
], DeviceInfo
.Contexts
[ID
]);
296 uint64_t LibHandle
= 0UL;
298 LibHandle
= veo_load_library(DeviceInfo
.ProcHandles
[ID
], tmp_name
);
301 DP("veo_load_library() failed: LibHandle=%" PRIu64
302 " Name=%s. Set env VEORUN_BIN for static linked target code.\n",
303 LibHandle
, tmp_name
);
307 DP("Successfully loaded library dynamically\n");
309 DP("Symbol table is expected to have been created by "
310 "veo_create_proc_static()\n");
313 DynLibTy Lib
= {tmp_name
, LibHandle
};
314 DeviceInfo
.DynLibs
.push_back(Lib
);
315 DeviceInfo
.LibraryHandles
[ID
] = LibHandle
;
317 DeviceInfo
.buildOffloadTableFromHost(ID
, LibHandle
, Image
->EntriesBegin
,
320 return DeviceInfo
.getOffloadTable(ID
);
323 // Allocate data on the particular target device, of the specified size.
324 // HostPtr is an address of the host data the allocated target data
325 // will be associated with (HostPtr may be NULL if it is not known at
326 // allocation time, like for example it would be for target data that
327 // is allocated by omp_target_alloc() API). Return address of the
328 // allocated data on the target that will be used by libomptarget.so to
329 // initialize the target data mapping structures. These addresses are
330 // used to generate a table of target variables to pass to
331 // __tgt_rtl_run_target_region(). __tgt_rtl_data_alloc() returns NULL in
332 // case an error occurred on the target device.
333 void *__tgt_rtl_data_alloc(int32_t ID
, int64_t Size
, void *HostPtr
,
338 if (kind
!= TARGET_ALLOC_DEFAULT
) {
339 REPORT("Invalid target data allocation kind or requested allocator not "
340 "implemented yet\n");
344 if (DeviceInfo
.ProcHandles
[ID
] == NULL
) {
345 struct veo_proc_handle
*proc_handle
;
346 proc_handle
= veo_proc_create(DeviceInfo
.NodeIds
[ID
]);
348 DP("veo_proc_create() failed for device %d\n", ID
);
351 DeviceInfo
.ProcHandles
[ID
] = proc_handle
;
352 DP("Aurora device successfully initialized: proc_handle=%p", proc_handle
);
355 ret
= veo_alloc_mem(DeviceInfo
.ProcHandles
[ID
], &addr
, Size
);
356 DP("Allocate target memory: device=%d, target addr=%p, size=%" PRIu64
"\n",
357 ID
, reinterpret_cast<void *>(addr
), Size
);
359 DP("veo_alloc_mem(%d, %p, %" PRIu64
") failed with error code %d\n", ID
,
360 reinterpret_cast<void *>(addr
), Size
, ret
);
364 return reinterpret_cast<void *>(addr
);
367 // Pass the data content to the target device using the target address.
368 // In case of success, return zero. Otherwise, return an error code.
369 int32_t __tgt_rtl_data_submit(int32_t ID
, void *TargetPtr
, void *HostPtr
,
371 int ret
= veo_write_mem(DeviceInfo
.ProcHandles
[ID
], (uint64_t)TargetPtr
,
372 HostPtr
, (size_t)Size
);
374 DP("veo_write_mem() failed with error code %d\n", ret
);
377 return OFFLOAD_SUCCESS
;
380 // Retrieve the data content from the target device using its address.
381 // In case of success, return zero. Otherwise, return an error code.
382 int32_t __tgt_rtl_data_retrieve(int32_t ID
, void *HostPtr
, void *TargetPtr
,
384 int ret
= veo_read_mem(DeviceInfo
.ProcHandles
[ID
], HostPtr
,
385 (uint64_t)TargetPtr
, Size
);
387 DP("veo_read_mem() failed with error code %d\n", ret
);
390 return OFFLOAD_SUCCESS
;
393 // De-allocate the data referenced by target ptr on the device. In case of
394 // success, return zero. Otherwise, return an error code.
395 int32_t __tgt_rtl_data_delete(int32_t ID
, void *TargetPtr
) {
396 int ret
= veo_free_mem(DeviceInfo
.ProcHandles
[ID
], (uint64_t)TargetPtr
);
399 DP("veo_free_mem() failed with error code %d\n", ret
);
402 return OFFLOAD_SUCCESS
;
405 // Similar to __tgt_rtl_run_target_region, but additionally specify the
406 // number of teams to be created and a number of threads in each team.
407 int32_t __tgt_rtl_run_target_team_region(int32_t ID
, void *Entry
, void **Args
,
408 ptrdiff_t *Offsets
, int32_t NumArgs
,
409 int32_t NumTeams
, int32_t ThreadLimit
,
410 uint64_t loop_tripcount
) {
413 // ignore team num and thread limit.
414 std::vector
<void *> ptrs(NumArgs
);
416 struct veo_args
*TargetArgs
;
417 TargetArgs
= veo_args_alloc();
419 if (TargetArgs
== NULL
) {
420 DP("Could not allocate VEO args\n");
424 for (int i
= 0; i
< NumArgs
; ++i
) {
425 ret
= veo_args_set_u64(TargetArgs
, i
, (intptr_t)Args
[i
]);
428 DP("veo_args_set_u64() has returned %d for argnum=%d and value %p\n", ret
,
435 if (target_run_function_wait(ID
, reinterpret_cast<uint64_t>(Entry
),
436 TargetArgs
, &RetVal
) != OFFLOAD_SUCCESS
) {
437 veo_args_free(TargetArgs
);
440 veo_args_free(TargetArgs
);
441 return OFFLOAD_SUCCESS
;
444 // Transfer control to the offloaded entry Entry on the target device.
445 // Args and Offsets are arrays of NumArgs size of target addresses and
446 // offsets. An offset should be added to the target address before passing it
447 // to the outlined function on device side. In case of success, return zero.
448 // Otherwise, return an error code.
449 int32_t __tgt_rtl_run_target_region(int32_t ID
, void *Entry
, void **Args
,
450 ptrdiff_t *Offsets
, int32_t NumArgs
) {
451 return __tgt_rtl_run_target_team_region(ID
, Entry
, Args
, Offsets
, NumArgs
, 1,
// Always returns 1, i.e. this plugin reports that it supports empty images.
455 int32_t __tgt_rtl_supports_empty_images() { return 1; }
457 // VE plugin's internal InfoLevel.
// Declared as an atomic 32-bit value; no initializer is given at this point.
458 std::atomic
<uint32_t> InfoLevel
;