//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "Debug.h"
#include "DeviceEnvironment.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG
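
// Typical use of the helpers above (illustrative sketch only, mirroring the
// error-handling pattern used throughout this file): call a driver API, then
// report and decode the error on failure.
//
//   CUresult Err = cuCtxSetCurrent(Context);
//   if (Err != CUDA_SUCCESS) {
//     REPORT("Error returned from cuCtxSetCurrent\n");
//     CUDA_ERR_STRING(Err);
//     return OFFLOAD_FAIL;
//   }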
#define BOOL2TEXT(b) ((b) ? "Yes" : "No")

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // Execution mode of the kernel.
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
  CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventRecord(Event, Stream);
  if (Err != CUDA_SUCCESS) {
    DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
       DPxPTR(Stream), DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int syncEvent(void *EventPtr) {
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventSynchronize(Event);
  if (Err != CUDA_SUCCESS) {
    DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure containing per-device data.
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

/// Resource allocator where \p T is the resource type.
/// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
/// accordingly. The implementation should not raise any exception.
template <typename T> class AllocatorTy {
public:
  /// Create a resource and assign it to R.
  int create(T &R) noexcept;
  /// Destroy the resource.
  int destroy(T) noexcept;
};

/// Allocator for CUstream.
template <> class AllocatorTy<CUstream> {
  CUcontext Context;

public:
  AllocatorTy(CUcontext C) noexcept : Context(C) {}

  /// See AllocatorTy<T>::create.
  int create(CUstream &Stream) noexcept {
    if (!checkResult(cuCtxSetCurrent(Context),
                     "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
                     "Error returned from cuStreamCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUstream Stream) noexcept {
    if (!checkResult(cuCtxSetCurrent(Context),
                     "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;
    if (!checkResult(cuStreamDestroy(Stream),
                     "Error returned from cuStreamDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};

/// Allocator for CUevent.
template <> class AllocatorTy<CUevent> {
public:
  /// See AllocatorTy<T>::create.
  int create(CUevent &Event) noexcept {
    if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
                     "Error returned from cuEventCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUevent Event) noexcept {
    if (!checkResult(cuEventDestroy(Event),
                     "Error returned from cuEventDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};

/// A generic pool of resources where \p T is the resource type.
/// \p T should be copyable as the object is stored in \p std::vector .
template <typename T> class ResourcePoolTy {
  /// Index of the next available resource.
  size_t Next = 0;
  /// Mutex to guard the pool.
  std::mutex Mutex;
  /// Pool of resources.
  std::vector<T> Resources;
  /// A reference to the corresponding allocator.
  AllocatorTy<T> Allocator;

  /// If `Resources` is used up, we will fill in more resources. It assumes
  /// that the new size `Size` is always larger than the current size.
  bool resize(size_t Size) {
    auto CurSize = Resources.size();
    assert(Size > CurSize && "Unexpected smaller size");
    Resources.reserve(Size);
    for (auto I = CurSize; I < Size; ++I) {
      T NewItem;
      int Ret = Allocator.create(NewItem);
      if (Ret != OFFLOAD_SUCCESS)
        return false;
      Resources.push_back(NewItem);
    }
    return true;
  }

public:
  ResourcePoolTy(AllocatorTy<T> &&A, size_t Size = 0) noexcept
      : Allocator(std::move(A)) {
    if (Size)
      (void)resize(Size);
  }

  ~ResourcePoolTy() noexcept { clear(); }

  /// Get a resource from the pool. `Next` always points to the next available
  /// resource. That means `[0, Next-1]` have been assigned, and `[Next, ...)`
  /// are still available. If there is no resource left, we will ask for more.
  /// Each time a resource is assigned, `Next` increases by one.
  /// xxxxxs+++++++++
  ///      ^
  ///      Next
  /// After assignment, the pool becomes the following and s is assigned.
  /// xxxxxs+++++++++
  ///       ^
  ///       Next
  int acquire(T &R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    if (Next == Resources.size()) {
      auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
      if (!resize(NewSize))
        return OFFLOAD_FAIL;
    }

    assert(Next < Resources.size());

    R = Resources[Next++];

    return OFFLOAD_SUCCESS;
  }

  /// Return the resource back to the pool. When we return a resource, we need
  /// to first decrease `Next`, and then copy the resource back. It is worth
  /// noting that the order in which resources are returned might differ from
  /// the order in which they were assigned; that is to say, at some point
  /// there might be two identical resources.
  /// xxax+a+++++
  ///     ^
  ///     Next
  /// However, it doesn't matter, because they're always on the two sides of
  /// `Next`. The left one will in the end be overwritten by another resource.
  /// Therefore, after several executions, the order of the pool might differ
  /// from its initial state.
  void release(T R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    Resources[--Next] = R;
  }

  /// Release all stored resources and clear the pool.
  /// Note: This function is not thread safe. Be sure to guard it if necessary.
  void clear() noexcept {
    for (auto &R : Resources)
      (void)Allocator.destroy(R);
    Resources.clear();
  }
};

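// How the pool is used below (illustrative sketch; see getStream() and
// synchronize()): a stream is acquired lazily the first time an AsyncInfo
// needs a queue and is returned to the pool once the queue is synchronized.
//
//   CUstream S;
//   if (StreamPool[DeviceId]->acquire(S) == OFFLOAD_SUCCESS) {
//     // ... enqueue async copies / kernel launches on S ...
//     cuStreamSynchronize(S);
//     StreamPool[DeviceId]->release(S);
//   }
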
class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;
  // Amount of dynamic shared memory to use at launch.
  uint64_t DynamicMemorySize;
  // Number of initial streams for each device.
  int NumInitialStreams = 32;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  using StreamPoolTy = ResourcePoolTy<CUstream>;
  std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;

  ResourcePoolTy<CUevent> EventPool;

  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  /// A class responsible for interacting with the device native runtime
  /// library to allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      void *MemAlloc = nullptr;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

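  // Summary of the allocator above (descriptive note): each TargetAllocTy kind
  // maps onto a different CUDA driver allocation:
  //   TARGET_ALLOC_DEFAULT / TARGET_ALLOC_DEVICE -> cuMemAlloc / cuMemFree
  //   TARGET_ALLOC_HOST                          -> cuMemAllocHost / cuMemFreeHost
  //   TARGET_ALLOC_SHARED                        -> cuMemAllocManaged / cuMemFree
  // Host-pinned pointers are remembered in HostPinnedAllocs so free() can pick
  // the matching deallocation routine.
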
  /// A vector of device allocators.
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-removable, we wrap them into std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether to use the memory manager.
  bool UseMemoryManager = true;

  // Record an entry point associated with the device.
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer.
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table.
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer.
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear the entries table for a device.
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue) {
      CUstream S;
      if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
        return nullptr;

      AsyncInfo->Queue = S;
    }

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }

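  // Note on getStream() above (descriptive note): the CUstream is cached in
  // AsyncInfo->Queue, so all operations issued with the same __tgt_async_info
  // are serialized on one stream; synchronize() later returns that stream to
  // the pool and clears the queue pointer.
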
public:
  // This class should not be copied.
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
        DynamicMemorySize(0), EventPool(AllocatorTy<CUevent>()) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed.
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);
    StreamPool.resize(NumberOfDevices);

    // Get environment variables regarding teams.
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
      // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
      DynamicMemorySize = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
         DynamicMemorySize);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
      // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
      NumInitialStreams = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
    }

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from the environment variable.
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));
  }

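  // Environment variables read by the constructor above (descriptive note):
  //   OMP_TEAM_LIMIT, OMP_TEAMS_THREAD_LIMIT, OMP_NUM_TEAMS,
  //   LIBOMPTARGET_SHARED_MEMORY_SIZE, LIBOMPTARGET_NUM_INITIAL_STREAMS,
  // plus the memory-manager threshold queried via
  // MemoryManagerTy::getSizeThresholdFromEnv(). A possible invocation
  // (values are only illustrative, not defaults):
  //   OMP_NUM_TEAMS=256 LIBOMPTARGET_SHARED_MEMORY_SIZE=4096 ./a.out
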
  ~DeviceRTLTy() {
    // Release the memory managers first in case their dependent data would
    // otherwise be destroyed before them.
    for (auto &M : MemoryManagers)
      M.release();

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (auto &S : StreamPool)
      S.reset();

    EventPool.clear();

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid.
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive.
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per-device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize the stream pool.
    if (!StreamPool[DeviceId])
      StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(
          AllocatorTy<CUstream>(DeviceData[DeviceId].Context),
          NumInitialStreams);

    // Query attributes to determine the number of threads/block and
    // blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;

      if (EnvTeamThreadLimit > 0 &&
          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
      }
      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
           "capping at the hard limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
      }
    }

    // Get and set the warp size.
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assume default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables.
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    size_t StackLimit;
    size_t HeapLimit;
    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
      StackLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
      HeapLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d Bytes, device stack size is %d Bytes per "
         "thread\n",
         (int)HeapLimit, (int)StackLimit);

    // Set the default number of teams.
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set the default number of threads.
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

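  // Limits established by initDevice() above (descriptive note): BlocksPerGrid
  // is the device's MAX_GRID_DIM_X capped at HardTeamLimit (64k) and then at
  // OMP_TEAM_LIMIT; ThreadsPerBlock is MAX_BLOCK_DIM_X capped at
  // OMP_TEAMS_THREAD_LIMIT and then at HardThreadLimit (1024). NumTeams and
  // NumThreads are the launch defaults and never exceed those limits.
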
  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using.
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens; the host should
        // always have something in the address to uniquely identify the
        // target region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name.
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' Failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variable types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variable types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name.
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' Failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // Default value GENERIC (in case the symbol is missing from the cubin
      // file).
      llvm::omp::OMPTgtExecModeFlags ExecModeVal;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // Send device environment data to the device.
    {
      // TODO: The device ID used here is not the real device ID used by OpenMP.
      DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
                                    static_cast<uint32_t>(DeviceId),
                                    static_cast<uint32_t>(DynamicMemorySize)};

      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.DebugKind = std::stoi(EnvStr);

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

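  // Convention relied upon by loadBinary() above (descriptive note): for every
  // kernel entry `foo` the device image may carry a global named
  // `foo_exec_mode` holding an OMPTgtExecModeFlags value; if the symbol is
  // absent the kernel is treated as Generic. Global variables are matched by
  // name and size, and with unified shared memory their device copies are
  // overwritten with the host address.
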
  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }

  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If they are two different devices, we try a peer-to-peer copy first.
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy\n");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

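  // Fallback chain in dataExchange() above (descriptive note):
  //   cuDeviceCanAccessPeer -> cuCtxEnablePeerAccess -> cuMemcpyPeerAsync,
  // and on any failure (or for same-device copies) it degrades to the plain
  // device-to-device memcpyDtoD() path on the source device's stream.
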
  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }

  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
    const bool IsSPMDMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
    const bool IsGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary.
      if (IsGenericMode) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and will create far too few blocks
          // using the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch so many teams that each thread
          // will execute one iteration of the loop. Round up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (IsGenericMode) {
          // If we reach this point, then we have a non-combined construct,
          // i.e. `teams distribute` with a nested `parallel for` and each
          // team is assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the
          // `parallel` loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n",
                 static_cast<int8_t>(KernelInfo->ExecutionMode));
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         DynamicMemorySize, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

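  // Worked example for the block computation above (illustrative numbers
  // only): in SPMD mode with LoopTripCount = 10000 and CudaThreadsPerBlock =
  // 128, CudaBlocksPerGrid = ((10000 - 1) / 128) + 1 = 79, so every thread
  // gets one loop iteration; in Generic mode the same trip count would yield
  // 10000 blocks, one team per `distribute` iteration.
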
  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
    CUresult Err = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to the stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
    AsyncInfo->Queue = nullptr;

    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfo));
      CUDA_ERR_STRING(Err);
    }
    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
  }

  void printDeviceInfo(int32_t device_id) {
    char TmpChar[1000];
    std::string TmpStr;
    size_t TmpSt;
    int TmpInt, TmpInt2, TmpInt3;

    CUdevice Device;
    checkResult(cuDeviceGet(&Device, device_id),
                "Error returned from cuDeviceGet\n");

    cuDriverGetVersion(&TmpInt);
    printf(" CUDA Driver Version: \t\t%d \n", TmpInt);
    printf(" CUDA Device Number: \t\t%d \n", device_id);
    checkResult(cuDeviceGetName(TmpChar, 1000, Device),
                "Error returned from cuDeviceGetName\n");
    printf(" Device Name: \t\t\t%s \n", TmpChar);
    checkResult(cuDeviceTotalMem(&TmpSt, Device),
                "Error returned from cuDeviceTotalMem\n");
    printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Number of Multiprocessors: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Registers per Block: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Maximum Threads per Block: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
      TmpStr = "DEFAULT";
    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
      TmpStr = "PROHIBITED";
    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      TmpStr = "EXCLUSIVE PROCESS";
    else
      TmpStr = "unknown";
    printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str());
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt);
    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                                     Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Max Threads Per SMP: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
  }

  int createEvent(void **P) {
    CUevent Event = nullptr;
    if (EventPool.acquire(Event) != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
    *P = Event;
    return OFFLOAD_SUCCESS;
  }

  int destroyEvent(void *EventPtr) {
    EventPool.release(reinterpret_cast<CUevent>(EventPtr));
    return OFFLOAD_SUCCESS;
  }

  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from a
    // specific CUDA version, and is defined as 0x0. In previous versions, per
    // the CUDA API documentation, that argument has to be 0x0.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(Event));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions
#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

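// Note on the synchronous entry points in this file: each one (data_submit,
// data_retrieve, data_exchange, run_target_*_region) simply creates a local
// __tgt_async_info, forwards to its *_async counterpart, and then calls
// __tgt_rtl_synchronize, which also returns the underlying stream to the pool.
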
int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  DeviceRTL.printDeviceInfo(device_id);
}

int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
  assert(event && "event is nullptr");
  return DeviceRTL.createEvent(event);
}

int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
                               __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  assert(event_ptr && "event_ptr is nullptr");

  return recordEvent(event_ptr, async_info_ptr);
}

int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
                             __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
}

int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return syncEvent(event_ptr);
}

int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.destroyEvent(event_ptr);
}

#ifdef __cplusplus
}
#endif