//===--- RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//
#include <unordered_map>

#include "DeviceEnvironment.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"
// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG
#define BOOL2TEXT(b) ((b) ? "Yes" : "No")

#include "elf_common.h"
/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};
/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}
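
// Usage sketch (illustrative only; it mirrors the call sites throughout this
// file): wrap a CUDA driver call in checkResult() so a failure is reported
// once and the caller can bail out, e.g.
//   CUresult Err = cuCtxSetCurrent(Context);
//   if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
//     return OFFLOAD_FAIL;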
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
  CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventRecord(Event, Stream);
  if (Err != CUDA_SUCCESS) {
    DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
       DPxPTR(Stream), DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
int syncEvent(void *EventPtr) {
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventSynchronize(Event);
  if (Err != CUDA_SUCCESS) {
    DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
// Structure contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  int NumTeams = 0;
  int NumThreads = 0;
};
/// Resource allocator where \p T is the resource type.
/// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
/// accordingly. The implementation should not raise any exception.
template <typename T> class AllocatorTy {
public:
  /// Create a resource and assign to R.
  int create(T &R) noexcept;
  /// Destroy the resource.
  int destroy(T) noexcept;
};
/// Allocator for CUstream.
template <> class AllocatorTy<CUstream> {
  CUcontext Context;

public:
  AllocatorTy(CUcontext C) noexcept : Context(C) {}

  /// See AllocatorTy<T>::create.
  int create(CUstream &Stream) noexcept {
    if (!checkResult(cuCtxSetCurrent(Context),
                     "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
                     "Error returned from cuStreamCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUstream Stream) noexcept {
    if (!checkResult(cuCtxSetCurrent(Context),
                     "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;
    if (!checkResult(cuStreamDestroy(Stream),
                     "Error returned from cuStreamDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};
/// Allocator for CUevent.
template <> class AllocatorTy<CUevent> {
public:
  /// See AllocatorTy<T>::create.
  int create(CUevent &Event) noexcept {
    if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
                     "Error returned from cuEventCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUevent Event) noexcept {
    if (!checkResult(cuEventDestroy(Event),
                     "Error returned from cuEventDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};
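
// Note: the AllocatorTy specializations above are consumed by ResourcePoolTy
// below; the pool only calls create()/destroy() and never touches the CUDA
// driver API directly.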
/// A generic pool of resources where \p T is the resource type.
/// \p T should be copyable as the object is stored in \p std::vector .
template <typename T> class ResourcePoolTy {
  /// Index of the next available resource.
  size_t Next = 0;
  /// Mutex to guard the pool.
  std::mutex Mutex;
  /// Pool of resources.
  std::vector<T> Resources;
  /// A reference to the corresponding allocator.
  AllocatorTy<T> Allocator;
  /// If `Resources` is used up, we will fill in more resources. It assumes that
  /// the new size `Size` should always be larger than the current size.
  bool resize(size_t Size) {
    auto CurSize = Resources.size();
    assert(Size > CurSize && "Unexpected smaller size");
    Resources.reserve(Size);
    for (auto I = CurSize; I < Size; ++I) {
      T NewItem;
      int Ret = Allocator.create(NewItem);
      if (Ret != OFFLOAD_SUCCESS)
        return false;
      Resources.push_back(NewItem);
    }
    return true;
  }
public:
  ResourcePoolTy(AllocatorTy<T> &&A, size_t Size = 0) noexcept
      : Allocator(std::move(A)) {
    if (Size)
      (void)resize(Size);
  }

  ~ResourcePoolTy() noexcept { clear(); }
  /// Get a resource from the pool. `Next` always points to the next available
  /// resource, i.e. `[0, Next-1]` have been assigned and the rest are still
  /// available. If there is no resource left, we ask the allocator for more.
  /// Each time a resource is assigned, `Next` increases by one.
  int acquire(T &R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    if (Next == Resources.size()) {
      auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
      if (!resize(NewSize))
        return OFFLOAD_FAIL;
    }

    assert(Next < Resources.size());

    R = Resources[Next++];

    return OFFLOAD_SUCCESS;
  }
  /// Return the resource back to the pool. When we return a resource, we need
  /// to first decrease `Next`, and then copy the resource back. It is worth
  /// noting that the order in which resources are returned might differ from
  /// the order in which they were assigned, so at some point there might be
  /// two identical resources on the two sides of `Next`. That doesn't matter:
  /// the left one will eventually be overwritten by another returned resource.
  /// Therefore, after several executions, the order of the pool might differ
  /// from its initial state.
  void release(T R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    Resources[--Next] = R;
  }
  /// Release all stored resources and clear the pool.
  /// Note: This function is not thread safe. Be sure to guard it if necessary.
  void clear() noexcept {
    for (auto &R : Resources)
      (void)Allocator.destroy(R);
    Resources.clear();
  }
};
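
// Illustrative use of the pool (a sketch, not code from this file): the pool
// starts empty, doubles its backing vector when exhausted, and recycles
// released resources in LIFO order.
//   ResourcePoolTy<CUevent> Pool{AllocatorTy<CUevent>()};
//   CUevent E;
//   if (Pool.acquire(E) == OFFLOAD_SUCCESS) {
//     /* ... use E ... */
//     Pool.release(E);
//   }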
class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;
  // Amount of dynamic shared memory to use at launch.
  uint64_t DynamicMemorySize;
  // Number of initial streams for each device.
  int NumInitialStreams = 32;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  using StreamPoolTy = ResourcePoolTy<CUstream>;
  std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;

  ResourcePoolTy<CUevent> EventPool;

  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;
  /// A class responsible for interacting with device native runtime library to
  /// allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}
    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      void *MemAlloc = nullptr;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }
    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };
  /// A vector of device allocators.
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-removable, we wrap them into std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether to use the memory manager.
  bool UseMemoryManager = true;
  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }
  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }
  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }
  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue) {
      CUstream S;
      if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
        return nullptr;

      AsyncInfo->Queue = S;
    }

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }
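
  // Stream lifecycle, as implemented in this file: getStream() lazily acquires
  // a stream from StreamPool the first time an AsyncInfo is used and caches it
  // in AsyncInfo->Queue; synchronize() below releases the stream back to the
  // pool and resets the queue once it has been synchronized.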
public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
        DynamicMemorySize(0), EventPool(AllocatorTy<CUevent>()) {
494 DP("Start initializing CUDA\n");
496 CUresult Err
= cuInit(0);
497 if (Err
== CUDA_ERROR_INVALID_HANDLE
) {
498 // Can't call cuGetErrorString if dlsym failed
499 DP("Failed to load CUDA shared library\n");
502 if (!checkResult(Err
, "Error returned from cuInit\n")) {
506 Err
= cuDeviceGetCount(&NumberOfDevices
);
507 if (!checkResult(Err
, "Error returned from cuDeviceGetCount\n"))
510 if (NumberOfDevices
== 0) {
511 DP("There are no devices supporting CUDA.\n");
515 DeviceData
.resize(NumberOfDevices
);
516 StreamPool
.resize(NumberOfDevices
);
    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
      // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
      DynamicMemorySize = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
         DynamicMemorySize);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
      // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
      NumInitialStreams = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
    }
    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));
  }
  ~DeviceRTLTy() {
    // We first destruct memory managers in case that their dependent data are
    // destroyed before them.
    for (auto &M : MemoryManagers)
      M.release();

    for (CUmodule &M : Modules)
      // Close module
      checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (auto &S : StreamPool)
      S.reset();

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }
  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;
    // Query the current flags of the primary context and set its flags if
    // it is not yet active.
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }
    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize stream pool
    if (!StreamPool[DeviceId])
      StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(
          AllocatorTy<CUstream>(DeviceData[DeviceId].Context),
          NumInitialStreams);
    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }
    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
    }

    if (EnvTeamThreadLimit > 0 &&
        DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
      DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
         "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
         DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
      DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
    }
    if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
         "capping at the hard limit\n",
         DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
    }
    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assume default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }
    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }
    size_t StackLimit;
    size_t HeapLimit;
    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
      StackLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
      HeapLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d threads\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d Bytes, device stack size is %d Bytes per "
         "thread\n",
         (int)HeapLimit, (int)StackLimit);
    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }
    // Set default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }
  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);
    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);
    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens, the host should
        // have always something in the address to uniquely identify the target
        // region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' Failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variable types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variable types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }
= cuModuleGetFunction(&Func
, Module
, E
->name
);
853 // We keep this style here because we need the name
854 if (Err
!= CUDA_SUCCESS
) {
855 REPORT("Loading '%s' Failed\n", E
->name
);
856 CUDA_ERR_STRING(Err
);
860 DP("Entry point " DPxMOD
" maps to %s (" DPxMOD
")\n",
861 DPxPTR(E
- HostBegin
), E
->name
, DPxPTR(Func
));
      // default value GENERIC (in case symbol is missing from cubin file)
      llvm::omp::OMPTgtExecModeFlags ExecModeVal;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }
    // send device environment data to the device
    {
      // TODO: The device ID used here is not the real device ID used by OpenMP.
      DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
                                    static_cast<uint32_t>(DeviceId),
                                    static_cast<uint32_t>(DynamicMemorySize)};

      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.DebugKind = std::stoi(EnvStr);

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(int32_t));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }
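
  // Summary of the entry resolution in loadBinary() above: entries with a
  // non-zero size are global variables resolved with cuModuleGetGlobal();
  // entries with size zero are kernels resolved with cuModuleGetFunction(),
  // whose execution mode is read from the companion "<name>_exec_mode" global
  // when that symbol is present.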
  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }
  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }
  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If they are two devices, we try peer to peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }
  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }
  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
    const bool IsSPMDMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
    const bool IsGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
;
1095 if (ThreadLimit
> 0) {
1096 DP("Setting CUDA threads per block to requested %d\n", ThreadLimit
);
1097 CudaThreadsPerBlock
= ThreadLimit
;
1098 // Add master warp if necessary
1099 if (IsGenericMode
) {
1100 DP("Adding master warp: +%d threads\n", DeviceData
[DeviceId
].WarpSize
);
1101 CudaThreadsPerBlock
+= DeviceData
[DeviceId
].WarpSize
;
1104 DP("Setting CUDA threads per block to default %d\n",
1105 DeviceData
[DeviceId
].NumThreads
);
1106 CudaThreadsPerBlock
= DeviceData
[DeviceId
].NumThreads
;
1109 if (CudaThreadsPerBlock
> DeviceData
[DeviceId
].ThreadsPerBlock
) {
1110 DP("Threads per block capped at device limit %d\n",
1111 DeviceData
[DeviceId
].ThreadsPerBlock
);
1112 CudaThreadsPerBlock
= DeviceData
[DeviceId
].ThreadsPerBlock
;
    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }
;
1131 if (LoopTripCount
> 0 && EnvNumTeams
< 0) {
1132 if (IsSPMDGenericMode
) {
1133 // If we reach this point, then we are executing a kernel that was
1134 // transformed from Generic-mode to SPMD-mode. This kernel has
1135 // SPMD-mode execution, but needs its blocks to be scheduled
1136 // differently because the current loop trip count only applies to the
1137 // `teams distribute` region and will create var too few blocks using
1138 // the regular SPMD-mode method.
1139 CudaBlocksPerGrid
= LoopTripCount
;
1140 } else if (IsSPMDMode
) {
1141 // We have a combined construct, i.e. `target teams distribute
1142 // parallel for [simd]`. We launch so many teams so that each thread
1143 // will execute one iteration of the loop. round up to the nearest
1145 CudaBlocksPerGrid
= ((LoopTripCount
- 1) / CudaThreadsPerBlock
) + 1;
1146 } else if (IsGenericMode
) {
1147 // If we reach this point, then we have a non-combined construct, i.e.
1148 // `teams distribute` with a nested `parallel for` and each team is
1149 // assigned one iteration of the `distribute` loop. E.g.:
1151 // #pragma omp target teams distribute
1152 // for(...loop_tripcount...) {
1153 // #pragma omp parallel for
1157 // Threads within a team will execute the iterations of the `parallel`
1159 CudaBlocksPerGrid
= LoopTripCount
;
1161 REPORT("Unknown execution mode: %d\n",
1162 static_cast<int8_t>(KernelInfo
->ExecutionMode
));
1163 return OFFLOAD_FAIL
;
1165 DP("Using %d teams due to loop trip count %" PRIu32
1166 " and number of threads per block %d\n",
1167 CudaBlocksPerGrid
, LoopTripCount
, CudaThreadsPerBlock
);
1169 DP("Using default number of teams %d\n", DeviceData
[DeviceId
].NumTeams
);
1170 CudaBlocksPerGrid
= DeviceData
[DeviceId
].NumTeams
;
1172 } else if (TeamNum
> DeviceData
[DeviceId
].BlocksPerGrid
) {
1173 DP("Capping number of teams to team limit %d\n",
1174 DeviceData
[DeviceId
].BlocksPerGrid
);
1175 CudaBlocksPerGrid
= DeviceData
[DeviceId
].BlocksPerGrid
;
1177 DP("Using requested number of teams %d\n", TeamNum
);
1178 CudaBlocksPerGrid
= TeamNum
;
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         DynamicMemorySize, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }
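
  // Worked example for the SPMD block computation above: with a loop trip
  // count of 1000 and 128 threads per block, CudaBlocksPerGrid =
  // ((1000 - 1) / 128) + 1 = 8 blocks, i.e. 1024 threads, enough for one
  // iteration per thread.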
  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
    CUresult Err = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to the stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
    AsyncInfo->Queue = nullptr;

    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfo));
      CUDA_ERR_STRING(Err);
    }
    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
  }
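
  // Because the stream is returned to the pool here, a later operation on the
  // same AsyncInfo goes through getStream() again and may receive a different
  // stream.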
  void printDeviceInfo(int32_t device_id) {
    char TmpChar[1000];
    std::string TmpStr;
    size_t TmpSt;
    int TmpInt, TmpInt2, TmpInt3;

    CUdevice Device;
    checkResult(cuDeviceGet(&Device, device_id),
                "Error returned from cuCtxGetDevice\n");

    cuDriverGetVersion(&TmpInt);
    printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
    printf("    CUDA Device Number: \t\t%d \n", device_id);
    checkResult(cuDeviceGetName(TmpChar, 1000, Device),
                "Error returned from cuDeviceGetName\n");
    printf("    Device Name: \t\t\t%s \n", TmpChar);
    checkResult(cuDeviceTotalMem(&TmpSt, Device),
                "Error returned from cuDeviceTotalMem\n");
    printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Registers per Block: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
      TmpStr = "DEFAULT";
    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
      TmpStr = "PROHIBITED";
    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      TmpStr = "EXCLUSIVE PROCESS";
    else
      TmpStr = "unknown";
    printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                                     Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
  }
  int createEvent(void **P) {
    CUevent Event = nullptr;
    if (EventPool.acquire(Event) != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;

    *P = Event;

    return OFFLOAD_SUCCESS;
  }
  int destroyEvent(void *EventPtr) {
    EventPool.release(reinterpret_cast<CUevent>(EventPtr));
    return OFFLOAD_SUCCESS;
  }
  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from a
    // specific CUDA version onwards, and it is defined as 0x0. In previous
    // versions, per the CUDA API documentation, that argument has to be 0x0.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(Event));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }
};
DeviceRTLTy DeviceRTL;
// Exposed library API function
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}
int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}
) {
1458 assert(DeviceRTL
.isValidDeviceId(device_id
) && "device_id is invalid");
1460 return DeviceRTL
.initDevice(device_id
);
1463 __tgt_target_table
*__tgt_rtl_load_binary(int32_t device_id
,
1464 __tgt_device_image
*image
) {
1465 assert(DeviceRTL
.isValidDeviceId(device_id
) && "device_id is invalid");
1467 return DeviceRTL
.loadBinary(device_id
, image
);
1470 void *__tgt_rtl_data_alloc(int32_t device_id
, int64_t size
, void *,
1472 assert(DeviceRTL
.isValidDeviceId(device_id
) && "device_id is invalid");
1474 return DeviceRTL
.dataAlloc(device_id
, size
, (TargetAllocTy
)kind
);
1477 int32_t __tgt_rtl_data_submit(int32_t device_id
, void *tgt_ptr
, void *hst_ptr
,
1479 assert(DeviceRTL
.isValidDeviceId(device_id
) && "device_id is invalid");
1481 __tgt_async_info AsyncInfo
;
1482 const int32_t rc
= __tgt_rtl_data_submit_async(device_id
, tgt_ptr
, hst_ptr
,
1484 if (rc
!= OFFLOAD_SUCCESS
)
1485 return OFFLOAD_FAIL
;
1487 return __tgt_rtl_synchronize(device_id
, &AsyncInfo
);
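
// The remaining synchronous entry points follow the same pattern as the one
// above: create a local __tgt_async_info, forward to the *_async variant, and
// then call __tgt_rtl_synchronize on it.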
int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}
int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}
int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}
int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}
int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}
int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}
int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}
void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}
void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  DeviceRTL.printDeviceInfo(device_id);
}
int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
  assert(event && "event is nullptr");
  return DeviceRTL.createEvent(event);
}
int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
                               __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  assert(event_ptr && "event_ptr is nullptr");

  return recordEvent(event_ptr, async_info_ptr);
}
int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
                             __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
}
int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return syncEvent(event_ptr);
}
int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.destroyEvent(event_ptr);
}