//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL NextGen for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <mutex>
#include <string>
#include <unordered_map>

#include "Environment.h"
#include "GlobalHandler.h"
#include "OmptCallback.h"
#include "PluginInterface.h"

#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/Error.h"

namespace llvm {
namespace omp {
namespace target {
namespace plugin {
/// Forward declarations for all specialized data structures.
struct CUDAKernelTy;
struct CUDADeviceTy;
struct CUDAPluginTy;

/// Class implementing the CUDA device images properties.
struct CUDADeviceImageTy : public DeviceImageTy {
  /// Create the CUDA image with the id and the target image pointer.
  CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
      : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {}

  /// Load the image as a CUDA module.
  Error loadModule() {
    assert(!Module && "Module already loaded");

    CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
    if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s"))
      return Err;

    return Plugin::success();
  }

  /// Unload the CUDA module corresponding to the image.
  Error unloadModule() {
    assert(Module && "Module not loaded");

    CUresult Res = cuModuleUnload(Module);
    if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s"))
      return Err;
    Module = nullptr;

    return Plugin::success();
  }

  /// Getter of the CUDA module.
  CUmodule getModule() const { return Module; }

private:
  /// The CUDA module that loaded the image.
  CUmodule Module;
};
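
// Note: cuModuleLoadDataEx accepts a cubin, PTX, or fatbin image resident in
// host memory, so loadModule() covers both offline-compiled and JIT-compiled
// images. A minimal standalone sketch of the same load/unload cycle:
//
//   CUmodule M;
//   if (cuModuleLoadDataEx(&M, ImageStart, 0, nullptr, nullptr) == CUDA_SUCCESS)
//     cuModuleUnload(M);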

/// Class implementing the CUDA kernel functionalities which derives from the
/// generic kernel class.
struct CUDAKernelTy : public GenericKernelTy {
  /// Create a CUDA kernel with a name and an execution mode.
  CUDAKernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}

  /// Initialize the CUDA kernel.
  Error initImpl(GenericDeviceTy &GenericDevice,
                 DeviceImageTy &Image) override {
    CUresult Res;
    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);

    // Retrieve the function pointer of the kernel.
    Res = cuModuleGetFunction(&Func, CUDAImage.getModule(), getName());
    if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s",
                                 getName()))
      return Err;

    // Check that the function pointer is valid.
    if (!Func)
      return Plugin::error("Invalid function for kernel %s", getName());

    int MaxThreads;
    Res = cuFuncGetAttribute(&MaxThreads,
                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
    if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
      return Err;

    // The maximum number of threads cannot exceed the maximum of the kernel.
    MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);

    return Plugin::success();
  }

  /// Launch the CUDA kernel function.
  Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
                   uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

private:
  /// The CUDA kernel function to execute.
  CUfunction Func;
};
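
// For example, if a kernel was compiled with __launch_bounds__(256), the
// CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK query in initImpl() returns 256 and
// MaxNumThreads is clamped from the device-wide maximum (e.g. 1024) down to
// 256 before any launch.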

/// Class wrapping a CUDA stream reference. These are the objects handled by
/// the Stream Manager for the CUDA plugin.
struct CUDAStreamRef final : public GenericDeviceResourceRef {
  /// The underlying handle type for streams.
  using HandleTy = CUstream;

  /// Create an empty reference to an invalid stream.
  CUDAStreamRef() : Stream(nullptr) {}

  /// Create a reference to an existing stream.
  CUDAStreamRef(HandleTy Stream) : Stream(Stream) {}

  /// Create a new stream and save the reference. The reference must be empty
  /// before calling this function.
  Error create(GenericDeviceTy &Device) override {
    if (Stream)
      return Plugin::error("Creating an existing stream");

    CUresult Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
    if (auto Err = Plugin::check(Res, "Error in cuStreamCreate: %s"))
      return Err;

    return Plugin::success();
  }

  /// Destroy the referenced stream and invalidate the reference. The
  /// reference must be to a valid stream before calling this function.
  Error destroy(GenericDeviceTy &Device) override {
    if (!Stream)
      return Plugin::error("Destroying an invalid stream");

    CUresult Res = cuStreamDestroy(Stream);
    if (auto Err = Plugin::check(Res, "Error in cuStreamDestroy: %s"))
      return Err;
    Stream = nullptr;

    return Plugin::success();
  }

  /// Get the underlying CUDA stream.
  operator HandleTy() const { return Stream; }

private:
  /// The reference to the CUDA stream.
  HandleTy Stream;
};
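
// These references are pooled by the resource managers defined below. A
// typical acquire/release cycle, as performed by getStream() and
// synchronizeImpl():
//
//   CUstream Stream;
//   if (auto Err = CUDAStreamManager.getResource(Stream))
//     return Err;
//   // ... enqueue asynchronous work on Stream ...
//   if (auto Err = CUDAStreamManager.returnResource(Stream))
//     return Err;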

/// Class wrapping a CUDA event reference. These are the objects handled by
/// the Event Manager for the CUDA plugin.
struct CUDAEventRef final : public GenericDeviceResourceRef {
  /// The underlying handle type for events.
  using HandleTy = CUevent;

  /// Create an empty reference to an invalid event.
  CUDAEventRef() : Event(nullptr) {}

  /// Create a reference to an existing event.
  CUDAEventRef(HandleTy Event) : Event(Event) {}

  /// Create a new event and save the reference. The reference must be empty
  /// before calling this function.
  Error create(GenericDeviceTy &Device) override {
    if (Event)
      return Plugin::error("Creating an existing event");

    CUresult Res = cuEventCreate(&Event, CU_EVENT_DEFAULT);
    if (auto Err = Plugin::check(Res, "Error in cuEventCreate: %s"))
      return Err;

    return Plugin::success();
  }

  /// Destroy the referenced event and invalidate the reference. The reference
  /// must be to a valid event before calling this function.
  Error destroy(GenericDeviceTy &Device) override {
    if (!Event)
      return Plugin::error("Destroying an invalid event");

    CUresult Res = cuEventDestroy(Event);
    if (auto Err = Plugin::check(Res, "Error in cuEventDestroy: %s"))
      return Err;
    Event = nullptr;

    return Plugin::success();
  }

  /// Get the underlying CUevent.
  operator HandleTy() const { return Event; }

private:
  /// The reference to the CUDA event.
  HandleTy Event;
};

/// Class implementing the CUDA device functionalities which derives from the
/// generic device class.
struct CUDADeviceTy : public GenericDeviceTy {
  // Create a CUDA device with a device id and the default CUDA grid values.
  CUDADeviceTy(int32_t DeviceId, int32_t NumDevices)
      : GenericDeviceTy(DeviceId, NumDevices, NVPTXGridValues),
        CUDAStreamManager(*this), CUDAEventManager(*this) {}

  /// Initialize the device, its resources and get its properties.
  Error initImpl(GenericPluginTy &Plugin) override {
    CUresult Res = cuDeviceGet(&Device, DeviceId);
    if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
      return Err;

    // Query the current flags of the primary context and set its flags if
    // it is inactive.
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Res = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (auto Err =
            Plugin::check(Res, "Error in cuDevicePrimaryCtxGetState: %s"))
      return Err;

    if (FormerPrimaryCtxIsActive) {
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
             "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "The primary context is inactive, set its flags to "
           "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Res = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (auto Err =
              Plugin::check(Res, "Error in cuDevicePrimaryCtxSetFlags: %s"))
        return Err;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Res = cuDevicePrimaryCtxRetain(&Context, Device);
    if (auto Err = Plugin::check(Res, "Error in cuDevicePrimaryCtxRetain: %s"))
      return Err;

    if (auto Err = setContext())
      return Err;

    // Initialize stream pool.
    if (auto Err = CUDAStreamManager.init(OMPX_InitialNumStreams))
      return Err;

    // Initialize event pool.
    if (auto Err = CUDAEventManager.init(OMPX_InitialNumEvents))
      return Err;

    // Query attributes to determine number of threads/block and blocks/grid.
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                                 GridValues.GV_Max_Teams))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                 GridValues.GV_Max_WG_Size))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                                 GridValues.GV_Warp_Size))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                 ComputeCapability.Major))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                 ComputeCapability.Minor))
      return Err;

    uint32_t NumMultiprocessors = 0;
    uint32_t MaxThreadsPerSM = 0;
    uint32_t WarpSize = 0;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                 NumMultiprocessors))
      return Err;
    if (auto Err =
            getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                          MaxThreadsPerSM))
      return Err;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
      return Err;
    HardwareParallelism = NumMultiprocessors * (MaxThreadsPerSM / WarpSize);

    return Plugin::success();
  }
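
  // As a worked example of the computation above: an NVIDIA A100 reports 108
  // SMs, 2048 maximum threads per SM, and a warp size of 32, so
  // HardwareParallelism becomes 108 * (2048 / 32) = 6912 resident warps.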

  /// Deinitialize the device and release its resources.
  Error deinitImpl() override {
    if (auto Err = setContext())
      return Err;

    // Deinitialize the stream manager.
    if (auto Err = CUDAStreamManager.deinit())
      return Err;

    if (auto Err = CUDAEventManager.deinit())
      return Err;

    // Close modules if necessary.
    if (!LoadedImages.empty()) {
      assert(Context && "Invalid CUDA context");

      // Each image has its own module.
      for (DeviceImageTy *Image : LoadedImages) {
        CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);

        // Unload the module of the image.
        if (auto Err = CUDAImage.unloadModule())
          return Err;
      }
    }

    if (Context) {
      CUresult Res = cuDevicePrimaryCtxRelease(Device);
      if (auto Err =
              Plugin::check(Res, "Error in cuDevicePrimaryCtxRelease: %s"))
        return Err;
    }

    // Invalidate context and device references.
    Context = nullptr;
    Device = CU_DEVICE_INVALID;

    return Plugin::success();
  }

  /// Allocate and construct a CUDA kernel.
  Expected<GenericKernelTy &>
  constructKernel(const __tgt_offload_entry &KernelEntry) override {
    // Allocate and construct the CUDA kernel.
    CUDAKernelTy *CUDAKernel = Plugin::get().allocate<CUDAKernelTy>();
    if (!CUDAKernel)
      return Plugin::error("Failed to allocate memory for CUDA kernel");

    new (CUDAKernel) CUDAKernelTy(KernelEntry.name);

    return *CUDAKernel;
  }

  /// Set the current context to this device's context.
  Error setContext() override {
    CUresult Res = cuCtxSetCurrent(Context);
    return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
  }

  /// NVIDIA returns the product of the SM count and the number of warps that
  /// fit if the maximum number of threads were scheduled on each SM.
  uint64_t getHardwareParallelism() const override {
    return HardwareParallelism;
  }

  /// We want to set up the RPC server for host services to the GPU if it is
  /// available.
  bool shouldSetupRPCServer() const override {
    return libomptargetSupportsRPC();
  }

  /// The RPC interface should have enough space for all available parallelism.
  uint64_t requestedRPCPortCount() const override {
    return getHardwareParallelism();
  }

  /// Get the stream of the asynchronous info structure or get a new one.
  Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
    // Get the stream (if any) from the async info.
    Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
    if (!Stream) {
      // There was no stream; get an idle one.
      if (auto Err = CUDAStreamManager.getResource(Stream))
        return Err;

      // Modify the async info's stream.
      AsyncInfoWrapper.setQueueAs<CUstream>(Stream);
    }
    return Plugin::success();
  }

  /// Getters of CUDA references.
  CUcontext getCUDAContext() const { return Context; }
  CUdevice getCUDADevice() const { return Device; }

  /// Load the binary image into the device and allocate an image object.
  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
                                           int32_t ImageId) override {
    if (auto Err = setContext())
      return std::move(Err);

    // Allocate and initialize the image object.
    CUDADeviceImageTy *CUDAImage = Plugin::get().allocate<CUDADeviceImageTy>();
    new (CUDAImage) CUDADeviceImageTy(ImageId, TgtImage);

    // Load the CUDA module.
    if (auto Err = CUDAImage->loadModule())
      return std::move(Err);

    return CUDAImage;
  }

  /// Allocate memory on the device or related to the device.
  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
    if (Size == 0)
      return nullptr;

    if (auto Err = setContext()) {
      REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
      return nullptr;
    }

    void *MemAlloc = nullptr;
    CUdeviceptr DevicePtr;
    CUresult Res;

    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      Res = cuMemAlloc(&DevicePtr, Size);
      MemAlloc = (void *)DevicePtr;
      break;
    case TARGET_ALLOC_HOST:
      Res = cuMemAllocHost(&MemAlloc, Size);
      break;
    case TARGET_ALLOC_SHARED:
      Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
      MemAlloc = (void *)DevicePtr;
      break;
    }

    if (auto Err =
            Plugin::check(Res, "Error in cuMemAlloc[Host|Managed]: %s")) {
      REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
      return nullptr;
    }
    return MemAlloc;
  }
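
  // The three allocation kinds map directly onto the driver API:
  // TARGET_ALLOC_DEVICE uses cuMemAlloc (device-only memory), TARGET_ALLOC_HOST
  // uses cuMemAllocHost (page-locked host memory the device can access), and
  // TARGET_ALLOC_SHARED uses cuMemAllocManaged (managed memory migrated on
  // demand between host and device).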

  /// Deallocate memory on the device or related to the device.
  int free(void *TgtPtr, TargetAllocTy Kind) override {
    if (TgtPtr == nullptr)
      return OFFLOAD_SUCCESS;

    if (auto Err = setContext()) {
      REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
      return OFFLOAD_FAIL;
    }

    CUresult Res;
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
    case TARGET_ALLOC_SHARED:
      Res = cuMemFree((CUdeviceptr)TgtPtr);
      break;
    case TARGET_ALLOC_HOST:
      Res = cuMemFreeHost(TgtPtr);
      break;
    }

    if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
      REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Synchronize current thread with the pending operations on the async info.
  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
    CUresult Res;

    // If we have an RPC server running on this device we will continuously
    // query it for work rather than blocking.
    if (!getRPCServer()) {
      Res = cuStreamSynchronize(Stream);
    } else {
      do {
        Res = cuStreamQuery(Stream);
        if (auto Err = getRPCServer()->runServer(*this))
          return Err;
      } while (Res == CUDA_ERROR_NOT_READY);
    }

    // Once the stream is synchronized, return it to stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    AsyncInfo.Queue = nullptr;
    if (auto Err = CUDAStreamManager.returnResource(Stream))
      return Err;

    return Plugin::check(Res, "Error in cuStreamSynchronize: %s");
  }

  /// CUDA supports VA management.
  bool supportVAManagement() const override { return true; }

  /// Allocates \p RSize bytes (rounded up to page size) and hints the CUDA
  /// driver to map it to \p VAddr. The obtained address is stored in \p Addr.
  /// At return \p RSize contains the actual size.
  Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override {
    CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
    auto IHandle = DeviceMMaps.find(DVAddr);
    size_t Size = *RSize;

    if (Size == 0)
      return Plugin::error("Memory Map Size must be larger than 0");

    // Check if we have already mapped this address.
    if (IHandle != DeviceMMaps.end())
      return Plugin::error("Address already memory mapped");

    CUmemAllocationProp Prop = {};
    size_t Granularity = 0;

    size_t Free, Total;
    CUresult Res = cuMemGetInfo(&Free, &Total);
    if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s"))
      return Err;

    if (Size >= Free)
      return Plugin::error(
          "Cannot map memory size larger than the available device memory");

    // Currently NVIDIA only supports pinned device types.
    Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;

    Prop.location.id = DeviceId;
    Res = cuMemGetAllocationGranularity(&Granularity, &Prop,
                                        CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (auto Err =
            Plugin::check(Res, "Error in cuMemGetAllocationGranularity: %s"))
      return Err;

    if (Granularity == 0)
      return Plugin::error("Wrong device Page size");

    // Ceil to page size.
    Size = roundUp(Size, Granularity);

    // Create a handler of our allocation.
    CUmemGenericAllocationHandle AHandle;
    Res = cuMemCreate(&AHandle, Size, &Prop, 0);
    if (auto Err = Plugin::check(Res, "Error in cuMemCreate: %s"))
      return Err;

    CUdeviceptr DevPtr = 0;
    Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0);
    if (auto Err = Plugin::check(Res, "Error in cuMemAddressReserve: %s"))
      return Err;

    Res = cuMemMap(DevPtr, Size, 0, AHandle, 0);
    if (auto Err = Plugin::check(Res, "Error in cuMemMap: %s"))
      return Err;

    CUmemAccessDesc ADesc = {};
    ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    ADesc.location.id = DeviceId;
    ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

    Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1);
    if (auto Err = Plugin::check(Res, "Error in cuMemSetAccess: %s"))
      return Err;

    *Addr = reinterpret_cast<void *>(DevPtr);
    *RSize = Size;
    DeviceMMaps.insert({DevPtr, AHandle});
    return Plugin::success();
  }
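
  // The mapping above follows the usual CUDA virtual memory management
  // sequence. A condensed sketch of the same flow, with error handling and the
  // Prop/ADesc setup elided:
  //
  //   CUmemGenericAllocationHandle H;
  //   cuMemCreate(&H, Size, &Prop, 0);              // physical allocation
  //   CUdeviceptr P;
  //   cuMemAddressReserve(&P, Size, 0, DVAddr, 0);  // VA reservation (hint)
  //   cuMemMap(P, Size, 0, H, 0);                   // back the VA range
  //   cuMemSetAccess(P, Size, &ADesc, 1);           // grant RW access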

  /// De-allocates device memory and unmaps the virtual address.
  Error memoryVAUnMap(void *VAddr, size_t Size) override {
    CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
    auto IHandle = DeviceMMaps.find(DVAddr);
    // Mapping does not exist.
    if (IHandle == DeviceMMaps.end())
      return Plugin::error("Addr is not MemoryMapped");

    CUmemGenericAllocationHandle &AllocHandle = IHandle->second;

    CUresult Res = cuMemUnmap(DVAddr, Size);
    if (auto Err = Plugin::check(Res, "Error in cuMemUnmap: %s"))
      return Err;

    Res = cuMemRelease(AllocHandle);
    if (auto Err = Plugin::check(Res, "Error in cuMemRelease: %s"))
      return Err;

    Res = cuMemAddressFree(DVAddr, Size);
    if (auto Err = Plugin::check(Res, "Error in cuMemAddressFree: %s"))
      return Err;

    DeviceMMaps.erase(IHandle);
    return Plugin::success();
  }

  /// Query for the completion of the pending operations on the async info.
  Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
    CUresult Res = cuStreamQuery(Stream);

    // Not ready streams must be considered as successful operations.
    if (Res == CUDA_ERROR_NOT_READY)
      return Plugin::success();

    // Once the stream is synchronized and the operations completed (or an
    // error occurs), return it to stream pool and reset AsyncInfo. This is to
    // make sure the synchronization only works for its own tasks.
    AsyncInfo.Queue = nullptr;
    if (auto Err = CUDAStreamManager.returnResource(Stream))
      return Err;

    return Plugin::check(Res, "Error in cuStreamQuery: %s");
  }

  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
    // TODO: Register the buffer as CUDA host memory.
    return HstPtr;
  }

  Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }

  Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
                                 void *&BaseDevAccessiblePtr,
                                 size_t &BaseSize) const override {
    // TODO: Implement pinning feature for CUDA.
    return false;
  }

  /// Submit data to the device (host to device transfer).
  Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    if (auto Err = setContext())
      return Err;

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    CUresult Res =
        cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    return Plugin::check(Res, "Error in cuMemcpyHtoDAsync: %s");
  }

  /// Retrieve data from the device (device to host transfer).
  Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    if (auto Err = setContext())
      return Err;

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    // If there is already pending work on the stream it could be waiting for
    // someone to check the RPC server.
    if (auto RPCServer = getRPCServer()) {
      CUresult Res = cuStreamQuery(Stream);
      while (Res == CUDA_ERROR_NOT_READY) {
        if (auto Err = RPCServer->runServer(*this))
          return Err;

        Res = cuStreamQuery(Stream);
      }
    }

    CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    return Plugin::check(Res, "Error in cuMemcpyDtoHAsync: %s");
  }

  /// Exchange data between two devices directly. We may use peer access if
  /// the CUDA devices and driver allow it.
  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
                         void *DstPtr, int64_t Size,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) override;

  /// Initialize the async info for interoperability purposes.
  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    if (auto Err = setContext())
      return Err;

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    return Plugin::success();
  }

  /// Initialize the device info for interoperability purposes.
  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
    assert(Context && "Context is null");
    assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device");

    if (auto Err = setContext())
      return Err;

    if (!DeviceInfo->Context)
      DeviceInfo->Context = Context;

    if (!DeviceInfo->Device)
      DeviceInfo->Device = reinterpret_cast<void *>(Device);

    return Plugin::success();
  }

  /// Create an event.
  Error createEventImpl(void **EventPtrStorage) override {
    CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage);
    return CUDAEventManager.getResource(*Event);
  }

  /// Destroy a previously created event.
  Error destroyEventImpl(void *EventPtr) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
    return CUDAEventManager.returnResource(Event);
  }

  /// Record the event.
  Error recordEventImpl(void *EventPtr,
                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    CUresult Res = cuEventRecord(Event, Stream);
    return Plugin::check(Res, "Error in cuEventRecord: %s");
  }

  /// Make the stream wait on the event.
  Error waitEventImpl(void *EventPtr,
                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    // Do not use CU_EVENT_WAIT_DEFAULT here: it is only available from
    // specific CUDA versions onwards (defined as 0x0); in earlier versions,
    // per the CUDA API documentation, the argument has to be 0x0.
    CUresult Res = cuStreamWaitEvent(Stream, Event, 0);
    return Plugin::check(Res, "Error in cuStreamWaitEvent: %s");
  }

  /// Synchronize the current thread with the event.
  Error syncEventImpl(void *EventPtr) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
    CUresult Res = cuEventSynchronize(Event);
    return Plugin::check(Res, "Error in cuEventSynchronize: %s");
  }

  /// Print information about the device.
  Error obtainInfoImpl(InfoQueueTy &Info) override {
    char TmpChar[1000];
    const char *TmpCharPtr;
    size_t TmpSt;
    int TmpInt;

    CUresult Res = cuDriverGetVersion(&TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("CUDA Driver Version", TmpInt);

    Info.add("CUDA OpenMP Device Number", DeviceId);

    Res = cuDeviceGetName(TmpChar, 1000, Device);
    if (Res == CUDA_SUCCESS)
      Info.add("Device Name", TmpChar);

    Res = cuDeviceTotalMem(&TmpSt, Device);
    if (Res == CUDA_SUCCESS)
      Info.add("Global Memory Size", TmpSt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Number of Multiprocessors", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Concurrent Copy and Execution", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Total Constant Memory", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                           TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Max Shared Memory per Block", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Registers per Block", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_WARP_SIZE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Warp Size", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Maximum Threads per Block", TmpInt);

    Info.add("Maximum Block Dimensions", "");
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("x", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("y", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("z", TmpInt);

    Info.add("Maximum Grid Dimensions", "");
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("x", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("y", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("z", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Maximum Memory Pitch", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Texture Alignment", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Clock Rate", TmpInt, "kHz");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Execution Timeout", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_INTEGRATED, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Integrated Device", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Can Map Host Memory", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt);
    if (Res == CUDA_SUCCESS) {
      if (TmpInt == CU_COMPUTEMODE_DEFAULT)
        TmpCharPtr = "Default";
      else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
        TmpCharPtr = "Prohibited";
      else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
        TmpCharPtr = "Exclusive process";
      else
        TmpCharPtr = "Unknown";
      Info.add("Compute Mode", TmpCharPtr);
    }

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Concurrent Kernels", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("ECC Enabled", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Memory Clock Rate", TmpInt, "kHz");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Memory Bus Width", TmpInt, "bits");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("L2 Cache Size", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                           TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Max Threads Per SMP", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Async Engines", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Unified Addressing", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Managed Memory", (bool)TmpInt);

    Res =
        getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Concurrent Managed Memory", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
                           TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Preemption Supported", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Cooperative Launch", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Multi-Device Boards", (bool)TmpInt);

    Info.add("Compute Capabilities", ComputeCapability.str());

    return Plugin::success();
  }

  virtual bool shouldSetupDeviceMemoryPool() const override {
    /// We use the CUDA malloc for now.
    return false;
  }

  /// Getters and setters for stack and heap sizes.
  Error getDeviceStackSize(uint64_t &Value) override {
    return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
  }
  Error setDeviceStackSize(uint64_t Value) override {
    return setCtxLimit(CU_LIMIT_STACK_SIZE, Value);
  }
  Error getDeviceHeapSize(uint64_t &Value) override {
    return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
  }
  Error setDeviceHeapSize(uint64_t Value) override {
    return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
  }
  Error getDeviceMemorySize(uint64_t &Value) override {
    CUresult Res = cuDeviceTotalMem(&Value, Device);
    return Plugin::check(Res, "Error in getDeviceMemorySize %s");
  }

  /// CUDA-specific functions for getting and setting context limits.
  Error setCtxLimit(CUlimit Kind, uint64_t Value) {
    CUresult Res = cuCtxSetLimit(Kind, Value);
    return Plugin::check(Res, "Error in cuCtxSetLimit: %s");
  }
  Error getCtxLimit(CUlimit Kind, uint64_t &Value) {
    CUresult Res = cuCtxGetLimit(&Value, Kind);
    return Plugin::check(Res, "Error in cuCtxGetLimit: %s");
  }

  /// CUDA-specific function to get device attributes.
  Error getDeviceAttr(uint32_t Kind, uint32_t &Value) {
    // TODO: Warn if the new value is larger than the old.
    CUresult Res =
        cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device);
    return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s");
  }

  CUresult getDeviceAttrRaw(uint32_t Kind, int &Value) {
    return cuDeviceGetAttribute(&Value, (CUdevice_attribute)Kind, Device);
  }

  /// See GenericDeviceTy::getComputeUnitKind().
  std::string getComputeUnitKind() const override {
    return ComputeCapability.str();
  }

  /// Returns the clock frequency for the given NVPTX device.
  uint64_t getClockFrequency() const override { return 1000000000; }

private:
  using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
  using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;

  /// Stream manager for CUDA streams.
  CUDAStreamManagerTy CUDAStreamManager;

  /// Event manager for CUDA events.
  CUDAEventManagerTy CUDAEventManager;

  /// The device's context. This context should be set before performing
  /// operations on the device.
  CUcontext Context = nullptr;

  /// The CUDA device handler.
  CUdevice Device = CU_DEVICE_INVALID;

  /// The memory mapped addresses and their handles.
  std::unordered_map<CUdeviceptr, CUmemGenericAllocationHandle> DeviceMMaps;
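
  /// Peer-access bookkeeping used by dataExchangeImpl(). These declarations
  /// are reconstructed from their uses there (the container choice is an
  /// assumption); PENDING is listed first so that a default-constructed entry
  /// means the link has not been checked yet.
  enum class PeerAccessState : uint8_t { PENDING, AVAILABLE, UNAVAILABLE };
  std::unordered_map<int32_t, PeerAccessState> PeerAccesses;
  std::mutex PeerAccessesLock;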

  /// The compute capability of the corresponding CUDA device.
  struct ComputeCapabilityTy {
    uint32_t Major;
    uint32_t Minor;
    std::string str() const {
      return "sm_" + std::to_string(Major * 10 + Minor);
    }
  } ComputeCapability;

  /// The maximum number of warps that can be resident on all the SMs
  /// simultaneously.
  uint32_t HardwareParallelism = 0;
};

Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                               uint32_t NumThreads, uint64_t NumBlocks,
                               KernelArgsTy &KernelArgs, void *Args,
                               AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);

  CUstream Stream;
  if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
    return Err;

  uint32_t MaxDynCGroupMem =
      std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());

  CUresult Res =
      cuLaunchKernel(Func, NumBlocks, /* gridDimY */ 1,
                     /* gridDimZ */ 1, NumThreads,
                     /* blockDimY */ 1, /* blockDimZ */ 1, MaxDynCGroupMem,
                     Stream, (void **)Args, nullptr);
  return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
}

/// Class implementing the CUDA-specific functionalities of the global handler.
class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
public:
  /// Get the metadata of a global from the device. The name and size of the
  /// global is read from DeviceGlobal and the address of the global is written
  /// to DeviceGlobal.
  Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
                                    DeviceImageTy &Image,
                                    GlobalTy &DeviceGlobal) override {
    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);

    const char *GlobalName = DeviceGlobal.getName().data();

    size_t CUSize;
    CUdeviceptr CUPtr;
    CUresult Res =
        cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName);
    if (auto Err = Plugin::check(Res, "Error in cuModuleGetGlobal for '%s': %s",
                                 GlobalName))
      return Err;

    if (CUSize != DeviceGlobal.getSize())
      return Plugin::error(
          "Failed to load global '%s' due to size mismatch (%zu != %zu)",
          GlobalName, CUSize, (size_t)DeviceGlobal.getSize());

    DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
    return Plugin::success();
  }
};
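
// The generic layer uses this hook to resolve device-side symbols by name,
// e.g. when reading or writing plugin-managed globals such as the device
// environment declared in Environment.h; the size reported by
// cuModuleGetGlobal is validated against the host's expectation before any
// transfer happens.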

/// Class implementing the CUDA-specific functionalities of the plugin.
struct CUDAPluginTy final : public GenericPluginTy {
  /// Create a CUDA plugin.
  CUDAPluginTy() : GenericPluginTy(getTripleArch()) {}

  /// This class should not be copied.
  CUDAPluginTy(const CUDAPluginTy &) = delete;
  CUDAPluginTy(CUDAPluginTy &&) = delete;

  /// Initialize the plugin and return the number of devices.
  Expected<int32_t> initImpl() override {
    CUresult Res = cuInit(0);
    if (Res == CUDA_ERROR_INVALID_HANDLE) {
      // Cannot call cuGetErrorString if dlsym failed.
      DP("Failed to load CUDA shared library\n");
      return 0;
    }

#ifdef OMPT_SUPPORT
    ompt::connectLibrary();
#endif

    if (Res == CUDA_ERROR_NO_DEVICE) {
      // Do not initialize if there are no devices.
      DP("There are no devices supporting CUDA.\n");
      return 0;
    }

    if (auto Err = Plugin::check(Res, "Error in cuInit: %s"))
      return std::move(Err);

    // Get the number of devices.
    int NumDevices;
    Res = cuDeviceGetCount(&NumDevices);
    if (auto Err = Plugin::check(Res, "Error in cuDeviceGetCount: %s"))
      return std::move(Err);

    // Do not initialize if there are no devices.
    if (NumDevices == 0)
      DP("There are no devices supporting CUDA.\n");

    return NumDevices;
  }

  /// Deinitialize the plugin.
  Error deinitImpl() override { return Plugin::success(); }

  /// Get the ELF code for recognizing the compatible image binary.
  uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; }

  Triple::ArchType getTripleArch() const override {
    // TODO: I think we can drop the support for 32-bit NVPTX devices.
    return Triple::nvptx64;
  }

  /// Check whether the image is compatible with the available CUDA devices.
  Expected<bool> isImageCompatible(__tgt_image_info *Info) const override {
    for (int32_t DevId = 0; DevId < getNumDevices(); ++DevId) {
      CUdevice Device;
      CUresult Res = cuDeviceGet(&Device, DevId);
      if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
        return std::move(Err);

      int32_t Major, Minor;
      Res = cuDeviceGetAttribute(
          &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
      if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
        return std::move(Err);

      Res = cuDeviceGetAttribute(
          &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
      if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
        return std::move(Err);

      StringRef ArchStr(Info->Arch);
      StringRef PrefixStr("sm_");
      if (!ArchStr.startswith(PrefixStr))
        return Plugin::error("Unrecognized image arch %s", ArchStr.data());

      int32_t ImageMajor = ArchStr[PrefixStr.size() + 0] - '0';
      int32_t ImageMinor = ArchStr[PrefixStr.size() + 1] - '0';

      // A cubin generated for a certain compute capability is supported to
      // run on any GPU with the same major revision and same or higher minor
      // revision.
      if (Major != ImageMajor || Minor < ImageMinor)
        return false;
    }
    return true;
  }
};
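
// For example, under this rule an image built for sm_70 is accepted on an
// sm_75 device (same major revision, higher minor) but rejected on sm_62 or
// sm_80 devices (different major revision).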

Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
                                     GenericDeviceTy &DstGenericDevice,
                                     void *DstPtr, int64_t Size,
                                     AsyncInfoWrapperTy &AsyncInfoWrapper) {
  if (auto Err = setContext())
    return Err;

  CUDADeviceTy &DstDevice = static_cast<CUDADeviceTy &>(DstGenericDevice);

  CUresult Res;
  int32_t DstDeviceId = DstDevice.DeviceId;
  CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr;
  CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr;

  int CanAccessPeer = 0;
  if (DeviceId != DstDeviceId) {
    // Make sure the lock is released before performing the copies.
    std::lock_guard<std::mutex> Lock(PeerAccessesLock);

    switch (PeerAccesses[DstDeviceId]) {
    case PeerAccessState::AVAILABLE:
      CanAccessPeer = 1;
      break;
    case PeerAccessState::UNAVAILABLE:
      CanAccessPeer = 0;
      break;
    case PeerAccessState::PENDING:
      // Check whether the source device can access the destination device.
      Res = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device);
      if (auto Err = Plugin::check(Res, "Error in cuDeviceCanAccessPeer: %s"))
        return Err;

      if (CanAccessPeer) {
        Res = cuCtxEnablePeerAccess(DstDevice.Context, 0);
        if (Res == CUDA_ERROR_TOO_MANY_PEERS) {
          // Resources may be exhausted due to many P2P links.
          CanAccessPeer = 0;
          DP("Too many P2P links; falling back to D2D memcpy\n");
        } else if (auto Err = Plugin::check(
                       Res, "Error in cuCtxEnablePeerAccess: %s"))
          return Err;
      }

      PeerAccesses[DstDeviceId] = (CanAccessPeer)
                                      ? PeerAccessState::AVAILABLE
                                      : PeerAccessState::UNAVAILABLE;
      break;
    }
  }

  CUstream Stream;
  if (auto Err = getStream(AsyncInfoWrapper, Stream))
    return Err;

  if (CanAccessPeer) {
    // TODO: Should we fallback to D2D if peer access fails?
    Res = cuMemcpyPeerAsync(CUDstPtr, DstDevice.Context, CUSrcPtr, Context,
                            Size, Stream);
    return Plugin::check(Res, "Error in cuMemcpyPeerAsync: %s");
  }

  // Fallback to D2D copy.
  Res = cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream);
  return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s");
}

GenericPluginTy *Plugin::createPlugin() { return new CUDAPluginTy(); }

GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {
  return new CUDADeviceTy(DeviceId, NumDevices);
}

GenericGlobalHandlerTy *Plugin::createGlobalHandler() {
  return new CUDAGlobalHandlerTy();
}

template <typename... ArgsTy>
Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
  CUresult ResultCode = static_cast<CUresult>(Code);
  if (ResultCode == CUDA_SUCCESS)
    return Error::success();

  const char *Desc = "Unknown error";
  CUresult Ret = cuGetErrorString(ResultCode, &Desc);
  if (Ret != CUDA_SUCCESS)
    REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);

  return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
                                                    ErrFmt, Args..., Desc);
}

} // namespace plugin
} // namespace target
} // namespace omp
} // namespace llvm