1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL NextGen for CUDA machine
11 //===----------------------------------------------------------------------===//
13 #include <cassert>
14 #include <cstddef>
15 #include <cuda.h>
16 #include <string>
17 #include <unordered_map>
19 #include "Debug.h"
20 #include "Environment.h"
21 #include "GlobalHandler.h"
22 #include "OmptCallback.h"
23 #include "PluginInterface.h"
25 #include "llvm/BinaryFormat/ELF.h"
26 #include "llvm/Frontend/OpenMP/OMPConstants.h"
27 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
28 #include "llvm/Support/Error.h"
30 namespace llvm {
31 namespace omp {
32 namespace target {
33 namespace plugin {
35 /// Forward declarations for all specialized data structures.
36 struct CUDAKernelTy;
37 struct CUDADeviceTy;
38 struct CUDAPluginTy;
40 /// Class implementing the CUDA device image properties.
41 struct CUDADeviceImageTy : public DeviceImageTy {
42 /// Create the CUDA image with the id and the target image pointer.
43 CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
44 : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {}
46 /// Load the image as a CUDA module.
47 Error loadModule() {
48 assert(!Module && "Module already loaded");
50 CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
51 if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s"))
52 return Err;
54 return Plugin::success();
57 /// Unload the CUDA module corresponding to the image.
58 Error unloadModule() {
59 assert(Module && "Module not loaded");
61 CUresult Res = cuModuleUnload(Module);
62 if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s"))
63 return Err;
65 Module = nullptr;
67 return Plugin::success();
70 /// Getter of the CUDA module.
71 CUmodule getModule() const { return Module; }
73 private:
74 /// The CUDA module corresponding to the image.
75 CUmodule Module;
78 /// Class implementing the CUDA kernel functionalities which derives from the
79 /// generic kernel class.
80 struct CUDAKernelTy : public GenericKernelTy {
81 /// Create a CUDA kernel with a name and an execution mode.
82 CUDAKernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}
84 /// Initialize the CUDA kernel.
85 Error initImpl(GenericDeviceTy &GenericDevice,
86 DeviceImageTy &Image) override {
87 CUresult Res;
88 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
90 // Retrieve the function pointer of the kernel.
91 Res = cuModuleGetFunction(&Func, CUDAImage.getModule(), getName());
92 if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s",
93 getName()))
94 return Err;
96 // Check that the function pointer is valid.
97 if (!Func)
98 return Plugin::error("Invalid function for kernel %s", getName());
100 int MaxThreads;
101 Res = cuFuncGetAttribute(&MaxThreads,
102 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
103 if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
104 return Err;
106 // The number of threads cannot exceed the kernel's maximum.
107 MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
109 return Plugin::success();
112 /// Launch the CUDA kernel function.
113 Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
114 uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
115 AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
117 private:
118 /// The CUDA kernel function to execute.
119 CUfunction Func;
122 /// Class wrapping a CUDA stream reference. These are the objects handled by the
123 /// Stream Manager for the CUDA plugin.
124 struct CUDAStreamRef final : public GenericDeviceResourceRef {
125 /// The underlying handle type for streams.
126 using HandleTy = CUstream;
128 /// Create an empty reference to an invalid stream.
129 CUDAStreamRef() : Stream(nullptr) {}
131 /// Create a reference to an existing stream.
132 CUDAStreamRef(HandleTy Stream) : Stream(Stream) {}
134 /// Create a new stream and save the reference. The reference must be empty
135 /// before calling this function.
136 Error create(GenericDeviceTy &Device) override {
137 if (Stream)
138 return Plugin::error("Creating an existing stream");
140 CUresult Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
141 if (auto Err = Plugin::check(Res, "Error in cuStreamCreate: %s"))
142 return Err;
144 return Plugin::success();
147 /// Destroy the referenced stream and invalidate the reference. The reference
148 /// must refer to a valid stream before calling this function.
149 Error destroy(GenericDeviceTy &Device) override {
150 if (!Stream)
151 return Plugin::error("Destroying an invalid stream");
153 CUresult Res = cuStreamDestroy(Stream);
154 if (auto Err = Plugin::check(Res, "Error in cuStreamDestroy: %s"))
155 return Err;
157 Stream = nullptr;
158 return Plugin::success();
161 /// Get the underlying CUDA stream.
162 operator HandleTy() const { return Stream; }
164 private:
165 /// The reference to the CUDA stream.
166 HandleTy Stream;
169 /// Class wrapping a CUDA event reference. These are the objects handled by the
170 /// Event Manager for the CUDA plugin.
171 struct CUDAEventRef final : public GenericDeviceResourceRef {
172 /// The underlying handle type for events.
173 using HandleTy = CUevent;
175 /// Create an empty reference to an invalid event.
176 CUDAEventRef() : Event(nullptr) {}
178 /// Create a reference to an existing event.
179 CUDAEventRef(HandleTy Event) : Event(Event) {}
181 /// Create a new event and save the reference. The reference must be empty
182 /// before calling this function.
183 Error create(GenericDeviceTy &Device) override {
184 if (Event)
185 return Plugin::error("Creating an existing event");
187 CUresult Res = cuEventCreate(&Event, CU_EVENT_DEFAULT);
188 if (auto Err = Plugin::check(Res, "Error in cuEventCreate: %s"))
189 return Err;
191 return Plugin::success();
194 /// Destroy the referenced event and invalidate the reference. The reference
195 /// must refer to a valid event before calling this function.
196 Error destroy(GenericDeviceTy &Device) override {
197 if (!Event)
198 return Plugin::error("Destroying an invalid event");
200 CUresult Res = cuEventDestroy(Event);
201 if (auto Err = Plugin::check(Res, "Error in cuEventDestroy: %s"))
202 return Err;
204 Event = nullptr;
205 return Plugin::success();
208 /// Get the underlying CUevent.
209 operator HandleTy() const { return Event; }
211 private:
212 /// The reference to the CUDA event.
213 HandleTy Event;
216 /// Class implementing the CUDA device functionalities which derives from the
217 /// generic device class.
218 struct CUDADeviceTy : public GenericDeviceTy {
219 /// Create a CUDA device with a device id and the default CUDA grid values.
220 CUDADeviceTy(int32_t DeviceId, int32_t NumDevices)
221 : GenericDeviceTy(DeviceId, NumDevices, NVPTXGridValues),
222 CUDAStreamManager(*this), CUDAEventManager(*this) {}
224 ~CUDADeviceTy() {}
226 /// Initialize the device, its resources and get its properties.
227 Error initImpl(GenericPluginTy &Plugin) override {
228 CUresult Res = cuDeviceGet(&Device, DeviceId);
229 if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
230 return Err;
232 // Query the current flags of the primary context and set its flags if
233 // it is inactive.
234 unsigned int FormerPrimaryCtxFlags = 0;
235 int FormerPrimaryCtxIsActive = 0;
236 Res = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
237 &FormerPrimaryCtxIsActive);
238 if (auto Err =
239 Plugin::check(Res, "Error in cuDevicePrimaryCtxGetState: %s"))
240 return Err;
242 if (FormerPrimaryCtxIsActive) {
243 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
244 "The primary context is active, no change to its flags\n");
245 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
246 CU_CTX_SCHED_BLOCKING_SYNC)
247 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
248 "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
249 } else {
250 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
251 "The primary context is inactive, set its flags to "
252 "CU_CTX_SCHED_BLOCKING_SYNC\n");
253 Res = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
254 if (auto Err =
255 Plugin::check(Res, "Error in cuDevicePrimaryCtxSetFlags: %s"))
256 return Err;
259 // Retain the per device primary context and save it to use whenever this
260 // device is selected.
261 Res = cuDevicePrimaryCtxRetain(&Context, Device);
262 if (auto Err = Plugin::check(Res, "Error in cuDevicePrimaryCtxRetain: %s"))
263 return Err;
265 if (auto Err = setContext())
266 return Err;
268 // Initialize stream pool.
269 if (auto Err = CUDAStreamManager.init(OMPX_InitialNumStreams))
270 return Err;
272 // Initialize event pool.
273 if (auto Err = CUDAEventManager.init(OMPX_InitialNumEvents))
274 return Err;
276 // Query attributes to determine number of threads/block and blocks/grid.
277 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
278 GridValues.GV_Max_Teams))
279 return Err;
281 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
282 GridValues.GV_Max_WG_Size))
283 return Err;
285 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE,
286 GridValues.GV_Warp_Size))
287 return Err;
289 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
290 ComputeCapability.Major))
291 return Err;
293 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
294 ComputeCapability.Minor))
295 return Err;
297 uint32_t NumMultiprocessors = 0;
298 uint32_t MaxThreadsPerSM = 0;
299 uint32_t WarpSize = 0;
300 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
301 NumMultiprocessors))
302 return Err;
303 if (auto Err =
304 getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
305 MaxThreadsPerSM))
306 return Err;
307 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
308 return Err;
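// The amount of hardware parallelism is the number of warps that can be
// resident at once: the SM count times the warps per SM (the maximum number
// of threads per SM divided by the warp size).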
309 HardwareParallelism = NumMultiprocessors * (MaxThreadsPerSM / WarpSize);
311 return Plugin::success();
314 /// Deinitialize the device and release its resources.
315 Error deinitImpl() override {
316 if (Context) {
317 if (auto Err = setContext())
318 return Err;
321 // Deinitialize the stream manager.
322 if (auto Err = CUDAStreamManager.deinit())
323 return Err;
325 if (auto Err = CUDAEventManager.deinit())
326 return Err;
328 // Close modules if necessary.
329 if (!LoadedImages.empty()) {
330 assert(Context && "Invalid CUDA context");
332 // Each image has its own module.
333 for (DeviceImageTy *Image : LoadedImages) {
334 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);
336 // Unload the module of the image.
337 if (auto Err = CUDAImage.unloadModule())
338 return Err;
342 if (Context) {
343 CUresult Res = cuDevicePrimaryCtxRelease(Device);
344 if (auto Err =
345 Plugin::check(Res, "Error in cuDevicePrimaryCtxRelease: %s"))
346 return Err;
349 // Invalidate context and device references.
350 Context = nullptr;
351 Device = CU_DEVICE_INVALID;
353 return Plugin::success();
356 /// Allocate and construct a CUDA kernel.
357 Expected<GenericKernelTy &>
358 constructKernel(const __tgt_offload_entry &KernelEntry) override {
359 // Allocate and construct the CUDA kernel.
360 CUDAKernelTy *CUDAKernel = Plugin::get().allocate<CUDAKernelTy>();
361 if (!CUDAKernel)
362 return Plugin::error("Failed to allocate memory for CUDA kernel");
364 new (CUDAKernel) CUDAKernelTy(KernelEntry.name);
366 return *CUDAKernel;
369 /// Set the current context to this device's context.
370 Error setContext() override {
371 CUresult Res = cuCtxSetCurrent(Context);
372 return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
375 /// NVIDIA returns the product of the SM count and the number of warps that
376 /// fit if the maximum number of threads were scheduled on each SM.
377 uint64_t getHardwareParallelism() const override {
378 return HardwareParallelism;
381 /// We want to set up the RPC server for host services to the GPU if it is
382 /// available.
383 bool shouldSetupRPCServer() const override {
384 return libomptargetSupportsRPC();
387 /// The RPC interface should have enough space for all available parallelism.
388 uint64_t requestedRPCPortCount() const override {
389 return getHardwareParallelism();
392 /// Get the stream of the asynchronous info structure or get a new one.
393 Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
394 // Get the stream (if any) from the async info.
395 Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
396 if (!Stream) {
397 // There was no stream; get an idle one.
398 if (auto Err = CUDAStreamManager.getResource(Stream))
399 return Err;
401 // Modify the async info's stream.
402 AsyncInfoWrapper.setQueueAs<CUstream>(Stream);
404 return Plugin::success();
407 /// Getters of CUDA references.
408 CUcontext getCUDAContext() const { return Context; }
409 CUdevice getCUDADevice() const { return Device; }
411 /// Load the binary image into the device and allocate an image object.
412 Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
413 int32_t ImageId) override {
414 if (auto Err = setContext())
415 return std::move(Err);
417 // Allocate and initialize the image object.
418 CUDADeviceImageTy *CUDAImage = Plugin::get().allocate<CUDADeviceImageTy>();
419 new (CUDAImage) CUDADeviceImageTy(ImageId, TgtImage);
421 // Load the CUDA module.
422 if (auto Err = CUDAImage->loadModule())
423 return std::move(Err);
425 return CUDAImage;
428 /// Allocate memory on the device or related to the device.
429 void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
430 if (Size == 0)
431 return nullptr;
433 if (auto Err = setContext()) {
434 REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
435 return nullptr;
438 void *MemAlloc = nullptr;
439 CUdeviceptr DevicePtr;
440 CUresult Res;
442 switch (Kind) {
443 case TARGET_ALLOC_DEFAULT:
444 case TARGET_ALLOC_DEVICE:
445 Res = cuMemAlloc(&DevicePtr, Size);
446 MemAlloc = (void *)DevicePtr;
447 break;
448 case TARGET_ALLOC_HOST:
449 Res = cuMemAllocHost(&MemAlloc, Size);
450 break;
451 case TARGET_ALLOC_SHARED:
452 Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
453 MemAlloc = (void *)DevicePtr;
454 break;
457 if (auto Err =
458 Plugin::check(Res, "Error in cuMemAlloc[Host|Managed]: %s")) {
459 REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
460 return nullptr;
462 return MemAlloc;
465 /// Deallocate memory on the device or related to the device.
466 int free(void *TgtPtr, TargetAllocTy Kind) override {
467 if (TgtPtr == nullptr)
468 return OFFLOAD_SUCCESS;
470 if (auto Err = setContext()) {
471 REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
472 return OFFLOAD_FAIL;
475 CUresult Res;
476 switch (Kind) {
477 case TARGET_ALLOC_DEFAULT:
478 case TARGET_ALLOC_DEVICE:
479 case TARGET_ALLOC_SHARED:
480 Res = cuMemFree((CUdeviceptr)TgtPtr);
481 break;
482 case TARGET_ALLOC_HOST:
483 Res = cuMemFreeHost(TgtPtr);
484 break;
487 if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
488 REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
489 return OFFLOAD_FAIL;
491 return OFFLOAD_SUCCESS;
494 /// Synchronize the current thread with the pending operations on the async info.
495 Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
496 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
497 CUresult Res;
498 // If we have an RPC server running on this device we will continuously
499 // query it for work rather than blocking.
500 if (!getRPCServer()) {
501 Res = cuStreamSynchronize(Stream);
502 } else {
503 do {
504 Res = cuStreamQuery(Stream);
505 if (auto Err = getRPCServer()->runServer(*this))
506 return Err;
507 } while (Res == CUDA_ERROR_NOT_READY);
510 // Once the stream is synchronized, return it to the stream pool and reset
511 // AsyncInfo. This is to make sure the synchronization only works for its
512 // own tasks.
513 AsyncInfo.Queue = nullptr;
514 if (auto Err = CUDAStreamManager.returnResource(Stream))
515 return Err;
517 return Plugin::check(Res, "Error in cuStreamSynchronize: %s");
520 /// CUDA supports VA management.
521 bool supportVAManagement() const override { return true; }
523 /// Allocates \p RSize bytes (rounded up to page size) and hints the CUDA
524 /// driver to map it to \p VAddr. The obtained address is stored in \p Addr.
525 /// On return, \p RSize contains the actual mapped size.
526 Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override {
527 CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
528 auto IHandle = DeviceMMaps.find(DVAddr);
529 size_t Size = *RSize;
531 if (Size == 0)
532 return Plugin::error("Memory Map Size must be larger than 0");
534 // Check if we have already mapped this address
535 if (IHandle != DeviceMMaps.end())
536 return Plugin::error("Address already memory mapped");
538 CUmemAllocationProp Prop = {};
539 size_t Granularity = 0;
541 size_t Free, Total;
542 CUresult Res = cuMemGetInfo(&Free, &Total);
543 if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s"))
544 return Err;
546 if (Size >= Free) {
547 *Addr = nullptr;
548 return Plugin::error(
549 "Canot map memory size larger than the available device memory");
552 // Currently, NVIDIA only supports pinned device types.
553 Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
554 Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
556 Prop.location.id = DeviceId;
557 Res = cuMemGetAllocationGranularity(&Granularity, &Prop,
558 CU_MEM_ALLOC_GRANULARITY_MINIMUM);
559 if (auto Err =
560 Plugin::check(Res, "Error in cuMemGetAllocationGranularity: %s"))
561 return Err;
563 if (Granularity == 0)
564 return Plugin::error("Wrong device Page size");
566 // Ceil to page size.
567 Size = roundUp(Size, Granularity);
569 // Create a handle for our allocation.
570 CUmemGenericAllocationHandle AHandle;
571 Res = cuMemCreate(&AHandle, Size, &Prop, 0);
572 if (auto Err = Plugin::check(Res, "Error in cuMemCreate: %s"))
573 return Err;
575 CUdeviceptr DevPtr = 0;
576 Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0);
577 if (auto Err = Plugin::check(Res, "Error in cuMemAddressReserve: %s"))
578 return Err;
580 Res = cuMemMap(DevPtr, Size, 0, AHandle, 0);
581 if (auto Err = Plugin::check(Res, "Error in cuMemMap: %s"))
582 return Err;
584 CUmemAccessDesc ADesc = {};
585 ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
586 ADesc.location.id = DeviceId;
587 ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
589 // Set the access rights for the mapped memory.
590 Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1);
591 if (auto Err = Plugin::check(Res, "Error in cuMemSetAccess: %s"))
592 return Err;
594 *Addr = reinterpret_cast<void *>(DevPtr);
595 *RSize = Size;
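// Remember the allocation handle for this mapping so memoryVAUnMap() can
// unmap and release it later.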
596 DeviceMMaps.insert({DevPtr, AHandle});
597 return Plugin::success();
600 /// Deallocate device memory and unmap the virtual address.
601 Error memoryVAUnMap(void *VAddr, size_t Size) override {
602 CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
603 auto IHandle = DeviceMMaps.find(DVAddr);
604 // The mapping does not exist.
605 if (IHandle == DeviceMMaps.end())
606 return Plugin::error("Addr is not MemoryMapped");
612 CUmemGenericAllocationHandle &AllocHandle = IHandle->second;
614 CUresult Res = cuMemUnmap(DVAddr, Size);
615 if (auto Err = Plugin::check(Res, "Error in cuMemUnmap: %s"))
616 return Err;
618 Res = cuMemRelease(AllocHandle);
619 if (auto Err = Plugin::check(Res, "Error in cuMemRelease: %s"))
620 return Err;
622 Res = cuMemAddressFree(DVAddr, Size);
623 if (auto Err = Plugin::check(Res, "Error in cuMemAddressFree: %s"))
624 return Err;
626 DeviceMMaps.erase(IHandle);
627 return Plugin::success();
630 /// Query for the completion of the pending operations on the async info.
631 Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
632 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
633 CUresult Res = cuStreamQuery(Stream);
635 // Not-ready streams must be considered successful operations.
636 if (Res == CUDA_ERROR_NOT_READY)
637 return Plugin::success();
639 // Once the stream is synchronized and the operations completed (or an error
640 // occurs), return it to the stream pool and reset AsyncInfo. This is to make
641 // sure the synchronization only works for its own tasks.
642 AsyncInfo.Queue = nullptr;
643 if (auto Err = CUDAStreamManager.returnResource(Stream))
644 return Err;
646 return Plugin::check(Res, "Error in cuStreamQuery: %s");
649 Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
650 // TODO: Register the buffer as CUDA host memory.
651 return HstPtr;
654 Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }
656 Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
657 void *&BaseDevAccessiblePtr,
658 size_t &BaseSize) const override {
659 // TODO: Implement pinning feature for CUDA.
660 return false;
663 /// Submit data to the device (host to device transfer).
664 Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
665 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
666 if (auto Err = setContext())
667 return Err;
669 CUstream Stream;
670 if (auto Err = getStream(AsyncInfoWrapper, Stream))
671 return Err;
673 CUresult Res = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
674 return Plugin::check(Res, "Error in cuMemcpyHtoDAsync: %s");
677 /// Retrieve data from the device (device to host transfer).
678 Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
679 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
680 if (auto Err = setContext())
681 return Err;
683 CUstream Stream;
684 if (auto Err = getStream(AsyncInfoWrapper, Stream))
685 return Err;
687 // If there is already pending work on the stream it could be waiting for
688 // someone to check the RPC server.
689 if (auto RPCServer = getRPCServer()) {
690 CUresult Res = cuStreamQuery(Stream);
691 while (Res == CUDA_ERROR_NOT_READY) {
692 if (auto Err = RPCServer->runServer(*this))
693 return Err;
694 Res = cuStreamQuery(Stream);
698 CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
699 return Plugin::check(Res, "Error in cuMemcpyDtoHAsync: %s");
702 /// Exchange data between two devices directly. We may use peer access if
703 /// the CUDA devices and driver allow it.
704 Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
705 void *DstPtr, int64_t Size,
706 AsyncInfoWrapperTy &AsyncInfoWrapper) override;
708 /// Initialize the async info for interoperability purposes.
709 Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
710 if (auto Err = setContext())
711 return Err;
713 CUstream Stream;
714 if (auto Err = getStream(AsyncInfoWrapper, Stream))
715 return Err;
717 return Plugin::success();
720 /// Initialize the device info for interoperability purposes.
721 Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
722 assert(Context && "Context is null");
723 assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device");
725 if (auto Err = setContext())
726 return Err;
728 if (!DeviceInfo->Context)
729 DeviceInfo->Context = Context;
731 if (!DeviceInfo->Device)
732 DeviceInfo->Device = reinterpret_cast<void *>(Device);
734 return Plugin::success();
737 /// Create an event.
738 Error createEventImpl(void **EventPtrStorage) override {
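// Reuse the caller-provided storage to hold the CUevent handle.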
739 CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage);
740 return CUDAEventManager.getResource(*Event);
743 /// Destroy a previously created event.
744 Error destroyEventImpl(void *EventPtr) override {
745 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
746 return CUDAEventManager.returnResource(Event);
749 /// Record the event.
750 Error recordEventImpl(void *EventPtr,
751 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
752 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
754 CUstream Stream;
755 if (auto Err = getStream(AsyncInfoWrapper, Stream))
756 return Err;
758 CUresult Res = cuEventRecord(Event, Stream);
759 return Plugin::check(Res, "Error in cuEventRecord: %s");
762 /// Make the stream wait on the event.
763 Error waitEventImpl(void *EventPtr,
764 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
765 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
767 CUstream Stream;
768 if (auto Err = getStream(AsyncInfoWrapper, Stream))
769 return Err;
771 // Do not use CU_EVENT_WAIT_DEFAULT here as it is only available from a
772 // specific CUDA version onwards, where it is defined as 0x0. In previous
773 // versions, per the CUDA API documentation, that argument has to be 0x0.
774 CUresult Res = cuStreamWaitEvent(Stream, Event, 0);
775 return Plugin::check(Res, "Error in cuStreamWaitEvent: %s");
778 /// Synchronize the current thread with the event.
779 Error syncEventImpl(void *EventPtr) override {
780 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
781 CUresult Res = cuEventSynchronize(Event);
782 return Plugin::check(Res, "Error in cuEventSynchronize: %s");
785 /// Print information about the device.
786 Error obtainInfoImpl(InfoQueueTy &Info) override {
787 char TmpChar[1000];
788 const char *TmpCharPtr;
789 size_t TmpSt;
790 int TmpInt;
792 CUresult Res = cuDriverGetVersion(&TmpInt);
793 if (Res == CUDA_SUCCESS)
794 Info.add("CUDA Driver Version", TmpInt);
796 Info.add("CUDA OpenMP Device Number", DeviceId);
798 Res = cuDeviceGetName(TmpChar, 1000, Device);
799 if (Res == CUDA_SUCCESS)
800 Info.add("Device Name", TmpChar);
802 Res = cuDeviceTotalMem(&TmpSt, Device);
803 if (Res == CUDA_SUCCESS)
804 Info.add("Global Memory Size", TmpSt, "bytes");
806 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
807 if (Res == CUDA_SUCCESS)
808 Info.add("Number of Multiprocessors", TmpInt);
810 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
811 if (Res == CUDA_SUCCESS)
812 Info.add("Concurrent Copy and Execution", (bool)TmpInt);
814 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, TmpInt);
815 if (Res == CUDA_SUCCESS)
816 Info.add("Total Constant Memory", TmpInt, "bytes");
818 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
819 TmpInt);
820 if (Res == CUDA_SUCCESS)
821 Info.add("Max Shared Memory per Block", TmpInt, "bytes");
823 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
824 if (Res == CUDA_SUCCESS)
825 Info.add("Registers per Block", TmpInt);
827 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_WARP_SIZE, TmpInt);
828 if (Res == CUDA_SUCCESS)
829 Info.add("Warp Size", TmpInt);
831 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, TmpInt);
832 if (Res == CUDA_SUCCESS)
833 Info.add("Maximum Threads per Block", TmpInt);
835 Info.add("Maximum Block Dimensions", "");
836 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt);
837 if (Res == CUDA_SUCCESS)
838 Info.add<InfoLevel2>("x", TmpInt);
839 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt);
840 if (Res == CUDA_SUCCESS)
841 Info.add<InfoLevel2>("y", TmpInt);
842 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt);
843 if (Res == CUDA_SUCCESS)
844 Info.add<InfoLevel2>("z", TmpInt);
846 Info.add("Maximum Grid Dimensions", "");
847 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
848 if (Res == CUDA_SUCCESS)
849 Info.add<InfoLevel2>("x", TmpInt);
850 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt);
851 if (Res == CUDA_SUCCESS)
852 Info.add<InfoLevel2>("y", TmpInt);
853 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt);
854 if (Res == CUDA_SUCCESS)
855 Info.add<InfoLevel2>("z", TmpInt);
857 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt);
858 if (Res == CUDA_SUCCESS)
859 Info.add("Maximum Memory Pitch", TmpInt, "bytes");
861 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, TmpInt);
862 if (Res == CUDA_SUCCESS)
863 Info.add("Texture Alignment", TmpInt, "bytes");
865 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
866 if (Res == CUDA_SUCCESS)
867 Info.add("Clock Rate", TmpInt, "kHz");
869 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
870 if (Res == CUDA_SUCCESS)
871 Info.add("Execution Timeout", (bool)TmpInt);
873 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_INTEGRATED, TmpInt);
874 if (Res == CUDA_SUCCESS)
875 Info.add("Integrated Device", (bool)TmpInt);
877 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, TmpInt);
878 if (Res == CUDA_SUCCESS)
879 Info.add("Can Map Host Memory", (bool)TmpInt);
881 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt);
882 if (Res == CUDA_SUCCESS) {
883 if (TmpInt == CU_COMPUTEMODE_DEFAULT)
884 TmpCharPtr = "Default";
885 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
886 TmpCharPtr = "Prohibited";
887 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
888 TmpCharPtr = "Exclusive process";
889 else
890 TmpCharPtr = "Unknown";
891 Info.add("Compute Mode", TmpCharPtr);
894 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt);
895 if (Res == CUDA_SUCCESS)
896 Info.add("Concurrent Kernels", (bool)TmpInt);
898 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt);
899 if (Res == CUDA_SUCCESS)
900 Info.add("ECC Enabled", (bool)TmpInt);
902 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
903 if (Res == CUDA_SUCCESS)
904 Info.add("Memory Clock Rate", TmpInt, "kHz");
906 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
907 if (Res == CUDA_SUCCESS)
908 Info.add("Memory Bus Width", TmpInt, "bits");
910 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt);
911 if (Res == CUDA_SUCCESS)
912 Info.add("L2 Cache Size", TmpInt, "bytes");
914 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
915 TmpInt);
916 if (Res == CUDA_SUCCESS)
917 Info.add("Max Threads Per SMP", TmpInt);
919 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt);
920 if (Res == CUDA_SUCCESS)
921 Info.add("Async Engines", TmpInt);
923 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt);
924 if (Res == CUDA_SUCCESS)
925 Info.add("Unified Addressing", (bool)TmpInt);
927 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt);
928 if (Res == CUDA_SUCCESS)
929 Info.add("Managed Memory", (bool)TmpInt);
931 Res =
932 getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt);
933 if (Res == CUDA_SUCCESS)
934 Info.add("Concurrent Managed Memory", (bool)TmpInt);
936 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
937 TmpInt);
938 if (Res == CUDA_SUCCESS)
939 Info.add("Preemption Supported", (bool)TmpInt);
941 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt);
942 if (Res == CUDA_SUCCESS)
943 Info.add("Cooperative Launch", (bool)TmpInt);
945 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt);
946 if (Res == CUDA_SUCCESS)
947 Info.add("Multi-Device Boars", (bool)TmpInt);
949 Info.add("Compute Capabilities", ComputeCapability.str());
951 return Plugin::success();
954 virtual bool shouldSetupDeviceMemoryPool() const override {
955 /// We use the CUDA malloc for now.
956 return false;
959 /// Getters and setters for stack and heap sizes.
960 Error getDeviceStackSize(uint64_t &Value) override {
961 return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
963 Error setDeviceStackSize(uint64_t Value) override {
964 return setCtxLimit(CU_LIMIT_STACK_SIZE, Value);
966 Error getDeviceHeapSize(uint64_t &Value) override {
967 return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
969 Error setDeviceHeapSize(uint64_t Value) override {
970 return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
972 Error getDeviceMemorySize(uint64_t &Value) override {
973 CUresult Res = cuDeviceTotalMem(&Value, Device);
974 return Plugin::check(Res, "Error in cuDeviceTotalMem: %s");
977 /// CUDA-specific functions for getting and setting context limits.
978 Error setCtxLimit(CUlimit Kind, uint64_t Value) {
979 CUresult Res = cuCtxSetLimit(Kind, Value);
980 return Plugin::check(Res, "Error in cuCtxSetLimit: %s");
982 Error getCtxLimit(CUlimit Kind, uint64_t &Value) {
983 CUresult Res = cuCtxGetLimit(&Value, Kind);
984 return Plugin::check(Res, "Error in cuCtxGetLimit: %s");
987 /// CUDA-specific function to get device attributes.
988 Error getDeviceAttr(uint32_t Kind, uint32_t &Value) {
989 // TODO: Warn if the new value is larger than the old.
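// cuDeviceGetAttribute reports attributes as signed int; the result is
// written directly into the caller's uint32_t storage.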
990 CUresult Res =
991 cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device);
992 return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s");
995 CUresult getDeviceAttrRaw(uint32_t Kind, int &Value) {
996 return cuDeviceGetAttribute(&Value, (CUdevice_attribute)Kind, Device);
999 /// See GenericDeviceTy::getComputeUnitKind().
1000 std::string getComputeUnitKind() const override {
1001 return ComputeCapability.str();
1004 /// Returns the clock frequency for the given NVPTX device.
1005 uint64_t getClockFrequency() const override { return 1000000000; }
1007 private:
1008 using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
1009 using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;
1011 /// Stream manager for CUDA streams.
1012 CUDAStreamManagerTy CUDAStreamManager;
1014 /// Event manager for CUDA events.
1015 CUDAEventManagerTy CUDAEventManager;
1017 /// The device's context. This context should be set before performing
1018 /// operations on the device.
1019 CUcontext Context = nullptr;
1022 /// The CUDA device handle.
1022 CUdevice Device = CU_DEVICE_INVALID;
1024 /// The memory-mapped addresses and their allocation handles.
1025 std::unordered_map<CUdeviceptr, CUmemGenericAllocationHandle> DeviceMMaps;
1027 /// The compute capability of the corresponding CUDA device.
1028 struct ComputeCapabilityTy {
1029 uint32_t Major;
1030 uint32_t Minor;
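/// Return the compute capability in the "sm_<Major><Minor>" form used by
/// the NVPTX toolchain, e.g., compute capability 7.0 becomes "sm_70".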
1031 std::string str() const {
1032 return "sm_" + std::to_string(Major * 10 + Minor);
1034 } ComputeCapability;
1036 /// The maximum number of warps that can be resident on all the SMs
1037 /// simultaneously.
1038 uint32_t HardwareParallelism = 0;
1041 Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
1042 uint32_t NumThreads, uint64_t NumBlocks,
1043 KernelArgsTy &KernelArgs, void *Args,
1044 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
1045 CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
1047 CUstream Stream;
1048 if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
1049 return Err;
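// Use the larger of the dynamic shared memory requested through the kernel
// arguments and the dynamic memory size configured for the device.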
1051 uint32_t MaxDynCGroupMem =
1052 std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
1054 CUresult Res =
1055 cuLaunchKernel(Func, NumBlocks, /* gridDimY */ 1,
1056 /* gridDimZ */ 1, NumThreads,
1057 /* blockDimY */ 1, /* blockDimZ */ 1, MaxDynCGroupMem,
1058 Stream, (void **)Args, nullptr);
1059 return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
1062 /// Class implementing the CUDA-specific functionalities of the global handler.
1063 class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
1064 public:
1065 /// Get the metadata of a global from the device. The name and size of the
1066 /// global are read from DeviceGlobal and the address of the global is written
1067 /// to DeviceGlobal.
1068 Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
1069 DeviceImageTy &Image,
1070 GlobalTy &DeviceGlobal) override {
1071 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
1073 const char *GlobalName = DeviceGlobal.getName().data();
1075 size_t CUSize;
1076 CUdeviceptr CUPtr;
1077 CUresult Res =
1078 cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName);
1079 if (auto Err = Plugin::check(Res, "Error in cuModuleGetGlobal for '%s': %s",
1080 GlobalName))
1081 return Err;
1083 if (CUSize != DeviceGlobal.getSize())
1084 return Plugin::error(
1085 "Failed to load global '%s' due to size mismatch (%zu != %zu)",
1086 GlobalName, CUSize, (size_t)DeviceGlobal.getSize());
1088 DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
1089 return Plugin::success();
1093 /// Class implementing the CUDA-specific functionalities of the plugin.
1094 struct CUDAPluginTy final : public GenericPluginTy {
1095 /// Create a CUDA plugin.
1096 CUDAPluginTy() : GenericPluginTy(getTripleArch()) {}
1098 /// This class should not be copied.
1099 CUDAPluginTy(const CUDAPluginTy &) = delete;
1100 CUDAPluginTy(CUDAPluginTy &&) = delete;
1102 /// Initialize the plugin and return the number of devices.
1103 Expected<int32_t> initImpl() override {
1104 CUresult Res = cuInit(0);
1105 if (Res == CUDA_ERROR_INVALID_HANDLE) {
1106 // Cannot call cuGetErrorString if dlsym failed.
1107 DP("Failed to load CUDA shared library\n");
1108 return 0;
1111 #ifdef OMPT_SUPPORT
1112 ompt::connectLibrary();
1113 #endif
1115 if (Res == CUDA_ERROR_NO_DEVICE) {
1116 // Do not initialize if there are no devices.
1117 DP("There are no devices supporting CUDA.\n");
1118 return 0;
1121 if (auto Err = Plugin::check(Res, "Error in cuInit: %s"))
1122 return std::move(Err);
1124 // Get the number of devices.
1125 int NumDevices;
1126 Res = cuDeviceGetCount(&NumDevices);
1127 if (auto Err = Plugin::check(Res, "Error in cuDeviceGetCount: %s"))
1128 return std::move(Err);
1130 // Do not initialize if there are no devices.
1131 if (NumDevices == 0)
1132 DP("There are no devices supporting CUDA.\n");
1134 return NumDevices;
1137 /// Deinitialize the plugin.
1138 Error deinitImpl() override { return Plugin::success(); }
1140 /// Get the ELF code for recognizing the compatible image binary.
1141 uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; }
1143 Triple::ArchType getTripleArch() const override {
1144 // TODO: I think we can drop the support for 32-bit NVPTX devices.
1145 return Triple::nvptx64;
1148 /// Check whether the image is compatible with the available CUDA devices.
1149 Expected<bool> isImageCompatible(__tgt_image_info *Info) const override {
1150 for (int32_t DevId = 0; DevId < getNumDevices(); ++DevId) {
1151 CUdevice Device;
1152 CUresult Res = cuDeviceGet(&Device, DevId);
1153 if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
1154 return std::move(Err);
1156 int32_t Major, Minor;
1157 Res = cuDeviceGetAttribute(
1158 &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
1159 if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
1160 return std::move(Err);
1162 Res = cuDeviceGetAttribute(
1163 &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
1164 if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
1165 return std::move(Err);
1167 StringRef ArchStr(Info->Arch);
1168 StringRef PrefixStr("sm_");
1169 if (!ArchStr.startswith(PrefixStr))
1170 return Plugin::error("Unrecognized image arch %s", ArchStr.data());
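// Parse the major and minor compute capability digits from the "sm_XY"
// arch string of the image.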
1172 int32_t ImageMajor = ArchStr[PrefixStr.size() + 0] - '0';
1173 int32_t ImageMinor = ArchStr[PrefixStr.size() + 1] - '0';
1175 // A cubin generated for a certain compute capability is supported to run
1176 // on any GPU with the same major revision and same or higher minor
1177 // revision.
1178 if (Major != ImageMajor || Minor < ImageMinor)
1179 return false;
1181 return true;
1185 Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
1186 GenericDeviceTy &DstGenericDevice,
1187 void *DstPtr, int64_t Size,
1188 AsyncInfoWrapperTy &AsyncInfoWrapper) {
1189 if (auto Err = setContext())
1190 return Err;
1192 CUDADeviceTy &DstDevice = static_cast<CUDADeviceTy &>(DstGenericDevice);
1194 CUresult Res;
1195 int32_t DstDeviceId = DstDevice.DeviceId;
1196 CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr;
1197 CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr;
1199 int CanAccessPeer = 0;
1200 if (DeviceId != DstDeviceId) {
1201 // Make sure the lock is released before performing the copies.
1202 std::lock_guard<std::mutex> Lock(PeerAccessesLock);
1204 switch (PeerAccesses[DstDeviceId]) {
1205 case PeerAccessState::AVAILABLE:
1206 CanAccessPeer = 1;
1207 break;
1208 case PeerAccessState::UNAVAILABLE:
1209 CanAccessPeer = 0;
1210 break;
1211 case PeerAccessState::PENDING:
1212 // Check whether the source device can access the destination device.
1213 Res = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device);
1214 if (auto Err = Plugin::check(Res, "Error in cuDeviceCanAccessPeer: %s"))
1215 return Err;
1217 if (CanAccessPeer) {
1218 Res = cuCtxEnablePeerAccess(DstDevice.Context, 0);
1219 if (Res == CUDA_ERROR_TOO_MANY_PEERS) {
1220 // Resources may be exhausted due to many P2P links.
1221 CanAccessPeer = 0;
1222 DP("Too many P2P so fall back to D2D memcpy");
1223 } else if (auto Err =
1224 Plugin::check(Res, "Error in cuCtxEnablePeerAccess: %s"))
1225 return Err;
1227 PeerAccesses[DstDeviceId] = (CanAccessPeer)
1228 ? PeerAccessState::AVAILABLE
1229 : PeerAccessState::UNAVAILABLE;
1233 CUstream Stream;
1234 if (auto Err = getStream(AsyncInfoWrapper, Stream))
1235 return Err;
1237 if (CanAccessPeer) {
1238 // TODO: Should we fallback to D2D if peer access fails?
1239 Res = cuMemcpyPeerAsync(CUDstPtr, Context, CUSrcPtr, DstDevice.Context,
1240 Size, Stream);
1241 return Plugin::check(Res, "Error in cuMemcpyPeerAsync: %s");
1244 // Fallback to D2D copy.
1245 Res = cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream);
1246 return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s");
1249 GenericPluginTy *Plugin::createPlugin() { return new CUDAPluginTy(); }
1251 GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {
1252 return new CUDADeviceTy(DeviceId, NumDevices);
1255 GenericGlobalHandlerTy *Plugin::createGlobalHandler() {
1256 return new CUDAGlobalHandlerTy();
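/// Translate a CUresult code into an llvm::Error, appending the readable
/// description obtained from cuGetErrorString to the formatted message.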
1259 template <typename... ArgsTy>
1260 Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
1261 CUresult ResultCode = static_cast<CUresult>(Code);
1262 if (ResultCode == CUDA_SUCCESS)
1263 return Error::success();
1265 const char *Desc = "Unknown error";
1266 CUresult Ret = cuGetErrorString(ResultCode, &Desc);
1267 if (Ret != CUDA_SUCCESS)
1268 REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
1270 return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
1271 ErrFmt, Args..., Desc);
1274 } // namespace plugin
1275 } // namespace target
1276 } // namespace omp
1277 } // namespace llvm