//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL NextGen for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <mutex>
#include <string>
#include <unordered_map>

#include "Environment.h"
#include "GlobalHandler.h"
#include "OmptCallback.h"
#include "PluginInterface.h"

#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/Error.h"

namespace llvm {
namespace omp {
namespace target {
namespace plugin {
/// Forward declarations for all specialized data structures.
struct CUDAKernelTy;
struct CUDADeviceTy;
struct CUDAPluginTy;

/// Class implementing the CUDA device images properties.
struct CUDADeviceImageTy : public DeviceImageTy {
  /// Create the CUDA image with the id and the target image pointer.
  CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
      : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {}

  /// Load the image as a CUDA module.
  Error loadModule() {
    assert(!Module && "Module already loaded");

    CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
    if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s"))
      return Err;

    return Plugin::success();
  }

  /// Unload the CUDA module corresponding to the image.
  Error unloadModule() {
    assert(Module && "Module not loaded");

    CUresult Res = cuModuleUnload(Module);
    if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s"))
      return Err;
    Module = nullptr;

    return Plugin::success();
  }

  /// Getter of the CUDA module.
  CUmodule getModule() const { return Module; }

private:
  /// The CUDA module that loaded the image.
  CUmodule Module;
};
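
// Note: cuModuleLoadDataEx accepts a cubin, PTX, or fatbin image resident in
// host memory, so loadModule() covers both offline-compiled and JIT-compiled
// images. A minimal standalone sketch of the same load/unload cycle:
//
//   CUmodule M;
//   if (cuModuleLoadDataEx(&M, ImageStart, 0, nullptr, nullptr) == CUDA_SUCCESS)
//     cuModuleUnload(M);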

/// Class implementing the CUDA kernel functionalities which derives from the
/// generic kernel class.
struct CUDAKernelTy : public GenericKernelTy {
  /// Create a CUDA kernel with a name and an execution mode.
  CUDAKernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}

  /// Initialize the CUDA kernel.
  Error initImpl(GenericDeviceTy &GenericDevice,
                 DeviceImageTy &Image) override {
    CUresult Res;
    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);

    // Retrieve the function pointer of the kernel.
    Res = cuModuleGetFunction(&Func, CUDAImage.getModule(), getName());
    if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s",
                                 getName()))
      return Err;

    // Check that the function pointer is valid.
    if (!Func)
      return Plugin::error("Invalid function for kernel %s", getName());

    int MaxThreads;
    Res = cuFuncGetAttribute(&MaxThreads,
                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
    if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
      return Err;

    // The maximum number of threads cannot exceed the maximum of the kernel.
    MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);

    return Plugin::success();
  }

  /// Launch the CUDA kernel function.
  Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
                   uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

private:
  /// The CUDA kernel function to execute.
  CUfunction Func;
};
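
// For example, if a kernel was compiled with __launch_bounds__(256), the
// CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK query in initImpl() returns 256 and
// MaxNumThreads is clamped from the device-wide maximum (e.g. 1024) down to
// 256 before any launch.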

/// Class wrapping a CUDA stream reference. These are the objects handled by
/// the Stream Manager for the CUDA plugin.
struct CUDAStreamRef final : public GenericDeviceResourceRef {
  /// The underlying handle type for streams.
  using HandleTy = CUstream;

  /// Create an empty reference to an invalid stream.
  CUDAStreamRef() : Stream(nullptr) {}

  /// Create a reference to an existing stream.
  CUDAStreamRef(HandleTy Stream) : Stream(Stream) {}

  /// Create a new stream and save the reference. The reference must be empty
  /// before calling this function.
  Error create(GenericDeviceTy &Device) override {
    if (Stream)
      return Plugin::error("Creating an existing stream");

    CUresult Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
    if (auto Err = Plugin::check(Res, "Error in cuStreamCreate: %s"))
      return Err;

    return Plugin::success();
  }

  /// Destroy the referenced stream and invalidate the reference. The
  /// reference must be to a valid stream before calling this function.
  Error destroy(GenericDeviceTy &Device) override {
    if (!Stream)
      return Plugin::error("Destroying an invalid stream");

    CUresult Res = cuStreamDestroy(Stream);
    if (auto Err = Plugin::check(Res, "Error in cuStreamDestroy: %s"))
      return Err;
    Stream = nullptr;

    return Plugin::success();
  }

  /// Get the underlying CUDA stream.
  operator HandleTy() const { return Stream; }

private:
  /// The reference to the CUDA stream.
  HandleTy Stream;
};
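
// These references are pooled by the resource managers defined below. A
// typical acquire/release cycle, as performed by getStream() and
// synchronizeImpl():
//
//   CUstream Stream;
//   if (auto Err = CUDAStreamManager.getResource(Stream))
//     return Err;
//   // ... enqueue asynchronous work on Stream ...
//   if (auto Err = CUDAStreamManager.returnResource(Stream))
//     return Err;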

/// Class wrapping a CUDA event reference. These are the objects handled by
/// the Event Manager for the CUDA plugin.
struct CUDAEventRef final : public GenericDeviceResourceRef {
  /// The underlying handle type for events.
  using HandleTy = CUevent;

  /// Create an empty reference to an invalid event.
  CUDAEventRef() : Event(nullptr) {}

  /// Create a reference to an existing event.
  CUDAEventRef(HandleTy Event) : Event(Event) {}

  /// Create a new event and save the reference. The reference must be empty
  /// before calling this function.
  Error create(GenericDeviceTy &Device) override {
    if (Event)
      return Plugin::error("Creating an existing event");

    CUresult Res = cuEventCreate(&Event, CU_EVENT_DEFAULT);
    if (auto Err = Plugin::check(Res, "Error in cuEventCreate: %s"))
      return Err;

    return Plugin::success();
  }

  /// Destroy the referenced event and invalidate the reference. The reference
  /// must be to a valid event before calling this function.
  Error destroy(GenericDeviceTy &Device) override {
    if (!Event)
      return Plugin::error("Destroying an invalid event");

    CUresult Res = cuEventDestroy(Event);
    if (auto Err = Plugin::check(Res, "Error in cuEventDestroy: %s"))
      return Err;
    Event = nullptr;

    return Plugin::success();
  }

  /// Get the underlying CUevent.
  operator HandleTy() const { return Event; }

private:
  /// The reference to the CUDA event.
  HandleTy Event;
};

/// Class implementing the CUDA device functionalities which derives from the
/// generic device class.
struct CUDADeviceTy : public GenericDeviceTy {
  // Create a CUDA device with a device id and the default CUDA grid values.
  CUDADeviceTy(int32_t DeviceId, int32_t NumDevices)
      : GenericDeviceTy(DeviceId, NumDevices, NVPTXGridValues),
        CUDAStreamManager(*this), CUDAEventManager(*this) {}

  /// Initialize the device, its resources and get its properties.
  Error initImpl(GenericPluginTy &Plugin) override {
    CUresult Res = cuDeviceGet(&Device, DeviceId);
    if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
      return Err;

    // Query the current flags of the primary context and set its flags if
    // it is inactive.
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Res = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (auto Err =
            Plugin::check(Res, "Error in cuDevicePrimaryCtxGetState: %s"))
      return Err;

    if (FormerPrimaryCtxIsActive) {
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
             "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "The primary context is inactive, set its flags to "
           "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Res = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (auto Err =
              Plugin::check(Res, "Error in cuDevicePrimaryCtxSetFlags: %s"))
        return Err;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Res = cuDevicePrimaryCtxRetain(&Context, Device);
    if (auto Err = Plugin::check(Res, "Error in cuDevicePrimaryCtxRetain: %s"))
      return Err;

    if (auto Err = setContext())
      return Err;

    // Initialize stream pool.
    if (auto Err = CUDAStreamManager.init(OMPX_InitialNumStreams))
      return Err;

    // Initialize event pool.
    if (auto Err = CUDAEventManager.init(OMPX_InitialNumEvents))
      return Err;

    // Query attributes to determine number of threads/block and blocks/grid.
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                                 GridValues.GV_Max_Teams))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                 GridValues.GV_Max_WG_Size))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                                 GridValues.GV_Warp_Size))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                 ComputeCapability.Major))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                 ComputeCapability.Minor))
      return Err;

    uint32_t NumMultiprocessors = 0;
    uint32_t MaxThreadsPerSM = 0;
    uint32_t WarpSize = 0;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                 NumMultiprocessors))
      return Err;
    if (auto Err =
            getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                          MaxThreadsPerSM))
      return Err;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
      return Err;
    HardwareParallelism = NumMultiprocessors * (MaxThreadsPerSM / WarpSize);

    return Plugin::success();
  }
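
  // As a worked example of the computation above: an NVIDIA A100 reports 108
  // SMs, 2048 maximum threads per SM, and a warp size of 32, so
  // HardwareParallelism becomes 108 * (2048 / 32) = 6912 resident warps.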

  /// Deinitialize the device and release its resources.
  Error deinitImpl() override {
    if (auto Err = setContext())
      return Err;

    // Deinitialize the stream manager.
    if (auto Err = CUDAStreamManager.deinit())
      return Err;

    if (auto Err = CUDAEventManager.deinit())
      return Err;

    // Close modules if necessary.
    if (!LoadedImages.empty()) {
      assert(Context && "Invalid CUDA context");

      // Each image has its own module.
      for (DeviceImageTy *Image : LoadedImages) {
        CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);

        // Unload the module of the image.
        if (auto Err = CUDAImage.unloadModule())
          return Err;
      }
    }

    if (Context) {
      CUresult Res = cuDevicePrimaryCtxRelease(Device);
      if (auto Err =
              Plugin::check(Res, "Error in cuDevicePrimaryCtxRelease: %s"))
        return Err;
    }

    // Invalidate context and device references.
    Context = nullptr;
    Device = CU_DEVICE_INVALID;

    return Plugin::success();
  }

  /// Allocate and construct a CUDA kernel.
  Expected<GenericKernelTy &>
  constructKernel(const __tgt_offload_entry &KernelEntry) override {
    // Allocate and construct the CUDA kernel.
    CUDAKernelTy *CUDAKernel = Plugin::get().allocate<CUDAKernelTy>();
    if (!CUDAKernel)
      return Plugin::error("Failed to allocate memory for CUDA kernel");

    new (CUDAKernel) CUDAKernelTy(KernelEntry.name);

    return *CUDAKernel;
  }

  /// Set the current context to this device's context.
  Error setContext() override {
    CUresult Res = cuCtxSetCurrent(Context);
    return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
  }

  /// NVIDIA returns the product of the SM count and the number of warps that
  /// fit if the maximum number of threads were scheduled on each SM.
  uint64_t getHardwareParallelism() const override {
    return HardwareParallelism;
  }

  /// We want to set up the RPC server for host services to the GPU if it is
  /// available.
  bool shouldSetupRPCServer() const override {
    return libomptargetSupportsRPC();
  }

  /// The RPC interface should have enough space for all available parallelism.
  uint64_t requestedRPCPortCount() const override {
    return getHardwareParallelism();
  }

  /// Get the stream of the asynchronous info structure or get a new one.
  Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
    // Get the stream (if any) from the async info.
    Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
    if (!Stream) {
      // There was no stream; get an idle one.
      if (auto Err = CUDAStreamManager.getResource(Stream))
        return Err;

      // Modify the async info's stream.
      AsyncInfoWrapper.setQueueAs<CUstream>(Stream);
    }
    return Plugin::success();
  }

  /// Getters of CUDA references.
  CUcontext getCUDAContext() const { return Context; }
  CUdevice getCUDADevice() const { return Device; }

  /// Load the binary image into the device and allocate an image object.
  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
                                           int32_t ImageId) override {
    if (auto Err = setContext())
      return std::move(Err);

    // Allocate and initialize the image object.
    CUDADeviceImageTy *CUDAImage = Plugin::get().allocate<CUDADeviceImageTy>();
    new (CUDAImage) CUDADeviceImageTy(ImageId, TgtImage);

    // Load the CUDA module.
    if (auto Err = CUDAImage->loadModule())
      return std::move(Err);

    return CUDAImage;
  }

  /// Allocate memory on the device or related to the device.
  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
    if (Size == 0)
      return nullptr;

    if (auto Err = setContext()) {
      REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
      return nullptr;
    }

    void *MemAlloc = nullptr;
    CUdeviceptr DevicePtr;
    CUresult Res;

    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      Res = cuMemAlloc(&DevicePtr, Size);
      MemAlloc = (void *)DevicePtr;
      break;
    case TARGET_ALLOC_HOST:
      Res = cuMemAllocHost(&MemAlloc, Size);
      break;
    case TARGET_ALLOC_SHARED:
      Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
      MemAlloc = (void *)DevicePtr;
      break;
    }

    if (auto Err =
            Plugin::check(Res, "Error in cuMemAlloc[Host|Managed]: %s")) {
      REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
      return nullptr;
    }
    return MemAlloc;
  }
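
  // The three allocation kinds map directly onto the driver API:
  // TARGET_ALLOC_DEVICE uses cuMemAlloc (device-only memory), TARGET_ALLOC_HOST
  // uses cuMemAllocHost (page-locked host memory the device can access), and
  // TARGET_ALLOC_SHARED uses cuMemAllocManaged (managed memory migrated on
  // demand between host and device).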

  /// Deallocate memory on the device or related to the device.
  int free(void *TgtPtr, TargetAllocTy Kind) override {
    if (TgtPtr == nullptr)
      return OFFLOAD_SUCCESS;

    if (auto Err = setContext()) {
      REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
      return OFFLOAD_FAIL;
    }

    CUresult Res;
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
    case TARGET_ALLOC_SHARED:
      Res = cuMemFree((CUdeviceptr)TgtPtr);
      break;
    case TARGET_ALLOC_HOST:
      Res = cuMemFreeHost(TgtPtr);
      break;
    }

    if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
      REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Synchronize current thread with the pending operations on the async info.
  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
    CUresult Res;

    // If we have an RPC server running on this device we will continuously
    // query it for work rather than blocking.
    if (!getRPCServer()) {
      Res = cuStreamSynchronize(Stream);
    } else {
      do {
        Res = cuStreamQuery(Stream);
        if (auto Err = getRPCServer()->runServer(*this))
          return Err;
      } while (Res == CUDA_ERROR_NOT_READY);
    }

    // Once the stream is synchronized, return it to stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    AsyncInfo.Queue = nullptr;
    if (auto Err = CUDAStreamManager.returnResource(Stream))
      return Err;

    return Plugin::check(Res, "Error in cuStreamSynchronize: %s");
  }

  /// CUDA supports VA management.
  bool supportVAManagement() const override { return true; }

  /// Allocates \p RSize bytes (rounded up to page size) and hints the CUDA
  /// driver to map it to \p VAddr. The obtained address is stored in \p Addr.
  /// At return \p RSize contains the actual size.
  Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override {
    CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
    auto IHandle = DeviceMMaps.find(DVAddr);
    size_t Size = *RSize;

    if (Size == 0)
      return Plugin::error("Memory Map Size must be larger than 0");

    // Check if we have already mapped this address.
    if (IHandle != DeviceMMaps.end())
      return Plugin::error("Address already memory mapped");

    CUmemAllocationProp Prop = {};
    size_t Granularity = 0;

    size_t Free, Total;
    CUresult Res = cuMemGetInfo(&Free, &Total);
    if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s"))
      return Err;

    if (Size >= Free)
      return Plugin::error(
          "Cannot map memory size larger than the available device memory");

    // Currently NVIDIA only supports pinned device types.
    Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;

    Prop.location.id = DeviceId;
    Res = cuMemGetAllocationGranularity(&Granularity, &Prop,
                                        CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (auto Err =
            Plugin::check(Res, "Error in cuMemGetAllocationGranularity: %s"))
      return Err;

    if (Granularity == 0)
      return Plugin::error("Wrong device Page size");

    // Ceil to page size.
    Size = roundUp(Size, Granularity);

    // Create a handler of our allocation.
    CUmemGenericAllocationHandle AHandle;
    Res = cuMemCreate(&AHandle, Size, &Prop, 0);
    if (auto Err = Plugin::check(Res, "Error in cuMemCreate: %s"))
      return Err;

    CUdeviceptr DevPtr = 0;
    Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0);
    if (auto Err = Plugin::check(Res, "Error in cuMemAddressReserve: %s"))
      return Err;

    Res = cuMemMap(DevPtr, Size, 0, AHandle, 0);
    if (auto Err = Plugin::check(Res, "Error in cuMemMap: %s"))
      return Err;

    CUmemAccessDesc ADesc = {};
    ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    ADesc.location.id = DeviceId;
    ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

    Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1);
    if (auto Err = Plugin::check(Res, "Error in cuMemSetAccess: %s"))
      return Err;

    *Addr = reinterpret_cast<void *>(DevPtr);
    *RSize = Size;
    DeviceMMaps.insert({DevPtr, AHandle});
    return Plugin::success();
  }
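
  // The mapping above follows the usual CUDA virtual memory management
  // sequence. A condensed sketch of the same flow, with error handling and the
  // Prop/ADesc setup elided:
  //
  //   CUmemGenericAllocationHandle H;
  //   cuMemCreate(&H, Size, &Prop, 0);              // physical allocation
  //   CUdeviceptr P;
  //   cuMemAddressReserve(&P, Size, 0, DVAddr, 0);  // VA reservation (hint)
  //   cuMemMap(P, Size, 0, H, 0);                   // back the VA range
  //   cuMemSetAccess(P, Size, &ADesc, 1);           // grant RW access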

  /// De-allocates device memory and unmaps the virtual address.
  Error memoryVAUnMap(void *VAddr, size_t Size) override {
    CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
    auto IHandle = DeviceMMaps.find(DVAddr);
    // Mapping does not exist.
    if (IHandle == DeviceMMaps.end())
      return Plugin::error("Addr is not MemoryMapped");

    CUmemGenericAllocationHandle &AllocHandle = IHandle->second;

    CUresult Res = cuMemUnmap(DVAddr, Size);
    if (auto Err = Plugin::check(Res, "Error in cuMemUnmap: %s"))
      return Err;

    Res = cuMemRelease(AllocHandle);
    if (auto Err = Plugin::check(Res, "Error in cuMemRelease: %s"))
      return Err;

    Res = cuMemAddressFree(DVAddr, Size);
    if (auto Err = Plugin::check(Res, "Error in cuMemAddressFree: %s"))
      return Err;

    DeviceMMaps.erase(IHandle);
    return Plugin::success();
  }

  /// Query for the completion of the pending operations on the async info.
  Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
    CUresult Res = cuStreamQuery(Stream);

    // Not ready streams must be considered as successful operations.
    if (Res == CUDA_ERROR_NOT_READY)
      return Plugin::success();

    // Once the stream is synchronized and the operations completed (or an
    // error occurs), return it to stream pool and reset AsyncInfo. This is to
    // make sure the synchronization only works for its own tasks.
    AsyncInfo.Queue = nullptr;
    if (auto Err = CUDAStreamManager.returnResource(Stream))
      return Err;

    return Plugin::check(Res, "Error in cuStreamQuery: %s");
  }

  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
    // TODO: Register the buffer as CUDA host memory.
    return HstPtr;
  }

  Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }

  Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
                                 void *&BaseDevAccessiblePtr,
                                 size_t &BaseSize) const override {
    // TODO: Implement pinning feature for CUDA.
    return false;
  }

  /// Submit data to the device (host to device transfer).
  Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    if (auto Err = setContext())
      return Err;

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    CUresult Res =
        cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    return Plugin::check(Res, "Error in cuMemcpyHtoDAsync: %s");
  }

  /// Retrieve data from the device (device to host transfer).
  Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    if (auto Err = setContext())
      return Err;

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    // If there is already pending work on the stream it could be waiting for
    // someone to check the RPC server.
    if (auto RPCServer = getRPCServer()) {
      CUresult Res = cuStreamQuery(Stream);
      while (Res == CUDA_ERROR_NOT_READY) {
        if (auto Err = RPCServer->runServer(*this))
          return Err;

        Res = cuStreamQuery(Stream);
      }
    }

    CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    return Plugin::check(Res, "Error in cuMemcpyDtoHAsync: %s");
  }

  /// Exchange data between two devices directly. We may use peer access if
  /// the CUDA devices and driver allow it.
  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
                         void *DstPtr, int64_t Size,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) override;

  /// Initialize the async info for interoperability purposes.
  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    if (auto Err = setContext())
      return Err;

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    return Plugin::success();
  }

  /// Initialize the device info for interoperability purposes.
  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
    assert(Context && "Context is null");
    assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device");

    if (auto Err = setContext())
      return Err;

    if (!DeviceInfo->Context)
      DeviceInfo->Context = Context;

    if (!DeviceInfo->Device)
      DeviceInfo->Device = reinterpret_cast<void *>(Device);

    return Plugin::success();
  }

  /// Create an event.
  Error createEventImpl(void **EventPtrStorage) override {
    CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage);
    return CUDAEventManager.getResource(*Event);
  }

  /// Destroy a previously created event.
  Error destroyEventImpl(void *EventPtr) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
    return CUDAEventManager.returnResource(Event);
  }

  /// Record the event.
  Error recordEventImpl(void *EventPtr,
                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    CUresult Res = cuEventRecord(Event, Stream);
    return Plugin::check(Res, "Error in cuEventRecord: %s");
  }

  /// Make the stream wait on the event.
  Error waitEventImpl(void *EventPtr,
                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    // Do not use CU_EVENT_WAIT_DEFAULT here: it is only available from
    // specific CUDA versions onwards (defined as 0x0); in earlier versions,
    // per the CUDA API documentation, the argument has to be 0x0.
    CUresult Res = cuStreamWaitEvent(Stream, Event, 0);
    return Plugin::check(Res, "Error in cuStreamWaitEvent: %s");
  }

  /// Synchronize the current thread with the event.
  Error syncEventImpl(void *EventPtr) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
    CUresult Res = cuEventSynchronize(Event);
    return Plugin::check(Res, "Error in cuEventSynchronize: %s");
  }

  /// Print information about the device.
  Error obtainInfoImpl(InfoQueueTy &Info) override {
    char TmpChar[1000];
    const char *TmpCharPtr;
    size_t TmpSt;
    int TmpInt;

    CUresult Res = cuDriverGetVersion(&TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("CUDA Driver Version", TmpInt);

    Info.add("CUDA OpenMP Device Number", DeviceId);

    Res = cuDeviceGetName(TmpChar, 1000, Device);
    if (Res == CUDA_SUCCESS)
      Info.add("Device Name", TmpChar);

    Res = cuDeviceTotalMem(&TmpSt, Device);
    if (Res == CUDA_SUCCESS)
      Info.add("Global Memory Size", TmpSt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Number of Multiprocessors", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Concurrent Copy and Execution", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Total Constant Memory", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                           TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Max Shared Memory per Block", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Registers per Block", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_WARP_SIZE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Warp Size", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Maximum Threads per Block", TmpInt);

    Info.add("Maximum Block Dimensions", "");
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("x", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("y", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("z", TmpInt);

    Info.add("Maximum Grid Dimensions", "");
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("x", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("y", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("z", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Maximum Memory Pitch", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Texture Alignment", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Clock Rate", TmpInt, "kHz");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Execution Timeout", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_INTEGRATED, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Integrated Device", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Can Map Host Memory", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt);
    if (Res == CUDA_SUCCESS) {
      if (TmpInt == CU_COMPUTEMODE_DEFAULT)
        TmpCharPtr = "Default";
      else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
        TmpCharPtr = "Prohibited";
      else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
        TmpCharPtr = "Exclusive process";
      else
        TmpCharPtr = "Unknown";
      Info.add("Compute Mode", TmpCharPtr);
    }

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Concurrent Kernels", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("ECC Enabled", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Memory Clock Rate", TmpInt, "kHz");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Memory Bus Width", TmpInt, "bits");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("L2 Cache Size", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                           TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Max Threads Per SMP", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Async Engines", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Unified Addressing", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Managed Memory", (bool)TmpInt);

    Res =
        getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Concurrent Managed Memory", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
                           TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Preemption Supported", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Cooperative Launch", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Multi-Device Boards", (bool)TmpInt);

    Info.add("Compute Capabilities", ComputeCapability.str());

    return Plugin::success();
  }

  virtual bool shouldSetupDeviceMemoryPool() const override {
    /// We use the CUDA malloc for now.
    return false;
  }

  /// Getters and setters for stack and heap sizes.
  Error getDeviceStackSize(uint64_t &Value) override {
    return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
  }
  Error setDeviceStackSize(uint64_t Value) override {
    return setCtxLimit(CU_LIMIT_STACK_SIZE, Value);
  }
  Error getDeviceHeapSize(uint64_t &Value) override {
    return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
  }
  Error setDeviceHeapSize(uint64_t Value) override {
    return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
  }
  Error getDeviceMemorySize(uint64_t &Value) override {
    CUresult Res = cuDeviceTotalMem(&Value, Device);
    return Plugin::check(Res, "Error in getDeviceMemorySize %s");
  }

  /// CUDA-specific functions for getting and setting context limits.
  Error setCtxLimit(CUlimit Kind, uint64_t Value) {
    CUresult Res = cuCtxSetLimit(Kind, Value);
    return Plugin::check(Res, "Error in cuCtxSetLimit: %s");
  }
  Error getCtxLimit(CUlimit Kind, uint64_t &Value) {
    CUresult Res = cuCtxGetLimit(&Value, Kind);
    return Plugin::check(Res, "Error in cuCtxGetLimit: %s");
  }

  /// CUDA-specific function to get device attributes.
  Error getDeviceAttr(uint32_t Kind, uint32_t &Value) {
    // TODO: Warn if the new value is larger than the old.
    CUresult Res =
        cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device);
    return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s");
  }

  CUresult getDeviceAttrRaw(uint32_t Kind, int &Value) {
    return cuDeviceGetAttribute(&Value, (CUdevice_attribute)Kind, Device);
  }

  /// See GenericDeviceTy::getComputeUnitKind().
  std::string getComputeUnitKind() const override {
    return ComputeCapability.str();
  }

  /// Returns the clock frequency for the given NVPTX device.
  uint64_t getClockFrequency() const override { return 1000000000; }

private:
  using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
  using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;

  /// Stream manager for CUDA streams.
  CUDAStreamManagerTy CUDAStreamManager;

  /// Event manager for CUDA events.
  CUDAEventManagerTy CUDAEventManager;

  /// The device's context. This context should be set before performing
  /// operations on the device.
  CUcontext Context = nullptr;

  /// The CUDA device handler.
  CUdevice Device = CU_DEVICE_INVALID;

  /// The memory mapped addresses and their handles.
  std::unordered_map<CUdeviceptr, CUmemGenericAllocationHandle> DeviceMMaps;
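
  /// Peer-access bookkeeping used by dataExchangeImpl(). These declarations
  /// are reconstructed from their uses there (the container choice is an
  /// assumption); PENDING is listed first so that a default-constructed entry
  /// means the link has not been checked yet.
  enum class PeerAccessState : uint8_t { PENDING, AVAILABLE, UNAVAILABLE };
  std::unordered_map<int32_t, PeerAccessState> PeerAccesses;
  std::mutex PeerAccessesLock;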

  /// The compute capability of the corresponding CUDA device.
  struct ComputeCapabilityTy {
    uint32_t Major;
    uint32_t Minor;
    std::string str() const {
      return "sm_" + std::to_string(Major * 10 + Minor);
    }
  } ComputeCapability;

  /// The maximum number of warps that can be resident on all the SMs
  /// simultaneously.
  uint32_t HardwareParallelism = 0;
};

Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                               uint32_t NumThreads, uint64_t NumBlocks,
                               KernelArgsTy &KernelArgs, void *Args,
                               AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);

  CUstream Stream;
  if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
    return Err;

  uint32_t MaxDynCGroupMem =
      std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());

  CUresult Res =
      cuLaunchKernel(Func, NumBlocks, /* gridDimY */ 1,
                     /* gridDimZ */ 1, NumThreads,
                     /* blockDimY */ 1, /* blockDimZ */ 1, MaxDynCGroupMem,
                     Stream, (void **)Args, nullptr);
  return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
}

/// Class implementing the CUDA-specific functionalities of the global handler.
class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
public:
  /// Get the metadata of a global from the device. The name and size of the
  /// global is read from DeviceGlobal and the address of the global is written
  /// to DeviceGlobal.
  Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
                                    DeviceImageTy &Image,
                                    GlobalTy &DeviceGlobal) override {
    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);

    const char *GlobalName = DeviceGlobal.getName().data();

    size_t CUSize;
    CUdeviceptr CUPtr;
    CUresult Res =
        cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName);
    if (auto Err = Plugin::check(Res, "Error in cuModuleGetGlobal for '%s': %s",
                                 GlobalName))
      return Err;

    if (CUSize != DeviceGlobal.getSize())
      return Plugin::error(
          "Failed to load global '%s' due to size mismatch (%zu != %zu)",
          GlobalName, CUSize, (size_t)DeviceGlobal.getSize());

    DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
    return Plugin::success();
  }
};
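
// The generic layer uses this hook to resolve device-side symbols by name,
// e.g. when reading or writing plugin-managed globals such as the device
// environment declared in Environment.h; the size reported by
// cuModuleGetGlobal is validated against the host's expectation before any
// transfer happens.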

/// Class implementing the CUDA-specific functionalities of the plugin.
struct CUDAPluginTy final : public GenericPluginTy {
  /// Create a CUDA plugin.
  CUDAPluginTy() : GenericPluginTy(getTripleArch()) {}

  /// This class should not be copied.
  CUDAPluginTy(const CUDAPluginTy &) = delete;
  CUDAPluginTy(CUDAPluginTy &&) = delete;

  /// Initialize the plugin and return the number of devices.
  Expected<int32_t> initImpl() override {
    CUresult Res = cuInit(0);
    if (Res == CUDA_ERROR_INVALID_HANDLE) {
      // Cannot call cuGetErrorString if dlsym failed.
      DP("Failed to load CUDA shared library\n");
      return 0;
    }

#ifdef OMPT_SUPPORT
    ompt::connectLibrary();
#endif

    if (Res == CUDA_ERROR_NO_DEVICE) {
      // Do not initialize if there are no devices.
      DP("There are no devices supporting CUDA.\n");
      return 0;
    }

    if (auto Err = Plugin::check(Res, "Error in cuInit: %s"))
      return std::move(Err);

    // Get the number of devices.
    int NumDevices;
    Res = cuDeviceGetCount(&NumDevices);
    if (auto Err = Plugin::check(Res, "Error in cuDeviceGetCount: %s"))
      return std::move(Err);

    // Do not initialize if there are no devices.
    if (NumDevices == 0)
      DP("There are no devices supporting CUDA.\n");

    return NumDevices;
  }

  /// Deinitialize the plugin.
  Error deinitImpl() override { return Plugin::success(); }

  /// Get the ELF code for recognizing the compatible image binary.
  uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; }

  Triple::ArchType getTripleArch() const override {
    // TODO: I think we can drop the support for 32-bit NVPTX devices.
    return Triple::nvptx64;
  }

  /// Check whether the image is compatible with the available CUDA devices.
  Expected<bool> isImageCompatible(__tgt_image_info *Info) const override {
    for (int32_t DevId = 0; DevId < getNumDevices(); ++DevId) {
      CUdevice Device;
      CUresult Res = cuDeviceGet(&Device, DevId);
      if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
        return std::move(Err);

      int32_t Major, Minor;
      Res = cuDeviceGetAttribute(
          &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
      if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
        return std::move(Err);

      Res = cuDeviceGetAttribute(
          &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
      if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
        return std::move(Err);

      StringRef ArchStr(Info->Arch);
      StringRef PrefixStr("sm_");
      if (!ArchStr.startswith(PrefixStr))
        return Plugin::error("Unrecognized image arch %s", ArchStr.data());

      int32_t ImageMajor = ArchStr[PrefixStr.size() + 0] - '0';
      int32_t ImageMinor = ArchStr[PrefixStr.size() + 1] - '0';

      // A cubin generated for a certain compute capability is supported to
      // run on any GPU with the same major revision and same or higher minor
      // revision.
      if (Major != ImageMajor || Minor < ImageMinor)
        return false;
    }
    return true;
  }
};
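
// For example, under this rule an image built for sm_70 is accepted on an
// sm_75 device (same major revision, higher minor) but rejected on sm_62 or
// sm_80 devices (different major revision).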

Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
                                     GenericDeviceTy &DstGenericDevice,
                                     void *DstPtr, int64_t Size,
                                     AsyncInfoWrapperTy &AsyncInfoWrapper) {
  if (auto Err = setContext())
    return Err;

  CUDADeviceTy &DstDevice = static_cast<CUDADeviceTy &>(DstGenericDevice);

  CUresult Res;
  int32_t DstDeviceId = DstDevice.DeviceId;
  CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr;
  CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr;

  int CanAccessPeer = 0;
  if (DeviceId != DstDeviceId) {
    // Make sure the lock is released before performing the copies.
    std::lock_guard<std::mutex> Lock(PeerAccessesLock);

    switch (PeerAccesses[DstDeviceId]) {
    case PeerAccessState::AVAILABLE:
      CanAccessPeer = 1;
      break;
    case PeerAccessState::UNAVAILABLE:
      CanAccessPeer = 0;
      break;
    case PeerAccessState::PENDING:
      // Check whether the source device can access the destination device.
      Res = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device);
      if (auto Err = Plugin::check(Res, "Error in cuDeviceCanAccessPeer: %s"))
        return Err;

      if (CanAccessPeer) {
        Res = cuCtxEnablePeerAccess(DstDevice.Context, 0);
        if (Res == CUDA_ERROR_TOO_MANY_PEERS) {
          // Resources may be exhausted due to many P2P links.
          CanAccessPeer = 0;
          DP("Too many P2P links; falling back to D2D memcpy\n");
        } else if (auto Err = Plugin::check(
                       Res, "Error in cuCtxEnablePeerAccess: %s"))
          return Err;
      }

      PeerAccesses[DstDeviceId] = (CanAccessPeer)
                                      ? PeerAccessState::AVAILABLE
                                      : PeerAccessState::UNAVAILABLE;
      break;
    }
  }

  CUstream Stream;
  if (auto Err = getStream(AsyncInfoWrapper, Stream))
    return Err;

  if (CanAccessPeer) {
    // TODO: Should we fallback to D2D if peer access fails?
    Res = cuMemcpyPeerAsync(CUDstPtr, DstDevice.Context, CUSrcPtr, Context,
                            Size, Stream);
    return Plugin::check(Res, "Error in cuMemcpyPeerAsync: %s");
  }

  // Fallback to D2D copy.
  Res = cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream);
  return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s");
}

GenericPluginTy *Plugin::createPlugin() { return new CUDAPluginTy(); }

GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {
  return new CUDADeviceTy(DeviceId, NumDevices);
}

GenericGlobalHandlerTy *Plugin::createGlobalHandler() {
  return new CUDAGlobalHandlerTy();
}

template <typename... ArgsTy>
Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
  CUresult ResultCode = static_cast<CUresult>(Code);
  if (ResultCode == CUDA_SUCCESS)
    return Error::success();

  const char *Desc = "Unknown error";
  CUresult Ret = cuGetErrorString(ResultCode, &Desc);
  if (Ret != CUDA_SUCCESS)
    REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);

  return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
                                                    ErrFmt, Args..., Desc);
}

} // namespace plugin
} // namespace target
} // namespace omp
} // namespace llvm