1 //===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL NextGen for AMDGPU machine
11 //===----------------------------------------------------------------------===//
13 #include <atomic>
14 #include <cassert>
15 #include <cstddef>
16 #include <deque>
17 #include <mutex>
18 #include <string>
19 #include <system_error>
20 #include <unistd.h>
21 #include <unordered_map>
23 #include "Debug.h"
24 #include "Environment.h"
25 #include "GlobalHandler.h"
26 #include "OmptCallback.h"
27 #include "PluginInterface.h"
28 #include "Utilities.h"
29 #include "UtilitiesRTL.h"
30 #include "omptarget.h"
32 #include "llvm/ADT/SmallString.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/StringRef.h"
35 #include "llvm/BinaryFormat/ELF.h"
36 #include "llvm/Frontend/OpenMP/OMPConstants.h"
37 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
38 #include "llvm/Support/Error.h"
39 #include "llvm/Support/FileSystem.h"
40 #include "llvm/Support/MemoryBuffer.h"
41 #include "llvm/Support/Program.h"
42 #include "llvm/Support/raw_ostream.h"
44 #if defined(__has_include)
45 #if __has_include("hsa/hsa.h")
46 #include "hsa/hsa.h"
47 #include "hsa/hsa_ext_amd.h"
48 #elif __has_include("hsa.h")
49 #include "hsa.h"
50 #include "hsa_ext_amd.h"
51 #endif
52 #else
53 #include "hsa/hsa.h"
54 #include "hsa/hsa_ext_amd.h"
55 #endif
57 namespace llvm {
58 namespace omp {
59 namespace target {
60 namespace plugin {
62 /// Forward declarations for all specialized data structures.
63 struct AMDGPUKernelTy;
64 struct AMDGPUDeviceTy;
65 struct AMDGPUPluginTy;
66 struct AMDGPUStreamTy;
67 struct AMDGPUEventTy;
68 struct AMDGPUStreamManagerTy;
69 struct AMDGPUEventManagerTy;
70 struct AMDGPUDeviceImageTy;
71 struct AMDGPUMemoryManagerTy;
72 struct AMDGPUMemoryPoolTy;
74 namespace utils {
76 /// Iterate elements using an HSA iterate function. Do not use this function
77 /// directly; use the specialized ones below instead.
78 template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
79 hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
80 auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
81 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
82 return (*Unwrapped)(Elem);
84 return Func(L, static_cast<void *>(&Cb));
87 /// Iterate elements using an HSA iterate function passing a parameter. Do not
88 /// use this function directly; use the specialized ones below instead.
89 template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy,
90 typename CallbackTy>
91 hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
92 auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
93 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
94 return (*Unwrapped)(Elem);
96 return Func(FuncArg, L, static_cast<void *>(&Cb));
99 /// Iterate elements using an HSA iterate function passing a parameter. Do not
100 /// use this function directly; use the specialized ones below instead.
101 template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy,
102 typename IterFuncArgTy, typename CallbackTy>
103 hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
104 auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t {
105 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
106 return (*Unwrapped)(Elem1, Elem2);
108 return Func(FuncArg, L, static_cast<void *>(&Cb));
111 /// Iterate agents.
112 template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) {
113 hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback);
114 return Plugin::check(Status, "Error in hsa_iterate_agents: %s");
117 /// Iterate ISAs of an agent.
118 template <typename CallbackTy>
119 Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) {
120 hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb);
121 return Plugin::check(Status, "Error in hsa_agent_iterate_isas: %s");
124 /// Iterate memory pools of an agent.
125 template <typename CallbackTy>
126 Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
127 hsa_status_t Status = iterate<hsa_amd_memory_pool_t>(
128 hsa_amd_agent_iterate_memory_pools, Agent, Cb);
129 return Plugin::check(Status,
130 "Error in hsa_amd_agent_iterate_memory_pools: %s");
133 } // namespace utils
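// Note: the iterate() helpers above adapt C++ callables to the C-style HSA
// iteration APIs. An illustrative sketch (hypothetical usage, not code from
// this file) of collecting all GPU agents through the wrapper:
//
//   llvm::SmallVector<hsa_agent_t> GPUAgents;
//   auto Err = utils::iterateAgents([&](hsa_agent_t Agent) {
//     hsa_device_type_t DeviceType;
//     if (hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType) !=
//         HSA_STATUS_SUCCESS)
//       return HSA_STATUS_ERROR;
//     if (DeviceType == HSA_DEVICE_TYPE_GPU)
//       GPUAgents.push_back(Agent);
//     return HSA_STATUS_SUCCESS;
//   });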
135 /// Utility class representing generic resource references to AMDGPU resources.
136 template <typename ResourceTy>
137 struct AMDGPUResourceRef : public GenericDeviceResourceRef {
138 /// The underlying handle type for resources.
139 using HandleTy = ResourceTy *;
141 /// Create an empty reference to an invalid resource.
142 AMDGPUResourceRef() : Resource(nullptr) {}
144 /// Create a reference to an existing resource.
145 AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {}
147 virtual ~AMDGPUResourceRef() {}
149 /// Create a new resource and save the reference. The reference must be empty
150 /// before calling this function.
151 Error create(GenericDeviceTy &Device) override;
153 /// Destroy the referenced resource and invalidate the reference. The
154 /// reference must be to a valid resource before calling this function.
155 Error destroy(GenericDeviceTy &Device) override {
156 if (!Resource)
157 return Plugin::error("Destroying an invalid resource");
159 if (auto Err = Resource->deinit())
160 return Err;
162 delete Resource;
164 Resource = nullptr;
165 return Plugin::success();
168 /// Get the underlying resource handle.
169 operator HandleTy() const { return Resource; }
171 private:
172 /// The handle to the actual resource.
173 HandleTy Resource;
176 /// Class holding an HSA memory pool.
177 struct AMDGPUMemoryPoolTy {
178 /// Create a memory pool from an HSA memory pool.
179 AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool)
180 : MemoryPool(MemoryPool), GlobalFlags(0) {}
182 /// Initialize the memory pool retrieving its properties.
183 Error init() {
184 if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment))
185 return Err;
187 if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
188 return Err;
190 return Plugin::success();
193 /// Getter of the HSA memory pool.
194 hsa_amd_memory_pool_t get() const { return MemoryPool; }
196 /// Indicate the segment to which the memory pool belongs.
197 bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); }
198 bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); }
199 bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); }
200 bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); }
202 /// Indicate if it is fine-grained memory. Valid only for global.
203 bool isFineGrained() const {
204 assert(isGlobal() && "Not global memory");
205 return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED);
208 /// Indicate if it is coarse-grained memory. Valid only for global.
209 bool isCoarseGrained() const {
210 assert(isGlobal() && "Not global memory");
211 return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED);
214 /// Indicate if it supports storing kernel arguments. Valid only for global.
215 bool supportsKernelArgs() const {
216 assert(isGlobal() && "Not global memory");
217 return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT);
220 /// Allocate memory on the memory pool.
221 Error allocate(size_t Size, void **PtrStorage) {
222 hsa_status_t Status =
223 hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage);
224 return Plugin::check(Status, "Error in hsa_amd_memory_pool_allocate: %s");
227 /// Return memory to the memory pool.
228 Error deallocate(void *Ptr) {
229 hsa_status_t Status = hsa_amd_memory_pool_free(Ptr);
230 return Plugin::check(Status, "Error in hsa_amd_memory_pool_free: %s");
233 /// Allow the device to access a specific allocation.
234 Error enableAccess(void *Ptr, int64_t Size,
235 const llvm::SmallVector<hsa_agent_t> &Agents) const {
236 #ifdef OMPTARGET_DEBUG
237 for (hsa_agent_t Agent : Agents) {
238 hsa_amd_memory_pool_access_t Access;
239 if (auto Err =
240 getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access))
241 return Err;
243 // The agent is not allowed to access the memory pool in any case. Do not
244 // continue because otherwise it results in undefined behavior.
245 if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)
246 return Plugin::error("An agent is not allowed to access a memory pool");
248 #endif
250 // Access is allowed but disabled by default, so enable it for the given agents.
251 hsa_status_t Status =
252 hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr);
253 return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
256 /// Get attribute from the memory pool.
257 template <typename Ty>
258 Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
259 hsa_status_t Status;
260 Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
261 return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s");
264 template <typename Ty>
265 hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
266 return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
269 /// Get attribute from the memory pool relating to an agent.
270 template <typename Ty>
271 Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind,
272 Ty &Value) const {
273 hsa_status_t Status;
274 Status =
275 hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value);
276 return Plugin::check(Status,
277 "Error in hsa_amd_agent_memory_pool_get_info: %s");
280 private:
281 /// The HSA memory pool.
282 hsa_amd_memory_pool_t MemoryPool;
285 /// The segment to which the memory pool belongs.
285 hsa_amd_segment_t Segment;
287 /// The global flags of the memory pool. Only valid if the memory pool belongs to
288 /// the global segment.
289 uint32_t GlobalFlags;
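// A minimal usage sketch of the memory pool wrapper above (illustrative only;
// the real call sites live in the device and memory manager code further
// below):
//
//   AMDGPUMemoryPoolTy Pool(HsaPool); // HsaPool is an hsa_amd_memory_pool_t.
//   if (auto Err = Pool.init())
//     return Err;
//   void *Ptr = nullptr;
//   if (auto Err = Pool.allocate(Size, &Ptr))
//     return Err;
//   if (auto Err = Pool.enableAccess(Ptr, Size, Agents))
//     return Err;
//   if (auto Err = Pool.deallocate(Ptr))
//     return Err;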
292 /// Class that implements a memory manager that gets memory from a specific
293 /// memory pool.
294 struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
296 /// Create an empty memory manager.
297 AMDGPUMemoryManagerTy() : MemoryPool(nullptr), MemoryManager(nullptr) {}
299 /// Initialize the memory manager from a memory pool.
300 Error init(AMDGPUMemoryPoolTy &MemoryPool) {
301 const uint32_t Threshold = 1 << 30;
302 this->MemoryManager = new MemoryManagerTy(*this, Threshold);
303 this->MemoryPool = &MemoryPool;
304 return Plugin::success();
307 /// Deinitialize the memory manager and free its allocations.
308 Error deinit() {
309 assert(MemoryManager && "Invalid memory manager");
311 // Delete and invalidate the memory manager. At this point, the memory
312 // manager will deallocate all its allocations.
313 delete MemoryManager;
314 MemoryManager = nullptr;
316 return Plugin::success();
319 /// Reuse or allocate memory through the memory manager.
320 Error allocate(size_t Size, void **PtrStorage) {
321 assert(MemoryManager && "Invalid memory manager");
322 assert(PtrStorage && "Invalid pointer storage");
324 *PtrStorage = MemoryManager->allocate(Size, nullptr);
325 if (*PtrStorage == nullptr)
326 return Plugin::error("Failure to allocate from AMDGPU memory manager");
328 return Plugin::success();
331 /// Release an allocation to be reused.
332 Error deallocate(void *Ptr) {
333 assert(Ptr && "Invalid pointer");
335 if (MemoryManager->free(Ptr))
336 return Plugin::error("Failure to deallocate from AMDGPU memory manager");
338 return Plugin::success();
341 private:
342 /// Allocation callback that will be called once the memory manager does not
343 /// have any more previously allocated buffers to reuse.
344 void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
346 /// Deallocation callback that will be called by the memory manager.
347 int free(void *TgtPtr, TargetAllocTy Kind) override {
348 if (auto Err = MemoryPool->deallocate(TgtPtr)) {
349 consumeError(std::move(Err));
350 return OFFLOAD_FAIL;
352 return OFFLOAD_SUCCESS;
355 /// The memory pool used to allocate memory.
356 AMDGPUMemoryPoolTy *MemoryPool;
358 /// Reference to the actual memory manager.
359 MemoryManagerTy *MemoryManager;
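// Note on the 1 << 30 threshold used in init() above: MemoryManagerTy caches
// freed buffers for reuse, and the threshold bounds the allocation sizes it
// manages (an assumption based on the generic MemoryManagerTy behavior;
// larger requests are expected to go straight to the underlying pool).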
362 /// Class implementing the AMDGPU device images' properties.
363 struct AMDGPUDeviceImageTy : public DeviceImageTy {
364 /// Create the AMDGPU image with the id and the target image pointer.
365 AMDGPUDeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
366 : DeviceImageTy(ImageId, TgtImage) {}
368 /// Prepare and load the executable corresponding to the image.
369 Error loadExecutable(const AMDGPUDeviceTy &Device);
371 /// Unload the executable.
372 Error unloadExecutable() {
373 hsa_status_t Status = hsa_executable_destroy(Executable);
374 if (auto Err = Plugin::check(Status, "Error in hsa_executable_destroy: %s"))
375 return Err;
377 Status = hsa_code_object_destroy(CodeObject);
378 return Plugin::check(Status, "Error in hsa_code_object_destroy: %s");
381 /// Get the executable.
382 hsa_executable_t getExecutable() const { return Executable; }
384 /// Get the code object version of the ELF.
385 uint16_t getELFABIVersion() const { return ELFABIVersion; }
387 /// Find an HSA device symbol by its name on the executable.
388 Expected<hsa_executable_symbol_t>
389 findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
391 /// Get additional info for kernel, e.g., register spill counts
392 std::optional<utils::KernelMetaDataTy>
393 getKernelInfo(StringRef Identifier) const {
394 auto It = KernelInfoMap.find(Identifier);
396 if (It == KernelInfoMap.end())
397 return {};
399 return It->second;
402 private:
403 /// The executable loaded on the agent.
404 hsa_executable_t Executable;
405 hsa_code_object_t CodeObject;
406 StringMap<utils::KernelMetaDataTy> KernelInfoMap;
407 uint16_t ELFABIVersion;
410 /// Class implementing the AMDGPU kernel functionalities which derives from the
411 /// generic kernel class.
412 struct AMDGPUKernelTy : public GenericKernelTy {
413 /// Create an AMDGPU kernel with a name and an execution mode.
414 AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {}
416 /// Initialize the AMDGPU kernel.
417 Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
418 AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
420 // Kernel symbols have a ".kd" suffix.
421 std::string KernelName(getName());
422 KernelName += ".kd";
424 // Find the symbol on the device executable.
425 auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName);
426 if (!SymbolOrErr)
427 return SymbolOrErr.takeError();
429 hsa_executable_symbol_t Symbol = *SymbolOrErr;
430 hsa_symbol_kind_t SymbolType;
431 hsa_status_t Status;
433 // Retrieve different properties of the kernel symbol.
434 std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
435 {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
436 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
437 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
438 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
439 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
441 for (auto &Info : RequiredInfos) {
442 Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
443 if (auto Err = Plugin::check(
444 Status, "Error in hsa_executable_symbol_get_info: %s"))
445 return Err;
448 // Make sure it is a kernel symbol.
449 if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
450 return Plugin::error("Symbol %s is not a kernel function");
452 // TODO: Read the kernel descriptor for the max threads per block. May be
453 // read from the image.
455 ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
456 DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
458 // Get additional kernel info read from image
459 KernelInfo = AMDImage.getKernelInfo(getName());
460 if (!KernelInfo.has_value())
461 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(),
462 "Could not read extra information for kernel %s.", getName());
464 return Plugin::success();
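// Note: the ".kd" suffix used above names the kernel descriptor symbol that
// the AMDGPU backend emits for every kernel. The KERNEL_OBJECT info queried
// from that symbol is the device address of the descriptor, which is what the
// dispatch packet's kernel_object field expects (see
// AMDGPUQueueTy::pushKernelLaunch below).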
467 /// Launch the AMDGPU kernel function.
468 Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
469 uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
470 AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
472 /// Print more elaborate kernel launch info for AMDGPU
473 Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
474 KernelArgsTy &KernelArgs, uint32_t NumThreads,
475 uint64_t NumBlocks) const override;
477 /// Get group and private segment kernel size.
478 uint32_t getGroupSize() const { return GroupSize; }
479 uint32_t getPrivateSize() const { return PrivateSize; }
481 /// Get the HSA kernel object representing the kernel function.
482 uint64_t getKernelObject() const { return KernelObject; }
484 /// Get the size of the implicit arguments based on the code object version.
485 /// @return 56 for cov4 and 256 for cov5
486 uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
488 private:
489 /// The kernel object to execute.
490 uint64_t KernelObject;
492 /// The args, group and private segments sizes required by a kernel instance.
493 uint32_t ArgsSize;
494 uint32_t GroupSize;
495 uint32_t PrivateSize;
497 /// The size of implicit kernel arguments.
498 uint32_t ImplicitArgsSize;
500 /// Additional Info for the AMD GPU Kernel
501 std::optional<utils::KernelMetaDataTy> KernelInfo;
504 /// Class representing an HSA signal. Signals are used to define dependencies
505 /// between asynchronous operations: kernel launches and memory transfers.
506 struct AMDGPUSignalTy {
507 /// Create an empty signal.
508 AMDGPUSignalTy() : HSASignal({0}), UseCount() {}
509 AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {}
511 /// Initialize the signal with an initial value.
512 Error init(uint32_t InitialValue = 1) {
513 hsa_status_t Status =
514 hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal);
515 return Plugin::check(Status, "Error in hsa_signal_create: %s");
518 /// Deinitialize the signal.
519 Error deinit() {
520 hsa_status_t Status = hsa_signal_destroy(HSASignal);
521 return Plugin::check(Status, "Error in hsa_signal_destroy: %s");
524 /// Wait until the signal gets a zero value.
525 Error wait(const uint64_t ActiveTimeout = 0, RPCServerTy *RPCServer = nullptr,
526 GenericDeviceTy *Device = nullptr) const {
527 if (ActiveTimeout && !RPCServer) {
528 hsa_signal_value_t Got = 1;
529 Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
530 ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
531 if (Got == 0)
532 return Plugin::success();
535 // If there is an RPC device attached to this stream we run it as a server.
536 uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
537 auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
538 while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
539 Timeout, WaitState) != 0) {
540 if (RPCServer && Device)
541 if (auto Err = RPCServer->runServer(*Device))
542 return Err;
544 return Plugin::success();
547 /// Load the value on the signal.
548 hsa_signal_value_t load() const {
549 return hsa_signal_load_scacquire(HSASignal);
552 /// Decrement the signal by one.
553 void signal() {
554 assert(load() > 0 && "Invalid signal value");
555 hsa_signal_subtract_screlease(HSASignal, 1);
558 /// Reset the signal value before reusing the signal. Do not call this
559 /// function if the signal is being currently used by any watcher, such as a
560 /// plugin thread or the HSA runtime.
561 void reset() { hsa_signal_store_screlease(HSASignal, 1); }
563 /// Increase the number of concurrent uses.
564 void increaseUseCount() { UseCount.increase(); }
566 /// Decrease the number of concurrent uses and return whether it was the last one.
567 bool decreaseUseCount() { return UseCount.decrease(); }
569 hsa_signal_t get() const { return HSASignal; }
571 private:
572 /// The underlying HSA signal.
573 hsa_signal_t HSASignal;
575 /// Reference counter for tracking the concurrent use count. This is mainly
576 /// used for knowing how many streams are using the signal.
577 RefCountTy<> UseCount;
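// Typical life cycle of a signal with this wrapper: init() creates the HSA
// signal with value 1, the signal is handed to an HSA operation as its
// completion signal, wait() blocks until the runtime decrements it to 0, and
// reset() restores the value to 1 before the signal is reused. The use count
// only tracks how many streams still reference the signal; it does not affect
// the HSA signal value itself.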
580 /// Classes for holding AMDGPU signals and managing signals.
581 using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>;
582 using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;
584 /// Class holding an HSA queue to submit kernel and barrier packets.
585 struct AMDGPUQueueTy {
586 /// Create an empty queue.
587 AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
589 /// Lazily initialize a new queue belonging to a specific agent.
590 Error init(hsa_agent_t Agent, int32_t QueueSize) {
591 if (Queue)
592 return Plugin::success();
593 hsa_status_t Status =
594 hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
595 nullptr, UINT32_MAX, UINT32_MAX, &Queue);
596 return Plugin::check(Status, "Error in hsa_queue_create: %s");
599 /// Deinitialize the queue and destroy its resources.
600 Error deinit() {
601 std::lock_guard<std::mutex> Lock(Mutex);
602 if (!Queue)
603 return Plugin::success();
604 hsa_status_t Status = hsa_queue_destroy(Queue);
605 return Plugin::check(Status, "Error in hsa_queue_destroy: %s");
608 /// Returns the number of streams this queue is currently assigned to.
609 uint32_t getUserCount() const { return NumUsers; }
611 /// Returns whether the underlying HSA queue is initialized.
612 bool isInitialized() { return Queue != nullptr; }
614 /// Decrement user count of the queue object.
615 void removeUser() { --NumUsers; }
617 /// Increase user count of the queue object.
618 void addUser() { ++NumUsers; }
620 /// Push a kernel launch to the queue. The kernel launch requires an output
621 /// signal and can define an optional input signal (nullptr if none).
622 Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
623 uint32_t NumThreads, uint64_t NumBlocks,
624 uint32_t GroupSize, AMDGPUSignalTy *OutputSignal,
625 AMDGPUSignalTy *InputSignal) {
626 assert(OutputSignal && "Invalid kernel output signal");
628 // Lock the queue during the packet publishing process. Notice this blocks
629 // the addition of other packets to the queue. The following piece of code
630 // should be lightweight; do not block the thread, allocate memory, etc.
631 std::lock_guard<std::mutex> Lock(Mutex);
632 assert(Queue && "Interacted with a non-initialized queue!");
634 // Avoid defining the input dependency if already satisfied.
635 if (InputSignal && !InputSignal->load())
636 InputSignal = nullptr;
638 // Add a barrier packet before the kernel packet in case there is a pending
639 // preceding operation. The barrier packet will delay the processing of
640 // subsequent packets in the queue until the barrier's input signals are
641 // satisfied. No output signal is needed because the dependency is already
642 // guaranteed by the queue barrier itself.
643 if (InputSignal)
644 if (auto Err = pushBarrierImpl(nullptr, InputSignal))
645 return Err;
647 // Now prepare the kernel packet.
648 uint64_t PacketId;
649 hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
650 assert(Packet && "Invalid packet");
652 // The first 32 bits of the packet are written after the other fields
653 uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
654 Packet->workgroup_size_x = NumThreads;
655 Packet->workgroup_size_y = 1;
656 Packet->workgroup_size_z = 1;
657 Packet->reserved0 = 0;
658 Packet->grid_size_x = NumBlocks * NumThreads;
659 Packet->grid_size_y = 1;
660 Packet->grid_size_z = 1;
661 Packet->private_segment_size = Kernel.getPrivateSize();
662 Packet->group_segment_size = GroupSize;
663 Packet->kernel_object = Kernel.getKernelObject();
664 Packet->kernarg_address = KernelArgs;
665 Packet->reserved2 = 0;
666 Packet->completion_signal = OutputSignal->get();
668 // Publish the packet. Do not modify the packet after this point.
669 publishKernelPacket(PacketId, Setup, Packet);
671 return Plugin::success();
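// Note on the dispatch sizes above: HSA grid sizes are expressed in
// work-items rather than work-groups, so grid_size_x is NumBlocks *
// NumThreads while workgroup_size_x is NumThreads. Only one dimension is
// used, which is why the setup word encodes a single dimension.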
674 /// Push a barrier packet that will wait on up to two input signals. All signals
675 /// are optional (nullptr if none).
676 Error pushBarrier(AMDGPUSignalTy *OutputSignal,
677 const AMDGPUSignalTy *InputSignal1,
678 const AMDGPUSignalTy *InputSignal2) {
679 // Lock the queue during the packet publishing process.
680 std::lock_guard<std::mutex> Lock(Mutex);
681 assert(Queue && "Interacted with a non-initialized queue!");
683 // Push the barrier with the lock acquired.
684 return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2);
687 private:
688 /// Push a barrier packet that will wait on up to two input signals. Assumes
689 /// the queue lock is acquired.
690 Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal,
691 const AMDGPUSignalTy *InputSignal1,
692 const AMDGPUSignalTy *InputSignal2 = nullptr) {
693 // Add a queue barrier waiting on both the other stream's operation and the
694 // last operation on the current stream (if any).
695 uint64_t PacketId;
696 hsa_barrier_and_packet_t *Packet =
697 (hsa_barrier_and_packet_t *)acquirePacket(PacketId);
698 assert(Packet && "Invalid packet");
700 Packet->reserved0 = 0;
701 Packet->reserved1 = 0;
702 Packet->dep_signal[0] = {0};
703 Packet->dep_signal[1] = {0};
704 Packet->dep_signal[2] = {0};
705 Packet->dep_signal[3] = {0};
706 Packet->dep_signal[4] = {0};
707 Packet->reserved2 = 0;
708 Packet->completion_signal = {0};
710 // Set input and output dependencies if needed.
711 if (OutputSignal)
712 Packet->completion_signal = OutputSignal->get();
713 if (InputSignal1)
714 Packet->dep_signal[0] = InputSignal1->get();
715 if (InputSignal2)
716 Packet->dep_signal[1] = InputSignal2->get();
718 // Publish the packet. Do not modify the packet after this point.
719 publishBarrierPacket(PacketId, Packet);
721 return Plugin::success();
724 /// Acquire a packet from the queue. This call may block the thread if there
725 /// is no space in the underlying HSA queue. It may need to wait until the HSA
726 /// runtime processes some packets. Assumes the queue lock is acquired.
727 hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) {
728 // Increase the queue index with relaxed memory order. Notice this will need
729 // another subsequent atomic operation with acquire order.
730 PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
732 // Wait for the packet slot to be available. Notice the atomic operation uses
733 // the acquire memory order.
734 while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size)
737 // Return the packet reference.
738 const uint32_t Mask = Queue->size - 1; // The size is a power of 2.
739 return (hsa_kernel_dispatch_packet_t *)Queue->base_address +
740 (PacketId & Mask);
743 /// Publish the kernel packet so that the HSA runtime can start processing
744 /// the kernel launch. Do not modify the packet once this function is called.
745 /// Assumes the queue lock is acquired.
746 void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
747 hsa_kernel_dispatch_packet_t *Packet) {
748 uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
750 uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
751 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
752 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
754 // Publish the packet. Do not modify the packet after this point.
755 uint32_t HeaderWord = Header | (Setup << 16u);
756 __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
758 // Signal the doorbell about the published packet.
759 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
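// Note: the header (low 16 bits) and setup (high 16 bits) are combined into
// one 32-bit word and stored with a release atomic so the packet becomes
// valid for the HSA packet processor in a single step, after all other fields
// have been written. Writing the header last is what actually publishes the
// packet; the doorbell store merely notifies the packet processor.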
762 /// Publish the barrier packet so that the HSA runtime can start processing
763 /// the barrier. Next packets in the queue will not be processed until all
764 /// barrier dependencies (signals) are satisfied. Assumes the queue is locked.
765 void publishBarrierPacket(uint64_t PacketId,
766 hsa_barrier_and_packet_t *Packet) {
767 uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
768 uint16_t Setup = 0;
769 uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
770 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
771 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
773 // Publish the packet. Do not modify the packet after this point.
774 uint32_t HeaderWord = Header | (Setup << 16u);
775 __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
777 // Signal the doorbell about the published packet.
778 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
781 /// Callback that will be called when an error is detected on the HSA queue.
782 static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
783 auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
784 FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
787 /// The HSA queue.
788 hsa_queue_t *Queue;
790 /// Mutex to protect the acquiring and publishing of packets. For the moment,
791 /// we need this mutex to prevent publishing packets that are not ready to be
792 /// published in a multi-thread scenario. Without a queue lock, a thread T1
793 /// could acquire packet P while thread T2 acquires packet P+1. Thread T2 could
794 /// publish its packet P+1 (signaling the queue's doorbell) before packet P
795 /// from T1 is ready to be processed. That scenario should be invalid. Thus,
796 /// we use the following mutex to make packet acquiring and publishing atomic.
797 /// TODO: There are other more advanced approaches to avoid this mutex using
798 /// atomic operations. We can further investigate it if this is a bottleneck.
799 std::mutex Mutex;
801 /// The number of streams this queue is currently assigned to. A queue is
802 /// considered idle when this is zero; otherwise, it is busy.
803 uint32_t NumUsers;
806 /// Struct that implements a stream of asynchronous operations for AMDGPU
807 /// devices. This class relies on signals to implement streams and define the
808 /// dependencies between asynchronous operations.
809 struct AMDGPUStreamTy {
810 private:
811 /// Utility struct holding arguments for async H2H memory copies.
812 struct MemcpyArgsTy {
813 void *Dst;
814 const void *Src;
815 size_t Size;
818 /// Utility struct holding arguments for freeing buffers to memory managers.
819 struct ReleaseBufferArgsTy {
820 void *Buffer;
821 AMDGPUMemoryManagerTy *MemoryManager;
824 /// Utility struct holding arguments for releasing signals to signal managers.
825 struct ReleaseSignalArgsTy {
826 AMDGPUSignalTy *Signal;
827 AMDGPUSignalManagerTy *SignalManager;
830 /// The stream is composed of N stream slots. The struct below represents
831 /// the fields of each slot. Each slot has a signal and an optional action
832 /// function. When appending an HSA asynchronous operation to the stream, one
833 /// slot is consumed and used to store the operation's information. The
834 /// operation's output signal is set to the consumed slot's signal. If there
835 /// is a previous asynchronous operation on the previous slot, the HSA async
836 /// operation's input signal is set to the signal of the previous slot. This
837 /// way, we obtain a chain of dependent async operations. The action is a
838 /// function that will be executed eventually after the operation is
839 /// completed, e.g., for releasing a buffer.
840 struct StreamSlotTy {
841 /// The output signal of the stream operation. May be used by the subsequent
842 /// operation as input signal.
843 AMDGPUSignalTy *Signal;
845 /// The action that must be performed after the operation's completion. Set
846 /// to nullptr when there is no action to perform.
847 Error (*ActionFunction)(void *);
849 /// Space for the action's arguments. A pointer to these arguments is passed
850 /// to the action function. Notice the space of arguments is limited.
851 union {
852 MemcpyArgsTy MemcpyArgs;
853 ReleaseBufferArgsTy ReleaseBufferArgs;
854 ReleaseSignalArgsTy ReleaseSignalArgs;
855 } ActionArgs;
857 /// Create an empty slot.
858 StreamSlotTy() : Signal(nullptr), ActionFunction(nullptr) {}
860 /// Schedule a host memory copy action on the slot.
861 Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
862 ActionFunction = memcpyAction;
863 ActionArgs.MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
864 return Plugin::success();
867 /// Schedule a release buffer action on the slot.
868 Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) {
869 ActionFunction = releaseBufferAction;
870 ActionArgs.ReleaseBufferArgs = ReleaseBufferArgsTy{Buffer, &Manager};
871 return Plugin::success();
874 /// Schedule a signal release action on the slot.
875 Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease,
876 AMDGPUSignalManagerTy *SignalManager) {
877 ActionFunction = releaseSignalAction;
878 ActionArgs.ReleaseSignalArgs =
879 ReleaseSignalArgsTy{SignalToRelease, SignalManager};
880 return Plugin::success();
883 // Perform the action if needed.
884 Error performAction() {
885 if (!ActionFunction)
886 return Plugin::success();
888 // Perform the action.
889 if (ActionFunction == memcpyAction) {
890 if (auto Err = memcpyAction(&ActionArgs))
891 return Err;
892 } else if (ActionFunction == releaseBufferAction) {
893 if (auto Err = releaseBufferAction(&ActionArgs))
894 return Err;
895 } else if (ActionFunction == releaseSignalAction) {
896 if (auto Err = releaseSignalAction(&ActionArgs))
897 return Err;
898 } else {
899 return Plugin::error("Unknown action function!");
902 // Invalidate the action.
903 ActionFunction = nullptr;
905 return Plugin::success();
909 /// The device agent where the stream was created.
910 hsa_agent_t Agent;
912 /// The queue that the stream uses to launch kernels.
913 AMDGPUQueueTy *Queue;
915 /// The manager of signals to reuse signals.
916 AMDGPUSignalManagerTy &SignalManager;
918 /// A reference to the associated device.
919 GenericDeviceTy &Device;
921 /// Array of stream slots. Use std::deque because it can dynamically grow
922 /// without invalidating the already inserted elements. For instance, the
923 /// std::vector may invalidate the elements by reallocating the internal
924 /// array if there is not enough space on new insertions.
925 std::deque<StreamSlotTy> Slots;
927 /// The next available slot on the queue. This is reset to zero each time the
928 /// stream is synchronized. It also indicates the current number of consumed
929 /// slots at a given time.
930 uint32_t NextSlot;
932 /// The synchronization id. This number is increased each time the stream is
933 /// synchronized. It is useful to detect if an AMDGPUEventTy points to an
934 /// operation that was already finalized in a previous stream synchronization.
935 uint32_t SyncCycle;
937 /// A pointer associated with an RPC server running on the given device. If
938 /// RPC is not being used this will be a null pointer. Otherwise, this
939 /// indicates that an RPC server is expected to be run on this stream.
940 RPCServerTy *RPCServer;
942 /// Mutex to protect stream's management.
943 mutable std::mutex Mutex;
945 /// Timeout hint for HSA actively waiting for signal value to change
946 const uint64_t StreamBusyWaitMicroseconds;
948 /// Return the current number of asynchronous operations on the stream.
949 uint32_t size() const { return NextSlot; }
951 /// Return the last valid slot on the stream.
952 uint32_t last() const { return size() - 1; }
954 /// Consume one slot from the stream. Since the stream uses signals on demand
955 /// and releases them once the slot is no longer used, the function requires
956 /// an idle signal for the new consumed slot.
957 std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) {
958 // Double the stream size if needed. Since we use std::deque, this operation
959 // does not invalidate the already added slots.
960 if (Slots.size() == NextSlot)
961 Slots.resize(Slots.size() * 2);
963 // Update the next available slot and the stream size.
964 uint32_t Curr = NextSlot++;
966 // Retrieve the input signal, if any, of the current operation.
967 AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr;
969 // Set the output signal of the current slot.
970 Slots[Curr].Signal = OutputSignal;
972 return std::make_pair(Curr, InputSignal);
975 /// Complete all pending post actions and reset the stream after synchronizing
976 /// or positively querying the stream.
977 Error complete() {
978 for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) {
979 // Take the post action of the operation if any.
980 if (auto Err = Slots[Slot].performAction())
981 return Err;
983 // Release the slot's signal if possible. Otherwise, another user will.
984 if (Slots[Slot].Signal->decreaseUseCount())
985 if (auto Err = SignalManager.returnResource(Slots[Slot].Signal))
986 return Err;
988 Slots[Slot].Signal = nullptr;
991 // Reset the stream slots to zero.
992 NextSlot = 0;
994 // Increase the synchronization id since the stream completed a sync cycle.
995 SyncCycle += 1;
997 return Plugin::success();
1000 /// Make the current stream wait on a specific operation of another stream.
1001 /// The idea is to make the current stream wait on two signals: 1) the last
1002 /// signal of the current stream, and 2) the last signal of the other stream.
1003 /// Use a barrier packet with two input signals.
1004 Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) {
1005 if (Queue == nullptr)
1006 return Plugin::error("Target queue was nullptr");
1009 /// The signal from the other stream that we must wait on.
1009 AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal;
1011 // Prevent the release of the other stream's signal.
1012 OtherSignal->increaseUseCount();
1014 // Retrieve an available signal for the operation's output.
1015 AMDGPUSignalTy *OutputSignal = nullptr;
1016 if (auto Err = SignalManager.getResource(OutputSignal))
1017 return Err;
1018 OutputSignal->reset();
1019 OutputSignal->increaseUseCount();
1021 // Consume stream slot and compute dependencies.
1022 auto [Curr, InputSignal] = consume(OutputSignal);
1024 // Setup the post action to release the signal.
1025 if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager))
1026 return Err;
1028 // Push a barrier into the queue with both input signals.
1029 return Queue->pushBarrier(OutputSignal, InputSignal, OtherSignal);
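// Note: the cross-stream dependency above is encoded as a barrier packet that
// takes both this stream's last signal and the other stream's signal as
// inputs. The other signal's use count is increased beforehand so the other
// stream cannot return it to the signal manager while this stream still
// depends on it; the scheduled post action releases it once the barrier
// completes.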
1032 /// Callback for running a specific asynchronous operation. This callback is
1033 /// used for hsa_amd_signal_async_handler. The argument is the operation that
1034 /// should be executed. Notice we use the post action mechanism to codify the
1035 /// asynchronous operation.
1036 static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) {
1037 StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args);
1038 assert(Slot && "Invalid slot");
1039 assert(Slot->Signal && "Invalid signal");
1041 // This thread is outside the stream mutex. Make sure the thread sees the
1042 // changes on the slot.
1043 std::atomic_thread_fence(std::memory_order_acquire);
1045 // Perform the operation.
1046 if (auto Err = Slot->performAction())
1047 FATAL_MESSAGE(1, "Error performing post action: %s",
1048 toString(std::move(Err)).data());
1050 // Signal the output signal to notify that the asynchronous operation finished.
1051 Slot->Signal->signal();
1053 // Unregister callback.
1054 return false;
1057 // Callback for host-to-host memory copies. This is an asynchronous action.
1058 static Error memcpyAction(void *Data) {
1059 MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data);
1060 assert(Args && "Invalid arguments");
1061 assert(Args->Dst && "Invalid destination buffer");
1062 assert(Args->Src && "Invalid source buffer");
1064 std::memcpy(Args->Dst, Args->Src, Args->Size);
1066 return Plugin::success();
1069 /// Releasing a memory buffer to a memory manager. This is a post completion
1070 /// action. There are three kinds of memory buffers:
1071 /// 1. For kernel arguments. This buffer can be freed after receiving the
1072 /// kernel completion signal.
1073 /// 2. For H2D transfers that need pinned memory space for staging. This
1074 /// buffer can be freed after receiving the transfer completion signal.
1075 /// 3. For D2H transfers that need pinned memory space for staging. This
1076 /// buffer cannot be freed after receiving the transfer completion signal
1077 /// because of the following asynchronous H2H callback.
1078 /// For this reason, this action can only be taken at
1079 /// AMDGPUStreamTy::complete().
1080 /// Because of case 3, all releaseBufferActions are taken at
1081 /// AMDGPUStreamTy::complete() in the current implementation.
1082 static Error releaseBufferAction(void *Data) {
1083 ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data);
1084 assert(Args && "Invalid arguments");
1085 assert(Args->MemoryManager && "Invalid memory manager");
1086 assert(Args->Buffer && "Invalid buffer");
1088 // Release the allocation to the memory manager.
1089 return Args->MemoryManager->deallocate(Args->Buffer);
1092 /// Releasing a signal object back to the SignalManager. This is a post completion
1093 /// action. This action can only be taken at AMDGPUStreamTy::complete().
1094 static Error releaseSignalAction(void *Data) {
1095 ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data);
1096 assert(Args && "Invalid arguments");
1097 assert(Args->Signal && "Invalid signal");
1098 assert(Args->SignalManager && "Invalid signal manager");
1100 // Release the signal if needed.
1101 if (Args->Signal->decreaseUseCount())
1102 if (auto Err = Args->SignalManager->returnResource(Args->Signal))
1103 return Err;
1105 return Plugin::success();
1108 public:
1109 /// Create an empty stream associated with a specific device.
1110 AMDGPUStreamTy(AMDGPUDeviceTy &Device);
1112 /// Initialize the stream's signals.
1113 Error init() { return Plugin::success(); }
1115 /// Deinitialize the stream's signals.
1116 Error deinit() { return Plugin::success(); }
1118 /// Attach an RPC server to this stream.
1119 void setRPCServer(RPCServerTy *Server) { RPCServer = Server; }
1121 /// Push an asynchronous kernel to the stream. The kernel arguments must be
1122 /// placed in a special allocation for kernel args and must remain alive until
1123 /// the kernel finishes. Once the kernel is finished, the stream will release
1124 /// the kernel args buffer to the specified memory manager.
1125 Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
1126 uint32_t NumThreads, uint64_t NumBlocks,
1127 uint32_t GroupSize,
1128 AMDGPUMemoryManagerTy &MemoryManager) {
1129 if (Queue == nullptr)
1130 return Plugin::error("Target queue was nullptr");
1132 // Retrieve an available signal for the operation's output.
1133 AMDGPUSignalTy *OutputSignal = nullptr;
1134 if (auto Err = SignalManager.getResource(OutputSignal))
1135 return Err;
1136 OutputSignal->reset();
1137 OutputSignal->increaseUseCount();
1139 std::lock_guard<std::mutex> StreamLock(Mutex);
1141 // Consume stream slot and compute dependencies.
1142 auto [Curr, InputSignal] = consume(OutputSignal);
1144 // Setup the post action to release the kernel args buffer.
1145 if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
1146 return Err;
1148 // Push the kernel with the output signal and an input signal (optional)
1149 return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
1150 GroupSize, OutputSignal, InputSignal);
1153 /// Push an asynchronous memory copy between pinned memory buffers.
1154 Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src,
1155 uint64_t CopySize) {
1156 // Retrieve an available signal for the operation's output.
1157 AMDGPUSignalTy *OutputSignal = nullptr;
1158 if (auto Err = SignalManager.getResource(OutputSignal))
1159 return Err;
1160 OutputSignal->reset();
1161 OutputSignal->increaseUseCount();
1163 std::lock_guard<std::mutex> Lock(Mutex);
1165 // Consume stream slot and compute dependencies.
1166 auto [Curr, InputSignal] = consume(OutputSignal);
1168 // Avoid defining the input dependency if already satisfied.
1169 if (InputSignal && !InputSignal->load())
1170 InputSignal = nullptr;
1172 // Issue the async memory copy.
1173 hsa_status_t Status;
1174 if (InputSignal) {
1175 hsa_signal_t InputSignalRaw = InputSignal->get();
1176 Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 1,
1177 &InputSignalRaw, OutputSignal->get());
1178 } else
1179 Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 0,
1180 nullptr, OutputSignal->get());
1181 return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
1184 /// Push an asynchronous memory copy device-to-host involving an unpinned
1185 /// memory buffer. The operation consists of a two-step copy from the
1186 /// device buffer to an intermediate pinned host buffer, and then, to an
1187 /// unpinned host buffer. Both operations are asynchronous and dependent.
1188 /// The intermediate pinned buffer will be released to the specified memory
1189 /// manager once the operation completes.
1190 Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter,
1191 uint64_t CopySize,
1192 AMDGPUMemoryManagerTy &MemoryManager) {
1193 // Retrieve available signals for the operation's outputs.
1194 AMDGPUSignalTy *OutputSignals[2] = {};
1195 if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
1196 return Err;
1197 for (auto Signal : OutputSignals) {
1198 Signal->reset();
1199 Signal->increaseUseCount();
1202 std::lock_guard<std::mutex> Lock(Mutex);
1204 // Consume stream slot and compute dependencies.
1205 auto [Curr, InputSignal] = consume(OutputSignals[0]);
1207 // Avoid defining the input dependency if already satisfied.
1208 if (InputSignal && !InputSignal->load())
1209 InputSignal = nullptr;
1211 // Setup the post action for releasing the intermediate buffer.
1212 if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
1213 return Err;
1215 // Issue the first step: device to host transfer. Avoid defining the input
1216 // dependency if already satisfied.
1217 hsa_status_t Status;
1218 if (InputSignal) {
1219 hsa_signal_t InputSignalRaw = InputSignal->get();
1220 Status =
1221 hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 1,
1222 &InputSignalRaw, OutputSignals[0]->get());
1223 } else {
1224 Status = hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 0,
1225 nullptr, OutputSignals[0]->get());
1228 if (auto Err =
1229 Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
1230 return Err;
1232 // Consume another stream slot and compute dependencies.
1233 std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
1234 assert(InputSignal && "Invalid input signal");
1236 // The std::memcpy is done asynchronously using an async handler. We store
1237 // the function's information in the action, but it is not actually a post action.
1238 if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Inter, CopySize))
1239 return Err;
1241 // Make changes on this slot visible to the async handler's thread.
1242 std::atomic_thread_fence(std::memory_order_release);
1244 // Issue the second step: host to host transfer.
1245 Status = hsa_amd_signal_async_handler(
1246 InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
1247 (void *)&Slots[Curr]);
1249 return Plugin::check(Status, "Error in hsa_amd_signal_async_handler: %s");
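// Summary of the D2H pipeline above: the first slot holds the device-to-
// pinned-host copy (hsa_amd_memory_async_copy) plus the post action that
// releases the pinned staging buffer back to the memory manager, and the
// second slot holds the pinned-to-unpinned std::memcpy, which is triggered by
// hsa_amd_signal_async_handler once the first slot's signal reaches zero and
// then signals the second slot's completion signal from the handler thread.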
1252 /// Push an asynchronous memory copy host-to-device involving an unpinned
1253 /// memory buffer. The operation consists of a two-step copy from the
1254 /// unpinned host buffer to an intermediate pinned host buffer, and then, to
1255 /// the device buffer. Both operations are asynchronous and dependent.
1256 /// The intermediate pinned buffer will be released to the specified memory
1257 /// manager once the operation completes.
1258 Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
1259 uint64_t CopySize,
1260 AMDGPUMemoryManagerTy &MemoryManager) {
1261 // Retrieve available signals for the operation's outputs.
1262 AMDGPUSignalTy *OutputSignals[2] = {};
1263 if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
1264 return Err;
1265 for (auto Signal : OutputSignals) {
1266 Signal->reset();
1267 Signal->increaseUseCount();
1270 AMDGPUSignalTy *OutputSignal = OutputSignals[0];
1272 std::lock_guard<std::mutex> Lock(Mutex);
1274 // Consume stream slot and compute dependencies.
1275 auto [Curr, InputSignal] = consume(OutputSignal);
1277 // Avoid defining the input dependency if already satisfied.
1278 if (InputSignal && !InputSignal->load())
1279 InputSignal = nullptr;
1281 // Issue the first step: host to host transfer.
1282 if (InputSignal) {
1283 // The std::memcpy is done asynchronously using an async handler. We store
1284 // the function's information in the action but it is not actually a
1285 // post action.
1286 if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
1287 return Err;
1289 // Make changes on this slot visible to the async handler's thread.
1290 std::atomic_thread_fence(std::memory_order_release);
1292 hsa_status_t Status = hsa_amd_signal_async_handler(
1293 InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
1294 (void *)&Slots[Curr]);
1296 if (auto Err = Plugin::check(Status,
1297 "Error in hsa_amd_signal_async_handler: %s"))
1298 return Err;
1300 // Now use the second output signal.
1301 OutputSignal = OutputSignals[1];
1303 // Consume another stream slot and compute dependencies.
1304 std::tie(Curr, InputSignal) = consume(OutputSignal);
1305 } else {
1306 // All preceding operations completed, copy the memory synchronously.
1307 std::memcpy(Inter, Src, CopySize);
1309 // Return the second signal because it will not be used.
1310 OutputSignals[1]->decreaseUseCount();
1311 if (auto Err = SignalManager.returnResource(OutputSignals[1]))
1312 return Err;
1315 // Setup the post action to release the intermediate pinned buffer.
1316 if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
1317 return Err;
1319 // Issue the second step: host to device transfer. Avoid defining the input
1320 // dependency if already satisfied.
1321 hsa_status_t Status;
1322 if (InputSignal && InputSignal->load()) {
1323 hsa_signal_t InputSignalRaw = InputSignal->get();
1324 Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 1,
1325 &InputSignalRaw, OutputSignal->get());
1326 } else
1327 Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 0,
1328 nullptr, OutputSignal->get());
1330 return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
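// Note: when there is no pending predecessor on the stream, the staging
// std::memcpy above is performed synchronously and the second output signal
// is returned to the manager, so the whole transfer consumes only one slot
// and one HSA copy.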
1333 // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
1334 Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src,
1335 hsa_agent_t SrcAgent, uint64_t CopySize) {
1336 AMDGPUSignalTy *OutputSignal;
1337 if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
1338 return Err;
1339 OutputSignal->reset();
1340 OutputSignal->increaseUseCount();
1342 std::lock_guard<std::mutex> Lock(Mutex);
1344 // Consume stream slot and compute dependencies.
1345 auto [Curr, InputSignal] = consume(OutputSignal);
1347 // Avoid defining the input dependency if already satisfied.
1348 if (InputSignal && !InputSignal->load())
1349 InputSignal = nullptr;
1351 // The agents need to have access to the corresponding memory
1352 // This is presently only true if the pointers were originally
1353 // allocated by this runtime or the caller made the appropriate
1354 // access calls.
1356 hsa_status_t Status;
1357 if (InputSignal && InputSignal->load()) {
1358 hsa_signal_t InputSignalRaw = InputSignal->get();
1359 Status =
1360 hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize, 1,
1361 &InputSignalRaw, OutputSignal->get());
1362 } else
1363 Status = hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize,
1364 0, nullptr, OutputSignal->get());
1366 return Plugin::check(Status, "Error in D2D hsa_amd_memory_async_copy: %s");
1369 /// Synchronize with the stream. The current thread waits until all operations
1370 /// are finalized and it performs the pending post actions (i.e., releasing
1371 /// intermediate buffers).
1372 Error synchronize() {
1373 std::lock_guard<std::mutex> Lock(Mutex);
1375 // No need to synchronize anything.
1376 if (size() == 0)
1377 return Plugin::success();
1379 // Wait until all previous operations on the stream have completed.
1380 if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds,
1381 RPCServer, &Device))
1382 return Err;
1384 // Reset the stream and perform all pending post actions.
1385 return complete();
1388 /// Query the stream and complete pending post actions if operations finished.
1389 /// Return whether all the operations completed. This operation does not block
1390 /// the calling thread.
1391 Expected<bool> query() {
1392 std::lock_guard<std::mutex> Lock(Mutex);
1394 // No need to query anything.
1395 if (size() == 0)
1396 return true;
1398 // The last operation did not complete yet. Return directly.
1399 if (Slots[last()].Signal->load())
1400 return false;
1402 // Reset the stream and perform all pending post actions.
1403 if (auto Err = complete())
1404 return std::move(Err);
1406 return true;
1409 /// Record the state of the stream on an event.
1410 Error recordEvent(AMDGPUEventTy &Event) const;
1412 /// Make the stream wait on an event.
1413 Error waitEvent(const AMDGPUEventTy &Event);
1415 friend struct AMDGPUStreamManagerTy;
1418 /// Class representing an event on AMDGPU. The event basically stores some
1419 /// information regarding the state of the recorded stream.
1420 struct AMDGPUEventTy {
1421 /// Create an empty event.
1422 AMDGPUEventTy(AMDGPUDeviceTy &Device)
1423 : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {}
1425 /// Initialize and deinitialize.
1426 Error init() { return Plugin::success(); }
1427 Error deinit() { return Plugin::success(); }
1429 /// Record the state of a stream on the event.
1430 Error record(AMDGPUStreamTy &Stream) {
1431 std::lock_guard<std::mutex> Lock(Mutex);
1433 // Overwrite any previously recorded stream.
1434 RecordedStream = &Stream;
1436 return Stream.recordEvent(*this);
1439 /// Make a stream wait on the current event.
1440 Error wait(AMDGPUStreamTy &Stream) {
1441 std::lock_guard<std::mutex> Lock(Mutex);
1443 if (!RecordedStream)
1444 return Plugin::error("Event does not have any recorded stream");
1446 // Synchronizing the same stream. Do nothing.
1447 if (RecordedStream == &Stream)
1448 return Plugin::success();
1450 // No need to wait on anything; the recorded stream already finished the
1451 // corresponding operation.
1452 if (RecordedSlot < 0)
1453 return Plugin::success();
1455 return Stream.waitEvent(*this);
1458 protected:
1459 /// The stream registered in this event.
1460 AMDGPUStreamTy *RecordedStream;
1462 /// The recorded operation on the recorded stream.
1463 int64_t RecordedSlot;
1465 /// The sync cycle when the stream was recorded. Used to detect stale events.
1466 int64_t RecordedSyncCycle;
1468 /// Mutex to safely access event fields.
1469 mutable std::mutex Mutex;
1471 friend struct AMDGPUStreamTy;
1474 Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
1475 std::lock_guard<std::mutex> Lock(Mutex);
1477 if (size() > 0) {
1478 // Record the synchronize identifier (to detect stale recordings) and
1479 // the last valid stream's operation.
1480 Event.RecordedSyncCycle = SyncCycle;
1481 Event.RecordedSlot = last();
1483 assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
1484 assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
1485 } else {
1486 // The stream is empty, everything already completed, record nothing.
1487 Event.RecordedSyncCycle = -1;
1488 Event.RecordedSlot = -1;
1490 return Plugin::success();
1493 Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) {
1494 // Retrieve the recorded stream on the event.
1495 AMDGPUStreamTy &RecordedStream = *Event.RecordedStream;
1497 std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex);
1499 // The recorded stream already completed the operation because the synchronization
1500 // identifier is already outdated.
1501 if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle)
1502 return Plugin::success();
1504 // Again, the recorded stream already completed the operation: the last
1505 // operation's output signal is already satisfied.
1506 if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load())
1507 return Plugin::success();
1509 // Otherwise, make the current stream wait on the other stream's operation.
1510 return waitOnStreamOperation(RecordedStream, Event.RecordedSlot);
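/// Resource manager for AMDGPU streams. It also owns the HSA queues that back
/// the streams and assigns queues to streams based on their utilization.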
1513 struct AMDGPUStreamManagerTy final
1514 : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> {
1515 using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>;
1516 using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
1518 AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
1519 : GenericDeviceResourceManagerTy(Device),
1520 OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
1521 NextQueue(0), Agent(HSAAgent) {}
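/// Initialize the stream pool. Only the first HSA queue is created eagerly;
/// the remaining queues are initialized lazily when assigned to a stream.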
1523 Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) {
1524 Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues);
1525 QueueSize = HSAQueueSize;
1526 MaxNumQueues = NumHSAQueues;
1527 // Initialize one queue eagerly
1528 if (auto Err = Queues.front().init(Agent, QueueSize))
1529 return Err;
1531 return GenericDeviceResourceManagerTy::init(InitialSize);
1534 /// Deinitialize the resource pool and delete all resources. This function
1535 /// must be called before the destructor.
1536 Error deinit() override {
1537 // De-init all queues
1538 for (AMDGPUQueueTy &Queue : Queues) {
1539 if (auto Err = Queue.deinit())
1540 return Err;
1543 return GenericDeviceResourceManagerTy::deinit();
1546 /// Get a single stream from the pool or create new resources.
1547 virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override {
1548 return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) {
1549 return assignNextQueue(Handle);
1553 /// Return stream to the pool.
1554 virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override {
1555 return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) {
1556 Handle->Queue->removeUser();
1557 return Plugin::success();
1561 private:
1562 /// Search for and assign a preferably idle queue to the given stream. If
1563 /// there is no queue without current users, choose the queue with the lowest
1564 /// user count. If utilization tracking is disabled, use round-robin selection.
1565 inline Error assignNextQueue(AMDGPUStreamTy *Stream) {
1566 // Start from zero when tracking utilization; otherwise, use the round-robin policy.
1567 uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues;
1569 if (OMPX_QueueTracking) {
1570 // Find the least used queue.
1571 for (uint32_t I = 0; I < MaxNumQueues; ++I) {
1572 // Early exit when an initialized queue is idle.
1573 if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) {
1574 Index = I;
1575 break;
1578 // Update the least used queue.
1579 if (Queues[Index].getUserCount() > Queues[I].getUserCount())
1580 Index = I;
1584 // Make sure the queue is initialized, then add user & assign.
1585 if (auto Err = Queues[Index].init(Agent, QueueSize))
1586 return Err;
1587 Queues[Index].addUser();
1588 Stream->Queue = &Queues[Index];
1590 return Plugin::success();
1593 /// Envar for controlling the tracking of busy HSA queues.
1594 BoolEnvar OMPX_QueueTracking;
1596 /// The next queue index to use for round robin selection.
1597 uint32_t NextQueue;
1599 /// The queues which are assigned to requested streams.
1600 std::vector<AMDGPUQueueTy> Queues;
1602 /// The corresponding device as HSA agent.
1603 hsa_agent_t Agent;
1605 /// The maximum number of queues.
1606 int MaxNumQueues;
1608 /// The size of created queues.
1609 int QueueSize;
1612 /// Abstract class that holds the common members of the actual kernel devices
1613 /// and the host device. Both types should inherit from this class.
1614 struct AMDGenericDeviceTy {
1615 AMDGenericDeviceTy() {}
1617 virtual ~AMDGenericDeviceTy() {}
1619 /// Create all memory pools which the device has access to and classify them.
1620 Error initMemoryPools() {
1621 // Retrieve all memory pools from the device agent(s).
1622 Error Err = retrieveAllMemoryPools();
1623 if (Err)
1624 return Err;
1626 for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
1627 // Initialize the memory pool and retrieve some basic info.
1628 Error Err = MemoryPool->init();
1629 if (Err)
1630 return Err;
1632 if (!MemoryPool->isGlobal())
1633 continue;
1635 // Classify the memory pools depending on their properties.
1636 if (MemoryPool->isFineGrained()) {
1637 FineGrainedMemoryPools.push_back(MemoryPool);
1638 if (MemoryPool->supportsKernelArgs())
1639 ArgsMemoryPools.push_back(MemoryPool);
1640 } else if (MemoryPool->isCoarseGrained()) {
1641 CoarseGrainedMemoryPools.push_back(MemoryPool);
1644 return Plugin::success();
1647 /// Destroy all memory pools.
1648 Error deinitMemoryPools() {
1649 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools)
1650 delete Pool;
1652 AllMemoryPools.clear();
1653 FineGrainedMemoryPools.clear();
1654 CoarseGrainedMemoryPools.clear();
1655 ArgsMemoryPools.clear();
1657 return Plugin::success();
1660 /// Retrieve and construct all memory pools from the device agent(s).
1661 virtual Error retrieveAllMemoryPools() = 0;
1663 /// Get the device agent.
1664 virtual hsa_agent_t getAgent() const = 0;
1666 protected:
1667 /// Array of all memory pools available to the host agents.
1668 llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools;
1670 /// Array of fine-grained memory pools available to the host agents.
1671 llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools;
1673 /// Array of coarse-grained memory pools available to the host agents.
1674 llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools;
1676 /// Array of kernel args memory pools available to the host agents.
1677 llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools;
1680 /// Class representing the host device. This host device may have more than one
1681 /// HSA host agent. We aggregate all its resources into the same instance.
1682 struct AMDHostDeviceTy : public AMDGenericDeviceTy {
1683 /// Create a host device from an array of host agents.
1684 AMDHostDeviceTy(const llvm::SmallVector<hsa_agent_t> &HostAgents)
1685 : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(),
1686 PinnedMemoryManager() {
1687 assert(HostAgents.size() && "No host agent found");
1690 /// Initialize the host device memory pools and the memory managers for
1691 /// kernel args and host pinned memory allocations.
1692 Error init() {
1693 if (auto Err = initMemoryPools())
1694 return Err;
1696 if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool()))
1697 return Err;
1699 if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool()))
1700 return Err;
1702 return Plugin::success();
1705 /// Deinitialize memory pools and managers.
1706 Error deinit() {
1707 if (auto Err = deinitMemoryPools())
1708 return Err;
1710 if (auto Err = ArgsMemoryManager.deinit())
1711 return Err;
1713 if (auto Err = PinnedMemoryManager.deinit())
1714 return Err;
1716 return Plugin::success();
1719 /// Retrieve and construct all memory pools from the host agents.
1720 Error retrieveAllMemoryPools() override {
1721 // Iterate through the available pools across the host agents.
1722 for (hsa_agent_t Agent : Agents) {
1723 Error Err = utils::iterateAgentMemoryPools(
1724 Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
1725 AMDGPUMemoryPoolTy *MemoryPool =
1726 new AMDGPUMemoryPoolTy(HSAMemoryPool);
1727 AllMemoryPools.push_back(MemoryPool);
1728 return HSA_STATUS_SUCCESS;
1730 if (Err)
1731 return Err;
1733 return Plugin::success();
1736 /// Get one of the host agents. Always return the first agent.
1737 hsa_agent_t getAgent() const override { return Agents[0]; }
1739 /// Get a memory pool for fine-grained allocations.
1740 AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() {
1741 assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool");
1742 // Retrieve any memory pool.
1743 return *FineGrainedMemoryPools[0];
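/// Get a memory pool for coarse-grained allocations.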
1746 AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() {
1747 assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool");
1748 // Retrieve any memory pool.
1749 return *CoarseGrainedMemoryPools[0];
1752 /// Get a memory pool for kernel args allocations.
1753 AMDGPUMemoryPoolTy &getArgsMemoryPool() {
1754 assert(!ArgsMemoryPools.empty() && "No kernelargs mempool");
1755 // Retrieve any memory pool.
1756 return *ArgsMemoryPools[0];
1759 /// Getters for kernel args and host pinned memory managers.
1760 AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; }
1761 AMDGPUMemoryManagerTy &getPinnedMemoryManager() {
1762 return PinnedMemoryManager;
1765 private:
1766 /// Array of agents on the host side.
1767 const llvm::SmallVector<hsa_agent_t> Agents;
1769 // Memory manager for kernel arguments.
1770 AMDGPUMemoryManagerTy ArgsMemoryManager;
1772 // Memory manager for pinned memory.
1773 AMDGPUMemoryManagerTy PinnedMemoryManager;
1776 /// Class implementing the AMDGPU device functionalities which derives from the
1777 /// generic device class.
1778 struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
1779 // Create an AMDGPU device with a device id and default AMDGPU grid values.
1780 AMDGPUDeviceTy(int32_t DeviceId, int32_t NumDevices,
1781 AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
1782 : GenericDeviceTy(DeviceId, NumDevices, {0}), AMDGenericDeviceTy(),
1783 OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4),
1784 OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512),
1785 OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4),
1786 OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
1787 1 * 1024 * 1024), // 1MB
1788 OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
1789 64),
1790 OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
1791 AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
1792 AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
1794 ~AMDGPUDeviceTy() {}
1796 /// Initialize the device, its resources and get its properties.
1797 Error initImpl(GenericPluginTy &Plugin) override {
1798 // First setup all the memory pools.
1799 if (auto Err = initMemoryPools())
1800 return Err;
1802 char GPUName[64];
1803 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName))
1804 return Err;
1805 ComputeUnitKind = GPUName;
1807 // Get the wavefront size.
1808 uint32_t WavefrontSize = 0;
1809 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize))
1810 return Err;
1811 GridValues.GV_Warp_Size = WavefrontSize;
1813 // Get the frequency of the steady clock.
1814 if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
1815 ClockFrequency))
1816 return Err;
1818 // Load the grid values depending on the wavefront size.
1819 if (WavefrontSize == 32)
1820 GridValues = getAMDGPUGridValues<32>();
1821 else if (WavefrontSize == 64)
1822 GridValues = getAMDGPUGridValues<64>();
1823 else
1824 return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize);
1826 // Get maximum number of workitems per workgroup.
1827 uint16_t WorkgroupMaxDim[3];
1828 if (auto Err =
1829 getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim))
1830 return Err;
1831 GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0];
1833 // Get maximum number of workgroups.
1834 hsa_dim3_t GridMaxDim;
1835 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim))
1836 return Err;
1838 GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size;
1839 if (GridValues.GV_Max_Teams == 0)
1840 return Plugin::error("Maximum number of teams cannot be zero");
1842 // Compute the default number of teams.
1843 uint32_t ComputeUnits = 0;
1844 if (auto Err =
1845 getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits))
1846 return Err;
1847 GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;
1849 uint32_t WavesPerCU = 0;
1850 if (auto Err =
1851 getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU))
1852 return Err;
1853 HardwareParallelism = ComputeUnits * WavesPerCU;
1855 // Get maximum size of any device queues and maximum number of queues.
1856 uint32_t MaxQueueSize;
1857 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
1858 return Err;
1860 uint32_t MaxQueues;
1861 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues))
1862 return Err;
1864 // Compute the number of queues and their size.
1865 OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues));
1866 OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize);
1868 // Initialize stream pool.
1869 if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams,
1870 OMPX_NumQueues, OMPX_QueueSize))
1871 return Err;
1873 // Initialize event pool.
1874 if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents))
1875 return Err;
1877 // Initialize signal pool.
1878 if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
1879 return Err;
1881 return Plugin::success();
1884 /// Deinitialize the device and release its resources.
1885 Error deinitImpl() override {
1886 // Deinitialize the stream and event pools.
1887 if (auto Err = AMDGPUStreamManager.deinit())
1888 return Err;
1890 if (auto Err = AMDGPUEventManager.deinit())
1891 return Err;
1893 if (auto Err = AMDGPUSignalManager.deinit())
1894 return Err;
1896 // Close modules if necessary.
1897 if (!LoadedImages.empty()) {
1898 // Each image has its own module.
1899 for (DeviceImageTy *Image : LoadedImages) {
1900 AMDGPUDeviceImageTy &AMDImage =
1901 static_cast<AMDGPUDeviceImageTy &>(*Image);
1903 // Unload the executable of the image.
1904 if (auto Err = AMDImage.unloadExecutable())
1905 return Err;
1909 // Invalidate agent reference.
1910 Agent = {0};
1912 return Plugin::success();
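/// Return the time, in microseconds, that a stream busy-waits before
/// switching to a blocking HSA wait state.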
1915 uint64_t getStreamBusyWaitMicroseconds() const {
1916 return OMPX_StreamBusyWait;
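/// Post-process a JIT-compiled image: materialize the buffer into a temporary
/// file, link it with lld for the device's architecture, and return the
/// resulting image buffer.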
1919 Expected<std::unique_ptr<MemoryBuffer>>
1920 doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
1922 // TODO: We should try to avoid materialization but there seems to be no
1923 // good linker interface w/o file i/o.
1924 SmallString<128> LinkerOutputFilePath;
1925 std::error_code EC = sys::fs::createTemporaryFile(
1926 "amdgpu-pre-link-jit", ".out", LinkerOutputFilePath);
1927 if (EC)
1928 return createStringError(EC,
1929 "Failed to create temporary file for linker");
1931 SmallString<128> LinkerInputFilePath = LinkerOutputFilePath;
1932 LinkerInputFilePath.pop_back_n(2);
1934 auto FD = raw_fd_ostream(LinkerInputFilePath.data(), EC);
1935 if (EC)
1936 return createStringError(EC, "Failed to open temporary file for linker");
1937 FD.write(MB->getBufferStart(), MB->getBufferSize());
1938 FD.close();
1940 const auto &ErrorOrPath = sys::findProgramByName("lld");
1941 if (!ErrorOrPath)
1942 return createStringError(inconvertibleErrorCode(),
1943 "Failed to find `lld` on the PATH.");
1945 std::string LLDPath = ErrorOrPath.get();
1946 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
1947 "Using `%s` to link JITed amdgcn ouput.", LLDPath.c_str());
1949 std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind();
1951 StringRef Args[] = {LLDPath,
1952 "-flavor",
1953 "gnu",
1954 "--no-undefined",
1955 "-shared",
1956 MCPU,
1957 "-o",
1958 LinkerOutputFilePath.data(),
1959 LinkerInputFilePath.data()};
1961 std::string Error;
1962 int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error);
1963 if (RC)
1964 return createStringError(inconvertibleErrorCode(),
1965 "Linking optimized bitcode failed: %s",
1966 Error.c_str());
1968 return std::move(
1969 MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath.data()).get());
1972 /// See GenericDeviceTy::getComputeUnitKind().
1973 std::string getComputeUnitKind() const override { return ComputeUnitKind; }
1975 /// Returns the clock frequency for the given AMDGPU device.
1976 uint64_t getClockFrequency() const override { return ClockFrequency; }
1978 /// Allocate and construct an AMDGPU kernel.
1979 Expected<GenericKernelTy &>
1980 constructKernel(const __tgt_offload_entry &KernelEntry) override {
1981 // Allocate and construct the AMDGPU kernel.
1982 AMDGPUKernelTy *AMDGPUKernel = Plugin::get().allocate<AMDGPUKernelTy>();
1983 if (!AMDGPUKernel)
1984 return Plugin::error("Failed to allocate memory for AMDGPU kernel");
1986 new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name);
1988 return *AMDGPUKernel;
1991 /// Set the current context to this device's context. Do nothing since the
1992 /// AMDGPU devices do not have the concept of contexts.
1993 Error setContext() override { return Plugin::success(); }
1995 /// AMDGPU returns the product of the number of compute units and the waves
1996 /// per compute unit.
1997 uint64_t getHardwareParallelism() const override {
1998 return HardwareParallelism;
2001 /// We want to set up the RPC server for host services to the GPU if it is
2002 /// available.
2003 bool shouldSetupRPCServer() const override {
2004 return libomptargetSupportsRPC();
2008 /// The RPC interface should have enough space for all available parallelism.
2008 uint64_t requestedRPCPortCount() const override {
2009 return getHardwareParallelism();
2012 /// Get the stream of the asynchronous info structure or get a new one.
2013 Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
2014 AMDGPUStreamTy *&Stream) {
2015 // Get the stream (if any) from the async info.
2016 Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
2017 if (!Stream) {
2018 // There was no stream; get an idle one.
2019 if (auto Err = AMDGPUStreamManager.getResource(Stream))
2020 return Err;
2022 // Modify the async info's stream.
2023 AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
2025 return Plugin::success();
2028 /// Load the binary image into the device and allocate an image object.
2029 Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
2030 int32_t ImageId) override {
2031 // Allocate and initialize the image object.
2032 AMDGPUDeviceImageTy *AMDImage =
2033 Plugin::get().allocate<AMDGPUDeviceImageTy>();
2034 new (AMDImage) AMDGPUDeviceImageTy(ImageId, TgtImage);
2036 // Load the HSA executable.
2037 if (Error Err = AMDImage->loadExecutable(*this))
2038 return std::move(Err);
2040 return AMDImage;
2043 /// Allocate memory on the device or related to the device.
2044 void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
2046 /// Deallocate memory on the device or related to the device.
2047 int free(void *TgtPtr, TargetAllocTy Kind) override {
2048 if (TgtPtr == nullptr)
2049 return OFFLOAD_SUCCESS;
2051 AMDGPUMemoryPoolTy *MemoryPool = nullptr;
2052 switch (Kind) {
2053 case TARGET_ALLOC_DEFAULT:
2054 case TARGET_ALLOC_DEVICE:
2055 MemoryPool = CoarseGrainedMemoryPools[0];
2056 break;
2057 case TARGET_ALLOC_HOST:
2058 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
2059 break;
2060 case TARGET_ALLOC_SHARED:
2061 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
2062 break;
2065 if (!MemoryPool) {
2066 REPORT("No memory pool for the specified allocation kind\n");
2067 return OFFLOAD_FAIL;
2070 if (Error Err = MemoryPool->deallocate(TgtPtr)) {
2071 REPORT("%s\n", toString(std::move(Err)).data());
2072 return OFFLOAD_FAIL;
2075 return OFFLOAD_SUCCESS;
2078 /// Synchronize current thread with the pending operations on the async info.
2079 Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
2080 AMDGPUStreamTy *Stream =
2081 reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
2082 assert(Stream && "Invalid stream");
2084 if (auto Err = Stream->synchronize())
2085 return Err;
2087 // Once the stream is synchronized, return it to the stream pool and reset
2088 // AsyncInfo. This is to make sure the synchronization only works for its
2089 // own tasks.
2090 AsyncInfo.Queue = nullptr;
2091 return AMDGPUStreamManager.returnResource(Stream);
2094 /// Query for the completion of the pending operations on the async info.
2095 Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
2096 AMDGPUStreamTy *Stream =
2097 reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
2098 assert(Stream && "Invalid stream");
2100 auto CompletedOrErr = Stream->query();
2101 if (!CompletedOrErr)
2102 return CompletedOrErr.takeError();
2104 // Return if the stream did not complete yet.
2105 if (!(*CompletedOrErr))
2106 return Plugin::success();
2108 // Once the stream has completed, return it to the stream pool and reset
2109 // AsyncInfo. This is to make sure the synchronization only works for its
2110 // own tasks.
2111 AsyncInfo.Queue = nullptr;
2112 return AMDGPUStreamManager.returnResource(Stream);
2115 /// Pin the host buffer and return the device pointer that should be used for
2116 /// device transfers.
2117 Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
2118 void *PinnedPtr = nullptr;
2120 hsa_status_t Status =
2121 hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr);
2122 if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
2123 return std::move(Err);
2125 return PinnedPtr;
2128 /// Unpin the host buffer.
2129 Error dataUnlockImpl(void *HstPtr) override {
2130 hsa_status_t Status = hsa_amd_memory_unlock(HstPtr);
2131 return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
2134 /// Check through the HSA runtime whether the \p HstPtr buffer is pinned.
2135 Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
2136 void *&BaseDevAccessiblePtr,
2137 size_t &BaseSize) const override {
2138 hsa_amd_pointer_info_t Info;
2139 Info.size = sizeof(hsa_amd_pointer_info_t);
2141 hsa_status_t Status =
2142 hsa_amd_pointer_info(HstPtr, &Info, /* Allocator */ nullptr,
2143 /* Number of accessible agents (out) */ nullptr,
2144 /* Accessible agents */ nullptr);
2145 if (auto Err = Plugin::check(Status, "Error in hsa_amd_pointer_info: %s"))
2146 return std::move(Err);
2148 // The buffer may be locked or allocated through HSA allocators. Assume that
2149 // the buffer is host pinned if the runtime reports an HSA type.
2150 if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED &&
2151 Info.type != HSA_EXT_POINTER_TYPE_HSA)
2152 return false;
2154 assert(Info.hostBaseAddress && "Invalid host pinned address");
2155 assert(Info.agentBaseAddress && "Invalid agent pinned address");
2156 assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size");
2158 // Save the allocation info in the output parameters.
2159 BaseHstPtr = Info.hostBaseAddress;
2160 BaseDevAccessiblePtr = Info.agentBaseAddress;
2161 BaseSize = Info.sizeInBytes;
2163 return true;
2166 /// Submit data to the device (host to device transfer).
2167 Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
2168 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2169 AMDGPUStreamTy *Stream = nullptr;
2170 void *PinnedPtr = nullptr;
2172 // Use one-step asynchronous operation when host memory is already pinned.
2173 if (void *PinnedPtr =
2174 PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
2175 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2176 return Err;
2177 return Stream->pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size);
2180 // For large transfers use synchronous behavior.
2181 if (Size >= OMPX_MaxAsyncCopyBytes) {
2182 if (AsyncInfoWrapper.hasQueue())
2183 if (auto Err = synchronize(AsyncInfoWrapper))
2184 return Err;
2186 hsa_status_t Status;
2187 Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
2188 &PinnedPtr);
2189 if (auto Err =
2190 Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
2191 return Err;
2193 AMDGPUSignalTy Signal;
2194 if (auto Err = Signal.init())
2195 return Err;
2197 Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedPtr, Agent, Size,
2198 0, nullptr, Signal.get());
2199 if (auto Err =
2200 Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
2201 return Err;
2203 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2204 return Err;
2206 if (auto Err = Signal.deinit())
2207 return Err;
2209 Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
2210 return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
2213 // Otherwise, use two-step copy with an intermediate pinned host buffer.
2214 AMDGPUMemoryManagerTy &PinnedMemoryManager =
2215 HostDevice.getPinnedMemoryManager();
2216 if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
2217 return Err;
2219 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2220 return Err;
2222 return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size,
2223 PinnedMemoryManager);
2226 /// Retrieve data from the device (device to host transfer).
2227 Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
2228 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2229 AMDGPUStreamTy *Stream = nullptr;
2230 void *PinnedPtr = nullptr;
2232 // Use one-step asynchronous operation when host memory is already pinned.
2233 if (void *PinnedPtr =
2234 PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
2235 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2236 return Err;
2238 return Stream->pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size);
2241 // For large transfers use synchronous behavior.
2242 if (Size >= OMPX_MaxAsyncCopyBytes) {
2243 if (AsyncInfoWrapper.hasQueue())
2244 if (auto Err = synchronize(AsyncInfoWrapper))
2245 return Err;
2247 hsa_status_t Status;
2248 Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
2249 &PinnedPtr);
2250 if (auto Err =
2251 Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
2252 return Err;
2254 AMDGPUSignalTy Signal;
2255 if (auto Err = Signal.init())
2256 return Err;
2258 Status = hsa_amd_memory_async_copy(PinnedPtr, Agent, TgtPtr, Agent, Size,
2259 0, nullptr, Signal.get());
2260 if (auto Err =
2261 Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
2262 return Err;
2264 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2265 return Err;
2267 if (auto Err = Signal.deinit())
2268 return Err;
2270 Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
2271 return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
2274 // Otherwise, use two-step copy with an intermediate pinned host buffer.
2275 AMDGPUMemoryManagerTy &PinnedMemoryManager =
2276 HostDevice.getPinnedMemoryManager();
2277 if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
2278 return Err;
2280 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2281 return Err;
2283 return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size,
2284 PinnedMemoryManager);
2287 /// Exchange data between two devices within the plugin.
2288 Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
2289 void *DstPtr, int64_t Size,
2290 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2291 AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
2293 AMDGPUStreamTy *Stream = nullptr;
2294 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2295 return Err;
2296 if (Size <= 0)
2297 return Plugin::success();
2299 return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr,
2300 getAgent(), (uint64_t)Size);
2303 /// Initialize the async info for interoperability purposes.
2304 Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2305 // TODO: Implement this function.
2306 return Plugin::success();
2309 /// Initialize the device info for interoperability purposes.
2310 Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
2311 DeviceInfo->Context = nullptr;
2313 if (!DeviceInfo->Device)
2314 DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle);
2316 return Plugin::success();
2319 /// Create an event.
2320 Error createEventImpl(void **EventPtrStorage) override {
2321 AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
2322 return AMDGPUEventManager.getResource(*Event);
2325 /// Destroy a previously created event.
2326 Error destroyEventImpl(void *EventPtr) override {
2327 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2328 return AMDGPUEventManager.returnResource(Event);
2331 /// Record the event.
2332 Error recordEventImpl(void *EventPtr,
2333 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2334 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2335 assert(Event && "Invalid event");
2337 AMDGPUStreamTy *Stream = nullptr;
2338 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2339 return Err;
2341 return Event->record(*Stream);
2344 /// Make the stream wait on the event.
2345 Error waitEventImpl(void *EventPtr,
2346 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2347 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2349 AMDGPUStreamTy *Stream = nullptr;
2350 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2351 return Err;
2353 return Event->wait(*Stream);
2356 /// Synchronize the current thread with the event.
2357 Error syncEventImpl(void *EventPtr) override {
2358 return Plugin::error("Synchronize event not implemented");
2361 /// Print information about the device.
2362 Error obtainInfoImpl(InfoQueueTy &Info) override {
2363 char TmpChar[1000];
2364 const char *TmpCharPtr = "Unknown";
2365 uint16_t Major, Minor;
2366 uint32_t TmpUInt, TmpUInt2;
2367 uint32_t CacheSize[4];
2368 size_t TmpSt;
2369 bool TmpBool;
2370 uint16_t WorkgrpMaxDim[3];
2371 hsa_dim3_t GridMaxDim;
2372 hsa_status_t Status, Status2;
2374 Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major);
2375 Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
2376 if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS)
2377 Info.add("HSA Runtime Version",
2378 std::to_string(Major) + "." + std::to_string(Minor));
2380 Info.add("HSA OpenMP Device Number", DeviceId);
2382 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
2383 if (Status == HSA_STATUS_SUCCESS)
2384 Info.add("Product Name", TmpChar);
2386 Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
2387 if (Status == HSA_STATUS_SUCCESS)
2388 Info.add("Device Name", TmpChar);
2390 Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar);
2391 if (Status == HSA_STATUS_SUCCESS)
2392 Info.add("Vendor Name", TmpChar);
2394 hsa_device_type_t DevType;
2395 Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
2396 if (Status == HSA_STATUS_SUCCESS) {
2397 switch (DevType) {
2398 case HSA_DEVICE_TYPE_CPU:
2399 TmpCharPtr = "CPU";
2400 break;
2401 case HSA_DEVICE_TYPE_GPU:
2402 TmpCharPtr = "GPU";
2403 break;
2404 case HSA_DEVICE_TYPE_DSP:
2405 TmpCharPtr = "DSP";
2406 break;
2408 Info.add("Device Type", TmpCharPtr);
2411 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt);
2412 if (Status == HSA_STATUS_SUCCESS)
2413 Info.add("Max Queues", TmpUInt);
2415 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt);
2416 if (Status == HSA_STATUS_SUCCESS)
2417 Info.add("Queue Min Size", TmpUInt);
2419 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt);
2420 if (Status == HSA_STATUS_SUCCESS)
2421 Info.add("Queue Max Size", TmpUInt);
2423 // FIXME: This is deprecated according to HSA documentation. But using
2424 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
2425 // runtime.
2426 Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize);
2427 if (Status == HSA_STATUS_SUCCESS) {
2428 Info.add("Cache");
2430 for (int I = 0; I < 4; I++)
2431 if (CacheSize[I])
2432 Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]);
2435 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt);
2436 if (Status == HSA_STATUS_SUCCESS)
2437 Info.add("Cacheline Size", TmpUInt);
2439 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
2440 if (Status == HSA_STATUS_SUCCESS)
2441 Info.add("Max Clock Freq", TmpUInt, "MHz");
2443 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
2444 if (Status == HSA_STATUS_SUCCESS)
2445 Info.add("Compute Units", TmpUInt);
2447 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
2448 if (Status == HSA_STATUS_SUCCESS)
2449 Info.add("SIMD per CU", TmpUInt);
2451 Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool);
2452 if (Status == HSA_STATUS_SUCCESS)
2453 Info.add("Fast F16 Operation", TmpBool);
2455 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2);
2456 if (Status == HSA_STATUS_SUCCESS)
2457 Info.add("Wavefront Size", TmpUInt2);
2459 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt);
2460 if (Status == HSA_STATUS_SUCCESS)
2461 Info.add("Workgroup Max Size", TmpUInt);
2463 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
2464 if (Status == HSA_STATUS_SUCCESS) {
2465 Info.add("Workgroup Max Size per Dimension");
2466 Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]);
2467 Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]);
2468 Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]);
2471 Status = getDeviceAttrRaw(
2472 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt);
2473 if (Status == HSA_STATUS_SUCCESS) {
2474 Info.add("Max Waves Per CU", TmpUInt);
2475 Info.add("Max Work-item Per CU", TmpUInt * TmpUInt2);
2478 Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
2479 if (Status == HSA_STATUS_SUCCESS)
2480 Info.add("Grid Max Size", TmpUInt);
2482 Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
2483 if (Status == HSA_STATUS_SUCCESS) {
2484 Info.add("Grid Max Size per Dimension");
2485 Info.add<InfoLevel2>("x", GridMaxDim.x);
2486 Info.add<InfoLevel2>("y", GridMaxDim.y);
2487 Info.add<InfoLevel2>("z", GridMaxDim.z);
2490 Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt);
2491 if (Status == HSA_STATUS_SUCCESS)
2492 Info.add("Max fbarriers/Workgrp", TmpUInt);
2494 Info.add("Memory Pools");
2495 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
2496 std::string TmpStr, TmpStr2;
2498 if (Pool->isGlobal())
2499 TmpStr = "Global";
2500 else if (Pool->isReadOnly())
2501 TmpStr = "ReadOnly";
2502 else if (Pool->isPrivate())
2503 TmpStr = "Private";
2504 else if (Pool->isGroup())
2505 TmpStr = "Group";
2506 else
2507 TmpStr = "Unknown";
2509 Info.add<InfoLevel2>(std::string("Pool ") + TmpStr);
2511 if (Pool->isGlobal()) {
2512 if (Pool->isFineGrained())
2513 TmpStr2 += "Fine Grained ";
2514 if (Pool->isCoarseGrained())
2515 TmpStr2 += "Coarse Grained ";
2516 if (Pool->supportsKernelArgs())
2517 TmpStr2 += "Kernarg ";
2519 Info.add<InfoLevel3>("Flags", TmpStr2);
2522 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
2523 if (Status == HSA_STATUS_SUCCESS)
2524 Info.add<InfoLevel3>("Size", TmpSt, "bytes");
2526 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
2527 TmpBool);
2528 if (Status == HSA_STATUS_SUCCESS)
2529 Info.add<InfoLevel3>("Allocatable", TmpBool);
2531 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
2532 TmpSt);
2533 if (Status == HSA_STATUS_SUCCESS)
2534 Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes");
2536 Status = Pool->getAttrRaw(
2537 HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt);
2538 if (Status == HSA_STATUS_SUCCESS)
2539 Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes");
2541 Status =
2542 Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
2543 if (Status == HSA_STATUS_SUCCESS)
2544 Info.add<InfoLevel3>("Accessable by all", TmpBool);
2547 Info.add("ISAs");
2548 auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
2549 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
2550 if (Status == HSA_STATUS_SUCCESS)
2551 Info.add<InfoLevel2>("Name", TmpChar);
2553 return Status;
2556 // Silently consume the error.
2557 if (Err)
2558 consumeError(std::move(Err));
2560 return Plugin::success();
2563 /// Getters and setters for stack and heap sizes.
2564 Error getDeviceStackSize(uint64_t &Value) override {
2565 Value = 0;
2566 return Plugin::success();
2568 Error setDeviceStackSize(uint64_t Value) override {
2569 return Plugin::success();
2571 Error getDeviceHeapSize(uint64_t &Value) override {
2572 Value = DeviceMemoryPoolSize;
2573 return Plugin::success();
2575 Error setDeviceHeapSize(uint64_t Value) override {
2576 for (DeviceImageTy *Image : LoadedImages)
2577 if (auto Err = setupDeviceMemoryPool(Plugin::get(), *Image, Value))
2578 return Err;
2579 DeviceMemoryPoolSize = Value;
2580 return Plugin::success();
2582 Error getDeviceMemorySize(uint64_t &Value) override {
2583 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
2584 if (Pool->isGlobal()) {
2585 hsa_status_t Status =
2586 Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
2587 return Plugin::check(Status, "Error in getting device memory size: %s");
2590 return Plugin::error("getDeviceMemorySize:: no global pool");
2593 /// AMDGPU-specific function to get device attributes.
2594 template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
2595 hsa_status_t Status =
2596 hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
2597 return Plugin::check(Status, "Error in hsa_agent_get_info: %s");
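/// Raw variant of getDeviceAttr that returns the HSA status code instead of
/// wrapping it in an Error.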
2600 template <typename Ty>
2601 hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) {
2602 return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
2605 /// Get the device agent.
2606 hsa_agent_t getAgent() const override { return Agent; }
2608 /// Get the signal manager.
2609 AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; }
2611 /// Retrieve and construct all memory pools of the device agent.
2612 Error retrieveAllMemoryPools() override {
2613 // Iterate through the available pools of the device agent.
2614 return utils::iterateAgentMemoryPools(
2615 Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
2616 AMDGPUMemoryPoolTy *MemoryPool =
2617 Plugin::get().allocate<AMDGPUMemoryPoolTy>();
2618 new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool);
2619 AllMemoryPools.push_back(MemoryPool);
2620 return HSA_STATUS_SUCCESS;
2624 private:
2625 using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
2626 using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
2628 /// Envar for controlling the number of HSA queues per device. A high number
2629 /// of queues may degrade performance.
2630 UInt32Envar OMPX_NumQueues;
2632 /// Envar for controlling the size of each HSA queue. The size is the number
2633 /// of HSA packets a queue is expected to hold. It is also the number of HSA
2634 /// packets that can be pushed into each queue without waiting for the driver to
2635 /// process them.
2636 UInt32Envar OMPX_QueueSize;
2638 /// Envar for controlling the default number of teams relative to the number
2639 /// of compute units (CUs) the device has:
2640 /// #default_teams = OMPX_DefaultTeamsPerCU * #CUs.
2641 UInt32Envar OMPX_DefaultTeamsPerCU;
2643 /// Envar specifying the maximum size in bytes where the memory copies are
2644 /// asynchronous operations. Up to this transfer size, the memory copies are
2645 /// asynchronous operations pushed to the corresponding stream. For larger
2646 /// transfers, they are synchronous transfers.
2647 UInt32Envar OMPX_MaxAsyncCopyBytes;
2649 /// Envar controlling the initial number of HSA signals per device. There is
2650 /// one manager of signals per device managing several pre-allocated signals.
2651 /// These signals are mainly used by AMDGPU streams. If needed, more signals
2652 /// will be created.
2653 UInt32Envar OMPX_InitialNumSignals;
2655 /// Envar to set the time to wait in active state before switching to the
2656 /// blocked state. The default of 2000000 busy-waits for 2 seconds before
2657 /// going into a blocking HSA wait state. The unit for this variable is
2658 /// microseconds.
2659 UInt32Envar OMPX_StreamBusyWait;
2661 /// Stream manager for AMDGPU streams.
2662 AMDGPUStreamManagerTy AMDGPUStreamManager;
2664 /// Event manager for AMDGPU events.
2665 AMDGPUEventManagerTy AMDGPUEventManager;
2667 /// Signal manager for AMDGPU signals.
2668 AMDGPUSignalManagerTy AMDGPUSignalManager;
2670 /// The agent handler corresponding to the device.
2671 hsa_agent_t Agent;
2673 /// The GPU architecture.
2674 std::string ComputeUnitKind;
2676 /// The frequency of the steady clock inside the device.
2677 uint64_t ClockFrequency;
2679 /// The total number of concurrent work items that can be running on the GPU.
2680 uint64_t HardwareParallelism;
2682 /// Reference to the host device.
2683 AMDHostDeviceTy &HostDevice;
2685 /// The current size of the global device memory pool (managed by us).
2686 uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
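/// Load the image's code object into an HSA executable, freeze and validate
/// it, and read the kernel metadata from the ELF image.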
2689 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
2690 hsa_status_t Status;
2691 Status = hsa_code_object_deserialize(getStart(), getSize(), "", &CodeObject);
2692 if (auto Err =
2693 Plugin::check(Status, "Error in hsa_code_object_deserialize: %s"))
2694 return Err;
2696 Status = hsa_executable_create_alt(
2697 HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", &Executable);
2698 if (auto Err =
2699 Plugin::check(Status, "Error in hsa_executable_create_alt: %s"))
2700 return Err;
2702 Status = hsa_executable_load_code_object(Executable, Device.getAgent(),
2703 CodeObject, "");
2704 if (auto Err =
2705 Plugin::check(Status, "Error in hsa_executable_load_code_object: %s"))
2706 return Err;
2708 Status = hsa_executable_freeze(Executable, "");
2709 if (auto Err = Plugin::check(Status, "Error in hsa_executable_freeze: %s"))
2710 return Err;
2712 uint32_t Result;
2713 Status = hsa_executable_validate(Executable, &Result);
2714 if (auto Err = Plugin::check(Status, "Error in hsa_executable_validate: %s"))
2715 return Err;
2717 if (Result)
2718 return Plugin::error("Loaded HSA executable does not validate");
2720 if (auto Err = utils::readAMDGPUMetaDataFromImage(
2721 getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
2722 return Err;
2724 return Plugin::success();
2727 Expected<hsa_executable_symbol_t>
2728 AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device,
2729 StringRef SymbolName) const {
2731 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
2732 hsa_agent_t Agent = AMDGPUDevice.getAgent();
2734 hsa_executable_symbol_t Symbol;
2735 hsa_status_t Status = hsa_executable_get_symbol_by_name(
2736 Executable, SymbolName.data(), &Agent, &Symbol);
2737 if (auto Err = Plugin::check(
2738 Status, "Error in hsa_executable_get_symbol_by_name(%s): %s",
2739 SymbolName.data()))
2740 return std::move(Err);
2742 return Symbol;
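/// Create and initialize the underlying resource for the given AMDGPU device.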
2745 template <typename ResourceTy>
2746 Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) {
2747 if (Resource)
2748 return Plugin::error("Creating an existing resource");
2750 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
2752 Resource = new ResourceTy(AMDGPUDevice);
2754 return Resource->init();
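/// Construct a stream bound to the device's agent and signal manager. The
/// backing HSA queue is assigned later by the stream manager.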
2757 AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
2758 : Agent(Device.getAgent()), Queue(nullptr),
2759 SignalManager(Device.getSignalManager()), Device(Device),
2760 // Initialize the std::deque with some empty positions.
2761 Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
2762 StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
2764 /// Class implementing the AMDGPU-specific functionalities of the global
2765 /// handler.
2766 struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy {
2767 /// Get the metadata of a global from the device. The name and size of the
2768 /// global are read from DeviceGlobal and the address of the global is written
2769 /// to DeviceGlobal.
2770 Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
2771 DeviceImageTy &Image,
2772 GlobalTy &DeviceGlobal) override {
2773 AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
2775 // Find the symbol on the device executable.
2776 auto SymbolOrErr =
2777 AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName());
2778 if (!SymbolOrErr)
2779 return SymbolOrErr.takeError();
2781 hsa_executable_symbol_t Symbol = *SymbolOrErr;
2782 hsa_symbol_kind_t SymbolType;
2783 hsa_status_t Status;
2784 uint64_t SymbolAddr;
2785 uint32_t SymbolSize;
2787 // Retrieve the type, address and size of the symbol.
2788 std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
2789 {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
2790 {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr},
2791 {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}};
2793 for (auto &Info : RequiredInfos) {
2794 Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
2795 if (auto Err = Plugin::check(
2796 Status, "Error in hsa_executable_symbol_get_info: %s"))
2797 return Err;
2800 // Check the size of the symbol.
2801 if (SymbolSize != DeviceGlobal.getSize())
2802 return Plugin::error(
2803 "Failed to load global '%s' due to size mismatch (%zu != %zu)",
2804 DeviceGlobal.getName().data(), SymbolSize,
2805 (size_t)DeviceGlobal.getSize());
2807 // Store the symbol address on the device global metadata.
2808 DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr));
2810 return Plugin::success();
2814 /// Class implementing the AMDGPU-specific functionalities of the plugin.
2815 struct AMDGPUPluginTy final : public GenericPluginTy {
2816 /// Create an AMDGPU plugin and initialize the AMDGPU driver.
2817 AMDGPUPluginTy()
2818 : GenericPluginTy(getTripleArch()), Initialized(false),
2819 HostDevice(nullptr) {}
2821 /// This class should not be copied.
2822 AMDGPUPluginTy(const AMDGPUPluginTy &) = delete;
2823 AMDGPUPluginTy(AMDGPUPluginTy &&) = delete;
2825 /// Initialize the plugin and return the number of devices.
2826 Expected<int32_t> initImpl() override {
2827 hsa_status_t Status = hsa_init();
2828 if (Status != HSA_STATUS_SUCCESS) {
2829 // hsa_init failed, so we cannot call hsa_status_string to describe the error.
2830 DP("Failed to initialize AMDGPU's HSA library\n");
2831 return 0;
2834 // The initialization of HSA was successful. It should be safe to call
2835 // HSA functions from now on, e.g., hsa_shut_down.
2836 Initialized = true;
2838 #ifdef OMPT_SUPPORT
2839 ompt::connectLibrary();
2840 #endif
2842 // Register event handler to detect memory errors on the devices.
2843 Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
2844 if (auto Err = Plugin::check(
2845 Status, "Error in hsa_amd_register_system_event_handler: %s"))
2846 return std::move(Err);
2848 // List of host (CPU) agents.
2849 llvm::SmallVector<hsa_agent_t> HostAgents;
2851 // Count the number of available agents.
2852 auto Err = utils::iterateAgents([&](hsa_agent_t Agent) {
2853 // Get the device type of the agent.
2854 hsa_device_type_t DeviceType;
2855 hsa_status_t Status =
2856 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
2857 if (Status != HSA_STATUS_SUCCESS)
2858 return Status;
2860 // Classify the agents into kernel (GPU) and host (CPU) agents.
2861 if (DeviceType == HSA_DEVICE_TYPE_GPU) {
2862 // Ensure that the GPU agent supports kernel dispatch packets.
2863 hsa_agent_feature_t Features;
2864 Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
2865 if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
2866 KernelAgents.push_back(Agent);
2867 } else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
2868 HostAgents.push_back(Agent);
2870 return HSA_STATUS_SUCCESS;
2873 if (Err)
2874 return std::move(Err);
2876 int32_t NumDevices = KernelAgents.size();
2877 if (NumDevices == 0) {
2878 // Do not initialize if there are no devices.
2879 DP("There are no devices supporting AMDGPU.\n");
2880 return 0;
2883 // There are kernel agents but there is no host agent. That should be
2884 // treated as an error.
2885 if (HostAgents.empty())
2886 return Plugin::error("No AMDGPU host agents");
2888 // Initialize the host device using host agents.
2889 HostDevice = allocate<AMDHostDeviceTy>();
2890 new (HostDevice) AMDHostDeviceTy(HostAgents);
2892 // Set up the memory pools available to the host.
2893 if (auto Err = HostDevice->init())
2894 return std::move(Err);
2896 return NumDevices;
2899 /// Deinitialize the plugin.
2900 Error deinitImpl() override {
2901 // The HSA runtime was not initialized, so nothing from the plugin was
2902 // actually initialized.
2903 if (!Initialized)
2904 return Plugin::success();
2906 if (HostDevice)
2907 if (auto Err = HostDevice->deinit())
2908 return Err;
2910 // Finalize the HSA runtime.
2911 hsa_status_t Status = hsa_shut_down();
2912 return Plugin::check(Status, "Error in hsa_shut_down: %s");
2915 Triple::ArchType getTripleArch() const override { return Triple::amdgcn; }
2917 /// Get the ELF code for recognizing the compatible image binary.
2918 uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; }
2920 /// Check whether the image is compatible with an AMDGPU device.
2921 Expected<bool> isImageCompatible(__tgt_image_info *Info) const override {
2922 for (hsa_agent_t Agent : KernelAgents) {
2923 std::string Target;
2924 auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
2925 uint32_t Length;
2926 hsa_status_t Status;
2927 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
2928 if (Status != HSA_STATUS_SUCCESS)
2929 return Status;
2931 llvm::SmallVector<char> ISAName(Length);
2932 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
2933 if (Status != HSA_STATUS_SUCCESS)
2934 return Status;
2936 llvm::StringRef TripleTarget(ISAName.begin(), Length);
2937 if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
2938 Target = TripleTarget.ltrim('-').rtrim('\0').str();
2939 return HSA_STATUS_SUCCESS;
2941 if (Err)
2942 return std::move(Err);
2944 if (!utils::isImageCompatibleWithEnv(Info, Target))
2945 return false;
2947 return true;
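/// Indicate that direct device-to-device data exchange is supported between
/// any pair of AMDGPU devices handled by this plugin.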
2950 bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
2951 return true;
2954 /// Get the host device instance.
2955 AMDHostDeviceTy &getHostDevice() {
2956 assert(HostDevice && "Host device not initialized");
2957 return *HostDevice;
2960 /// Get the kernel agent with the corresponding agent id.
2961 hsa_agent_t getKernelAgent(int32_t AgentId) const {
2962 assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id");
2963 return KernelAgents[AgentId];
2966 /// Get the list of the available kernel agents.
2967 const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const {
2968 return KernelAgents;
2971 private:
2972 /// Event handler that will be called by ROCr if an event is detected.
2973 static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
2974 if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
2975 return HSA_STATUS_SUCCESS;
2977 SmallVector<std::string> Reasons;
2978 uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask;
2979 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT)
2980 Reasons.emplace_back("Page not present or supervisor privilege");
2981 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY)
2982 Reasons.emplace_back("Write access to a read-only page");
2983 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX)
2984 Reasons.emplace_back("Execute access to a page marked NX");
2985 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY)
2986 Reasons.emplace_back("GPU attempted access to a host only page");
2987 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC)
2988 Reasons.emplace_back("DRAM ECC failure");
2989 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE)
2990 Reasons.emplace_back("Can't determine the exact fault address");
2991 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC)
2992 Reasons.emplace_back("SRAM ECC failure (ie registers, no fault address)");
2993 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG)
2994 Reasons.emplace_back("GPU reset following unspecified hang");
2996 // If we do not know the reason, report it as unknown.
2998 if (Reasons.empty())
2999 Reasons.emplace_back("Unknown (" + std::to_string(ReasonsMask) + ")");
3001 uint32_t Node = -1;
3002 hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
3004 // Abort the execution since we do not recover from this error.
3005 FATAL_MESSAGE(1,
3006 "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
3007 ") at virtual address %p. Reasons: %s",
3008 Node, Event->memory_fault.agent.handle,
3009 (void *)Event->memory_fault.virtual_address,
3010 llvm::join(Reasons, ", ").c_str());
3012 return HSA_STATUS_ERROR;
3015 /// Indicate whether the HSA runtime was correctly initialized. Even if there
3016 /// are no available devices this boolean will be true. It indicates whether
3017 /// we can safely call HSA functions (e.g., hsa_shut_down).
3018 bool Initialized;
3020 /// Arrays of the available GPU and CPU agents. These arrays of handles should
3021 /// not be here but in the AMDGPUDeviceTy structures directly. However, the
3022 /// HSA standard does not provide API functions to retrieve agents directly,
3023 /// only iterating functions. We cache the agents here for convenience.
3024 llvm::SmallVector<hsa_agent_t> KernelAgents;
3026 /// The device representing all HSA host agents.
3027 AMDHostDeviceTy *HostDevice;
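/// Launch a kernel: allocate the kernel argument buffer from the host args
/// memory pool, copy the explicit arguments and initialize the implicit ones,
/// then push the dispatch onto a stream.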
3030 Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3031 uint32_t NumThreads, uint64_t NumBlocks,
3032 KernelArgsTy &KernelArgs, void *Args,
3033 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
3034 const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *);
3036 if (ArgsSize < KernelArgsSize)
3037 return Plugin::error("Mismatch of kernel arguments size");
3039 // The args size reported by HSA may or may not contain the implicit args.
3040 // For now, assume that HSA does not consider the implicit arguments when
3041 // reporting the arguments of a kernel. In the worst case, we can waste
3042 // 56 bytes per allocation.
3043 uint32_t AllArgsSize = KernelArgsSize + ImplicitArgsSize;
3045 AMDHostDeviceTy &HostDevice = Plugin::get<AMDGPUPluginTy>().getHostDevice();
3046 AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager();
3048 void *AllArgs = nullptr;
3049 if (auto Err = ArgsMemoryManager.allocate(AllArgsSize, &AllArgs))
3050 return Err;
3052 // Account for user requested dynamic shared memory.
3053 uint32_t GroupSize = getGroupSize();
3054 if (uint32_t MaxDynCGroupMem = std::max(
3055 KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
3056 GroupSize += MaxDynCGroupMem;
3059 // Initialize implicit arguments.
3060 utils::AMDGPUImplicitArgsTy *ImplArgs =
3061 reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
3062 advanceVoidPtr(AllArgs, KernelArgsSize));
3064 // Initialize the implicit arguments to zero.
3065 std::memset(ImplArgs, 0, ImplicitArgsSize);
3067 // Copy the explicit arguments.
3068 // TODO: We should expose the args memory manager alloc to the common part as
3069 // alternative to copying them twice.
3070 if (KernelArgs.NumArgs)
3071 std::memcpy(AllArgs, *static_cast<void **>(Args),
3072 sizeof(void *) * KernelArgs.NumArgs);
3074 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
3076 AMDGPUStreamTy *Stream = nullptr;
3077 if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
3078 return Err;
3080 // If this kernel requires an RPC server we attach its pointer to the stream.
3081 if (GenericDevice.getRPCServer())
3082 Stream->setRPCServer(GenericDevice.getRPCServer());
3084 // Only COV5 implicit args need to be set; COV4 implicit args are not used.
3085 if (getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) {
3086 ImplArgs->BlockCountX = NumBlocks;
3087 ImplArgs->GroupSizeX = NumThreads;
3088 ImplArgs->GroupSizeY = 1;
3089 ImplArgs->GroupSizeZ = 1;
3090 ImplArgs->GridDims = 1;
3093 // Push the kernel launch into the stream.
3094 return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
3095 GroupSize, ArgsMemoryManager);
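/// Print a summary of the kernel launch (argument count, teams x threads, LDS
/// usage, and register counts) when plugin kernel info output is requested.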
3098 Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
3099 KernelArgsTy &KernelArgs,
3100 uint32_t NumThreads,
3101 uint64_t NumBlocks) const {
3102 // Only do all this when the output is requested
3103 if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
3104 return Plugin::success();
3106 // We don't have data to print additional info, but this is not a hard error.
3107 if (!KernelInfo.has_value())
3108 return Plugin::success();
3110 // General Info
3111 auto NumGroups = NumBlocks;
3112 auto ThreadsPerGroup = NumThreads;
3114 // Kernel Arguments Info
3115 auto ArgNum = KernelArgs.NumArgs;
3116 auto LoopTripCount = KernelArgs.Tripcount;
3118 // Details for AMDGPU kernels (read from image)
3119 // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata
3120 auto GroupSegmentSize = (*KernelInfo).GroupSegmentList;
3121 auto SGPRCount = (*KernelInfo).SGPRCount;
3122 auto VGPRCount = (*KernelInfo).VGPRCount;
3123 auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount;
3124 auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
3125 auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
3127   // Prints additional launch info that contains the following:
3128   //   Num Args: the number of kernel arguments
3129   //   Teams x Thrds: the number of teams and the number of threads
3130   //     actually running
3131   //   MaxFlatWorkgroupSize: the maximum flat work-group size supported by
3132   //     the kernel, in work-items
3133   //   LDS Usage: the number of bytes used in LDS storage
3134   //   S/VGPR Count: the number of SGPRs/VGPRs occupied by the kernel
3135   //   S/VGPR Spill Count: the number of SGPRs/VGPRs spilled by the kernel
3136   //   Tripcount: the loop trip count for the kernel
3137 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
3138 "#Args: %d Teams x Thrds: %4lux%4u (MaxFlatWorkGroupSize: %u) LDS "
3139 "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
3140 "%lu\n",
3141 ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize,
3142 GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount,
3143 LoopTripCount);
3145   return Plugin::success();
3146 }
3148 GenericPluginTy *Plugin::createPlugin() { return new AMDGPUPluginTy(); }
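3149 // Create an AMDGPU device for the given device identifier.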
3150 GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {
3151 AMDGPUPluginTy &Plugin = get<AMDGPUPluginTy &>();
3152 return new AMDGPUDeviceTy(DeviceId, NumDevices, Plugin.getHostDevice(),
3153                             Plugin.getKernelAgent(DeviceId));
3154 }
3156 GenericGlobalHandlerTy *Plugin::createGlobalHandler() {
3157   return new AMDGPUGlobalHandlerTy();
3158 }
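3159 // Convert an HSA status code into an error with a readable description.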
3160 template <typename... ArgsTy>
3161 Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
3162 hsa_status_t ResultCode = static_cast<hsa_status_t>(Code);
3163 if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK)
3164 return Error::success();
3166 const char *Desc = "Unknown error";
3167 hsa_status_t Ret = hsa_status_string(ResultCode, &Desc);
3168 if (Ret != HSA_STATUS_SUCCESS)
3169 REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
3171 return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
3172                                                    ErrFmt, Args..., Desc);
3173 }
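3174 // Allocate memory from the manager's pool and expose it to all kernel agents.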
3175 void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
3176 TargetAllocTy Kind) {
3177 // Allocate memory from the pool.
3178 void *Ptr = nullptr;
3179 if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
3180 consumeError(std::move(Err));
3181     return nullptr;
3182   }
3183 assert(Ptr && "Invalid pointer");
3185 auto &KernelAgents = Plugin::get<AMDGPUPluginTy>().getKernelAgents();
3187 // Allow all kernel agents to access the allocation.
3188 if (auto Err = MemoryPool->enableAccess(Ptr, Size, KernelAgents)) {
3189 REPORT("%s\n", toString(std::move(Err)).data());
3190     return nullptr;
3191   }
3192   return Ptr;
3193 }
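3194 // Allocate device, host, or shared memory depending on the allocation kind.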
3195 void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
3196 if (Size == 0)
3197 return nullptr;
3199 // Find the correct memory pool.
3200 AMDGPUMemoryPoolTy *MemoryPool = nullptr;
3201 switch (Kind) {
3202 case TARGET_ALLOC_DEFAULT:
3203 case TARGET_ALLOC_DEVICE:
3204 MemoryPool = CoarseGrainedMemoryPools[0];
3205 break;
3206 case TARGET_ALLOC_HOST:
3207 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
3208 break;
3209 case TARGET_ALLOC_SHARED:
3210 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
3211     break;
3212   }
3214 if (!MemoryPool) {
3215 REPORT("No memory pool for the specified allocation kind\n");
3216     return nullptr;
3217   }
3219 // Allocate from the corresponding memory pool.
3220 void *Alloc = nullptr;
3221 if (Error Err = MemoryPool->allocate(Size, &Alloc)) {
3222 REPORT("%s\n", toString(std::move(Err)).data());
3223     return nullptr;
3224   }
3226 if (Alloc) {
3227 auto &KernelAgents = Plugin::get<AMDGPUPluginTy>().getKernelAgents();
3228     // Agent access is inherently necessary for host or shared allocations.
3229     // It is also enabled for device memory to allow device-to-device memcpy.
3231 // Enable all kernel agents to access the buffer.
3232 if (auto Err = MemoryPool->enableAccess(Alloc, Size, KernelAgents)) {
3233 REPORT("%s\n", toString(std::move(Err)).data());
3234       return nullptr;
3235     }
3236   }
3238   return Alloc;
3239 }
3241 } // namespace plugin
3242 } // namespace target
3243 } // namespace omp
3244 } // namespace llvm