//===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL NextGen for AMDGPU machine
//
//===----------------------------------------------------------------------===//

#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstring>
#include <deque>
#include <mutex>
#include <string>
#include <system_error>
#include <unordered_map>

#include "Environment.h"
#include "GlobalHandler.h"
#include "OmptCallback.h"
#include "PluginInterface.h"
#include "Utilities.h"
#include "UtilitiesRTL.h"
#include "omptarget.h"

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/raw_ostream.h"

#if defined(__has_include)
#if __has_include("hsa/hsa.h")
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#elif __has_include("hsa.h")
#include "hsa.h"
#include "hsa_ext_amd.h"
#endif
#else
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#endif
namespace llvm {
namespace omp {
namespace target {
namespace plugin {

/// Forward declarations for all specialized data structures.
struct AMDGPUKernelTy;
struct AMDGPUDeviceTy;
struct AMDGPUPluginTy;
struct AMDGPUStreamTy;
struct AMDGPUEventTy;
struct AMDGPUStreamManagerTy;
struct AMDGPUEventManagerTy;
struct AMDGPUDeviceImageTy;
struct AMDGPUMemoryManagerTy;
struct AMDGPUMemoryPoolTy;
/// Iterate elements using an HSA iterate function. Do not use this function
/// directly but the specialized ones below instead.
template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
  auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
    CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
    return (*Unwrapped)(Elem);
  };
  return Func(L, static_cast<void *>(&Cb));
}
/// Iterate elements using an HSA iterate function passing a parameter. Do not
/// use this function directly but the specialized ones below instead.
template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy,
          typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
  auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
    CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
    return (*Unwrapped)(Elem);
  };
  return Func(FuncArg, L, static_cast<void *>(&Cb));
}
/// Iterate elements using an HSA iterate function passing a parameter. Do not
/// use this function directly but the specialized ones below instead.
template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy,
          typename IterFuncArgTy, typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
  auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t {
    CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
    return (*Unwrapped)(Elem1, Elem2);
  };
  return Func(FuncArg, L, static_cast<void *>(&Cb));
}
/// Iterate agents.
template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) {
  hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback);
  return Plugin::check(Status, "Error in hsa_iterate_agents: %s");
}

/// Iterate ISAs of an agent.
template <typename CallbackTy>
Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) {
  hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb);
  return Plugin::check(Status, "Error in hsa_agent_iterate_isas: %s");
}

/// Iterate memory pools of an agent.
template <typename CallbackTy>
Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
  hsa_status_t Status = iterate<hsa_amd_memory_pool_t>(
      hsa_amd_agent_iterate_memory_pools, Agent, Cb);
  return Plugin::check(Status,
                       "Error in hsa_amd_agent_iterate_memory_pools: %s");
}
/// Utility class representing generic resource references to AMDGPU resources.
template <typename ResourceTy>
struct AMDGPUResourceRef : public GenericDeviceResourceRef {
  /// The underlying handle type for resources.
  using HandleTy = ResourceTy *;

  /// Create an empty reference to an invalid resource.
  AMDGPUResourceRef() : Resource(nullptr) {}

  /// Create a reference to an existing resource.
  AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {}

  virtual ~AMDGPUResourceRef() {}

  /// Create a new resource and save the reference. The reference must be empty
  /// before calling this function.
  Error create(GenericDeviceTy &Device) override;

  /// Destroy the referenced resource and invalidate the reference. The
  /// reference must be to a valid resource before calling this function.
  Error destroy(GenericDeviceTy &Device) override {
    if (!Resource)
      return Plugin::error("Destroying an invalid resource");

    if (auto Err = Resource->deinit())
      return Err;

    delete Resource;
    Resource = nullptr;

    return Plugin::success();
  }

  /// Get the underlying resource handle.
  operator HandleTy() const { return Resource; }

private:
  /// The handle to the actual resource.
  HandleTy Resource;
};
/// Class holding an HSA memory pool.
struct AMDGPUMemoryPoolTy {
  /// Create a memory pool from an HSA memory pool.
  AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool)
      : MemoryPool(MemoryPool), GlobalFlags(0) {}

  /// Initialize the memory pool retrieving its properties.
  Error init() {
    if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment))
      return Err;

    if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
      return Err;

    return Plugin::success();
  }

  /// Getter of the HSA memory pool.
  hsa_amd_memory_pool_t get() const { return MemoryPool; }

  /// Indicate the segment to which the memory pool belongs.
  bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); }
  bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); }
  bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); }
  bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); }

  /// Indicate if it is fine-grained memory. Valid only for global.
  bool isFineGrained() const {
    assert(isGlobal() && "Not global memory");
    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED);
  }

  /// Indicate if it is coarse-grained memory. Valid only for global.
  bool isCoarseGrained() const {
    assert(isGlobal() && "Not global memory");
    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED);
  }

  /// Indicate if it supports storing kernel arguments. Valid only for global.
  bool supportsKernelArgs() const {
    assert(isGlobal() && "Not global memory");
    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT);
  }

  /// Allocate memory on the memory pool.
  Error allocate(size_t Size, void **PtrStorage) {
    hsa_status_t Status =
        hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage);
    return Plugin::check(Status, "Error in hsa_amd_memory_pool_allocate: %s");
  }

  /// Return memory to the memory pool.
  Error deallocate(void *Ptr) {
    hsa_status_t Status = hsa_amd_memory_pool_free(Ptr);
    return Plugin::check(Status, "Error in hsa_amd_memory_pool_free: %s");
  }

  /// Allow the device to access a specific allocation.
  Error enableAccess(void *Ptr, int64_t Size,
                     const llvm::SmallVector<hsa_agent_t> &Agents) const {
#ifdef OMPTARGET_DEBUG
    for (hsa_agent_t Agent : Agents) {
      hsa_amd_memory_pool_access_t Access;
      if (auto Err =
              getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access))
        return Err;

      // The agent is not allowed to access the memory pool in any case. Do not
      // continue because otherwise it results in undefined behavior.
      if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)
        return Plugin::error("An agent is not allowed to access a memory pool");
    }
#endif

    // We can access but it is disabled by default. Enable the access then.
    hsa_status_t Status =
        hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr);
    return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
  }

  /// Get attribute from the memory pool.
  template <typename Ty>
  Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
    hsa_status_t Status;
    Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
    return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s");
  }

  template <typename Ty>
  hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
    return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
  }

  /// Get attribute from the memory pool relating to an agent.
  template <typename Ty>
  Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind,
                Ty &Value) const {
    hsa_status_t Status;
    Status =
        hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value);
    return Plugin::check(Status,
                         "Error in hsa_amd_agent_memory_pool_get_info: %s");
  }

private:
  /// The HSA memory pool.
  hsa_amd_memory_pool_t MemoryPool;

  /// The segment to which the memory pool belongs.
  hsa_amd_segment_t Segment;

  /// The global flags of the memory pool. Only valid if the memory pool
  /// belongs to the global segment.
  uint32_t GlobalFlags;
};
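// Illustrative sketch (not part of the plugin): a typical use of the pool
// wrapper is to allocate a buffer and then grant the relevant agents access
// to it. The pool, size, and agent list below are hypothetical locals:
//
//   void *Ptr = nullptr;
//   if (Error Err = FineGrainedPool.allocate(Size, &Ptr))
//     return Err;
//   if (Error Err = FineGrainedPool.enableAccess(Ptr, Size, Agents))
//     return Err;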
/// Class that implements a memory manager that gets memory from a specific
/// memory pool.
struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {

  /// Create an empty memory manager.
  AMDGPUMemoryManagerTy() : MemoryPool(nullptr), MemoryManager(nullptr) {}

  /// Initialize the memory manager from a memory pool.
  Error init(AMDGPUMemoryPoolTy &MemoryPool) {
    const uint32_t Threshold = 1 << 30;
    this->MemoryManager = new MemoryManagerTy(*this, Threshold);
    this->MemoryPool = &MemoryPool;
    return Plugin::success();
  }

  /// Deinitialize the memory manager and free its allocations.
  Error deinit() {
    assert(MemoryManager && "Invalid memory manager");

    // Delete and invalidate the memory manager. At this point, the memory
    // manager will deallocate all its allocations.
    delete MemoryManager;
    MemoryManager = nullptr;

    return Plugin::success();
  }

  /// Reuse or allocate memory through the memory manager.
  Error allocate(size_t Size, void **PtrStorage) {
    assert(MemoryManager && "Invalid memory manager");
    assert(PtrStorage && "Invalid pointer storage");

    *PtrStorage = MemoryManager->allocate(Size, nullptr);
    if (*PtrStorage == nullptr)
      return Plugin::error("Failure to allocate from AMDGPU memory manager");

    return Plugin::success();
  }

  /// Release an allocation to be reused.
  Error deallocate(void *Ptr) {
    assert(Ptr && "Invalid pointer");

    if (MemoryManager->free(Ptr))
      return Plugin::error("Failure to deallocate from AMDGPU memory manager");

    return Plugin::success();
  }

private:
  /// Allocation callback that will be called once the memory manager does not
  /// have more previously allocated buffers.
  void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;

  /// Deallocation callback that will be called by the memory manager.
  int free(void *TgtPtr, TargetAllocTy Kind) override {
    if (auto Err = MemoryPool->deallocate(TgtPtr)) {
      consumeError(std::move(Err));
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  /// The memory pool used to allocate memory.
  AMDGPUMemoryPoolTy *MemoryPool;

  /// Reference to the actual memory manager.
  MemoryManagerTy *MemoryManager;
};
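// Illustrative sketch (not part of the plugin): the manager is initialized
// from a pool once and then recycles allocations. The pool and size below are
// hypothetical locals:
//
//   AMDGPUMemoryManagerTy PinnedMemoryManager;
//   if (Error Err = PinnedMemoryManager.init(PinnedMemoryPool))
//     return Err;
//   void *Buffer = nullptr;
//   if (Error Err = PinnedMemoryManager.allocate(Size, &Buffer))
//     return Err;
//   // ... use Buffer ...
//   if (Error Err = PinnedMemoryManager.deallocate(Buffer))
//     return Err;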
/// Class implementing the AMDGPU device images' properties.
struct AMDGPUDeviceImageTy : public DeviceImageTy {
  /// Create the AMDGPU image with the id and the target image pointer.
  AMDGPUDeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
      : DeviceImageTy(ImageId, TgtImage) {}

  /// Prepare and load the executable corresponding to the image.
  Error loadExecutable(const AMDGPUDeviceTy &Device);

  /// Unload the executable.
  Error unloadExecutable() {
    hsa_status_t Status = hsa_executable_destroy(Executable);
    if (auto Err = Plugin::check(Status, "Error in hsa_executable_destroy: %s"))
      return Err;

    Status = hsa_code_object_destroy(CodeObject);
    return Plugin::check(Status, "Error in hsa_code_object_destroy: %s");
  }

  /// Get the executable.
  hsa_executable_t getExecutable() const { return Executable; }

  /// Get the Code Object Version of the ELF.
  uint16_t getELFABIVersion() const { return ELFABIVersion; }

  /// Find an HSA device symbol by its name on the executable.
  Expected<hsa_executable_symbol_t>
  findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;

  /// Get additional info for kernel, e.g., register spill counts.
  std::optional<utils::KernelMetaDataTy>
  getKernelInfo(StringRef Identifier) const {
    auto It = KernelInfoMap.find(Identifier);

    if (It == KernelInfoMap.end())
      return {};

    return It->second;
  }

private:
  /// The executable loaded on the agent.
  hsa_executable_t Executable;
  hsa_code_object_t CodeObject;
  StringMap<utils::KernelMetaDataTy> KernelInfoMap;
  uint16_t ELFABIVersion;
};
/// Class implementing the AMDGPU kernel functionalities which derives from the
/// generic kernel class.
struct AMDGPUKernelTy : public GenericKernelTy {
  /// Create an AMDGPU kernel with a name and an execution mode.
  AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {}

  /// Initialize the AMDGPU kernel.
  Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
    AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);

    // Kernel symbols have a ".kd" suffix.
    std::string KernelName(getName());
    KernelName += ".kd";

    // Find the symbol on the device executable.
    auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName);
    if (!SymbolOrErr)
      return SymbolOrErr.takeError();

    hsa_executable_symbol_t Symbol = *SymbolOrErr;
    hsa_symbol_kind_t SymbolType;
    hsa_status_t Status;

    // Retrieve different properties of the kernel symbol.
    std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
        {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};

    for (auto &Info : RequiredInfos) {
      Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
      if (auto Err = Plugin::check(
              Status, "Error in hsa_executable_symbol_get_info: %s"))
        return Err;
    }

    // Make sure it is a kernel symbol.
    if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
      return Plugin::error("Symbol %s is not a kernel function");

    // TODO: Read the kernel descriptor for the max threads per block. May be
    // read from the image.

    ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
    DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());

    // Get additional kernel info read from image.
    KernelInfo = AMDImage.getKernelInfo(getName());
    if (!KernelInfo.has_value())
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(),
           "Could not read extra information for kernel %s.", getName());

    return Plugin::success();
  }

  /// Launch the AMDGPU kernel function.
  Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
                   uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

  /// Print more elaborate kernel launch info for AMDGPU.
  Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                               KernelArgsTy &KernelArgs, uint32_t NumThreads,
                               uint64_t NumBlocks) const override;

  /// Get group and private segment kernel size.
  uint32_t getGroupSize() const { return GroupSize; }
  uint32_t getPrivateSize() const { return PrivateSize; }

  /// Get the HSA kernel object representing the kernel function.
  uint64_t getKernelObject() const { return KernelObject; }

  /// Get the size of implicit args based on the code object version.
  /// @return 56 for cov4 and 256 for cov5
  uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }

private:
  /// The kernel object to execute.
  uint64_t KernelObject;

  /// The args, group and private segments sizes required by a kernel instance.
  uint32_t ArgsSize;
  uint32_t GroupSize;
  uint32_t PrivateSize;

  /// The size of implicit kernel arguments.
  uint32_t ImplicitArgsSize;

  /// Additional info for the AMD GPU kernel.
  std::optional<utils::KernelMetaDataTy> KernelInfo;
};
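// Illustrative sketch (not part of the plugin): once a kernel has been
// initialized against a loaded image, the queue-side launch path only needs
// the raw HSA handle and segment sizes it caches. The device, image, and
// kernel name below are hypothetical locals:
//
//   AMDGPUKernelTy Kernel("__omp_offloading_foo");
//   if (Error Err = Kernel.initImpl(Device, Image))
//     return Err;
//   uint64_t Object = Kernel.getKernelObject();    // Feeds the dispatch packet.
//   uint32_t Implicit = Kernel.getImplicitArgsSize();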
/// Class representing an HSA signal. Signals are used to define dependencies
/// between asynchronous operations: kernel launches and memory transfers.
struct AMDGPUSignalTy {
  /// Create an empty signal.
  AMDGPUSignalTy() : HSASignal({0}), UseCount() {}
  AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {}

  /// Initialize the signal with an initial value.
  Error init(uint32_t InitialValue = 1) {
    hsa_status_t Status =
        hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal);
    return Plugin::check(Status, "Error in hsa_signal_create: %s");
  }

  /// Deinitialize the signal.
  Error deinit() {
    hsa_status_t Status = hsa_signal_destroy(HSASignal);
    return Plugin::check(Status, "Error in hsa_signal_destroy: %s");
  }

  /// Wait until the signal gets a zero value.
  Error wait(const uint64_t ActiveTimeout = 0, RPCServerTy *RPCServer = nullptr,
             GenericDeviceTy *Device = nullptr) const {
    if (ActiveTimeout && !RPCServer) {
      hsa_signal_value_t Got = 1;
      Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
                                      ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
      if (Got == 0)
        return Plugin::success();
    }

    // If there is an RPC device attached to this stream we run it as a server.
    uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
    auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
    while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
                                     Timeout, WaitState) != 0) {
      if (RPCServer && Device)
        if (auto Err = RPCServer->runServer(*Device))
          return Err;
    }
    return Plugin::success();
  }

  /// Load the value on the signal.
  hsa_signal_value_t load() const {
    return hsa_signal_load_scacquire(HSASignal);
  }

  /// Signal decrementing by one.
  void signal() {
    assert(load() > 0 && "Invalid signal value");
    hsa_signal_subtract_screlease(HSASignal, 1);
  }

  /// Reset the signal value before reusing the signal. Do not call this
  /// function if the signal is being currently used by any watcher, such as a
  /// plugin thread or the HSA runtime.
  void reset() { hsa_signal_store_screlease(HSASignal, 1); }

  /// Increase the number of concurrent uses.
  void increaseUseCount() { UseCount.increase(); }

  /// Decrease the number of concurrent uses and return whether it was the last.
  bool decreaseUseCount() { return UseCount.decrease(); }

  hsa_signal_t get() const { return HSASignal; }

private:
  /// The underlying HSA signal.
  hsa_signal_t HSASignal;

  /// Reference counter for tracking the concurrent use count. This is mainly
  /// used for knowing how many streams are using the signal.
  RefCountTy<> UseCount;
};

/// Classes for holding AMDGPU signals and managing signals.
using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>;
using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;
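// Illustrative sketch (not part of the plugin): signals follow an
// init/reset/wait cycle. A completion signal for a single operation could be
// driven roughly like this:
//
//   AMDGPUSignalTy Signal;
//   if (Error Err = Signal.init())      // Initial value 1.
//     return Err;
//   Signal.reset();                     // Back to 1 before each reuse.
//   // ... pass Signal.get() as the completion signal of an HSA operation ...
//   if (Error Err = Signal.wait())      // Blocks until the value reaches 0.
//     return Err;
//   if (Error Err = Signal.deinit())
//     return Err;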
/// Class holding an HSA queue to submit kernel and barrier packets.
struct AMDGPUQueueTy {
  /// Create an empty queue.
  AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}

  /// Lazily initialize a new queue belonging to a specific agent.
  Error init(hsa_agent_t Agent, int32_t QueueSize) {
    if (Queue)
      return Plugin::success();
    hsa_status_t Status =
        hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
                         nullptr, UINT32_MAX, UINT32_MAX, &Queue);
    return Plugin::check(Status, "Error in hsa_queue_create: %s");
  }

  /// Deinitialize the queue and destroy its resources.
  Error deinit() {
    std::lock_guard<std::mutex> Lock(Mutex);
    if (!Queue)
      return Plugin::success();
    hsa_status_t Status = hsa_queue_destroy(Queue);
    return Plugin::check(Status, "Error in hsa_queue_destroy: %s");
  }

  /// Returns the number of streams this queue is currently assigned to.
  uint32_t getUserCount() const { return NumUsers; }

  /// Returns if the underlying HSA queue is initialized.
  bool isInitialized() { return Queue != nullptr; }

  /// Decrement user count of the queue object.
  void removeUser() { --NumUsers; }

  /// Increase user count of the queue object.
  void addUser() { ++NumUsers; }
  /// Push a kernel launch to the queue. The kernel launch requires an output
  /// signal and can define an optional input signal (nullptr if none).
  Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
                         uint32_t NumThreads, uint64_t NumBlocks,
                         uint32_t GroupSize, AMDGPUSignalTy *OutputSignal,
                         AMDGPUSignalTy *InputSignal) {
    assert(OutputSignal && "Invalid kernel output signal");

    // Lock the queue during the packet publishing process. Notice this blocks
    // the addition of other packets to the queue. The following piece of code
    // should be lightweight; do not block the thread, allocate memory, etc.
    std::lock_guard<std::mutex> Lock(Mutex);
    assert(Queue && "Interacted with a non-initialized queue!");

    // Avoid defining the input dependency if already satisfied.
    if (InputSignal && !InputSignal->load())
      InputSignal = nullptr;

    // Add a barrier packet before the kernel packet in case there is a pending
    // preceding operation. The barrier packet will delay the processing of
    // subsequent queue packets until the barrier input signal is satisfied.
    // No output signal is needed because the dependency is already guaranteed
    // by the queue barrier itself.
    if (InputSignal)
      if (auto Err = pushBarrierImpl(nullptr, InputSignal))
        return Err;

    // Now prepare the kernel packet.
    uint64_t PacketId;
    hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
    assert(Packet && "Invalid packet");

    // The first 32 bits of the packet are written after the other fields.
    uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    Packet->workgroup_size_x = NumThreads;
    Packet->workgroup_size_y = 1;
    Packet->workgroup_size_z = 1;
    Packet->reserved0 = 0;
    Packet->grid_size_x = NumBlocks * NumThreads;
    Packet->grid_size_y = 1;
    Packet->grid_size_z = 1;
    Packet->private_segment_size = Kernel.getPrivateSize();
    Packet->group_segment_size = GroupSize;
    Packet->kernel_object = Kernel.getKernelObject();
    Packet->kernarg_address = KernelArgs;
    Packet->reserved2 = 0;
    Packet->completion_signal = OutputSignal->get();

    // Publish the packet. Do not modify the packet after this point.
    publishKernelPacket(PacketId, Setup, Packet);

    return Plugin::success();
  }
  /// Push a barrier packet that will wait up to two input signals. All signals
  /// are optional (nullptr if none).
  Error pushBarrier(AMDGPUSignalTy *OutputSignal,
                    const AMDGPUSignalTy *InputSignal1,
                    const AMDGPUSignalTy *InputSignal2) {
    // Lock the queue during the packet publishing process.
    std::lock_guard<std::mutex> Lock(Mutex);
    assert(Queue && "Interacted with a non-initialized queue!");

    // Push the barrier with the lock acquired.
    return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2);
  }

private:
  /// Push a barrier packet that will wait up to two input signals. Assumes the
  /// queue lock is acquired.
  Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal,
                        const AMDGPUSignalTy *InputSignal1,
                        const AMDGPUSignalTy *InputSignal2 = nullptr) {
    // Add a queue barrier waiting on both the other stream's operation and the
    // last operation on the current stream (if any).
    uint64_t PacketId;
    hsa_barrier_and_packet_t *Packet =
        (hsa_barrier_and_packet_t *)acquirePacket(PacketId);
    assert(Packet && "Invalid packet");

    Packet->reserved0 = 0;
    Packet->reserved1 = 0;
    Packet->dep_signal[0] = {0};
    Packet->dep_signal[1] = {0};
    Packet->dep_signal[2] = {0};
    Packet->dep_signal[3] = {0};
    Packet->dep_signal[4] = {0};
    Packet->reserved2 = 0;
    Packet->completion_signal = {0};

    // Set input and output dependencies if needed.
    if (OutputSignal)
      Packet->completion_signal = OutputSignal->get();
    if (InputSignal1)
      Packet->dep_signal[0] = InputSignal1->get();
    if (InputSignal2)
      Packet->dep_signal[1] = InputSignal2->get();

    // Publish the packet. Do not modify the packet after this point.
    publishBarrierPacket(PacketId, Packet);

    return Plugin::success();
  }
  /// Acquire a packet from the queue. This call may block the thread if there
  /// is no space in the underlying HSA queue. It may need to wait until the HSA
  /// runtime processes some packets. Assumes the queue lock is acquired.
  hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) {
    // Increase the queue index with relaxed memory order. Notice this will need
    // another subsequent atomic operation with acquire order.
    PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);

    // Wait for the packet to be available. Notice the atomic operation uses
    // the acquire memory order.
    while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size)
      ;

    // Return the packet reference.
    const uint32_t Mask = Queue->size - 1; // The size is a power of 2.
    return (hsa_kernel_dispatch_packet_t *)Queue->base_address +
           (PacketId & Mask);
  }
  /// Publish the kernel packet so that the HSA runtime can start processing
  /// the kernel launch. Do not modify the packet once this function is called.
  /// Assumes the queue lock is acquired.
  void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
                           hsa_kernel_dispatch_packet_t *Packet) {
    uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);

    uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

    // Publish the packet. Do not modify the packet after this point.
    uint32_t HeaderWord = Header | (Setup << 16u);
    __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);

    // Signal the doorbell about the published packet.
    hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
  }

  /// Publish the barrier packet so that the HSA runtime can start processing
  /// the barrier. Next packets in the queue will not be processed until all
  /// barrier dependencies (signals) are satisfied. Assumes the queue is locked.
  void publishBarrierPacket(uint64_t PacketId,
                            hsa_barrier_and_packet_t *Packet) {
    uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
    uint16_t Setup = 0;
    uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

    // Publish the packet. Do not modify the packet after this point.
    uint32_t HeaderWord = Header | (Setup << 16u);
    __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);

    // Signal the doorbell about the published packet.
    hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
  }
  /// Callback that will be called when an error is detected on the HSA queue.
  static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
    auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
    FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
  }

  /// The HSA queue.
  hsa_queue_t *Queue;

  /// Mutex to protect the acquiring and publishing of packets. For the moment,
  /// we need this mutex to prevent publishing packets that are not ready to be
  /// published in a multi-thread scenario. Without a queue lock, a thread T1
  /// could acquire packet P and thread T2 acquire packet P+1. Thread T2 could
  /// publish its packet P+1 (signaling the queue's doorbell) before packet P
  /// from T1 is ready to be processed. That scenario should be invalid. Thus,
  /// we use the following mutex to make packet acquiring and publishing atomic.
  /// TODO: There are other more advanced approaches to avoid this mutex using
  /// atomic operations. We can further investigate it if this is a bottleneck.
  std::mutex Mutex;

  /// The number of streams this queue is currently assigned to. A queue is
  /// considered idle when this is zero, otherwise: busy.
  uint32_t NumUsers;
};
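// Illustrative sketch (not part of the plugin): a dependent launch submitted
// through the queue above turns into at most two AQL packets, a BARRIER_AND
// packet guarding the pending input signal plus the kernel dispatch itself.
// The kernel, buffer, and signals below are hypothetical locals that were
// already initialized:
//
//   if (Error Err = Queue.pushKernelLaunch(Kernel, KernelArgsPtr,
//                                          /*NumThreads=*/256,
//                                          /*NumBlocks=*/128, GroupSize,
//                                          &OutputSignal, &InputSignal))
//     return Err;
//   // OutputSignal drops to zero once the dispatch completes.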
/// Struct that implements a stream of asynchronous operations for AMDGPU
/// devices. This class relies on signals to implement streams and define the
/// dependencies between asynchronous operations.
struct AMDGPUStreamTy {
private:
  /// Utility struct holding arguments for async H2H memory copies.
  struct MemcpyArgsTy {
    void *Dst;
    const void *Src;
    size_t Size;
  };

  /// Utility struct holding arguments for freeing buffers to memory managers.
  struct ReleaseBufferArgsTy {
    void *Buffer;
    AMDGPUMemoryManagerTy *MemoryManager;
  };

  /// Utility struct holding arguments for releasing signals to signal managers.
  struct ReleaseSignalArgsTy {
    AMDGPUSignalTy *Signal;
    AMDGPUSignalManagerTy *SignalManager;
  };

  /// The stream is composed of N stream slots. The struct below represents the
  /// fields of each slot. Each slot has a signal and an optional action
  /// function. When appending an HSA asynchronous operation to the stream, one
  /// slot is consumed and used to store the operation's information. The
  /// operation's output signal is set to the consumed slot's signal. If there
  /// is a previous asynchronous operation on the previous slot, the HSA async
  /// operation's input signal is set to the signal of the previous slot. This
  /// way, we obtain a chain of dependent async operations. The action is a
  /// function that will be executed eventually after the operation is
  /// completed, e.g., for releasing a buffer.
  struct StreamSlotTy {
    /// The output signal of the stream operation. May be used by the subsequent
    /// operation as input signal.
    AMDGPUSignalTy *Signal;

    /// The action that must be performed after the operation's completion. Set
    /// to nullptr when there is no action to perform.
    Error (*ActionFunction)(void *);

    /// Space for the action's arguments. A pointer to these arguments is passed
    /// to the action function. Notice the space of arguments is limited.
    union {
      MemcpyArgsTy MemcpyArgs;
      ReleaseBufferArgsTy ReleaseBufferArgs;
      ReleaseSignalArgsTy ReleaseSignalArgs;
    } ActionArgs;

    /// Create an empty slot.
    StreamSlotTy() : Signal(nullptr), ActionFunction(nullptr) {}
    /// Schedule a host memory copy action on the slot.
    Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
      ActionFunction = memcpyAction;
      ActionArgs.MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
      return Plugin::success();
    }

    /// Schedule a release buffer action on the slot.
    Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) {
      ActionFunction = releaseBufferAction;
      ActionArgs.ReleaseBufferArgs = ReleaseBufferArgsTy{Buffer, &Manager};
      return Plugin::success();
    }

    /// Schedule a signal release action on the slot.
    Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease,
                             AMDGPUSignalManagerTy *SignalManager) {
      ActionFunction = releaseSignalAction;
      ActionArgs.ReleaseSignalArgs =
          ReleaseSignalArgsTy{SignalToRelease, SignalManager};
      return Plugin::success();
    }

    // Perform the action if needed.
    Error performAction() {
      if (!ActionFunction)
        return Plugin::success();

      // Perform the action.
      if (ActionFunction == memcpyAction) {
        if (auto Err = memcpyAction(&ActionArgs))
          return Err;
      } else if (ActionFunction == releaseBufferAction) {
        if (auto Err = releaseBufferAction(&ActionArgs))
          return Err;
      } else if (ActionFunction == releaseSignalAction) {
        if (auto Err = releaseSignalAction(&ActionArgs))
          return Err;
      } else {
        return Plugin::error("Unknown action function!");
      }

      // Invalidate the action.
      ActionFunction = nullptr;

      return Plugin::success();
    }
  };
  /// The device agent where the stream was created.
  hsa_agent_t Agent;

  /// The queue that the stream uses to launch kernels.
  AMDGPUQueueTy *Queue;

  /// The manager of signals to reuse signals.
  AMDGPUSignalManagerTy &SignalManager;

  /// A reference to the associated device.
  GenericDeviceTy &Device;

  /// Array of stream slots. Use std::deque because it can dynamically grow
  /// without invalidating the already inserted elements. For instance, the
  /// std::vector may invalidate the elements by reallocating the internal
  /// array if there is not enough space on new insertions.
  std::deque<StreamSlotTy> Slots;

  /// The next available slot on the queue. This is reset to zero each time the
  /// stream is synchronized. It also indicates the current number of consumed
  /// slots at a given time.
  uint32_t NextSlot;

  /// The synchronization id. This number is increased each time the stream is
  /// synchronized. It is useful to detect if an AMDGPUEventTy points to an
  /// operation that was already finalized in a previous stream synchronize.
  uint32_t SyncCycle;

  /// A pointer associated with an RPC server running on the given device. If
  /// RPC is not being used this will be a null pointer. Otherwise, this
  /// indicates that an RPC server is expected to be run on this stream.
  RPCServerTy *RPCServer;

  /// Mutex to protect stream's management.
  mutable std::mutex Mutex;

  /// Timeout hint for HSA actively waiting for signal value to change.
  const uint64_t StreamBusyWaitMicroseconds;

  /// Return the current number of asynchronous operations on the stream.
  uint32_t size() const { return NextSlot; }

  /// Return the last valid slot on the stream.
  uint32_t last() const { return size() - 1; }
  /// Consume one slot from the stream. Since the stream uses signals on demand
  /// and releases them once the slot is no longer used, the function requires
  /// an idle signal for the new consumed slot.
  std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) {
    // Double the stream size if needed. Since we use std::deque, this operation
    // does not invalidate the already added slots.
    if (Slots.size() == NextSlot)
      Slots.resize(Slots.size() * 2);

    // Update the next available slot and the stream size.
    uint32_t Curr = NextSlot++;

    // Retrieve the input signal, if any, of the current operation.
    AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr;

    // Set the output signal of the current slot.
    Slots[Curr].Signal = OutputSignal;

    return std::make_pair(Curr, InputSignal);
  }

  /// Complete all pending post actions and reset the stream after synchronizing
  /// or positively querying the stream.
  Error complete() {
    for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) {
      // Take the post action of the operation if any.
      if (auto Err = Slots[Slot].performAction())
        return Err;

      // Release the slot's signal if possible. Otherwise, another user will.
      if (Slots[Slot].Signal->decreaseUseCount())
        if (auto Err = SignalManager.returnResource(Slots[Slot].Signal))
          return Err;

      Slots[Slot].Signal = nullptr;
    }

    // Reset the stream slots to zero.
    NextSlot = 0;

    // Increase the synchronization id since the stream completed a sync cycle.
    SyncCycle += 1;

    return Plugin::success();
  }
  /// Make the current stream wait on a specific operation of another stream.
  /// The idea is to make the current stream wait on two signals: 1) the last
  /// signal of the current stream, and 2) the last signal of the other stream.
  /// Use a barrier packet with two input signals.
  Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) {
    if (Queue == nullptr)
      return Plugin::error("Target queue was nullptr");

    /// The signal that we must wait from the other stream.
    AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal;

    // Prevent the release of the other stream's signal.
    OtherSignal->increaseUseCount();

    // Retrieve an available signal for the operation's output.
    AMDGPUSignalTy *OutputSignal = nullptr;
    if (auto Err = SignalManager.getResource(OutputSignal))
      return Err;
    OutputSignal->reset();
    OutputSignal->increaseUseCount();

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignal);

    // Setup the post action to release the signal.
    if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager))
      return Err;

    // Push a barrier into the queue with both input signals.
    return Queue->pushBarrier(OutputSignal, InputSignal, OtherSignal);
  }
  /// Callback for running a specific asynchronous operation. This callback is
  /// used for hsa_amd_signal_async_handler. The argument is the operation that
  /// should be executed. Notice we use the post action mechanism to codify the
  /// asynchronous operation.
  static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) {
    StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args);
    assert(Slot && "Invalid slot");
    assert(Slot->Signal && "Invalid signal");

    // This thread is outside the stream mutex. Make sure the thread sees the
    // changes on the slot.
    std::atomic_thread_fence(std::memory_order_acquire);

    // Perform the operation.
    if (auto Err = Slot->performAction())
      FATAL_MESSAGE(1, "Error performing post action: %s",
                    toString(std::move(Err)).data());

    // Signal the output signal to notify the asynchronous operation finalized.
    Slot->Signal->signal();

    // Unregister the callback.
    return false;
  }

  // Callback for host-to-host memory copies. This is an asynchronous action.
  static Error memcpyAction(void *Data) {
    MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data);
    assert(Args && "Invalid arguments");
    assert(Args->Dst && "Invalid destination buffer");
    assert(Args->Src && "Invalid source buffer");

    std::memcpy(Args->Dst, Args->Src, Args->Size);

    return Plugin::success();
  }

  /// Releasing a memory buffer to a memory manager. This is a post completion
  /// action. There are three kinds of memory buffers:
  /// 1. For kernel arguments. This buffer can be freed after receiving the
  ///    kernel completion signal.
  /// 2. For H2D transfers that need pinned memory space for staging. This
  ///    buffer can be freed after receiving the transfer completion signal.
  /// 3. For D2H transfers that need pinned memory space for staging. This
  ///    buffer cannot be freed after receiving the transfer completion signal
  ///    because of the following asynchronous H2H callback.
  /// For this reason, this action can only be taken at
  /// AMDGPUStreamTy::complete().
  /// Because of case 3, all releaseBufferActions are taken at
  /// AMDGPUStreamTy::complete() in the current implementation.
  static Error releaseBufferAction(void *Data) {
    ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data);
    assert(Args && "Invalid arguments");
    assert(Args->MemoryManager && "Invalid memory manager");
    assert(Args->Buffer && "Invalid buffer");

    // Release the allocation to the memory manager.
    return Args->MemoryManager->deallocate(Args->Buffer);
  }

  /// Releasing a signal object back to SignalManager. This is a post completion
  /// action. This action can only be taken at AMDGPUStreamTy::complete().
  static Error releaseSignalAction(void *Data) {
    ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data);
    assert(Args && "Invalid arguments");
    assert(Args->Signal && "Invalid signal");
    assert(Args->SignalManager && "Invalid signal manager");

    // Release the signal if needed.
    if (Args->Signal->decreaseUseCount())
      if (auto Err = Args->SignalManager->returnResource(Args->Signal))
        return Err;

    return Plugin::success();
  }

public:
  /// Create an empty stream associated with a specific device.
  AMDGPUStreamTy(AMDGPUDeviceTy &Device);

  /// Initialize the stream's signals.
  Error init() { return Plugin::success(); }

  /// Deinitialize the stream's signals.
  Error deinit() { return Plugin::success(); }

  /// Attach an RPC server to this stream.
  void setRPCServer(RPCServerTy *Server) { RPCServer = Server; }
  /// Push an asynchronous kernel to the stream. The kernel arguments must be
  /// placed in a special allocation for kernel args and must be kept alive
  /// until the kernel finalizes. Once the kernel is finished, the stream will
  /// release the kernel args buffer to the specified memory manager.
  Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
                         uint32_t NumThreads, uint64_t NumBlocks,
                         uint32_t GroupSize,
                         AMDGPUMemoryManagerTy &MemoryManager) {
    if (Queue == nullptr)
      return Plugin::error("Target queue was nullptr");

    // Retrieve an available signal for the operation's output.
    AMDGPUSignalTy *OutputSignal = nullptr;
    if (auto Err = SignalManager.getResource(OutputSignal))
      return Err;
    OutputSignal->reset();
    OutputSignal->increaseUseCount();

    std::lock_guard<std::mutex> StreamLock(Mutex);

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignal);

    // Setup the post action to release the kernel args buffer.
    if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
      return Err;

    // Push the kernel with the output signal and an input signal (optional).
    return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
                                   GroupSize, OutputSignal, InputSignal);
  }
  /// Push an asynchronous memory copy between pinned memory buffers.
  Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src,
                                  uint64_t CopySize) {
    // Retrieve an available signal for the operation's output.
    AMDGPUSignalTy *OutputSignal = nullptr;
    if (auto Err = SignalManager.getResource(OutputSignal))
      return Err;
    OutputSignal->reset();
    OutputSignal->increaseUseCount();

    std::lock_guard<std::mutex> Lock(Mutex);

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignal);

    // Avoid defining the input dependency if already satisfied.
    if (InputSignal && !InputSignal->load())
      InputSignal = nullptr;

    // Issue the async memory copy.
    hsa_status_t Status;
    if (InputSignal) {
      hsa_signal_t InputSignalRaw = InputSignal->get();
      Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 1,
                                         &InputSignalRaw, OutputSignal->get());
    } else {
      Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 0,
                                         nullptr, OutputSignal->get());
    }
    return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
  }
  /// Push an asynchronous memory copy device-to-host involving an unpinned
  /// memory buffer. The operation consists of a two-step copy from the
  /// device buffer to an intermediate pinned host buffer, and then, to an
  /// unpinned host buffer. Both operations are asynchronous and dependent.
  /// The intermediate pinned buffer will be released to the specified memory
  /// manager once the operation completes.
  Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter,
                               uint64_t CopySize,
                               AMDGPUMemoryManagerTy &MemoryManager) {
    // Retrieve available signals for the operation's outputs.
    AMDGPUSignalTy *OutputSignals[2] = {};
    if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
      return Err;
    for (auto Signal : OutputSignals) {
      Signal->reset();
      Signal->increaseUseCount();
    }

    std::lock_guard<std::mutex> Lock(Mutex);

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignals[0]);

    // Avoid defining the input dependency if already satisfied.
    if (InputSignal && !InputSignal->load())
      InputSignal = nullptr;

    // Setup the post action for releasing the intermediate buffer.
    if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
      return Err;

    // Issue the first step: device to host transfer. Avoid defining the input
    // dependency if already satisfied.
    hsa_status_t Status;
    if (InputSignal) {
      hsa_signal_t InputSignalRaw = InputSignal->get();
      Status =
          hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 1,
                                    &InputSignalRaw, OutputSignals[0]->get());
    } else {
      Status = hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 0,
                                         nullptr, OutputSignals[0]->get());
    }

    if (auto Err =
            Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
      return Err;

    // Consume another stream slot and compute dependencies.
    std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
    assert(InputSignal && "Invalid input signal");

    // The std::memcpy is done asynchronously using an async handler. We store
    // the function's information in the action but it's not actually an action.
    if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Inter, CopySize))
      return Err;

    // Make changes on this slot visible to the async handler's thread.
    std::atomic_thread_fence(std::memory_order_release);

    // Issue the second step: host to host transfer.
    Status = hsa_amd_signal_async_handler(
        InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
        (void *)&Slots[Curr]);

    return Plugin::check(Status, "Error in hsa_amd_signal_async_handler: %s");
  }
  /// Push an asynchronous memory copy host-to-device involving an unpinned
  /// memory buffer. The operation consists of a two-step copy from the
  /// unpinned host buffer to an intermediate pinned host buffer, and then, to
  /// the pinned host buffer. Both operations are asynchronous and dependent.
  /// The intermediate pinned buffer will be released to the specified memory
  /// manager once the operation completes.
  Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
                               uint64_t CopySize,
                               AMDGPUMemoryManagerTy &MemoryManager) {
    // Retrieve available signals for the operation's outputs.
    AMDGPUSignalTy *OutputSignals[2] = {};
    if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
      return Err;
    for (auto Signal : OutputSignals) {
      Signal->reset();
      Signal->increaseUseCount();
    }

    AMDGPUSignalTy *OutputSignal = OutputSignals[0];

    std::lock_guard<std::mutex> Lock(Mutex);

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignal);

    // Avoid defining the input dependency if already satisfied.
    if (InputSignal && !InputSignal->load())
      InputSignal = nullptr;

    // Issue the first step: host to host transfer.
    if (InputSignal) {
      // The std::memcpy is done asynchronously using an async handler. We store
      // the function's information in the action but it is not actually a
      // post action.
      if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
        return Err;

      // Make changes on this slot visible to the async handler's thread.
      std::atomic_thread_fence(std::memory_order_release);

      hsa_status_t Status = hsa_amd_signal_async_handler(
          InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
          (void *)&Slots[Curr]);

      if (auto Err = Plugin::check(Status,
                                   "Error in hsa_amd_signal_async_handler: %s"))
        return Err;

      // Let's use now the second output signal.
      OutputSignal = OutputSignals[1];

      // Consume another stream slot and compute dependencies.
      std::tie(Curr, InputSignal) = consume(OutputSignal);
    } else {
      // All preceding operations completed, copy the memory synchronously.
      std::memcpy(Inter, Src, CopySize);

      // Return the second signal because it will not be used.
      OutputSignals[1]->decreaseUseCount();
      if (auto Err = SignalManager.returnResource(OutputSignals[1]))
        return Err;
    }

    // Setup the post action to release the intermediate pinned buffer.
    if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
      return Err;

    // Issue the second step: host to device transfer. Avoid defining the input
    // dependency if already satisfied.
    hsa_status_t Status;
    if (InputSignal && InputSignal->load()) {
      hsa_signal_t InputSignalRaw = InputSignal->get();
      Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 1,
                                         &InputSignalRaw, OutputSignal->get());
    } else {
      Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 0,
                                         nullptr, OutputSignal->get());
    }

    return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
  }
  // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead.
  Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src,
                               hsa_agent_t SrcAgent, uint64_t CopySize) {
    AMDGPUSignalTy *OutputSignal;
    if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
      return Err;
    OutputSignal->reset();
    OutputSignal->increaseUseCount();

    std::lock_guard<std::mutex> Lock(Mutex);

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignal);

    // Avoid defining the input dependency if already satisfied.
    if (InputSignal && !InputSignal->load())
      InputSignal = nullptr;

    // The agents need to have access to the corresponding memory.
    // This is presently only true if the pointers were originally
    // allocated by this runtime or the caller made the appropriate
    // access calls.

    hsa_status_t Status;
    if (InputSignal && InputSignal->load()) {
      hsa_signal_t InputSignalRaw = InputSignal->get();
      Status =
          hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize, 1,
                                    &InputSignalRaw, OutputSignal->get());
    } else {
      Status = hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize,
                                         0, nullptr, OutputSignal->get());
    }

    return Plugin::check(Status, "Error in D2D hsa_amd_memory_async_copy: %s");
  }
  /// Synchronize with the stream. The current thread waits until all operations
  /// are finalized and it performs the pending post actions (i.e., releasing
  /// intermediate buffers).
  Error synchronize() {
    std::lock_guard<std::mutex> Lock(Mutex);

    // No need to synchronize anything.
    if (size() == 0)
      return Plugin::success();

    // Wait until all previous operations on the stream have completed.
    if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds,
                                              RPCServer, &Device))
      return Err;

    // Reset the stream and perform all pending post actions.
    return complete();
  }

  /// Query the stream and complete pending post actions if operations finished.
  /// Return whether all the operations completed. This operation does not block
  /// the calling thread.
  Expected<bool> query() {
    std::lock_guard<std::mutex> Lock(Mutex);

    // No need to query anything.
    if (size() == 0)
      return true;

    // The last operation did not complete yet. Return directly.
    if (Slots[last()].Signal->load())
      return false;

    // Reset the stream and perform all pending post actions.
    if (auto Err = complete())
      return std::move(Err);

    return true;
  }
  /// Record the state of the stream on an event.
  Error recordEvent(AMDGPUEventTy &Event) const;

  /// Make the stream wait on an event.
  Error waitEvent(const AMDGPUEventTy &Event);

  friend struct AMDGPUStreamManagerTy;
};
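// Illustrative sketch (not part of the plugin): every push* call above
// consumes one slot, so an H2D staging copy followed by a dependent kernel
// launch forms a chain of slots linked by their signals. The stream, kernel,
// buffers, and memory managers below are hypothetical locals:
//
//   if (Error Err = Stream.pushMemoryCopyH2DAsync(DevPtr, HostPtr, PinnedPtr,
//                                                 Size, PinnedMemoryManager))
//     return Err;
//   if (Error Err = Stream.pushKernelLaunch(Kernel, KernelArgsPtr, 256, 128,
//                                           GroupSize, KernArgsMemoryManager))
//     return Err;
//   if (Error Err = Stream.synchronize())  // Waits and runs post actions.
//     return Err;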
/// Class representing an event on AMDGPU. The event basically stores some
/// information regarding the state of the recorded stream.
struct AMDGPUEventTy {
  /// Create an empty event.
  AMDGPUEventTy(AMDGPUDeviceTy &Device)
      : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {}

  /// Initialize and deinitialize.
  Error init() { return Plugin::success(); }
  Error deinit() { return Plugin::success(); }

  /// Record the state of a stream on the event.
  Error record(AMDGPUStreamTy &Stream) {
    std::lock_guard<std::mutex> Lock(Mutex);

    // Ignore the last recorded stream.
    RecordedStream = &Stream;

    return Stream.recordEvent(*this);
  }

  /// Make a stream wait on the current event.
  Error wait(AMDGPUStreamTy &Stream) {
    std::lock_guard<std::mutex> Lock(Mutex);

    if (!RecordedStream)
      return Plugin::error("Event does not have any recorded stream");

    // Synchronizing the same stream. Do nothing.
    if (RecordedStream == &Stream)
      return Plugin::success();

    // No need to wait for anything, the recorded stream already finished the
    // corresponding operation.
    if (RecordedSlot < 0)
      return Plugin::success();

    return Stream.waitEvent(*this);
  }

protected:
  /// The stream registered in this event.
  AMDGPUStreamTy *RecordedStream;

  /// The recorded operation on the recorded stream.
  int64_t RecordedSlot;

  /// The sync cycle when the stream was recorded. Used to detect stale events.
  int64_t RecordedSyncCycle;

  /// Mutex to safely access event fields.
  mutable std::mutex Mutex;

  friend struct AMDGPUStreamTy;
};
Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
  std::lock_guard<std::mutex> Lock(Mutex);

  if (size() > 0) {
    // Record the synchronize identifier (to detect stale recordings) and
    // the last valid stream's operation.
    Event.RecordedSyncCycle = SyncCycle;
    Event.RecordedSlot = last();

    assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
    assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
  } else {
    // The stream is empty, everything already completed, record nothing.
    Event.RecordedSyncCycle = -1;
    Event.RecordedSlot = -1;
  }

  return Plugin::success();
}

Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) {
  // Retrieve the recorded stream on the event.
  AMDGPUStreamTy &RecordedStream = *Event.RecordedStream;

  std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex);

  // The recorded stream already completed the operation because the synchronize
  // identifier is already outdated.
  if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle)
    return Plugin::success();

  // Again, the recorded stream already completed the operation, the last
  // operation's output signal is satisfied.
  if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load())
    return Plugin::success();

  // Otherwise, make the current stream wait on the other stream's operation.
  return waitOnStreamOperation(RecordedStream, Event.RecordedSlot);
}
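// Illustrative sketch (not part of the plugin): events let one stream wait on
// a specific point of another stream without blocking the host. The device
// and streams below are hypothetical locals:
//
//   AMDGPUEventTy Event(Device);
//   if (Error Err = Event.record(ProducerStream))  // Snapshot the last slot.
//     return Err;
//   if (Error Err = Event.wait(ConsumerStream))    // Barrier only if pending.
//     return Err;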
struct AMDGPUStreamManagerTy final
    : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> {
  using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>;
  using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;

  AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
      : GenericDeviceResourceManagerTy(Device),
        OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
        NextQueue(0), Agent(HSAAgent) {}

  Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) {
    Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues);
    QueueSize = HSAQueueSize;
    MaxNumQueues = NumHSAQueues;
    // Initialize one queue eagerly.
    if (auto Err = Queues.front().init(Agent, QueueSize))
      return Err;

    return GenericDeviceResourceManagerTy::init(InitialSize);
  }

  /// Deinitialize the resource pool and delete all resources. This function
  /// must be called before the destructor.
  Error deinit() override {
    // Deinitialize all queues.
    for (AMDGPUQueueTy &Queue : Queues) {
      if (auto Err = Queue.deinit())
        return Err;
    }

    return GenericDeviceResourceManagerTy::deinit();
  }

  /// Get a single stream from the pool or create new resources.
  virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override {
    return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) {
      return assignNextQueue(Handle);
    });
  }

  /// Return stream to the pool.
  virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override {
    return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) {
      Handle->Queue->removeUser();
      return Plugin::success();
    });
  }

private:
  /// Search for and assign a preferably idle queue to the given stream. If
  /// there is no queue without current users, choose the queue with the lowest
  /// user count. If utilization is ignored, use round-robin selection.
  inline Error assignNextQueue(AMDGPUStreamTy *Stream) {
    // Start from zero when tracking utilization, otherwise use the round-robin
    // policy.
    uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues;

    if (OMPX_QueueTracking) {
      // Find the least used queue.
      for (uint32_t I = 0; I < MaxNumQueues; ++I) {
        // Early exit when an initialized queue is idle.
        if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) {
          Index = I;
          break;
        }

        // Update the least used queue.
        if (Queues[Index].getUserCount() > Queues[I].getUserCount())
          Index = I;
      }
    }

    // Make sure the queue is initialized, then add user and assign.
    if (auto Err = Queues[Index].init(Agent, QueueSize))
      return Err;
    Queues[Index].addUser();
    Stream->Queue = &Queues[Index];

    return Plugin::success();
  }

  /// Envar for controlling the tracking of busy HSA queues.
  BoolEnvar OMPX_QueueTracking;

  /// The next queue index to use for round robin selection.
  uint32_t NextQueue;

  /// The queues which are assigned to requested streams.
  std::vector<AMDGPUQueueTy> Queues;

  /// The corresponding device as HSA agent.
  hsa_agent_t Agent;

  /// The maximum number of queues.
  int MaxNumQueues;

  /// The size of created queues.
  int QueueSize;
};
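
// Summary of the selection policy above (hedged): with
// LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING=true (the default) a stream is
// mapped to an idle, already-initialized queue when one exists, falling back
// to the least-used queue; with the envar disabled, queues are handed out
// round-robin regardless of load. Queues are initialized lazily, so only the
// queues that streams actually touch are created on the device.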
/// Abstract class that holds the common members of the actual kernel devices
/// and the host device. Both types should inherit from this class.
struct AMDGenericDeviceTy {
  AMDGenericDeviceTy() {}

  virtual ~AMDGenericDeviceTy() {}

  /// Create all memory pools which the device has access to and classify them.
  Error initMemoryPools() {
    // Retrieve all memory pools from the device agent(s).
    Error Err = retrieveAllMemoryPools();
    if (Err)
      return Err;

    for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
      // Initialize the memory pool and retrieve some basic info.
      Error Err = MemoryPool->init();
      if (Err)
        return Err;

      if (!MemoryPool->isGlobal())
        continue;

      // Classify the memory pools depending on their properties.
      if (MemoryPool->isFineGrained()) {
        FineGrainedMemoryPools.push_back(MemoryPool);
        if (MemoryPool->supportsKernelArgs())
          ArgsMemoryPools.push_back(MemoryPool);
      } else if (MemoryPool->isCoarseGrained()) {
        CoarseGrainedMemoryPools.push_back(MemoryPool);
      }
    }
    return Plugin::success();
  }

  /// Destroy all memory pools.
  Error deinitMemoryPools() {
    for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools)
      delete Pool;

    AllMemoryPools.clear();
    FineGrainedMemoryPools.clear();
    CoarseGrainedMemoryPools.clear();
    ArgsMemoryPools.clear();

    return Plugin::success();
  }

  /// Retrieve and construct all memory pools from the device agent(s).
  virtual Error retrieveAllMemoryPools() = 0;

  /// Get the device agent.
  virtual hsa_agent_t getAgent() const = 0;

  /// Array of all memory pools available to the host agents.
  llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools;

  /// Array of fine-grained memory pools available to the host agents.
  llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools;

  /// Array of coarse-grained memory pools available to the host agents.
  llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools;

  /// Array of kernel args memory pools available to the host agents.
  llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools;
};
/// Class representing the host device. This host device may have more than one
/// HSA host agent. We aggregate all its resources into the same instance.
struct AMDHostDeviceTy : public AMDGenericDeviceTy {
  /// Create a host device from an array of host agents.
  AMDHostDeviceTy(const llvm::SmallVector<hsa_agent_t> &HostAgents)
      : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(),
        PinnedMemoryManager() {
    assert(HostAgents.size() && "No host agent found");
  }

  /// Initialize the host device memory pools and the memory managers for
  /// kernel args and host pinned memory allocations.
  Error init() {
    if (auto Err = initMemoryPools())
      return Err;

    if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool()))
      return Err;

    if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool()))
      return Err;

    return Plugin::success();
  }

  /// Deinitialize memory pools and managers.
  Error deinit() {
    if (auto Err = deinitMemoryPools())
      return Err;

    if (auto Err = ArgsMemoryManager.deinit())
      return Err;

    if (auto Err = PinnedMemoryManager.deinit())
      return Err;

    return Plugin::success();
  }

  /// Retrieve and construct all memory pools from the host agents.
  Error retrieveAllMemoryPools() override {
    // Iterate through the available pools across the host agents.
    for (hsa_agent_t Agent : Agents) {
      Error Err = utils::iterateAgentMemoryPools(
          Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
            AMDGPUMemoryPoolTy *MemoryPool =
                new AMDGPUMemoryPoolTy(HSAMemoryPool);
            AllMemoryPools.push_back(MemoryPool);
            return HSA_STATUS_SUCCESS;
          });
      if (Err)
        return Err;
    }
    return Plugin::success();
  }

  /// Get one of the host agents. Return always the first agent.
  hsa_agent_t getAgent() const override { return Agents[0]; }

  /// Get a memory pool for fine-grained allocations.
  AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() {
    assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool");
    // Retrieve any memory pool.
    return *FineGrainedMemoryPools[0];
  }

  AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() {
    assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool");
    // Retrieve any memory pool.
    return *CoarseGrainedMemoryPools[0];
  }

  /// Get a memory pool for kernel args allocations.
  AMDGPUMemoryPoolTy &getArgsMemoryPool() {
    assert(!ArgsMemoryPools.empty() && "No kernelargs mempool");
    // Retrieve any memory pool.
    return *ArgsMemoryPools[0];
  }

  /// Getters for kernel args and host pinned memory managers.
  AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; }
  AMDGPUMemoryManagerTy &getPinnedMemoryManager() {
    return PinnedMemoryManager;
  }

  /// Array of agents on the host side.
  const llvm::SmallVector<hsa_agent_t> Agents;

  // Memory manager for kernel arguments.
  AMDGPUMemoryManagerTy ArgsMemoryManager;

  // Memory manager for pinned memory.
  AMDGPUMemoryManagerTy PinnedMemoryManager;
};
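
// Note (hedged summary): the host "device" is not an offload target itself;
// it aggregates the CPU agents so the plugin has one place to allocate kernel
// argument buffers (from a fine-grained, kernarg-capable pool) and the pinned
// staging buffers used by the two-step host/device copies further below.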
/// Class implementing the AMDGPU device functionalities which derives from the
/// generic device class.
struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
  // Create an AMDGPU device with a device id and default AMDGPU grid values.
  AMDGPUDeviceTy(int32_t DeviceId, int32_t NumDevices,
                 AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
      : GenericDeviceTy(DeviceId, NumDevices, {0}), AMDGenericDeviceTy(),
        OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4),
        OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512),
        OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4),
        OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
                               1 * 1024 * 1024), // 1MB
        OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
                               64),
        OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
        AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
        AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}

  ~AMDGPUDeviceTy() {}

  /// Initialize the device, its resources and get its properties.
  Error initImpl(GenericPluginTy &Plugin) override {
    // First setup all the memory pools.
    if (auto Err = initMemoryPools())
      return Err;

    char GPUName[64];
    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName))
      return Err;
    ComputeUnitKind = GPUName;

    // Get the wavefront size.
    uint32_t WavefrontSize = 0;
    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize))
      return Err;
    GridValues.GV_Warp_Size = WavefrontSize;

    // Get the frequency of the steady clock.
    if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
                                 ClockFrequency))
      return Err;

    // Load the grid values depending on the wavefront.
    if (WavefrontSize == 32)
      GridValues = getAMDGPUGridValues<32>();
    else if (WavefrontSize == 64)
      GridValues = getAMDGPUGridValues<64>();
    else
      return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize);

    // Get maximum number of workitems per workgroup.
    uint16_t WorkgroupMaxDim[3];
    if (auto Err =
            getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim))
      return Err;
    GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0];

    // Get maximum number of workgroups.
    hsa_dim3_t GridMaxDim;
    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim))
      return Err;

    GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size;
    if (GridValues.GV_Max_Teams == 0)
      return Plugin::error("Maximum number of teams cannot be zero");

    // Compute the default number of teams.
    uint32_t ComputeUnits = 0;
    if (auto Err =
            getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits))
      return Err;
    GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;

    uint32_t WavesPerCU = 0;
    if (auto Err =
            getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU))
      return Err;
    HardwareParallelism = ComputeUnits * WavesPerCU;

    // Get maximum size of any device queues and maximum number of queues.
    uint32_t MaxQueueSize;
    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
      return Err;

    uint32_t MaxQueues;
    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues))
      return Err;

    // Compute the number of queues and their size.
    OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues));
    OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize);

    // Initialize stream pool.
    if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams,
                                            OMPX_NumQueues, OMPX_QueueSize))
      return Err;

    // Initialize event pool.
    if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents))
      return Err;

    // Initialize signal pool.
    if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
      return Err;

    return Plugin::success();
  }
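
  // Tuning sketch (hedged): the defaults above can be overridden per run, e.g.
  //   LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES=8 \
  //   LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE=1024 ./app
  // Requested values are clamped against HSA_AGENT_INFO_QUEUES_MAX and
  // HSA_AGENT_INFO_QUEUE_MAX_SIZE during initImpl, so oversized requests fall
  // back to what the agent actually supports.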
  /// Deinitialize the device and release its resources.
  Error deinitImpl() override {
    // Deinitialize the stream and event pools.
    if (auto Err = AMDGPUStreamManager.deinit())
      return Err;

    if (auto Err = AMDGPUEventManager.deinit())
      return Err;

    if (auto Err = AMDGPUSignalManager.deinit())
      return Err;

    // Close modules if necessary.
    if (!LoadedImages.empty()) {
      // Each image has its own module.
      for (DeviceImageTy *Image : LoadedImages) {
        AMDGPUDeviceImageTy &AMDImage =
            static_cast<AMDGPUDeviceImageTy &>(*Image);

        // Unload the executable of the image.
        if (auto Err = AMDImage.unloadExecutable())
          return Err;
      }
    }

    // Invalidate agent reference.
    Agent = {0};

    return Plugin::success();
  }
  const uint64_t getStreamBusyWaitMicroseconds() const {
    return OMPX_StreamBusyWait;
  }

  Expected<std::unique_ptr<MemoryBuffer>>
  doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
    // TODO: We should try to avoid materialization but there seems to be no
    // good linker interface w/o file i/o.
    SmallString<128> LinkerOutputFilePath;
    std::error_code EC = sys::fs::createTemporaryFile(
        "amdgpu-pre-link-jit", ".out", LinkerOutputFilePath);
    if (EC)
      return createStringError(EC,
                               "Failed to create temporary file for linker");

    SmallString<128> LinkerInputFilePath = LinkerOutputFilePath;
    LinkerInputFilePath.pop_back_n(2);

    auto FD = raw_fd_ostream(LinkerInputFilePath.data(), EC);
    if (EC)
      return createStringError(EC, "Failed to open temporary file for linker");
    FD.write(MB->getBufferStart(), MB->getBufferSize());
    FD.close();

    const auto &ErrorOrPath = sys::findProgramByName("lld");
    if (!ErrorOrPath)
      return createStringError(inconvertibleErrorCode(),
                               "Failed to find `lld` on the PATH.");

    std::string LLDPath = ErrorOrPath.get();
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
         "Using `%s` to link JITed amdgcn output.", LLDPath.c_str());

    std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind();

    StringRef Args[] = {LLDPath,
                        "-flavor",
                        "gnu",
                        "--no-undefined",
                        "-shared",
                        MCPU,
                        "-o",
                        LinkerOutputFilePath.data(),
                        LinkerInputFilePath.data()};

    std::string Error;
    int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error);
    if (RC)
      return createStringError(inconvertibleErrorCode(),
                               "Linking optimized bitcode failed: %s",
                               Error.c_str());

    return std::move(
        MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath.data()).get());
  }
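
  // Sketch (hedged): conceptually the post-processing step is equivalent to
  //   lld -flavor gnu --no-undefined -shared -plugin-opt=mcpu=<gfx arch> \
  //       -o <tmp>.out <tmp>
  // i.e. the JITed object is written to a temporary file, linked into a
  // shared object for this device's compute-unit kind, and read back as the
  // image that gets loaded onto the device.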
  /// See GenericDeviceTy::getComputeUnitKind().
  std::string getComputeUnitKind() const override { return ComputeUnitKind; }

  /// Returns the clock frequency for the given AMDGPU device.
  uint64_t getClockFrequency() const override { return ClockFrequency; }

  /// Allocate and construct an AMDGPU kernel.
  Expected<GenericKernelTy &>
  constructKernel(const __tgt_offload_entry &KernelEntry) override {
    // Allocate and construct the AMDGPU kernel.
    AMDGPUKernelTy *AMDGPUKernel = Plugin::get().allocate<AMDGPUKernelTy>();
    if (!AMDGPUKernel)
      return Plugin::error("Failed to allocate memory for AMDGPU kernel");

    new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name);

    return *AMDGPUKernel;
  }

  /// Set the current context to this device's context. Do nothing since the
  /// AMDGPU devices do not have the concept of contexts.
  Error setContext() override { return Plugin::success(); }

  /// AMDGPU returns the product of the number of compute units and the waves
  /// per compute unit.
  uint64_t getHardwareParallelism() const override {
    return HardwareParallelism;
  }

  /// We want to set up the RPC server for host services to the GPU if it is
  /// available.
  bool shouldSetupRPCServer() const override {
    return libomptargetSupportsRPC();
  }

  /// The RPC interface should have enough space for all available parallelism.
  uint64_t requestedRPCPortCount() const override {
    return getHardwareParallelism();
  }
  /// Get the stream of the asynchronous info structure or get a new one.
  Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
                  AMDGPUStreamTy *&Stream) {
    // Get the stream (if any) from the async info.
    Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
    if (!Stream) {
      // There was no stream; get an idle one.
      if (auto Err = AMDGPUStreamManager.getResource(Stream))
        return Err;

      // Modify the async info's stream.
      AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
    }
    return Plugin::success();
  }
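
  // Note (hedged): streams are handed out lazily. The first asynchronous
  // operation on a __tgt_async_info grabs a pooled AMDGPUStreamTy and caches
  // it in the async info, so later operations on the same async info reuse it
  // until synchronizeImpl/queryAsyncImpl return it to the pool below.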
  /// Load the binary image into the device and allocate an image object.
  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
                                           int32_t ImageId) override {
    // Allocate and initialize the image object.
    AMDGPUDeviceImageTy *AMDImage =
        Plugin::get().allocate<AMDGPUDeviceImageTy>();
    new (AMDImage) AMDGPUDeviceImageTy(ImageId, TgtImage);

    // Load the HSA executable.
    if (Error Err = AMDImage->loadExecutable(*this))
      return std::move(Err);

    return AMDImage;
  }
  /// Allocate memory on the device or related to the device.
  void *allocate(size_t Size, void *, TargetAllocTy Kind) override;

  /// Deallocate memory on the device or related to the device.
  int free(void *TgtPtr, TargetAllocTy Kind) override {
    if (TgtPtr == nullptr)
      return OFFLOAD_SUCCESS;

    AMDGPUMemoryPoolTy *MemoryPool = nullptr;
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      MemoryPool = CoarseGrainedMemoryPools[0];
      break;
    case TARGET_ALLOC_HOST:
      MemoryPool = &HostDevice.getFineGrainedMemoryPool();
      break;
    case TARGET_ALLOC_SHARED:
      MemoryPool = &HostDevice.getFineGrainedMemoryPool();
      break;
    }

    if (!MemoryPool) {
      REPORT("No memory pool for the specified allocation kind\n");
      return OFFLOAD_FAIL;
    }

    if (Error Err = MemoryPool->deallocate(TgtPtr)) {
      REPORT("%s\n", toString(std::move(Err)).data());
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }
  /// Synchronize current thread with the pending operations on the async info.
  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
    AMDGPUStreamTy *Stream =
        reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
    assert(Stream && "Invalid stream");

    if (auto Err = Stream->synchronize())
      return Err;

    // Once the stream is synchronized, return it to stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    AsyncInfo.Queue = nullptr;
    return AMDGPUStreamManager.returnResource(Stream);
  }

  /// Query for the completion of the pending operations on the async info.
  Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
    AMDGPUStreamTy *Stream =
        reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
    assert(Stream && "Invalid stream");

    auto CompletedOrErr = Stream->query();
    if (!CompletedOrErr)
      return CompletedOrErr.takeError();

    // Return if the stream did not complete yet.
    if (!(*CompletedOrErr))
      return Plugin::success();

    // Once the stream is completed, return it to stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    AsyncInfo.Queue = nullptr;
    return AMDGPUStreamManager.returnResource(Stream);
  }
  /// Pin the host buffer and return the device pointer that should be used for
  /// device transfers.
  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
    void *PinnedPtr = nullptr;

    hsa_status_t Status =
        hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr);
    if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
      return std::move(Err);

    return PinnedPtr;
  }

  /// Unpin the host buffer.
  Error dataUnlockImpl(void *HstPtr) override {
    hsa_status_t Status = hsa_amd_memory_unlock(HstPtr);
    return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
  }

  /// Check through the HSA runtime whether the \p HstPtr buffer is pinned.
  Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
                                 void *&BaseDevAccessiblePtr,
                                 size_t &BaseSize) const override {
    hsa_amd_pointer_info_t Info;
    Info.size = sizeof(hsa_amd_pointer_info_t);

    hsa_status_t Status =
        hsa_amd_pointer_info(HstPtr, &Info, /* Allocator */ nullptr,
                             /* Number of accessible agents (out) */ nullptr,
                             /* Accessible agents */ nullptr);
    if (auto Err = Plugin::check(Status, "Error in hsa_amd_pointer_info: %s"))
      return std::move(Err);

    // The buffer may be locked or allocated through HSA allocators. Assume
    // that the buffer is host pinned if the runtime reports a HSA type.
    if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED &&
        Info.type != HSA_EXT_POINTER_TYPE_HSA)
      return false;

    assert(Info.hostBaseAddress && "Invalid host pinned address");
    assert(Info.agentBaseAddress && "Invalid agent pinned address");
    assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size");

    // Save the allocation info in the output parameters.
    BaseHstPtr = Info.hostBaseAddress;
    BaseDevAccessiblePtr = Info.agentBaseAddress;
    BaseSize = Info.sizeInBytes;

    return true;
  }
  /// Submit data to the device (host to device transfer).
  Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    AMDGPUStreamTy *Stream = nullptr;
    void *PinnedPtr = nullptr;

    // Use one-step asynchronous operation when host memory is already pinned.
    if (void *PinnedPtr =
            PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
      if (auto Err = getStream(AsyncInfoWrapper, Stream))
        return Err;
      return Stream->pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size);
    }

    // For large transfers use synchronous behavior.
    if (Size >= OMPX_MaxAsyncCopyBytes) {
      if (AsyncInfoWrapper.hasQueue())
        if (auto Err = synchronize(AsyncInfoWrapper))
          return Err;

      hsa_status_t Status;
      Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
                                   &PinnedPtr);
      if (auto Err =
              Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
        return Err;

      AMDGPUSignalTy Signal;
      if (auto Err = Signal.init())
        return Err;

      Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedPtr, Agent, Size,
                                         0, nullptr, Signal.get());
      if (auto Err =
              Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
        return Err;

      if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
        return Err;

      if (auto Err = Signal.deinit())
        return Err;

      Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
      return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
    }

    // Otherwise, use two-step copy with an intermediate pinned host buffer.
    AMDGPUMemoryManagerTy &PinnedMemoryManager =
        HostDevice.getPinnedMemoryManager();
    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
      return Err;

    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size,
                                          PinnedMemoryManager);
  }
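
  // Summary of the submit path above (hedged): (1) if the host buffer was
  // previously locked via dataLockImpl, issue a single async copy on the
  // stream; (2) if the transfer is at least OMPX_MaxAsyncCopyBytes, lock the
  // buffer temporarily and perform a synchronous copy guarded by a private
  // signal; (3) otherwise stage through a pooled pinned buffer and let the
  // stream run the two-step host-to-device copy asynchronously.
  // dataRetrieveImpl below mirrors the same three cases in the
  // device-to-host direction.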
  /// Retrieve data from the device (device to host transfer).
  Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    AMDGPUStreamTy *Stream = nullptr;
    void *PinnedPtr = nullptr;

    // Use one-step asynchronous operation when host memory is already pinned.
    if (void *PinnedPtr =
            PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
      if (auto Err = getStream(AsyncInfoWrapper, Stream))
        return Err;

      return Stream->pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size);
    }

    // For large transfers use synchronous behavior.
    if (Size >= OMPX_MaxAsyncCopyBytes) {
      if (AsyncInfoWrapper.hasQueue())
        if (auto Err = synchronize(AsyncInfoWrapper))
          return Err;

      hsa_status_t Status;
      Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
                                   &PinnedPtr);
      if (auto Err =
              Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
        return Err;

      AMDGPUSignalTy Signal;
      if (auto Err = Signal.init())
        return Err;

      Status = hsa_amd_memory_async_copy(PinnedPtr, Agent, TgtPtr, Agent, Size,
                                         0, nullptr, Signal.get());
      if (auto Err =
              Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
        return Err;

      if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
        return Err;

      if (auto Err = Signal.deinit())
        return Err;

      Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
      return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
    }

    // Otherwise, use two-step copy with an intermediate pinned host buffer.
    AMDGPUMemoryManagerTy &PinnedMemoryManager =
        HostDevice.getPinnedMemoryManager();
    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
      return Err;

    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size,
                                          PinnedMemoryManager);
  }
  /// Exchange data between two devices within the plugin.
  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
                         void *DstPtr, int64_t Size,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);

    AMDGPUStreamTy *Stream = nullptr;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;
    if (Size <= 0)
      return Plugin::success();

    return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr,
                                          getAgent(), (uint64_t)Size);
  }
  /// Initialize the async info for interoperability purposes.
  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    // TODO: Implement this function.
    return Plugin::success();
  }

  /// Initialize the device info for interoperability purposes.
  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
    DeviceInfo->Context = nullptr;

    if (!DeviceInfo->Device)
      DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle);

    return Plugin::success();
  }
  /// Create an event.
  Error createEventImpl(void **EventPtrStorage) override {
    AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
    return AMDGPUEventManager.getResource(*Event);
  }

  /// Destroy a previously created event.
  Error destroyEventImpl(void *EventPtr) override {
    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
    return AMDGPUEventManager.returnResource(Event);
  }

  /// Record the event.
  Error recordEventImpl(void *EventPtr,
                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
    assert(Event && "Invalid event");

    AMDGPUStreamTy *Stream = nullptr;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    return Event->record(*Stream);
  }

  /// Make the stream wait on the event.
  Error waitEventImpl(void *EventPtr,
                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);

    AMDGPUStreamTy *Stream = nullptr;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    return Event->wait(*Stream);
  }

  /// Synchronize the current thread with the event.
  Error syncEventImpl(void *EventPtr) override {
    return Plugin::error("Synchronize event not implemented");
  }
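
  // Flow sketch (hedged): libomptarget drives these hooks as
  // createEventImpl -> recordEventImpl (on the producing stream) ->
  // waitEventImpl (on the consuming stream) -> destroyEventImpl, which map
  // directly onto AMDGPUStreamTy::recordEvent/waitEvent defined earlier.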
  /// Print information about the device.
  Error obtainInfoImpl(InfoQueueTy &Info) override {
    char TmpChar[1000];
    const char *TmpCharPtr = "Unknown";
    uint16_t Major, Minor;
    uint32_t TmpUInt, TmpUInt2;
    uint32_t CacheSize[4];
    size_t TmpSt;
    bool TmpBool;
    uint16_t WorkgrpMaxDim[3];
    hsa_dim3_t GridMaxDim;
    hsa_status_t Status, Status2;

    Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major);
    Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
    if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS)
      Info.add("HSA Runtime Version",
               std::to_string(Major) + "." + std::to_string(Minor));

    Info.add("HSA OpenMP Device Number", DeviceId);

    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Product Name", TmpChar);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Device Name", TmpChar);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Vendor Name", TmpChar);

    hsa_device_type_t DevType;
    Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
    if (Status == HSA_STATUS_SUCCESS) {
      switch (DevType) {
      case HSA_DEVICE_TYPE_CPU:
        TmpCharPtr = "CPU";
        break;
      case HSA_DEVICE_TYPE_GPU:
        TmpCharPtr = "GPU";
        break;
      case HSA_DEVICE_TYPE_DSP:
        TmpCharPtr = "DSP";
        break;
      }
      Info.add("Device Type", TmpCharPtr);
    }

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Max Queues", TmpUInt);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Queue Min Size", TmpUInt);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Queue Max Size", TmpUInt);

    // FIXME: This is deprecated according to HSA documentation. But using
    // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
    // runtime.
    Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize);
    if (Status == HSA_STATUS_SUCCESS) {
      Info.add("Cache");
      for (int I = 0; I < 4; I++)
        if (CacheSize[I])
          Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]);
    }

    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Cacheline Size", TmpUInt);

    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Max Clock Freq", TmpUInt, "MHz");

    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Compute Units", TmpUInt);

    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("SIMD per CU", TmpUInt);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Fast F16 Operation", TmpBool);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Wavefront Size", TmpUInt2);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Workgroup Max Size", TmpUInt);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
    if (Status == HSA_STATUS_SUCCESS) {
      Info.add("Workgroup Max Size per Dimension");
      Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]);
      Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]);
      Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]);
    }

    Status = getDeviceAttrRaw(
        (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS) {
      Info.add("Max Waves Per CU", TmpUInt);
      Info.add("Max Work-item Per CU", TmpUInt * TmpUInt2);
    }

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Grid Max Size", TmpUInt);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
    if (Status == HSA_STATUS_SUCCESS) {
      Info.add("Grid Max Size per Dimension");
      Info.add<InfoLevel2>("x", GridMaxDim.x);
      Info.add<InfoLevel2>("y", GridMaxDim.y);
      Info.add<InfoLevel2>("z", GridMaxDim.z);
    }

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Max fbarriers/Workgrp", TmpUInt);

    Info.add("Memory Pools");
    for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
      std::string TmpStr, TmpStr2;

      if (Pool->isGlobal())
        TmpStr = "Global";
      else if (Pool->isReadOnly())
        TmpStr = "ReadOnly";
      else if (Pool->isPrivate())
        TmpStr = "Private";
      else if (Pool->isGroup())
        TmpStr = "Group";
      else
        TmpStr = "Unknown";

      Info.add<InfoLevel2>(std::string("Pool ") + TmpStr);

      if (Pool->isGlobal()) {
        if (Pool->isFineGrained())
          TmpStr2 += "Fine Grained ";
        if (Pool->isCoarseGrained())
          TmpStr2 += "Coarse Grained ";
        if (Pool->supportsKernelArgs())
          TmpStr2 += "Kernarg ";

        Info.add<InfoLevel3>("Flags", TmpStr2);
      }

      Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
      if (Status == HSA_STATUS_SUCCESS)
        Info.add<InfoLevel3>("Size", TmpSt, "bytes");

      Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
                                TmpBool);
      if (Status == HSA_STATUS_SUCCESS)
        Info.add<InfoLevel3>("Allocatable", TmpBool);

      Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
                                TmpSt);
      if (Status == HSA_STATUS_SUCCESS)
        Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes");

      Status = Pool->getAttrRaw(
          HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt);
      if (Status == HSA_STATUS_SUCCESS)
        Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes");

      Status =
          Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
      if (Status == HSA_STATUS_SUCCESS)
        Info.add<InfoLevel3>("Accessible by all", TmpBool);
    }

    Info.add("ISAs");
    auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
      Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
      if (Status == HSA_STATUS_SUCCESS)
        Info.add<InfoLevel2>("Name", TmpChar);
      return Status;
    });

    // Silently consume the error.
    if (Err)
      consumeError(std::move(Err));

    return Plugin::success();
  }
  /// Getters and setters for stack and heap sizes.
  Error getDeviceStackSize(uint64_t &Value) override {
    Value = 0;
    return Plugin::success();
  }
  Error setDeviceStackSize(uint64_t Value) override {
    return Plugin::success();
  }
  Error getDeviceHeapSize(uint64_t &Value) override {
    Value = DeviceMemoryPoolSize;
    return Plugin::success();
  }
  Error setDeviceHeapSize(uint64_t Value) override {
    for (DeviceImageTy *Image : LoadedImages)
      if (auto Err = setupDeviceMemoryPool(Plugin::get(), *Image, Value))
        return Err;
    DeviceMemoryPoolSize = Value;
    return Plugin::success();
  }
  Error getDeviceMemorySize(uint64_t &Value) override {
    for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
      if (Pool->isGlobal()) {
        hsa_status_t Status =
            Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
        return Plugin::check(Status, "Error in getting device memory size: %s");
      }
    }
    return Plugin::error("getDeviceMemorySize:: no global pool");
  }

  /// AMDGPU-specific function to get device attributes.
  template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
    hsa_status_t Status =
        hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
    return Plugin::check(Status, "Error in hsa_agent_get_info: %s");
  }

  template <typename Ty>
  hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) {
    return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
  }

  /// Get the device agent.
  hsa_agent_t getAgent() const override { return Agent; }

  /// Get the signal manager.
  AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; }

  /// Retrieve and construct all memory pools of the device agent.
  Error retrieveAllMemoryPools() override {
    // Iterate through the available pools of the device agent.
    return utils::iterateAgentMemoryPools(
        Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
          AMDGPUMemoryPoolTy *MemoryPool =
              Plugin::get().allocate<AMDGPUMemoryPoolTy>();
          new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool);
          AllMemoryPools.push_back(MemoryPool);
          return HSA_STATUS_SUCCESS;
        });
  }

private:
  using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
  using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
  /// Envar for controlling the number of HSA queues per device. High number of
  /// queues may degrade performance.
  UInt32Envar OMPX_NumQueues;

  /// Envar for controlling the size of each HSA queue. The size is the number
  /// of HSA packets a queue is expected to hold. It is also the number of HSA
  /// packets that can be pushed into each queue without waiting the driver to
  /// process them.
  UInt32Envar OMPX_QueueSize;

  /// Envar for controlling the default number of teams relative to the number
  /// of compute units (CUs) the device has:
  ///   #default_teams = OMPX_DefaultTeamsPerCU * #CUs.
  UInt32Envar OMPX_DefaultTeamsPerCU;

  /// Envar specifying the maximum size in bytes where the memory copies are
  /// asynchronous operations. Up to this transfer size, the memory copies are
  /// asynchronous operations pushed to the corresponding stream. For larger
  /// transfers, they are synchronous transfers.
  UInt32Envar OMPX_MaxAsyncCopyBytes;

  /// Envar controlling the initial number of HSA signals per device. There is
  /// one manager of signals per device managing several pre-allocated signals.
  /// These signals are mainly used by AMDGPU streams. If needed, more signals
  /// will be created.
  UInt32Envar OMPX_InitialNumSignals;

  /// Environment variable to set the time to wait in active state before
  /// switching to blocked state. The default 2000000 busywaits for 2 seconds
  /// before going into a blocking HSA wait state. The unit for this variable
  /// is microseconds.
  UInt32Envar OMPX_StreamBusyWait;

  /// Stream manager for AMDGPU streams.
  AMDGPUStreamManagerTy AMDGPUStreamManager;

  /// Event manager for AMDGPU events.
  AMDGPUEventManagerTy AMDGPUEventManager;

  /// Signal manager for AMDGPU signals.
  AMDGPUSignalManagerTy AMDGPUSignalManager;

  /// The agent handler corresponding to the device.
  hsa_agent_t Agent;

  /// The GPU architecture.
  std::string ComputeUnitKind;

  /// The frequency of the steady clock inside the device.
  uint64_t ClockFrequency;

  /// The total number of concurrent work items that can be running on the GPU.
  uint64_t HardwareParallelism;

  /// Reference to the host device.
  AMDHostDeviceTy &HostDevice;

  /// The current size of the global device memory pool (managed by us).
  uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
};
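
// Example (hedged): setting LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT=0 makes the
// envar above select the blocking HSA wait state immediately instead of
// busy-waiting, trading some completion latency for lower host CPU usage
// while kernels and copies are in flight.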
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
  hsa_status_t Status;
  Status = hsa_code_object_deserialize(getStart(), getSize(), "", &CodeObject);
  if (auto Err =
          Plugin::check(Status, "Error in hsa_code_object_deserialize: %s"))
    return Err;

  Status = hsa_executable_create_alt(
      HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", &Executable);
  if (auto Err =
          Plugin::check(Status, "Error in hsa_executable_create_alt: %s"))
    return Err;

  Status = hsa_executable_load_code_object(Executable, Device.getAgent(),
                                           CodeObject, "");
  if (auto Err =
          Plugin::check(Status, "Error in hsa_executable_load_code_object: %s"))
    return Err;

  Status = hsa_executable_freeze(Executable, "");
  if (auto Err = Plugin::check(Status, "Error in hsa_executable_freeze: %s"))
    return Err;

  uint32_t Result;
  Status = hsa_executable_validate(Executable, &Result);
  if (auto Err = Plugin::check(Status, "Error in hsa_executable_validate: %s"))
    return Err;

  if (Result)
    return Plugin::error("Loaded HSA executable does not validate");

  if (auto Err = utils::readAMDGPUMetaDataFromImage(
          getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
    return Err;

  return Plugin::success();
}
Expected<hsa_executable_symbol_t>
AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device,
                                      StringRef SymbolName) const {
  AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
  hsa_agent_t Agent = AMDGPUDevice.getAgent();

  hsa_executable_symbol_t Symbol;
  hsa_status_t Status = hsa_executable_get_symbol_by_name(
      Executable, SymbolName.data(), &Agent, &Symbol);
  if (auto Err = Plugin::check(
          Status, "Error in hsa_executable_get_symbol_by_name(%s): %s",
          SymbolName.data()))
    return std::move(Err);

  return Symbol;
}
template <typename ResourceTy>
Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) {
  if (Resource)
    return Plugin::error("Creating an existing resource");

  AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);

  Resource = new ResourceTy(AMDGPUDevice);

  return Resource->init();
}

AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
    : Agent(Device.getAgent()), Queue(nullptr),
      SignalManager(Device.getSignalManager()), Device(Device),
      // Initialize the std::deque with some empty positions.
      Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
      StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
/// Class implementing the AMDGPU-specific functionalities of the global
/// handler.
struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy {
  /// Get the metadata of a global from the device. The name and size of the
  /// global is read from DeviceGlobal and the address of the global is written
  /// to DeviceGlobal.
  Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
                                    DeviceImageTy &Image,
                                    GlobalTy &DeviceGlobal) override {
    AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);

    // Find the symbol on the device executable.
    auto SymbolOrErr =
        AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName());
    if (!SymbolOrErr)
      return SymbolOrErr.takeError();

    hsa_executable_symbol_t Symbol = *SymbolOrErr;
    hsa_symbol_kind_t SymbolType;
    hsa_status_t Status;
    uint64_t SymbolAddr;
    uint32_t SymbolSize;

    // Retrieve the type, address and size of the symbol.
    std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
        {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
        {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr},
        {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}};

    for (auto &Info : RequiredInfos) {
      Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
      if (auto Err = Plugin::check(
              Status, "Error in hsa_executable_symbol_get_info: %s"))
        return Err;
    }

    // Check the size of the symbol.
    if (SymbolSize != DeviceGlobal.getSize())
      return Plugin::error(
          "Failed to load global '%s' due to size mismatch (%zu != %zu)",
          DeviceGlobal.getName().data(), SymbolSize,
          (size_t)DeviceGlobal.getSize());

    // Store the symbol address on the device global metadata.
    DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr));

    return Plugin::success();
  }
};
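
// Usage sketch (hedged): the generic global handler calls this hook whenever
// libomptarget needs the device address of a declare-target variable. The
// symbol is resolved through findDeviceSymbol and the resulting address and
// size are stored in the GlobalTy descriptor for subsequent reads or writes.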
/// Class implementing the AMDGPU-specific functionalities of the plugin.
struct AMDGPUPluginTy final : public GenericPluginTy {
  /// Create an AMDGPU plugin and initialize the AMDGPU driver.
  AMDGPUPluginTy()
      : GenericPluginTy(getTripleArch()), Initialized(false),
        HostDevice(nullptr) {}

  /// This class should not be copied.
  AMDGPUPluginTy(const AMDGPUPluginTy &) = delete;
  AMDGPUPluginTy(AMDGPUPluginTy &&) = delete;

  /// Initialize the plugin and return the number of devices.
  Expected<int32_t> initImpl() override {
    hsa_status_t Status = hsa_init();
    if (Status != HSA_STATUS_SUCCESS) {
      // Cannot call hsa_success_string.
      DP("Failed to initialize AMDGPU's HSA library\n");
      return 0;
    }

    // The initialization of HSA was successful. It should be safe to call
    // HSA functions from now on, e.g., hsa_shut_down.
    Initialized = true;

#ifdef OMPT_SUPPORT
    ompt::connectLibrary();
#endif

    // Register event handler to detect memory errors on the devices.
    Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
    if (auto Err = Plugin::check(
            Status, "Error in hsa_amd_register_system_event_handler: %s"))
      return std::move(Err);

    // List of host (CPU) agents.
    llvm::SmallVector<hsa_agent_t> HostAgents;

    // Count the number of available agents.
    auto Err = utils::iterateAgents([&](hsa_agent_t Agent) {
      // Get the device type of the agent.
      hsa_device_type_t DeviceType;
      hsa_status_t Status =
          hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
      if (Status != HSA_STATUS_SUCCESS)
        return Status;

      // Classify the agents into kernel (GPU) and host (CPU) agents.
      if (DeviceType == HSA_DEVICE_TYPE_GPU) {
        // Ensure that the GPU agent supports kernel dispatch packets.
        hsa_agent_feature_t Features;
        Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
        if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          KernelAgents.push_back(Agent);
      } else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
        HostAgents.push_back(Agent);
      }
      return HSA_STATUS_SUCCESS;
    });

    if (Err)
      return std::move(Err);

    int32_t NumDevices = KernelAgents.size();
    if (NumDevices == 0) {
      // Do not initialize if there are no devices.
      DP("There are no devices supporting AMDGPU.\n");
      return 0;
    }

    // There are kernel agents but there is no host agent. That should be
    // treated as an error.
    if (HostAgents.empty())
      return Plugin::error("No AMDGPU host agents");

    // Initialize the host device using host agents.
    HostDevice = allocate<AMDHostDeviceTy>();
    new (HostDevice) AMDHostDeviceTy(HostAgents);

    // Set up the memory pools available to the host.
    if (auto Err = HostDevice->init())
      return std::move(Err);

    return NumDevices;
  }

  /// Deinitialize the plugin.
  Error deinitImpl() override {
    // The HSA runtime was not initialized, so nothing from the plugin was
    // actually initialized.
    if (!Initialized)
      return Plugin::success();

    if (HostDevice)
      if (auto Err = HostDevice->deinit())
        return Err;

    // Finalize the HSA runtime.
    hsa_status_t Status = hsa_shut_down();
    return Plugin::check(Status, "Error in hsa_shut_down: %s");
  }

  Triple::ArchType getTripleArch() const override { return Triple::amdgcn; }

  /// Get the ELF code for recognizing the compatible image binary.
  uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; }

  /// Check whether the image is compatible with an AMDGPU device.
  Expected<bool> isImageCompatible(__tgt_image_info *Info) const override {
    for (hsa_agent_t Agent : KernelAgents) {
      std::string Target;
      auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
        uint32_t Length;
        hsa_status_t Status;
        Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
        if (Status != HSA_STATUS_SUCCESS)
          return Status;

        llvm::SmallVector<char> ISAName(Length);
        Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
        if (Status != HSA_STATUS_SUCCESS)
          return Status;

        llvm::StringRef TripleTarget(ISAName.begin(), Length);
        if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
          Target = TripleTarget.ltrim('-').rtrim('\0').str();
        return HSA_STATUS_SUCCESS;
      });
      if (Err)
        return std::move(Err);

      if (!utils::isImageCompatibleWithEnv(Info, Target))
        return false;
    }
    return true;
  }

  bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
    return true;
  }

  /// Get the host device instance.
  AMDHostDeviceTy &getHostDevice() {
    assert(HostDevice && "Host device not initialized");
    return *HostDevice;
  }

  /// Get the kernel agent with the corresponding agent id.
  hsa_agent_t getKernelAgent(int32_t AgentId) const {
    assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id");
    return KernelAgents[AgentId];
  }

  /// Get the list of the available kernel agents.
  const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const {
    return KernelAgents;
  }

private:
  /// Event handler that will be called by ROCr if an event is detected.
  static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
    if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
      return HSA_STATUS_SUCCESS;

    SmallVector<std::string> Reasons;
    uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask;
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT)
      Reasons.emplace_back("Page not present or supervisor privilege");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY)
      Reasons.emplace_back("Write access to a read-only page");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX)
      Reasons.emplace_back("Execute access to a page marked NX");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY)
      Reasons.emplace_back("GPU attempted access to a host only page");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC)
      Reasons.emplace_back("DRAM ECC failure");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE)
      Reasons.emplace_back("Can't determine the exact fault address");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC)
      Reasons.emplace_back("SRAM ECC failure (ie registers, no fault address)");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG)
      Reasons.emplace_back("GPU reset following unspecified hang");

    // If we do not know the reason, say so, otherwise remove the trailing comma
    // and space.
    if (Reasons.empty())
      Reasons.emplace_back("Unknown (" + std::to_string(ReasonsMask) + ")");

    uint32_t Node = -1;
    hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);

    // Abort the execution since we do not recover from this error.
    FATAL_MESSAGE(1,
                  "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
                  ") at virtual address %p. Reasons: %s",
                  Node, Event->memory_fault.agent.handle,
                  (void *)Event->memory_fault.virtual_address,
                  llvm::join(Reasons, ", ").c_str());

    return HSA_STATUS_ERROR;
  }

  /// Indicate whether the HSA runtime was correctly initialized. Even if there
  /// are no available devices this boolean will be true. It indicates whether
  /// we can safely call HSA functions (e.g., hsa_shut_down).
  bool Initialized;

  /// Arrays of the available GPU and CPU agents. These arrays of handles should
  /// not be here but in the AMDGPUDeviceTy structures directly. However, the
  /// HSA standard does not provide API functions to retrieve agents directly,
  /// only iterating functions. We cache the agents here for convenience.
  llvm::SmallVector<hsa_agent_t> KernelAgents;

  /// The device representing all HSA host agents.
  AMDHostDeviceTy *HostDevice;
};
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                                 uint32_t NumThreads, uint64_t NumBlocks,
                                 KernelArgsTy &KernelArgs, void *Args,
                                 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *);

  if (ArgsSize < KernelArgsSize)
    return Plugin::error("Mismatch of kernel arguments size");

  // The args size reported by HSA may or may not contain the implicit args.
  // For now, assume that HSA does not consider the implicit arguments when
  // reporting the arguments of a kernel. In the worst case, we can waste
  // 56 bytes per allocation.
  uint32_t AllArgsSize = KernelArgsSize + ImplicitArgsSize;

  AMDHostDeviceTy &HostDevice = Plugin::get<AMDGPUPluginTy>().getHostDevice();
  AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager();

  void *AllArgs = nullptr;
  if (auto Err = ArgsMemoryManager.allocate(AllArgsSize, &AllArgs))
    return Err;

  // Account for user requested dynamic shared memory.
  uint32_t GroupSize = getGroupSize();
  if (uint32_t MaxDynCGroupMem = std::max(
          KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
    GroupSize += MaxDynCGroupMem;
  }

  // Initialize implicit arguments.
  utils::AMDGPUImplicitArgsTy *ImplArgs =
      reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
          advanceVoidPtr(AllArgs, KernelArgsSize));

  // Initialize the implicit arguments to zero.
  std::memset(ImplArgs, 0, ImplicitArgsSize);

  // Copy the explicit arguments.
  // TODO: We should expose the args memory manager alloc to the common part as
  // alternative to copying them twice.
  if (KernelArgs.NumArgs)
    std::memcpy(AllArgs, *static_cast<void **>(Args),
                sizeof(void *) * KernelArgs.NumArgs);

  AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);

  AMDGPUStreamTy *Stream = nullptr;
  if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
    return Err;

  // If this kernel requires an RPC server we attach its pointer to the stream.
  if (GenericDevice.getRPCServer())
    Stream->setRPCServer(GenericDevice.getRPCServer());

  // Only COV5 implicitargs need to be set. COV4 implicitargs are not used.
  if (getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) {
    ImplArgs->BlockCountX = NumBlocks;
    ImplArgs->GroupSizeX = NumThreads;
    ImplArgs->GroupSizeY = 1;
    ImplArgs->GroupSizeZ = 1;
    ImplArgs->GridDims = 1;
  }

  // Push the kernel launch into the stream.
  return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
                                  GroupSize, ArgsMemoryManager);
}
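
// Note (hedged): the kernarg buffer handed to HSA is laid out as the explicit
// kernel arguments (NumArgs pointers) immediately followed by the target
// implicit arguments; for code-object v5 the block/group dimensions are
// filled in above, while for v4 the zero-initialized tail is left untouched.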
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                             KernelArgsTy &KernelArgs,
                                             uint32_t NumThreads,
                                             uint64_t NumBlocks) const {
  // Only do all this when the output is requested.
  if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
    return Plugin::success();

  // We don't have data to print additional info, but no hard error.
  if (!KernelInfo.has_value())
    return Plugin::success();

  auto NumGroups = NumBlocks;
  auto ThreadsPerGroup = NumThreads;

  // Kernel arguments information.
  auto ArgNum = KernelArgs.NumArgs;
  auto LoopTripCount = KernelArgs.Tripcount;

  // Details for AMDGPU kernels (read from image)
  // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata
  auto GroupSegmentSize = (*KernelInfo).GroupSegmentList;
  auto SGPRCount = (*KernelInfo).SGPRCount;
  auto VGPRCount = (*KernelInfo).VGPRCount;
  auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount;
  auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
  auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;

  // Prints additional launch info that contains the following.
  // Num Args: The number of kernel arguments
  // Teams x Thrds: The number of teams and the number of threads actually
  //                running
  // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the
  //                       kernel in work-items
  // LDS Usage: Amount of bytes used in LDS storage
  // S/VGPR Count: the number of S/V GPRs occupied by the kernel
  // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel
  // Tripcount: loop tripcount for the kernel
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
       "#Args: %d Teams x Thrds: %4lux%4u (MaxFlatWorkGroupSize: %u) LDS "
       "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
       "%lu\n",
       ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize,
       GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount,
       LoopTripCount);

  return Plugin::success();
}
GenericPluginTy *Plugin::createPlugin() { return new AMDGPUPluginTy(); }

GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {
  AMDGPUPluginTy &Plugin = get<AMDGPUPluginTy &>();
  return new AMDGPUDeviceTy(DeviceId, NumDevices, Plugin.getHostDevice(),
                            Plugin.getKernelAgent(DeviceId));
}

GenericGlobalHandlerTy *Plugin::createGlobalHandler() {
  return new AMDGPUGlobalHandlerTy();
}

template <typename... ArgsTy>
Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
  hsa_status_t ResultCode = static_cast<hsa_status_t>(Code);
  if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK)
    return Error::success();

  const char *Desc = "Unknown error";
  hsa_status_t Ret = hsa_status_string(ResultCode, &Desc);
  if (Ret != HSA_STATUS_SUCCESS)
    REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);

  return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
                                                     ErrFmt, Args..., Desc);
}
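
// Usage sketch (hedged, the call below is illustrative): raw hsa_status_t
// values throughout this file funnel through Plugin::check, e.g.
//   if (auto Err = Plugin::check(Status, "Error in <hsa call>: %s"))
//     return Err;
// which appends the hsa_status_string description as the final %s argument of
// the format string and maps success/info-break to Error::success().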
void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
                                      TargetAllocTy Kind) {
  // Allocate memory from the pool.
  void *Ptr = nullptr;
  if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
    consumeError(std::move(Err));
    return nullptr;
  }
  assert(Ptr && "Invalid pointer");

  auto &KernelAgents = Plugin::get<AMDGPUPluginTy>().getKernelAgents();

  // Allow all kernel agents to access the allocation.
  if (auto Err = MemoryPool->enableAccess(Ptr, Size, KernelAgents)) {
    REPORT("%s\n", toString(std::move(Err)).data());
    return nullptr;
  }

  return Ptr;
}

void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
  if (Size == 0)
    return nullptr;

  // Find the correct memory pool.
  AMDGPUMemoryPoolTy *MemoryPool = nullptr;
  switch (Kind) {
  case TARGET_ALLOC_DEFAULT:
  case TARGET_ALLOC_DEVICE:
    MemoryPool = CoarseGrainedMemoryPools[0];
    break;
  case TARGET_ALLOC_HOST:
    MemoryPool = &HostDevice.getFineGrainedMemoryPool();
    break;
  case TARGET_ALLOC_SHARED:
    MemoryPool = &HostDevice.getFineGrainedMemoryPool();
    break;
  }

  if (!MemoryPool) {
    REPORT("No memory pool for the specified allocation kind\n");
    return nullptr;
  }

  // Allocate from the corresponding memory pool.
  void *Alloc = nullptr;
  if (Error Err = MemoryPool->allocate(Size, &Alloc)) {
    REPORT("%s\n", toString(std::move(Err)).data());
    return nullptr;
  }

  if (Alloc) {
    auto &KernelAgents = Plugin::get<AMDGPUPluginTy>().getKernelAgents();
    // Inherently necessary for host or shared allocations.
    // Also enabled for device memory to allow device-to-device memcpy.

    // Enable all kernel agents to access the buffer.
    if (auto Err = MemoryPool->enableAccess(Alloc, Size, KernelAgents)) {
      REPORT("%s\n", toString(std::move(Err)).data());
      return nullptr;
    }
  }

  return Alloc;
}
} // namespace plugin
} // namespace target