1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //==-----------------------------------------------------------------------===//
10 /// Base class for AMDGPU specific classes of TargetSubtarget.
12 //===----------------------------------------------------------------------===//
14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
17 #include "llvm/IR/CallingConv.h"
18 #include "llvm/Support/Alignment.h"
19 #include "llvm/TargetParser/Triple.h"
// Opaque declaration of the DWARF register number mapping flavour enum, with
// a fixed underlying type of unsigned. In the part of this header visible
// here it is only named as the return type of
// AMDGPUSubtarget::getAMDGPUDwarfFlavour().
23 enum AMDGPUDwarfFlavour
: unsigned;
// Forward declaration: the declarations visible in this header only take
// MachineFunction by const reference, which does not require the full class
// definition.
26 class MachineFunction
;
29 class AMDGPUSubtarget
{
50 bool GCN3Encoding
= false;
51 bool Has16BitInsts
= false;
52 bool HasTrue16BitInsts
= false;
53 bool EnableRealTrue16Insts
= false;
54 bool HasMadMixInsts
= false;
55 bool HasMadMacF32Insts
= false;
56 bool HasDsSrc2Insts
= false;
58 bool HasVOP3PInsts
= false;
59 bool HasMulI24
= true;
60 bool HasMulU24
= true;
61 bool HasSMulHi
= false;
62 bool HasInv2PiInlineImm
= false;
63 bool HasFminFmaxLegacy
= true;
64 bool EnablePromoteAlloca
= false;
65 bool HasTrigReducedRange
= false;
66 bool FastFMAF32
= false;
67 unsigned EUsPerCU
= 4;
68 unsigned MaxWavesPerEU
= 10;
69 unsigned LocalMemorySize
= 0;
70 unsigned AddressableLocalMemorySize
= 0;
71 char WavefrontSizeLog2
= 0;
74 AMDGPUSubtarget(Triple TT
);
76 static const AMDGPUSubtarget
&get(const MachineFunction
&MF
);
77 static const AMDGPUSubtarget
&get(const TargetMachine
&TM
,
80 /// \returns Default flat work group size range for calling convention \p CC.
81 std::pair
<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC
) const;
83 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
84 /// for function \p F, or minimum/maximum flat work group sizes explicitly
85 /// requested using "amdgpu-flat-work-group-size" attribute attached to
86 /// function \p F.
88 /// \returns Subtarget's default values if explicitly requested values cannot
89 /// be converted to integer, or violate subtarget's specifications.
90 std::pair
<unsigned, unsigned> getFlatWorkGroupSizes(const Function
&F
) const;
92 /// \returns Subtarget's default pair of minimum/maximum number of waves per
93 /// execution unit for function \p F, or minimum/maximum number of waves per
94 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
95 /// attached to function \p F.
97 /// \returns Subtarget's default values if explicitly requested values cannot
98 /// be converted to integer, violate subtarget's specifications, or are not
99 /// compatible with minimum/maximum number of waves limited by flat work group
100 /// size, register usage, and/or lds usage.
101 std::pair
<unsigned, unsigned> getWavesPerEU(const Function
&F
) const {
102 // Default/requested minimum/maximum flat work group sizes.
103 std::pair
<unsigned, unsigned> FlatWorkGroupSizes
= getFlatWorkGroupSizes(F
);
104 return getWavesPerEU(F
, FlatWorkGroupSizes
);
107 /// Overload which uses the specified values for the flat work group sizes,
108 /// rather than querying the function itself. \p FlatWorkGroupSizes should
109 /// correspond to the function's value for getFlatWorkGroupSizes.
110 std::pair
<unsigned, unsigned>
111 getWavesPerEU(const Function
&F
,
112 std::pair
<unsigned, unsigned> FlatWorkGroupSizes
) const;
113 std::pair
<unsigned, unsigned> getEffectiveWavesPerEU(
114 std::pair
<unsigned, unsigned> WavesPerEU
,
115 std::pair
<unsigned, unsigned> FlatWorkGroupSizes
) const;
117 /// Return the maximum amount of LDS that can be used without restricting
118 /// the occupancy below \p WaveCount.
119 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount
,
120 const Function
&) const;
122 /// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wave count
123 /// if the given LDS memory size is the only constraint.
124 unsigned getOccupancyWithLocalMemSize(uint32_t Bytes
, const Function
&) const;
126 unsigned getOccupancyWithLocalMemSize(const MachineFunction
&MF
) const;
128 bool isAmdHsaOS() const {
129 return TargetTriple
.getOS() == Triple::AMDHSA
;
132 bool isAmdPalOS() const {
133 return TargetTriple
.getOS() == Triple::AMDPAL
;
136 bool isMesa3DOS() const {
137 return TargetTriple
.getOS() == Triple::Mesa3D
;
140 bool isMesaKernel(const Function
&F
) const;
142 bool isAmdHsaOrMesa(const Function
&F
) const {
143 return isAmdHsaOS() || isMesaKernel(F
);
147 return TargetTriple
.getArch() == Triple::amdgcn
;
150 bool isGCN3Encoding() const {
154 bool has16BitInsts() const {
155 return Has16BitInsts
;
158 /// Return true if the subtarget supports True16 instructions.
159 bool hasTrue16BitInsts() const { return HasTrue16BitInsts
; }
161 /// Return true if real (non-fake) variants of True16 instructions using
162 /// 16-bit registers should be code-generated. Fake True16 instructions are
163 /// identical to non-fake ones except that they take 32-bit registers as
164 /// operands and always use their low halves.
165 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
166 // supported and the support for fake True16 instructions is removed.
167 bool useRealTrue16Insts() const;
169 bool hasMadMixInsts() const {
170 return HasMadMixInsts
;
173 bool hasMadMacF32Insts() const {
174 return HasMadMacF32Insts
|| !isGCN();
177 bool hasDsSrc2Insts() const {
178 return HasDsSrc2Insts
;
181 bool hasSDWA() const {
185 bool hasVOP3PInsts() const {
186 return HasVOP3PInsts
;
189 bool hasMulI24() const {
193 bool hasMulU24() const {
197 bool hasSMulHi() const {
201 bool hasInv2PiInlineImm() const {
202 return HasInv2PiInlineImm
;
205 bool hasFminFmaxLegacy() const {
206 return HasFminFmaxLegacy
;
209 bool hasTrigReducedRange() const {
210 return HasTrigReducedRange
;
213 bool hasFastFMAF32() const {
217 bool isPromoteAllocaEnabled() const {
218 return EnablePromoteAlloca
;
221 unsigned getWavefrontSize() const {
222 return 1 << WavefrontSizeLog2
;
225 unsigned getWavefrontSizeLog2() const {
226 return WavefrontSizeLog2
;
229 unsigned getLocalMemorySize() const {
230 return LocalMemorySize
;
233 unsigned getAddressableLocalMemorySize() const {
234 return AddressableLocalMemorySize
;
237 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
238 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
239 /// CU mode into account.
240 unsigned getEUsPerCU() const { return EUsPerCU
; }
242 Align
getAlignmentForImplicitArgPtr() const {
243 return isAmdHsaOS() ? Align(8) : Align(4);
246 /// Returns the offset in bytes from the start of the input buffer
247 /// of the first explicit kernel argument.
248 unsigned getExplicitKernelArgOffset() const {
249 switch (TargetTriple
.getOS()) {
254 case Triple::UnknownOS
:
256 // For legacy reasons unknown/other is treated as a different version of
261 llvm_unreachable("invalid triple OS");
264 /// \returns Maximum number of work groups per compute unit supported by the
265 /// subtarget and limited by given \p FlatWorkGroupSize.
266 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize
) const = 0;
268 /// \returns Minimum flat work group size supported by the subtarget.
269 virtual unsigned getMinFlatWorkGroupSize() const = 0;
271 /// \returns Maximum flat work group size supported by the subtarget.
272 virtual unsigned getMaxFlatWorkGroupSize() const = 0;
274 /// \returns Number of waves per execution unit required to support the given
275 /// \p FlatWorkGroupSize.
277 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize
) const = 0;
279 /// \returns Minimum number of waves per execution unit supported by the
280 /// subtarget.
281 virtual unsigned getMinWavesPerEU() const = 0;
283 /// \returns Maximum number of waves per execution unit supported by the
284 /// subtarget without any kind of limitation.
285 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU
; }
287 /// Return the maximum workitem ID value in the function, for the given (0, 1,
288 /// 2) dimension.
289 unsigned getMaxWorkitemID(const Function
&Kernel
, unsigned Dimension
) const;
291 /// Return the maximum number of work groups for the function.
292 SmallVector
<unsigned> getMaxNumWorkGroups(const Function
&F
) const;
294 /// Return true if only a single workitem can be active in a wave.
295 bool isSingleLaneExecution(const Function
&Kernel
) const;
297 /// Creates value range metadata on a workitemid.* intrinsic call or load.
298 bool makeLIDRangeMetadata(Instruction
*I
) const;
300 /// \returns Number of bytes of arguments that are passed to a shader or
301 /// kernel in addition to the explicit ones declared for the function.
302 unsigned getImplicitArgNumBytes(const Function
&F
) const;
303 uint64_t getExplicitKernArgSize(const Function
&F
, Align
&MaxAlign
) const;
304 unsigned getKernArgSegmentSize(const Function
&F
, Align
&MaxAlign
) const;
306 /// \returns Corresponding DWARF register number mapping flavour for the
307 /// \p WavefrontSize.
308 AMDGPUDwarfFlavour
getAMDGPUDwarfFlavour() const;
310 virtual ~AMDGPUSubtarget() = default;
313 } // end namespace llvm
315 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H