//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(200), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(true), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// case.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));

static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop *SubLoop) {
            return SubLoop->contains(PHI);
          }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth + 1))
      return true;
  }

  return false;
}

AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold =
      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();

  // A conditional branch in a loop back edge needs 3 additional exec
  // manipulations on average.
  UP.BEInsns += 3;

  // We want to run unroll even for the loops which have been vectorized.
  UP.UnrollVectorizedLoop = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;

  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold.
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        UP.PartialThreshold = UP.Threshold;
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
      }
    }
  }

  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop *SubLoop) {
          return SubLoop->contains(BB);
        }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate the
      // if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each of such "if" statements.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable, most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop *SubLoop) {
              return SubLoop->contains(Inst);
            }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from an inner loop then increase the max
    // trip count to analyze for a better cost estimation in unroll.
    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}

void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}

const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be ecc is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};

GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
  SIModeRegisterDefaults Mode(F, *ST);
  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
  HasFP64FP16Denormals =
      Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
}
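
// Divergence analysis is only meaningful when the function can execute with
// more than one active lane; for known single-lane functions every branch is
// uniform.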
bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
  return !F || !ST->isSingleLaneExecution(*F);
}

unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
  // registers. See getRegisterClassForType for the implementation.
  // In this case vector registers are not vector in terms of
  // VGPRs, but those which can hold multiple values.

  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return 4;
}

TypeSize
GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }
  llvm_unreachable("Unsupported register kind");
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}
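
// Cap the vectorization factor: memory operations are limited to what fits in
// a single 128-bit load/store, and ALU operations only benefit from pairs when
// packed 16-bit or packed FP32 instructions are available.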
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
    return 32 * 4 / ElemWidth;

  return (ElemWidth == 16 && ST->has16BitInsts())      ? 2
         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
                                                       : 1;
}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}
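
// Widest total vector width, in bits, that the load/store vectorizer should
// form for a given address space.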
unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
      AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
      AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  // Common to flat, global, local and region. Assume for unknown addrspace.
  return 128;
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            Align Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}

// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
// FIXME: Should we use narrower types for local/region, or account for when
// unaligned access is legal?
//
// FIXME: This could use fine tuning and microbenchmarks.
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
    unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
    std::optional<uint32_t> AtomicElementSize) const {

  if (AtomicElementSize)
    return Type::getIntNTy(Context, *AtomicElementSize * 8);

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
  // hardware into byte accesses. If you assume all alignments are equally
  // probable, it's more efficient on average to use short accesses for this
  // case.
  if (MinAlign == 2)
    return Type::getInt16Ty(Context);

  // Not all subtargets have 128-bit DS instructions, and we currently don't
  // form them by default.
  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
  }

  // Global memory works best with 16-byte accesses. Private memory will also
  // hit this, although they'll be decomposed.
  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
}
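
// Pick the operand types used to copy the residual (sub-16-byte) tail of a
// lowered memcpy: 8- and 4-byte chunks first, then 2- and 1-byte pieces.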
void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
    unsigned SrcAlign, unsigned DestAlign,
    std::optional<uint32_t> AtomicCpySize) const {
  assert(RemainingBytes < 16);

  if (AtomicCpySize)
    BaseT::getMemcpyLoopResidualLoweringType(
        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
        DestAlign, AtomicCpySize);

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  if (MinAlign != 2) {
    Type *I64Ty = Type::getInt64Ty(Context);
    while (RemainingBytes >= 8) {
      OpsOut.push_back(I64Ty);
      RemainingBytes -= 8;
    }

    Type *I32Ty = Type::getInt32Ty(Context);
    while (RemainingBytes >= 4) {
      OpsOut.push_back(I32Ty);
      RemainingBytes -= 4;
    }
  }

  Type *I16Ty = Type::getInt16Ty(Context);
  while (RemainingBytes >= 2) {
    OpsOut.push_back(I16Ty);
    RemainingBytes -= 2;
  }

  Type *I8Ty = Type::getInt8Ty(Context);
  while (RemainingBytes) {
    OpsOut.push_back(I8Ty);
    --RemainingBytes;
  }
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF.isScalar())
    return 1;

  return 8;
}
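
// Describe ds_ordered_add/swap to the optimizer as target memory intrinsics so
// that their pointer operand, ordering and volatility are visible to analyses.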
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isZero();
    return true;
  }
  default:
    return false;
  }
}

InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Estimate all types may be fused with contract/unsafe flags
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              Options.UnsafeFPMath ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      int Cost =
          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
                            TLI->getTargetMachine().Options.UnsafeFPMath)) {
      // Fast unsafe fdiv lowering:
      int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getQuarterRateInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free each element will cost
    // one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}

// Return true if there's a potential benefit from using v2f16/v2i16
// instructions for an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::fma: // TODO: fmuladd
  // There's a small benefit to using vector ops in the legalized code.
  case Intrinsic::round:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
    return true;
  default:
    return false;
  }
}

InstructionCost
GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  if (ICA.getID() == Intrinsic::fabs)
    return 0;

  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);

  Type *RetTy = ICA.getReturnType();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if (SLT == MVT::f64)
    return LT.first * NElts * get64BitInstrCost(CostKind);

  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
    NElts = (NElts + 1) / 2;

  // TODO: Get more refined intrinsic costs?
  unsigned InstRate = getQuarterRateInstrCost(CostKind);

  switch (ICA.getID()) {
  case Intrinsic::fma:
    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                   : getQuarterRateInstrCost(CostKind);
    break;
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      NElts = 1;
    break;
  }

  return LT.first * NElts * InstRate;
}

InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  assert((I == nullptr || I->getOpcode() == Opcode) &&
         "Opcode should reflect passed instruction.");
  const bool SCost =
      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
  const int CBrCost = SCost ? 5 : 7;
  switch (Opcode) {
  case Instruction::Br: {
    // Branch instruction takes about 4 slots on gfx900.
    auto BI = dyn_cast_or_null<BranchInst>(I);
    if (BI && BI->isUnconditional())
      return SCost ? 1 : 4;
    // Suppose a conditional branch takes 3 additional exec manipulation
    // instructions on average.
    return CBrCost;
  }
  case Instruction::Switch: {
    auto SI = dyn_cast_or_null<SwitchInst>(I);
    // Each case (including default) takes 1 cmp + 1 cbr instructions on
    // average.
    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
  }
  case Instruction::Ret:
    return SCost ? 1 : 10;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

InstructionCost
GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                       std::optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getFullRateInstrCost();
}

InstructionCost
GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                   FastMathFlags FMF,
                                   TTI::TargetCostKind CostKind) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getHalfRateInstrCost(CostKind);
}

InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize =
        DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
                                       Op1);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
  }
}

/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
/// registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
    const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);

  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    TLI->ComputeConstraintToUse(TC, SDValue());

    const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
        TRI, TC.ConstraintCode, TC.ConstraintVT).second;

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  return false;
}

bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
    const IntrinsicInst *ReadReg) const {
  Metadata *MD =
      cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
  StringRef RegName =
      cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();

  // Special case registers that look like VCC.
  MVT VT = MVT::getVT(ReadReg->getType());
  if (VT == MVT::i1)
    return true;

  // Special case scalar registers that start with 'v'.
  if (RegName.starts_with("vcc") || RegName.empty())
    return false;

  // VGPR or AGPR is divergent. There aren't any specially named vector
  // registers.
  return RegName[0] == 'v' || RegName[0] == 'a';
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !AMDGPU::isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // original.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
      return isReadRegisterSourceOfDivergence(Intrinsic);

    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
  }

  // Assume all function calls are a source of divergence.
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return isInlineAsmSourceOfDivergence(CI);
    return true;
  }

  // Assume all function calls are a source of divergence.
  if (isa<InvokeInst>(V))
    return true;

  return false;
}
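
/// \returns true if \p V is known to be uniform across all lanes of a wave,
/// overriding the generic divergence analysis for the patterns matched below.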
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  // In most cases TID / wavefrontsize is uniform.
  //
  // However, if a kernel has uneven dimensions we can have a value of
  // workitem-id-x divided by the wavefrontsize non-uniform. For example
  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
  // packed into the same wave, which gives 1 and 0 after the division by 64
  // respectively.
  //
  // FIXME: limit it to 1D kernels only, although it shall be possible to
  // perform this optimization if the size of the X dimension is a power of 2;
  // we just do not currently have the infrastructure to query it.
  using namespace llvm::PatternMatch;
  uint64_t C;
  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C))) ||
      match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C)))) {
    const Function *F = cast<Instruction>(V)->getFunction();
    return C >= ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
  }

  Value *Mask;
  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_Value(Mask)))) {
    const Function *F = cast<Instruction>(V)->getFunction();
    const DataLayout &DL = F->getDataLayout();
    return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
               ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
  }

  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());

  return false;
}
<int> &OpIndexes
,
1017 Intrinsic::ID IID
) const {
1019 case Intrinsic::amdgcn_is_shared
:
1020 case Intrinsic::amdgcn_is_private
:
1021 case Intrinsic::amdgcn_flat_atomic_fadd
:
1022 case Intrinsic::amdgcn_flat_atomic_fmax
:
1023 case Intrinsic::amdgcn_flat_atomic_fmin
:
1024 case Intrinsic::amdgcn_flat_atomic_fmax_num
:
1025 case Intrinsic::amdgcn_flat_atomic_fmin_num
:
1026 OpIndexes
.push_back(0);
1033 Value
*GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst
*II
,
1035 Value
*NewV
) const {
1036 auto IntrID
= II
->getIntrinsicID();
1038 case Intrinsic::amdgcn_is_shared
:
1039 case Intrinsic::amdgcn_is_private
: {
1040 unsigned TrueAS
= IntrID
== Intrinsic::amdgcn_is_shared
?
1041 AMDGPUAS::LOCAL_ADDRESS
: AMDGPUAS::PRIVATE_ADDRESS
;
1042 unsigned NewAS
= NewV
->getType()->getPointerAddressSpace();
1043 LLVMContext
&Ctx
= NewV
->getType()->getContext();
1044 ConstantInt
*NewVal
= (TrueAS
== NewAS
) ?
1045 ConstantInt::getTrue(Ctx
) : ConstantInt::getFalse(Ctx
);
1048 case Intrinsic::ptrmask
: {
1049 unsigned OldAS
= OldV
->getType()->getPointerAddressSpace();
1050 unsigned NewAS
= NewV
->getType()->getPointerAddressSpace();
1051 Value
*MaskOp
= II
->getArgOperand(1);
1052 Type
*MaskTy
= MaskOp
->getType();
1054 bool DoTruncate
= false;
1056 const GCNTargetMachine
&TM
=
1057 static_cast<const GCNTargetMachine
&>(getTLI()->getTargetMachine());
1058 if (!TM
.isNoopAddrSpaceCast(OldAS
, NewAS
)) {
1059 // All valid 64-bit to 32-bit casts work by chopping off the high
1060 // bits. Any masking only clearing the low bits will also apply in the new
1062 if (DL
.getPointerSizeInBits(OldAS
) != 64 ||
1063 DL
.getPointerSizeInBits(NewAS
) != 32)
1066 // TODO: Do we need to thread more context in here?
1067 KnownBits Known
= computeKnownBits(MaskOp
, DL
, 0, nullptr, II
);
1068 if (Known
.countMinLeadingOnes() < 32)
1076 MaskTy
= B
.getInt32Ty();
1077 MaskOp
= B
.CreateTrunc(MaskOp
, MaskTy
);
1080 return B
.CreateIntrinsic(Intrinsic::ptrmask
, {NewV
->getType(), MaskTy
},
1083 case Intrinsic::amdgcn_flat_atomic_fadd
:
1084 case Intrinsic::amdgcn_flat_atomic_fmax
:
1085 case Intrinsic::amdgcn_flat_atomic_fmin
:
1086 case Intrinsic::amdgcn_flat_atomic_fmax_num
:
1087 case Intrinsic::amdgcn_flat_atomic_fmin_num
: {
1088 Type
*DestTy
= II
->getType();
1089 Type
*SrcTy
= NewV
->getType();
1090 unsigned NewAS
= SrcTy
->getPointerAddressSpace();
1091 if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS
))
1093 Module
*M
= II
->getModule();
1094 Function
*NewDecl
= Intrinsic::getDeclaration(M
, II
->getIntrinsicID(),
1095 {DestTy
, SrcTy
, DestTy
});
1096 II
->setArgOperand(0, NewV
);
1097 II
->setCalledFunction(NewDecl
);
1105 InstructionCost
GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind
,
1106 VectorType
*VT
, ArrayRef
<int> Mask
,
1107 TTI::TargetCostKind CostKind
,
1108 int Index
, VectorType
*SubTp
,
1109 ArrayRef
<const Value
*> Args
,
1110 const Instruction
*CxtI
) {
1111 if (!isa
<FixedVectorType
>(VT
))
1112 return BaseT::getShuffleCost(Kind
, VT
, Mask
, CostKind
, Index
, SubTp
);
1114 Kind
= improveShuffleKindFromMask(Kind
, Mask
, VT
, Index
, SubTp
);
1116 // Larger vector widths may require additional instructions, but are
1117 // typically cheaper than scalarized versions.
1118 unsigned NumVectorElts
= cast
<FixedVectorType
>(VT
)->getNumElements();
1119 if (ST
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
&&
1120 DL
.getTypeSizeInBits(VT
->getElementType()) == 16) {
1121 bool HasVOP3P
= ST
->hasVOP3PInsts();
1122 unsigned RequestedElts
=
1123 count_if(Mask
, [](int MaskElt
) { return MaskElt
!= -1; });
1124 if (RequestedElts
== 0)
1127 case TTI::SK_Broadcast
:
1128 case TTI::SK_Reverse
:
1129 case TTI::SK_PermuteSingleSrc
: {
1130 // With op_sel VOP3P instructions freely can access the low half or high
1131 // half of a register, so any swizzle of two elements is free.
1132 if (HasVOP3P
&& NumVectorElts
== 2)
1134 unsigned NumPerms
= alignTo(RequestedElts
, 2) / 2;
1135 // SK_Broadcast just reuses the same mask
1136 unsigned NumPermMasks
= Kind
== TTI::SK_Broadcast
? 1 : NumPerms
;
1137 return NumPerms
+ NumPermMasks
;
1139 case TTI::SK_ExtractSubvector
:
1140 case TTI::SK_InsertSubvector
: {
1141 // Even aligned accesses are free
1144 // Insert/extract subvectors only require shifts / extract code to get the
1146 return alignTo(RequestedElts
, 2) / 2;
1148 case TTI::SK_PermuteTwoSrc
:
1149 case TTI::SK_Splice
:
1150 case TTI::SK_Select
: {
1151 unsigned NumPerms
= alignTo(RequestedElts
, 2) / 2;
1152 // SK_Select just reuses the same mask
1153 unsigned NumPermMasks
= Kind
== TTI::SK_Select
? 1 : NumPerms
;
1154 return NumPerms
+ NumPermMasks
;
1162 return BaseT::getShuffleCost(Kind
, VT
, Mask
, CostKind
, Index
, SubTp
);
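
// Inlining across subtargets is only allowed when the callee's feature set
// (ignoring InlineFeatureIgnoreList) is a subset of the caller's, the FP mode
// register settings are compatible, and the compile-time BB limit is honored.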
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
  SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
  if (!CallerMode.isInlineCompatible(CalleeMode))
    return false;

  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
      Callee->hasFnAttribute(Attribute::InlineHint))
    return true;

  // Hack to make compile times reasonable.
  if (InlineMaxBB) {
    // Single BB does not increase total BB amount.
    if (Callee->size() == 1)
      return true;
    size_t BBSize = Caller->size() + Callee->size() - 1;
    return BBSize <= InlineMaxBB;
  }

  return true;
}
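
// Raise the inlining threshold for calls whose arguments would not fit in the
// registers the calling convention provides: spilling them to the stack in the
// caller and reloading them in the callee is costed per spilled register.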
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
                                                   const SITargetLowering *TLI,
                                                   const GCNTTIImpl *TTIImpl) {
  const int NrOfSGPRUntilSpill = 26;
  const int NrOfVGPRUntilSpill = 32;

  const DataLayout &DL = TTIImpl->getDataLayout();

  unsigned adjustThreshold = 0;
  int SGPRsInUse = 0;
  int VGPRsInUse = 0;
  for (const Use &A : CB->args()) {
    SmallVector<EVT, 4> ValueVTs;
    ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
    for (auto ArgVT : ValueVTs) {
      unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
          CB->getContext(), CB->getCallingConv(), ArgVT);
      if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
        SGPRsInUse += CCRegNum;
      else
        VGPRsInUse += CCRegNum;
    }
  }

  // The cost of passing function arguments through the stack:
  //  1 instruction to put a function argument on the stack in the caller.
  //  1 instruction to take a function argument from the stack in the callee.
  //  1 instruction to explicitly take care of data dependencies in the callee.
  InstructionCost ArgStackCost(1);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);

  // The penalty cost is computed relative to the cost of instructions and does
  // not model any storage costs.
  adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
                     *ArgStackCost.getValue() * InlineConstants::getInstrCost();
  adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
                     *ArgStackCost.getValue() * InlineConstants::getInstrCost();
  return adjustThreshold;
}

static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
                                           const DataLayout &DL) {
  // If we have a pointer to a private array passed into a function
  // it will not be optimized out, leaving scratch usage.
  // This function calculates the total size in bytes of the memory that would
  // end in scratch if the call was not inlined.
  unsigned AllocaSize = 0;
  SmallPtrSet<const AllocaInst *, 8> AIVisited;
  for (Value *PtrArg : CB->args()) {
    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
    if (!Ty)
      continue;

    unsigned AddrSpace = Ty->getAddressSpace();
    if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
        AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
      continue;

    const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
    if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
      continue;

    AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
  }
  return AllocaSize;
}

unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);

  // Private objects passed as arguments may end up in scratch usage if the
  // call is not inlined. Increase the inline threshold to promote inlining.
  unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
  if (AllocaSize > 0)
    Threshold += ArgAllocaCost;
  return Threshold;
}

unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
                                         const AllocaInst *AI) const {
  // Below the cutoff, assume that the private memory objects would be
  // optimized.
  auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
  if (AllocaSize <= ArgAllocaCutoff)
    return 0;

  // Above the cutoff, we give a cost to each private memory object
  // depending on its size. If the array can be optimized by SROA this cost is
  // not added to the total-cost in the inliner cost analysis.
  //
  // We choose the total cost of the alloca such that their sum cancels the
  // bonus given in the threshold (ArgAllocaCost).
  //
  //   Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
  //
  // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
  // the single-bb bonus and the vector-bonus.
  //
  // We compensate the first two multipliers, by repeating logic from the
  // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
  static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
  unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();

  bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
    return BB.getTerminator()->getNumSuccessors() > 1;
  });
  if (SingleBB)
    Threshold += Threshold / 2;

  auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());

  // Attribute the bonus proportionally to the alloca size
  unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;

  return AllocaThresholdBonus;
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}

void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}
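
// Cost of a 64-bit operation, depending on whether the subtarget executes
// 64-bit ops at full, half or quarter rate.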
int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
  return ST->hasFullRate64Ops()
             ? getFullRateInstrCost()
             : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
                                      : getQuarterRateInstrCost(CostKind);
}
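
// Wrap the generic legalization cost and add a penalty for types wider than
// 8 dwords (256 bits), which exceed what a single load/store or VALU op can
// handle.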
std::pair<InstructionCost, MVT>
GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
  std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
  auto Size = DL.getTypeSizeInBits(Ty);
  // Maximum load or store can handle 8 dwords for scalar and 4 for
  // vector ALU. Let's assume anything above 8 dwords is expensive
  // even if legal.
  if (Size <= 256)
    return Cost;

  Cost.first += (Size + 255) / 256;
  return Cost;
}

unsigned GCNTTIImpl::getPrefetchDistance() const {
  return ST->hasPrefetch() ? 128 : 0;
}

bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
  return AMDGPU::isFlatGlobalAddrSpace(AS);
}