//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2500), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
  "amdgpu-unroll-threshold-local",
  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
  cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
  "amdgpu-unroll-threshold-if",
  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
  cl::init(150), cl::Hidden);
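
// These thresholds are ordinary cl::opt flags, so they can be overridden on
// the LLVM tool command line, e.g. "opt -amdgpu-unroll-threshold-private=4000",
// or from a clang invocation via "-mllvm -amdgpu-unroll-threshold-private=4000".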

static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
               return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth + 1))
      return true;
  }
  return false;
}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  UP.Threshold = 300; // Twice the default.
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
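  // That is (256 - 16) 32-bit registers at 4 bytes each, i.e. up to 960 bytes
  // of private (scratch) data that could still be promoted into registers.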
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  const AMDGPUAS &ASST = AMDGPU::getAMDGPUAS(TargetTriple);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
               return SubLoop->contains(BB); }))
        continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate
      // the if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each of such "if" statements.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          if (L->isLoopExiting(Br->getSuccessor(0)) ||
              L->isLoopExiting(Br->getSuccessor(1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == ASST.PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == ASST.LOCAL_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == ASST.PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == ASST.LOCAL_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable, most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
             return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }
  }
}

unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
  return 256;
}

unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
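  // With the 256 hardware VGPRs reported above, this yields 256 >> 3 == 32,
  // keeping the vectorizer's register-pressure estimate conservative.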
  return getHardwareNumberOfRegisters(Vec) >> 3;
}

unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
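  // For example, VF = 4 with 64-bit loads gives VecRegBitWidth = 256; if the
  // elements are narrower than 32 bits, the factor below is clamped to
  // 128 / 64 = 2.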
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  AMDGPUAS AS = ST->getAMDGPUAS();
  if (AddrSpace == AS.GLOBAL_ADDRESS ||
      AddrSpace == AS.CONSTANT_ADDRESS ||
      AddrSpace == AS.CONSTANT_ADDRESS_32BIT) {
    return 512;
  }

  if (AddrSpace == AS.FLAT_ADDRESS ||
      AddrSpace == AS.LOCAL_ADDRESS ||
      AddrSpace == AS.REGION_ADDRESS)
    return 128;

  if (AddrSpace == AS.PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  llvm_unreachable("unhandled address space");
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            unsigned Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}
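
// Note: reporting the DS atomic intrinsics below as target memory intrinsics
// lets generic passes such as EarlyCSE treat them as memory accesses with the
// decoded ordering and volatility.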

bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isNullValue();
    return true;
  }
  default:
    return false;
  }
}

int GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
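      // A rough reading of the cost below: about four quarter-rate 32-bit
      // multiply pieces plus 2 * 2 full-rate add/add-with-carry operations
      // per legalized 64-bit element.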
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround.
      if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!ST->hasFP32Denormals()) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                           bool IsPairwise) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getFullRateInstrCost();
}

int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
                                       bool IsPairwise,
                                       bool IsUnsigned) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getHalfRateInstrCost();
}

int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  CallingConv::ID CC = F->getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return true;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
    // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
    // Everything else is in VGPRs.
    return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
           F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
  default:
    // TODO: Should calls support inreg for SGPR inputs?
    return false;
  }
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
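/// For example, a workitem id intrinsic is divergent, while a kernel argument
/// (passed in SGPRs) is uniform.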
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private address space are divergent, because threads
  // can execute the load instruction with the same inputs and get different
  // results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // a result.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}

bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
      return true;
    }
  }
  return false;
}

unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                    Type *SubTp) {
  if (ST->hasVOP3PInsts()) {
    VectorType *VT = cast<VectorType>(Tp);
    if (VT->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low half or high
      // half of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
    TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
    TM.getSubtargetImpl(*Callee)->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
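  // That is, ignoring the features on the ignore list, the callee's subtarget
  // features must be a subset of the caller's for inlining to be allowed.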
  return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
  return getHardwareNumberOfRegisters(Vec);
}

unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  AMDGPUAS AS = ST->getAMDGPUAS();
  if (AddrSpace == AS.GLOBAL_ADDRESS ||
      AddrSpace == AS.CONSTANT_ADDRESS)
    return 128;
  if (AddrSpace == AS.LOCAL_ADDRESS ||
      AddrSpace == AS.REGION_ADDRESS)
    return 64;
  if (AddrSpace == AS.PRIVATE_ADDRESS)
    return 32;

  if ((AddrSpace == AS.PARAM_D_ADDRESS ||
      AddrSpace == AS.PARAM_I_ADDRESS ||
      (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
      AddrSpace <= AS.CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS)
    return false;

  return true;
}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               unsigned Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                    unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}