//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to local address space.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {
static cl::opt<bool> DisablePromoteAllocaToVector(
    "disable-promote-alloca-to-vector",
    cl::desc("Disable promote alloca to vector"),
    cl::init(false));

static cl::opt<bool> DisablePromoteAllocaToLDS(
    "disable-promote-alloca-to-lds",
    cl::desc("Disable promote alloca to LDS"),
    cl::init(false));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(0));
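
// The three options above are ordinary cl::opt flags, so when debugging they
// can be toggled on the tool command line (for example, passing
// -disable-promote-alloca-to-lds to llc or opt). This note is illustrative and
// not required by the pass itself.
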
// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAlloca() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }

  bool handleAlloca(AllocaInst &I, bool SufficientLDS);

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    FunctionPass::getAnalysisUsage(AU);
  }
};
class AMDGPUPromoteAllocaImpl {
private:
  const TargetMachine &TM;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;
  uint32_t CurrentLocalMemUsage = 0;
  unsigned MaxVGPRs;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  /// BaseAlloca is the alloca root the search started from.
  /// Val may be that alloca or a recursive user of it.
  bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
                               std::vector<Value *> &WorkList) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val should
  /// be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst,
                                       int OpIdx0, int OpIdx1) const;

  /// Check whether we have enough local memory for promotion.
  bool hasSufficientLocalMem(const Function &F);

  bool handleAlloca(AllocaInst &I, bool SufficientLDS);

public:
  AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {}
  bool run(Function &F);
};
class AMDGPUPromoteAllocaToVector : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "AMDGPU Promote Alloca to vector";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    FunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace
char AMDGPUPromoteAlloca::ID = 0;
char AMDGPUPromoteAllocaToVector::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)

INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                "AMDGPU promote alloca to vector", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
    return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()).run(F);
  }
  return false;
}
PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}
bool AMDGPUPromoteAllocaImpl::run(Function &F) {
  Mod = F.getParent();
  DL = &Mod->getDataLayout();

  const Triple &TT = TM.getTargetTriple();
  IsAMDGCN = TT.getArch() == Triple::amdgcn;
  IsAMDHSA = TT.getOS() == Triple::AMDHSA;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
  if (!ST.isPromoteAllocaEnabled())
    return false;

  if (IsAMDGCN) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
  } else {
    MaxVGPRs = 128;
  }

  bool SufficientLDS = hasSufficientLocalMem(F);
  bool Changed = false;
  BasicBlock &EntryBB = *F.begin();

  SmallVector<AllocaInst *, 16> Allocas;
  for (Instruction &I : EntryBB) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
      Allocas.push_back(AI);
  }

  for (AllocaInst *AI : Allocas) {
    if (handleAlloca(*AI, SufficientLDS))
      Changed = true;
  }

  return Changed;
}
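
// Return the workgroup size in the Y and Z dimensions as IR values at the
// builder's insertion point. They feed the linearized workitem index used to
// address a promoted alloca's per-workgroup LDS buffer.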
std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
  const Function &F = *Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  if (!IsAMDHSA) {
    Function *LocalSizeYFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
    Function *LocalSizeZFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);

    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});

    ST.makeLIDRangeMetadata(LocalSizeY);
    ST.makeLIDRangeMetadata(LocalSizeZ);

    return std::make_pair(LocalSizeY, LocalSizeZ);
  }
  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  //   typedef struct hsa_kernel_dispatch_packet_s {
  //     uint16_t header;
  //     uint16_t setup;
  //     uint16_t workgroup_size_x;
  //     uint16_t workgroup_size_y;
  //     uint16_t workgroup_size_z;
  //     uint16_t reserved0;
  //     uint32_t grid_size_x;
  //     uint32_t grid_size_y;
  //     uint32_t grid_size_z;
  //
  //     uint32_t private_segment_size;
  //     uint32_t group_segment_size;
  //     uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //     void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //     void *kernarg_address;
  //     uint32_t reserved1;
  // #else
  //     uint32_t reserved1;
  //     void *kernarg_address;
  // #endif
  //     uint64_t reserved2;
  //     hsa_signal_t completion_signal; // uint64_t wrapper
  //   } hsa_kernel_dispatch_packet_t
  //
  Function *DispatchPtrFn
    = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);

  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
  DispatchPtr->addRetAttr(Attribute::NoAlias);
  DispatchPtr->addRetAttr(Attribute::NonNull);

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableRetAttr(64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
  Value *CastDispatchPtr = Builder.CreateBitCast(
    DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));

  MDNode *MD = MDNode::get(Mod->getContext(), None);
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  ST.makeLIDRangeMetadata(LoadZU);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::make_pair(Y, LoadZU);
}
Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
                                              unsigned N) {
  const AMDGPUSubtarget &ST =
      AMDGPUSubtarget::get(TM, *Builder.GetInsertBlock()->getParent());
  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;

  switch (N) {
  case 0:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
    break;
  case 1:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
    break;
  case 2:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
    break;
  default:
    llvm_unreachable("invalid dimension");
  }

  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
  ST.makeLIDRangeMetadata(CI);

  return CI;
}
static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
  return FixedVectorType::get(ArrayTy->getElementType(),
                              ArrayTy->getNumElements());
}
static Value *stripBitcasts(Value *V) {
  while (Instruction *I = dyn_cast<Instruction>(V)) {
    if (I->getOpcode() != Instruction::BitCast)
      break;
    V = I->getOperand(0);
  }

  return V;
}
static Value *
calculateVectorIndex(Value *Ptr,
                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
  if (!GEP)
    return nullptr;

  auto I = GEPIdx.find(GEP);
  return I == GEPIdx.end() ? nullptr : I->second;
}
static Value *GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME: we only support simple cases
  if (GEP->getNumOperands() != 3)
    return nullptr;

  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
  if (!I0 || !I0->isZero())
    return nullptr;

  return GEP->getOperand(2);
}
// Not an instruction handled below to turn into a vector.
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
static bool canVectorizeInst(Instruction *Inst, User *User,
                             const DataLayout &DL) {
  switch (Inst->getOpcode()) {
  case Instruction::Load: {
    // Currently only handle the case where the Pointer Operand is a GEP.
    // Also we could not vectorize volatile or atomic loads.
    LoadInst *LI = cast<LoadInst>(Inst);
    if (isa<AllocaInst>(User) &&
        LI->getPointerOperandType() == User->getType() &&
        isa<VectorType>(LI->getType()))
      return true;

    Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
    if (!PtrInst)
      return false;

    return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
            PtrInst->getOpcode() == Instruction::BitCast) &&
           LI->isSimple();
  }
  case Instruction::BitCast:
    return true;
  case Instruction::Store: {
    // Must be the stored pointer operand, not a stored value, plus
    // since it should be canonical form, the User should be a GEP.
    // Also we could not vectorize volatile or atomic stores.
    StoreInst *SI = cast<StoreInst>(Inst);
    if (isa<AllocaInst>(User) &&
        SI->getPointerOperandType() == User->getType() &&
        isa<VectorType>(SI->getValueOperand()->getType()))
      return true;

    Instruction *UserInst = dyn_cast<Instruction>(User);
    if (!UserInst)
      return false;

    return (SI->getPointerOperand() == User) &&
           (UserInst->getOpcode() == Instruction::GetElementPtr ||
            UserInst->getOpcode() == Instruction::BitCast) &&
           SI->isSimple();
  }
  default:
    return false;
  }
}
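
// Illustrative only (hand-written IR, assuming the amdgcn private address
// space 5): a GEP-indexed store into a small array alloca such as
//
//   %av = alloca [4 x i32], align 4, addrspace(5)
//   %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %av,
//          i32 0, i32 %idx
//   store i32 %val, i32 addrspace(5)* %gep
//
// is rewritten by tryPromoteAllocaToVector below into a whole-vector
// load/insert/store sequence:
//
//   %cast = bitcast [4 x i32] addrspace(5)* %av to <4 x i32> addrspace(5)*
//   %vec = load <4 x i32>, <4 x i32> addrspace(5)* %cast
//   %new = insertelement <4 x i32> %vec, i32 %val, i32 %idx
//   store <4 x i32> %new, <4 x i32> addrspace(5)* %cast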
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
                                     unsigned MaxVGPRs) {

  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << "  Promotion alloca to vector is disabled\n");
    return false;
  }

  Type *AllocaTy = Alloca->getAllocatedType();
  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
    if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
        ArrayTy->getNumElements() > 0)
      VectorTy = arrayTypeToVecType(ArrayTy);
  }

  // Use up to 1/4 of available register budget for vectorization.
  unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                              : (MaxVGPRs * 32);

  if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
    LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization with "
                      << MaxVGPRs << " registers available\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays, we
  // are just being conservative for now.
  // FIXME: We also reject allocas of the form [ 2 x [ 2 x i32 ]] or
  // equivalent. Potentially these could also be promoted but we don't
  // currently handle this case.
  if (!VectorTy || VectorTy->getNumElements() > 16 ||
      VectorTy->getNumElements() < 2) {
    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }
  std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
  std::vector<Value *> WorkList;
  SmallVector<User *, 8> Users(Alloca->users());
  SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
  Type *VecEltTy = VectorTy->getElementType();
  while (!Users.empty()) {
    User *AllocaUser = Users.pop_back_val();
    User *UseUser = UseUsers.pop_back_val();
    Instruction *Inst = dyn_cast<Instruction>(AllocaUser);

    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
    if (!GEP) {
      if (!canVectorizeInst(Inst, UseUser, DL))
        return false;

      if (Inst->getOpcode() == Instruction::BitCast) {
        Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
        Type *ToTy = Inst->getType()->getPointerElementType();
        if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
            DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
          continue;

        for (User *CastUser : Inst->users()) {
          if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
            continue;
          Users.push_back(CastUser);
          UseUsers.push_back(Inst);
        }

        continue;
      }

      WorkList.push_back(AllocaUser);
      continue;
    }

    Value *Index = GEPToVectorIndex(GEP);

    // If we can't compute a vector index from this GEP, then we can't
    // promote this alloca to vector.
    if (!Index) {
      LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
                        << '\n');
      return false;
    }

    GEPVectorIdx[GEP] = Index;
    Users.append(GEP->user_begin(), GEP->user_end());
    UseUsers.append(GEP->getNumUses(), GEP);
  }

  LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                    << *VectorTy << '\n');
  for (Value *V : WorkList) {
    Instruction *Inst = cast<Instruction>(V);
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
        break;

      Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      if (!Index)
        break;

      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      if (Inst->getType() != VecEltTy)
        ExtractElement =
            Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      StoreInst *SI = cast<StoreInst>(Inst);
      if (SI->getValueOperand()->getType() == AllocaTy ||
          SI->getValueOperand()->getType()->isVectorTy())
        break;

      Value *Ptr = SI->getPointerOperand();
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      if (!Index)
        break;

      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
      Value *Elt = SI->getValueOperand();
      if (Elt->getType() != VecEltTy)
        Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }

    default:
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }

  return true;
}
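
// The LDS promotion path only knows how to rewrite the intrinsic calls listed
// here; any other call on a pointer derived from the alloca blocks promotion.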
static bool isCallPromotable(CallInst *CI) {
  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
  case Intrinsic::objectsize:
    return true;
  default:
    return false;
  }
}
bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
    Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
    int OpIdx1) const {
  // Figure out which operand is the one we might not be promoting.
  Value *OtherOp = Inst->getOperand(OpIdx0);
  if (OtherOp == Val)
    OtherOp = Inst->getOperand(OpIdx1);

  if (isa<ConstantPointerNull>(OtherOp))
    return true;

  Value *OtherObj = getUnderlyingObject(OtherOp);
  if (!isa<AllocaInst>(OtherObj))
    return false;

  // TODO: We should be able to replace undefs with the right pointer type.

  // TODO: If we know the other base object is another promotable
  // alloca, not necessarily this alloca, we can do this. The
  // important part is both must have the same address space at
  // the point of the loads.
  if (OtherObj != BaseAlloca) {
    LLVM_DEBUG(
        dbgs() << "Found a binary instruction with another alloca object\n");
    return false;
  }

  return true;
}
bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
    Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {

  for (User *User : Val->users()) {
    if (is_contained(WorkList, User))
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      if (!isCallPromotable(CI))
        return false;

      WorkList.push_back(User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(User);
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return false;

      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return false;

      // Reject if the stored value is not the pointer operand.
      if (SI->getPointerOperand() != Val)
        return false;
    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return false;
    } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return false;
    }

    // Only promote a select if we know that the other select operand
    // is from another pointer that will also be promoted.
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
        return false;

      // May need to rewrite constant operands.
      WorkList.push_back(ICmp);
    }

    if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
      // Give up if the pointer may be captured.
      if (PointerMayBeCaptured(UseInst, true, true))
        return false;
      // Don't collect the users of this.
      WorkList.push_back(User);
      continue;
    }

    // Do not promote vector/aggregate type instructions. It is hard to track
    // their users.
    if (isa<InsertValueInst>(User) || isa<InsertElementInst>(User))
      return false;

    if (!User->getType()->isPointerTy())
      continue;

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return false;
    }

    // Only promote a select if we know that the other select operand is from
    // another pointer that will also be promoted.
    if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
        return false;
    }

    if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
      // TODO: Handle more complex cases. We should be able to replace loops
      // over arrays.
      switch (Phi->getNumIncomingValues()) {
      case 1:
        break;
      case 2:
        if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
          return false;
        break;
      default:
        return false;
      }
    }

    WorkList.push_back(User);
    if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
      return false;
  }

  return true;
}
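
// Estimate how much LDS is left for promotion in F: start from the
// subtarget's LDS size, subtract what the module's existing LDS globals used
// by F already consume (with a worst-case guess at alignment padding), and
// clamp the limit so promotion does not drag occupancy below the function's
// waves-per-EU hint.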
bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {

  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  SmallVector<const Constant *, 16> Stack;
  SmallPtrSet<const Constant *, 8> VisitedConstants;
  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;

  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
    for (const User *U : Val->users()) {
      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
        if (Use->getParent()->getParent() == &F)
          return true;
      } else {
        const Constant *C = cast<Constant>(U);
        if (VisitedConstants.insert(C).second)
          Stack.push_back(C);
      }
    }

    return false;
  };

  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    if (visitUsers(&GV, &GV)) {
      UsedLDS.insert(&GV);
      Stack.clear();
      continue;
    }

    // For any ConstantExpr uses, we need to recursively search the users until
    // we see a function.
    while (!Stack.empty()) {
      const Constant *C = Stack.pop_back_val();
      if (visitUsers(&GV, C)) {
        UsedLDS.insert(&GV);
        Stack.clear();
        break;
      }
    }
  }

  const DataLayout &DL = Mod->getDataLayout();
  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
  AllocatedSizes.reserve(UsedLDS.size());

  for (const GlobalVariable *GV : UsedLDS) {
    Align Alignment =
        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
    AllocatedSizes.emplace_back(AllocSize, Alignment);
  }

  // Sort to try to estimate the worst case alignment padding
  //
  // FIXME: We should really do something to fix the addresses to a more
  // optimal value instead
  llvm::sort(AllocatedSizes, [](std::pair<uint64_t, Align> LHS,
                                std::pair<uint64_t, Align> RHS) {
    return LHS.second < RHS.second;
  });

  // Check how much local memory is being used by global objects
  CurrentLocalMemUsage = 0;

  // FIXME: Try to account for padding here. The real padding and address is
  // currently determined from the inverse order of uses in the function when
  // legalizing, which could also potentially change. We try to estimate the
  // worst case here, but we probably should fix the addresses earlier.
  for (auto Alloc : AllocatedSizes) {
    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
    CurrentLocalMemUsage += Alloc.first;
  }

  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
                                                          F);

  // Restrict local memory usage so that we don't drastically reduce occupancy,
  // unless it is already significantly reduced.

  // TODO: Have some sort of hint or other heuristics to guess occupancy based
  // on other factors..
  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
  if (OccupancyHint == 0)
    OccupancyHint = 7;

  // Clamp to max value.
  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());

  // Check the hint but ignore it if it's obviously wrong from the existing LDS
  // usage.
  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount
    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);

  // Program is possibly broken by using more local mem than available.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << "  Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}
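
// When an alloca of type T is promoted to LDS it is replaced by a
// [WorkGroupSize x T] array in the local address space; each workitem then
// addresses its own slot through an inbounds GEP indexed by its linearized
// workitem id (computed below from the workgroup size and workitem id
// intrinsics).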
// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
  // Array allocations are probably not worth handling, since an allocation of
  // the array type is the canonical form.
  if (!I.isStaticAlloca() || I.isArrayAllocation())
    return false;

  const DataLayout &DL = Mod->getDataLayout();
  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector
  Type *AllocaTy = I.getAllocatedType();

  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
    return true; // Promoted to vector.

  if (DisablePromoteAllocaToLDS)
    return false;

  const Function &ContainingFunction = *I.getParent()->getParent();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore not all LDS is available for some of the stages.
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return false;
  }

  // Not likely to have sufficient local memory for promotion.
  if (!SufficientLDS)
    return false;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  Align Alignment =
      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // order of uses in the function when legalizing, which could also
  // potentially change.

  // FIXME: It is also possible that if we're allowed to use all of the memory
  // we could end up using more than the maximum due to alignment padding.

  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
  NewSize += AllocSize;

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << "  " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  std::vector<Value *> WorkList;

  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = I.getParent()->getParent();

  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage,
      UndefValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + I.getName(),
      nullptr,
      GlobalVariable::NotThreadLocal,
      AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(MaybeAlign(I.getAlignment()));

  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);
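
  // Linearize the 3D workitem id into a flat index into the per-workgroup
  // array: TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ.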
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  Value *Indices[] = {
    Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
    TID
  };

  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();
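
  // Every use collected above now addresses LDS. Re-type derived pointer
  // instructions to the local address space and rebuild the few intrinsics
  // that encode a pointer type in their signature.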
  SmallVector<IntrinsicInst *> DeferredIntrs;

  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        Value *Src0 = CI->getOperand(0);
        PointerType *NewTy = PointerType::getWithSamePointeeType(
            cast<PointerType>(Src0->getType()), AMDGPUAS::LOCAL_ADDRESS);

        if (isa<ConstantPointerNull>(CI->getOperand(0)))
          CI->setOperand(0, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(CI->getOperand(1)))
          CI->setOperand(1, ConstantPointerNull::get(NewTy));

        continue;
      }

      // The operand's value should be corrected on its own and we don't want
      // to touch the users.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      PointerType *NewTy = PointerType::getWithSamePointeeType(
          cast<PointerType>(V->getType()), AMDGPUAS::LOCAL_ADDRESS);

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);

      // Adjust the types of any constant operands.
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull>(SI->getOperand(1)))
          SI->setOperand(1, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(SI->getOperand(2)))
          SI->setOperand(2, ConstantPointerNull::get(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
        }
      }

      continue;
    }

    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      // These have 2 pointer operands. In case if second pointer also needs
      // to be replaced we defer processing of these intrinsics until all
      // other values are processed.
      DeferredIntrs.push_back(Intr);
      continue;
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(
          MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(),
          MaybeAlign(MemSet->getDestAlignment()), MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group:
      Intr->eraseFromParent();
      // FIXME: I think the invariant marker should still theoretically apply,
      // but the intrinsics need to be changed to accept pointers with any
      // address space.
      continue;
    case Intrinsic::objectsize: {
      Value *Src = Intr->getOperand(0);
      Function *ObjectSize = Intrinsic::getDeclaration(
          Mod, Intrinsic::objectsize,
          {Intr->getType(),
           PointerType::getWithSamePointeeType(
               cast<PointerType>(Src->getType()), AMDGPUAS::LOCAL_ADDRESS)});

      CallInst *NewCall = Builder.CreateCall(
          ObjectSize,
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }

  for (IntrinsicInst *Intr : DeferredIntrs) {
    Builder.SetInsertPoint(Intr);
    Intrinsic::ID ID = Intr->getIntrinsicID();
    assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);

    MemTransferInst *MI = cast<MemTransferInst>(Intr);
    auto *B =
        Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
                                      MI->getRawSource(), MI->getSourceAlign(),
                                      MI->getLength(), MI->isVolatile());

    for (unsigned I = 0; I != 2; ++I) {
      if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
        B->addDereferenceableParamAttr(I, Bytes);
      }
    }

    Intr->eraseFromParent();
  }

  return true;
}
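
// Standalone helpers for the vector-only variant of the pass; they are shared
// by the legacy and new pass manager wrappers below.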
bool handlePromoteAllocaToVector(AllocaInst &I, unsigned MaxVGPRs) {
  // Array allocations are probably not worth handling, since an allocation of
  // the array type is the canonical form.
  if (!I.isStaticAlloca() || I.isArrayAllocation())
    return false;

  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

  Module *Mod = I.getParent()->getParent()->getParent();
  return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
}
bool promoteAllocasToVector(Function &F, TargetMachine &TM) {
  if (DisablePromoteAllocaToVector)
    return false;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
  if (!ST.isPromoteAllocaEnabled())
    return false;

  unsigned MaxVGPRs;
  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
  } else {
    MaxVGPRs = 128;
  }

  bool Changed = false;
  BasicBlock &EntryBB = *F.begin();

  SmallVector<AllocaInst *, 16> Allocas;
  for (Instruction &I : EntryBB) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
      Allocas.push_back(AI);
  }

  for (AllocaInst *AI : Allocas) {
    if (handlePromoteAllocaToVector(*AI, MaxVGPRs))
      Changed = true;
  }

  return Changed;
}
bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
    return promoteAllocasToVector(F, TPC->getTM<TargetMachine>());
  }
  return false;
}
PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool Changed = promoteAllocasToVector(F, TM);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}
FunctionPass *llvm::createAMDGPUPromoteAlloca() {
  return new AMDGPUPromoteAlloca();
}

FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
  return new AMDGPUPromoteAllocaToVector();
}