//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUCallLowering.h"

#include "AMDGPUISelLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

using namespace llvm;

namespace {
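
/// Value handler used when lowering outgoing values (currently only return
/// values): each assigned value is extended or copied into its physical
/// register, which is then added as an implicit use of \p MIB.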
struct OutgoingValueHandler : public CallLowering::ValueHandler {
  OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                       MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : ValueHandler(B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  bool isIncomingArgumentHandler() const override { return false; }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg;
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about it.
      ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    } else
      ExtReg = extendRegister(ValVReg, VA);

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};
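
/// Value handler for incoming arguments: values arriving in physical
/// registers are copied (and truncated if needed) into their virtual
/// registers, and values passed on the stack are loaded from fixed frame
/// objects.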
struct IncomingArgHandler : public CallLowering::ValueHandler {
  uint64_t StackUsed = 0;

  IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                     CCAssignFn *AssignFn)
    : ValueHandler(B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    Register AddrReg = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32));
    MIRBuilder.buildFrameIndex(AddrReg, FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg;
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    // FIXME: Get alignment
    auto MMO = MIRBuilder.getMF().getMachineMemOperand(
      MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, 1);
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in), and a call instruction
  /// (it's an implicit-def of the BL).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;

  // FIXME: What is the point of this being a callback?
  bool isIncomingArgumentHandler() const override { return true; }
};
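
/// Incoming-argument handler for formal arguments, where each used physical
/// register is recorded as a live-in of the entry block.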
struct FormalArgHandler : public IncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
    : IncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

} // end anonymous namespace
AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}
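
/// Break \p OrigArg into one ArgInfo per legalized part register for the
/// given calling convention, invoking \p PerformArgSplit for any value that
/// needs more than one part so the caller can repack the pieces.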
void AMDGPUCallLowering::splitToValueTypes(
    const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
    const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv,
    SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    Type *Ty = VT.getTypeForEVT(Ctx);

    if (NumParts == 1) {
      // No splitting to do, but we want to replace the original type (e.g. [1 x
      // double] -> double).
      SplitArgs.emplace_back(OrigArg.Regs[SplitIdx], Ty,
                             OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    LLT LLTy = getLLTForType(*Ty, DL);

    SmallVector<Register, 8> SplitRegs;

    EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
    Type *PartTy = PartVT.getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}
// Get the appropriate type to make \p OrigTy \p Factor times bigger.
static LLT getMultipleType(LLT OrigTy, int Factor) {
  if (OrigTy.isVector()) {
    return LLT::vector(OrigTy.getNumElements() * Factor,
                       OrigTy.getElementType());
  }

  return LLT::scalar(OrigTy.getSizeInBits() * Factor);
}
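
// Split SrcReg (of type SrcTy) into the part-sized registers DstRegs (of type
// PartTy), padding the source into a larger register first if its size is not
// a multiple of the part size.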
// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(),
                                         SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  if (SrcSize % PartSize == 0) {
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;

  LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
  auto ImpDef = B.buildUndef(BigTy);

  Register BigReg = MRI.createGenericVirtualRegister(BigTy);
  B.buildInsert(BigReg, ImpDef.getReg(0), SrcReg, 0).getReg(0);

  int64_t Offset = 0;
  for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
    B.buildExtract(DstRegs[i], BigReg, Offset);
}
/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val, ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
    OrigRetInfo, SplitRetInfos, DL, MRI, CC,
    [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
      unpackRegsToOrigType(B, Regs, VRegs[VTSplitIdx], LLTy, PartLLT);
    });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());

  OutgoingValueHandler RetHandler(B, MF.getRegInfo(), Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
                         AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  auto const &ST = B.getMF().getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}
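
/// Compute a pointer to the byte at \p Offset from the kernarg segment base
/// pointer for a kernel argument of type \p ParamTy.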
Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
                                               Type *ParamTy,
                                               uint64_t Offset) const {

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  LLT PtrType = getLLTForType(*PtrTy, DL);
  Register DstReg = MRI.createGenericVirtualRegister(PtrType);
  Register KernArgSegmentPtr =
    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
  B.buildConstant(OffsetReg, Offset);

  B.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);

  return DstReg;
}
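
/// Load the kernel argument of type \p ParamTy located at \p Offset in the
/// kernarg segment into \p DstReg.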
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B,
                                        Type *ParamTy, uint64_t Offset,
                                        unsigned Align,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
  Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
                                       MachineMemOperand::MODereferenceable |
                                       MachineMemOperand::MOInvariant,
                                       TypeSize, Align);

  B.buildLoad(DstReg, PtrReg, *MMO);
}
// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    unsigned DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}
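
// Kernel arguments are not passed through the normal calling-convention
// machinery; they are loaded explicitly from the kernarg segment, so lower
// them with direct loads instead of handleAssignments.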
bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const unsigned KernArgBaseAlign = 16;
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    ArrayRef<Register> OrigArgRegs = VRegs[i];
    Register ArgReg =
      OrigArgRegs.size() == 1
      ? OrigArgRegs[0]
      : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
    unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
    ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
    lowerParameter(B, ArgTy, ArgOffset, Align, ArgReg);
    if (OrigArgRegs.size() > 1)
      unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}
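
// Reassemble the original value in OrigRegs[0] (of type LLTy) from the part
// registers Regs (of type PartLLT), using merges, vector concats, or
// build_vector depending on how the value was split.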
// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs,
                                    LLT LLTy,
                                    LLT PartLLT) {
  if (!LLTy.isVector() && !PartLLT.isVector()) {
    B.buildMerge(OrigRegs[0], Regs);
    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(LLTy.getElementType() == PartLLT.getElementType());

    int DstElts = LLTy.getNumElements();
    int PartElts = PartLLT.getNumElements();
    if (DstElts % PartElts == 0)
      B.buildConcatVectors(OrigRegs[0], Regs);
    else {
      // Deal with v3s16 split into v2s16
      assert(PartElts == 2 && DstElts % 2 != 0);
      int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts);

      LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType());
      auto RoundedConcat = B.buildConcatVectors(RoundedDestTy, Regs);
      B.buildExtract(OrigRegs[0], RoundedConcat, 0);
    }

    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();
  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.
    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(DstEltTy,
                                Regs.take_front(PartsPerElt));
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}
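
// Lower formal arguments for non-kernel functions and shaders: split each IR
// argument into legalized parts, let handleAssignments place them, and
// allocate the special SGPR/VGPR inputs the ABI requires.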
bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions not implemented.
    if (!IsShader && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);

    splitToValueTypes(
      OrigArg, SplitArgs, DL, MRI, CC,
      // FIXME: We should probably be passing multiple registers to
      // handleAssignments to do this
      [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
        packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
                                LLTy, PartLLT);
      });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
          countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
    CCInfo.AllocateReg(Info->getFrameOffsetReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}