//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

using namespace llvm;

namespace {
struct OutgoingValueHandler : public CallLowering::ValueHandler {
  OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                       MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : ValueHandler(B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg;
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about it.
      ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    } else
      ExtReg = extendRegister(ValVReg, VA);

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};
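
// Handler for values arriving in the current function: stack-passed values
// get a fixed frame index and a load, while register-passed values are copied
// from their physical registers (with a 32-bit round trip for sub-32-bit
// types to keep the verifier happy).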
struct IncomingArgHandler : public CallLowering::ValueHandler {
  uint64_t StackUsed = 0;

  IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                     CCAssignFn *AssignFn)
    : ValueHandler(B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    Register AddrReg = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32));
    MIRBuilder.buildFrameIndex(AddrReg, FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg;
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    // FIXME: Get alignment
    auto MMO = MIRBuilder.getMF().getMachineMemOperand(
      MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, 1);
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in), and a call instruction
  /// (it's an implicit-def of the BL).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;

  // FIXME: What is the point of this being a callback?
  bool isIncomingArgumentHandler() const override { return true; }
};

struct FormalArgHandler : public IncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
    : IncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

} // end anonymous namespace

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}
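
// Break \p OrigArg into one ArgInfo per part register required by the calling
// convention. \p PerformArgSplit is invoked with the part registers so the
// caller can record how they map back onto the original value.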
void AMDGPUCallLowering::splitToValueTypes(
    const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
    const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv,
    SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    Type *Ty = VT.getTypeForEVT(Ctx);

    if (NumParts == 1) {
      // No splitting to do, but we want to replace the original type (e.g. [1 x
      // double] -> double).
      SplitArgs.emplace_back(OrigArg.Regs[SplitIdx], Ty,
                             OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    LLT LLTy = getLLTForType(*Ty, DL);

    SmallVector<Register, 8> SplitRegs;

    EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
    Type *PartTy = PartVT.getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}

// Get the appropriate type to make \p OrigTy \p Factor times bigger.
static LLT getMultipleType(LLT OrigTy, int Factor) {
  if (OrigTy.isVector()) {
    return LLT::vector(OrigTy.getNumElements() * Factor,
                       OrigTy.getElementType());
  }

  return LLT::scalar(OrigTy.getSizeInBits() * Factor);
}
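
// Split the value in a single source register into the given destination
// part registers, inserting unmerges, extensions, or extracts as required.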
// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  if (SrcSize % PartSize == 0) {
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;

  LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
  auto ImpDef = B.buildUndef(BigTy);

  Register BigReg = MRI.createGenericVirtualRegister(BigTy);
  B.buildInsert(BigReg, ImpDef.getReg(0), SrcReg, 0).getReg(0);

  int64_t Offset = 0;
  for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
    B.buildExtract(DstRegs[i], BigReg, Offset);
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val, ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
    OrigRetInfo, SplitRetInfos, DL, MRI, CC,
    [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
      unpackRegsToOrigType(B, Regs, VRegs[VTSplitIdx], LLTy, PartLLT);
    });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());

  OutgoingValueHandler RetHandler(B, MF.getRegInfo(), Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
                         AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  auto const &ST = B.getMF().getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}
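
// Build a pointer into the kernarg segment: copy the preloaded kernarg
// segment pointer and offset it by \p Offset bytes.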
Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
                                               Type *ParamTy,
                                               uint64_t Offset) const {

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  LLT PtrType = getLLTForType(*PtrTy, DL);
  Register DstReg = MRI.createGenericVirtualRegister(PtrType);
  Register KernArgSegmentPtr =
    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
  B.buildConstant(OffsetReg, Offset);

  B.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);

  return DstReg;
}
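
// Load a kernel argument of IR type \p ParamTy from the kernarg segment at
// byte offset \p Offset into \p DstReg.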
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B,
                                        Type *ParamTy, uint64_t Offset,
                                        unsigned Align,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
  Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
                                       MachineMemOperand::MODereferenceable |
                                       MachineMemOperand::MOInvariant,
                                       TypeSize, Align);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    unsigned DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}
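
// Kernel (AMDGPU_KERNEL) arguments are not assigned to registers by the
// calling convention; each one is loaded from the kernarg segment at its
// computed offset, so the generic assignment machinery is bypassed here.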
bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const unsigned KernArgBaseAlign = 16;
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0) {
      ++i;
      continue;
    }

    unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    ArrayRef<Register> OrigArgRegs = VRegs[i];
    Register ArgReg =
      OrigArgRegs.size() == 1
      ? OrigArgRegs[0]
      : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
    unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
    ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
    lowerParameter(B, ArgTy, ArgOffset, Align, ArgReg);
    if (OrigArgRegs.size() > 1)
      unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);

    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}
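
// Reassemble the per-part registers produced by splitToValueTypes back into
// the original virtual register OrigRegs[0] (of type LLTy), using merges,
// vector concatenation/build, and truncation as needed.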
// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs,
                                    LLT LLTy,
                                    LLT PartLLT) {
  if (!LLTy.isVector() && !PartLLT.isVector()) {
    B.buildMerge(OrigRegs[0], Regs);
    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(LLTy.getElementType() == PartLLT.getElementType());

    int DstElts = LLTy.getNumElements();
    int PartElts = PartLLT.getNumElements();
    if (DstElts % PartElts == 0)
      B.buildConcatVectors(OrigRegs[0], Regs);
    else {
      // Deal with v3s16 split into v2s16
      assert(PartElts == 2 && DstElts % 2 != 0);
      int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts);

      LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType());
      auto RoundedConcat = B.buildConcatVectors(RoundedDestTy, Regs);
      B.buildExtract(OrigRegs[0], RoundedConcat, 0);
    }

    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();
  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.
    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(DstEltTy, Regs.take_front(PartsPerElt));
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}
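
// Lower formal arguments for graphics shaders and callable functions; kernel
// entry points take the lowerFormalArgumentsKernel path above.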
bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions not implemented.
    if (!IsShader && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);

    splitToValueTypes(
      OrigArg, SplitArgs, DL, MRI, CC,
      // FIXME: We should probably be passing multiple registers to
      // handleAssignments to do this
      [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
        packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
                                LLTy, PartLLT);
      });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, so we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
          countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
    CCInfo.AllocateReg(Info->getFrameOffsetReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}