//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-call-lowering"

using namespace llvm;
namespace {

/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
                                    Register ValVReg, const CCValAssign &VA) {
  if (VA.getLocVT().getSizeInBits() < 32) {
    // 16-bit types are reported as legal for 32-bit registers. We need to
    // extend and do a 32-bit copy to avoid the verifier complaining about it.
    return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
  }

  return Handler.extendRegister(ValVReg, VA);
}
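
// Handler for outgoing values assigned to physical registers (return values,
// and call arguments via AMDGPUOutgoingArgHandler below). Small values are
// extended to 32 bits, SGPR-assigned values go through a readfirstlane, and
// each physical register is recorded as an implicit use on MIB.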
struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
  AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                             MachineInstrBuilder MIB)
      : OutgoingValueHandler(B, MRI), MIB(MIB) {}

  MachineInstrBuilder MIB;

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO,
                           ISD::ArgFlagsTy Flags) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
                            const MachinePointerInfo &MPO,
                            const CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        const CCValAssign &VA) override {
    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the value
    // ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      LLT Ty = MRI.getType(ExtReg);
      LLT S32 = LLT::scalar(32);
      if (Ty != S32) {
        // FIXME: We should probably support readfirstlane intrinsics with all
        // legal 32-bit types.
        assert(Ty.getSizeInBits() == 32);
        if (Ty.isPointer())
          ExtReg = MIRBuilder.buildPtrToInt(S32, ExtReg).getReg(0);
        else
          ExtReg = MIRBuilder.buildBitcast(S32, ExtReg).getReg(0);
      }

      auto ToSGPR = MIRBuilder
                        .buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                        {MRI.getType(ExtReg)})
                        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }
};
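
// Handler for values arriving in registers or on the stack: formal arguments
// and call return values. Stack-passed values are loaded from fixed frame
// objects created in getStackAddress; subclasses decide how the physical
// register gets marked (basic-block live-in vs. implicit operand of the call).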
struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
  uint64_t StackUsed = 0;

  AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
      : IncomingValueHandler(B, MRI) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO,
                           ISD::ArgFlagsTy Flags) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();

    // Byval is assumed to be writable memory, but other stack passed arguments
    // are not.
    const bool IsImmutable = !Flags.isByVal();
    int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        const CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);

      // If we have signext/zeroext, it applies to the whole 32-bit register
      // before truncation.
      auto Extended =
          buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
      MIRBuilder.buildTrunc(ValVReg, Extended);
      return;
    }

    IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
                            const MachinePointerInfo &MPO,
                            const CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemTy,
        inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in), and a call instruction
  /// (it's an implicit-def of the BL).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
};
struct FormalArgHandler : public AMDGPUIncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
      : AMDGPUIncomingArgHandler(B, MRI) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};
struct CallReturnHandler : public AMDGPUIncomingArgHandler {
  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                    MachineInstrBuilder MIB)
      : AMDGPUIncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIB.addDef(PhysReg, RegState::Implicit);
  }

  MachineInstrBuilder MIB;
};
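
// Handler for outgoing call arguments. Register arguments are copied into
// their physical registers and added as implicit uses of the call; stack
// arguments are stored relative to the stack pointer (converted to a swizzled
// wave address when flat scratch is disabled), or into fixed frame objects
// offset by FPDiff for tail calls.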
struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
  /// For tail calls, the byte offset of the call's argument area from the
  /// callee's. Unused elsewhere.
  int FPDiff;

  // Cache the SP register vreg if we need it more than once in this call site.
  Register SPReg;

  bool IsTailCall;

  AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
                           MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
                           bool IsTailCall = false, int FPDiff = 0)
      : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB), FPDiff(FPDiff),
        IsTailCall(IsTailCall) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO,
                           ISD::ArgFlagsTy Flags) override {
    MachineFunction &MF = MIRBuilder.getMF();
    const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
    const LLT S32 = LLT::scalar(32);

    if (IsTailCall) {
      Offset += FPDiff;
      int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
      auto FIReg = MIRBuilder.buildFrameIndex(PtrTy, FI);
      MPO = MachinePointerInfo::getFixedStack(MF, FI);
      return FIReg.getReg(0);
    }

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (!SPReg) {
      const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>();
      if (ST.enableFlatScratch()) {
        // The stack is accessed unswizzled, so we can use a regular copy.
        SPReg = MIRBuilder.buildCopy(PtrTy,
                                     MFI->getStackPtrOffsetReg()).getReg(0);
      } else {
        // The address we produce here, without knowing the use context, is going
        // to be interpreted as a vector address, so we need to convert to a
        // swizzled address.
        SPReg = MIRBuilder.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {PtrTy},
                                      {MFI->getStackPtrOffsetReg()}).getReg(0);
      }
    }

    auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);

    auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
    MPO = MachinePointerInfo::getStack(MF, Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        const CCValAssign &VA) override {
    MIB.addUse(PhysReg, RegState::Implicit);
    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
    MIRBuilder.buildCopy(PhysReg, ExtReg);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
                            const MachinePointerInfo &MPO,
                            const CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();
    uint64_t LocMemOffset = VA.getLocMemOffset();
    const auto &ST = MF.getSubtarget<GCNSubtarget>();

    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOStore, MemTy,
        commonAlignment(ST.getStackAlignment(), LocMemOffset));
    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
  }

  void assignValueToAddress(const CallLowering::ArgInfo &Arg,
                            unsigned ValRegIndex, Register Addr, LLT MemTy,
                            const MachinePointerInfo &MPO,
                            const CCValAssign &VA) override {
    Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
                           ? extendRegister(Arg.Regs[ValRegIndex], VA)
                           : Arg.Regs[ValRegIndex];
    assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA);
  }
};
} // anonymous namespace

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}
// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}
bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
                                        CallingConv::ID CallConv,
                                        SmallVectorImpl<BaseArgInfo> &Outs,
                                        bool IsVarArg) const {
  // For shaders. Vector types should be explicitly handled by CC.
  if (AMDGPU::isEntryFunctionCC(CallConv))
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
                 MF.getFunction().getContext());

  return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
}
/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val,
                                        ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();
  LLVMContext &Ctx = F.getContext();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  SmallVector<EVT, 8> SplitEVTs;
  ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
  assert(VRegs.size() == SplitEVTs.size() &&
         "For each split Type there should be exactly one VReg.");

  SmallVector<ArgInfo, 8> SplitRetInfos;

  for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
    EVT VT = SplitEVTs[i];
    Register Reg = VRegs[i];
    ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx), 0);
    setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);

    if (VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (RetInfo.Flags[0].isSExt()) {
        assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (RetInfo.Flags[0].isZExt()) {
        assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
                                          extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT != VT) {
        RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
        LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
        Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
      }
    }

    if (Reg != RetInfo.Regs[0]) {
      RetInfo.Regs[0] = Reg;
      // Reset the arg flags after modifying Reg.
      setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
    }

    splitToValueTypes(RetInfo, SplitRetInfos, DL, CC);
  }

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());

  OutgoingValueAssigner Assigner(AssignFn);
  AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
  return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
                                       CC, F.isVarArg());
}
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
                                     ArrayRef<Register> VRegs,
                                     FunctionLoweringInfo &FLI) const {

  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd =
      (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN;
  auto Ret = B.buildInstrNoInsert(ReturnOpc);

  if (!FLI.CanLowerReturn)
    insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
  else if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}
void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
                                           uint64_t Offset) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register KernArgSegmentPtr =
    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
}
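
// Load a single kernel argument: split the original argument into legal
// pieces, compute a constant-address pointer for each piece at
// Offset + FieldOffsets[Idx] off the kernarg segment, and emit an invariant,
// dereferenceable load into the destination register.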
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
                                        uint64_t Offset,
                                        Align Alignment) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  SmallVector<ArgInfo, 32> SplitArgs;
  SmallVector<uint64_t> FieldOffsets;
  splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv(), &FieldOffsets);

  unsigned Idx = 0;
  for (ArgInfo &SplitArg : SplitArgs) {
    Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
    lowerParameterPtr(PtrReg, B, Offset + FieldOffsets[Idx]);

    LLT ArgTy = getLLTForType(*SplitArg.Ty, DL);
    if (SplitArg.Flags[0].isPointer()) {
      // Compensate for losing pointeriness in splitValueTypes.
      LLT PtrTy = LLT::pointer(SplitArg.Flags[0].getPointerAddrSpace(),
                               ArgTy.getScalarSizeInBits());
      ArgTy = ArgTy.isVector() ? LLT::vector(ArgTy.getElementCount(), PtrTy)
                               : PtrTy;
    }

    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        ArgTy, commonAlignment(Alignment, FieldOffsets[Idx]));

    assert(SplitArg.Regs.size() == 1);

    B.buildLoad(SplitArg.Regs[0], PtrReg, *MMO);
    ++Idx;
  }
}
// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (UserSGPRInfo.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  const Module *M = MF.getFunction().getParent();
  if (UserSGPRInfo.hasQueuePtr() &&
      AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (UserSGPRInfo.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (UserSGPRInfo.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (UserSGPRInfo.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}
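
// Kernel (amdgpu_kernel) arguments are not assigned through the normal calling
// convention machinery: user SGPRs are allocated explicitly above, and every
// explicit argument is loaded from the kernarg segment at its ABI offset.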
bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
    Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    if (Arg.use_empty()) {
      ++i;
      continue;
    }

    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);

    if (IsByRef) {
      unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();

      assert(VRegs[i].size() == 1 &&
             "expected only one register for byval pointers");
      if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
        lowerParameterPtr(VRegs[i][0], B, ArgOffset);
      } else {
        const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
        Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
        lowerParameterPtr(PtrReg, B, ArgOffset);

        B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
      }
    } else {
      ArgInfo OrigArg(VRegs[i], Arg, i);
      const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
      setArgFlags(OrigArg, OrigArgIdx, DL, F);
      lowerParameter(B, OrigArg, ArgOffset, Alignment);
    }

    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}
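
// Non-kernel entry points (shaders) and callable functions go through the
// normal argument-assignment machinery below, after special input SGPRs/VGPRs
// and the PS interpolation-input workaround have been accounted for.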
bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
    FunctionLoweringInfo &FLI) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsGraphics = AMDGPU::isGraphics(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();

  if (UserSGPRInfo.hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  // FIXME: This probably isn't defined for mesa
  if (UserSGPRInfo.hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
    Register FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  // Insert the hidden sret parameter if the return value won't fit in the
  // return registers.
  if (!FLI.CanLowerReturn)
    insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL);

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (Register R : VRegs[Idx])
          B.buildUndef(R);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg, Idx);
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(OrigArg, SplitArgs, DL, CC);
    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, so we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  if (!IsEntryFunc && !IsGraphics) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);

    if (!Subtarget.enableFlatScratch())
      CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  IncomingValueAssigner Assigner(AssignFn);
  if (!determineAssignments(Assigner, SplitArgs, CCInfo))
    return false;

  FormalArgHandler Handler(B, MRI);
  if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
    return false;

  uint64_t StackSize = Assigner.StackSize;

  // Start adding system SGPRs.
  if (IsEntryFunc)
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);

  // When we tail call, we need to check if the callee's arguments will fit on
  // the caller's stack. So, whenever we lower formal arguments, we should keep
  // track of this information, since we might lower a tail call in this
  // function later.
  Info->setBytesInStackArgArea(StackSize);

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}
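
// Copy the implicit ABI inputs the callee still needs (dispatch/queue/
// implicit-arg pointers, dispatch id, workgroup IDs, LDS kernel id, and the
// packed workitem IDs) from the caller's preloaded values into the fixed
// registers the callee expects, recording each pair in ArgRegs.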
bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
                                           CCState &CCInfo,
                                           SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
                                           CallLoweringInfo &Info) const {
  MachineFunction &MF = MIRBuilder.getMF();

  // If there's no call site, this doesn't correspond to a call from the IR and
  // doesn't need implicit inputs.
  if (!Info.CB)
    return true;

  const AMDGPUFunctionArgInfo *CalleeArgInfo
    = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
    AMDGPUFunctionArgInfo::LDS_KERNEL_ID,
  };

  static constexpr StringLiteral ImplicitAttrNames[] = {
    "amdgpu-no-dispatch-ptr",
    "amdgpu-no-queue-ptr",
    "amdgpu-no-implicitarg-ptr",
    "amdgpu-no-dispatch-id",
    "amdgpu-no-workgroup-id-x",
    "amdgpu-no-workgroup-id-y",
    "amdgpu-no-workgroup-id-z",
    "amdgpu-no-lds-kernel-id",
  };

  MachineRegisterInfo &MRI = MF.getRegInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());

  unsigned I = 0;
  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;
    LLT ArgTy;

    // If the callee does not use the attribute value, skip copying the value.
    if (Info.CB->hasFnAttr(ImplicitAttrNames[I++]))
      continue;

    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC, ArgTy) =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    Register InputReg = MRI.createGenericVirtualRegister(ArgTy);

    if (IncomingArg) {
      LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
    } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
      LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
    } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
      std::optional<uint32_t> Id =
          AMDGPUMachineFunction::getLDSKernelIdMetadata(MF.getFunction());
      if (Id) {
        MIRBuilder.buildConstant(InputReg, *Id);
      } else {
        MIRBuilder.buildUndef(InputReg);
      }
    } else {
      // We may have proven the input wasn't needed, although the ABI is
      // requiring it. We just need to allocate the register appropriately.
      MIRBuilder.buildUndef(InputReg);
    }

    if (OutgoingArg->isRegister()) {
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
        report_fatal_error("failed to allocate implicit input argument");
    } else {
      LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
      return false;
    }
  }

  // Pack workitem IDs into a single register or pass it as is if already
  // packed.
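  // When packing is needed, the IDs share one 32-bit value: X occupies the low
  // bits, Y is shifted into bits [19:10], and Z into bits [29:20].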
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  std::tie(OutgoingArg, ArgRC, ArgTy) =
      CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return false;

  auto WorkitemIDX =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  auto WorkitemIDY =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  auto WorkitemIDZ =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);

  const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
  const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
  const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
  const LLT S32 = LLT::scalar(32);

  const bool NeedWorkItemIDX = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-z");

  // If incoming ids are not packed we need to pack them.
  // FIXME: Should consider known workgroup size to eliminate known 0 cases.
  Register InputReg;
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
      NeedWorkItemIDX) {
    if (ST.getMaxWorkitemID(MF.getFunction(), 0) != 0) {
      InputReg = MRI.createGenericVirtualRegister(S32);
      LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
                         std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
    } else {
      InputReg = MIRBuilder.buildConstant(S32, 0).getReg(0);
    }
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
      NeedWorkItemIDY && ST.getMaxWorkitemID(MF.getFunction(), 1) != 0) {
    Register Y = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
                       std::get<2>(WorkitemIDY));

    Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
      NeedWorkItemIDZ && ST.getMaxWorkitemID(MF.getFunction(), 2) != 0) {
    Register Z = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
                       std::get<2>(WorkitemIDZ));

    Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
  }

  if (!InputReg &&
      (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    InputReg = MRI.createGenericVirtualRegister(S32);
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
      // We're in a situation where the outgoing function requires the workitem
      // ID, but the calling function does not have it (e.g a graphics function
      // calling a C calling convention function). This is illegal, but we need
      // to produce something.
      MIRBuilder.buildUndef(InputReg);
    } else {
      // Workitem ids are already packed, any of present incoming arguments will
      // carry all required fields.
      ArgDescriptor IncomingArg = ArgDescriptor::createArg(
        IncomingArgX ? *IncomingArgX :
        IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
      LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
                         &AMDGPU::VGPR_32RegClass, S32);
    }
  }

  if (OutgoingArg->isRegister()) {
    if (InputReg)
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);

    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      report_fatal_error("failed to allocate implicit input argument");
  } else {
    LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    return false;
  }

  return true;
}
/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
/// CC.
static std::pair<CCAssignFn *, CCAssignFn *>
getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
}
static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                              bool IsTailCall, bool isWave32,
                              CallingConv::ID CC) {
  // For calls to amdgpu_cs_chain functions, the address is known to be uniform.
  assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) &&
         "Indirect calls can't be tail calls, "
         "because the address can be divergent");
  if (!IsTailCall)
    return AMDGPU::G_SI_CALL;

  if (AMDGPU::isChainCC(CC))
    return isWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;

  return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
                                         AMDGPU::SI_TCRETURN;
}
// Add operands to call instruction to track the callee.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
                                  MachineIRBuilder &MIRBuilder,
                                  AMDGPUCallLowering::CallLoweringInfo &Info) {
  if (Info.Callee.isReg()) {
    CallInst.addReg(Info.Callee.getReg());
    CallInst.addImm(0);
  } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
    // The call lowering lightly assumed we can directly encode a call target in
    // the instruction, which is not the case. Materialize the address here.
    const GlobalValue *GV = Info.Callee.getGlobal();
    auto Ptr = MIRBuilder.buildGlobalValue(
      LLT::pointer(GV->getAddressSpace(), 64), GV);
    CallInst.addReg(Ptr.getReg(0));
    CallInst.add(Info.Callee);
  } else {
    return false;
  }

  return true;
}
bool AMDGPUCallLowering::doCallerAndCalleePassArgsTheSameWay(
    CallLoweringInfo &Info, MachineFunction &MF,
    SmallVectorImpl<ArgInfo> &InArgs) const {
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CalleeCC = Info.CallConv;
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  // If the calling conventions match, then everything must be the same.
  if (CalleeCC == CallerCC)
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  // Make sure that the caller and callee preserve all of the same registers.
  auto TRI = ST.getRegisterInfo();

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
    return false;

  // Check if the caller and callee will handle arguments in the same way.
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *CalleeAssignFnFixed;
  CCAssignFn *CalleeAssignFnVarArg;
  std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
      getAssignFnsForCC(CalleeCC, TLI);

  CCAssignFn *CallerAssignFnFixed;
  CCAssignFn *CallerAssignFnVarArg;
  std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
      getAssignFnsForCC(CallerCC, TLI);

  // FIXME: We are not accounting for potential differences in implicitly passed
  // inputs, but only the fixed ABI is supported now anyway.
  IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
                                       CalleeAssignFnVarArg);
  IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
                                       CallerAssignFnVarArg);
  return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner);
}
bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable(
    CallLoweringInfo &Info, MachineFunction &MF,
    SmallVectorImpl<ArgInfo> &OutArgs) const {
  // If there are no outgoing arguments, then we are done.
  if (OutArgs.empty())
    return true;

  const Function &CallerF = MF.getFunction();
  CallingConv::ID CalleeCC = Info.CallConv;
  CallingConv::ID CallerCC = CallerF.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);

  // We have outgoing arguments. Make sure that we can tail call with them.
  SmallVector<CCValAssign, 16> OutLocs;
  CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
  OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);

  if (!determineAssignments(Assigner, OutArgs, OutInfo)) {
    LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
    return false;
  }

  // Make sure that they can fit on the caller's stack.
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (OutInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) {
    LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
    return false;
  }

  // Verify that the parameters in callee-saved registers match.
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
  MachineRegisterInfo &MRI = MF.getRegInfo();
  return parametersInCSRMatch(MRI, CallerPreservedMask, OutLocs, OutArgs);
}
/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return CC == CallingConv::Fast;
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::AMDGPU_Gfx:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}
bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
    MachineIRBuilder &B, CallLoweringInfo &Info,
    SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const {
  // Must pass all target-independent checks in order to tail call optimize.
  if (!Info.IsTailCall)
    return false;

  // Indirect calls can't be tail calls, because the address can be divergent.
  // TODO Check divergence info if the call really is divergent.
  if (Info.Callee.isReg())
    return false;

  MachineFunction &MF = B.getMF();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CalleeCC = Info.CallConv;
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  // Kernels aren't callable, and don't have a live in return address so it
  // doesn't make sense to do a tail call with entry functions.
  if (!CallerPreserved)
    return false;

  if (!mayTailCallThisCC(CalleeCC)) {
    LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
    return false;
  }

  if (any_of(CallerF.args(), [](const Argument &A) {
        return A.hasByValAttr() || A.hasSwiftErrorAttr();
      })) {
    LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
                         "or swifterror arguments\n");
    return false;
  }

  // If we have -tailcallopt, then we're done.
  if (MF.getTarget().Options.GuaranteedTailCallOpt)
    return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv();

  // Verify that the incoming and outgoing arguments from the callee are
  // safe to tail call.
  if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
    LLVM_DEBUG(
        dbgs()
        << "... Caller and callee have incompatible calling conventions.\n");
    return false;
  }

  if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
    return false;

  LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n");
  return true;
}
// Insert outgoing implicit arguments for a call, by inserting copies to the
// implicit argument registers and adding the necessary implicit uses to the
// call instruction.
void AMDGPUCallLowering::handleImplicitCallArguments(
    MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
    const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
    CallingConv::ID CalleeCC,
    ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
  if (!ST.enableFlatScratch()) {
    // Insert copies for the SRD. In the HSA case, this should be an identity
    // copy.
    auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
                                               FuncInfo.getScratchRSrcReg());

    auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC)
                             ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                             : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;

    MIRBuilder.buildCopy(CalleeRSrcReg, ScratchRSrcReg);
    CallInst.addReg(CalleeRSrcReg, RegState::Implicit);
  }

  for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
    MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
    CallInst.addReg(ArgReg.first, RegState::Implicit);
  }
}
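
// Lower a (sib)call as a tail call: build the tail-call pseudo, add the EXEC
// mask operand for amdgpu_cs_chain callees, marshal outgoing arguments
// relative to FPDiff, and emit ADJCALLSTACKUP/ADJCALLSTACKDOWN only when
// -tailcallopt is in effect.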
bool AMDGPUCallLowering::lowerTailCall(
    MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
    SmallVectorImpl<ArgInfo> &OutArgs) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const Function &F = MF.getFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  // True when we're tail calling, but without -tailcallopt.
  bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;

  // Find out which ABI gets to decide where things go.
  CallingConv::ID CalleeCC = Info.CallConv;
  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);

  MachineInstrBuilder CallSeqStart;
  if (!IsSibCall)
    CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);

  unsigned Opc =
      getCallOpcode(MF, Info.Callee.isReg(), true, ST.isWave32(), CalleeCC);
  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
    return false;

  // Byte offset for the tail call. When we are sibcalling, this will always
  // be 0.
  MIB.addImm(0);

  // If this is a chain call, we need to pass in the EXEC mask.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (AMDGPU::isChainCC(Info.CallConv)) {
    ArgInfo ExecArg = Info.OrigArgs[1];
    assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");

    if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize()))
      return false;

    if (auto CI = dyn_cast<ConstantInt>(ExecArg.OrigValue)) {
      MIB.addImm(CI->getSExtValue());
    } else {
      MIB.addReg(ExecArg.Regs[0]);
      unsigned Idx = MIB->getNumOperands() - 1;
      MIB->getOperand(Idx).setReg(constrainOperandRegClass(
          MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
          MIB->getDesc(), MIB->getOperand(Idx), Idx));
    }
  }

  // Tell the call which registers are clobbered.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
  MIB.addRegMask(Mask);

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0.
  int FPDiff = 0;

  // This will be 0 for sibcalls, potentially nonzero for tail calls produced
  // by -tailcallopt. For sibcalls, the memory operands for the call are
  // already available in the caller's incoming argument space.
  unsigned NumBytes = 0;

  if (!IsSibCall) {
    // We aren't sibcalling, so we need to compute FPDiff. We need to do this
    // before handling assignments, because FPDiff must be known for memory
    // arguments.
    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
    SmallVector<CCValAssign, 16> OutLocs;
    CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());

    // FIXME: Not accounting for callee implicit inputs
    OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg);
    if (!determineAssignments(CalleeAssigner, OutArgs, OutInfo))
      return false;

    // The callee will pop the argument stack as a tail call. Thus, we must
    // keep it 16-byte aligned.
    NumBytes = alignTo(OutInfo.getStackSize(), ST.getStackAlignment());

    // FPDiff will be negative if this tail call requires more space than we
    // would automatically have in our incoming argument space. Positive if we
    // actually shrink the stack.
    FPDiff = NumReusableBytes - NumBytes;

    // The stack pointer must be 16-byte aligned at all times it's used for a
    // memory operation, which in practice means at *all* times and in
    // particular across call boundaries. Therefore our own arguments started at
    // a 16-byte aligned SP and the delta applied for the tail call should
    // satisfy the same constraint.
    assert(isAligned(ST.getStackAlignment(), FPDiff) &&
           "unaligned stack on tail call");
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
      !AMDGPU::isChainCC(Info.CallConv)) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);

  if (!determineAssignments(Assigner, OutArgs, CCInfo))
    return false;

  // Do the actual argument marshalling.
  AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff);
  if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
    return false;

  if (Info.ConvergenceCtrlToken) {
    MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
  }
  handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
                              ImplicitArgRegs);

  // If we have -tailcallopt, we need to adjust the stack. We'll do the call
  // sequence start and end here.
  if (!IsSibCall) {
    MIB->getOperand(1).setImm(FPDiff);
    CallSeqStart.addImm(NumBytes).addImm(0);
    // End the call sequence *before* emitting the call. Normally, we would
    // tidy the frame up after the call. However, here, we've laid out the
    // parameters so that when SP is reset, they will be in the correct
    // location.
    MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0);
  }

  // Now we can add the actual call instruction to the correct basic block.
  MIRBuilder.insertInstr(MIB);

  // If Callee is a reg, since it is used by a target specific
  // instruction, it must have a register class matching the
  // constraint of that instruction.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(0).isReg()) {
    MIB->getOperand(0).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
        MIB->getDesc(), MIB->getOperand(0), 0));
  }

  MF.getFrameInfo().setHasTailCall();
  Info.LoweredTailCall = true;
  return true;
}
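
// The @llvm.amdgcn.cs.chain intrinsic carries its operands as: 0 = callee,
// 1 = EXEC mask, 2 = SGPR arguments (inreg), 3 = VGPR arguments, 4 = flags.
// lowerChainCall rewrites Info to target the real callee and reuses the tail
// call path above.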
/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
                                        CallLoweringInfo &Info) const {
  ArgInfo Callee = Info.OrigArgs[0];
  ArgInfo SGPRArgs = Info.OrigArgs[2];
  ArgInfo VGPRArgs = Info.OrigArgs[3];
  ArgInfo Flags = Info.OrigArgs[4];

  assert(cast<ConstantInt>(Flags.OrigValue)->isZero() &&
         "Non-zero flags aren't supported yet.");
  assert(Info.OrigArgs.size() == 5 && "Additional args aren't supported yet.");

  MachineFunction &MF = MIRBuilder.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getDataLayout();

  // The function to jump to is actually the first argument, so we'll change the
  // Callee and other info to match that before using our existing helper.
  const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
  if (const Function *F = dyn_cast<Function>(CalleeV)) {
    Info.Callee = MachineOperand::CreateGA(F, 0);
    Info.CallConv = F->getCallingConv();
  } else {
    assert(Callee.Regs.size() == 1 && "Too many regs for the callee");
    Info.Callee = MachineOperand::CreateReg(Callee.Regs[0], false);
    Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
                                                  // behaves the same here.
  }

  // The function that we're calling cannot be vararg (only the intrinsic is).
  Info.IsVarArg = false;

  assert(std::all_of(SGPRArgs.Flags.begin(), SGPRArgs.Flags.end(),
                     [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
         "SGPR arguments should be marked inreg");
  assert(std::none_of(VGPRArgs.Flags.begin(), VGPRArgs.Flags.end(),
                      [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
         "VGPR arguments should not be marked inreg");

  SmallVector<ArgInfo, 8> OutArgs;
  splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv);
  splitToValueTypes(VGPRArgs, OutArgs, DL, Info.CallConv);

  Info.IsMustTailCall = true;
  return lowerTailCall(MIRBuilder, Info, OutArgs);
}
bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                   CallLoweringInfo &Info) const {
  if (Function *F = Info.CB->getCalledFunction())
    if (F->isIntrinsic()) {
      assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
             "Unexpected intrinsic");
      return lowerChainCall(MIRBuilder, Info);
    }

  if (Info.IsVarArg) {
    LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
    return false;
  }

  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const Function &F = MF.getFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getDataLayout();

  SmallVector<ArgInfo, 8> OutArgs;
  for (auto &OrigArg : Info.OrigArgs)
    splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);

  SmallVector<ArgInfo, 8> InArgs;
  if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
    splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);

  // If we can lower as a tail call, do that instead.
  bool CanTailCallOpt =
      isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);

  // We must emit a tail call if we have musttail.
  if (Info.IsMustTailCall && !CanTailCallOpt) {
    LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
    return false;
  }

  Info.IsTailCall = CanTailCallOpt;
  if (CanTailCallOpt)
    return lowerTailCall(MIRBuilder, Info, OutArgs);

  // Find out which ABI gets to decide where things go.
  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) =
      getAssignFnsForCC(Info.CallConv, TLI);

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
    .addImm(0)
    .addImm(0);

  // Create a temporarily-floating call instruction so we can add the implicit
  // uses of arg registers.
  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, ST.isWave32(),
                               Info.CallConv);

  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
  MIB.addDef(TRI->getReturnAddressReg(MF));

  if (!Info.IsConvergent)
    MIB.setMIFlag(MachineInstr::NoConvergent);

  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
    return false;

  // Tell the call which registers are clobbered.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
  MIB.addRegMask(Mask);

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  // Do the actual argument marshalling.
  SmallVector<Register, 8> PhysRegs;

  OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
  if (!determineAssignments(Assigner, OutArgs, CCInfo))
    return false;

  AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
  if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
    return false;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (Info.ConvergenceCtrlToken) {
    MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
  }
  handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
                              ImplicitArgRegs);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getStackSize();

  // If Callee is a reg, since it is used by a target specific
  // instruction, it must have a register class matching the
  // constraint of that instruction.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(1).isReg()) {
    MIB->getOperand(1).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *ST.getInstrInfo(),
        *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
        1));
  }

  // Now we can add the actual call instruction to the correct position.
  MIRBuilder.insertInstr(MIB);

  // Finally we can copy the returned value back into its virtual-register. In
  // symmetry with the arguments, the physical register must be an
  // implicit-define of the call instruction.
  if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
    CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
                                                      Info.IsVarArg);
    IncomingValueAssigner Assigner(RetAssignFn);
    CallReturnHandler Handler(MIRBuilder, MRI, MIB);
    if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
                                       Info.CallConv, Info.IsVarArg))
      return false;
  }

  uint64_t CalleePopBytes = NumBytes;

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
    .addImm(0)
    .addImm(CalleePopBytes);

  if (!Info.CanLowerReturn) {
    insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
                    Info.DemoteRegister, Info.DemoteStackIndex);