//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks
/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
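/// For example, a buffer resource descriptor that is only available in VGPRs
/// must be wrapped in such a loop: each iteration reads the descriptor from
/// the first active lane with v_readfirstlane_b32, restricts exec to the lanes
/// that happen to share that value, runs the operation, and repeats until every
/// lane has been covered (an illustrative sketch of the strategy, not a literal
/// expansion).
///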
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An s1 value in an SGPR class
/// always means a VCC bank value; any other type maps to the SGPR bank. A
/// scalar compare sets SCC, which is a 1-bit unaddressable register. This will
/// need to be copied to a 32-bit virtual register. Taken together, this means
/// we need to adjust the type of boolean operations to be regbank legal. All
/// SALU booleans need to be widened to 32-bits, and all VALU booleans need to
/// be s1 values.
///
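/// For example, a divergent G_ICMP result lives in the VCC bank as a wave-wide
/// mask, while a uniform G_ICMP sets SCC and is copied into a 32-bit SGPR, so
/// its users see an SGPR-bank s32 value rather than an s1.
///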
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
/// instructions). This is one unique SGPR, so the same SGPR may be used for
/// multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
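/// For example, on targets before gfx10 "v_add_f32 v0, s0, s1" reads two
/// different SGPRs and violates the restriction, while "v_add_f32 v0, s0, s0"
/// is legal because only one unique SGPR is read.
///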
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  MachineIRBuilder &B;
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
    assert(!B.isObservingChanges());
    B.setChangeObserver(*this);
  }

  ~ApplyRegBankMapping() override {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);

    B.stopObservingChanges();
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        B.setInsertPt(*MI.getParent(), MI);

        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
        return;
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

} // anonymous namespace

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
  return RB != &AMDGPU::SGPRRegBank;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          TypeSize Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.
  //
  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value to write, lane select, old value
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  const unsigned MemSize = 8 * MMO->getSize().getValue();

  // Require 4-byte alignment.
  return (MMO->getAlign() >= Align(4) ||
          (Subtarget.hasScalarSubwordLoads() &&
           ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
            (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
         // Can't do a scalar atomic load.
         !MMO->isAtomic() &&
         // Don't use scalar loads for volatile accesses to non-constant address
         // spaces.
         (IsConst || !MMO->isVolatile()) &&
         // Memory must be known constant, or not written before this load.
         (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
         AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_IMPLICIT_DEF: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    [[fallthrough]];
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1,
        getOperandsMapping(
            {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
             AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }

  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getElementCount().isKnownMultipleOf(2));
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
  }

  assert(Ty.getScalarSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}

// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
                                                    MachineRegisterInfo &MRI,
                                                    Register Src) const {
  LLT Ty = MRI.getType(Src);
  const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);

  if (Bank == &AMDGPU::SGPRRegBank)
    return Src;

  unsigned Bits = Ty.getSizeInBits();
  assert(Bits % 32 == 0);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR
    Src = B.buildCopy(Ty, Src).getReg(0);
    MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
  }

  LLT S32 = LLT::scalar(32);
  unsigned NumParts = Bits / 32;
  SmallVector<Register, 8> SrcParts;
  SmallVector<Register, 8> DstParts;

  if (Bits == 32) {
    SrcParts.push_back(Src);
  } else {
    auto Unmerge = B.buildUnmerge(S32, Src);
    for (unsigned i = 0; i < NumParts; ++i)
      SrcParts.push_back(Unmerge.getReg(i));
  }

  for (unsigned i = 0; i < NumParts; ++i) {
    Register SrcPart = SrcParts[i];
    Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);

    const TargetRegisterClass *Constrained =
        constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
    (void)Constrained;
    assert(Constrained && "Failed to constrain readfirstlane src reg");

    B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});

    DstParts.push_back(DstPart);
  }

  if (Bits == 32)
    return DstParts[0];

  Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
  MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
  return Dst;
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that rest of the instructions are
/// moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity to try to identify the unique values used,
/// so that all lanes sharing a value are handled together.
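///
/// The set of lanes that share the current value is found by comparing each
/// VGPR operand against its readfirstlane result (AND-ing the per-part
/// compares together and feeding the result through amdgcn.ballot), so all
/// lanes that happen to hold the same value are retired in a single iteration
/// rather than one lane at a time.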
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
    SmallSet<Register, 4> &SGPROperandRegs) const {
  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned MovExecOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  const unsigned MovExecTermOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;

  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  MachineRegisterInfo &MRI = *B.getMRI();
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, BodyBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(BodyBB);

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = BodyBB->end();

  B.setInsertPt(*LoopBB, LoopBB->end());

  LLT S1 = LLT::scalar(1);
  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in the
      // sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setInsertPt(*LoopBB, LoopBB->end());
      }

      Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);

      // Build the comparison(s).
      unsigned OpSize = OpTy.getSizeInBits();
      bool Is64 = OpSize % 64 == 0;
      unsigned PartSize = Is64 ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
        auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
          MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
          MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
                                  OpParts[i]).getReg(0);
        MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);

        if (!CondReg) {
          CondReg = CmpReg;
        } else {
          CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
          MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
        }
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // The ballot becomes a no-op during instruction selection.
  CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
                             {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
                .addReg(CondReg)
                .getReg(0);
  MRI.setRegClass(CondReg, WaveRC);

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovExecTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  MachineRegisterInfo &MRI = *B.getMRI();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  Reg = buildReadFirstLane(B, MRI, Reg);
  MI.getOperand(OpIdx).setReg(Reg);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
  unsigned TotalSize = Ty.getSizeInBits();
  if (!Ty.isVector())
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};

  LLT EltTy = Ty.getElementType();
  unsigned EltSize = EltTy.getSizeInBits();
  assert(FirstSize % EltSize == 0);

  unsigned FirstPartNumElts = FirstSize / EltSize;
  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;

  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
}

static LLT widen96To128(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);

  LLT EltTy = Ty.getElementType();
  assert(128 % EltTy.getSizeInBits() == 0);
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
}

bool AMDGPURegisterBankInfo::applyMappingLoad(
    MachineIRBuilder &B,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32 bit and 96
    // bit SGPR loads otherwise we have nothing to do.
    if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
      return false;

    MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned MemSize = 8 * MMO->getSize().getValue();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
    // scalar loads should have a load size of 32 but memory access size of less
    // than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    if (LoadSize == 32 &&
        ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
         (MemSize == 16 && MMO->getAlign() >= Align(2))) &&
        isScalarLoadLegal(MI) &&
        Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)
      return false;

    Register PtrReg = MI.getOperand(1).getReg();

    ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split this
      // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
      if (MMO->getAlign() < Align(16)) {
        LegalizerHelper Helper(B.getMF(), ApplyBank, B);
        LLT Part64, Part32;
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
            LegalizerHelper::Legalized)
          return false;
        return true;
      }

      LLT WiderTy = widen96To128(LoadTy);
      auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
      if (WiderTy.isScalar()) {
        B.buildTrunc(MI.getOperand(0), WideLoad);
      } else {
        B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
                                            WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
  ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
  LegalizerHelper Helper(B.getMF(), O, B);

  if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  } else {
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
    MachineIRBuilder &B,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  // TODO: Need to emit a wave reduction to get the maximum size.
  if (SizeBank != &AMDGPU::SGPRRegBank)
    return false;

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);

  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineIRBuilder &B, MachineInstr &MI,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(B, MI, SGPRIndexes);
  return true;
}

// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
// the three offsets (voffset, soffset and instoffset)
unsigned AMDGPURegisterBankInfo::setBufferOffsets(
    MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
    Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo *MRI = B.getMRI();

  if (std::optional<int64_t> Imm =
          getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  Register Base;
  unsigned Offset;

  std::tie(Base, Offset) =
      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);

  uint32_t SOffset, ImmOffset;
  if ((int)Offset > 0 &&
      TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
    if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have SGPR base, we can use it for soffset.
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Handle the variable sgpr + vgpr case.
  MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
  if (Add && (int)Offset >= 0) {
    Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
    Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);

    const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
    const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);

    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Ensure we have a VGPR for the combined offset. This could be an issue if we
  // have an SGPR offset and a VGPR resource.
  if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}

bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  const RegisterBank *RSrcBank =
    OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank *OffsetBank =
    OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
  if (RSrcBank == &AMDGPU::SGPRRegBank &&
      OffsetBank == &AMDGPU::SGPRRegBank)
    return true; // Legal mapping

  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
  // here but don't have an MMO.

  unsigned LoadSize = Ty.getSizeInBits();
  int NumLoads = 1;
  if (LoadSize == 256 || LoadSize == 512) {
    NumLoads = LoadSize / 128;
    Ty = Ty.divide(NumLoads);
  }

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);

  MachineFunction &MF = B.getMF();

  Register SOffset;
  Register VOffset;
  int64_t ImmOffset = 0;

  unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
                                        SOffset, ImmOffset, Alignment);

  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
  // can, but we need to track an MMO for that.
  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
  const Align MemAlign(4); // FIXME: ABI type alignment?
  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
    MachinePointerInfo(),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,
    MemSize, MemAlign);
  if (MMOOffset != 0)
    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);

  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.

  Register RSrc = MI.getOperand(1).getReg();
  Register VIndex = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);

  SmallVector<Register, 4> LoadParts(NumLoads);

  MachineBasicBlock::iterator MII = MI.getIterator();
  MachineInstrSpan Span(MII, &B.getMBB());

  for (int i = 0; i < NumLoads; ++i) {
    if (NumLoads == 1) {
      LoadParts[i] = Dst;
    } else {
      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
      MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
    }

    MachineMemOperand *MMO = BaseMMO;
    if (i != 0)
      BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);

    B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
      .addDef(LoadParts[i])       // vdata
      .addUse(RSrc)               // rsrc
      .addUse(VIndex)             // vindex
      .addUse(VOffset)            // voffset
      .addUse(SOffset)            // soffset
      .addImm(ImmOffset + 16 * i) // offset(imm)
      .addImm(0)                  // cachepolicy, swizzled buffer(imm)
      .addImm(0)                  // idxen(imm)
      .addMemOperand(MMO);
  }

  // TODO: If only the resource is a VGPR, it may be better to execute the
  // scalar load in the waterfall loop if the resource is expected to frequently
  // be dynamically uniform.
  if (RSrcBank != &AMDGPU::SGPRRegBank) {
    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    SmallSet<Register, 4> OpsToWaterfall;

    OpsToWaterfall.insert(RSrc);
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall);
  }

  if (NumLoads != 1) {
    if (Ty.isVector())
      B.buildConcatVectors(Dst, LoadParts);
    else
      B.buildMergeLikeInstr(Dst, LoadParts);
  }

  // We removed the instruction earlier with a waterfall loop.
  if (RSrcBank == &AMDGPU::SGPRRegBank)
    MI.eraseFromParent();

  return true;
}

bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
                                             const OperandsMapper &OpdMapper,
                                             bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);

  const LLT S32 = LLT::scalar(32);

  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register SrcReg = MI.getOperand(FirstOpnd).getReg();
  Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
  Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::VGPRRegBank) {
    if (Ty == S32)
      return true;

    // There is no 64-bit vgpr bitfield extract instructions so the operation
    // is expanded to a sequence of instructions that implement the operation.
    ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);

    const LLT S64 = LLT::scalar(64);
    // Shift the source operand so that extracted bits start at bit 0.
    auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
                              : B.buildLShr(S64, SrcReg, OffsetReg);
    auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);

    // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
    // if the width is a constant.
    if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
      // Use the 32-bit bitfield extract instruction if the width is a constant.
      // Depending on the width size, use either the low or high 32-bits.
      auto Zero = B.buildConstant(S32, 0);
      auto WidthImm = ConstWidth->Value.getZExtValue();
      if (WidthImm <= 32) {
        // Use bitfield extract on the lower 32-bit source, and then sign-extend
        // or clear the upper 32-bits.
        auto Extract =
          Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
                 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
        auto Extend =
          Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
        B.buildMergeLikeInstr(DstReg, {Extract, Extend});
      } else {
        // Use bitfield extract on upper 32-bit source, and combine with lower
        // 32-bit source.
        auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
        auto Extract =
          Signed
            ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
            : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
        B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
      }
      MI.eraseFromParent();
      return true;
    }

    // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
    // operations.
    auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
    auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
    if (Signed)
      B.buildAShr(S64, SignBit, ExtShift);
    else
      B.buildLShr(S64, SignBit, ExtShift);
    MI.eraseFromParent();
    return true;
  }

  // The scalar form packs the offset and width in a single operand.

  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);

  // Ensure the high bits are clear to insert the offset.
  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);

  // Zeros out the low bits, so don't bother clamping the input value.
  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));

  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);

  // TODO: It might be worth using a pseudo here to avoid scc clobber and
  // register class constraints.
  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);

  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    llvm_unreachable("failed to constrain BFE");

  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies.
  applyDefaultMapping(OpdMapper);

  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register Src1 = MI.getOperand(3).getReg();
  Register Src2 = MI.getOperand(4).getReg();

  if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
    return true;

  bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
  LLT S1 = LLT::scalar(1);
  LLT S32 = LLT::scalar(32);

  bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
  bool Accumulate = true;

  if (!DstOnValu) {
    if (mi_match(Src2, MRI, m_ZeroInt()))
      Accumulate = false;
  }

  // Keep the multiplication on the SALU.
  Register DstHi;
  Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
  bool MulHiInVgpr = false;

  MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);

  if (Subtarget.hasSMulHi()) {
    DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
                       : B.buildSMulH(S32, Src0, Src1).getReg(0);
    MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
  } else {
    Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
    Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);

    MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
    MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);

    DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
                       : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
    MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);

    if (!DstOnValu) {
      DstHi = buildReadFirstLane(B, MRI, DstHi);
    } else {
      MulHiInVgpr = true;
    }
  }

  // Accumulate and produce the "carry-out" bit.
  //
  // The "carry-out" is defined as bit 64 of the result when computed as a
  // big integer. For unsigned multiply-add, this matches the usual definition
  // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
  // result, which is determined as:
  //   sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
  LLT CarryType = DstOnValu ? S1 : S32;
  const RegisterBank &CarryBank =
      DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  const RegisterBank &DstBank =
      DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
  Register Carry;
  Register Zero;

  if (!IsUnsigned) {
    Zero = B.buildConstant(S32, 0).getReg(0);
    MRI.setRegBank(Zero,
                   MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);

    Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
                .getReg(0);
    MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
                                      : AMDGPU::SGPRRegBank);

    if (DstOnValu && !MulHiInVgpr) {
      Carry = B.buildTrunc(S1, Carry).getReg(0);
      MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
    }
  }

  if (Accumulate) {
    if (DstOnValu) {
      DstLo = B.buildCopy(S32, DstLo).getReg(0);
      DstHi = B.buildCopy(S32, DstHi).getReg(0);
      MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
      MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
    }

    auto Unmerge = B.buildUnmerge(S32, Src2);
    Register Src2Lo = Unmerge.getReg(0);
    Register Src2Hi = Unmerge.getReg(1);
    MRI.setRegBank(Src2Lo, DstBank);
    MRI.setRegBank(Src2Hi, DstBank);

    if (!IsUnsigned) {
      auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
      MRI.setRegBank(Src2Sign.getReg(0), CarryBank);

      Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
      MRI.setRegBank(Carry, CarryBank);
    }

    auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
    DstLo = AddLo.getReg(0);
    Register CarryLo = AddLo.getReg(1);
    MRI.setRegBank(DstLo, DstBank);
    MRI.setRegBank(CarryLo, CarryBank);

    auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
    DstHi = AddHi.getReg(0);
    MRI.setRegBank(DstHi, DstBank);

    Register CarryHi = AddHi.getReg(1);
    MRI.setRegBank(CarryHi, CarryBank);

    if (IsUnsigned) {
      Carry = CarryHi;
    } else {
      Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
      MRI.setRegBank(Carry, CarryBank);
    }
  } else {
    if (IsUnsigned) {
      Carry = B.buildConstant(CarryType, 0).getReg(0);
      MRI.setRegBank(Carry, CarryBank);
    }
  }

  B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});

  if (DstOnValu) {
    B.buildCopy(Dst1, Carry);
  } else {
    B.buildTrunc(Dst1, Carry);
  }

  MI.eraseFromParent();
  return true;
}
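// Illustrative check of the signed carry-out rule above (values chosen here
// for exposition, not from a test): with Src0 = Src1 = 1 and Src2 = -1, the
// wide result is 0, so bit 64 is 0; xor-ing sign(Src0*Src1) = 0,
// sign(Src2) = 1 and the carry from the unsigned 64-bit add (1) reproduces
// that 0, which is exactly what the buildXor chain computes.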
// Return a suitable opcode for extending the operands of Opc when widening.
static unsigned getExtendOp(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
    return TargetOpcode::G_SEXT;
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
    return TargetOpcode::G_ZEXT;
  default:
    return TargetOpcode::G_ANYEXT;
  }
}
// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
// any illegal vector extend or unmerge operations.
static std::pair<Register, Register>
unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
  const LLT S32 = LLT::scalar(32);
  auto Bitcast = B.buildBitcast(S32, Src);

  if (ExtOpcode == TargetOpcode::G_SEXT) {
    auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
    auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
    return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
  }

  auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
  if (ExtOpcode == TargetOpcode::G_ZEXT) {
    auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
    return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
  }

  assert(ExtOpcode == TargetOpcode::G_ANYEXT);
  return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));
}
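// For example, a <2 x s16> source whose 32-bit bitcast is 0x80010002 unpacks
// under G_SEXT into the pair (0x00000002, 0xFFFF8001): the low element comes
// from the sext_inreg and the high element from the arithmetic shift by 16.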
// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand
static bool substituteSimpleCopyRegs(
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
  if (!SrcReg.empty()) {
    assert(SrcReg.size() == 1);
    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
    return true;
  }

  return false;
}
/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register Reg) const {
  if (!Subtarget.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  LLT StoreVT = MRI.getType(Reg);
  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
    return Reg;

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(Unmerge.getReg(I));

  const LLT S32 = LLT::scalar(32);
  int NumElts = StoreVT.getNumElements();

  return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
      .getReg(0);
}
static std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
  int64_t Const;
  if (mi_match(Reg, MRI, m_ICst(Const)))
    return std::pair(Register(), Const);

  Register Base;
  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
    return std::pair(Base, Const);

  // TODO: Handle G_OR used for add case
  return std::pair(Reg, 0);
}
std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget);
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);

  // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
                                                           OrigOffset);

  unsigned C1 = 0;
  if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, put only bits
    // that would normally fit in the immoffset field. The remaining value that
    // is copied/added for the voffset field is a large power of 2, and it
    // stands more chance of being CSEd with the copy/add for another similar
    // load/store.
    // However, do not do that rounding down if that is a negative
    // number, as it appears to be illegal to have a negative offset in the
    // vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }

    C1 = ImmOffset;
    if (Overflow != 0) {
      if (!BaseReg)
        BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      else {
        auto OverflowVal = B.buildConstant(S32, Overflow);
        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
      }
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return {BaseReg, C1};
}
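// For example, if the subtarget's immediate field holds offsets up to 4095,
// an incoming constant offset of 4100 splits into Overflow = 4096
// (materialized or added into the returned base register) and ImmOffset = 4
// returned as C1.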
bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
                                        Register SrcReg) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.getSizeInBits() == 32) {
    // Use a v_mov_b32 here to make the exec dependency explicit.
    B.buildInstr(AMDGPU::V_MOV_B32_e32)
      .addDef(DstReg)
      .addUse(SrcReg);
    return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
           constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
  }

  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  B.buildInstr(AMDGPU::V_MOV_B32_e32)
    .addDef(TmpReg0)
    .addUse(SrcReg, 0, AMDGPU::sub0);
  B.buildInstr(AMDGPU::V_MOV_B32_e32)
    .addDef(TmpReg1)
    .addUse(SrcReg, 0, AMDGPU::sub1);
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(DstReg)
    .addUse(TmpReg0)
    .addImm(AMDGPU::sub0)
    .addUse(TmpReg1)
    .addImm(AMDGPU::sub1);

  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
         constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
}
/// Utility function for pushing dynamic vector indexes with a constant offset
/// into waterfall loops.
static void reinsertVectorIndexAdd(MachineIRBuilder &B,
                                   MachineInstr &IdxUseInstr,
                                   unsigned OpIdx,
                                   unsigned ConstOffset) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S32 = LLT::scalar(32);
  Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
  B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());

  auto MaterializedOffset = B.buildConstant(S32, ConstOffset);

  auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
  MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
  MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
  IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
}
/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
/// original 32-bit source value (to be inserted in the low part of the combined
/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
/// result.
static void extendLow32IntoHigh32(MachineIRBuilder &B,
                                  Register Hi32Reg, Register Lo32Reg,
                                  unsigned ExtOpc,
                                  const RegisterBank &RegBank,
                                  bool IsBooleanSrc = false) {
  if (ExtOpc == AMDGPU::G_ZEXT) {
    B.buildConstant(Hi32Reg, 0);
  } else if (ExtOpc == AMDGPU::G_SEXT) {
    if (IsBooleanSrc) {
      // If we know the original source was an s1, the high half is the same as
      // the low.
      B.buildCopy(Hi32Reg, Lo32Reg);
    } else {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
      B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
      B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
    }
  } else {
    assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
    B.buildUndef(Hi32Reg);
  }
}
bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
    MachineIRBuilder &B, MachineInstr &MI,
    const OperandsMapper &OpdMapper) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  Register VecReg = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(2).getReg();

  const RegisterBank &IdxBank =
    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;

  LLT VecTy = MRI.getType(VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx, &Subtarget))
    return false;

  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank =
    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

  const RegisterBank &CCBank =
    (DstBank == AMDGPU::SGPRRegBank &&
     SrcBank == AMDGPU::SGPRRegBank &&
     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                     : AMDGPU::VCCRegBank;
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);

  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
  }

  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
  unsigned NumLanes = DstRegs.size();
  if (!NumLanes)
    NumLanes = 1;
  else
    EltTy = MRI.getType(DstRegs[0]);

  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
  SmallVector<Register, 2> Res(NumLanes);
  for (unsigned L = 0; L < NumLanes; ++L)
    Res[L] = UnmergeToEltTy.getReg(L);

  for (unsigned I = 1; I < NumElem; ++I) {
    auto IC = B.buildConstant(S32, I);
    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      auto S = B.buildSelect(EltTy, Cmp,
                             UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);

      for (unsigned N : { 0, 2, 3 })
        MRI.setRegBank(S->getOperand(N).getReg(), DstBank);

      Res[L] = S->getOperand(0).getReg();
    }
  }

  for (unsigned L = 0; L < NumLanes; ++L) {
    Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
    B.buildCopy(DstReg, Res[L]);
    MRI.setRegBank(DstReg, DstBank);
  }

  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
  MI.eraseFromParent();

  return true;
}
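// As a sketch (element count chosen for illustration), an extract from a
// 4-element vector with index Idx expands to:
//   R = Vec[0]
//   R = select (Idx == 1), Vec[1], R
//   R = select (Idx == 2), Vec[2], R
//   R = select (Idx == 3), Vec[3], R
// with each compare placed in CCBank and each select result in DstBank.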
// Insert a cross regbank copy for a register if it already has a bank that
// differs from the one we want to set.
static Register constrainRegToBank(MachineRegisterInfo &MRI,
                                   MachineIRBuilder &B, Register &Reg,
                                   const RegisterBank &Bank) {
  const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
  if (CurrBank && *CurrBank != Bank) {
    Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
    MRI.setRegBank(Copy, Bank);
    return Copy;
  }

  MRI.setRegBank(Reg, Bank);
  return Reg;
}
bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
    MachineIRBuilder &B, MachineInstr &MI,
    const OperandsMapper &OpdMapper) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  Register VecReg = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(3).getReg();

  const RegisterBank &IdxBank =
    *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;

  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;

  LLT VecTy = MRI.getType(VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx, &Subtarget))
    return false;

  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank =
    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank &InsBank =
    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  const RegisterBank &CCBank =
    (DstBank == AMDGPU::SGPRRegBank &&
     SrcBank == AMDGPU::SGPRRegBank &&
     InsBank == AMDGPU::SGPRRegBank &&
     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                     : AMDGPU::VCCRegBank;
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);

  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
  }

  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
  unsigned NumLanes = InsRegs.size();
  if (!NumLanes) {
    NumLanes = 1;
    InsRegs.push_back(MI.getOperand(2).getReg());
  } else {
    EltTy = MRI.getType(InsRegs[0]);
  }

  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
  SmallVector<Register, 16> Ops(NumElem * NumLanes);

  for (unsigned I = 0; I < NumElem; ++I) {
    auto IC = B.buildConstant(S32, I);
    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
      Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
      Op1 = constrainRegToBank(MRI, B, Op1, DstBank);

      Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
      MRI.setRegBank(Select, DstBank);

      Ops[I * NumLanes + L] = Select;
    }
  }

  LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
  if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
    B.buildBuildVector(MI.getOperand(0), Ops);
  } else {
    auto Vec = B.buildBuildVector(MergeTy, Ops);
    MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
    B.buildBitcast(MI.getOperand(0).getReg(), Vec);
  }

  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
  MI.eraseFromParent();

  return true;
}
// Break s_mul_u64 into 32-bit vector operations.
void AMDGPURegisterBankInfo::applyMappingSMULU64(
    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
  SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

  // All inputs are SGPRs, nothing special to do.
  if (DefRegs.empty()) {
    assert(Src0Regs.empty() && Src1Regs.empty());
    applyDefaultMapping(OpdMapper);
    return;
  }

  assert(DefRegs.size() == 2);
  assert(Src0Regs.size() == Src1Regs.size() &&
         (Src0Regs.empty() || Src0Regs.size() == 2));

  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  MachineInstr &MI = OpdMapper.getMI();
  Register DstReg = MI.getOperand(0).getReg();
  LLT HalfTy = LLT::scalar(32);

  // Depending on where the source registers came from, the generic code may
  // have decided to split the inputs already or not. If not, we still need to
  // extract the values.

  if (Src0Regs.empty())
    split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
  else
    setRegsToType(MRI, Src0Regs, HalfTy);

  if (Src1Regs.empty())
    split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
  else
    setRegsToType(MRI, Src1Regs, HalfTy);

  setRegsToType(MRI, DefRegs, HalfTy);

  // The multiplication is done as follows:
  //
  //                            Op1H  Op1L
  //                          * Op0H  Op0L
  //                   --------------------
  //                   Op1H*Op0L  Op1L*Op0L
  //          + Op1H*Op0H  Op1L*Op0H
  // -----------------------------------------
  // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
  //
  //  We drop Op1H*Op0H because the result of the multiplication is a 64-bit
  //  value and that would overflow.
  //  The low 32-bit value is Op1L*Op0L.
  //  The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
  //  Op1L*Op0L).

  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);

  Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0);
  Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0);
  Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0);
  Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0);
  B.buildAdd(DefRegs[1], Add, MulHiLo);
  B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]);

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  MI.eraseFromParent();
}
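// Quick sanity check of the decomposition above with illustrative values:
// for Op0 = 0x0000000100000002 and Op1 = 0x0000000300000004,
//   lo = Op0L*Op1L                                 = 8
//   hi = umulh(Op0L, Op1L) + Op0L*Op1H + Op0H*Op1L = 0 + 6 + 4 = 10
// which matches the low 64 bits of the full product, 0x0000000A00000008.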
void AMDGPURegisterBankInfo::applyMappingImpl(
    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  B.setInstrAndDebugLoc(MI);
  unsigned Opc = MI.getOpcode();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  switch (Opc) {
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_IMPLICIT_DEF: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(1))
      break;

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VCCRegBank)
      break;
    SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
    if (DefRegs.empty())
      DefRegs.push_back(DstReg);

    B.setInsertPt(*MI.getParent(), ++MI.getIterator());

    Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    LLVMContext &Ctx = B.getMF().getFunction().getContext();

    MI.getOperand(0).setReg(NewDstReg);
    if (Opc != AMDGPU::G_IMPLICIT_DEF) {
      uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
      MI.getOperand(1).setCImm(
          ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal));
    }

    MRI.setRegBank(NewDstReg, *DstBank);
    B.buildTrunc(DefRegs[0], NewDstReg);
    return;
  }
  case AMDGPU::G_PHI: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(1))
      break;

    const LLT S32 = LLT::scalar(32);
    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VCCRegBank) {
      applyDefaultMapping(OpdMapper);
      // The standard handling only considers the result register bank for
      // phis. For VCC, blindly inserting a copy when the phi is lowered will
      // produce an invalid copy. We can only copy with some kind of compare to
      // get a vector boolean result. Insert a register bank copy that will be
      // correctly lowered to a compare.
      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
        Register SrcReg = MI.getOperand(I).getReg();
        const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);

        if (SrcBank != &AMDGPU::VCCRegBank) {
          MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
          B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());

          auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
          MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
          MI.getOperand(I).setReg(Copy.getReg(0));
        }
      }

      return;
    }

    // Phi handling is strange and only considers the bank of the destination.
    substituteSimpleCopyRegs(OpdMapper, 0);

    // Promote SGPR/VGPR booleans to s32
    ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
    B.setInsertPt(B.getMBB(), MI);
    LegalizerHelper Helper(B.getMF(), ApplyBank, B);

    if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("widen scalar should have succeeded");

    return;
  }
  case AMDGPU::G_FCMP:
    if (!Subtarget.hasSALUFloatInsts())
      break;
    [[fallthrough]];
  case AMDGPU::G_ICMP:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE: {
    unsigned BoolDstOp =
        (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
    Register DstReg = MI.getOperand(BoolDstOp).getReg();

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank != &AMDGPU::SGPRRegBank)
      break;

    const bool HasCarryIn = MI.getNumOperands() == 5;

    // If this is a scalar compare, promote the result to s32, as the selection
    // will end up using a copy to a 32-bit vreg.
    const LLT S32 = LLT::scalar(32);
    Register NewDstReg = MRI.createGenericVirtualRegister(S32);
    MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
    MI.getOperand(BoolDstOp).setReg(NewDstReg);

    if (HasCarryIn) {
      Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
      B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
      MI.getOperand(4).setReg(NewSrcReg);
    }

    MachineBasicBlock *MBB = MI.getParent();
    B.setInsertPt(*MBB, std::next(MI.getIterator()));

    // If we had a constrained VCC result register, a copy was inserted to VCC
    // from SGPR.
    SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
    if (DefRegs.empty())
      DefRegs.push_back(DstReg);
    B.buildTrunc(DefRegs[0], NewDstReg);
    return;
  }
  case AMDGPU::G_SELECT: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
    if (CondRegs.empty())
      CondRegs.push_back(MI.getOperand(1).getReg());
    else {
      assert(CondRegs.size() == 1);
    }

    const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
    if (CondBank == &AMDGPU::SGPRRegBank) {
      const LLT S32 = LLT::scalar(32);
      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);

      MI.getOperand(1).setReg(NewCondReg);
      B.buildZExt(NewCondReg, CondRegs[0]);
    }

    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src1Regs.empty() && Src2Regs.empty());
      break;
    }

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else {
      setRegsToType(MRI, Src1Regs, HalfTy);
    }

    if (Src2Regs.empty())
      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
    else
      setRegsToType(MRI, Src2Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
    B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_BRCOND: {
    Register CondReg = MI.getOperand(0).getReg();
    // FIXME: Should use legalizer helper, but should change bool ext type.
    const RegisterBank *CondBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    if (CondBank == &AMDGPU::SGPRRegBank) {
      const LLT S32 = LLT::scalar(32);
      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);

      MI.getOperand(0).setReg(NewCondReg);
      B.buildZExt(NewCondReg, CondReg);
    }

    return;
  }
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
    // there is a VGPR input.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    if (DstTy.getSizeInBits() == 1) {
      const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
      if (DstBank == &AMDGPU::VCCRegBank)
        break;

      MachineFunction *MF = MI.getParent()->getParent();
      ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
      LegalizerHelper Helper(*MF, ApplyBank, B);

      if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
          LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");
      return;
    }

    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);
    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src0Regs.empty() && Src1Regs.empty());
      break;
    }

    assert(DefRegs.size() == 2);
    assert(Src0Regs.size() == Src1Regs.size() &&
           (Src0Regs.empty() || Src0Regs.size() == 2));

    // Depending on where the source registers came from, the generic code may
    // have decided to split the inputs already or not. If not, we still need to
    // extract the values.

    if (Src0Regs.empty())
      split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
    else
      setRegsToType(MRI, Src0Regs, HalfTy);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
    B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_ABS: {
    Register SrcReg = MI.getOperand(1).getReg();
    const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);

    // There is no VALU abs instruction so we need to replace it with a sub and
    // max combination.
    if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
      MachineFunction *MF = MI.getParent()->getParent();
      ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
      LegalizerHelper Helper(*MF, Apply, B);

      if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
        llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
      return;
    }
    [[fallthrough]];
  }
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // Special case for s_mul_u64. There is not a vector equivalent of
    // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
    // multiplications.
    if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
      applyMappingSMULU64(B, OpdMapper);
      return;
    }

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    // Packed 16-bit operations need to be scalarized and promoted.
    if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
      break;

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    const LLT S32 = LLT::scalar(32);
    MachineBasicBlock *MBB = MI.getParent();
    MachineFunction *MF = MBB->getParent();
    ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);

    if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
      Register WideSrcLo, WideSrcHi;

      std::tie(WideSrcLo, WideSrcHi) =
          unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT);
      auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo});
      auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi});
      B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
      MI.eraseFromParent();
      return;
    }

    if (DstTy.isVector()) {
      Register WideSrc0Lo, WideSrc0Hi;
      Register WideSrc1Lo, WideSrc1Hi;

      unsigned ExtendOp = getExtendOp(MI.getOpcode());
      std::tie(WideSrc0Lo, WideSrc0Hi)
        = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
      std::tie(WideSrc1Lo, WideSrc1Hi)
        = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
      auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
      auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
      B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
      MI.eraseFromParent();
    } else {
      LegalizerHelper Helper(*MF, ApplySALU, B);

      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");

      // FIXME: s16 shift amounts should be legal.
      if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
          Opc == AMDGPU::G_ASHR) {
        B.setInsertPt(*MBB, MI.getIterator());
        if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
          llvm_unreachable("widen scalar should have succeeded");
      }
    }

    return;
  }
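  // For example, an SGPR G_SMAX on <2 x s16> is scalarized by the vector path
  // above into two s32 G_SMAX operations on the sign-extended halves produced
  // by unpackV2S16ToS32, then repacked with G_BUILD_VECTOR_TRUNC.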
  case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
  case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
    // This is a special case for s_mul_u64. We use
    // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation
    // where the 33 higher bits are sign-extended and
    // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation
    // where the 32 higher bits are zero-extended. In case scalar registers are
    // selected, both opcodes are lowered as s_mul_u64. If the vector registers
    // are selected, then G_AMDGPU_S_MUL_I64_I32 and
    // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction.

    // Insert basic copies.
    applyDefaultMapping(OpdMapper);

    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg0 = MI.getOperand(1).getReg();
    Register SrcReg1 = MI.getOperand(2).getReg();
    const LLT S32 = LLT::scalar(32);
    const LLT S64 = LLT::scalar(64);
    assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
                                         "that handles only 64-bit operands.");
    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
    // with s_mul_u64 operation.
    if (DstBank == &AMDGPU::SGPRRegBank) {
      MI.setDesc(TII->get(AMDGPU::S_MUL_U64));
      MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass);
      MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass);
      MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass);
      return;
    }

    // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
    // with a vector mad.
    assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&
           "The destination operand should be in vector registers.");

    DebugLoc DL = MI.getDebugLoc();

    // Extract the lower subregister from the first operand.
    Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass);
    MRI.setType(Op0L, S32);
    B.buildTrunc(Op0L, SrcReg0);

    // Extract the lower subregister from the second operand.
    Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass);
    MRI.setType(Op1L, S32);
    B.buildTrunc(Op1L, SrcReg1);

    unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
                          ? AMDGPU::G_AMDGPU_MAD_U64_U32
                          : AMDGPU::G_AMDGPU_MAD_I64_I32;

    MachineIRBuilder B(MI);
    Register Zero64 = B.buildConstant(S64, 0).getReg(0);
    MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass);
    Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass);
    B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64});
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_SEXT_INREG: {
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    if (SrcRegs.empty())
      break; // Nothing to repair

    const LLT S32 = LLT::scalar(32);
    ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);

    // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
    // we would need to further expand, and doesn't let us directly set the
    // result registers.
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    int Amt = MI.getOperand(2).getImm();
    if (Amt <= 32) {
      // Downstream users have expectations for the high bit behavior, so freeze
      // incoming undefined bits.
      if (Amt == 32) {
        // The low bits are unchanged.
        B.buildFreeze(DstRegs[0], SrcRegs[0]);
      } else {
        auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
        // Extend in the low bits and propagate the sign bit to the high half.
        B.buildSExtInReg(DstRegs[0], Freeze, Amt);
      }

      B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
    } else {
      // The low bits are unchanged, and extend in the high bits.
      // No freeze required
      B.buildCopy(DstRegs[0], SrcRegs[0]);
      B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
    }

    Register DstReg = MI.getOperand(0).getReg();
    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
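  // For example, G_SEXT_INREG %x(s64), 8 on the VALU path becomes a freeze of
  // the low half, an 8-bit sext_inreg into DstRegs[0], and an ashr by 31 for
  // DstRegs[1]; for an amount greater than 32 only the high half needs the
  // sext_inreg.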
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BITREVERSE: {
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);

    MachineFunction &MF = B.getMF();
    LegalizerHelper Helper(MF, ApplyVALU, B);

    if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("narrowScalar should have succeeded");
    return;
  }
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FFBL_B32:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    // We can narrow this more efficiently than Helper can by using ffbh/ffbl
    // which return -1 when the input is zero:
    // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
    // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
    // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
    // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
    ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
                          ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
                          : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
                                ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
                                : Opc;
    unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
    auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
    auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
    unsigned AddOpc =
        Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
            ? AMDGPU::G_ADD
            : AMDGPU::G_UADDSAT;
    Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
    Register DstReg = MI.getOperand(0).getReg();
    B.buildUMin(DstReg, X, Y);
    MI.eraseFromParent();
    return;
  }
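  // For example, a 64-bit G_CTLZ_ZERO_UNDEF on VGPRs becomes
  //   umin(ffbh(hi), ffbh(lo) + 32)
  // per the table above; the zero-undef forms can use a plain add for the
  // +32, while the ffbh/ffbl forms use uaddsat so an all-zero half's -1
  // stays at -1 after the offset.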
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_ANYEXT: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    const bool Signed = Opc == AMDGPU::G_SEXT;

    assert(OpdMapper.getVRegs(1).empty());

    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that round to s64 when irregular
        // breakdowns supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      // Extend to 32-bit, and then extend the low half.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(DefRegs[0], SrcReg);
      } else if (Opc == AMDGPU::G_ZEXT) {
        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
      } else {
        B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
      }

      extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
      MRI.setRegBank(DstReg, *SrcBank);
      MI.eraseFromParent();
      return;
    }

    if (SrcTy != LLT::scalar(1))
      return;

    // It is not legal to have a legalization artifact with a VCC source. Rather
    // than introducing a copy, insert the select we would have to select the
    // copy to.
    if (SrcBank == &AMDGPU::VCCRegBank) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;

      unsigned DstSize = DstTy.getSizeInBits();
      // 64-bit select is SGPR only
      const bool UseSel64 = DstSize > 32 &&
        SrcBank->getID() == AMDGPU::SGPRRegBankID;

      // TODO: Should s16 select be legal?
      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
      auto True = B.buildConstant(SelType, Signed ? -1 : 1);
      auto False = B.buildConstant(SelType, 0);

      MRI.setRegBank(True.getReg(0), *DstBank);
      MRI.setRegBank(False.getReg(0), *DstBank);
      MRI.setRegBank(DstReg, *DstBank);

      if (DstSize > 32) {
        B.buildSelect(DefRegs[0], SrcReg, True, False);
        extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
      } else if (DstSize < 32) {
        auto Sel = B.buildSelect(SelType, SrcReg, True, False);
        MRI.setRegBank(Sel.getReg(0), *DstBank);
        B.buildTrunc(DstReg, Sel);
      } else {
        B.buildSelect(DstReg, SrcReg, True, False);
      }

      MI.eraseFromParent();
      return;
    }

    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());

    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();

    const LLT S32 = LLT::scalar(32);
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);

    if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
      return;

    const ValueMapping &DstMapping
      = OpdMapper.getInstrMapping().getOperandMapping(0);
    const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *IdxBank =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(BaseIdxReg, ConstOffset) =
        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());

    // See if the index is an add of a constant which will be foldable by moving
    // the base register of the index later if this is going to be executed in a
    // waterfall loop. This is essentially to reassociate the add of a constant
    // with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
                                   ConstOffset > 0 &&
                                   ConstOffset < SrcTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(2).setReg(BaseIdxReg);

    // If this is a VGPR result only because the index was a VGPR result, the
    // actual indexing will be done on the SGPR source vector, which will
    // produce a scalar result. We need to copy to the VGPR result inside the
    // waterfall loop.
    const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
                                SrcBank == &AMDGPU::SGPRRegBank;
    if (DstRegs.empty()) {
      applyDefaultMapping(OpdMapper);

      executeInWaterfallLoop(B, MI, {2});

      if (NeedCopyToVGPR) {
        // We don't want a phi for this temporary reg.
        Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
        MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
        MI.getOperand(0).setReg(TmpReg);
        B.setInsertPt(*MI.getParent(), ++MI.getIterator());

        // Use a v_mov_b32 here to make the exec dependency explicit.
        buildVCopy(B, DstReg, TmpReg);
      }

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop)
        reinsertVectorIndexAdd(B, MI, 2, ConstOffset);

      return;
    }

    assert(DstTy.getSizeInBits() == 64);

    LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);

    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
    auto One = B.buildConstant(S32, 1);

    MachineBasicBlock::iterator MII = MI.getIterator();

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MII, &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
    auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);

    MRI.setRegBank(DstReg, *DstBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
      MI.eraseFromParent();
      return;
    }

    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall);

    if (NeedCopyToVGPR) {
      MachineBasicBlock *LoopBB = Extract1->getParent();
      Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
      Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
      MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);

      Extract0->getOperand(0).setReg(TmpReg0);
      Extract1->getOperand(0).setReg(TmpReg1);

      B.setInsertPt(*LoopBB, ++Extract1->getIterator());

      buildVCopy(B, DstRegs[0], TmpReg0);
      buildVCopy(B, DstRegs[1], TmpReg1);
    }

    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);

    return;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));

    Register DstReg = MI.getOperand(0).getReg();
    LLT VecTy = MRI.getType(DstReg);

    assert(OpdMapper.getVRegs(0).empty());
    assert(OpdMapper.getVRegs(3).empty());

    if (substituteSimpleCopyRegs(OpdMapper, 1))
      MRI.setType(MI.getOperand(1).getReg(), VecTy);

    if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
      return;

    const RegisterBank *IdxBank =
      OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;

    Register SrcReg = MI.getOperand(1).getReg();
    Register InsReg = MI.getOperand(2).getReg();
    LLT InsTy = MRI.getType(InsReg);

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(BaseIdxReg, ConstOffset) =
        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());

    // See if the index is an add of a constant which will be foldable by moving
    // the base register of the index later if this is going to be executed in a
    // waterfall loop. This is essentially to reassociate the add of a constant
    // with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
                                   ConstOffset > 0 &&
                                   ConstOffset < VecTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(3).setReg(BaseIdxReg);

    if (InsRegs.empty()) {
      executeInWaterfallLoop(B, MI, {3});

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop) {
        reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
      }

      return;
    }

    assert(InsTy.getSizeInBits() == 64);

    const LLT S32 = LLT::scalar(32);
    LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);

    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
    auto One = B.buildConstant(S32, 1);

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
    auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *InsSrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    MRI.setRegBank(InsReg, *InsSrcBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(InsLo.getReg(0), *DstBank);
    MRI.setRegBank(InsHi.getReg(0), *DstBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
      B.setInsertPt(B.getMBB(), MI);
      B.buildBitcast(DstReg, InsHi);
      MI.eraseFromParent();
      return;
    }

    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    // Figure out the point after the waterfall loop before mangling the control
    // flow.
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall);

    // The insertion point is now right after the original instruction.
    //
    // Keep the bitcast to the original vector type out of the loop. Doing this
    // saved an extra phi we don't need inside the loop.
    B.buildBitcast(DstReg, InsHi);

    // Re-insert the constant offset add inside the waterfall loop.
    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);

    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(B, MI, {1, 4});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(B, MI, {2, 5});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(B, MI, {2, 5});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(B, MI, {3, 6});
    return;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
    applyMappingSBufferLoad(B, OpdMapper);
    return;
  }
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
    case Intrinsic::amdgcn_readlane: {
      substituteSimpleCopyRegs(OpdMapper, 2);

      assert(OpdMapper.getVRegs(0).empty());
      assert(OpdMapper.getVRegs(3).empty());

      // Make sure the index is an SGPR. It doesn't make sense to run this in a
      // waterfall loop, so assume it's a uniform value.
      constrainOpWithReadfirstlane(B, MI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_writelane: {
      assert(OpdMapper.getVRegs(0).empty());
      assert(OpdMapper.getVRegs(2).empty());
      assert(OpdMapper.getVRegs(3).empty());

      substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
      constrainOpWithReadfirstlane(B, MI, 2); // Source value
      constrainOpWithReadfirstlane(B, MI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_lds_param_load: {
      applyDefaultMapping(OpdMapper);

      // Readlane for m0 value, which is always the last operand.
      // FIXME: Should this be a waterfall loop instead?
      constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
      return;
    }
    case Intrinsic::amdgcn_interp_inreg_p10:
    case Intrinsic::amdgcn_interp_inreg_p2:
    case Intrinsic::amdgcn_interp_inreg_p10_f16:
    case Intrinsic::amdgcn_interp_inreg_p2_f16:
    case Intrinsic::amdgcn_interp_p10_rtz_f16:
    case Intrinsic::amdgcn_interp_p2_rtz_f16:
      applyDefaultMapping(OpdMapper);
      return;
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      // Doing a waterfall loop over these wouldn't make any sense.
      substituteSimpleCopyRegs(OpdMapper, 2);
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(B, MI, 4);
      constrainOpWithReadfirstlane(B, MI, 5);
      return;
    }
    case Intrinsic::amdgcn_sbfe:
      applyMappingBFE(B, OpdMapper, true);
      return;
    case Intrinsic::amdgcn_ubfe:
      applyMappingBFE(B, OpdMapper, false);
      return;
    case Intrinsic::amdgcn_inverse_ballot:
    case Intrinsic::amdgcn_s_bitreplicate:
    case Intrinsic::amdgcn_s_quadmask:
    case Intrinsic::amdgcn_s_wqm:
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(B, MI, 2); // Mask
      return;
    case Intrinsic::amdgcn_ballot:
      // Use default handling and insert copy to vcc source.
      break;
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    const AMDGPU::RsrcIntrinsic *RSrcIntrin =
        AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
    assert(RSrcIntrin && RSrcIntrin->IsImage);
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
    return;
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(B, MI, {N});
    return;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
    auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
      assert(OpdMapper.getVRegs(0).empty());
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(B, MI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Only the first lane executes, so readfirstlane is safe.
      substituteSimpleCopyRegs(OpdMapper, 1);
      constrainOpWithReadfirstlane(B, MI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // Only the first lane executes, so readfirstlane is safe.
      constrainOpWithReadfirstlane(B, MI, 1); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      constrainOpWithReadfirstlane(B, MI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // FIXME: Should this use a waterfall loop?
      constrainOpWithReadfirstlane(B, MI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_setreg: {
      constrainOpWithReadfirstlane(B, MI, 2);
      return;
    }
    case Intrinsic::amdgcn_s_ttracedata:
      constrainOpWithReadfirstlane(B, MI, 1); // M0
      return;
    case Intrinsic::amdgcn_raw_buffer_load_lds:
    case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(B, MI, 1); // rsrc
      constrainOpWithReadfirstlane(B, MI, 2); // M0
      constrainOpWithReadfirstlane(B, MI, 5); // soffset
      return;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds:
    case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(B, MI, 1); // rsrc
      constrainOpWithReadfirstlane(B, MI, 2); // M0
      constrainOpWithReadfirstlane(B, MI, 6); // soffset
      return;
    }
    case Intrinsic::amdgcn_global_load_lds: {
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(B, MI, 2);
      return;
    }
    case Intrinsic::amdgcn_lds_direct_load: {
      applyDefaultMapping(OpdMapper);
      // Readlane for m0 value, which is always the last operand.
      constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
      return;
    }
    case Intrinsic::amdgcn_exp_row:
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(B, MI, 8); // M0
      return;
    case Intrinsic::amdgcn_s_sleep_var:
      assert(OpdMapper.getVRegs(1).empty());
      constrainOpWithReadfirstlane(B, MI, 1);
      return;
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_wakeup_barrier:
      constrainOpWithReadfirstlane(B, MI, 1);
      return;
    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
      constrainOpWithReadfirstlane(B, MI, 2);
      return;
    case Intrinsic::amdgcn_s_barrier_init:
      constrainOpWithReadfirstlane(B, MI, 1);
      constrainOpWithReadfirstlane(B, MI, 2);
      return;
    case Intrinsic::amdgcn_s_get_barrier_state: {
      constrainOpWithReadfirstlane(B, MI, 2);
      return;
    }
    default: {
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
        if (RSrcIntrin->IsImage) {
          applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
          return;
        }
      }

      break;
    }
    }
    break;
  }
  case AMDGPU::G_SI_CALL: {
    // Use a set to avoid extra readfirstlanes in the case where multiple
    // operands are the same register.
    SmallSet<Register, 4> SGPROperandRegs;

    if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
      break;

    // Move all copies to physical SGPRs that are used by the call instruction
    // into the loop block. Start searching for these copies until the
    // ADJCALLSTACKUP.
    unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
    unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;

    // Move all non-copies before the copies, so that a complete range can be
    // moved into the waterfall loop.
    SmallVector<MachineInstr *, 4> NonCopyInstrs;
    // Count of NonCopyInstrs found until the current LastCopy.
    unsigned NonCopyInstrsLen = 0;
    MachineBasicBlock::iterator Start(&MI);
    MachineBasicBlock::iterator LastCopy = Start;
    MachineBasicBlock *MBB = MI.getParent();
    const SIMachineFunctionInfo *Info =
        MBB->getParent()->getInfo<SIMachineFunctionInfo>();
    while (Start->getOpcode() != FrameSetupOpcode) {
      --Start;
      bool IsCopy = false;
      if (Start->getOpcode() == AMDGPU::COPY) {
        auto &Dst = Start->getOperand(0);
        if (Dst.isReg()) {
          Register Reg = Dst.getReg();
          if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
            IsCopy = true;
          } else {
            // Also move the copy from the scratch rsrc descriptor into the loop
            // to allow it to be optimized away.
            auto &Src = Start->getOperand(1);
            if (Src.isReg()) {
              Reg = Src.getReg();
              IsCopy = Info->getScratchRSrcReg() == Reg;
            }
          }
        }
      }

      if (IsCopy) {
        LastCopy = Start;
        NonCopyInstrsLen = NonCopyInstrs.size();
      } else {
        NonCopyInstrs.push_back(&*Start);
      }
    }
    Start = LastCopy;
    NonCopyInstrs.resize(NonCopyInstrsLen);

    for (auto *NonCopy : reverse(NonCopyInstrs)) {
      MBB->splice(LastCopy, MBB, NonCopy->getIterator());
    }

    // Do the same for copies after the loop
    NonCopyInstrs.clear();
    NonCopyInstrsLen = 0;
    MachineBasicBlock::iterator End(&MI);
    LastCopy = End;
    while (End->getOpcode() != FrameDestroyOpcode) {
      ++End;
      bool IsCopy = false;
      if (End->getOpcode() == AMDGPU::COPY) {
        auto &Src = End->getOperand(1);
        if (Src.isReg()) {
          Register Reg = Src.getReg();
          IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
        }
      }

      if (IsCopy) {
        LastCopy = End;
        NonCopyInstrsLen = NonCopyInstrs.size();
      } else {
        NonCopyInstrs.push_back(&*End);
      }
    }
    End = LastCopy;
    NonCopyInstrs.resize(NonCopyInstrsLen);

    for (auto *NonCopy : reverse(NonCopyInstrs)) {
      MBB->splice(LastCopy, MBB, NonCopy->getIterator());
    }

    ++End;
    B.setInsertPt(B.getMBB(), Start);
    executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs);
    break;
  }
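  // For G_SI_CALL above, the net effect is that every copy feeding the call's
  // physical SGPR arguments (together with the scratch rsrc copy) ends up in
  // one contiguous range with the call itself, so executeInWaterfallLoop can
  // wrap that whole range in a single waterfall loop over the divergent
  // callee address.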
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD: {
    if (applyMappingLoad(B, OpdMapper, MI))
      return;
    break;
  }
  case AMDGPU::G_DYN_STACKALLOC:
    applyMappingDynStackAlloc(B, OpdMapper, MI);
    return;
  case AMDGPU::G_STACKRESTORE: {
    applyDefaultMapping(OpdMapper);
    constrainOpWithReadfirstlane(B, MI, 0);
    return;
  }
  case AMDGPU::G_SBFX:
    applyMappingBFE(B, OpdMapper, /*Signed*/ true);
    return;
  case AMDGPU::G_UBFX:
    applyMappingBFE(B, OpdMapper, /*Signed*/ false);
    return;
  case AMDGPU::G_AMDGPU_MAD_U64_U32:
  case AMDGPU::G_AMDGPU_MAD_I64_I32:
    applyMappingMAD_64_32(B, OpdMapper);
    return;
  case AMDGPU::G_PREFETCH: {
    if (!Subtarget.hasPrefetch()) {
      MI.eraseFromParent();
      return;
    }
    Register PtrReg = MI.getOperand(0).getReg();
    unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
    if (PtrBank == AMDGPU::VGPRRegBankID) {
      MI.eraseFromParent();
      return;
    }
    unsigned AS = MRI.getType(PtrReg).getAddressSpace();
    if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
        AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
      MI.eraseFromParent();
      return;
    }
    applyDefaultMapping(OpdMapper);
    return;
  }
  default:
    break;
  }

  return applyDefaultMapping(OpdMapper);
}
// vgpr, sgpr -> vgpr
// vgpr, agpr -> vgpr
// agpr, agpr -> agpr
// agpr, sgpr -> vgpr
static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
  if (RB0 == AMDGPU::InvalidRegBankID)
    return RB1;
  if (RB1 == AMDGPU::InvalidRegBankID)
    return RB0;

  if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
    return AMDGPU::SGPRRegBankID;

  if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
    return AMDGPU::AGPRRegBankID;

  return AMDGPU::VGPRRegBankID;
}

static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
  if (RB0 == AMDGPU::InvalidRegBankID)
    return RB1;
  if (RB1 == AMDGPU::InvalidRegBankID)
    return RB0;

  // Any VCC input makes the combined boolean a VCC value.
  if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
    return AMDGPU::VCCRegBankID;

  // vcc, vgpr -> vgpr
  return regBankUnion(RB0, RB1);
}
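// For example, regBankUnion(AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID)
// yields AMDGPU::VGPRRegBankID, since one divergent input makes the combined
// value divergent, while regBankBoolUnion(AMDGPU::SGPRRegBankID,
// AMDGPU::VCCRegBankID) yields AMDGPU::VCCRegBankID, since any VCC input
// forces the boolean result into the VCC bank.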
unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
                                                const MachineInstr &MI) const {
  unsigned RegBank = AMDGPU::InvalidRegBankID;

  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
      RegBank = regBankUnion(RegBank, Bank->getID());
      if (RegBank == AMDGPU::VGPRRegBankID)
        break;
    }
  }

  return RegBank;
}
bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
      if (Bank->getID() != AMDGPU::SGPRRegBankID)
        return false;
    }
  }
  return true;
}
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    const MachineOperand &SrcOp = MI.getOperand(i);
    if (!SrcOp.isReg())
      continue;

    unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
    OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
  }
  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());

  // Even though we technically could use SGPRs, this would require knowledge
  // of the constant bus restriction. Force all sources to VGPR (except for
  // VCC).
  //
  // TODO: Unary ops are trivially OK, so accept SGPRs?
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    const MachineOperand &Src = MI.getOperand(i);
    if (!Src.isReg())
      continue;

    unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
    OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
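// For example, getDefaultMappingVOP maps a 32-bit G_ADD destined for the VALU
// to VGPRs for the result and both sources even if one source currently lives
// in an SGPR; RegBankSelect then materializes the (legal) SGPR->VGPR copies.
// An s1 operand is instead mapped to the VCC bank.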
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Op = MI.getOperand(I);
    if (!Op.isReg())
      continue;

    unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
                                        const MachineInstr &MI,
                                        int RsrcIdx) const {
  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += MI.getNumExplicitDefs() + 1;

  const int NumOps = MI.getNumOperands();
  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);

  // TODO: Should packed/unpacked D16 difference be reported here as part of
  // the value mapping?
  for (int I = 0; I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    Register OpReg = MI.getOperand(I).getReg();
    // We replace some dead address operands with $noreg.
    if (!OpReg)
      continue;

    unsigned Size = getSizeInBits(OpReg, MRI, *TRI);

    // FIXME: Probably need a new intrinsic register bank searchable table to
    // handle arbitrary intrinsics easily.
    //
    // If this has a sampler, it immediately follows rsrc.
    const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;

    if (MustBeSGPR) {
      // If this must be an SGPR, we must report whatever it is as legal.
      unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
    } else {
      // Some operands must be VGPR, and these are easy to copy to.
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    }
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}
/// Return the mapping for a pointer argument.
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
                                              Register PtrReg) const {
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned Size = PtrTy.getSizeInBits();
  if (Subtarget.useFlatForGlobal() ||
      !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
    return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

  // If we're using MUBUF instructions for global memory, an SGPR base register
  // is possible. Otherwise this needs to be a VGPR.
  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
  return AMDGPU::getValueMapping(PtrBank->getID(), Size);
}
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping *, 2> OpdsMapping(2);
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AS = PtrTy.getAddressSpace();
  unsigned PtrSize = PtrTy.getSizeInBits();

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);

  if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
    if (isScalarLoadLegal(MI)) {
      // We have a uniform instruction so we want to use an SMRD load
      ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
    } else {
      ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      // If we're using MUBUF instructions for global memory, an SGPR base
      // register is possible. Otherwise this needs to be a VGPR.
      unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
        AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;

      PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
    }
  } else {
    ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;
  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
  return Mapping;

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
}
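// For example, getInstrMappingForLoad maps a G_LOAD of a global pointer that
// sits in the SGPR bank and satisfies isScalarLoadLegal() entirely to SGPRs
// (a scalar SMEM load); otherwise the loaded value is mapped to VGPRs.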
unsigned
AMDGPURegisterBankInfo::getRegBankID(Register Reg,
                                     const MachineRegisterInfo &MRI,
                                     unsigned Default) const {
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  return Bank ? Bank->getID() : Default;
}
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  // Lie and claim anything is legal, even though this needs to be an SGPR;
  // applyMapping will have to deal with it as a waterfall loop.
  unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(Bank, Size);
}
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
}
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR-to-SGPR copy to be generated is illegal.
//
// Operands that must be SGPRs must accept potentially divergent VGPRs as
// legal. These will be dealt with in applyMappingImpl.
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
    // The default logic bothers to analyze impossible alternative mappings. We
    // want the most straightforward mapping, so just directly handle this.
    const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
                                             *TRI);
    const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
                                             *TRI);
    assert(SrcBank && "src bank should have been assigned already");
    if (!DstBank)
      DstBank = SrcBank;

    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (MI.getOpcode() != AMDGPU::G_FREEZE &&
        cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
      return getInvalidInstructionMapping();

    const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
    unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
    SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
    OpdsMapping[0] = &ValMap;
    if (MI.getOpcode() == AMDGPU::G_FREEZE)
      OpdsMapping[1] = &ValMap;

    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
  }
  if (MI.isRegSequence()) {
    // If any input is a VGPR, the result must be a VGPR. The default handling
    // assumes any copy between banks is legal.
    unsigned BankID = AMDGPU::SGPRRegBankID;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
      // It doesn't make sense to use vcc or scc banks here, so just ignore
      // them.
      if (OpBank != AMDGPU::SGPRRegBankID) {
        BankID = AMDGPU::VGPRRegBankID;
        break;
      }
    }
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }
  // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
  // properly.
  //
  // TODO: There are additional exec masking dependencies to analyze.
  if (auto *PHI = dyn_cast<GPhi>(&MI)) {
    unsigned ResultBank = AMDGPU::InvalidRegBankID;
    Register DstReg = PHI->getReg(0);

    // Sometimes the result may have already been assigned a bank.
    if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
      ResultBank = DstBank->getID();

    for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {
      Register Reg = PHI->getIncomingValue(I);
      const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);

      // FIXME: Assuming VGPR for any undetermined inputs.
      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
        ResultBank = AMDGPU::VGPRRegBankID;
        break;
      }

      // FIXME: Need to promote SGPR case to s32
      unsigned OpBank = Bank->getID();
      ResultBank = regBankBoolUnion(ResultBank, OpBank);
    }

    assert(ResultBank != AMDGPU::InvalidRegBankID);

    unsigned Size = MRI.getType(DstReg).getSizeInBits();

    const ValueMapping &ValMap =
        getValueMapping(0, Size, getRegBank(ResultBank));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }
  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
  if (Mapping.isValid())
    return Mapping;

  SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());

  switch (MI.getOpcode()) {
  default:
    return getInvalidInstructionMapping();
  case AMDGPU::G_MUL: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    if (Size == 1) {
      const RegisterBank *DstBank
        = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);

      unsigned TargetBankID = AMDGPU::InvalidRegBankID;
      unsigned BankLHS = AMDGPU::InvalidRegBankID;
      unsigned BankRHS = AMDGPU::InvalidRegBankID;
      if (DstBank) {
        TargetBankID = DstBank->getID();
        if (DstBank == &AMDGPU::VCCRegBank) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else {
          BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
          BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
        }
      } else {
        BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
                               AMDGPU::VCCRegBankID);
        BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
                               AMDGPU::VCCRegBankID);

        // Both inputs should be true booleans to produce a boolean result.
        if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
          TargetBankID = AMDGPU::VGPRRegBankID;
        } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
          TargetBankID = AMDGPU::SGPRRegBankID;
        }
      }

      OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
      break;
    }

    if (Size == 64) {
      if (isSALUMapping(MI)) {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
        OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
      } else {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
        unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
        OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);

        unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
        OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
      }

      break;
    }

    [[fallthrough]];
  }
  case AMDGPU::G_PTR_ADD:
  case AMDGPU::G_PTRMASK:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX:
  case AMDGPU::G_SHUFFLE_VECTOR:
  case AMDGPU::G_SBFX:
  case AMDGPU::G_UBFX:
  case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
  case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
    if (isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
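  // The integer cases above stay on the SALU mapping only when every register
  // operand is already in the SGPR bank; a single divergent (VGPR) operand
  // switches the whole instruction to the default VALU mapping.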
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_STRICT_FADD:
  case AMDGPU::G_STRICT_FSUB:
  case AMDGPU::G_STRICT_FMUL:
  case AMDGPU::G_STRICT_FMA: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = Ty.getSizeInBits();
    if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
        (Size == 32 || Size == 16) && isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  }
  case AMDGPU::G_FPTOSI:
  case AMDGPU::G_FPTOUI:
  case AMDGPU::G_SITOFP:
  case AMDGPU::G_UITOFP: {
    unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
        isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  }
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FPEXT: {
    unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
        isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  }
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FEXP2:
  case AMDGPU::G_FLOG2: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
        isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  }
  case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
  case AMDGPU::G_SSUBSAT:
  case AMDGPU::G_UADDSAT:
  case AMDGPU::G_USUBSAT:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FLDEXP:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_STRICT_FLDEXP:
  case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
  case AMDGPU::G_FSHR: // TODO: Expand for scalar
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
  case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_FMED3:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_UMULH:
  case AMDGPU::G_SMULH: {
    if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  }
  case AMDGPU::G_AMDGPU_MAD_U64_U32:
  case AMDGPU::G_AMDGPU_MAD_I64_I32: {
    // Three possible mappings:
    //
    //  - Default SOP
    //  - Default VOP
    //  - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
    //
    // This allows instruction selection to keep the multiplication part of the
    // instruction on the SALU.
    bool AllSalu = true;
    bool MulSalu = true;
    for (unsigned i = 0; i < 5; ++i) {
      Register Reg = MI.getOperand(i).getReg();
      if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
        if (Bank->getID() != AMDGPU::SGPRRegBankID) {
          AllSalu = false;
          if (i == 2 || i == 3) {
            MulSalu = false;
            break;
          }
        }
      }
    }

    if (AllSalu)
      return getDefaultMappingSOP(MI);

    // If the multiply-add is full-rate in VALU, use that even if the
    // multiplication part is scalar. Accumulating separately on the VALU would
    // take two instructions.
    if (!MulSalu || Subtarget.hasFullRate64Ops())
      return getDefaultMappingVOP(MI);

    // Keep the multiplication on the SALU, then accumulate on the VALU.
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
    OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
    OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
    OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
    break;
  }
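  // Worked example for the mixed mapping above: if the two multiply inputs
  // (operands 2 and 3) are uniform but the 64-bit addend is divergent, the
  // multiply can stay on the SALU while the add and carry-out are mapped to
  // VGPR/VCC, avoiding a move of the whole MAD onto the VALU.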
  case AMDGPU::G_IMPLICIT_DEF: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FCONSTANT:
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_READSTEADYCOUNTER:
  case AMDGPU::G_READCYCLECOUNTER: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FRAME_INDEX: {
    // TODO: This should be the same as other constants, but eliminateFrameIndex
    // currently assumes VALU uses.
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_DYN_STACKALLOC: {
    // Result is always uniform, and a wave reduction is needed for the source.
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
    break;
  }
  case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
    // This case is weird because we expect a physical register in the source,
    // but need to set a bank anyway.
    //
    // TODO: We could select the result to SGPR or VGPR
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
    OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
    break;
  }
  case AMDGPU::G_INSERT: {
    unsigned BankID = getMappingType(MRI, MI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
    OpdsMapping[3] = nullptr;
    break;
  }
  case AMDGPU::G_EXTRACT: {
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = nullptr;
    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == LLT::fixed_vector(2, 16)) {
      unsigned DstSize = DstTy.getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
      unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
      unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);

      OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
      break;
    }

    [[fallthrough]];
  }
  case AMDGPU::G_MERGE_VALUES:
  case AMDGPU::G_CONCAT_VECTORS: {
    unsigned Bank = getMappingType(MRI, MI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    // Op1 and Dst should use the same register bank.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_BITREVERSE:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_INTTOPTR:
  case AMDGPU::G_PTRTOINT:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FNEG: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FFBL_B32:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
    OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
    break;
  }
  case AMDGPU::G_CTPOP: {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);

    // This should really be getValueMappingSGPR64Only, but allowing the generic
    // code to handle the register split just makes using LegalizerHelper more
    // difficult.
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_TRUNC: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned Bank = getRegBankID(Src, MRI);
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_SEXT_INREG: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);

    unsigned DstBank;
    const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
    switch (SrcBank->getID()) {
    case AMDGPU::SGPRRegBankID:
      DstBank = AMDGPU::SGPRRegBankID;
      break;
    default:
      DstBank = AMDGPU::VGPRRegBankID;
      break;
    }

    // Scalar extend can use 64-bit BFE, but VGPRs require extending to
    // 32-bits, and then to 64.
    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
                                                       SrcSize);
    break;
  }
  case AMDGPU::G_IS_FPCLASS: {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
    break;
  }
  case AMDGPU::G_STORE: {
    assert(MI.getOperand(0).isReg());
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    // FIXME: We need to specify a different reg bank once scalar stores are
    // supported.
    const ValueMapping *ValMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    OpdsMapping[0] = ValMapping;
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    break;
  }
  case AMDGPU::G_ICMP:
  case AMDGPU::G_FCMP: {
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

    // See if the result register has already been constrained to vcc, which may
    // happen due to control flow intrinsic lowering.
    unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);

    auto canUseSCCICMP = [&]() {
      auto Pred =
          static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
      return Size == 32 ||
             (Size == 64 &&
              (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
              Subtarget.hasScalarCompareEq64());
    };
    auto canUseSCCFCMP = [&]() {
      return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
    };

    bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
    bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
                     Op2Bank == AMDGPU::SGPRRegBankID &&
                     Op3Bank == AMDGPU::SGPRRegBankID &&
                     (isICMP ? canUseSCCICMP() : canUseSCCFCMP());

    DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    // TODO: Use 32-bit for scalar output size.
    // SCC results will need to be copied to a 32-bit SGPR virtual register.
    const unsigned ResultSize = 1;

    OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
    break;
  }
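  // Worked example for the compare mapping above: a 32-bit G_ICMP whose
  // destination and both sources are all SGPR is mapped to the SGPR bank
  // (an s_cmp producing SCC); any VGPR input instead produces a VCC result
  // with VGPR-mapped sources (a v_cmp writing a lane mask).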
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    // VGPR index can be used for waterfall when indexing a SGPR vector.
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);

    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);

    // The index can be either if the source vector is VGPR.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
    unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);

    // This is a weird case, because we need to break down the mapping based on
    // the register bank of a different operand.
    if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
      OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
                                                      InsertSize);
    } else {
      assert(InsertSize == 32 || InsertSize == 64);
      OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
    }

    // The index can be either if the source vector is VGPR.
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
    break;
  }
  case AMDGPU::G_UNMERGE_VALUES: {
    unsigned Bank = getMappingType(MRI, MI);

    // Op1 and Dst should use the same register bank.
    // FIXME: Shouldn't this be the default? Why do we need to handle this?
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
    }
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // cmp
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // We need to convert this to a MUBUF if either the resource or offset is
    // a VGPR.
    unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
    unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
    unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);

    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
    break;
  }
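  // For the scalar buffer load cases above: if either the resource or the
  // offset has been assigned the VGPR bank, ResultBank becomes VGPR, and the
  // mapping application later rewrites the operation as a VGPR-addressed
  // (MUBUF-style) buffer load instead of an SMEM load.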
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fma_legacy:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_mul_u24:
    case Intrinsic::amdgcn_mul_i24:
    case Intrinsic::amdgcn_mulhi_u24:
    case Intrinsic::amdgcn_mulhi_i24:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_perm:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
    case Intrinsic::amdgcn_fdot2_bf16_bf16:
    case Intrinsic::amdgcn_fdot2_f16_f16:
    case Intrinsic::amdgcn_fdot2_f32_bf16:
    case Intrinsic::amdgcn_sudot4:
    case Intrinsic::amdgcn_sudot8:
    case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
    case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
    case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
    case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
    case Intrinsic::amdgcn_cvt_f32_fp8:
    case Intrinsic::amdgcn_cvt_f32_bf8:
    case Intrinsic::amdgcn_cvt_pk_f32_fp8:
    case Intrinsic::amdgcn_cvt_pk_f32_bf8:
    case Intrinsic::amdgcn_cvt_pk_fp8_f32:
    case Intrinsic::amdgcn_cvt_pk_bf8_f32:
    case Intrinsic::amdgcn_cvt_sr_fp8_f32:
    case Intrinsic::amdgcn_cvt_sr_bf8_f32:
    case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
    case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
    case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
    case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
    case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
    case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
    case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
    case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
    case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_sqrt: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
          isSALUMapping(MI))
        return getDefaultMappingSOP(MI);
      return getDefaultMappingVOP(MI);
    }
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_ubfe:
      if (isSALUMapping(MI))
        return getDefaultMappingSOP(MI);
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_ds_swizzle:
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
    case Intrinsic::amdgcn_mov_dpp8:
    case Intrinsic::amdgcn_mov_dpp:
    case Intrinsic::amdgcn_strict_wwm:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_strict_wqm:
    case Intrinsic::amdgcn_wqm:
    case Intrinsic::amdgcn_softwqm:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
    case Intrinsic::amdgcn_permlane64:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_cvt_pkrtz:
      if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
        return getDefaultMappingSOP(MI);
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize:
    case Intrinsic::amdgcn_reloc_constant:
    case Intrinsic::returnaddress: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ps_live: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      [[fallthrough]];
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_permlane16_var:
    case Intrinsic::amdgcn_permlanex16_var: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
    case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
    case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
      OpdsMapping[0] =
          Info->mayNeedAGPRs()
              ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
              : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] =
          Info->mayNeedAGPRs()
              ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
              : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
      // vdst, srcA, srcB, srcC, idx
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_lds_param_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
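    // Descriptive note on the M0 operand above, following the comment in that
    // block: the mapping reports whatever bank the operand currently has
    // instead of forcing SGPR, so a value still sitting in a VGPR is accepted
    // here and only constrained to the scalar side later, when the chosen
    // mapping is applied.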
    case Intrinsic::amdgcn_interp_inreg_p10:
    case Intrinsic::amdgcn_interp_inreg_p2:
    case Intrinsic::amdgcn_interp_inreg_p10_f16:
    case Intrinsic::amdgcn_interp_inreg_p2_f16:
    case Intrinsic::amdgcn_interp_p10_rtz_f16:
    case Intrinsic::amdgcn_interp_p2_rtz_f16: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ballot: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
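    // Illustrative example for the ballot mapping above (hypothetical MIR,
    // wave64):
    //   %mask:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ballot), %cond:_(s1)
    // The boolean source is treated as a VCC-bank lane mask, while the wide
    // result is produced as a uniform SGPR-bank value.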
    case Intrinsic::amdgcn_inverse_ballot: {
      // This must be an SGPR, but accept a VGPR.
      Register MaskReg = MI.getOperand(2).getReg();
      unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
      break;
    }
    case Intrinsic::amdgcn_s_quadmask:
    case Intrinsic::amdgcn_s_wqm: {
      Register MaskReg = MI.getOperand(2).getReg();
      unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
      break;
    }
    case Intrinsic::amdgcn_wave_reduce_umin:
    case Intrinsic::amdgcn_wave_reduce_umax: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      auto regBankID =
          isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
      OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
      break;
    }
    case Intrinsic::amdgcn_s_bitreplicate:
      Register MaskReg = MI.getOperand(2).getReg();
      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    auto IntrID = AMDGPU::getIntrinsicID(MI);
    const AMDGPU::RsrcIntrinsic *RSrcIntrin =
        AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    if (N == 3) {
      // Sequential form: all operands combined into VGPR256/VGPR512
      unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      if (Size > 256)
        Size = 512;
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    } else {
      // NSA form
      for (unsigned I = 2; I < N; ++I) {
        unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      }
    }
    break;
  }
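  // Descriptive note for the intersect-ray mapping above: the ray data is
  // always VGPR material, either packed into a single wide register or passed
  // as separate address operands, while the trailing operand handled with
  // getSGPROpMapping is expected to be a uniform descriptor input.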
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
    auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
    case Intrinsic::amdgcn_s_sendmsg_rtn: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num:
    case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
    case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
    case Intrinsic::amdgcn_atomic_cond_sub_u32:
    case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
    case Intrinsic::amdgcn_global_load_tr_b64:
    case Intrinsic::amdgcn_global_load_tr_b128:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp_row:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_ttracedata: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank =
          getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_ptr_buffer_load:
    case Intrinsic::amdgcn_raw_atomic_buffer_load:
    case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load:
    case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
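    // Descriptive note for the raw buffer load mapping above: operand 2 is
    // the buffer resource and operand 4 the scalar offset, both reported as
    // SGPR, while the loaded value and the voffset (operands 0 and 3) stay in
    // VGPRs. If a nominally-SGPR operand turns out to be divergent, the apply
    // step is expected to legalize it with a waterfall loop rather than a
    // plain VGPR-to-SGPR copy.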
    case Intrinsic::amdgcn_raw_buffer_load_lds:
    case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_ptr_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store:
    case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_ptr_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load:
    case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds:
    case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_ptr_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store:
    case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_global_load_lds: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_lds_direct_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
    case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
      OpdsMapping[0] =
          getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
      OpdsMapping[1] =
          getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
      OpdsMapping[3] =
          getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
      OpdsMapping[4] =
          getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
      OpdsMapping[5] =
          getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
      break;
    }
    case Intrinsic::amdgcn_s_sleep_var:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_wakeup_barrier:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_init:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
      const unsigned ResultSize = 1;
      OpdsMapping[0] =
          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_leave: {
      const unsigned ResultSize = 1;
      OpdsMapping[0] =
          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
      break;
    }
    case Intrinsic::amdgcn_s_get_barrier_state: {
      OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_pops_exiting_wave_id:
      return getDefaultMappingSOP(MI);
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }
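  // Worked example for the G_SELECT mapping above (hypothetical values): for
  //   %d:_(s32) = G_SELECT %c:_(s1), %a:_(s32), %b:_(s32)
  // where %a or %b already lives in a VGPR, SGPRSrcs is false, so the
  // condition is reported as a VCC-bank s1 and the data operands as 32-bit
  // VGPRs. The whole select stays scalar only when both data operands and the
  // condition are SGPR values.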
  case AMDGPU::G_SI_CALL: {
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // Allow anything for implicit arguments
    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
      if (MI.getOperand(I).isReg()) {
        Register Reg = MI.getOperand(I).getReg();
        auto OpBank = getRegBankID(Reg, MRI);
        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
      }
    }
    break;
  }
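  // Descriptive note for G_SI_CALL above: the callee pointer (operand 1) is
  // reported as SGPR even if it is currently divergent; in that case
  // applyMapping is expected to wrap the call in a waterfall loop, executing
  // it once per unique callee value, rather than inserting an illegal
  // VGPR-to-SGPR copy.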
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);
  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMICRMW_FMIN:
  case AMDGPU::G_ATOMICRMW_FMAX:
  case AMDGPU::G_ATOMICRMW_UINC_WRAP:
  case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
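  // Note on the atomics above: the value operands are always mapped to VGPRs,
  // while getValueMappingForPtr reports the pointer in whatever bank it
  // currently occupies, so a uniform address may be reported as SGPR.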
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
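  // Illustrative example for G_BRCOND above (hypothetical MIR): for
  //   G_BRCOND %cond:_(s1), %bb.then
  // a condition produced by a vector compare keeps the VCC bank, while a
  // condition already known to be an SGPR boolean stays scalar.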
  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_PREFETCH:
    OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    break;
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());