//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks
/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this increases to 2 in gfx10 for most
/// instructions). This is one unique SGPR, so the same SGPR may be used for
/// multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//
#include "AMDGPURegisterBankInfo.h"

#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;
namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }
  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        MachineIRBuilder B(MI);
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
        return;
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI.
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }
  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

} // anonymous namespace
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterBankInfo(),
      Subtarget(ST),
      TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}
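
// Returns true for the register banks that live in per-lane vector registers,
// i.e. VGPR and AGPR.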
static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}
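
// Return the cost of copying a value of \p Size bits from bank \p Src to bank
// \p Dst. Copies that cannot be lowered directly, such as a divergent VGPR or
// VCC value into an SGPR, get an effectively infinite cost.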
unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}
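
// Return the cost of the value breakdown described by \p ValMapping, e.g.
// splitting a 64-bit value into two 32-bit pieces so each half can be handled
// separately.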
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.
  //
  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}
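
// Map a register class back to a register bank. The use-context type \p Ty is
// needed to tell an SGPR s1 boolean (VCC bank) apart from an ordinary SGPR
// value.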
const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC bank value.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}
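
// Build alternative instruction mappings from \p Table. Each entry assigns a
// register bank to the operands listed in \p RegSrcOpIdx together with a
// relative cost; explicit defs are always mapped to VGPRs.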
296 template <unsigned NumOps
>
297 RegisterBankInfo::InstructionMappings
298 AMDGPURegisterBankInfo::addMappingFromTable(
299 const MachineInstr
&MI
, const MachineRegisterInfo
&MRI
,
300 const std::array
<unsigned, NumOps
> RegSrcOpIdx
,
301 ArrayRef
<OpRegBankEntry
<NumOps
>> Table
) const {
303 InstructionMappings AltMappings
;
305 SmallVector
<const ValueMapping
*, 10> Operands(MI
.getNumOperands());
307 unsigned Sizes
[NumOps
];
308 for (unsigned I
= 0; I
< NumOps
; ++I
) {
309 Register Reg
= MI
.getOperand(RegSrcOpIdx
[I
]).getReg();
310 Sizes
[I
] = getSizeInBits(Reg
, MRI
, *TRI
);
313 for (unsigned I
= 0, E
= MI
.getNumExplicitDefs(); I
!= E
; ++I
) {
314 unsigned SizeI
= getSizeInBits(MI
.getOperand(I
).getReg(), MRI
, *TRI
);
315 Operands
[I
] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, SizeI
);
318 // getInstrMapping's default mapping uses ID 1, so start at 2.
319 unsigned MappingID
= 2;
320 for (const auto &Entry
: Table
) {
321 for (unsigned I
= 0; I
< NumOps
; ++I
) {
322 int OpIdx
= RegSrcOpIdx
[I
];
323 Operands
[OpIdx
] = AMDGPU::getValueMapping(Entry
.RegBanks
[I
], Sizes
[I
]);
326 AltMappings
.push_back(&getInstructionMapping(MappingID
++, Entry
.Cost
,
327 getOperandsMapping(Operands
),
334 RegisterBankInfo::InstructionMappings
335 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
336 const MachineInstr
&MI
, const MachineRegisterInfo
&MRI
) const {
337 switch (MI
.getIntrinsicID()) {
338 case Intrinsic::amdgcn_readlane
: {
339 static const OpRegBankEntry
<3> Table
[2] = {
341 { { AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
}, 1 },
343 // Need a readfirstlane for the index.
344 { { AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 2 }
347 const std::array
<unsigned, 3> RegSrcOpIdx
= { { 0, 2, 3 } };
348 return addMappingFromTable
<3>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
350 case Intrinsic::amdgcn_writelane
: {
351 static const OpRegBankEntry
<4> Table
[4] = {
353 { { AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 1 },
355 // Need readfirstlane of first op
356 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 2 },
358 // Need readfirstlane of second op
359 { { AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 2 },
361 // Need readfirstlane of both ops
362 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 3 }
365 // rsrc, voffset, offset
366 const std::array
<unsigned, 4> RegSrcOpIdx
= { { 0, 2, 3, 4 } };
367 return addMappingFromTable
<4>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
370 return RegisterBankInfo::getInstrAlternativeMappings(MI
);
374 RegisterBankInfo::InstructionMappings
375 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
376 const MachineInstr
&MI
, const MachineRegisterInfo
&MRI
) const {
378 switch (MI
.getIntrinsicID()) {
379 case Intrinsic::amdgcn_s_buffer_load
: {
380 static const OpRegBankEntry
<2> Table
[4] = {
382 { { AMDGPU::SGPRRegBankID
, AMDGPU::SGPRRegBankID
}, 1 },
384 // Only need 1 register in loop
385 { { AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 300 },
387 // Have to waterfall the resource.
388 { { AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
}, 1000 },
390 // Have to waterfall the resource, and the offset.
391 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 1500 }
395 const std::array
<unsigned, 2> RegSrcOpIdx
= { { 2, 3 } };
396 return addMappingFromTable
<2>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
398 case Intrinsic::amdgcn_ds_ordered_add
:
399 case Intrinsic::amdgcn_ds_ordered_swap
: {
401 static const OpRegBankEntry
<3> Table
[2] = {
403 { { AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 1 },
405 // Need a readfirstlane for m0
406 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 2 }
409 const std::array
<unsigned, 3> RegSrcOpIdx
= { { 0, 2, 3 } };
410 return addMappingFromTable
<3>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
412 case Intrinsic::amdgcn_s_sendmsg
:
413 case Intrinsic::amdgcn_s_sendmsghalt
: {
414 // FIXME: Should have no register for immediate
415 static const OpRegBankEntry
<1> Table
[2] = {
417 { { AMDGPU::SGPRRegBankID
}, 1 },
420 { { AMDGPU::VGPRRegBankID
}, 3 }
423 const std::array
<unsigned, 1> RegSrcOpIdx
= { { 2 } };
424 return addMappingFromTable
<1>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
427 return RegisterBankInfo::getInstrAlternativeMappings(MI
);
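
// Check whether the memory operand's underlying IR value carries
// !amdgpu.noclobber metadata, meaning no store is known to clobber it before
// this load.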
431 static bool memOpHasNoClobbered(const MachineMemOperand
*MMO
) {
432 const Instruction
*I
= dyn_cast_or_null
<Instruction
>(MMO
->getValue());
433 return I
&& I
->getMetadata("amdgpu.noclobber");
// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
438 static bool isScalarLoadLegal(const MachineInstr
&MI
) {
439 if (!MI
.hasOneMemOperand())
442 const MachineMemOperand
*MMO
= *MI
.memoperands_begin();
443 const unsigned AS
= MMO
->getAddrSpace();
444 const bool IsConst
= AS
== AMDGPUAS::CONSTANT_ADDRESS
||
445 AS
== AMDGPUAS::CONSTANT_ADDRESS_32BIT
;
446 // Require 4-byte alignment.
447 return MMO
->getAlign() >= Align(4) &&
448 // Can't do a scalar atomic load.
450 // Don't use scalar loads for volatile accesses to non-constant address
452 (IsConst
|| !MMO
->isVolatile()) &&
453 // Memory must be known constant, or not written before this load.
454 (IsConst
|| MMO
->isInvariant() || memOpHasNoClobbered(MMO
)) &&
455 AMDGPUInstrInfo::isUniformMMO(MMO
);
458 RegisterBankInfo::InstructionMappings
459 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
460 const MachineInstr
&MI
) const {
462 const MachineFunction
&MF
= *MI
.getParent()->getParent();
463 const MachineRegisterInfo
&MRI
= MF
.getRegInfo();
466 InstructionMappings AltMappings
;
467 switch (MI
.getOpcode()) {
468 case TargetOpcode::G_CONSTANT
: {
469 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
471 static const OpRegBankEntry
<1> Table
[3] = {
472 { { AMDGPU::VGPRRegBankID
}, 1 },
473 { { AMDGPU::SGPRRegBankID
}, 1 },
474 { { AMDGPU::VCCRegBankID
}, 1 }
477 return addMappingFromTable
<1>(MI
, MRI
, {{ 0 }}, Table
);
482 case TargetOpcode::G_FCONSTANT
:
483 case TargetOpcode::G_FRAME_INDEX
:
484 case TargetOpcode::G_GLOBAL_VALUE
: {
485 static const OpRegBankEntry
<1> Table
[2] = {
486 { { AMDGPU::VGPRRegBankID
}, 1 },
487 { { AMDGPU::SGPRRegBankID
}, 1 }
490 return addMappingFromTable
<1>(MI
, MRI
, {{ 0 }}, Table
);
492 case TargetOpcode::G_AND
:
493 case TargetOpcode::G_OR
:
494 case TargetOpcode::G_XOR
: {
495 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
498 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
499 const InstructionMapping
&SCCMapping
= getInstructionMapping(
500 1, 1, getOperandsMapping(
501 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, 32),
502 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, 32),
503 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, 32)}),
505 AltMappings
.push_back(&SCCMapping
);
507 const InstructionMapping
&VCCMapping0
= getInstructionMapping(
508 2, 1, getOperandsMapping(
509 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, Size
),
510 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, Size
),
511 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, Size
)}),
513 AltMappings
.push_back(&VCCMapping0
);
520 const InstructionMapping
&SSMapping
= getInstructionMapping(
521 1, 1, getOperandsMapping(
522 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
523 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
524 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
)}),
526 AltMappings
.push_back(&SSMapping
);
528 const InstructionMapping
&VVMapping
= getInstructionMapping(
529 2, 2, getOperandsMapping(
530 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
531 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
532 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
)}),
534 AltMappings
.push_back(&VVMapping
);
537 case TargetOpcode::G_LOAD
:
538 case TargetOpcode::G_ZEXTLOAD
:
539 case TargetOpcode::G_SEXTLOAD
: {
540 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
541 LLT PtrTy
= MRI
.getType(MI
.getOperand(1).getReg());
542 unsigned PtrSize
= PtrTy
.getSizeInBits();
543 unsigned AS
= PtrTy
.getAddressSpace();
545 if ((AS
!= AMDGPUAS::LOCAL_ADDRESS
&& AS
!= AMDGPUAS::REGION_ADDRESS
&&
546 AS
!= AMDGPUAS::PRIVATE_ADDRESS
) &&
547 isScalarLoadLegal(MI
)) {
548 const InstructionMapping
&SSMapping
= getInstructionMapping(
549 1, 1, getOperandsMapping(
550 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
551 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, PtrSize
)}),
553 AltMappings
.push_back(&SSMapping
);
556 const InstructionMapping
&VVMapping
= getInstructionMapping(
559 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
),
560 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, PtrSize
)}),
562 AltMappings
.push_back(&VVMapping
);
564 // It may be possible to have a vgpr = load sgpr mapping here, because
565 // the mubuf instructions support this kind of load, but probably for only
566 // gfx7 and older. However, the addressing mode matching in the instruction
567 // selector should be able to do a better job of detecting and selecting
568 // these kinds of loads from the vgpr = load vgpr mapping.
573 case TargetOpcode::G_SELECT
: {
574 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
575 const InstructionMapping
&SSMapping
= getInstructionMapping(1, 1,
576 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
577 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, 1),
578 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
579 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
)}),
581 AltMappings
.push_back(&SSMapping
);
583 const InstructionMapping
&VVMapping
= getInstructionMapping(2, 1,
584 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
585 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1),
586 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
587 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
)}),
589 AltMappings
.push_back(&VVMapping
);
593 case TargetOpcode::G_UADDE
:
594 case TargetOpcode::G_USUBE
:
595 case TargetOpcode::G_SADDE
:
596 case TargetOpcode::G_SSUBE
: {
597 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
598 const InstructionMapping
&SSMapping
= getInstructionMapping(1, 1,
600 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
601 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, 1),
602 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
603 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
604 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, 1)}),
606 AltMappings
.push_back(&SSMapping
);
608 const InstructionMapping
&VVMapping
= getInstructionMapping(2, 1,
609 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
),
610 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1),
611 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
),
612 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
),
613 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1)}),
615 AltMappings
.push_back(&VVMapping
);
618 case AMDGPU::G_BRCOND
: {
619 assert(MRI
.getType(MI
.getOperand(0).getReg()).getSizeInBits() == 1);
621 // TODO: Change type to 32 for scalar
622 const InstructionMapping
&SMapping
= getInstructionMapping(
623 1, 1, getOperandsMapping(
624 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, 1), nullptr}),
626 AltMappings
.push_back(&SMapping
);
628 const InstructionMapping
&VMapping
= getInstructionMapping(
629 1, 1, getOperandsMapping(
630 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1), nullptr }),
632 AltMappings
.push_back(&VMapping
);
635 case AMDGPU::G_INTRINSIC
:
636 return getInstrAlternativeMappingsIntrinsic(MI
, MRI
);
637 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS
:
638 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI
, MRI
);
642 return RegisterBankInfo::getInstrAlternativeMappings(MI
);
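
// Split the 64-bit value in \p Reg into two 32-bit halves with
// G_UNMERGE_VALUES and append them to \p Regs, keeping the original register
// bank on each half.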
645 void AMDGPURegisterBankInfo::split64BitValueForMapping(
647 SmallVector
<Register
, 2> &Regs
,
649 Register Reg
) const {
650 assert(HalfTy
.getSizeInBits() == 32);
651 MachineRegisterInfo
*MRI
= B
.getMRI();
652 Register LoLHS
= MRI
->createGenericVirtualRegister(HalfTy
);
653 Register HiLHS
= MRI
->createGenericVirtualRegister(HalfTy
);
654 const RegisterBank
*Bank
= getRegBank(Reg
, *MRI
, *TRI
);
655 MRI
->setRegBank(LoLHS
, *Bank
);
656 MRI
->setRegBank(HiLHS
, *Bank
);
658 Regs
.push_back(LoLHS
);
659 Regs
.push_back(HiLHS
);
661 B
.buildInstr(AMDGPU::G_UNMERGE_VALUES
)
667 /// Replace the current type each register in \p Regs has with \p NewTy
668 static void setRegsToType(MachineRegisterInfo
&MRI
, ArrayRef
<Register
> Regs
,
670 for (Register Reg
: Regs
) {
671 assert(MRI
.getType(Reg
).getSizeInBits() == NewTy
.getSizeInBits());
672 MRI
.setType(Reg
, NewTy
);
676 static LLT
getHalfSizedType(LLT Ty
) {
678 assert(Ty
.getElementCount().isKnownMultipleOf(2));
679 return LLT::scalarOrVector(Ty
.getElementCount().divideCoefficientBy(2),
680 Ty
.getElementType());
683 assert(Ty
.getScalarSizeInBits() % 2 == 0);
684 return LLT::scalar(Ty
.getScalarSizeInBits() / 2);
687 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
688 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
689 /// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
693 /// Essentially performs this loop:
695 /// Save Execution Mask
696 /// For (Lane : Wavefront) {
697 /// Enable Lane, Disable all other lanes
698 /// SGPR = read SGPR value for current lane from VGPR
699 /// VGPRResult[Lane] = use_op SGPR
701 /// Restore Execution Mask
/// There is additional complexity in comparing the operand values so that only
/// the unique values are used.
705 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
707 iterator_range
<MachineBasicBlock::iterator
> Range
,
708 SmallSet
<Register
, 4> &SGPROperandRegs
,
709 MachineRegisterInfo
&MRI
) const {
710 SmallVector
<Register
, 4> ResultRegs
;
711 SmallVector
<Register
, 4> InitResultRegs
;
712 SmallVector
<Register
, 4> PhiRegs
;
714 // Track use registers which have already been expanded with a readfirstlane
715 // sequence. This may have multiple uses if moving a sequence.
716 DenseMap
<Register
, Register
> WaterfalledRegMap
;
718 MachineBasicBlock
&MBB
= B
.getMBB();
719 MachineFunction
*MF
= &B
.getMF();
721 const TargetRegisterClass
*WaveRC
= TRI
->getWaveMaskRegClass();
722 const unsigned WaveAndOpc
= Subtarget
.isWave32() ?
723 AMDGPU::S_AND_B32
: AMDGPU::S_AND_B64
;
724 const unsigned MovTermOpc
= Subtarget
.isWave32() ?
725 AMDGPU::S_MOV_B32_term
: AMDGPU::S_MOV_B64_term
;
726 const unsigned XorTermOpc
= Subtarget
.isWave32() ?
727 AMDGPU::S_XOR_B32_term
: AMDGPU::S_XOR_B64_term
;
728 const unsigned AndSaveExecOpc
= Subtarget
.isWave32() ?
729 AMDGPU::S_AND_SAVEEXEC_B32
: AMDGPU::S_AND_SAVEEXEC_B64
;
730 const unsigned ExecReg
= Subtarget
.isWave32() ?
731 AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
734 const int OrigRangeSize
= std::distance(Range
.begin(), Range
.end());
737 for (MachineInstr
&MI
: Range
) {
738 for (MachineOperand
&Def
: MI
.defs()) {
739 if (MRI
.use_nodbg_empty(Def
.getReg()))
742 LLT ResTy
= MRI
.getType(Def
.getReg());
743 const RegisterBank
*DefBank
= getRegBank(Def
.getReg(), MRI
, *TRI
);
744 ResultRegs
.push_back(Def
.getReg());
745 Register InitReg
= B
.buildUndef(ResTy
).getReg(0);
746 Register PhiReg
= MRI
.createGenericVirtualRegister(ResTy
);
747 InitResultRegs
.push_back(InitReg
);
748 PhiRegs
.push_back(PhiReg
);
749 MRI
.setRegBank(PhiReg
, *DefBank
);
750 MRI
.setRegBank(InitReg
, *DefBank
);
754 Register SaveExecReg
= MRI
.createVirtualRegister(WaveRC
);
755 Register InitSaveExecReg
= MRI
.createVirtualRegister(WaveRC
);
757 // Don't bother using generic instructions/registers for the exec mask.
758 B
.buildInstr(TargetOpcode::IMPLICIT_DEF
)
759 .addDef(InitSaveExecReg
);
761 Register PhiExec
= MRI
.createVirtualRegister(WaveRC
);
762 Register NewExec
= MRI
.createVirtualRegister(WaveRC
);
764 // To insert the loop we need to split the block. Move everything before this
765 // point to a new block, and insert a new empty block before this instruction.
766 MachineBasicBlock
*LoopBB
= MF
->CreateMachineBasicBlock();
767 MachineBasicBlock
*RemainderBB
= MF
->CreateMachineBasicBlock();
768 MachineBasicBlock
*RestoreExecBB
= MF
->CreateMachineBasicBlock();
769 MachineFunction::iterator
MBBI(MBB
);
771 MF
->insert(MBBI
, LoopBB
);
772 MF
->insert(MBBI
, RestoreExecBB
);
773 MF
->insert(MBBI
, RemainderBB
);
775 LoopBB
->addSuccessor(RestoreExecBB
);
776 LoopBB
->addSuccessor(LoopBB
);
778 // Move the rest of the block into a new block.
779 RemainderBB
->transferSuccessorsAndUpdatePHIs(&MBB
);
780 RemainderBB
->splice(RemainderBB
->begin(), &MBB
, Range
.end(), MBB
.end());
782 MBB
.addSuccessor(LoopBB
);
783 RestoreExecBB
->addSuccessor(RemainderBB
);
785 B
.setInsertPt(*LoopBB
, LoopBB
->end());
787 B
.buildInstr(TargetOpcode::PHI
)
789 .addReg(InitSaveExecReg
)
794 for (auto Result
: zip(InitResultRegs
, ResultRegs
, PhiRegs
)) {
795 B
.buildInstr(TargetOpcode::G_PHI
)
796 .addDef(std::get
<2>(Result
))
797 .addReg(std::get
<0>(Result
)) // Initial value / implicit_def
799 .addReg(std::get
<1>(Result
)) // Mid-loop value.
803 const DebugLoc
&DL
= B
.getDL();
805 MachineInstr
&FirstInst
= *Range
.begin();
807 // Move the instruction into the loop. Note we moved everything after
808 // Range.end() already into a new block, so Range.end() is no longer valid.
809 LoopBB
->splice(LoopBB
->end(), &MBB
, Range
.begin(), MBB
.end());
811 // Figure out the iterator range after splicing the instructions.
812 MachineBasicBlock::iterator NewBegin
= FirstInst
.getIterator();
813 auto NewEnd
= LoopBB
->end();
815 MachineBasicBlock::iterator I
= Range
.begin();
816 B
.setInsertPt(*LoopBB
, I
);
820 assert(std::distance(NewBegin
, NewEnd
) == OrigRangeSize
);
822 for (MachineInstr
&MI
: make_range(NewBegin
, NewEnd
)) {
823 for (MachineOperand
&Op
: MI
.uses()) {
824 if (!Op
.isReg() || Op
.isDef())
827 Register OldReg
= Op
.getReg();
828 if (!SGPROperandRegs
.count(OldReg
))
        // See if we already processed this register in another instruction in
        // the loop.
833 auto OldVal
= WaterfalledRegMap
.find(OldReg
);
834 if (OldVal
!= WaterfalledRegMap
.end()) {
835 Op
.setReg(OldVal
->second
);
839 Register OpReg
= Op
.getReg();
840 LLT OpTy
= MRI
.getType(OpReg
);
842 const RegisterBank
*OpBank
= getRegBank(OpReg
, MRI
, *TRI
);
843 if (OpBank
!= &AMDGPU::VGPRRegBank
) {
844 // Insert copy from AGPR to VGPR before the loop.
846 OpReg
= B
.buildCopy(OpTy
, OpReg
).getReg(0);
847 MRI
.setRegBank(OpReg
, AMDGPU::VGPRRegBank
);
851 unsigned OpSize
= OpTy
.getSizeInBits();
853 // Can only do a readlane of 32-bit pieces.
855 // Avoid extra copies in the simple case of one 32-bit register.
856 Register CurrentLaneOpReg
857 = MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
858 MRI
.setType(CurrentLaneOpReg
, OpTy
);
860 constrainGenericRegister(OpReg
, AMDGPU::VGPR_32RegClass
, MRI
);
861 // Read the next variant <- also loop target.
862 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
),
866 Register NewCondReg
= MRI
.createVirtualRegister(WaveRC
);
867 bool First
= CondReg
== AMDGPU::NoRegister
;
869 CondReg
= NewCondReg
;
871 // Compare the just read M0 value to all possible Idx values.
872 B
.buildInstr(AMDGPU::V_CMP_EQ_U32_e64
)
874 .addReg(CurrentLaneOpReg
)
876 Op
.setReg(CurrentLaneOpReg
);
879 Register AndReg
= MRI
.createVirtualRegister(WaveRC
);
      // If there are multiple operands to consider, AND the conditions
      // together.
882 B
.buildInstr(WaveAndOpc
)
889 LLT S32
= LLT::scalar(32);
890 SmallVector
<Register
, 8> ReadlanePieces
;
      // The compares can be done as 64-bit, but the extract needs to be done
      // in 32-bit pieces.
895 bool Is64
= OpSize
% 64 == 0;
897 LLT UnmergeTy
= OpSize
% 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
898 unsigned CmpOp
= OpSize
% 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
899 : AMDGPU::V_CMP_EQ_U32_e64
;
      // The compares can be done as 64-bit, but the extract needs to be done
      // in 32-bit pieces.
904 // Insert the unmerge before the loop.
907 auto Unmerge
= B
.buildUnmerge(UnmergeTy
, OpReg
);
910 unsigned NumPieces
= Unmerge
->getNumOperands() - 1;
911 for (unsigned PieceIdx
= 0; PieceIdx
!= NumPieces
; ++PieceIdx
) {
912 Register UnmergePiece
= Unmerge
.getReg(PieceIdx
);
914 Register CurrentLaneOpReg
;
916 Register CurrentLaneOpRegLo
= MRI
.createGenericVirtualRegister(S32
);
917 Register CurrentLaneOpRegHi
= MRI
.createGenericVirtualRegister(S32
);
919 MRI
.setRegClass(UnmergePiece
, &AMDGPU::VReg_64RegClass
);
920 MRI
.setRegClass(CurrentLaneOpRegLo
, &AMDGPU::SReg_32_XM0RegClass
);
921 MRI
.setRegClass(CurrentLaneOpRegHi
, &AMDGPU::SReg_32_XM0RegClass
);
923 // Read the next variant <- also loop target.
924 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
),
926 .addReg(UnmergePiece
, 0, AMDGPU::sub0
);
928 // Read the next variant <- also loop target.
929 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
),
931 .addReg(UnmergePiece
, 0, AMDGPU::sub1
);
934 B
.buildMerge(LLT::scalar(64),
935 {CurrentLaneOpRegLo
, CurrentLaneOpRegHi
})
938 MRI
.setRegClass(CurrentLaneOpReg
, &AMDGPU::SReg_64_XEXECRegClass
);
940 if (OpTy
.getScalarSizeInBits() == 64) {
          // If we need to produce a 64-bit element vector, use the merged
          // 64-bit registers.
943 ReadlanePieces
.push_back(CurrentLaneOpReg
);
945 // 32-bit element type.
946 ReadlanePieces
.push_back(CurrentLaneOpRegLo
);
947 ReadlanePieces
.push_back(CurrentLaneOpRegHi
);
950 CurrentLaneOpReg
= MRI
.createGenericVirtualRegister(S32
);
951 MRI
.setRegClass(UnmergePiece
, &AMDGPU::VGPR_32RegClass
);
952 MRI
.setRegClass(CurrentLaneOpReg
, &AMDGPU::SReg_32_XM0RegClass
);
954 // Read the next variant <- also loop target.
955 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
),
957 .addReg(UnmergePiece
);
958 ReadlanePieces
.push_back(CurrentLaneOpReg
);
961 Register NewCondReg
= MRI
.createVirtualRegister(WaveRC
);
962 bool First
= CondReg
== AMDGPU::NoRegister
;
964 CondReg
= NewCondReg
;
968 .addReg(CurrentLaneOpReg
)
969 .addReg(UnmergePiece
);
972 Register AndReg
= MRI
.createVirtualRegister(WaveRC
);
        // If there are multiple operands to consider, AND the conditions
        // together.
975 B
.buildInstr(WaveAndOpc
)
983 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
985 if (OpTy
.isVector()) {
986 auto Merge
= B
.buildBuildVector(OpTy
, ReadlanePieces
);
987 Op
.setReg(Merge
.getReg(0));
989 auto Merge
= B
.buildMerge(OpTy
, ReadlanePieces
);
990 Op
.setReg(Merge
.getReg(0));
993 MRI
.setRegBank(Op
.getReg(), AMDGPU::SGPRRegBank
);
996 // Make sure we don't re-process this register again.
997 WaterfalledRegMap
.insert(std::make_pair(OldReg
, Op
.getReg()));
1001 B
.setInsertPt(*LoopBB
, LoopBB
->end());
1003 // Update EXEC, save the original EXEC value to VCC.
1004 B
.buildInstr(AndSaveExecOpc
)
1006 .addReg(CondReg
, RegState::Kill
);
1008 MRI
.setSimpleHint(NewExec
, CondReg
);
1010 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1011 B
.buildInstr(XorTermOpc
)
1016 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1019 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1020 B
.buildInstr(AMDGPU::S_CBRANCH_EXECNZ
)
1023 // Save the EXEC mask before the loop.
1024 BuildMI(MBB
, MBB
.end(), DL
, TII
->get(MovTermOpc
), SaveExecReg
)
1027 // Restore the EXEC mask after the loop.
1028 B
.setMBB(*RestoreExecBB
);
1029 B
.buildInstr(MovTermOpc
)
1031 .addReg(SaveExecReg
);
1033 // Set the insert point after the original instruction, so any new
1034 // instructions will be in the remainder.
1035 B
.setInsertPt(*RemainderBB
, RemainderBB
->begin());
1040 // Return any unique registers used by \p MI at \p OpIndices that need to be
1041 // handled in a waterfall loop. Returns these registers in \p
1042 // SGPROperandRegs. Returns true if there are any operands to handle and a
1043 // waterfall loop is necessary.
1044 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1045 SmallSet
<Register
, 4> &SGPROperandRegs
, MachineInstr
&MI
,
1046 MachineRegisterInfo
&MRI
, ArrayRef
<unsigned> OpIndices
) const {
1047 for (unsigned Op
: OpIndices
) {
1048 assert(MI
.getOperand(Op
).isUse());
1049 Register Reg
= MI
.getOperand(Op
).getReg();
1050 const RegisterBank
*OpBank
= getRegBank(Reg
, MRI
, *TRI
);
1051 if (OpBank
->getID() != AMDGPU::SGPRRegBankID
)
1052 SGPROperandRegs
.insert(Reg
);
1055 // No operands need to be replaced, so no need to loop.
1056 return !SGPROperandRegs
.empty();
1059 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1060 MachineIRBuilder
&B
, MachineInstr
&MI
, MachineRegisterInfo
&MRI
,
1061 ArrayRef
<unsigned> OpIndices
) const {
1062 // Use a set to avoid extra readfirstlanes in the case where multiple operands
1063 // are the same register.
1064 SmallSet
<Register
, 4> SGPROperandRegs
;
1066 if (!collectWaterfallOperands(SGPROperandRegs
, MI
, MRI
, OpIndices
))
1069 MachineBasicBlock::iterator I
= MI
.getIterator();
1070 return executeInWaterfallLoop(B
, make_range(I
, std::next(I
)),
1071 SGPROperandRegs
, MRI
);
1074 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1075 MachineInstr
&MI
, MachineRegisterInfo
&MRI
,
1076 ArrayRef
<unsigned> OpIndices
) const {
1077 MachineIRBuilder
B(MI
);
1078 return executeInWaterfallLoop(B
, MI
, MRI
, OpIndices
);
1081 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1082 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1083 MachineInstr
&MI
, MachineRegisterInfo
&MRI
, unsigned OpIdx
) const {
1084 Register Reg
= MI
.getOperand(OpIdx
).getReg();
1085 const RegisterBank
*Bank
= getRegBank(Reg
, MRI
, *TRI
);
1086 if (Bank
== &AMDGPU::SGPRRegBank
)
1089 LLT Ty
= MRI
.getType(Reg
);
1090 MachineIRBuilder
B(MI
);
1092 if (Bank
!= &AMDGPU::VGPRRegBank
) {
1093 // We need to copy from AGPR to VGPR
1094 Reg
= B
.buildCopy(Ty
, Reg
).getReg(0);
1095 MRI
.setRegBank(Reg
, AMDGPU::VGPRRegBank
);
1098 Register SGPR
= MRI
.createVirtualRegister(&AMDGPU::SReg_32RegClass
);
1099 B
.buildInstr(AMDGPU::V_READFIRSTLANE_B32
)
1103 MRI
.setType(SGPR
, Ty
);
1105 const TargetRegisterClass
*Constrained
=
1106 constrainGenericRegister(Reg
, AMDGPU::VGPR_32RegClass
, MRI
);
1108 assert(Constrained
&& "Failed to constrain readfirstlane src reg");
1110 MI
.getOperand(OpIdx
).setReg(SGPR
);
1113 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1114 /// rest will be in the remainder.
1115 static std::pair
<LLT
, LLT
> splitUnequalType(LLT Ty
, unsigned FirstSize
) {
1116 unsigned TotalSize
= Ty
.getSizeInBits();
1118 return {LLT::scalar(FirstSize
), LLT::scalar(TotalSize
- FirstSize
)};
1120 LLT EltTy
= Ty
.getElementType();
1121 unsigned EltSize
= EltTy
.getSizeInBits();
1122 assert(FirstSize
% EltSize
== 0);
1124 unsigned FirstPartNumElts
= FirstSize
/ EltSize
;
1125 unsigned RemainderElts
= (TotalSize
- FirstSize
) / EltSize
;
1127 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts
), EltTy
),
1128 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts
), EltTy
)};
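
// Widen a 96-bit type to the corresponding 128-bit type, preserving the
// element type for vectors.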
1131 static LLT
widen96To128(LLT Ty
) {
1133 return LLT::scalar(128);
1135 LLT EltTy
= Ty
.getElementType();
1136 assert(128 % EltTy
.getSizeInBits() == 0);
1137 return LLT::fixed_vector(128 / EltTy
.getSizeInBits(), EltTy
);
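
// Apply the chosen register bank mapping to a load: widen sub-dword or split
// under-aligned 96-bit SGPR loads, and break VGPR loads wider than 128 bits
// into 128-bit pieces.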
1140 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr
&MI
,
1141 const AMDGPURegisterBankInfo::OperandsMapper
&OpdMapper
,
1142 MachineRegisterInfo
&MRI
) const {
1143 Register DstReg
= MI
.getOperand(0).getReg();
1144 const LLT LoadTy
= MRI
.getType(DstReg
);
1145 unsigned LoadSize
= LoadTy
.getSizeInBits();
1146 const unsigned MaxNonSmrdLoadSize
= 128;
1148 const RegisterBank
*DstBank
=
1149 OpdMapper
.getInstrMapping().getOperandMapping(0).BreakDown
[0].RegBank
;
1150 if (DstBank
== &AMDGPU::SGPRRegBank
) {
    // There are some special cases that we need to look at for 32-bit and
    // 96-bit SGPR loads; otherwise we have nothing to do.
1153 if (LoadSize
!= 32 && LoadSize
!= 96)
1156 MachineMemOperand
*MMO
= *MI
.memoperands_begin();
1157 const unsigned MemSize
= 8 * MMO
->getSize();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access: 8 or 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
1162 if (LoadSize
== 32 &&
1163 (MemSize
== 32 || LoadTy
.isVector() || !isScalarLoadLegal(MI
)))
1166 Register PtrReg
= MI
.getOperand(1).getReg();
1168 ApplyRegBankMapping
O(*this, MRI
, &AMDGPU::SGPRRegBank
);
1169 MachineIRBuilder
B(MI
, O
);
1171 if (LoadSize
== 32) {
1172 // This is an extending load from a sub-dword size. Widen the memory
1173 // access size to 4 bytes and clear the extra high bits appropriately
1174 const LLT S32
= LLT::scalar(32);
1175 if (MI
.getOpcode() == AMDGPU::G_SEXTLOAD
) {
1176 // Must extend the sign bit into higher bits for a G_SEXTLOAD
1177 auto WideLoad
= B
.buildLoadFromOffset(S32
, PtrReg
, *MMO
, 0);
1178 B
.buildSExtInReg(MI
.getOperand(0), WideLoad
, MemSize
);
1179 } else if (MI
.getOpcode() == AMDGPU::G_ZEXTLOAD
) {
1180 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1181 auto WideLoad
= B
.buildLoadFromOffset(S32
, PtrReg
, *MMO
, 0);
1182 B
.buildZExtInReg(MI
.getOperand(0), WideLoad
, MemSize
);
1184 // We do not need to touch the higher bits for regular loads.
1185 B
.buildLoadFromOffset(MI
.getOperand(0), PtrReg
, *MMO
, 0);
1187 // 96-bit loads are only available for vector loads. We need to split this
1188 // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
1189 if (MMO
->getAlign() < Align(16)) {
1191 std::tie(Part64
, Part32
) = splitUnequalType(LoadTy
, 64);
1192 auto Load0
= B
.buildLoadFromOffset(Part64
, PtrReg
, *MMO
, 0);
1193 auto Load1
= B
.buildLoadFromOffset(Part32
, PtrReg
, *MMO
, 8);
1195 auto Undef
= B
.buildUndef(LoadTy
);
1196 auto Ins0
= B
.buildInsert(LoadTy
, Undef
, Load0
, 0);
1197 B
.buildInsert(MI
.getOperand(0), Ins0
, Load1
, 64);
1199 LLT WiderTy
= widen96To128(LoadTy
);
1200 auto WideLoad
= B
.buildLoadFromOffset(WiderTy
, PtrReg
, *MMO
, 0);
1201 B
.buildExtract(MI
.getOperand(0), WideLoad
, 0);
1205 MI
.eraseFromParent();
1209 // 128-bit loads are supported for all instruction types.
1210 if (LoadSize
<= MaxNonSmrdLoadSize
)
1213 SmallVector
<Register
, 16> DefRegs(OpdMapper
.getVRegs(0));
1214 SmallVector
<Register
, 1> SrcRegs(OpdMapper
.getVRegs(1));
1216 if (SrcRegs
.empty())
1217 SrcRegs
.push_back(MI
.getOperand(1).getReg());
1219 assert(LoadSize
% MaxNonSmrdLoadSize
== 0);
1221 // RegBankSelect only emits scalar types, so we need to reset the pointer
1222 // operand to a pointer type.
1223 Register BasePtrReg
= SrcRegs
[0];
1224 LLT PtrTy
= MRI
.getType(MI
.getOperand(1).getReg());
1225 MRI
.setType(BasePtrReg
, PtrTy
);
1227 unsigned NumSplitParts
= LoadTy
.getSizeInBits() / MaxNonSmrdLoadSize
;
1228 const LLT LoadSplitTy
= LoadTy
.divide(NumSplitParts
);
1229 ApplyRegBankMapping
Observer(*this, MRI
, &AMDGPU::VGPRRegBank
);
1230 MachineIRBuilder
B(MI
, Observer
);
1231 LegalizerHelper
Helper(B
.getMF(), Observer
, B
);
1233 if (LoadTy
.isVector()) {
1234 if (Helper
.fewerElementsVector(MI
, 0, LoadSplitTy
) != LegalizerHelper::Legalized
)
1237 if (Helper
.narrowScalar(MI
, 0, LoadSplitTy
) != LegalizerHelper::Legalized
)
1241 MRI
.setRegBank(DstReg
, AMDGPU::VGPRRegBank
);
1245 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1247 const AMDGPURegisterBankInfo::OperandsMapper
&OpdMapper
,
1248 MachineRegisterInfo
&MRI
) const {
1249 const MachineFunction
&MF
= *MI
.getMF();
1250 const GCNSubtarget
&ST
= MF
.getSubtarget
<GCNSubtarget
>();
1251 const auto &TFI
= *ST
.getFrameLowering();
  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
1255 if (TFI
.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown
)
1258 Register Dst
= MI
.getOperand(0).getReg();
1259 Register AllocSize
= MI
.getOperand(1).getReg();
1260 Align Alignment
= assumeAligned(MI
.getOperand(2).getImm());
1262 const RegisterBank
*SizeBank
= getRegBank(AllocSize
, MRI
, *TRI
);
1264 // TODO: Need to emit a wave reduction to get the maximum size.
1265 if (SizeBank
!= &AMDGPU::SGPRRegBank
)
1268 LLT PtrTy
= MRI
.getType(Dst
);
1269 LLT IntPtrTy
= LLT::scalar(PtrTy
.getSizeInBits());
1271 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
1272 Register SPReg
= Info
->getStackPtrOffsetReg();
1273 ApplyRegBankMapping
ApplyBank(*this, MRI
, &AMDGPU::SGPRRegBank
);
1274 MachineIRBuilder
B(MI
, ApplyBank
);
1276 auto WaveSize
= B
.buildConstant(LLT::scalar(32), ST
.getWavefrontSizeLog2());
1277 auto ScaledSize
= B
.buildShl(IntPtrTy
, AllocSize
, WaveSize
);
1279 auto SPCopy
= B
.buildCopy(PtrTy
, SPReg
);
1280 if (Alignment
> TFI
.getStackAlign()) {
1281 auto PtrAdd
= B
.buildPtrAdd(PtrTy
, SPCopy
, ScaledSize
);
1282 B
.buildMaskLowPtrBits(Dst
, PtrAdd
,
1283 Log2(Alignment
) + ST
.getWavefrontSizeLog2());
1285 B
.buildPtrAdd(Dst
, SPCopy
, ScaledSize
);
1288 MI
.eraseFromParent();
1292 bool AMDGPURegisterBankInfo::applyMappingImage(
1293 MachineInstr
&MI
, const AMDGPURegisterBankInfo::OperandsMapper
&OpdMapper
,
1294 MachineRegisterInfo
&MRI
, int RsrcIdx
) const {
1295 const int NumDefs
= MI
.getNumExplicitDefs();
1297 // The reported argument index is relative to the IR intrinsic call arguments,
1298 // so we need to shift by the number of defs and the intrinsic ID.
1299 RsrcIdx
+= NumDefs
+ 1;
1301 // Insert copies to VGPR arguments.
1302 applyDefaultMapping(OpdMapper
);
1304 // Fixup any SGPR arguments.
1305 SmallVector
<unsigned, 4> SGPRIndexes
;
1306 for (int I
= NumDefs
, NumOps
= MI
.getNumOperands(); I
!= NumOps
; ++I
) {
1307 if (!MI
.getOperand(I
).isReg())
1310 // If this intrinsic has a sampler, it immediately follows rsrc.
1311 if (I
== RsrcIdx
|| I
== RsrcIdx
+ 1)
1312 SGPRIndexes
.push_back(I
);
1315 executeInWaterfallLoop(MI
, MRI
, SGPRIndexes
);
1319 static Register
getSrcRegIgnoringCopies(const MachineRegisterInfo
&MRI
,
1321 MachineInstr
*Def
= getDefIgnoringCopies(Reg
, MRI
);
1325 // TODO: Guard against this being an implicit def
1326 return Def
->getOperand(0).getReg();
1329 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1330 // the three offsets (voffset, soffset and instoffset)
1331 static unsigned setBufferOffsets(MachineIRBuilder
&B
,
1332 const AMDGPURegisterBankInfo
&RBI
,
1333 Register CombinedOffset
, Register
&VOffsetReg
,
1334 Register
&SOffsetReg
, int64_t &InstOffsetVal
,
1336 const LLT S32
= LLT::scalar(32);
1337 MachineRegisterInfo
*MRI
= B
.getMRI();
1339 if (Optional
<int64_t> Imm
= getConstantVRegSExtVal(CombinedOffset
, *MRI
)) {
1340 uint32_t SOffset
, ImmOffset
;
1341 if (AMDGPU::splitMUBUFOffset(*Imm
, SOffset
, ImmOffset
, &RBI
.Subtarget
,
1343 VOffsetReg
= B
.buildConstant(S32
, 0).getReg(0);
1344 SOffsetReg
= B
.buildConstant(S32
, SOffset
).getReg(0);
1345 InstOffsetVal
= ImmOffset
;
1347 B
.getMRI()->setRegBank(VOffsetReg
, AMDGPU::VGPRRegBank
);
1348 B
.getMRI()->setRegBank(SOffsetReg
, AMDGPU::SGPRRegBank
);
1349 return SOffset
+ ImmOffset
;
1356 std::tie(Base
, Offset
) =
1357 AMDGPU::getBaseWithConstantOffset(*MRI
, CombinedOffset
);
1359 uint32_t SOffset
, ImmOffset
;
1360 if ((int)Offset
> 0 && AMDGPU::splitMUBUFOffset(Offset
, SOffset
, ImmOffset
,
1361 &RBI
.Subtarget
, Alignment
)) {
1362 if (RBI
.getRegBank(Base
, *MRI
, *RBI
.TRI
) == &AMDGPU::VGPRRegBank
) {
1364 SOffsetReg
= B
.buildConstant(S32
, SOffset
).getReg(0);
1365 B
.getMRI()->setRegBank(SOffsetReg
, AMDGPU::SGPRRegBank
);
1366 InstOffsetVal
= ImmOffset
;
1367 return 0; // XXX - Why is this 0?
1370 // If we have SGPR base, we can use it for soffset.
1372 VOffsetReg
= B
.buildConstant(S32
, 0).getReg(0);
1373 B
.getMRI()->setRegBank(VOffsetReg
, AMDGPU::VGPRRegBank
);
1375 InstOffsetVal
= ImmOffset
;
1376 return 0; // XXX - Why is this 0?
1380 // Handle the variable sgpr + vgpr case.
1381 MachineInstr
*Add
= getOpcodeDef(AMDGPU::G_ADD
, CombinedOffset
, *MRI
);
1382 if (Add
&& (int)Offset
>= 0) {
1383 Register Src0
= getSrcRegIgnoringCopies(*MRI
, Add
->getOperand(1).getReg());
1384 Register Src1
= getSrcRegIgnoringCopies(*MRI
, Add
->getOperand(2).getReg());
1386 const RegisterBank
*Src0Bank
= RBI
.getRegBank(Src0
, *MRI
, *RBI
.TRI
);
1387 const RegisterBank
*Src1Bank
= RBI
.getRegBank(Src1
, *MRI
, *RBI
.TRI
);
1389 if (Src0Bank
== &AMDGPU::VGPRRegBank
&& Src1Bank
== &AMDGPU::SGPRRegBank
) {
1395 if (Src0Bank
== &AMDGPU::SGPRRegBank
&& Src1Bank
== &AMDGPU::VGPRRegBank
) {
1402 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1403 // have an SGPR offset and a VGPR resource.
1404 if (RBI
.getRegBank(CombinedOffset
, *MRI
, *RBI
.TRI
) == &AMDGPU::VGPRRegBank
) {
1405 VOffsetReg
= CombinedOffset
;
1407 VOffsetReg
= B
.buildCopy(S32
, CombinedOffset
).getReg(0);
1408 B
.getMRI()->setRegBank(VOffsetReg
, AMDGPU::VGPRRegBank
);
1411 SOffsetReg
= B
.buildConstant(S32
, 0).getReg(0);
1412 B
.getMRI()->setRegBank(SOffsetReg
, AMDGPU::SGPRRegBank
);
1416 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1417 const OperandsMapper
&OpdMapper
) const {
1418 MachineInstr
&MI
= OpdMapper
.getMI();
1419 MachineRegisterInfo
&MRI
= OpdMapper
.getMRI();
1421 const LLT S32
= LLT::scalar(32);
1422 Register Dst
= MI
.getOperand(0).getReg();
1423 LLT Ty
= MRI
.getType(Dst
);
1425 const RegisterBank
*RSrcBank
=
1426 OpdMapper
.getInstrMapping().getOperandMapping(1).BreakDown
[0].RegBank
;
1427 const RegisterBank
*OffsetBank
=
1428 OpdMapper
.getInstrMapping().getOperandMapping(2).BreakDown
[0].RegBank
;
1429 if (RSrcBank
== &AMDGPU::SGPRRegBank
&&
1430 OffsetBank
== &AMDGPU::SGPRRegBank
)
1431 return true; // Legal mapping
  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
  // here but don't have an MMO.
1436 unsigned LoadSize
= Ty
.getSizeInBits();
1438 if (LoadSize
== 256 || LoadSize
== 512) {
1439 NumLoads
= LoadSize
/ 128;
1440 Ty
= Ty
.divide(NumLoads
);
1443 // Use the alignment to ensure that the required offsets will fit into the
1444 // immediate offsets.
1445 const Align Alignment
= NumLoads
> 1 ? Align(16 * NumLoads
) : Align(1);
1447 MachineIRBuilder
B(MI
);
1448 MachineFunction
&MF
= B
.getMF();
1452 int64_t ImmOffset
= 0;
1454 unsigned MMOOffset
= setBufferOffsets(B
, *this, MI
.getOperand(2).getReg(),
1455 VOffset
, SOffset
, ImmOffset
, Alignment
);
  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
  // can, but we need to track an MMO for that.
1459 const unsigned MemSize
= (Ty
.getSizeInBits() + 7) / 8;
1460 const Align
MemAlign(4); // FIXME: ABI type alignment?
1461 MachineMemOperand
*BaseMMO
= MF
.getMachineMemOperand(
1462 MachinePointerInfo(),
1463 MachineMemOperand::MOLoad
| MachineMemOperand::MODereferenceable
|
1464 MachineMemOperand::MOInvariant
,
1467 BaseMMO
= MF
.getMachineMemOperand(BaseMMO
, MMOOffset
, MemSize
);
1469 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1470 // assume that the buffer is unswizzled.
1472 Register RSrc
= MI
.getOperand(1).getReg();
1473 Register VIndex
= B
.buildConstant(S32
, 0).getReg(0);
1474 B
.getMRI()->setRegBank(VIndex
, AMDGPU::VGPRRegBank
);
1476 SmallVector
<Register
, 4> LoadParts(NumLoads
);
1478 MachineBasicBlock::iterator MII
= MI
.getIterator();
1479 MachineInstrSpan
Span(MII
, &B
.getMBB());
1481 for (int i
= 0; i
< NumLoads
; ++i
) {
1482 if (NumLoads
== 1) {
1485 LoadParts
[i
] = MRI
.createGenericVirtualRegister(Ty
);
1486 MRI
.setRegBank(LoadParts
[i
], AMDGPU::VGPRRegBank
);
1489 MachineMemOperand
*MMO
= BaseMMO
;
1491 BaseMMO
= MF
.getMachineMemOperand(BaseMMO
, MMOOffset
+ 16 * i
, MemSize
);
1493 B
.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD
)
1494 .addDef(LoadParts
[i
]) // vdata
1495 .addUse(RSrc
) // rsrc
1496 .addUse(VIndex
) // vindex
1497 .addUse(VOffset
) // voffset
1498 .addUse(SOffset
) // soffset
1499 .addImm(ImmOffset
+ 16 * i
) // offset(imm)
1500 .addImm(0) // cachepolicy, swizzled buffer(imm)
1501 .addImm(0) // idxen(imm)
1502 .addMemOperand(MMO
);
1505 // TODO: If only the resource is a VGPR, it may be better to execute the
1506 // scalar load in the waterfall loop if the resource is expected to frequently
1507 // be dynamically uniform.
1508 if (RSrcBank
!= &AMDGPU::SGPRRegBank
) {
1509 // Remove the original instruction to avoid potentially confusing the
1510 // waterfall loop logic.
1511 B
.setInstr(*Span
.begin());
1512 MI
.eraseFromParent();
1514 SmallSet
<Register
, 4> OpsToWaterfall
;
1516 OpsToWaterfall
.insert(RSrc
);
1517 executeInWaterfallLoop(B
, make_range(Span
.begin(), Span
.end()),
1518 OpsToWaterfall
, MRI
);
1521 if (NumLoads
!= 1) {
1523 B
.buildConcatVectors(Dst
, LoadParts
);
1525 B
.buildMerge(Dst
, LoadParts
);
1528 // We removed the instruction earlier with a waterfall loop.
1529 if (RSrcBank
== &AMDGPU::SGPRRegBank
)
1530 MI
.eraseFromParent();
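
// Apply the mapping for a bitfield extract (generic opcode or the sbfe/ubfe
// intrinsic form). 64-bit VGPR extracts are expanded with shifts and 32-bit
// BFE instructions, while the scalar form packs the offset and width into the
// single operand expected by S_BFE.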
1535 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper
&OpdMapper
,
1536 bool Signed
) const {
1537 MachineInstr
&MI
= OpdMapper
.getMI();
1538 MachineRegisterInfo
&MRI
= OpdMapper
.getMRI();
1540 // Insert basic copies
1541 applyDefaultMapping(OpdMapper
);
1543 Register DstReg
= MI
.getOperand(0).getReg();
1544 LLT Ty
= MRI
.getType(DstReg
);
1546 const LLT S32
= LLT::scalar(32);
1548 unsigned FirstOpnd
= MI
.getOpcode() == AMDGPU::G_INTRINSIC
? 2 : 1;
1549 Register SrcReg
= MI
.getOperand(FirstOpnd
).getReg();
1550 Register OffsetReg
= MI
.getOperand(FirstOpnd
+ 1).getReg();
1551 Register WidthReg
= MI
.getOperand(FirstOpnd
+ 2).getReg();
1553 const RegisterBank
*DstBank
=
1554 OpdMapper
.getInstrMapping().getOperandMapping(0).BreakDown
[0].RegBank
;
1555 if (DstBank
== &AMDGPU::VGPRRegBank
) {
    // There are no 64-bit vgpr bitfield extract instructions, so the operation
    // is expanded to a sequence of instructions that implement the operation.
1561 ApplyRegBankMapping
ApplyBank(*this, MRI
, &AMDGPU::VGPRRegBank
);
1562 MachineIRBuilder
B(MI
, ApplyBank
);
1564 const LLT S64
= LLT::scalar(64);
1565 // Shift the source operand so that extracted bits start at bit 0.
1566 auto ShiftOffset
= Signed
? B
.buildAShr(S64
, SrcReg
, OffsetReg
)
1567 : B
.buildLShr(S64
, SrcReg
, OffsetReg
);
1568 auto UnmergeSOffset
= B
.buildUnmerge({S32
, S32
}, ShiftOffset
);
1570 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1571 // if the width is a constant.
1572 if (auto ConstWidth
= getConstantVRegValWithLookThrough(WidthReg
, MRI
)) {
1573 // Use the 32-bit bitfield extract instruction if the width is a constant.
1574 // Depending on the width size, use either the low or high 32-bits.
1575 auto Zero
= B
.buildConstant(S32
, 0);
1576 auto WidthImm
= ConstWidth
->Value
.getZExtValue();
1577 if (WidthImm
<= 32) {
1578 // Use bitfield extract on the lower 32-bit source, and then sign-extend
1579 // or clear the upper 32-bits.
1581 Signed
? B
.buildSbfx(S32
, UnmergeSOffset
.getReg(0), Zero
, WidthReg
)
1582 : B
.buildUbfx(S32
, UnmergeSOffset
.getReg(0), Zero
, WidthReg
);
1584 Signed
? B
.buildAShr(S32
, Extract
, B
.buildConstant(S32
, 31)) : Zero
;
1585 B
.buildMerge(DstReg
, {Extract
, Extend
});
1587 // Use bitfield extract on upper 32-bit source, and combine with lower
1589 auto UpperWidth
= B
.buildConstant(S32
, WidthImm
- 32);
1592 ? B
.buildSbfx(S32
, UnmergeSOffset
.getReg(1), Zero
, UpperWidth
)
1593 : B
.buildUbfx(S32
, UnmergeSOffset
.getReg(1), Zero
, UpperWidth
);
1594 B
.buildMerge(DstReg
, {UnmergeSOffset
.getReg(0), Extract
});
1596 MI
.eraseFromParent();
1600 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1602 auto ExtShift
= B
.buildSub(S32
, B
.buildConstant(S32
, 64), WidthReg
);
1603 auto SignBit
= B
.buildShl(S64
, ShiftOffset
, ExtShift
);
1605 B
.buildAShr(S64
, SignBit
, ExtShift
);
1607 B
.buildLShr(S64
, SignBit
, ExtShift
);
1608 MI
.eraseFromParent();
1612 // The scalar form packs the offset and width in a single operand.
1614 ApplyRegBankMapping
ApplyBank(*this, MRI
, &AMDGPU::SGPRRegBank
);
1615 MachineIRBuilder
B(MI
, ApplyBank
);
1617 // Ensure the high bits are clear to insert the offset.
1618 auto OffsetMask
= B
.buildConstant(S32
, maskTrailingOnes
<unsigned>(6));
1619 auto ClampOffset
= B
.buildAnd(S32
, OffsetReg
, OffsetMask
);
1621 // Zeros out the low bits, so don't bother clamping the input value.
1622 auto ShiftWidth
= B
.buildShl(S32
, WidthReg
, B
.buildConstant(S32
, 16));
  // Pack the offset and width of a BFE into the format expected by
  // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] contain
  // the offset and bits [22:16] the width.
1627 auto MergedInputs
= B
.buildOr(S32
, ClampOffset
, ShiftWidth
);
1629 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1630 // register class constraints.
1631 unsigned Opc
= Ty
== S32
? (Signed
? AMDGPU::S_BFE_I32
: AMDGPU::S_BFE_U32
) :
1632 (Signed
? AMDGPU::S_BFE_I64
: AMDGPU::S_BFE_U64
);
1634 auto MIB
= B
.buildInstr(Opc
, {DstReg
}, {SrcReg
, MergedInputs
});
1635 if (!constrainSelectedInstRegOperands(*MIB
, *TII
, *TRI
, *this))
1636 llvm_unreachable("failed to constrain BFE");
1638 MI
.eraseFromParent();
// Return a suitable opcode for extending the operands of Opc when widening.
static unsigned getExtendOp(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
    return TargetOpcode::G_SEXT;
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
    return TargetOpcode::G_ZEXT;
  default:
    return TargetOpcode::G_ANYEXT;
  }
}
// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
// any illegal vector extend or unmerge operations.
static std::pair<Register, Register>
unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
  const LLT S32 = LLT::scalar(32);
  auto Bitcast = B.buildBitcast(S32, Src);

  if (ExtOpcode == TargetOpcode::G_SEXT) {
    auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
    auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
    return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
  }

  auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
  if (ExtOpcode == TargetOpcode::G_ZEXT) {
    auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
    return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
  }

  assert(ExtOpcode == TargetOpcode::G_ANYEXT);
  return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
}
// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand.
static bool substituteSimpleCopyRegs(
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
  if (!SrcReg.empty()) {
    assert(SrcReg.size() == 1);
    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
    return true;
  }

  return false;
}
/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register Reg) const {
  if (!Subtarget.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  LLT StoreVT = MRI.getType(Reg);
  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
    return Reg;

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(Unmerge.getReg(I));

  const LLT S32 = LLT::scalar(32);
  int NumElts = StoreVT.getNumElements();

  return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
}
static std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
  int64_t Const;
  if (mi_match(Reg, MRI, m_ICst(Const)))
    return std::make_pair(Register(), Const);

  Register Base;
  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
    return std::make_pair(Base, Const);

  // TODO: Handle G_OR used for add case.
  return std::make_pair(Reg, 0);
}
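
/// Split a buffer offset into a register part and an immediate part that fits
/// in the MUBUF instruction's 12-bit immoffset field (hence the 4095 limit
/// below).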
std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);

  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
                                                           OrigOffset);

  unsigned C1 = 0;
  if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, put the value
    // and -4096 into the immoffset field so that the value that is copied/added
    // for the voffset field is a multiple of 4096, and it stands more chance
    // of being CSEd with the copy/add for another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }

    C1 = ImmOffset;
    if (Overflow != 0) {
      if (!BaseReg)
        BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      else {
        auto OverflowVal = B.buildConstant(S32, Overflow);
        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
      }
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return {BaseReg, C1};
}
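
// Helpers for decoding the packed cache-policy immediate carried by the buffer
// intrinsics: extractCPol keeps the CPol bits, and extractSWZ reads bit 3 as
// the swizzle enable.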
static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
  int64_t C;
  return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
}

static unsigned extractCPol(unsigned CachePolicy) {
  return CachePolicy & AMDGPU::CPol::ALL;
}

static unsigned extractSWZ(unsigned CachePolicy) {
  return (CachePolicy >> 3) & 1;
}
MachineInstr *
AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
                                             MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  executeInWaterfallLoop(B, MI, MRI, {2, 4});

  // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.

  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  int EltSize = Ty.getScalarSizeInBits();
  int Size = Ty.getSizeInBits();

  // FIXME: Broken integer truncstore.
  if (EltSize != 32)
    report_fatal_error("unhandled intrinsic store");

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  const int MemSize = (*MI.memoperands_begin())->getSize();

  Register RSrc = MI.getOperand(2).getReg();
  Register VOffset = MI.getOperand(3).getReg();
  Register SOffset = MI.getOperand(4).getReg();
  unsigned CachePolicy = MI.getOperand(5).getImm();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  const bool Offen = !isZero(VOffset, MRI);

  unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
  switch (8 * MemSize) {
  case 8:
    Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
    break;
  case 16:
    Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
    break;
  default:
    Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
    if (Size > 32)
      Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
    break;
  }

  // Set the insertion point back to the instruction in case it was moved into a
  // loop.
  B.setInstr(MI);

  MachineInstrBuilder MIB = B.buildInstr(Opc)
    .addUse(VData);

  if (Offen)
    MIB.addUse(VOffset);

  MIB.addUse(RSrc)
     .addUse(SOffset)
     .addImm(ImmOffset)
     .addImm(extractCPol(CachePolicy))
     .addImm(0) // tfe: FIXME: Remove from inst
     .addImm(extractSWZ(CachePolicy))
     .cloneMemRefs(MI);

  // FIXME: We need a way to report failure from applyMappingImpl.
  // Insert constrain copies before inserting the loop.
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    report_fatal_error("failed to constrain selected store intrinsic");

  return MIB;
}
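
// Copy an SGPR value into VGPRs using explicit V_MOV_B32s so the EXEC mask
// dependency is visible; 64-bit sources are moved half by half and recombined
// with a REG_SEQUENCE.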
bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
                                        Register SrcReg) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.getSizeInBits() == 32) {
    // Use a v_mov_b32 here to make the exec dependency explicit.
    B.buildInstr(AMDGPU::V_MOV_B32_e32)
      .addDef(DstReg)
      .addUse(SrcReg);
    return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
           constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
  }

  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  B.buildInstr(AMDGPU::V_MOV_B32_e32)
    .addDef(TmpReg0)
    .addUse(SrcReg, 0, AMDGPU::sub0);
  B.buildInstr(AMDGPU::V_MOV_B32_e32)
    .addDef(TmpReg1)
    .addUse(SrcReg, 0, AMDGPU::sub1);
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(DstReg)
    .addUse(TmpReg0)
    .addImm(AMDGPU::sub0)
    .addUse(TmpReg1)
    .addImm(AMDGPU::sub1);

  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
         constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
}
/// Utility function for pushing dynamic vector indexes with a constant offset
/// into waterfall loops.
static void reinsertVectorIndexAdd(MachineIRBuilder &B,
                                   MachineInstr &IdxUseInstr,
                                   unsigned OpIdx,
                                   unsigned ConstOffset) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S32 = LLT::scalar(32);
  Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
  B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());

  auto MaterializedOffset = B.buildConstant(S32, ConstOffset);

  auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
  MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
  MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
  IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
}
/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
/// original 32-bit source value (to be inserted in the low part of the combined
/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
/// result.
static void extendLow32IntoHigh32(MachineIRBuilder &B,
                                  Register Hi32Reg, Register Lo32Reg,
                                  unsigned ExtOpc,
                                  const RegisterBank &RegBank,
                                  bool IsBooleanSrc = false) {
  if (ExtOpc == AMDGPU::G_ZEXT) {
    B.buildConstant(Hi32Reg, 0);
  } else if (ExtOpc == AMDGPU::G_SEXT) {
    if (IsBooleanSrc) {
      // If we know the original source was an s1, the high half is the same as
      // the low half.
      B.buildCopy(Hi32Reg, Lo32Reg);
    } else {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
      B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
      B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
    }
  } else {
    assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
    B.buildUndef(Hi32Reg);
  }
}
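
// Lower a dynamic G_EXTRACT_VECTOR_ELT over a small vector into a chain of
// compare + select on each element. When shouldExpandVectorDynExt says this is
// profitable, it avoids a waterfall loop over the index.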
bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  const OperandsMapper &OpdMapper) const {

  Register VecReg = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(2).getReg();

  const RegisterBank &IdxBank =
    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;

  LLT VecTy = MRI.getType(VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx))
    return false;

  MachineIRBuilder B(MI);
  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank =
    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

  const RegisterBank &CCBank =
    (DstBank == AMDGPU::SGPRRegBank &&
     SrcBank == AMDGPU::SGPRRegBank &&
     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                     : AMDGPU::VCCRegBank;
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);

  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
  }

  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
  unsigned NumLanes = DstRegs.size();
  if (!NumLanes)
    NumLanes = 1;
  else
    EltTy = MRI.getType(DstRegs[0]);

  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
  SmallVector<Register, 2> Res(NumLanes);
  for (unsigned L = 0; L < NumLanes; ++L)
    Res[L] = UnmergeToEltTy.getReg(L);

  for (unsigned I = 1; I < NumElem; ++I) {
    auto IC = B.buildConstant(S32, I);
    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      auto S = B.buildSelect(EltTy, Cmp,
                             UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);

      for (unsigned N : { 0, 2, 3 })
        MRI.setRegBank(S->getOperand(N).getReg(), DstBank);

      Res[L] = S->getOperand(0).getReg();
    }
  }

  for (unsigned L = 0; L < NumLanes; ++L) {
    Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
    B.buildCopy(DstReg, Res[L]);
    MRI.setRegBank(DstReg, DstBank);
  }

  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
  MI.eraseFromParent();

  return true;
}
// Insert a cross regbank copy for a register if it already has a bank that
// differs from the one we want to set.
static Register constrainRegToBank(MachineRegisterInfo &MRI,
                                   MachineIRBuilder &B, Register &Reg,
                                   const RegisterBank &Bank) {
  const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
  if (CurrBank && *CurrBank != Bank) {
    Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
    MRI.setRegBank(Copy, Bank);
    return Copy;
  }

  MRI.setRegBank(Reg, Bank);
  return Reg;
}
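
// Same idea as foldExtractEltToCmpSelect, but for G_INSERT_VECTOR_ELT: select
// between the original element and the inserted value for each index, then
// rebuild the vector.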
bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  const OperandsMapper &OpdMapper) const {

  Register VecReg = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(3).getReg();

  const RegisterBank &IdxBank =
    *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;

  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;

  LLT VecTy = MRI.getType(VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx))
    return false;

  MachineIRBuilder B(MI);
  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank =
    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank &InsBank =
    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  const RegisterBank &CCBank =
    (DstBank == AMDGPU::SGPRRegBank &&
     SrcBank == AMDGPU::SGPRRegBank &&
     InsBank == AMDGPU::SGPRRegBank &&
     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                     : AMDGPU::VCCRegBank;
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);

  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
  }

  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
  unsigned NumLanes = InsRegs.size();
  if (!NumLanes) {
    NumLanes = 1;
    InsRegs.push_back(MI.getOperand(2).getReg());
  } else {
    EltTy = MRI.getType(InsRegs[0]);
  }

  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
  SmallVector<Register, 16> Ops(NumElem * NumLanes);

  for (unsigned I = 0; I < NumElem; ++I) {
    auto IC = B.buildConstant(S32, I);
    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
      Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
      Op1 = constrainRegToBank(MRI, B, Op1, DstBank);

      Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
      MRI.setRegBank(Select, DstBank);

      Ops[I * NumLanes + L] = Select;
    }
  }

  LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
  if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
    B.buildBuildVector(MI.getOperand(0), Ops);
  } else {
    auto Vec = B.buildBuildVector(MergeTy, Ops);
    MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
    B.buildBitcast(MI.getOperand(0).getReg(), Vec);
  }

  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
  MI.eraseFromParent();

  return true;
}
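
// Apply the non-trivial parts of the chosen mapping: widen boolean results for
// the SALU, split 64-bit SALU-only operations, and wrap operands that must be
// uniform in waterfall loops.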
void AMDGPURegisterBankInfo::applyMappingImpl(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  unsigned Opc = MI.getOpcode();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  switch (Opc) {
  case AMDGPU::G_PHI: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(1))
      break;

    const LLT S32 = LLT::scalar(32);
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VCCRegBank) {
      applyDefaultMapping(OpdMapper);
      // The standard handling only considers the result register bank for
      // phis. For VCC, blindly inserting a copy when the phi is lowered will
      // produce an invalid copy. We can only copy with some kind of compare to
      // get a vector boolean result. Insert a register bank copy that will be
      // correctly lowered to a compare.
      MachineIRBuilder B(*MI.getParent()->getParent());

      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
        Register SrcReg = MI.getOperand(I).getReg();
        const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);

        if (SrcBank != &AMDGPU::VCCRegBank) {
          MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
          B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());

          auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
          MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
          MI.getOperand(I).setReg(Copy.getReg(0));
        }
      }

      return;
    }

    // Phi handling is strange and only considers the bank of the destination.
    substituteSimpleCopyRegs(OpdMapper, 0);

    // Promote SGPR/VGPR booleans to s32.
    MachineFunction *MF = MI.getParent()->getParent();
    ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
    MachineIRBuilder B(MI, ApplyBank);
    LegalizerHelper Helper(*MF, ApplyBank, B);

    if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("widen scalar should have succeeded");

    return;
  }
  case AMDGPU::G_ICMP:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE: {
    unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
    Register DstReg = MI.getOperand(BoolDstOp).getReg();

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank != &AMDGPU::SGPRRegBank)
      break;

    const bool HasCarryIn = MI.getNumOperands() == 5;

    // If this is a scalar compare, promote the result to s32, as the selection
    // will end up using a copy to a 32-bit vreg.
    const LLT S32 = LLT::scalar(32);
    Register NewDstReg = MRI.createGenericVirtualRegister(S32);
    MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
    MI.getOperand(BoolDstOp).setReg(NewDstReg);
    MachineIRBuilder B(MI);

    if (HasCarryIn) {
      Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
      B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
      MI.getOperand(4).setReg(NewSrcReg);
    }

    MachineBasicBlock *MBB = MI.getParent();
    B.setInsertPt(*MBB, std::next(MI.getIterator()));

    // If we had a constrained VCC result register, a copy was inserted to VCC
    // from SGPR.
    SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
    if (DefRegs.empty())
      DefRegs.push_back(DstReg);
    B.buildTrunc(DefRegs[0], NewDstReg);
    return;
  }
  case AMDGPU::G_SELECT: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
    if (CondRegs.empty())
      CondRegs.push_back(MI.getOperand(1).getReg());
    else {
      assert(CondRegs.size() == 1);
    }

    const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
    if (CondBank == &AMDGPU::SGPRRegBank) {
      MachineIRBuilder B(MI);
      const LLT S32 = LLT::scalar(32);
      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);

      MI.getOperand(1).setReg(NewCondReg);
      B.buildZExt(NewCondReg, CondRegs[0]);
    }

    if (DstTy.getSizeInBits() != 64)
      break;

    MachineIRBuilder B(MI);
    LLT HalfTy = getHalfSizedType(DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src1Regs.empty() && Src2Regs.empty());
      break;
    }

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    if (Src2Regs.empty())
      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
    else
      setRegsToType(MRI, Src2Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
    B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_BRCOND: {
    Register CondReg = MI.getOperand(0).getReg();
    // FIXME: Should use legalizer helper, but should change bool ext type.
    const RegisterBank *CondBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    if (CondBank == &AMDGPU::SGPRRegBank) {
      MachineIRBuilder B(MI);
      const LLT S32 = LLT::scalar(32);
      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);

      MI.getOperand(0).setReg(NewCondReg);
      B.buildZExt(NewCondReg, CondReg);
    }

    return;
  }
  case AMDGPU::G_XOR: {
    // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
    // there is a VGPR input.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    if (DstTy.getSizeInBits() == 1) {
      const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
      if (DstBank == &AMDGPU::VCCRegBank)
        break;

      MachineFunction *MF = MI.getParent()->getParent();
      ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
      MachineIRBuilder B(MI, ApplyBank);
      LegalizerHelper Helper(*MF, ApplyBank, B);

      if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
          LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");
      return;
    }

    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);
    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src0Regs.empty() && Src1Regs.empty());
      break;
    }

    assert(DefRegs.size() == 2);
    assert(Src0Regs.size() == Src1Regs.size() &&
           (Src0Regs.empty() || Src0Regs.size() == 2));

    // Depending on where the source registers came from, the generic code may
    // have decided to split the inputs already or not. If not, we still need to
    // extract the values.
    MachineIRBuilder B(MI);

    if (Src0Regs.empty())
      split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
    else
      setRegsToType(MRI, Src0Regs, HalfTy);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
    B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_ABS: {
    Register SrcReg = MI.getOperand(1).getReg();
    const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);

    // There is no VALU abs instruction so we need to replace it with a sub and
    // max combination.
    if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
      MachineFunction *MF = MI.getParent()->getParent();
      ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
      MachineIRBuilder B(MI, Apply);
      LegalizerHelper Helper(*MF, Apply, B);

      if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
        llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
      return;
    }
    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    // Packed 16-bit operations need to be scalarized and promoted.
    if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
      break;

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    const LLT S32 = LLT::scalar(32);
    MachineBasicBlock *MBB = MI.getParent();
    MachineFunction *MF = MBB->getParent();
    ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
    MachineIRBuilder B(MI, ApplySALU);

    if (DstTy.isVector()) {
      Register WideSrc0Lo, WideSrc0Hi;
      Register WideSrc1Lo, WideSrc1Hi;

      unsigned ExtendOp = getExtendOp(MI.getOpcode());
      std::tie(WideSrc0Lo, WideSrc0Hi)
        = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
      std::tie(WideSrc1Lo, WideSrc1Hi)
        = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
      auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
      auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
      B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
      MI.eraseFromParent();
    } else {
      LegalizerHelper Helper(*MF, ApplySALU, B);

      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");

      // FIXME: s16 shift amounts should be legal.
      if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
          Opc == AMDGPU::G_ASHR) {
        B.setInsertPt(*MBB, MI.getIterator());
        if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
          llvm_unreachable("widen scalar should have succeeded");
      }
    }

    return;
  }
  case AMDGPU::G_SEXT_INREG: {
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    if (SrcRegs.empty())
      break; // Nothing to repair

    const LLT S32 = LLT::scalar(32);
    MachineIRBuilder B(MI);
    ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
    GISelObserverWrapper Observer(&O);
    B.setChangeObserver(Observer);

    // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
    // we would need to further expand, and doesn't let us directly set the
    // result registers.
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    int Amt = MI.getOperand(2).getImm();
    if (Amt <= 32) {
      if (Amt == 32) {
        // The low bits are unchanged.
        B.buildCopy(DstRegs[0], SrcRegs[0]);
      } else {
        // Extend in the low bits and propagate the sign bit to the high half.
        B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
      }

      B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
    } else {
      // The low bits are unchanged, and extend in the high bits.
      B.buildCopy(DstRegs[0], SrcRegs[0]);
      B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
    }

    Register DstReg = MI.getOperand(0).getReg();
    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
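  // 64-bit G_CTPOP / G_BITREVERSE selected onto the VALU are narrowed to two
  // 32-bit pieces by the LegalizerHelper below.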
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BITREVERSE: {
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyVALU);

    MachineFunction &MF = B.getMF();
    LegalizerHelper Helper(MF, ApplyVALU, B);

    if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("narrowScalar should have succeeded");
    return;
  }
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FFBL_B32:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    // We can narrow this more efficiently than Helper can by using ffbh/ffbl
    // which return -1 when the input is zero:
    // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
    // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
    // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
    // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
    ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyVALU);
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
                          ? AMDGPU::G_AMDGPU_FFBH_U32
                          : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
                                ? AMDGPU::G_AMDGPU_FFBL_B32
                                : Opc;
    unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
    auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
    auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
    unsigned AddOpc =
        Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
            ? AMDGPU::G_ADD
            : AMDGPU::G_UADDSAT;
    Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
    Register DstReg = MI.getOperand(0).getReg();
    B.buildUMin(DstReg, X, Y);
    MI.eraseFromParent();
    return;
  }
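  // Extensions of values no wider than 32 bits to 64 bits are split into an
  // extension of the low half plus extendLow32IntoHigh32 for the high half;
  // s1 sources in the VCC bank are lowered to selects instead.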
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_ANYEXT: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    const bool Signed = Opc == AMDGPU::G_SEXT;

    assert(empty(OpdMapper.getVRegs(1)));

    MachineIRBuilder B(MI);
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that round to s64 when irregular
        // breakdowns supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      // Extend to 32-bit, and then extend the low half.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(DefRegs[0], SrcReg);
      } else if (Opc == AMDGPU::G_ZEXT) {
        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
      } else {
        B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
      }

      extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
      MRI.setRegBank(DstReg, *SrcBank);
      MI.eraseFromParent();
      return;
    }

    if (SrcTy != LLT::scalar(1))
      return;

    // It is not legal to have a legalization artifact with a VCC source. Rather
    // than introducing a copy, insert the select we would have to select the
    // copy to.
    if (SrcBank == &AMDGPU::VCCRegBank) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;

      unsigned DstSize = DstTy.getSizeInBits();
      // 64-bit select is SGPR only.
      const bool UseSel64 = DstSize > 32 &&
        SrcBank->getID() == AMDGPU::SGPRRegBankID;

      // TODO: Should s16 select be legal?
      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
      auto True = B.buildConstant(SelType, Signed ? -1 : 1);
      auto False = B.buildConstant(SelType, 0);

      MRI.setRegBank(True.getReg(0), *DstBank);
      MRI.setRegBank(False.getReg(0), *DstBank);
      MRI.setRegBank(DstReg, *DstBank);

      if (DstSize > 32) {
        B.buildSelect(DefRegs[0], SrcReg, True, False);
        extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
      } else if (DstSize < 32) {
        auto Sel = B.buildSelect(SelType, SrcReg, True, False);
        MRI.setRegBank(Sel.getReg(0), *DstBank);
        B.buildTrunc(DstReg, Sel);
      } else {
        B.buildSelect(DstReg, SrcReg, True, False);
      }

      MI.eraseFromParent();
      return;
    }

    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::fixed_vector(2, 16))
      break;

    assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
    substituteSimpleCopyRegs(OpdMapper, 1);
    substituteSimpleCopyRegs(OpdMapper, 2);

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break; // Can use S_PACK_* instructions.

    MachineIRBuilder B(MI);

    Register Lo = MI.getOperand(1).getReg();
    Register Hi = MI.getOperand(2).getReg();
    const LLT S32 = LLT::scalar(32);

    const RegisterBank *BankLo =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *BankHi =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    Register ZextLo;
    Register ShiftHi;

    if (Opc == AMDGPU::G_BUILD_VECTOR) {
      ZextLo = B.buildZExt(S32, Lo).getReg(0);
      MRI.setRegBank(ZextLo, *BankLo);

      Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
      MRI.setRegBank(ZextHi, *BankHi);

      auto ShiftAmt = B.buildConstant(S32, 16);
      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);

      ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
      MRI.setRegBank(ShiftHi, *BankHi);
    } else {
      Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
      MRI.setRegBank(MaskLo, *BankLo);

      auto ShiftAmt = B.buildConstant(S32, 16);
      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);

      ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
      MRI.setRegBank(ShiftHi, *BankHi);

      ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
      MRI.setRegBank(ZextLo, *BankLo);
    }

    auto Or = B.buildOr(S32, ZextLo, ShiftHi);
    MRI.setRegBank(Or.getReg(0), *DstBank);

    B.buildBitcast(DstReg, Or);
    MI.eraseFromParent();
    return;
  }
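  // Dynamic indexing of 64-bit elements is done on a bitcast <2N x s32> vector,
  // extracting the two 32-bit halves separately; a divergent index still needs
  // a waterfall loop around the new instructions.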
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());

    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();

    const LLT S32 = LLT::scalar(32);
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);

    if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
      return;

    MachineIRBuilder B(MI);

    const ValueMapping &DstMapping
      = OpdMapper.getInstrMapping().getOperandMapping(0);
    const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *IdxBank =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(BaseIdxReg, ConstOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());

    // See if the index is an add of a constant which will be foldable by moving
    // the base register of the index later if this is going to be executed in a
    // waterfall loop. This is essentially to reassociate the add of a constant
    // with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
                                   ConstOffset > 0 &&
                                   ConstOffset < SrcTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(2).setReg(BaseIdxReg);

    // If this is a VGPR result only because the index was a VGPR result, the
    // actual indexing will be done on the SGPR source vector, which will
    // produce a scalar result. We need to copy to the VGPR result inside the
    // waterfall loop.
    const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
                                SrcBank == &AMDGPU::SGPRRegBank;
    if (DstRegs.empty()) {
      applyDefaultMapping(OpdMapper);

      executeInWaterfallLoop(MI, MRI, { 2 });

      if (NeedCopyToVGPR) {
        // We don't want a phi for this temporary reg.
        Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
        MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
        MI.getOperand(0).setReg(TmpReg);
        B.setInsertPt(*MI.getParent(), ++MI.getIterator());

        // Use a v_mov_b32 here to make the exec dependency explicit.
        buildVCopy(B, DstReg, TmpReg);
      }

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop)
        reinsertVectorIndexAdd(B, MI, 2, ConstOffset);

      return;
    }

    assert(DstTy.getSizeInBits() == 64);

    LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);

    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
    auto One = B.buildConstant(S32, 1);

    MachineBasicBlock::iterator MII = MI.getIterator();

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MII, &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
    auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);

    MRI.setRegBank(DstReg, *DstBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
      MI.eraseFromParent();
      return;
    }

    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);

    if (NeedCopyToVGPR) {
      MachineBasicBlock *LoopBB = Extract1->getParent();
      Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
      Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
      MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);

      Extract0->getOperand(0).setReg(TmpReg0);
      Extract1->getOperand(0).setReg(TmpReg1);

      B.setInsertPt(*LoopBB, ++Extract1->getIterator());

      buildVCopy(B, DstRegs[0], TmpReg0);
      buildVCopy(B, DstRegs[1], TmpReg1);
    }

    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);

    return;
  }
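  // G_INSERT_VECTOR_ELT uses the same trick: bitcast to <2N x s32>, insert both
  // halves, and keep the final bitcast back to the original type outside the
  // waterfall loop.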
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));

    Register DstReg = MI.getOperand(0).getReg();
    LLT VecTy = MRI.getType(DstReg);

    assert(OpdMapper.getVRegs(0).empty());
    assert(OpdMapper.getVRegs(3).empty());

    if (substituteSimpleCopyRegs(OpdMapper, 1))
      MRI.setType(MI.getOperand(1).getReg(), VecTy);

    if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
      return;

    const RegisterBank *IdxBank =
      OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;

    Register SrcReg = MI.getOperand(1).getReg();
    Register InsReg = MI.getOperand(2).getReg();
    LLT InsTy = MRI.getType(InsReg);

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(BaseIdxReg, ConstOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());

    // See if the index is an add of a constant which will be foldable by moving
    // the base register of the index later if this is going to be executed in a
    // waterfall loop. This is essentially to reassociate the add of a constant
    // with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
                                   ConstOffset > 0 &&
                                   ConstOffset < VecTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(3).setReg(BaseIdxReg);

    if (InsRegs.empty()) {
      executeInWaterfallLoop(MI, MRI, { 3 });

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop) {
        MachineIRBuilder B(MI);
        reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
      }

      return;
    }

    assert(InsTy.getSizeInBits() == 64);

    const LLT S32 = LLT::scalar(32);
    LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);

    MachineIRBuilder B(MI);
    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
    auto One = B.buildConstant(S32, 1);

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
    auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *InsSrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    MRI.setRegBank(InsReg, *InsSrcBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(InsLo.getReg(0), *DstBank);
    MRI.setRegBank(InsHi.getReg(0), *DstBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
      B.setInsertPt(B.getMBB(), MI);
      B.buildBitcast(DstReg, InsHi);
      MI.eraseFromParent();
      return;
    }

    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    // Figure out the point after the waterfall loop before mangling the control
    // flow.
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);

    // The insertion point is now right after the original instruction.
    //
    // Keep the bitcast to the original vector type out of the loop. Doing this
    // saved an extra phi we don't need inside the loop.
    B.buildBitcast(DstReg, InsHi);

    // Re-insert the constant offset add inside the waterfall loop.
    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);

    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {1, 4});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {2, 5});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {2, 5});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {3, 6});
    return;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
    applyMappingSBufferLoad(OpdMapper);
    return;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    case Intrinsic::amdgcn_readlane: {
      substituteSimpleCopyRegs(OpdMapper, 2);

      assert(OpdMapper.getVRegs(0).empty());
      assert(OpdMapper.getVRegs(3).empty());

      // Make sure the index is an SGPR. It doesn't make sense to run this in a
      // waterfall loop, so assume it's a uniform value.
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_writelane: {
      assert(OpdMapper.getVRegs(0).empty());
      assert(OpdMapper.getVRegs(2).empty());
      assert(OpdMapper.getVRegs(3).empty());

      substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
      constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      applyDefaultMapping(OpdMapper);

      // Readlane for m0 value, which is always the last operand.
      // FIXME: Should this be a waterfall loop instead?
      constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
      return;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      // Doing a waterfall loop over these wouldn't make any sense.
      substituteSimpleCopyRegs(OpdMapper, 2);
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 4);
      constrainOpWithReadfirstlane(MI, MRI, 5);
      return;
    }
    case Intrinsic::amdgcn_sbfe:
      applyMappingBFE(OpdMapper, true);
      return;
    case Intrinsic::amdgcn_ubfe:
      applyMappingBFE(OpdMapper, false);
      return;
    case Intrinsic::amdgcn_ballot:
      // Use default handling and insert copy to vcc source.
      break;
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
    const AMDGPU::RsrcIntrinsic *RSrcIntrin
      = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
    assert(RSrcIntrin && RSrcIntrin->IsImage);
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
    return;
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, { N });
    return;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
      assert(OpdMapper.getVRegs(0).empty());
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Only the first lane executes, so readfirstlane is safe.
      substituteSimpleCopyRegs(OpdMapper, 1);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // Only the first lane executes, so readfirstlane is safe.
      constrainOpWithReadfirstlane(MI, MRI, 1); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // FIXME: Should this use a waterfall loop?
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_setreg: {
      constrainOpWithReadfirstlane(MI, MRI, 2);
      return;
    }
    default: {
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
        if (RSrcIntrin->IsImage) {
          applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
          return;
        }
      }

      break;
    }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD: {
    if (applyMappingLoad(MI, OpdMapper, MRI))
      return;
    break;
  }
  case AMDGPU::G_DYN_STACKALLOC:
    applyMappingDynStackAlloc(MI, OpdMapper, MRI);
    return;
  case AMDGPU::G_SBFX:
    applyMappingBFE(OpdMapper, /*Signed*/ true);
    return;
  case AMDGPU::G_UBFX:
    applyMappingBFE(OpdMapper, /*Signed*/ false);
    return;
  default:
    break;
  }

  return applyDefaultMapping(OpdMapper);
}
// vgpr, sgpr -> vgpr
// vgpr, agpr -> vgpr
// agpr, agpr -> agpr
// agpr, sgpr -> vgpr
static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
  if (RB0 == AMDGPU::InvalidRegBankID)
    return RB1;
  if (RB1 == AMDGPU::InvalidRegBankID)
    return RB0;

  if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
    return AMDGPU::SGPRRegBankID;

  if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
    return AMDGPU::AGPRRegBankID;

  return AMDGPU::VGPRRegBankID;
}

static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
  if (RB0 == AMDGPU::InvalidRegBankID)
    return RB1;
  if (RB1 == AMDGPU::InvalidRegBankID)
    return RB0;

  if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
    return AMDGPU::VCCRegBankID;

  // vcc, vgpr -> vgpr
  return regBankUnion(RB0, RB1);
}
unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
                                                const MachineInstr &MI) const {
  unsigned RegBank = AMDGPU::InvalidRegBankID;

  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    if (!MI.getOperand(i).isReg())
      continue;
    Register Reg = MI.getOperand(i).getReg();
    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
      RegBank = regBankUnion(RegBank, Bank->getID());
      if (RegBank == AMDGPU::VGPRRegBankID)
        break;
    }
  }

  return RegBank;
}
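
// An instruction can use a pure SALU mapping only if every register operand is
// already assigned to the SGPR bank.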
bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    if (!MI.getOperand(i).isReg())
      continue;
    Register Reg = MI.getOperand(i).getReg();
    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
      if (Bank->getID() != AMDGPU::SGPRRegBankID)
        return false;
    }
  }
  return true;
}
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    const MachineOperand &SrcOp = MI.getOperand(i);
    if (!SrcOp.isReg())
      continue;

    unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
    OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
  }
  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());

  // Even though we technically could use SGPRs, this would require knowledge of
  // the constant bus restriction. Force all sources to VGPR (except for VCC).
  //
  // TODO: Unary ops are trivially OK, so accept SGPRs?
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    const MachineOperand &Src = MI.getOperand(i);
    if (!Src.isReg())
      continue;

    unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
    OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Op = MI.getOperand(I);
    if (!Op.isReg())
      continue;

    unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
                                        const MachineInstr &MI,
                                        int RsrcIdx) const {
  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += MI.getNumExplicitDefs() + 1;

  const int NumOps = MI.getNumOperands();
  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);

  // TODO: Should packed/unpacked D16 difference be reported here as part of
  // the value mapping?
  for (int I = 0; I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    Register OpReg = MI.getOperand(I).getReg();
    // We replace some dead address operands with $noreg.
    if (!OpReg)
      continue;

    unsigned Size = getSizeInBits(OpReg, MRI, *TRI);

    // FIXME: Probably need a new intrinsic register bank searchable table to
    // handle arbitrary intrinsics easily.
    //
    // If this has a sampler, it immediately follows rsrc.
    const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;

    if (MustBeSGPR) {
      // This must be an SGPR, so report whatever bank it currently has as
      // legal.
      unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
    } else {
      // Some operands must be VGPR, and these are easy to copy to.
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    }
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}
3334 /// Return the mapping for a pointer arugment.
3335 const RegisterBankInfo::ValueMapping
*
3336 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo
&MRI
,
3337 Register PtrReg
) const {
3338 LLT PtrTy
= MRI
.getType(PtrReg
);
3339 unsigned Size
= PtrTy
.getSizeInBits();
3340 if (Subtarget
.useFlatForGlobal() ||
3341 !AMDGPU::isFlatGlobalAddrSpace(PtrTy
.getAddressSpace()))
3342 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
);
3344 // If we're using MUBUF instructions for global memory, an SGPR base register
3345 // is possible. Otherwise this needs to be a VGPR.
3346 const RegisterBank
*PtrBank
= getRegBank(PtrReg
, MRI
, *TRI
);
3347 return AMDGPU::getValueMapping(PtrBank
->getID(), Size
);
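/// Choose between a scalar (SGPR) and a vector (VGPR) mapping for a load,
/// based on the bank of the pointer, its address space, and whether a scalar
/// load is legal for \p MI.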
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping *, 2> OpdsMapping(2);
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AS = PtrTy.getAddressSpace();
  unsigned PtrSize = PtrTy.getSizeInBits();

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);

  if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
    if (isScalarLoadLegal(MI)) {
      // We have a uniform instruction so we want to use an SMRD load
      ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
    } else {
      ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      // If we're using MUBUF instructions for global memory, an SGPR base
      // register is possible. Otherwise this needs to be a VGPR.
      unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
        AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;

      PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
    }
  } else {
    ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;
  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
  return Mapping;

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
}
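/// Return the ID of the register bank currently assigned to \p Reg, or
/// \p Default if no bank has been assigned yet.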
unsigned
AMDGPURegisterBankInfo::getRegBankID(Register Reg,
                                     const MachineRegisterInfo &MRI,
                                     unsigned Default) const {
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  return Bank ? Bank->getID() : Default;
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  // Lie and claim anything is legal, even though this needs to be an SGPR;
  // applyMapping will have to deal with it as a waterfall loop.
  unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(Bank, Size);
}
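/// Convenience helper returning a VGPR value mapping sized for \p Reg.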
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}
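/// Convenience helper returning an AGPR value mapping sized for \p Reg.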
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
}
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR to SGPR copy to be generated is illegal.
//
// Operands that must be SGPRs must accept potentially divergent VGPRs as
// legal. These will be dealt with in applyMappingImpl.
//
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
    // The default logic bothers to analyze impossible alternative mappings. We
    // want the most straightforward mapping, so just directly handle this.
    const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
                                             *TRI);
    const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
                                             *TRI);
    assert(SrcBank && "src bank should have been assigned already");
    if (!DstBank)
      DstBank = SrcBank;

    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (cannotCopy(*DstBank, *SrcBank, Size))
      return getInvalidInstructionMapping();

    const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
    unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
    SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
    OpdsMapping[0] = &ValMap;
    if (MI.getOpcode() == AMDGPU::G_FREEZE)
      OpdsMapping[1] = &ValMap;

    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
  }
  if (MI.isRegSequence()) {
    // If any input is a VGPR, the result must be a VGPR. The default handling
    // assumes any copy between banks is legal.
    unsigned BankID = AMDGPU::SGPRRegBankID;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
      // It doesn't make sense to use vcc or scc banks here, so just ignore
      // them.
      if (OpBank != AMDGPU::SGPRRegBankID) {
        BankID = AMDGPU::VGPRRegBankID;
        break;
      }
    }
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }
  // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
  // properly.
  //
  // TODO: There are additional exec masking dependencies to analyze.
  if (MI.getOpcode() == TargetOpcode::G_PHI) {
    unsigned ResultBank = AMDGPU::InvalidRegBankID;
    Register DstReg = MI.getOperand(0).getReg();

    // Sometimes the result may have already been assigned a bank.
    if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
      ResultBank = DstBank->getID();

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      Register Reg = MI.getOperand(I).getReg();
      const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);

      // FIXME: Assuming VGPR for any undetermined inputs.
      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
        ResultBank = AMDGPU::VGPRRegBankID;
        break;
      }

      // FIXME: Need to promote SGPR case to s32
      unsigned OpBank = Bank->getID();
      ResultBank = regBankBoolUnion(ResultBank, OpBank);
    }

    assert(ResultBank != AMDGPU::InvalidRegBankID);

    unsigned Size = MRI.getType(DstReg).getSizeInBits();

    const ValueMapping &ValMap =
        getValueMapping(0, Size, getRegBank(ResultBank));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }
  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
  if (Mapping.isValid())
    return Mapping;

  SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());

  switch (MI.getOpcode()) {
  default:
    return getInvalidInstructionMapping();
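  // For 1-bit logical ops the boolean must stay in a single bank: if the
  // destination already has a bank it wins; otherwise the bank is inferred
  // from the source operands below.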
  case AMDGPU::G_XOR: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if (Size == 1) {
      const RegisterBank *DstBank
        = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);

      unsigned TargetBankID = AMDGPU::InvalidRegBankID;
      unsigned BankLHS = AMDGPU::InvalidRegBankID;
      unsigned BankRHS = AMDGPU::InvalidRegBankID;
      if (DstBank) {
        TargetBankID = DstBank->getID();
        if (DstBank == &AMDGPU::VCCRegBank) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else {
          BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
          BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
        }
      } else {
        BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
                               AMDGPU::VCCRegBankID);
        BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
                               AMDGPU::VCCRegBankID);

        // Both inputs should be true booleans to produce a boolean result.
        if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
          TargetBankID = AMDGPU::VGPRRegBankID;
        } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
          TargetBankID = AMDGPU::SGPRRegBankID;
        }
      }

      OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
      break;
    }

    if (Size == 64) {
      if (isSALUMapping(MI)) {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
        OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
      } else {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
        unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
        OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);

        unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
        OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
      }

      break;
    }

    // Other sizes fall through to the common SALU/VALU handling below.
    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_PTR_ADD:
  case AMDGPU::G_PTRMASK:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX:
  case AMDGPU::G_SHUFFLE_VECTOR:
  case AMDGPU::G_SBFX:
  case AMDGPU::G_UBFX:
    if (isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
  case AMDGPU::G_SSUBSAT:
  case AMDGPU::G_UADDSAT:
  case AMDGPU::G_USUBSAT:
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FPTOSI:
  case AMDGPU::G_FPTOUI:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_SITOFP:
  case AMDGPU::G_UITOFP:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FEXP2:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
  case AMDGPU::G_FSHR: // TODO: Expand for scalar
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
  case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
  case AMDGPU::G_AMDGPU_SMED3:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_UMULH:
  case AMDGPU::G_SMULH: {
    if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  }
  case AMDGPU::G_IMPLICIT_DEF: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FCONSTANT:
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_READCYCLECOUNTER: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FRAME_INDEX: {
    // TODO: This should be the same as other constants, but eliminateFrameIndex
    // currently assumes VALU uses.
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_DYN_STACKALLOC: {
    // Result is always uniform, and a wave reduction is needed for the source.
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
    break;
  }
  case AMDGPU::G_INSERT: {
    unsigned BankID = getMappingType(MRI, MI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
    OpdsMapping[3] = nullptr;
    break;
  }
  case AMDGPU::G_EXTRACT: {
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = nullptr;
    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == LLT::fixed_vector(2, 16)) {
      unsigned DstSize = DstTy.getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
      unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
      unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);

      OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
      break;
    }

    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_MERGE_VALUES:
  case AMDGPU::G_CONCAT_VECTORS: {
    unsigned Bank = getMappingType(MRI, MI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    // Op1 and Dst should use the same register bank.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_BITREVERSE:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_INTTOPTR:
  case AMDGPU::G_PTRTOINT:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FNEG: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FFBL_B32:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
    OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
    break;
  }
  case AMDGPU::G_CTPOP: {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);

    // This should really be getValueMappingSGPR64Only, but allowing the generic
    // code to handle the register split just makes using LegalizerHelper more
    // convenient.
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_TRUNC: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned Bank = getRegBankID(Src, MRI);
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_SEXT_INREG: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);

    unsigned DstBank;
    const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
    switch (SrcBank->getID()) {
    case AMDGPU::SGPRRegBankID:
      DstBank = AMDGPU::SGPRRegBankID;
      break;
    default:
      DstBank = AMDGPU::VGPRRegBankID;
      break;
    }

    // Scalar extend can use 64-bit BFE, but VGPRs require extending to
    // 32-bits, and then to 64.
    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
                                                       SrcSize);
    break;
  }
  case AMDGPU::G_FCMP: {
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_STORE: {
    assert(MI.getOperand(0).isReg());
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    // FIXME: We need to specify a different reg bank once scalar stores are
    // supported.
    const ValueMapping *ValMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    OpdsMapping[0] = ValMapping;
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    break;
  }
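  // A compare result can only stay on the SGPR bank (SCC) when the destination
  // and both sources are already SGPR and the subtarget has a scalar compare
  // for this type; otherwise the result is a VCC boolean and the sources are
  // mapped to VGPRs.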
  case AMDGPU::G_ICMP: {
    auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

    // See if the result register has already been constrained to vcc, which may
    // happen due to control flow intrinsic lowering.
    unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);

    bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
                     Op2Bank == AMDGPU::SGPRRegBankID &&
                     Op3Bank == AMDGPU::SGPRRegBankID &&
                     (Size == 32 ||
                      (Size == 64 &&
                       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
                       Subtarget.hasScalarCompareEq64()));

    DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    // TODO: Use 32-bit for scalar output size.
    // SCC results will need to be copied to a 32-bit SGPR virtual register.
    const unsigned ResultSize = 1;

    OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    // VGPR index can be used for waterfall when indexing a SGPR vector.
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);

    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);

    // The index can be either if the source vector is VGPR.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
    unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);

    // This is a weird case, because we need to break down the mapping based on
    // the register bank of a different operand.
    if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
      OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
                                                      InsertSize);
    } else {
      assert(InsertSize == 32 || InsertSize == 64);
      OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
    }

    // The index can be either if the source vector is VGPR.
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
    break;
  }
  case AMDGPU::G_UNMERGE_VALUES: {
    unsigned Bank = getMappingType(MRI, MI);

    // Op1 and Dst should use the same register bank.
    // FIXME: Shouldn't this be the default? Why do we need to handle this?
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
    }
    break;
  }
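  // Buffer operations: data and address operands are VGPRs, while the rsrc and
  // soffset operands ultimately need to be SGPRs. getSGPROpMapping reports the
  // current bank for those so divergent values can be handled later.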
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
    // vdata
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // voffset/vaddr
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // voffset/vaddr
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // cmp
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // voffset/vaddr
    OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // We need to convert this to a MUBUF if either the resource or the offset
    // is a VGPR.
    unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
    unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
    unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);

    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
    break;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fma_legacy:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_mul_u24:
    case Intrinsic::amdgcn_mul_i24:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbit:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_perm:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_ubfe:
      if (isSALUMapping(MI))
        return getDefaultMappingSOP(MI);
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_ds_swizzle:
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
    case Intrinsic::amdgcn_mov_dpp8:
    case Intrinsic::amdgcn_mov_dpp:
    case Intrinsic::amdgcn_strict_wwm:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_strict_wqm:
    case Intrinsic::amdgcn_wqm:
    case Intrinsic::amdgcn_softwqm:
    case Intrinsic::amdgcn_set_inactive:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize:
    case Intrinsic::amdgcn_reloc_constant:
    case Intrinsic::returnaddress: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ps_live: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
      // to legalize them if needed.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
    case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ballot: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
    auto IntrID = MI.getIntrinsicID();
    const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    if (N == 3) {
      // Sequential form: all operands combined into VGPR256/VGPR512
      unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    } else {
      // Otherwise the address components are separate 32-bit VGPR operands.
      for (unsigned I = 2; I < N; ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmax:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
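  // For G_SELECT the condition may be either a VCC or an SGPR boolean; the
  // value operands stay scalar only if both of them and the condition are
  // already SGPR.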
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);
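  // Atomics: results and data operands are VGPRs; the pointer mapping comes
  // from getValueMappingForPtr so an SGPR base can be used where MUBUF makes
  // that possible.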
  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());