//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector) and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
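///
/// As a rough illustrative sketch (the actual lowering is implemented in
/// executeInWaterfallLoop later in this file):
///
///   Save EXEC
///   Loop:
///     SVal = V_READFIRSTLANE_B32 of the VGPR operand  ; value of one active lane
///     Cond = V_CMP_EQ(SVal, VGPR operand)             ; lanes sharing that value
///     EXEC &= Cond                                    ; enable only those lanes
///     ... execute the op using SVal as the uniform SGPR operand ...
///     Clear the handled lanes; repeat while any lanes remain
///   Restore EXEC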
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value; otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
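///
/// For example (illustrative, not verbatim MIR): a uniform compare ends up as
/// an s32 value on the SGPR bank, while a divergent compare keeps an s1 result
/// on the VCC bank:
///
///   %c:sgpr(s32) = G_ICMP intpred(eq), %a:sgpr(s32), %b:sgpr(s32)
///   %c:vcc(s1)   = G_ICMP intpred(eq), %a:vgpr(s32), %b:vgpr(s32)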
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank, which entails clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
/// instructions). This is one unique SGPR, so the same SGPR may be used for
/// multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
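///
/// For example (illustrative), on a target with a single constant bus read:
///   V_ADD_F32 %dst, %sgpr0, %sgpr1   ; illegal, reads two distinct SGPRs
///   V_ADD_F32 %dst, %sgpr0, %sgpr0   ; fine, only one unique SGPR is read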
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, this also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

// Observer to apply a register bank to new registers created by LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        MachineIRBuilder B(MI);
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterBankInfo(),
      Subtarget(ST),
      TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // boolean.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 MI.getNumOperands()));
  }

  return AltMappings;
}
334 RegisterBankInfo::InstructionMappings
335 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
336 const MachineInstr
&MI
, const MachineRegisterInfo
&MRI
) const {
337 switch (MI
.getIntrinsicID()) {
338 case Intrinsic::amdgcn_readlane
: {
339 static const OpRegBankEntry
<3> Table
[2] = {
341 { { AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
}, 1 },
343 // Need a readfirstlane for the index.
344 { { AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 2 }
347 const std::array
<unsigned, 3> RegSrcOpIdx
= { { 0, 2, 3 } };
348 return addMappingFromTable
<3>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
350 case Intrinsic::amdgcn_writelane
: {
351 static const OpRegBankEntry
<4> Table
[4] = {
353 { { AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 1 },
355 // Need readfirstlane of first op
356 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 2 },
358 // Need readfirstlane of second op
359 { { AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 2 },
361 // Need readfirstlane of both ops
362 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 3 }
365 // rsrc, voffset, offset
366 const std::array
<unsigned, 4> RegSrcOpIdx
= { { 0, 2, 3, 4 } };
367 return addMappingFromTable
<4>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
370 return RegisterBankInfo::getInstrAlternativeMappings(MI
);
374 RegisterBankInfo::InstructionMappings
375 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
376 const MachineInstr
&MI
, const MachineRegisterInfo
&MRI
) const {
378 switch (MI
.getIntrinsicID()) {
379 case Intrinsic::amdgcn_s_buffer_load
: {
380 static const OpRegBankEntry
<2> Table
[4] = {
382 { { AMDGPU::SGPRRegBankID
, AMDGPU::SGPRRegBankID
}, 1 },
384 // Only need 1 register in loop
385 { { AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 300 },
387 // Have to waterfall the resource.
388 { { AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
}, 1000 },
390 // Have to waterfall the resource, and the offset.
391 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 1500 }
395 const std::array
<unsigned, 2> RegSrcOpIdx
= { { 2, 3 } };
396 return addMappingFromTable
<2>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
398 case Intrinsic::amdgcn_ds_ordered_add
:
399 case Intrinsic::amdgcn_ds_ordered_swap
: {
401 static const OpRegBankEntry
<3> Table
[2] = {
403 { { AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 1 },
405 // Need a readfirstlane for m0
406 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 2 }
409 const std::array
<unsigned, 3> RegSrcOpIdx
= { { 0, 2, 3 } };
410 return addMappingFromTable
<3>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
412 case Intrinsic::amdgcn_s_sendmsg
:
413 case Intrinsic::amdgcn_s_sendmsghalt
: {
414 // FIXME: Should have no register for immediate
415 static const OpRegBankEntry
<1> Table
[2] = {
417 { { AMDGPU::SGPRRegBankID
}, 1 },
420 { { AMDGPU::VGPRRegBankID
}, 3 }
423 const std::array
<unsigned, 1> RegSrcOpIdx
= { { 2 } };
424 return addMappingFromTable
<1>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
427 return RegisterBankInfo::getInstrAlternativeMappings(MI
);
431 // FIXME: Returns uniform if there's no source value information. This is
433 static bool isScalarLoadLegal(const MachineInstr
&MI
) {
434 if (!MI
.hasOneMemOperand())
437 const MachineMemOperand
*MMO
= *MI
.memoperands_begin();
438 const unsigned AS
= MMO
->getAddrSpace();
439 const bool IsConst
= AS
== AMDGPUAS::CONSTANT_ADDRESS
||
440 AS
== AMDGPUAS::CONSTANT_ADDRESS_32BIT
;
441 // Require 4-byte alignment.
442 return MMO
->getAlign() >= Align(4) &&
443 // Can't do a scalar atomic load.
445 // Don't use scalar loads for volatile accesses to non-constant address
447 (IsConst
|| !MMO
->isVolatile()) &&
448 // Memory must be known constant, or not written before this load.
449 (IsConst
|| MMO
->isInvariant() || (MMO
->getFlags() & MONoClobber
)) &&
450 AMDGPUInstrInfo::isUniformMMO(MMO
);
453 RegisterBankInfo::InstructionMappings
454 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
455 const MachineInstr
&MI
) const {
457 const MachineFunction
&MF
= *MI
.getParent()->getParent();
458 const MachineRegisterInfo
&MRI
= MF
.getRegInfo();
461 InstructionMappings AltMappings
;
462 switch (MI
.getOpcode()) {
463 case TargetOpcode::G_CONSTANT
: {
464 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
466 static const OpRegBankEntry
<1> Table
[3] = {
467 { { AMDGPU::VGPRRegBankID
}, 1 },
468 { { AMDGPU::SGPRRegBankID
}, 1 },
469 { { AMDGPU::VCCRegBankID
}, 1 }
472 return addMappingFromTable
<1>(MI
, MRI
, {{ 0 }}, Table
);
477 case TargetOpcode::G_FCONSTANT
:
478 case TargetOpcode::G_FRAME_INDEX
:
479 case TargetOpcode::G_GLOBAL_VALUE
: {
480 static const OpRegBankEntry
<1> Table
[2] = {
481 { { AMDGPU::VGPRRegBankID
}, 1 },
482 { { AMDGPU::SGPRRegBankID
}, 1 }
485 return addMappingFromTable
<1>(MI
, MRI
, {{ 0 }}, Table
);
487 case TargetOpcode::G_AND
:
488 case TargetOpcode::G_OR
:
489 case TargetOpcode::G_XOR
: {
490 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
493 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
494 const InstructionMapping
&SCCMapping
= getInstructionMapping(
495 1, 1, getOperandsMapping(
496 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, 32),
497 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, 32),
498 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, 32)}),
500 AltMappings
.push_back(&SCCMapping
);
502 const InstructionMapping
&VCCMapping0
= getInstructionMapping(
503 2, 1, getOperandsMapping(
504 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, Size
),
505 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, Size
),
506 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, Size
)}),
508 AltMappings
.push_back(&VCCMapping0
);
515 const InstructionMapping
&SSMapping
= getInstructionMapping(
516 1, 1, getOperandsMapping(
517 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
518 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
519 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
)}),
521 AltMappings
.push_back(&SSMapping
);
523 const InstructionMapping
&VVMapping
= getInstructionMapping(
524 2, 2, getOperandsMapping(
525 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
526 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
527 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
)}),
529 AltMappings
.push_back(&VVMapping
);
532 case TargetOpcode::G_LOAD
:
533 case TargetOpcode::G_ZEXTLOAD
:
534 case TargetOpcode::G_SEXTLOAD
: {
535 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
536 LLT PtrTy
= MRI
.getType(MI
.getOperand(1).getReg());
537 unsigned PtrSize
= PtrTy
.getSizeInBits();
538 unsigned AS
= PtrTy
.getAddressSpace();
540 if ((AS
!= AMDGPUAS::LOCAL_ADDRESS
&& AS
!= AMDGPUAS::REGION_ADDRESS
&&
541 AS
!= AMDGPUAS::PRIVATE_ADDRESS
) &&
542 isScalarLoadLegal(MI
)) {
543 const InstructionMapping
&SSMapping
= getInstructionMapping(
544 1, 1, getOperandsMapping(
545 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
546 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, PtrSize
)}),
548 AltMappings
.push_back(&SSMapping
);
551 const InstructionMapping
&VVMapping
= getInstructionMapping(
554 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
),
555 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, PtrSize
)}),
557 AltMappings
.push_back(&VVMapping
);
559 // It may be possible to have a vgpr = load sgpr mapping here, because
560 // the mubuf instructions support this kind of load, but probably for only
561 // gfx7 and older. However, the addressing mode matching in the instruction
562 // selector should be able to do a better job of detecting and selecting
563 // these kinds of loads from the vgpr = load vgpr mapping.
568 case TargetOpcode::G_SELECT
: {
569 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
570 const InstructionMapping
&SSMapping
= getInstructionMapping(1, 1,
571 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
572 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, 1),
573 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
574 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
)}),
576 AltMappings
.push_back(&SSMapping
);
578 const InstructionMapping
&VVMapping
= getInstructionMapping(2, 1,
579 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
580 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1),
581 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
582 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
)}),
584 AltMappings
.push_back(&VVMapping
);
588 case TargetOpcode::G_UADDE
:
589 case TargetOpcode::G_USUBE
:
590 case TargetOpcode::G_SADDE
:
591 case TargetOpcode::G_SSUBE
: {
592 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
593 const InstructionMapping
&SSMapping
= getInstructionMapping(1, 1,
595 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
596 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, 1),
597 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
598 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
599 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, 1)}),
601 AltMappings
.push_back(&SSMapping
);
603 const InstructionMapping
&VVMapping
= getInstructionMapping(2, 1,
604 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
),
605 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1),
606 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
),
607 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
),
608 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1)}),
610 AltMappings
.push_back(&VVMapping
);
613 case AMDGPU::G_BRCOND
: {
614 assert(MRI
.getType(MI
.getOperand(0).getReg()).getSizeInBits() == 1);
616 // TODO: Change type to 32 for scalar
617 const InstructionMapping
&SMapping
= getInstructionMapping(
618 1, 1, getOperandsMapping(
619 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, 1), nullptr}),
621 AltMappings
.push_back(&SMapping
);
623 const InstructionMapping
&VMapping
= getInstructionMapping(
624 1, 1, getOperandsMapping(
625 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1), nullptr }),
627 AltMappings
.push_back(&VMapping
);
630 case AMDGPU::G_INTRINSIC
:
631 return getInstrAlternativeMappingsIntrinsic(MI
, MRI
);
632 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS
:
633 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI
, MRI
);
637 return RegisterBankInfo::getInstrAlternativeMappings(MI
);
640 void AMDGPURegisterBankInfo::split64BitValueForMapping(
642 SmallVector
<Register
, 2> &Regs
,
644 Register Reg
) const {
645 assert(HalfTy
.getSizeInBits() == 32);
646 MachineRegisterInfo
*MRI
= B
.getMRI();
647 Register LoLHS
= MRI
->createGenericVirtualRegister(HalfTy
);
648 Register HiLHS
= MRI
->createGenericVirtualRegister(HalfTy
);
649 const RegisterBank
*Bank
= getRegBank(Reg
, *MRI
, *TRI
);
650 MRI
->setRegBank(LoLHS
, *Bank
);
651 MRI
->setRegBank(HiLHS
, *Bank
);
653 Regs
.push_back(LoLHS
);
654 Regs
.push_back(HiLHS
);
656 B
.buildInstr(AMDGPU::G_UNMERGE_VALUES
)
662 /// Replace the current type each register in \p Regs has with \p NewTy
663 static void setRegsToType(MachineRegisterInfo
&MRI
, ArrayRef
<Register
> Regs
,
665 for (Register Reg
: Regs
) {
666 assert(MRI
.getType(Reg
).getSizeInBits() == NewTy
.getSizeInBits());
667 MRI
.setType(Reg
, NewTy
);
671 static LLT
getHalfSizedType(LLT Ty
) {
673 assert(Ty
.getElementCount().isKnownMultipleOf(2));
674 return LLT::scalarOrVector(Ty
.getElementCount().divideCoefficientBy(2),
675 Ty
.getElementType());
678 assert(Ty
.getScalarSizeInBits() % 2 == 0);
679 return LLT::scalar(Ty
.getScalarSizeInBits() / 2);
682 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
683 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
684 /// execute the instruction for each unique combination of values in all lanes
685 /// in the wave. The block will be split such that rest of the instructions are
686 /// moved to a new block.
688 /// Essentially performs this loop:
690 /// Save Execution Mask
691 /// For (Lane : Wavefront) {
692 /// Enable Lane, Disable all other lanes
693 /// SGPR = read SGPR value for current lane from VGPR
694 /// VGPRResult[Lane] = use_op SGPR
696 /// Restore Execution Mask
698 /// There is additional complexity to try for compare values to identify the
699 /// unique values used.
700 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
702 iterator_range
<MachineBasicBlock::iterator
> Range
,
703 SmallSet
<Register
, 4> &SGPROperandRegs
,
704 MachineRegisterInfo
&MRI
) const {
706 // Track use registers which have already been expanded with a readfirstlane
707 // sequence. This may have multiple uses if moving a sequence.
708 DenseMap
<Register
, Register
> WaterfalledRegMap
;
710 MachineBasicBlock
&MBB
= B
.getMBB();
711 MachineFunction
*MF
= &B
.getMF();
713 const TargetRegisterClass
*WaveRC
= TRI
->getWaveMaskRegClass();
714 const unsigned WaveAndOpc
= Subtarget
.isWave32() ?
715 AMDGPU::S_AND_B32
: AMDGPU::S_AND_B64
;
716 const unsigned MovExecOpc
=
717 Subtarget
.isWave32() ? AMDGPU::S_MOV_B32
: AMDGPU::S_MOV_B64
;
718 const unsigned MovExecTermOpc
=
719 Subtarget
.isWave32() ? AMDGPU::S_MOV_B32_term
: AMDGPU::S_MOV_B64_term
;
721 const unsigned XorTermOpc
= Subtarget
.isWave32() ?
722 AMDGPU::S_XOR_B32_term
: AMDGPU::S_XOR_B64_term
;
723 const unsigned AndSaveExecOpc
= Subtarget
.isWave32() ?
724 AMDGPU::S_AND_SAVEEXEC_B32
: AMDGPU::S_AND_SAVEEXEC_B64
;
725 const unsigned ExecReg
= Subtarget
.isWave32() ?
726 AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
729 const int OrigRangeSize
= std::distance(Range
.begin(), Range
.end());
732 Register SaveExecReg
= MRI
.createVirtualRegister(WaveRC
);
733 Register InitSaveExecReg
= MRI
.createVirtualRegister(WaveRC
);
735 // Don't bother using generic instructions/registers for the exec mask.
736 B
.buildInstr(TargetOpcode::IMPLICIT_DEF
)
737 .addDef(InitSaveExecReg
);
739 Register PhiExec
= MRI
.createVirtualRegister(WaveRC
);
740 Register NewExec
= MRI
.createVirtualRegister(WaveRC
);
742 // To insert the loop we need to split the block. Move everything before this
743 // point to a new block, and insert a new empty block before this instruction.
744 MachineBasicBlock
*LoopBB
= MF
->CreateMachineBasicBlock();
745 MachineBasicBlock
*RemainderBB
= MF
->CreateMachineBasicBlock();
746 MachineBasicBlock
*RestoreExecBB
= MF
->CreateMachineBasicBlock();
747 MachineFunction::iterator
MBBI(MBB
);
749 MF
->insert(MBBI
, LoopBB
);
750 MF
->insert(MBBI
, RestoreExecBB
);
751 MF
->insert(MBBI
, RemainderBB
);
753 LoopBB
->addSuccessor(RestoreExecBB
);
754 LoopBB
->addSuccessor(LoopBB
);
756 // Move the rest of the block into a new block.
757 RemainderBB
->transferSuccessorsAndUpdatePHIs(&MBB
);
758 RemainderBB
->splice(RemainderBB
->begin(), &MBB
, Range
.end(), MBB
.end());
760 MBB
.addSuccessor(LoopBB
);
761 RestoreExecBB
->addSuccessor(RemainderBB
);
763 B
.setInsertPt(*LoopBB
, LoopBB
->end());
765 B
.buildInstr(TargetOpcode::PHI
)
767 .addReg(InitSaveExecReg
)
772 const DebugLoc
&DL
= B
.getDL();
774 MachineInstr
&FirstInst
= *Range
.begin();
776 // Move the instruction into the loop. Note we moved everything after
777 // Range.end() already into a new block, so Range.end() is no longer valid.
778 LoopBB
->splice(LoopBB
->end(), &MBB
, Range
.begin(), MBB
.end());
780 // Figure out the iterator range after splicing the instructions.
781 MachineBasicBlock::iterator NewBegin
= FirstInst
.getIterator();
782 auto NewEnd
= LoopBB
->end();
784 MachineBasicBlock::iterator I
= Range
.begin();
785 B
.setInsertPt(*LoopBB
, I
);
789 assert(std::distance(NewBegin
, NewEnd
) == OrigRangeSize
);
791 for (MachineInstr
&MI
: make_range(NewBegin
, NewEnd
)) {
792 for (MachineOperand
&Op
: MI
.uses()) {
793 if (!Op
.isReg() || Op
.isDef())
796 Register OldReg
= Op
.getReg();
797 if (!SGPROperandRegs
.count(OldReg
))
800 // See if we already processed this register in another instruction in the
802 auto OldVal
= WaterfalledRegMap
.find(OldReg
);
803 if (OldVal
!= WaterfalledRegMap
.end()) {
804 Op
.setReg(OldVal
->second
);
808 Register OpReg
= Op
.getReg();
809 LLT OpTy
= MRI
.getType(OpReg
);
811 const RegisterBank
*OpBank
= getRegBank(OpReg
, MRI
, *TRI
);
812 if (OpBank
!= &AMDGPU::VGPRRegBank
) {
813 // Insert copy from AGPR to VGPR before the loop.
815 OpReg
= B
.buildCopy(OpTy
, OpReg
).getReg(0);
816 MRI
.setRegBank(OpReg
, AMDGPU::VGPRRegBank
);
820 unsigned OpSize
= OpTy
.getSizeInBits();
822 // Can only do a readlane of 32-bit pieces.
824 // Avoid extra copies in the simple case of one 32-bit register.
825 Register CurrentLaneOpReg
826 = MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
827 MRI
.setType(CurrentLaneOpReg
, OpTy
);
829 constrainGenericRegister(OpReg
, AMDGPU::VGPR_32RegClass
, MRI
);
830 // Read the next variant <- also loop target.
831 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
),
835 Register NewCondReg
= MRI
.createVirtualRegister(WaveRC
);
836 bool First
= CondReg
== AMDGPU::NoRegister
;
838 CondReg
= NewCondReg
;
840 // Compare the just read M0 value to all possible Idx values.
841 B
.buildInstr(AMDGPU::V_CMP_EQ_U32_e64
)
843 .addReg(CurrentLaneOpReg
)
845 Op
.setReg(CurrentLaneOpReg
);
848 Register AndReg
= MRI
.createVirtualRegister(WaveRC
);
850 // If there are multiple operands to consider, and the conditions.
851 B
.buildInstr(WaveAndOpc
)
858 LLT S32
= LLT::scalar(32);
859 SmallVector
<Register
, 8> ReadlanePieces
;
861 // The compares can be done as 64-bit, but the extract needs to be done
864 bool Is64
= OpSize
% 64 == 0;
866 unsigned UnmergeTySize
= Is64
? 64 : 32;
868 Is64
? AMDGPU::V_CMP_EQ_U64_e64
: AMDGPU::V_CMP_EQ_U32_e64
;
870 // Insert the unmerge before the loop.
873 unsigned NumPieces
= OpSize
/ UnmergeTySize
;
874 SmallVector
<Register
, 8> UnmergePieces
;
875 if (NumPieces
== 1) {
876 UnmergePieces
.push_back(OpReg
);
878 LLT UnmergeTy
= LLT::scalar(UnmergeTySize
);
879 MachineInstrBuilder Unmerge
= B
.buildUnmerge(UnmergeTy
, OpReg
);
880 for (unsigned PieceIdx
= 0; PieceIdx
!= NumPieces
; ++PieceIdx
)
881 UnmergePieces
.push_back(Unmerge
.getReg(PieceIdx
));
885 for (Register UnmergePiece
: UnmergePieces
) {
886 Register CurrentLaneOpReg
;
888 Register CurrentLaneOpRegLo
= MRI
.createGenericVirtualRegister(S32
);
889 Register CurrentLaneOpRegHi
= MRI
.createGenericVirtualRegister(S32
);
891 MRI
.setRegClass(UnmergePiece
, &AMDGPU::VReg_64RegClass
);
892 MRI
.setRegClass(CurrentLaneOpRegLo
, &AMDGPU::SReg_32_XM0RegClass
);
893 MRI
.setRegClass(CurrentLaneOpRegHi
, &AMDGPU::SReg_32_XM0RegClass
);
895 // Read the next variant <- also loop target.
896 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
),
898 .addReg(UnmergePiece
, 0, AMDGPU::sub0
);
900 // Read the next variant <- also loop target.
901 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
),
903 .addReg(UnmergePiece
, 0, AMDGPU::sub1
);
906 B
.buildMerge(LLT::scalar(64),
907 {CurrentLaneOpRegLo
, CurrentLaneOpRegHi
})
910 MRI
.setRegClass(CurrentLaneOpReg
, &AMDGPU::SReg_64_XEXECRegClass
);
912 if (OpTy
.getScalarSizeInBits() == 64) {
913 // If we need to produce a 64-bit element vector, so use the
915 ReadlanePieces
.push_back(CurrentLaneOpReg
);
917 // 32-bit element type.
918 ReadlanePieces
.push_back(CurrentLaneOpRegLo
);
919 ReadlanePieces
.push_back(CurrentLaneOpRegHi
);
922 CurrentLaneOpReg
= MRI
.createGenericVirtualRegister(S32
);
923 MRI
.setRegClass(UnmergePiece
, &AMDGPU::VGPR_32RegClass
);
924 MRI
.setRegClass(CurrentLaneOpReg
, &AMDGPU::SReg_32_XM0RegClass
);
926 // Read the next variant <- also loop target.
927 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
),
929 .addReg(UnmergePiece
);
930 ReadlanePieces
.push_back(CurrentLaneOpReg
);
933 Register NewCondReg
= MRI
.createVirtualRegister(WaveRC
);
934 bool First
= CondReg
== AMDGPU::NoRegister
;
936 CondReg
= NewCondReg
;
940 .addReg(CurrentLaneOpReg
)
941 .addReg(UnmergePiece
);
944 Register AndReg
= MRI
.createVirtualRegister(WaveRC
);
946 // If there are multiple operands to consider, and the conditions.
947 B
.buildInstr(WaveAndOpc
)
955 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
957 if (OpTy
.isVector()) {
958 auto Merge
= B
.buildBuildVector(OpTy
, ReadlanePieces
);
959 Op
.setReg(Merge
.getReg(0));
960 MRI
.setRegBank(Op
.getReg(), AMDGPU::SGPRRegBank
);
961 } else if (ReadlanePieces
.size() > 1) {
962 auto Merge
= B
.buildMerge(OpTy
, ReadlanePieces
);
963 Op
.setReg(Merge
.getReg(0));
964 MRI
.setRegBank(Op
.getReg(), AMDGPU::SGPRRegBank
);
966 Op
.setReg(ReadlanePieces
[0]);
970 // Make sure we don't re-process this register again.
971 WaterfalledRegMap
.insert(std::make_pair(OldReg
, Op
.getReg()));
975 // Update EXEC, save the original EXEC value to VCC.
976 B
.buildInstr(AndSaveExecOpc
)
978 .addReg(CondReg
, RegState::Kill
);
980 MRI
.setSimpleHint(NewExec
, CondReg
);
982 B
.setInsertPt(*LoopBB
, LoopBB
->end());
984 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
985 B
.buildInstr(XorTermOpc
)
990 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
993 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
994 B
.buildInstr(AMDGPU::SI_WATERFALL_LOOP
).addMBB(LoopBB
);
996 // Save the EXEC mask before the loop.
997 BuildMI(MBB
, MBB
.end(), DL
, TII
->get(MovExecOpc
), SaveExecReg
)
1000 // Restore the EXEC mask after the loop.
1001 B
.setMBB(*RestoreExecBB
);
1002 B
.buildInstr(MovExecTermOpc
)
1004 .addReg(SaveExecReg
);
1006 // Set the insert point after the original instruction, so any new
1007 // instructions will be in the remainder.
1008 B
.setInsertPt(*RemainderBB
, RemainderBB
->begin());
1013 // Return any unique registers used by \p MI at \p OpIndices that need to be
1014 // handled in a waterfall loop. Returns these registers in \p
1015 // SGPROperandRegs. Returns true if there are any operands to handle and a
1016 // waterfall loop is necessary.
1017 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1018 SmallSet
<Register
, 4> &SGPROperandRegs
, MachineInstr
&MI
,
1019 MachineRegisterInfo
&MRI
, ArrayRef
<unsigned> OpIndices
) const {
1020 for (unsigned Op
: OpIndices
) {
1021 assert(MI
.getOperand(Op
).isUse());
1022 Register Reg
= MI
.getOperand(Op
).getReg();
1023 const RegisterBank
*OpBank
= getRegBank(Reg
, MRI
, *TRI
);
1024 if (OpBank
->getID() != AMDGPU::SGPRRegBankID
)
1025 SGPROperandRegs
.insert(Reg
);
1028 // No operands need to be replaced, so no need to loop.
1029 return !SGPROperandRegs
.empty();
1032 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1033 MachineIRBuilder
&B
, MachineInstr
&MI
, MachineRegisterInfo
&MRI
,
1034 ArrayRef
<unsigned> OpIndices
) const {
1035 // Use a set to avoid extra readfirstlanes in the case where multiple operands
1036 // are the same register.
1037 SmallSet
<Register
, 4> SGPROperandRegs
;
1039 if (!collectWaterfallOperands(SGPROperandRegs
, MI
, MRI
, OpIndices
))
1042 MachineBasicBlock::iterator I
= MI
.getIterator();
1043 return executeInWaterfallLoop(B
, make_range(I
, std::next(I
)),
1044 SGPROperandRegs
, MRI
);
1047 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1048 MachineInstr
&MI
, MachineRegisterInfo
&MRI
,
1049 ArrayRef
<unsigned> OpIndices
) const {
1050 MachineIRBuilder
B(MI
);
1051 return executeInWaterfallLoop(B
, MI
, MRI
, OpIndices
);
1054 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1055 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1056 MachineInstr
&MI
, MachineRegisterInfo
&MRI
, unsigned OpIdx
) const {
1057 Register Reg
= MI
.getOperand(OpIdx
).getReg();
1058 const RegisterBank
*Bank
= getRegBank(Reg
, MRI
, *TRI
);
1059 if (Bank
== &AMDGPU::SGPRRegBank
)
1062 LLT Ty
= MRI
.getType(Reg
);
1063 MachineIRBuilder
B(MI
);
1065 if (Bank
!= &AMDGPU::VGPRRegBank
) {
1066 // We need to copy from AGPR to VGPR
1067 Reg
= B
.buildCopy(Ty
, Reg
).getReg(0);
1068 MRI
.setRegBank(Reg
, AMDGPU::VGPRRegBank
);
1071 Register SGPR
= MRI
.createVirtualRegister(&AMDGPU::SReg_32RegClass
);
1072 B
.buildInstr(AMDGPU::V_READFIRSTLANE_B32
)
1076 MRI
.setType(SGPR
, Ty
);
1078 const TargetRegisterClass
*Constrained
=
1079 constrainGenericRegister(Reg
, AMDGPU::VGPR_32RegClass
, MRI
);
1081 assert(Constrained
&& "Failed to constrain readfirstlane src reg");
1083 MI
.getOperand(OpIdx
).setReg(SGPR
);
1086 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1087 /// rest will be in the remainder.
1088 static std::pair
<LLT
, LLT
> splitUnequalType(LLT Ty
, unsigned FirstSize
) {
1089 unsigned TotalSize
= Ty
.getSizeInBits();
1091 return {LLT::scalar(FirstSize
), LLT::scalar(TotalSize
- FirstSize
)};
1093 LLT EltTy
= Ty
.getElementType();
1094 unsigned EltSize
= EltTy
.getSizeInBits();
1095 assert(FirstSize
% EltSize
== 0);
1097 unsigned FirstPartNumElts
= FirstSize
/ EltSize
;
1098 unsigned RemainderElts
= (TotalSize
- FirstSize
) / EltSize
;
1100 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts
), EltTy
),
1101 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts
), EltTy
)};
1104 static LLT
widen96To128(LLT Ty
) {
1106 return LLT::scalar(128);
1108 LLT EltTy
= Ty
.getElementType();
1109 assert(128 % EltTy
.getSizeInBits() == 0);
1110 return LLT::fixed_vector(128 / EltTy
.getSizeInBits(), EltTy
);
1113 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr
&MI
,
1114 const AMDGPURegisterBankInfo::OperandsMapper
&OpdMapper
,
1115 MachineRegisterInfo
&MRI
) const {
1116 Register DstReg
= MI
.getOperand(0).getReg();
1117 const LLT LoadTy
= MRI
.getType(DstReg
);
1118 unsigned LoadSize
= LoadTy
.getSizeInBits();
1119 const unsigned MaxNonSmrdLoadSize
= 128;
1121 const RegisterBank
*DstBank
=
1122 OpdMapper
.getInstrMapping().getOperandMapping(0).BreakDown
[0].RegBank
;
1123 if (DstBank
== &AMDGPU::SGPRRegBank
) {
1124 // There are some special cases that we need to look at for 32 bit and 96
1125 // bit SGPR loads otherwise we have nothing to do.
1126 if (LoadSize
!= 32 && LoadSize
!= 96)
1129 MachineMemOperand
*MMO
= *MI
.memoperands_begin();
1130 const unsigned MemSize
= 8 * MMO
->getSize();
1131 // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1132 // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
1133 // scalar loads should have a load size of 32 but memory access size of less
1135 if (LoadSize
== 32 &&
1136 (MemSize
== 32 || LoadTy
.isVector() || !isScalarLoadLegal(MI
)))
1139 Register PtrReg
= MI
.getOperand(1).getReg();
1141 ApplyRegBankMapping
O(*this, MRI
, &AMDGPU::SGPRRegBank
);
1142 MachineIRBuilder
B(MI
, O
);
1144 if (LoadSize
== 32) {
1145 // This is an extending load from a sub-dword size. Widen the memory
1146 // access size to 4 bytes and clear the extra high bits appropriately
1147 const LLT S32
= LLT::scalar(32);
1148 if (MI
.getOpcode() == AMDGPU::G_SEXTLOAD
) {
1149 // Must extend the sign bit into higher bits for a G_SEXTLOAD
1150 auto WideLoad
= B
.buildLoadFromOffset(S32
, PtrReg
, *MMO
, 0);
1151 B
.buildSExtInReg(MI
.getOperand(0), WideLoad
, MemSize
);
1152 } else if (MI
.getOpcode() == AMDGPU::G_ZEXTLOAD
) {
1153 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1154 auto WideLoad
= B
.buildLoadFromOffset(S32
, PtrReg
, *MMO
, 0);
1155 B
.buildZExtInReg(MI
.getOperand(0), WideLoad
, MemSize
);
1157 // We do not need to touch the higher bits for regular loads.
1158 B
.buildLoadFromOffset(MI
.getOperand(0), PtrReg
, *MMO
, 0);
1160 // 96-bit loads are only available for vector loads. We need to split this
1161 // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
1162 if (MMO
->getAlign() < Align(16)) {
1163 MachineFunction
*MF
= MI
.getParent()->getParent();
1164 ApplyRegBankMapping
ApplyBank(*this, MRI
, DstBank
);
1165 MachineIRBuilder
B(MI
, ApplyBank
);
1166 LegalizerHelper
Helper(*MF
, ApplyBank
, B
);
1168 std::tie(Part64
, Part32
) = splitUnequalType(LoadTy
, 64);
1169 if (Helper
.reduceLoadStoreWidth(cast
<GAnyLoad
>(MI
), 0, Part64
) !=
1170 LegalizerHelper::Legalized
)
1174 LLT WiderTy
= widen96To128(LoadTy
);
1175 auto WideLoad
= B
.buildLoadFromOffset(WiderTy
, PtrReg
, *MMO
, 0);
1176 if (WiderTy
.isScalar())
1177 B
.buildTrunc(MI
.getOperand(0), WideLoad
);
1179 B
.buildDeleteTrailingVectorElements(MI
.getOperand(0).getReg(),
1185 MI
.eraseFromParent();
1189 // 128-bit loads are supported for all instruction types.
1190 if (LoadSize
<= MaxNonSmrdLoadSize
)
1193 SmallVector
<Register
, 16> DefRegs(OpdMapper
.getVRegs(0));
1194 SmallVector
<Register
, 1> SrcRegs(OpdMapper
.getVRegs(1));
1196 if (SrcRegs
.empty())
1197 SrcRegs
.push_back(MI
.getOperand(1).getReg());
1199 assert(LoadSize
% MaxNonSmrdLoadSize
== 0);
1201 // RegBankSelect only emits scalar types, so we need to reset the pointer
1202 // operand to a pointer type.
1203 Register BasePtrReg
= SrcRegs
[0];
1204 LLT PtrTy
= MRI
.getType(MI
.getOperand(1).getReg());
1205 MRI
.setType(BasePtrReg
, PtrTy
);
1207 unsigned NumSplitParts
= LoadTy
.getSizeInBits() / MaxNonSmrdLoadSize
;
1208 const LLT LoadSplitTy
= LoadTy
.divide(NumSplitParts
);
1209 ApplyRegBankMapping
Observer(*this, MRI
, &AMDGPU::VGPRRegBank
);
1210 MachineIRBuilder
B(MI
, Observer
);
1211 LegalizerHelper
Helper(B
.getMF(), Observer
, B
);
1213 if (LoadTy
.isVector()) {
1214 if (Helper
.fewerElementsVector(MI
, 0, LoadSplitTy
) != LegalizerHelper::Legalized
)
1217 if (Helper
.narrowScalar(MI
, 0, LoadSplitTy
) != LegalizerHelper::Legalized
)
1221 MRI
.setRegBank(DstReg
, AMDGPU::VGPRRegBank
);
1225 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1227 const AMDGPURegisterBankInfo::OperandsMapper
&OpdMapper
,
1228 MachineRegisterInfo
&MRI
) const {
1229 const MachineFunction
&MF
= *MI
.getMF();
1230 const GCNSubtarget
&ST
= MF
.getSubtarget
<GCNSubtarget
>();
1231 const auto &TFI
= *ST
.getFrameLowering();
1233 // Guard in case the stack growth direction ever changes with scratch
1235 if (TFI
.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown
)
1238 Register Dst
= MI
.getOperand(0).getReg();
1239 Register AllocSize
= MI
.getOperand(1).getReg();
1240 Align Alignment
= assumeAligned(MI
.getOperand(2).getImm());
1242 const RegisterBank
*SizeBank
= getRegBank(AllocSize
, MRI
, *TRI
);
1244 // TODO: Need to emit a wave reduction to get the maximum size.
1245 if (SizeBank
!= &AMDGPU::SGPRRegBank
)
1248 LLT PtrTy
= MRI
.getType(Dst
);
1249 LLT IntPtrTy
= LLT::scalar(PtrTy
.getSizeInBits());
1251 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
1252 Register SPReg
= Info
->getStackPtrOffsetReg();
1253 ApplyRegBankMapping
ApplyBank(*this, MRI
, &AMDGPU::SGPRRegBank
);
1254 MachineIRBuilder
B(MI
, ApplyBank
);
1256 auto WaveSize
= B
.buildConstant(LLT::scalar(32), ST
.getWavefrontSizeLog2());
1257 auto ScaledSize
= B
.buildShl(IntPtrTy
, AllocSize
, WaveSize
);
1259 auto SPCopy
= B
.buildCopy(PtrTy
, SPReg
);
1260 if (Alignment
> TFI
.getStackAlign()) {
1261 auto PtrAdd
= B
.buildPtrAdd(PtrTy
, SPCopy
, ScaledSize
);
1262 B
.buildMaskLowPtrBits(Dst
, PtrAdd
,
1263 Log2(Alignment
) + ST
.getWavefrontSizeLog2());
1265 B
.buildPtrAdd(Dst
, SPCopy
, ScaledSize
);
1268 MI
.eraseFromParent();
1272 bool AMDGPURegisterBankInfo::applyMappingImage(
1273 MachineInstr
&MI
, const AMDGPURegisterBankInfo::OperandsMapper
&OpdMapper
,
1274 MachineRegisterInfo
&MRI
, int RsrcIdx
) const {
1275 const int NumDefs
= MI
.getNumExplicitDefs();
1277 // The reported argument index is relative to the IR intrinsic call arguments,
1278 // so we need to shift by the number of defs and the intrinsic ID.
1279 RsrcIdx
+= NumDefs
+ 1;
1281 // Insert copies to VGPR arguments.
1282 applyDefaultMapping(OpdMapper
);
1284 // Fixup any SGPR arguments.
1285 SmallVector
<unsigned, 4> SGPRIndexes
;
1286 for (int I
= NumDefs
, NumOps
= MI
.getNumOperands(); I
!= NumOps
; ++I
) {
1287 if (!MI
.getOperand(I
).isReg())
1290 // If this intrinsic has a sampler, it immediately follows rsrc.
1291 if (I
== RsrcIdx
|| I
== RsrcIdx
+ 1)
1292 SGPRIndexes
.push_back(I
);
1295 executeInWaterfallLoop(MI
, MRI
, SGPRIndexes
);
1299 static Register
getSrcRegIgnoringCopies(const MachineRegisterInfo
&MRI
,
1301 MachineInstr
*Def
= getDefIgnoringCopies(Reg
, MRI
);
1305 // TODO: Guard against this being an implicit def
1306 return Def
->getOperand(0).getReg();
1309 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1310 // the three offsets (voffset, soffset and instoffset)
1311 static unsigned setBufferOffsets(MachineIRBuilder
&B
,
1312 const AMDGPURegisterBankInfo
&RBI
,
1313 Register CombinedOffset
, Register
&VOffsetReg
,
1314 Register
&SOffsetReg
, int64_t &InstOffsetVal
,
1316 const LLT S32
= LLT::scalar(32);
1317 MachineRegisterInfo
*MRI
= B
.getMRI();
1319 if (Optional
<int64_t> Imm
= getIConstantVRegSExtVal(CombinedOffset
, *MRI
)) {
1320 uint32_t SOffset
, ImmOffset
;
1321 if (AMDGPU::splitMUBUFOffset(*Imm
, SOffset
, ImmOffset
, &RBI
.Subtarget
,
1323 VOffsetReg
= B
.buildConstant(S32
, 0).getReg(0);
1324 SOffsetReg
= B
.buildConstant(S32
, SOffset
).getReg(0);
1325 InstOffsetVal
= ImmOffset
;
1327 B
.getMRI()->setRegBank(VOffsetReg
, AMDGPU::VGPRRegBank
);
1328 B
.getMRI()->setRegBank(SOffsetReg
, AMDGPU::SGPRRegBank
);
1329 return SOffset
+ ImmOffset
;
1336 std::tie(Base
, Offset
) =
1337 AMDGPU::getBaseWithConstantOffset(*MRI
, CombinedOffset
);
1339 uint32_t SOffset
, ImmOffset
;
1340 if ((int)Offset
> 0 && AMDGPU::splitMUBUFOffset(Offset
, SOffset
, ImmOffset
,
1341 &RBI
.Subtarget
, Alignment
)) {
1342 if (RBI
.getRegBank(Base
, *MRI
, *RBI
.TRI
) == &AMDGPU::VGPRRegBank
) {
1344 SOffsetReg
= B
.buildConstant(S32
, SOffset
).getReg(0);
1345 B
.getMRI()->setRegBank(SOffsetReg
, AMDGPU::SGPRRegBank
);
1346 InstOffsetVal
= ImmOffset
;
1347 return 0; // XXX - Why is this 0?
1350 // If we have SGPR base, we can use it for soffset.
1352 VOffsetReg
= B
.buildConstant(S32
, 0).getReg(0);
1353 B
.getMRI()->setRegBank(VOffsetReg
, AMDGPU::VGPRRegBank
);
1355 InstOffsetVal
= ImmOffset
;
1356 return 0; // XXX - Why is this 0?
1360 // Handle the variable sgpr + vgpr case.
1361 MachineInstr
*Add
= getOpcodeDef(AMDGPU::G_ADD
, CombinedOffset
, *MRI
);
1362 if (Add
&& (int)Offset
>= 0) {
1363 Register Src0
= getSrcRegIgnoringCopies(*MRI
, Add
->getOperand(1).getReg());
1364 Register Src1
= getSrcRegIgnoringCopies(*MRI
, Add
->getOperand(2).getReg());
1366 const RegisterBank
*Src0Bank
= RBI
.getRegBank(Src0
, *MRI
, *RBI
.TRI
);
1367 const RegisterBank
*Src1Bank
= RBI
.getRegBank(Src1
, *MRI
, *RBI
.TRI
);
1369 if (Src0Bank
== &AMDGPU::VGPRRegBank
&& Src1Bank
== &AMDGPU::SGPRRegBank
) {
1375 if (Src0Bank
== &AMDGPU::SGPRRegBank
&& Src1Bank
== &AMDGPU::VGPRRegBank
) {
1382 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1383 // have an SGPR offset and a VGPR resource.
1384 if (RBI
.getRegBank(CombinedOffset
, *MRI
, *RBI
.TRI
) == &AMDGPU::VGPRRegBank
) {
1385 VOffsetReg
= CombinedOffset
;
1387 VOffsetReg
= B
.buildCopy(S32
, CombinedOffset
).getReg(0);
1388 B
.getMRI()->setRegBank(VOffsetReg
, AMDGPU::VGPRRegBank
);
1391 SOffsetReg
= B
.buildConstant(S32
, 0).getReg(0);
1392 B
.getMRI()->setRegBank(SOffsetReg
, AMDGPU::SGPRRegBank
);
1396 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1397 const OperandsMapper
&OpdMapper
) const {
1398 MachineInstr
&MI
= OpdMapper
.getMI();
1399 MachineRegisterInfo
&MRI
= OpdMapper
.getMRI();
1401 const LLT S32
= LLT::scalar(32);
1402 Register Dst
= MI
.getOperand(0).getReg();
1403 LLT Ty
= MRI
.getType(Dst
);
1405 const RegisterBank
*RSrcBank
=
1406 OpdMapper
.getInstrMapping().getOperandMapping(1).BreakDown
[0].RegBank
;
1407 const RegisterBank
*OffsetBank
=
1408 OpdMapper
.getInstrMapping().getOperandMapping(2).BreakDown
[0].RegBank
;
1409 if (RSrcBank
== &AMDGPU::SGPRRegBank
&&
1410 OffsetBank
== &AMDGPU::SGPRRegBank
)
1411 return true; // Legal mapping
1413 // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1414 // here but don't have an MMO.
1416 unsigned LoadSize
= Ty
.getSizeInBits();
1418 if (LoadSize
== 256 || LoadSize
== 512) {
1419 NumLoads
= LoadSize
/ 128;
1420 Ty
= Ty
.divide(NumLoads
);
1423 // Use the alignment to ensure that the required offsets will fit into the
1424 // immediate offsets.
1425 const Align Alignment
= NumLoads
> 1 ? Align(16 * NumLoads
) : Align(1);
1427 MachineIRBuilder
B(MI
);
1428 MachineFunction
&MF
= B
.getMF();
1432 int64_t ImmOffset
= 0;
1434 unsigned MMOOffset
= setBufferOffsets(B
, *this, MI
.getOperand(2).getReg(),
1435 VOffset
, SOffset
, ImmOffset
, Alignment
);
1437 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1438 // can, but we need to track an MMO for that.
1439 const unsigned MemSize
= (Ty
.getSizeInBits() + 7) / 8;
1440 const Align
MemAlign(4); // FIXME: ABI type alignment?
1441 MachineMemOperand
*BaseMMO
= MF
.getMachineMemOperand(
1442 MachinePointerInfo(),
1443 MachineMemOperand::MOLoad
| MachineMemOperand::MODereferenceable
|
1444 MachineMemOperand::MOInvariant
,
1447 BaseMMO
= MF
.getMachineMemOperand(BaseMMO
, MMOOffset
, MemSize
);
1449 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1450 // assume that the buffer is unswizzled.
1452 Register RSrc
= MI
.getOperand(1).getReg();
1453 Register VIndex
= B
.buildConstant(S32
, 0).getReg(0);
1454 B
.getMRI()->setRegBank(VIndex
, AMDGPU::VGPRRegBank
);
1456 SmallVector
<Register
, 4> LoadParts(NumLoads
);
1458 MachineBasicBlock::iterator MII
= MI
.getIterator();
1459 MachineInstrSpan
Span(MII
, &B
.getMBB());
1461 for (int i
= 0; i
< NumLoads
; ++i
) {
1462 if (NumLoads
== 1) {
1465 LoadParts
[i
] = MRI
.createGenericVirtualRegister(Ty
);
1466 MRI
.setRegBank(LoadParts
[i
], AMDGPU::VGPRRegBank
);
1469 MachineMemOperand
*MMO
= BaseMMO
;
1471 BaseMMO
= MF
.getMachineMemOperand(BaseMMO
, MMOOffset
+ 16 * i
, MemSize
);
1473 B
.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD
)
1474 .addDef(LoadParts
[i
]) // vdata
1475 .addUse(RSrc
) // rsrc
1476 .addUse(VIndex
) // vindex
1477 .addUse(VOffset
) // voffset
1478 .addUse(SOffset
) // soffset
1479 .addImm(ImmOffset
+ 16 * i
) // offset(imm)
1480 .addImm(0) // cachepolicy, swizzled buffer(imm)
1481 .addImm(0) // idxen(imm)
1482 .addMemOperand(MMO
);
1485 // TODO: If only the resource is a VGPR, it may be better to execute the
1486 // scalar load in the waterfall loop if the resource is expected to frequently
1487 // be dynamically uniform.
1488 if (RSrcBank
!= &AMDGPU::SGPRRegBank
) {
1489 // Remove the original instruction to avoid potentially confusing the
1490 // waterfall loop logic.
1491 B
.setInstr(*Span
.begin());
1492 MI
.eraseFromParent();
1494 SmallSet
<Register
, 4> OpsToWaterfall
;
1496 OpsToWaterfall
.insert(RSrc
);
1497 executeInWaterfallLoop(B
, make_range(Span
.begin(), Span
.end()),
1498 OpsToWaterfall
, MRI
);
1501 if (NumLoads
!= 1) {
1503 B
.buildConcatVectors(Dst
, LoadParts
);
1505 B
.buildMerge(Dst
, LoadParts
);
1508 // We removed the instruction earlier with a waterfall loop.
1509 if (RSrcBank
== &AMDGPU::SGPRRegBank
)
1510 MI
.eraseFromParent();
1515 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper
&OpdMapper
,
1516 bool Signed
) const {
1517 MachineInstr
&MI
= OpdMapper
.getMI();
1518 MachineRegisterInfo
&MRI
= OpdMapper
.getMRI();
1520 // Insert basic copies
1521 applyDefaultMapping(OpdMapper
);
1523 Register DstReg
= MI
.getOperand(0).getReg();
1524 LLT Ty
= MRI
.getType(DstReg
);
1526 const LLT S32
= LLT::scalar(32);
1528 unsigned FirstOpnd
= MI
.getOpcode() == AMDGPU::G_INTRINSIC
? 2 : 1;
1529 Register SrcReg
= MI
.getOperand(FirstOpnd
).getReg();
1530 Register OffsetReg
= MI
.getOperand(FirstOpnd
+ 1).getReg();
1531 Register WidthReg
= MI
.getOperand(FirstOpnd
+ 2).getReg();
1533 const RegisterBank
*DstBank
=
1534 OpdMapper
.getInstrMapping().getOperandMapping(0).BreakDown
[0].RegBank
;
1535 if (DstBank
== &AMDGPU::VGPRRegBank
) {
1539 // There is no 64-bit vgpr bitfield extract instructions so the operation
1540 // is expanded to a sequence of instructions that implement the operation.
1541 ApplyRegBankMapping
ApplyBank(*this, MRI
, &AMDGPU::VGPRRegBank
);
1542 MachineIRBuilder
B(MI
, ApplyBank
);
1544 const LLT S64
= LLT::scalar(64);
1545 // Shift the source operand so that extracted bits start at bit 0.
1546 auto ShiftOffset
= Signed
? B
.buildAShr(S64
, SrcReg
, OffsetReg
)
1547 : B
.buildLShr(S64
, SrcReg
, OffsetReg
);
1548 auto UnmergeSOffset
= B
.buildUnmerge({S32
, S32
}, ShiftOffset
);
1550 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1551 // if the width is a constant.
1552 if (auto ConstWidth
= getIConstantVRegValWithLookThrough(WidthReg
, MRI
)) {
1553 // Use the 32-bit bitfield extract instruction if the width is a constant.
1554 // Depending on the width size, use either the low or high 32-bits.
1555 auto Zero
= B
.buildConstant(S32
, 0);
1556 auto WidthImm
= ConstWidth
->Value
.getZExtValue();
1557 if (WidthImm
<= 32) {
1558 // Use bitfield extract on the lower 32-bit source, and then sign-extend
1559 // or clear the upper 32-bits.
1561 Signed
? B
.buildSbfx(S32
, UnmergeSOffset
.getReg(0), Zero
, WidthReg
)
1562 : B
.buildUbfx(S32
, UnmergeSOffset
.getReg(0), Zero
, WidthReg
);
1564 Signed
? B
.buildAShr(S32
, Extract
, B
.buildConstant(S32
, 31)) : Zero
;
1565 B
.buildMerge(DstReg
, {Extract
, Extend
});
1567 // Use bitfield extract on upper 32-bit source, and combine with lower
1569 auto UpperWidth
= B
.buildConstant(S32
, WidthImm
- 32);
1572 ? B
.buildSbfx(S32
, UnmergeSOffset
.getReg(1), Zero
, UpperWidth
)
1573 : B
.buildUbfx(S32
, UnmergeSOffset
.getReg(1), Zero
, UpperWidth
);
1574 B
.buildMerge(DstReg
, {UnmergeSOffset
.getReg(0), Extract
});
1576 MI
.eraseFromParent();
1580 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1582 auto ExtShift
= B
.buildSub(S32
, B
.buildConstant(S32
, 64), WidthReg
);
1583 auto SignBit
= B
.buildShl(S64
, ShiftOffset
, ExtShift
);
1585 B
.buildAShr(S64
, SignBit
, ExtShift
);
1587 B
.buildLShr(S64
, SignBit
, ExtShift
);
1588 MI
.eraseFromParent();
1592 // The scalar form packs the offset and width in a single operand.
1594 ApplyRegBankMapping
ApplyBank(*this, MRI
, &AMDGPU::SGPRRegBank
);
1595 MachineIRBuilder
B(MI
, ApplyBank
);
1597 // Ensure the high bits are clear to insert the offset.
1598 auto OffsetMask
= B
.buildConstant(S32
, maskTrailingOnes
<unsigned>(6));
1599 auto ClampOffset
= B
.buildAnd(S32
, OffsetReg
, OffsetMask
);
1601 // Zeros out the low bits, so don't bother clamping the input value.
1602 auto ShiftWidth
= B
.buildShl(S32
, WidthReg
, B
.buildConstant(S32
, 16));
1604 // Transformation function, pack the offset and width of a BFE into
1605 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1606 // source, bits [5:0] contain the offset and bits [22:16] the width.
1607 auto MergedInputs
= B
.buildOr(S32
, ClampOffset
, ShiftWidth
);

  // TODO: It might be worth using a pseudo here to avoid scc clobber and
  // register class constraints.
  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);

  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    llvm_unreachable("failed to constrain BFE");

  MI.eraseFromParent();
  return true;
}

// Return a suitable opcode for extending the operands of Opc when widening.
static unsigned getExtendOp(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
    return TargetOpcode::G_SEXT;
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
    return TargetOpcode::G_ZEXT;
  default:
    return TargetOpcode::G_ANYEXT;
  }
}

// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
// any illegal vector extend or unmerge operations.
static std::pair<Register, Register>
unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
  const LLT S32 = LLT::scalar(32);
  auto Bitcast = B.buildBitcast(S32, Src);

  if (ExtOpcode == TargetOpcode::G_SEXT) {
    auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
    auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
    return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
  }

  auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
  if (ExtOpcode == TargetOpcode::G_ZEXT) {
    auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
    return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
  }

  assert(ExtOpcode == TargetOpcode::G_ANYEXT);
  return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
}

// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand.
static bool substituteSimpleCopyRegs(
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
  if (!SrcReg.empty()) {
    assert(SrcReg.size() == 1);
    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
    return true;
  }

  return false;
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register Reg) const {
  if (!Subtarget.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  LLT StoreVT = MRI.getType(Reg);
  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
    return Reg;

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(Unmerge.getReg(I));

  const LLT S32 = LLT::scalar(32);
  int NumElts = StoreVT.getNumElements();

  return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
}

static std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
  int64_t Const;
  if (mi_match(Reg, MRI, m_ICst(Const)))
    return std::make_pair(Register(), Const);

  Register Base;
  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
    return std::make_pair(Base, Const);

  // TODO: Handle G_OR used for add case
  return std::make_pair(Reg, 0);
}

std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);

  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
                                                           OrigOffset);

  unsigned C1 = 0;
  if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, put the value
    // and -4096 into the immoffset field so that the value that is copied/added
    // for the voffset field is a multiple of 4096, and it stands more chance
    // of being CSEd with the copy/add for another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
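    // For example, ImmOffset = 5000 is split into ImmOffset = 904 with
    // Overflow = 4096 folded into the base register below.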
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }

    C1 = ImmOffset;
    if (Overflow != 0) {
      if (!BaseReg)
        BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      else {
        auto OverflowVal = B.buildConstant(S32, Overflow);
        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
      }
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return {BaseReg, C1};
}

bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
                                        Register SrcReg) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.getSizeInBits() == 32) {
    // Use a v_mov_b32 here to make the exec dependency explicit.
    B.buildInstr(AMDGPU::V_MOV_B32_e32)
      .addDef(DstReg)
      .addUse(SrcReg);
    return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
           constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
  }

  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  B.buildInstr(AMDGPU::V_MOV_B32_e32)
    .addDef(TmpReg0)
    .addUse(SrcReg, 0, AMDGPU::sub0);
  B.buildInstr(AMDGPU::V_MOV_B32_e32)
    .addDef(TmpReg1)
    .addUse(SrcReg, 0, AMDGPU::sub1);
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(DstReg)
    .addUse(TmpReg0)
    .addImm(AMDGPU::sub0)
    .addUse(TmpReg1)
    .addImm(AMDGPU::sub1);

  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
         constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
}

/// Utility function for pushing dynamic vector indexes with a constant offset
/// into waterfall loops.
static void reinsertVectorIndexAdd(MachineIRBuilder &B,
                                   MachineInstr &IdxUseInstr,
                                   unsigned OpIdx,
                                   unsigned ConstOffset) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S32 = LLT::scalar(32);
  Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
  B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());

  auto MaterializedOffset = B.buildConstant(S32, ConstOffset);

  auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
  MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
  MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
  IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
}

/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
/// original 32-bit source value (to be inserted in the low part of the combined
/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
/// result.
static void extendLow32IntoHigh32(MachineIRBuilder &B,
                                  Register Hi32Reg, Register Lo32Reg,
                                  unsigned ExtOpc,
                                  const RegisterBank &RegBank,
                                  bool IsBooleanSrc = false) {
  if (ExtOpc == AMDGPU::G_ZEXT) {
    B.buildConstant(Hi32Reg, 0);
  } else if (ExtOpc == AMDGPU::G_SEXT) {
    if (IsBooleanSrc) {
      // If we know the original source was an s1, the high half is the same as
      // the low half.
      B.buildCopy(Hi32Reg, Lo32Reg);
    } else {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
      B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
      B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
    }
  } else {
    assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
    B.buildUndef(Hi32Reg);
  }
}
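
// Lower a dynamic G_EXTRACT_VECTOR_ELT to a chain of compare + select over the
// vector elements when SITargetLowering::shouldExpandVectorDynExt says the
// expansion is worthwhile. Returns true if the instruction was replaced.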
bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  const OperandsMapper &OpdMapper) const {

  Register VecReg = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(2).getReg();

  const RegisterBank &IdxBank =
    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;

  LLT VecTy = MRI.getType(VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx))
    return false;

  MachineIRBuilder B(MI);
  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank =
    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

  const RegisterBank &CCBank =
    (DstBank == AMDGPU::SGPRRegBank &&
     SrcBank == AMDGPU::SGPRRegBank &&
     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                     : AMDGPU::VCCRegBank;
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);

  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
  }

  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
  unsigned NumLanes = DstRegs.size();
  if (!NumLanes)
    NumLanes = 1;
  else
    EltTy = MRI.getType(DstRegs[0]);

  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
  SmallVector<Register, 2> Res(NumLanes);
  for (unsigned L = 0; L < NumLanes; ++L)
    Res[L] = UnmergeToEltTy.getReg(L);

  for (unsigned I = 1; I < NumElem; ++I) {
    auto IC = B.buildConstant(S32, I);
    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      auto S = B.buildSelect(EltTy, Cmp,
                             UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);

      for (unsigned N : { 0, 2, 3 })
        MRI.setRegBank(S->getOperand(N).getReg(), DstBank);

      Res[L] = S->getOperand(0).getReg();
    }
  }

  for (unsigned L = 0; L < NumLanes; ++L) {
    Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
    B.buildCopy(DstReg, Res[L]);
    MRI.setRegBank(DstReg, DstBank);
  }

  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
  MI.eraseFromParent();

  return true;
}

// Insert a cross regbank copy for a register if it already has a bank that
// differs from the one we want to set.
static Register constrainRegToBank(MachineRegisterInfo &MRI,
                                   MachineIRBuilder &B, Register &Reg,
                                   const RegisterBank &Bank) {
  const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
  if (CurrBank && *CurrBank != Bank) {
    Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
    MRI.setRegBank(Copy, Bank);
    return Copy;
  }

  MRI.setRegBank(Reg, Bank);
  return Reg;
}
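
// Lower a dynamic G_INSERT_VECTOR_ELT to a compare + select per element when
// SITargetLowering::shouldExpandVectorDynExt says the expansion is worthwhile.
// Returns true if the instruction was replaced.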
bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  const OperandsMapper &OpdMapper) const {

  Register VecReg = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(3).getReg();

  const RegisterBank &IdxBank =
    *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;

  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;

  LLT VecTy = MRI.getType(VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx))
    return false;

  MachineIRBuilder B(MI);
  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank =
    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank &InsBank =
    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  const RegisterBank &CCBank =
    (DstBank == AMDGPU::SGPRRegBank &&
     SrcBank == AMDGPU::SGPRRegBank &&
     InsBank == AMDGPU::SGPRRegBank &&
     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                     : AMDGPU::VCCRegBank;
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);

  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
  }

  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
  unsigned NumLanes = InsRegs.size();
  if (!NumLanes) {
    NumLanes = 1;
    InsRegs.push_back(MI.getOperand(2).getReg());
  } else {
    EltTy = MRI.getType(InsRegs[0]);
  }

  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
  SmallVector<Register, 16> Ops(NumElem * NumLanes);

  for (unsigned I = 0; I < NumElem; ++I) {
    auto IC = B.buildConstant(S32, I);
    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
      Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
      Op1 = constrainRegToBank(MRI, B, Op1, DstBank);

      Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
      MRI.setRegBank(Select, DstBank);

      Ops[I * NumLanes + L] = Select;
    }
  }

  LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
  if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
    B.buildBuildVector(MI.getOperand(0), Ops);
  } else {
    auto Vec = B.buildBuildVector(MergeTy, Ops);
    MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
    B.buildBitcast(MI.getOperand(0).getReg(), Vec);
  }

  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
  MI.eraseFromParent();

  return true;
}

void AMDGPURegisterBankInfo::applyMappingImpl(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  unsigned Opc = MI.getOpcode();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  switch (Opc) {
  case AMDGPU::G_PHI: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(1))
      break;

    const LLT S32 = LLT::scalar(32);
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VCCRegBank) {
      applyDefaultMapping(OpdMapper);
      // The standard handling only considers the result register bank for
      // phis. For VCC, blindly inserting a copy when the phi is lowered will
      // produce an invalid copy. We can only copy with some kind of compare to
      // get a vector boolean result. Insert a register bank copy that will be
      // correctly lowered to a compare.
      MachineIRBuilder B(*MI.getParent()->getParent());

      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
        Register SrcReg = MI.getOperand(I).getReg();
        const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);

        if (SrcBank != &AMDGPU::VCCRegBank) {
          MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
          B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());

          auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
          MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
          MI.getOperand(I).setReg(Copy.getReg(0));
        }
      }

      return;
    }

    // Phi handling is strange and only considers the bank of the destination.
    substituteSimpleCopyRegs(OpdMapper, 0);

    // Promote SGPR/VGPR booleans to s32
    MachineFunction *MF = MI.getParent()->getParent();
    ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
    MachineIRBuilder B(MI, ApplyBank);
    LegalizerHelper Helper(*MF, ApplyBank, B);

    if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("widen scalar should have succeeded");

    return;
  }
  case AMDGPU::G_ICMP:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE: {
    unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
    Register DstReg = MI.getOperand(BoolDstOp).getReg();

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank != &AMDGPU::SGPRRegBank)
      break;

    const bool HasCarryIn = MI.getNumOperands() == 5;

    // If this is a scalar compare, promote the result to s32, as the selection
    // will end up using a copy to a 32-bit vreg.
    const LLT S32 = LLT::scalar(32);
    Register NewDstReg = MRI.createGenericVirtualRegister(S32);
    MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
    MI.getOperand(BoolDstOp).setReg(NewDstReg);
    MachineIRBuilder B(MI);

    if (HasCarryIn) {
      Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
      B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
      MI.getOperand(4).setReg(NewSrcReg);
    }

    MachineBasicBlock *MBB = MI.getParent();
    B.setInsertPt(*MBB, std::next(MI.getIterator()));

    // If we had a constrained VCC result register, a copy was inserted to VCC
    // from SGPR.
    SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
    if (DefRegs.empty())
      DefRegs.push_back(DstReg);
    B.buildTrunc(DefRegs[0], NewDstReg);
    return;
  }
  case AMDGPU::G_SELECT: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
    if (CondRegs.empty())
      CondRegs.push_back(MI.getOperand(1).getReg());
    else {
      assert(CondRegs.size() == 1);
    }

    const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
    if (CondBank == &AMDGPU::SGPRRegBank) {
      MachineIRBuilder B(MI);
      const LLT S32 = LLT::scalar(32);
      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);

      MI.getOperand(1).setReg(NewCondReg);
      B.buildZExt(NewCondReg, CondRegs[0]);
    }

    if (DstTy.getSizeInBits() != 64)
      break;

    MachineIRBuilder B(MI);
    LLT HalfTy = getHalfSizedType(DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src1Regs.empty() && Src2Regs.empty());
      break;
    }

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else {
      setRegsToType(MRI, Src1Regs, HalfTy);
    }

    if (Src2Regs.empty())
      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
    else
      setRegsToType(MRI, Src2Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);
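
    // Build each 32-bit half of the result with its own select on the shared
    // condition.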
    B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
    B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_BRCOND: {
    Register CondReg = MI.getOperand(0).getReg();
    // FIXME: Should use legalizer helper, but should change bool ext type.
    const RegisterBank *CondBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    if (CondBank == &AMDGPU::SGPRRegBank) {
      MachineIRBuilder B(MI);
      const LLT S32 = LLT::scalar(32);
      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);

      MI.getOperand(0).setReg(NewCondReg);
      B.buildZExt(NewCondReg, CondReg);
    }

    return;
  }
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
    // there is a VGPR input.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    if (DstTy.getSizeInBits() == 1) {
      const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
      if (DstBank == &AMDGPU::VCCRegBank)
        break;

      MachineFunction *MF = MI.getParent()->getParent();
      ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
      MachineIRBuilder B(MI, ApplyBank);
      LegalizerHelper Helper(*MF, ApplyBank, B);

      if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
          LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");
      return;
    }

    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);
    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src0Regs.empty() && Src1Regs.empty());
      break;
    }

    assert(DefRegs.size() == 2);
    assert(Src0Regs.size() == Src1Regs.size() &&
           (Src0Regs.empty() || Src0Regs.size() == 2));

    // Depending on where the source registers came from, the generic code may
    // have decided to split the inputs already or not. If not, we still need to
    // extract the values.
    MachineIRBuilder B(MI);

    if (Src0Regs.empty())
      split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
    else
      setRegsToType(MRI, Src0Regs, HalfTy);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
    B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_ABS: {
    Register SrcReg = MI.getOperand(1).getReg();
    const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);

    // There is no VALU abs instruction so we need to replace it with a sub and
    // max combination.
    if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
      MachineFunction *MF = MI.getParent()->getParent();
      ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
      MachineIRBuilder B(MI, Apply);
      LegalizerHelper Helper(*MF, Apply, B);

      if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
        llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
      return;
    }
    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    // Packed 16-bit operations need to be scalarized and promoted.
    if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
      break;

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    const LLT S32 = LLT::scalar(32);
    MachineBasicBlock *MBB = MI.getParent();
    MachineFunction *MF = MBB->getParent();
    ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
    MachineIRBuilder B(MI, ApplySALU);

    if (DstTy.isVector()) {
      Register WideSrc0Lo, WideSrc0Hi;
      Register WideSrc1Lo, WideSrc1Hi;

      unsigned ExtendOp = getExtendOp(MI.getOpcode());
      std::tie(WideSrc0Lo, WideSrc0Hi)
        = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
      std::tie(WideSrc1Lo, WideSrc1Hi)
        = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
      auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
      auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
      B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
      MI.eraseFromParent();
    } else {
      LegalizerHelper Helper(*MF, ApplySALU, B);

      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");

      // FIXME: s16 shift amounts should be legal.
      if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
          Opc == AMDGPU::G_ASHR) {
        B.setInsertPt(*MBB, MI.getIterator());
        if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
          llvm_unreachable("widen scalar should have succeeded");
      }
    }

    return;
  }
  case AMDGPU::G_SEXT_INREG: {
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    if (SrcRegs.empty())
      break; // Nothing to repair

    const LLT S32 = LLT::scalar(32);
    MachineIRBuilder B(MI);
    ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
    GISelObserverWrapper Observer(&O);
    B.setChangeObserver(Observer);

    // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
    // we would need to further expand, and doesn't let us directly set the
    // result registers.
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    int Amt = MI.getOperand(2).getImm();
    if (Amt <= 32) {
      if (Amt == 32) {
        // The low bits are unchanged.
        B.buildCopy(DstRegs[0], SrcRegs[0]);
      } else {
        // Extend in the low bits and propagate the sign bit to the high half.
        B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
      }

      B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
    } else {
      // The low bits are unchanged, and extend in the high bits.
      B.buildCopy(DstRegs[0], SrcRegs[0]);
      B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
    }

    Register DstReg = MI.getOperand(0).getReg();
    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BITREVERSE: {
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyVALU);

    MachineFunction &MF = B.getMF();
    LegalizerHelper Helper(MF, ApplyVALU, B);

    if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("narrowScalar should have succeeded");
    return;
  }
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FFBL_B32:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    // We can narrow this more efficiently than Helper can by using ffbh/ffbl
    // which return -1 when the input is zero:
    // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
    // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
    // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
    // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
    ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyVALU);
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
                          ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
                          : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
                                ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
                                : Opc;
    unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
    auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
    auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
    unsigned AddOpc =
        Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
            ? AMDGPU::G_ADD
            : AMDGPU::G_UADDSAT;
    Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
    Register DstReg = MI.getOperand(0).getReg();
    B.buildUMin(DstReg, X, Y);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_ANYEXT: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    const bool Signed = Opc == AMDGPU::G_SEXT;

    assert(empty(OpdMapper.getVRegs(1)));

    MachineIRBuilder B(MI);
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that rounds to s64 when irregular
        // breakdowns supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      // Extend to 32-bit, and then extend the low half.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(DefRegs[0], SrcReg);
      } else if (Opc == AMDGPU::G_ZEXT) {
        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
      } else {
        B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
      }

      extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
      MRI.setRegBank(DstReg, *SrcBank);
      MI.eraseFromParent();
      return;
    }

    if (SrcTy != LLT::scalar(1))
      return;

    // It is not legal to have a legalization artifact with a VCC source. Rather
    // than introducing a copy, insert the select we would have to select the
    // copy anyway.
    if (SrcBank == &AMDGPU::VCCRegBank) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;

      unsigned DstSize = DstTy.getSizeInBits();
      // 64-bit select is SGPR only
      const bool UseSel64 = DstSize > 32 &&
        SrcBank->getID() == AMDGPU::SGPRRegBankID;

      // TODO: Should s16 select be legal?
      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
      auto True = B.buildConstant(SelType, Signed ? -1 : 1);
      auto False = B.buildConstant(SelType, 0);

      MRI.setRegBank(True.getReg(0), *DstBank);
      MRI.setRegBank(False.getReg(0), *DstBank);
      MRI.setRegBank(DstReg, *DstBank);

      if (DstSize > 32) {
        B.buildSelect(DefRegs[0], SrcReg, True, False);
        extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
      } else if (DstSize < 32) {
        auto Sel = B.buildSelect(SelType, SrcReg, True, False);
        MRI.setRegBank(Sel.getReg(0), *DstBank);
        B.buildTrunc(DstReg, Sel);
      } else {
        B.buildSelect(DstReg, SrcReg, True, False);
      }

      MI.eraseFromParent();
      return;
    }

    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::fixed_vector(2, 16))
      break;

    assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
    substituteSimpleCopyRegs(OpdMapper, 1);
    substituteSimpleCopyRegs(OpdMapper, 2);

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break; // Can use S_PACK_* instructions.

    MachineIRBuilder B(MI);

    Register Lo = MI.getOperand(1).getReg();
    Register Hi = MI.getOperand(2).getReg();
    const LLT S32 = LLT::scalar(32);

    const RegisterBank *BankLo =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *BankHi =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    Register ZextLo;
    Register ShiftHi;
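
    // Pack the two 16-bit elements into one 32-bit scalar (Lo in bits [15:0],
    // Hi in bits [31:16]) and bitcast the result back to <2 x s16>.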
    if (Opc == AMDGPU::G_BUILD_VECTOR) {
      ZextLo = B.buildZExt(S32, Lo).getReg(0);
      MRI.setRegBank(ZextLo, *BankLo);

      Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
      MRI.setRegBank(ZextHi, *BankHi);

      auto ShiftAmt = B.buildConstant(S32, 16);
      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);

      ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
      MRI.setRegBank(ShiftHi, *BankHi);
    } else {
      Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
      MRI.setRegBank(MaskLo, *BankLo);

      auto ShiftAmt = B.buildConstant(S32, 16);
      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);

      ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
      MRI.setRegBank(ShiftHi, *BankHi);

      ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
      MRI.setRegBank(ZextLo, *BankLo);
    }

    auto Or = B.buildOr(S32, ZextLo, ShiftHi);
    MRI.setRegBank(Or.getReg(0), *DstBank);

    B.buildBitcast(DstReg, Or);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());

    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();

    const LLT S32 = LLT::scalar(32);
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);

    if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
      return;

    MachineIRBuilder B(MI);

    const ValueMapping &DstMapping
      = OpdMapper.getInstrMapping().getOperandMapping(0);
    const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *IdxBank =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(BaseIdxReg, ConstOffset) =
        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());

    // See if the index is an add of a constant which will be foldable by moving
    // the base register of the index later if this is going to be executed in a
    // waterfall loop. This is essentially to reassociate the add of a constant
    // with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
                                   ConstOffset > 0 &&
                                   ConstOffset < SrcTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(2).setReg(BaseIdxReg);

    // If this is a VGPR result only because the index was a VGPR result, the
    // actual indexing will be done on the SGPR source vector, which will
    // produce a scalar result. We need to copy to the VGPR result inside the
    // waterfall loop.
    const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
                                SrcBank == &AMDGPU::SGPRRegBank;
    if (DstRegs.empty()) {
      applyDefaultMapping(OpdMapper);

      executeInWaterfallLoop(MI, MRI, { 2 });

      if (NeedCopyToVGPR) {
        // We don't want a phi for this temporary reg.
        Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
        MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
        MI.getOperand(0).setReg(TmpReg);
        B.setInsertPt(*MI.getParent(), ++MI.getIterator());

        // Use a v_mov_b32 here to make the exec dependency explicit.
        buildVCopy(B, DstReg, TmpReg);
      }

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop)
        reinsertVectorIndexAdd(B, MI, 2, ConstOffset);

      return;
    }

    assert(DstTy.getSizeInBits() == 64);
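
    // A 64-bit result is handled as two 32-bit extracts from the source
    // bitcast to <2 * N x s32>, at indices 2 * Idx and 2 * Idx + 1.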

    LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);

    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
    auto One = B.buildConstant(S32, 1);

    MachineBasicBlock::iterator MII = MI.getIterator();

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MII, &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
    auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);

    MRI.setRegBank(DstReg, *DstBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
      MI.eraseFromParent();
      return;
    }

    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);

    if (NeedCopyToVGPR) {
      MachineBasicBlock *LoopBB = Extract1->getParent();
      Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
      Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
      MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);

      Extract0->getOperand(0).setReg(TmpReg0);
      Extract1->getOperand(0).setReg(TmpReg1);

      B.setInsertPt(*LoopBB, ++Extract1->getIterator());

      buildVCopy(B, DstRegs[0], TmpReg0);
      buildVCopy(B, DstRegs[1], TmpReg1);
    }

    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);

    return;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));

    Register DstReg = MI.getOperand(0).getReg();
    LLT VecTy = MRI.getType(DstReg);

    assert(OpdMapper.getVRegs(0).empty());
    assert(OpdMapper.getVRegs(3).empty());

    if (substituteSimpleCopyRegs(OpdMapper, 1))
      MRI.setType(MI.getOperand(1).getReg(), VecTy);

    if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
      return;

    const RegisterBank *IdxBank =
      OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;

    Register SrcReg = MI.getOperand(1).getReg();
    Register InsReg = MI.getOperand(2).getReg();
    LLT InsTy = MRI.getType(InsReg);
    (void)InsTy;

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(BaseIdxReg, ConstOffset) =
        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());

    // See if the index is an add of a constant which will be foldable by moving
    // the base register of the index later if this is going to be executed in a
    // waterfall loop. This is essentially to reassociate the add of a constant
    // with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
                                   ConstOffset > 0 &&
                                   ConstOffset < VecTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(3).setReg(BaseIdxReg);

    if (InsRegs.empty()) {
      executeInWaterfallLoop(MI, MRI, { 3 });

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop) {
        MachineIRBuilder B(MI);
        reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
      }

      return;
    }

    assert(InsTy.getSizeInBits() == 64);
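
    // As with the extract case, a 64-bit element is inserted as two 32-bit
    // pieces into the source bitcast to <2 * N x s32>.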

    const LLT S32 = LLT::scalar(32);
    LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);

    MachineIRBuilder B(MI);
    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
    auto One = B.buildConstant(S32, 1);

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
    auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *InsSrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    MRI.setRegBank(InsReg, *InsSrcBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(InsLo.getReg(0), *DstBank);
    MRI.setRegBank(InsHi.getReg(0), *DstBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
      B.setInsertPt(B.getMBB(), MI);
      B.buildBitcast(DstReg, InsHi);
      MI.eraseFromParent();
      return;
    }

    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    // Figure out the point after the waterfall loop before mangling the control
    // flow.
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);

    // The insertion point is now right after the original instruction.
    //
    // Keep the bitcast to the original vector type out of the loop. Doing this
    // saved an extra phi we don't need inside the loop.
    B.buildBitcast(DstReg, InsHi);

    // Re-insert the constant offset add inside the waterfall loop.
    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);

    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {1, 4});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {2, 5});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {2, 5});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {3, 6});
    return;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
    applyMappingSBufferLoad(OpdMapper);
    return;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    case Intrinsic::amdgcn_readlane: {
      substituteSimpleCopyRegs(OpdMapper, 2);

      assert(OpdMapper.getVRegs(0).empty());
      assert(OpdMapper.getVRegs(3).empty());

      // Make sure the index is an SGPR. It doesn't make sense to run this in a
      // waterfall loop, so assume it's a uniform value.
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_writelane: {
      assert(OpdMapper.getVRegs(0).empty());
      assert(OpdMapper.getVRegs(2).empty());
      assert(OpdMapper.getVRegs(3).empty());

      substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
      constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      applyDefaultMapping(OpdMapper);

      // Readlane for m0 value, which is always the last operand.
      // FIXME: Should this be a waterfall loop instead?
      constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
      return;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      // Doing a waterfall loop over these wouldn't make any sense.
      substituteSimpleCopyRegs(OpdMapper, 2);
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 4);
      constrainOpWithReadfirstlane(MI, MRI, 5);
      return;
    }
    case Intrinsic::amdgcn_sbfe:
      applyMappingBFE(OpdMapper, true);
      return;
    case Intrinsic::amdgcn_ubfe:
      applyMappingBFE(OpdMapper, false);
      return;
    case Intrinsic::amdgcn_ballot:
      // Use default handling and insert copy to vcc source.
      break;
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    const AMDGPU::RsrcIntrinsic *RSrcIntrin
      = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
    assert(RSrcIntrin && RSrcIntrin->IsImage);
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
    return;
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, { N });
    return;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
      assert(OpdMapper.getVRegs(0).empty());
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Only the first lane is executed, so readfirstlane is safe.
      substituteSimpleCopyRegs(OpdMapper, 1);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // Only the first lane is executed, so readfirstlane is safe.
      constrainOpWithReadfirstlane(MI, MRI, 1); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // FIXME: Should this use a waterfall loop?
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_setreg: {
      constrainOpWithReadfirstlane(MI, MRI, 2);
      return;
    }
    default: {
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
        if (RSrcIntrin->IsImage) {
          applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
          return;
        }
      }

      break;
    }
    }
    break;
  }
  case AMDGPU::G_SI_CALL: {
    // Use a set to avoid extra readfirstlanes in the case where multiple
    // operands are the same register.
    SmallSet<Register, 4> SGPROperandRegs;

    if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
      break;

    // Move all copies to physical SGPRs that are used by the call instruction
    // into the loop block. Start searching for these copies until the
    // ADJCALLSTACKUP.
    unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
    unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;

    // Move all non-copies before the copies, so that a complete range can be
    // moved into the waterfall loop.
    SmallVector<MachineInstr *, 4> NonCopyInstrs;
    // Count of NonCopyInstrs found until the current LastCopy.
    unsigned NonCopyInstrsLen = 0;
    MachineBasicBlock::iterator Start(&MI);
    MachineBasicBlock::iterator LastCopy = Start;
    MachineBasicBlock *MBB = MI.getParent();
    const SIMachineFunctionInfo *Info =
        MBB->getParent()->getInfo<SIMachineFunctionInfo>();
    while (Start->getOpcode() != FrameSetupOpcode) {
      --Start;
      bool IsCopy = false;
      if (Start->getOpcode() == AMDGPU::COPY) {
        auto &Dst = Start->getOperand(0);
        if (Dst.isReg()) {
          Register Reg = Dst.getReg();
          if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
            IsCopy = true;
          } else {
            // Also move the copy from the scratch rsrc descriptor into the loop
            // to allow it to be optimized away.
            auto &Src = Start->getOperand(1);
            if (Src.isReg()) {
              Reg = Src.getReg();
              IsCopy = Info->getScratchRSrcReg() == Reg;
            }
          }
        }
      }

      if (IsCopy) {
        LastCopy = Start;
        NonCopyInstrsLen = NonCopyInstrs.size();
      } else {
        NonCopyInstrs.push_back(&*Start);
      }
    }
    NonCopyInstrs.resize(NonCopyInstrsLen);

    for (auto *NonCopy : reverse(NonCopyInstrs)) {
      MBB->splice(LastCopy, MBB, NonCopy->getIterator());
    }
    Start = LastCopy;

    // Do the same for copies after the loop
    NonCopyInstrs.clear();
    NonCopyInstrsLen = 0;
    MachineBasicBlock::iterator End(&MI);
    LastCopy = End;
    while (End->getOpcode() != FrameDestroyOpcode) {
      ++End;
      bool IsCopy = false;
      if (End->getOpcode() == AMDGPU::COPY) {
        auto &Src = End->getOperand(1);
        if (Src.isReg()) {
          Register Reg = Src.getReg();
          IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
        }
      }

      if (IsCopy) {
        LastCopy = End;
        NonCopyInstrsLen = NonCopyInstrs.size();
      } else {
        NonCopyInstrs.push_back(&*End);
      }
    }
    NonCopyInstrs.resize(NonCopyInstrsLen);
    End = LastCopy;

    for (auto *NonCopy : reverse(NonCopyInstrs)) {
      MBB->splice(LastCopy, MBB, NonCopy->getIterator());
    }

    MachineIRBuilder B(*Start);
    executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD: {
    if (applyMappingLoad(MI, OpdMapper, MRI))
      return;
    break;
  }
  case AMDGPU::G_DYN_STACKALLOC:
    applyMappingDynStackAlloc(MI, OpdMapper, MRI);
    return;
  case AMDGPU::G_SBFX:
    applyMappingBFE(OpdMapper, /*Signed*/ true);
    return;
  case AMDGPU::G_UBFX:
    applyMappingBFE(OpdMapper, /*Signed*/ false);
    return;
  default:
    break;
  }

  return applyDefaultMapping(OpdMapper);
}

// vgpr, sgpr -> vgpr
// vgpr, agpr -> vgpr
// agpr, agpr -> agpr
// agpr, sgpr -> vgpr
static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
  if (RB0 == AMDGPU::InvalidRegBankID)
    return RB1;
  if (RB1 == AMDGPU::InvalidRegBankID)
    return RB0;

  if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
    return AMDGPU::SGPRRegBankID;

  if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
    return AMDGPU::AGPRRegBankID;

  return AMDGPU::VGPRRegBankID;
}

static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
  if (RB0 == AMDGPU::InvalidRegBankID)
    return RB1;
  if (RB1 == AMDGPU::InvalidRegBankID)
    return RB0;

  // vcc, vcc -> vcc
  // vcc, sgpr -> vcc
  // vcc, vgpr -> vcc
  if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
    return AMDGPU::VCCRegBankID;

  // vcc, vgpr -> vgpr
  return regBankUnion(RB0, RB1);
}

unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
                                                const MachineInstr &MI) const {
  unsigned RegBank = AMDGPU::InvalidRegBankID;

  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
      RegBank = regBankUnion(RegBank, Bank->getID());
      if (RegBank == AMDGPU::VGPRRegBankID)
        break;
    }
  }

  return RegBank;
}

bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
      if (Bank->getID() != AMDGPU::SGPRRegBankID)
        return false;
    }
  }
  return true;
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    const MachineOperand &SrcOp = MI.getOperand(i);
    if (!SrcOp.isReg())
      continue;

    unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
    OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
  }
  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());

  // Even though we technically could use SGPRs, this would require knowledge of
  // the constant bus restriction. Force all sources to VGPR (except for VCC).
  //
  // TODO: Unary ops are trivially OK, so accept SGPRs?
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    const MachineOperand &Src = MI.getOperand(i);
    if (!Src.isReg())
      continue;

    unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
    OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Op = MI.getOperand(I);
    if (!Op.isReg())
      continue;

    unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
                                        const MachineInstr &MI,
                                        int RsrcIdx) const {
  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += MI.getNumExplicitDefs() + 1;

  const int NumOps = MI.getNumOperands();
  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);

  // TODO: Should packed/unpacked D16 difference be reported here as part of
  // the value mapping?
  for (int I = 0; I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    Register OpReg = MI.getOperand(I).getReg();
    // We replace some dead address operands with $noreg.
    if (!OpReg)
      continue;

    unsigned Size = getSizeInBits(OpReg, MRI, *TRI);

    // FIXME: Probably need a new intrinsic register bank searchable table to
    // handle arbitrary intrinsics easily.
    //
    // If this has a sampler, it immediately follows rsrc.
    const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;

    if (MustBeSGPR) {
      // This must be an SGPR, so we must report whatever it is as legal.
      unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
    } else {
      // Some operands must be VGPR, and these are easy to copy to.
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    }
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}
/// Return the mapping for a pointer argument.
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
                                              Register PtrReg) const {
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned Size = PtrTy.getSizeInBits();
  if (Subtarget.useFlatForGlobal() ||
      !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
    return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

  // If we're using MUBUF instructions for global memory, an SGPR base register
  // is possible. Otherwise this needs to be a VGPR.
  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
  return AMDGPU::getValueMapping(PtrBank->getID(), Size);
}
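// Pick the mapping for a generic load. A load with a uniform (SGPR) pointer
// into a flat/global address space that can legally be selected as a scalar
// load gets SGPR result and pointer mappings; everything else gets a VGPR
// result, with an SGPR base still allowed when MUBUF addressing is used.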
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping *, 2> OpdsMapping(2);
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AS = PtrTy.getAddressSpace();
  unsigned PtrSize = PtrTy.getSizeInBits();

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);

  if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
    if (isScalarLoadLegal(MI)) {
      // We have a uniform instruction so we want to use an SMRD load.
      ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
    } else {
      ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      // If we're using MUBUF instructions for global memory, an SGPR base
      // register is possible. Otherwise this needs to be a VGPR.
      unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
        AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;

      PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
    }
  } else {
    ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;
  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
  return Mapping;

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
}
unsigned
AMDGPURegisterBankInfo::getRegBankID(Register Reg,
                                     const MachineRegisterInfo &MRI,
                                     unsigned Default) const {
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  return Bank ? Bank->getID() : Default;
}
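// Helpers for the common per-operand mappings: report an operand as SGPR,
// VGPR, or AGPR with its current size. getSGPROpMapping deliberately accepts
// whatever bank the register is currently in; values that really need to be
// uniform are repaired later by applyMapping using a waterfall loop.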
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  // Lie and claim anything is legal, even though this needs to be an SGPR;
  // applyMapping will have to deal with it as a waterfall loop.
  unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(Bank, Size);
}
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
}
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR to SGPR copy to be generated is illegal.
///
// Operands that must be SGPRs must accept potentially divergent VGPRs as
// legal. These will be dealt with in applyMappingImpl.
//
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
    // The default logic bothers to analyze impossible alternative mappings. We
    // want the most straightforward mapping, so just directly handle this.
    const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
                                             *TRI);
    const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
                                             *TRI);
    assert(SrcBank && "src bank should have been assigned already");
    if (!DstBank)
      DstBank = SrcBank;

    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (cannotCopy(*DstBank, *SrcBank, Size))
      return getInvalidInstructionMapping();

    const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
    unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
    SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
    OpdsMapping[0] = &ValMap;
    if (MI.getOpcode() == AMDGPU::G_FREEZE)
      OpdsMapping[1] = &ValMap;

    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
  }
  if (MI.isRegSequence()) {
    // If any input is a VGPR, the result must be a VGPR. The default handling
    // assumes any copy between banks is legal.
    unsigned BankID = AMDGPU::SGPRRegBankID;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
      // It doesn't make sense to use vcc or scc banks here, so just ignore
      // them.
      if (OpBank != AMDGPU::SGPRRegBankID) {
        BankID = AMDGPU::VGPRRegBankID;
        break;
      }
    }
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }
  // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
  // properly.
  //
  // TODO: There are additional exec masking dependencies to analyze.
  if (MI.getOpcode() == TargetOpcode::G_PHI) {
    unsigned ResultBank = AMDGPU::InvalidRegBankID;
    Register DstReg = MI.getOperand(0).getReg();

    // Sometimes the result may have already been assigned a bank.
    if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
      ResultBank = DstBank->getID();

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      Register Reg = MI.getOperand(I).getReg();
      const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);

      // FIXME: Assuming VGPR for any undetermined inputs.
      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
        ResultBank = AMDGPU::VGPRRegBankID;
        break;
      }

      // FIXME: Need to promote SGPR case to s32.
      unsigned OpBank = Bank->getID();
      ResultBank = regBankBoolUnion(ResultBank, OpBank);
    }

    assert(ResultBank != AMDGPU::InvalidRegBankID);

    unsigned Size = MRI.getType(DstReg).getSizeInBits();

    const ValueMapping &ValMap =
        getValueMapping(0, Size, getRegBank(ResultBank));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }
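  // For everything else, try the generic mapping first; the switch below only
  // supplies the target-specific mappings that the generic code cannot derive.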
  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
  if (Mapping.isValid())
    return Mapping;

  SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());

  switch (MI.getOpcode()) {
  default:
    return getInvalidInstructionMapping();
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if (Size == 1) {
      const RegisterBank *DstBank
        = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);

      unsigned TargetBankID = AMDGPU::InvalidRegBankID;
      unsigned BankLHS = AMDGPU::InvalidRegBankID;
      unsigned BankRHS = AMDGPU::InvalidRegBankID;
      if (DstBank) {
        TargetBankID = DstBank->getID();
        if (DstBank == &AMDGPU::VCCRegBank) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else {
          BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
          BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
        }
      } else {
        BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
                               AMDGPU::VCCRegBankID);
        BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
                               AMDGPU::VCCRegBankID);

        // Both inputs should be true booleans to produce a boolean result.
        if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
          TargetBankID = AMDGPU::VGPRRegBankID;
        } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
          TargetBankID = AMDGPU::SGPRRegBankID;
        }
      }

      OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
      break;
    }

    if (Size == 64) {

      if (isSALUMapping(MI)) {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
        OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
      } else {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
        unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
        OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);

        unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
        OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
      }

      break;
    }

    LLVM_FALLTHROUGH;
  }
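  // Integer arithmetic, shifts, min/max, and a few other binary ops: use the
  // all-SGPR mapping when every operand is already uniform, otherwise fall
  // back to the all-VGPR mapping.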
  case AMDGPU::G_PTR_ADD:
  case AMDGPU::G_PTRMASK:
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX:
  case AMDGPU::G_ABS:
  case AMDGPU::G_SHUFFLE_VECTOR:
  case AMDGPU::G_SBFX:
  case AMDGPU::G_UBFX:
    if (isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
  case AMDGPU::G_SSUBSAT:
  case AMDGPU::G_UADDSAT:
  case AMDGPU::G_USUBSAT:
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FPTOSI:
  case AMDGPU::G_FPTOUI:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_SITOFP:
  case AMDGPU::G_UITOFP:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FEXP2:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
  case AMDGPU::G_FSHR: // TODO: Expand for scalar
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
  case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
  case AMDGPU::G_AMDGPU_SMED3:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_UMULH:
  case AMDGPU::G_SMULH: {
    if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  }
  case AMDGPU::G_IMPLICIT_DEF: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FCONSTANT:
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_READCYCLECOUNTER: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FRAME_INDEX: {
    // TODO: This should be the same as other constants, but eliminateFrameIndex
    // currently assumes VALU uses.
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_DYN_STACKALLOC: {
    // Result is always uniform, and a wave reduction is needed for the source.
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
    break;
  }
  case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
    // This case is weird because we expect a physical register in the source,
    // but need to set a bank anyway.
    //
    // We could select the result to SGPR or VGPR, but for the one current use
    // it's more practical to always use VGPR.
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
    OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
    break;
  }
  case AMDGPU::G_INSERT: {
    unsigned BankID = getMappingType(MRI, MI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
    OpdsMapping[3] = nullptr;
    break;
  }
  case AMDGPU::G_EXTRACT: {
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = nullptr;
    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == LLT::fixed_vector(2, 16)) {
      unsigned DstSize = DstTy.getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
      unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
      unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);

      OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
      break;
    }

    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_MERGE_VALUES:
  case AMDGPU::G_CONCAT_VECTORS: {
    unsigned Bank = getMappingType(MRI, MI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    // Op1 and Dst should use the same register bank.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_BITREVERSE:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_INTTOPTR:
  case AMDGPU::G_PTRTOINT:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FNEG: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FFBL_B32:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
    OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
    break;
  }
  case AMDGPU::G_CTPOP: {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);

    // This should really be getValueMappingSGPR64Only, but allowing the generic
    // code to handle the register split just makes using LegalizerHelper more
    // difficult.
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_TRUNC: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned Bank = getRegBankID(Src, MRI);
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_SEXT_INREG: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);

    unsigned DstBank;
    const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
    switch (SrcBank->getID()) {
    case AMDGPU::SGPRRegBankID:
      DstBank = AMDGPU::SGPRRegBankID;
      break;
    default:
      DstBank = AMDGPU::VGPRRegBankID;
      break;
    }

    // Scalar extend can use 64-bit BFE, but VGPRs require extending to
    // 32-bits, and then to 64.
    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
                                                       SrcSize);
    break;
  }
  case AMDGPU::G_FCMP: {
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_STORE: {
    assert(MI.getOperand(0).isReg());
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    // FIXME: We need to specify a different reg bank once scalar stores are
    // supported.
    const ValueMapping *ValMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    OpdsMapping[0] = ValMapping;
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    break;
  }
  case AMDGPU::G_ICMP: {
    auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

    // See if the result register has already been constrained to vcc, which may
    // happen due to control flow intrinsic lowering.
    unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);

    bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
                     Op2Bank == AMDGPU::SGPRRegBankID &&
                     Op3Bank == AMDGPU::SGPRRegBankID &&
                     (Size == 32 || (Size == 64 &&
                                     (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
                                     Subtarget.hasScalarCompareEq64()));

    DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    // TODO: Use 32-bit for scalar output size.
    // SCC results will need to be copied to a 32-bit SGPR virtual register.
    const unsigned ResultSize = 1;

    OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    // VGPR index can be used for waterfall when indexing a SGPR vector.
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);

    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);

    // The index can be either if the source vector is VGPR.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
    unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);

    // This is a weird case, because we need to break down the mapping based on
    // the register bank of a different operand.
    if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
      OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
                                                      InsertSize);
    } else {
      assert(InsertSize == 32 || InsertSize == 64);
      OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
    }

    // The index can be either if the source vector is VGPR.
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
    break;
  }
  case AMDGPU::G_UNMERGE_VALUES: {
    unsigned Bank = getMappingType(MRI, MI);

    // Op1 and Dst should use the same register bank.
    // FIXME: Shouldn't this be the default? Why do we need to handle this?
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
    }
    break;
  }
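  // Buffer operations follow a fixed operand convention: vdata/result in
  // VGPRs, the resource descriptor and scalar offset in SGPRs, and the vector
  // index/offset in VGPRs. The SGPR operands are still reported with their
  // current bank and repaired with a waterfall loop if they turn out to be
  // divergent.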
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // cmp
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // We need to convert this to a MUBUF if either the resource or offset is
    // VGPR.
    unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
    unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
    unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);

    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
    break;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fma_legacy:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_mul_u24:
    case Intrinsic::amdgcn_mul_i24:
    case Intrinsic::amdgcn_mulhi_u24:
    case Intrinsic::amdgcn_mulhi_i24:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_perm:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_ubfe:
      if (isSALUMapping(MI))
        return getDefaultMappingSOP(MI);
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_ds_swizzle:
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
    case Intrinsic::amdgcn_mov_dpp8:
    case Intrinsic::amdgcn_mov_dpp:
    case Intrinsic::amdgcn_strict_wwm:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_strict_wqm:
    case Intrinsic::amdgcn_wqm:
    case Intrinsic::amdgcn_softwqm:
    case Intrinsic::amdgcn_set_inactive:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize:
    case Intrinsic::amdgcn_reloc_constant:
    case Intrinsic::returnaddress: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ps_live: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
      // to legalize.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
    case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
      OpdsMapping[0] =
          Info->mayNeedAGPRs()
              ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
              : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] =
          Info->mayNeedAGPRs()
              ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
              : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ballot: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    auto IntrID = MI.getIntrinsicID();
    const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    if (N == 3) {
      // Sequential form: all operands combined into VGPR256/VGPR512
      unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      if (Size > 256)
        Size = 512;
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    } else {
      // NSA form
      for (unsigned I = 2; I < N; ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmax:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store.
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }
  case AMDGPU::G_SI_CALL: {
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // Allow anything for implicit arguments.
    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
      if (MI.getOperand(I).isReg()) {
        Register Reg = MI.getOperand(I).getReg();
        auto OpBank = getRegBankID(Reg, MRI);
        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
      }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}