1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 /// This file implements the targeting of the RegisterBankInfo class for
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPURegisterBankInfo.h"
15 #include "AMDGPUInstrInfo.h"
16 #include "AMDGPUSubtarget.h"
17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/ADT/SmallSet.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
25 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
26 #include "llvm/CodeGen/TargetRegisterInfo.h"
27 #include "llvm/CodeGen/TargetSubtargetInfo.h"
28 #include "llvm/IR/Constants.h"
30 #define GET_TARGET_REGBANK_IMPL
31 #include "AMDGPUGenRegisterBank.inc"
33 // This file will be TableGen'ed at some point.
34 #include "AMDGPUGenRegisterBankInfo.def"
37 using namespace MIPatternMatch
;
41 // Observer to apply a register bank to new registers created by LegalizerHelper.
42 class ApplyRegBankMapping final
: public GISelChangeObserver
{
44 MachineRegisterInfo
&MRI
;
45 const RegisterBank
*NewBank
;
46 SmallVector
<MachineInstr
*, 4> NewInsts
;
49 ApplyRegBankMapping(MachineRegisterInfo
&MRI_
, const RegisterBank
*RB
)
50 : MRI(MRI_
), NewBank(RB
) {}
52 ~ApplyRegBankMapping() {
53 for (MachineInstr
*MI
: NewInsts
)
57 /// Set any registers that don't have a set register class or bank to SALU.
58 void applyBank(MachineInstr
&MI
) {
59 for (MachineOperand
&Op
: MI
.operands()) {
63 Register Reg
= Op
.getReg();
64 if (MRI
.getRegClassOrRegBank(Reg
))
67 const RegisterBank
*RB
= NewBank
;
68 // FIXME: This might not be enough to detect when SCC should be used.
69 if (MRI
.getType(Reg
) == LLT::scalar(1))
70 RB
= (NewBank
== &AMDGPU::SGPRRegBank
?
71 &AMDGPU::SCCRegBank
: &AMDGPU::VCCRegBank
);
73 MRI
.setRegBank(Reg
, *RB
);
77 void erasingInstr(MachineInstr
&MI
) override
{}
79 void createdInstr(MachineInstr
&MI
) override
{
80 // At this point, the instruction was just inserted and has no operands.
81 NewInsts
.push_back(&MI
);
84 void changingInstr(MachineInstr
&MI
) override
{}
85 void changedInstr(MachineInstr
&MI
) override
{}
89 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget
&ST
)
90 : AMDGPUGenRegisterBankInfo(),
92 TRI(Subtarget
.getRegisterInfo()),
93 TII(Subtarget
.getInstrInfo()) {
95 // HACK: Until this is fully tablegen'd.
96 static bool AlreadyInit
= false;
102 const RegisterBank
&RBSGPR
= getRegBank(AMDGPU::SGPRRegBankID
);
104 assert(&RBSGPR
== &AMDGPU::SGPRRegBank
);
106 const RegisterBank
&RBVGPR
= getRegBank(AMDGPU::VGPRRegBankID
);
108 assert(&RBVGPR
== &AMDGPU::VGPRRegBank
);
112 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank
&Dst
,
113 const RegisterBank
&Src
,
114 unsigned Size
) const {
115 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
116 if (Dst
.getID() == AMDGPU::SGPRRegBankID
&&
117 Src
.getID() == AMDGPU::VGPRRegBankID
) {
118 return std::numeric_limits
<unsigned>::max();
121 // Bool values are tricky, because the meaning is based on context. The SCC
122 // and VCC banks are for the natural scalar and vector conditions produced by
125 // Legalization doesn't know about the necessary context, so an s1 use may
126 // have been a truncate from an arbitrary value, in which case a copy (lowered
127 // as a compare with 0) needs to be inserted.
129 (Dst
.getID() == AMDGPU::SCCRegBankID
||
130 Dst
.getID() == AMDGPU::SGPRRegBankID
) &&
131 (Src
.getID() == AMDGPU::SGPRRegBankID
||
132 Src
.getID() == AMDGPU::VGPRRegBankID
||
133 Src
.getID() == AMDGPU::VCCRegBankID
))
134 return std::numeric_limits
<unsigned>::max();
136 if (Dst
.getID() == AMDGPU::SCCRegBankID
&&
137 Src
.getID() == AMDGPU::VCCRegBankID
)
138 return std::numeric_limits
<unsigned>::max();
140 return RegisterBankInfo::copyCost(Dst
, Src
, Size
);
143 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
144 const ValueMapping
&ValMapping
,
145 const RegisterBank
*CurBank
) const {
146 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
148 // FIXME: Is there a better way to do this?
149 if (ValMapping
.NumBreakDowns
>= 2 || ValMapping
.BreakDown
[0].Length
>= 64)
150 return 10; // This is expensive.
152 assert(ValMapping
.NumBreakDowns
== 2 &&
153 ValMapping
.BreakDown
[0].Length
== 32 &&
154 ValMapping
.BreakDown
[0].StartIdx
== 0 &&
155 ValMapping
.BreakDown
[1].Length
== 32 &&
156 ValMapping
.BreakDown
[1].StartIdx
== 32 &&
157 ValMapping
.BreakDown
[0].RegBank
== ValMapping
.BreakDown
[1].RegBank
);
159 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
160 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
163 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
164 // alignment restrictions, but this probably isn't important.
168 const RegisterBank
&AMDGPURegisterBankInfo::getRegBankFromRegClass(
169 const TargetRegisterClass
&RC
) const {
170 if (&RC
== &AMDGPU::SReg_1RegClass
)
171 return AMDGPU::VCCRegBank
;
173 return TRI
->isSGPRClass(&RC
) ? AMDGPU::SGPRRegBank
: AMDGPU::VGPRRegBank
;
176 template <unsigned NumOps
>
177 RegisterBankInfo::InstructionMappings
178 AMDGPURegisterBankInfo::addMappingFromTable(
179 const MachineInstr
&MI
, const MachineRegisterInfo
&MRI
,
180 const std::array
<unsigned, NumOps
> RegSrcOpIdx
,
181 ArrayRef
<OpRegBankEntry
<NumOps
>> Table
) const {
183 InstructionMappings AltMappings
;
185 SmallVector
<const ValueMapping
*, 10> Operands(MI
.getNumOperands());
187 unsigned Sizes
[NumOps
];
188 for (unsigned I
= 0; I
< NumOps
; ++I
) {
189 Register Reg
= MI
.getOperand(RegSrcOpIdx
[I
]).getReg();
190 Sizes
[I
] = getSizeInBits(Reg
, MRI
, *TRI
);
193 for (unsigned I
= 0, E
= MI
.getNumExplicitDefs(); I
!= E
; ++I
) {
194 unsigned SizeI
= getSizeInBits(MI
.getOperand(I
).getReg(), MRI
, *TRI
);
195 Operands
[I
] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, SizeI
);
198 // getInstrMapping's default mapping uses ID 1, so start at 2.
199 unsigned MappingID
= 2;
200 for (const auto &Entry
: Table
) {
201 for (unsigned I
= 0; I
< NumOps
; ++I
) {
202 int OpIdx
= RegSrcOpIdx
[I
];
203 Operands
[OpIdx
] = AMDGPU::getValueMapping(Entry
.RegBanks
[I
], Sizes
[I
]);
206 AltMappings
.push_back(&getInstructionMapping(MappingID
++, Entry
.Cost
,
207 getOperandsMapping(Operands
),
214 RegisterBankInfo::InstructionMappings
215 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
216 const MachineInstr
&MI
, const MachineRegisterInfo
&MRI
) const {
217 switch (MI
.getOperand(MI
.getNumExplicitDefs()).getIntrinsicID()) {
218 case Intrinsic::amdgcn_readlane
: {
219 static const OpRegBankEntry
<3> Table
[2] = {
221 { { AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
}, 1 },
223 // Need a readfirstlane for the index.
224 { { AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 2 }
227 const std::array
<unsigned, 3> RegSrcOpIdx
= { { 0, 2, 3 } };
228 return addMappingFromTable
<3>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
230 case Intrinsic::amdgcn_writelane
: {
231 static const OpRegBankEntry
<4> Table
[4] = {
233 { { AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 1 },
235 // Need readfirstlane of first op
236 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 2 },
238 // Need readfirstlane of second op
239 { { AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 2 },
241 // Need readfirstlane of both ops
242 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 3 }
245 // rsrc, voffset, offset
246 const std::array
<unsigned, 4> RegSrcOpIdx
= { { 0, 2, 3, 4 } };
247 return addMappingFromTable
<4>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
250 return RegisterBankInfo::getInstrAlternativeMappings(MI
);
254 RegisterBankInfo::InstructionMappings
255 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
256 const MachineInstr
&MI
, const MachineRegisterInfo
&MRI
) const {
258 switch (MI
.getOperand(MI
.getNumExplicitDefs()).getIntrinsicID()) {
259 case Intrinsic::amdgcn_buffer_load
: {
260 static const OpRegBankEntry
<3> Table
[4] = {
262 { { AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
}, 1 },
263 { { AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 1 },
265 // Waterfall loop needed for rsrc. In the worst case this will execute
266 // approximately an extra 10 * wavesize + 2 instructions.
267 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
}, 1000 },
268 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 1000 }
271 // rsrc, voffset, offset
272 const std::array
<unsigned, 3> RegSrcOpIdx
= { { 2, 3, 4 } };
273 return addMappingFromTable
<3>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
275 case Intrinsic::amdgcn_s_buffer_load
: {
276 static const OpRegBankEntry
<2> Table
[4] = {
278 { { AMDGPU::SGPRRegBankID
, AMDGPU::SGPRRegBankID
}, 1 },
280 // Only need 1 register in loop
281 { { AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 300 },
283 // Have to waterfall the resource.
284 { { AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
}, 1000 },
286 // Have to waterfall the resource, and the offset.
287 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 1500 }
291 const std::array
<unsigned, 2> RegSrcOpIdx
= { { 2, 3 } };
292 return addMappingFromTable
<2>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
294 case Intrinsic::amdgcn_ds_ordered_add
:
295 case Intrinsic::amdgcn_ds_ordered_swap
: {
297 static const OpRegBankEntry
<3> Table
[2] = {
299 { { AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 1 },
301 // Need a readfirstlane for m0
302 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 2 }
305 const std::array
<unsigned, 3> RegSrcOpIdx
= { { 0, 2, 3 } };
306 return addMappingFromTable
<3>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
308 case Intrinsic::amdgcn_s_sendmsg
:
309 case Intrinsic::amdgcn_s_sendmsghalt
: {
310 // FIXME: Should have no register for immediate
311 static const OpRegBankEntry
<2> Table
[2] = {
313 { { AMDGPU::SGPRRegBankID
, AMDGPU::SGPRRegBankID
}, 1 },
316 { { AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 3 }
319 const std::array
<unsigned, 2> RegSrcOpIdx
= { { 1, 2 } };
320 return addMappingFromTable
<2>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
323 return RegisterBankInfo::getInstrAlternativeMappings(MI
);
327 static bool isInstrUniformNonExtLoadAlign4(const MachineInstr
&MI
) {
328 if (!MI
.hasOneMemOperand())
331 const MachineMemOperand
*MMO
= *MI
.memoperands_begin();
332 return MMO
->getSize() >= 4 && MMO
->getAlignment() >= 4 &&
333 AMDGPUInstrInfo::isUniformMMO(MMO
);
336 RegisterBankInfo::InstructionMappings
337 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
338 const MachineInstr
&MI
) const {
340 const MachineFunction
&MF
= *MI
.getParent()->getParent();
341 const MachineRegisterInfo
&MRI
= MF
.getRegInfo();
344 InstructionMappings AltMappings
;
345 switch (MI
.getOpcode()) {
346 case TargetOpcode::G_CONSTANT
: {
347 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
349 static const OpRegBankEntry
<1> Table
[4] = {
350 { { AMDGPU::VGPRRegBankID
}, 1 },
351 { { AMDGPU::SGPRRegBankID
}, 1 },
352 { { AMDGPU::VCCRegBankID
}, 1 },
353 { { AMDGPU::SCCRegBankID
}, 1 }
356 return addMappingFromTable
<1>(MI
, MRI
, { 0 }, Table
);
361 case TargetOpcode::G_FCONSTANT
:
362 case TargetOpcode::G_FRAME_INDEX
:
363 case TargetOpcode::G_GLOBAL_VALUE
: {
364 static const OpRegBankEntry
<1> Table
[2] = {
365 { { AMDGPU::VGPRRegBankID
}, 1 },
366 { { AMDGPU::SGPRRegBankID
}, 1 }
369 return addMappingFromTable
<1>(MI
, MRI
, {{ 0 }}, Table
);
371 case TargetOpcode::G_AND
:
372 case TargetOpcode::G_OR
:
373 case TargetOpcode::G_XOR
: {
374 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
377 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
378 const InstructionMapping
&SCCMapping
= getInstructionMapping(
379 1, 1, getOperandsMapping(
380 {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID
, Size
),
381 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
382 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
)}),
384 AltMappings
.push_back(&SCCMapping
);
386 const InstructionMapping
&SGPRMapping
= getInstructionMapping(
387 1, 1, getOperandsMapping(
388 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
389 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
390 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
)}),
392 AltMappings
.push_back(&SGPRMapping
);
394 const InstructionMapping
&VCCMapping0
= getInstructionMapping(
395 2, 10, getOperandsMapping(
396 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, Size
),
397 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, Size
),
398 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, Size
)}),
400 AltMappings
.push_back(&VCCMapping0
);
407 const InstructionMapping
&SSMapping
= getInstructionMapping(
408 1, 1, getOperandsMapping(
409 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
410 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
411 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
)}),
413 AltMappings
.push_back(&SSMapping
);
415 const InstructionMapping
&VVMapping
= getInstructionMapping(
416 2, 2, getOperandsMapping(
417 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
418 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
419 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
)}),
421 AltMappings
.push_back(&VVMapping
);
423 const InstructionMapping
&SVMapping
= getInstructionMapping(
424 3, 3, getOperandsMapping(
425 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
426 AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID
, Size
),
427 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
)}),
429 AltMappings
.push_back(&SVMapping
);
431 // SGPR in LHS is slightly preferrable, so make it VS more expensive than
433 const InstructionMapping
&VSMapping
= getInstructionMapping(
434 3, 4, getOperandsMapping(
435 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
436 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
437 AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID
, Size
)}),
439 AltMappings
.push_back(&VSMapping
);
442 case TargetOpcode::G_LOAD
:
443 case TargetOpcode::G_ZEXTLOAD
:
444 case TargetOpcode::G_SEXTLOAD
: {
445 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
446 LLT PtrTy
= MRI
.getType(MI
.getOperand(1).getReg());
447 unsigned PtrSize
= PtrTy
.getSizeInBits();
448 unsigned AS
= PtrTy
.getAddressSpace();
449 LLT LoadTy
= MRI
.getType(MI
.getOperand(0).getReg());
450 if (isInstrUniformNonExtLoadAlign4(MI
) &&
451 (AS
!= AMDGPUAS::LOCAL_ADDRESS
&& AS
!= AMDGPUAS::REGION_ADDRESS
)) {
452 const InstructionMapping
&SSMapping
= getInstructionMapping(
453 1, 1, getOperandsMapping(
454 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
455 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, PtrSize
)}),
457 AltMappings
.push_back(&SSMapping
);
460 const InstructionMapping
&VVMapping
= getInstructionMapping(
461 2, 1, getOperandsMapping(
462 {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID
, LoadTy
),
463 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, PtrSize
)}),
465 AltMappings
.push_back(&VVMapping
);
467 // It may be possible to have a vgpr = load sgpr mapping here, because
468 // the mubuf instructions support this kind of load, but probably for only
469 // gfx7 and older. However, the addressing mode matching in the instruction
470 // selector should be able to do a better job of detecting and selecting
471 // these kinds of loads from the vgpr = load vgpr mapping.
476 case TargetOpcode::G_ICMP
: {
477 unsigned Size
= getSizeInBits(MI
.getOperand(2).getReg(), MRI
, *TRI
);
478 const InstructionMapping
&SSMapping
= getInstructionMapping(1, 1,
479 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID
, 1),
480 nullptr, // Predicate operand.
481 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
482 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
)}),
484 AltMappings
.push_back(&SSMapping
);
486 const InstructionMapping
&SVMapping
= getInstructionMapping(2, 1,
487 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1),
488 nullptr, // Predicate operand.
489 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
490 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
)}),
492 AltMappings
.push_back(&SVMapping
);
494 const InstructionMapping
&VSMapping
= getInstructionMapping(3, 1,
495 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1),
496 nullptr, // Predicate operand.
497 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
),
498 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
)}),
500 AltMappings
.push_back(&VSMapping
);
502 const InstructionMapping
&VVMapping
= getInstructionMapping(4, 1,
503 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1),
504 nullptr, // Predicate operand.
505 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
),
506 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
)}),
508 AltMappings
.push_back(&VVMapping
);
512 case TargetOpcode::G_SELECT
: {
513 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
514 const InstructionMapping
&SSMapping
= getInstructionMapping(1, 1,
515 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
516 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID
, 1),
517 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
518 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
)}),
520 AltMappings
.push_back(&SSMapping
);
522 const InstructionMapping
&VVMapping
= getInstructionMapping(2, 1,
523 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
524 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1),
525 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
),
526 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID
, Size
)}),
528 AltMappings
.push_back(&VVMapping
);
532 case TargetOpcode::G_SMIN
:
533 case TargetOpcode::G_SMAX
:
534 case TargetOpcode::G_UMIN
:
535 case TargetOpcode::G_UMAX
: {
536 static const OpRegBankEntry
<3> Table
[4] = {
537 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 1 },
538 { { AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::VGPRRegBankID
}, 1 },
539 { { AMDGPU::VGPRRegBankID
, AMDGPU::VGPRRegBankID
, AMDGPU::SGPRRegBankID
}, 1 },
541 // Scalar requires cmp+select, and extends if 16-bit.
542 // FIXME: Should there be separate costs for 32 and 16-bit
543 { { AMDGPU::SGPRRegBankID
, AMDGPU::SGPRRegBankID
, AMDGPU::SGPRRegBankID
}, 3 }
546 const std::array
<unsigned, 3> RegSrcOpIdx
= { { 0, 1, 2 } };
547 return addMappingFromTable
<3>(MI
, MRI
, RegSrcOpIdx
, makeArrayRef(Table
));
549 case TargetOpcode::G_UADDE
:
550 case TargetOpcode::G_USUBE
:
551 case TargetOpcode::G_SADDE
:
552 case TargetOpcode::G_SSUBE
: {
553 unsigned Size
= getSizeInBits(MI
.getOperand(0).getReg(), MRI
, *TRI
);
554 const InstructionMapping
&SSMapping
= getInstructionMapping(1, 1,
556 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
557 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID
, 1),
558 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
559 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID
, Size
),
560 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID
, 1)}),
562 AltMappings
.push_back(&SSMapping
);
564 const InstructionMapping
&VVMapping
= getInstructionMapping(2, 1,
565 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
),
566 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1),
567 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
),
568 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID
, Size
),
569 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1)}),
571 AltMappings
.push_back(&VVMapping
);
574 case AMDGPU::G_BRCOND
: {
575 assert(MRI
.getType(MI
.getOperand(0).getReg()).getSizeInBits() == 1);
577 const InstructionMapping
&SMapping
= getInstructionMapping(
578 1, 1, getOperandsMapping(
579 {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID
, 1), nullptr}),
581 AltMappings
.push_back(&SMapping
);
583 const InstructionMapping
&VMapping
= getInstructionMapping(
584 1, 1, getOperandsMapping(
585 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID
, 1), nullptr }),
587 AltMappings
.push_back(&VMapping
);
590 case AMDGPU::G_INTRINSIC
:
591 return getInstrAlternativeMappingsIntrinsic(MI
, MRI
);
592 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS
:
593 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI
, MRI
);
597 return RegisterBankInfo::getInstrAlternativeMappings(MI
);
600 void AMDGPURegisterBankInfo::split64BitValueForMapping(
602 SmallVector
<Register
, 2> &Regs
,
604 Register Reg
) const {
605 assert(HalfTy
.getSizeInBits() == 32);
606 MachineRegisterInfo
*MRI
= B
.getMRI();
607 Register LoLHS
= MRI
->createGenericVirtualRegister(HalfTy
);
608 Register HiLHS
= MRI
->createGenericVirtualRegister(HalfTy
);
609 const RegisterBank
*Bank
= getRegBank(Reg
, *MRI
, *TRI
);
610 MRI
->setRegBank(LoLHS
, *Bank
);
611 MRI
->setRegBank(HiLHS
, *Bank
);
613 Regs
.push_back(LoLHS
);
614 Regs
.push_back(HiLHS
);
616 B
.buildInstr(AMDGPU::G_UNMERGE_VALUES
)
622 /// Replace the current type each register in \p Regs has with \p NewTy
623 static void setRegsToType(MachineRegisterInfo
&MRI
, ArrayRef
<Register
> Regs
,
625 for (Register Reg
: Regs
) {
626 assert(MRI
.getType(Reg
).getSizeInBits() == NewTy
.getSizeInBits());
627 MRI
.setType(Reg
, NewTy
);
631 static LLT
getHalfSizedType(LLT Ty
) {
633 assert(Ty
.getNumElements() % 2 == 0);
634 return LLT::scalarOrVector(Ty
.getNumElements() / 2, Ty
.getElementType());
637 assert(Ty
.getSizeInBits() % 2 == 0);
638 return LLT::scalar(Ty
.getSizeInBits() / 2);
641 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
642 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
643 /// execute the instruction for each unique combination of values in all lanes
644 /// in the wave. The block will be split such that rest of the instructions are
645 /// moved to a new block.
647 /// Essentially performs this loop:
649 /// Save Execution Mask
650 /// For (Lane : Wavefront) {
651 /// Enable Lane, Disable all other lanes
652 /// SGPR = read SGPR value for current lane from VGPR
653 /// VGPRResult[Lane] = use_op SGPR
655 /// Restore Execution Mask
657 /// There is additional complexity to try for compare values to identify the
658 /// unique values used.
659 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
662 MachineRegisterInfo
&MRI
,
663 ArrayRef
<unsigned> OpIndices
) const {
664 MachineFunction
*MF
= MI
.getParent()->getParent();
665 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
666 const SIInstrInfo
*TII
= ST
.getInstrInfo();
667 MachineBasicBlock::iterator
I(MI
);
669 MachineBasicBlock
&MBB
= *MI
.getParent();
670 const DebugLoc
&DL
= MI
.getDebugLoc();
672 // Use a set to avoid extra readfirstlanes in the case where multiple operands
673 // are the same register.
674 SmallSet
<Register
, 4> SGPROperandRegs
;
675 for (unsigned Op
: OpIndices
) {
676 assert(MI
.getOperand(Op
).isUse());
677 Register Reg
= MI
.getOperand(Op
).getReg();
678 const RegisterBank
*OpBank
= getRegBank(Reg
, MRI
, *TRI
);
679 if (OpBank
->getID() == AMDGPU::VGPRRegBankID
)
680 SGPROperandRegs
.insert(Reg
);
683 // No operands need to be replaced, so no need to loop.
684 if (SGPROperandRegs
.empty())
687 SmallVector
<Register
, 4> ResultRegs
;
688 SmallVector
<Register
, 4> InitResultRegs
;
689 SmallVector
<Register
, 4> PhiRegs
;
690 for (MachineOperand
&Def
: MI
.defs()) {
691 LLT ResTy
= MRI
.getType(Def
.getReg());
692 const RegisterBank
*DefBank
= getRegBank(Def
.getReg(), MRI
, *TRI
);
693 ResultRegs
.push_back(Def
.getReg());
694 Register InitReg
= B
.buildUndef(ResTy
).getReg(0);
695 Register PhiReg
= MRI
.createGenericVirtualRegister(ResTy
);
696 InitResultRegs
.push_back(InitReg
);
697 PhiRegs
.push_back(PhiReg
);
698 MRI
.setRegBank(PhiReg
, *DefBank
);
699 MRI
.setRegBank(InitReg
, *DefBank
);
702 Register SaveExecReg
= MRI
.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass
);
703 Register InitSaveExecReg
= MRI
.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass
);
705 // Don't bother using generic instructions/registers for the exec mask.
706 B
.buildInstr(TargetOpcode::IMPLICIT_DEF
)
707 .addDef(InitSaveExecReg
);
709 Register PhiExec
= MRI
.createVirtualRegister(&AMDGPU::SReg_64RegClass
);
710 Register NewExec
= MRI
.createVirtualRegister(&AMDGPU::SReg_64RegClass
);
712 // To insert the loop we need to split the block. Move everything before this
713 // point to a new block, and insert a new empty block before this instruction.
714 MachineBasicBlock
*LoopBB
= MF
->CreateMachineBasicBlock();
715 MachineBasicBlock
*RemainderBB
= MF
->CreateMachineBasicBlock();
716 MachineBasicBlock
*RestoreExecBB
= MF
->CreateMachineBasicBlock();
717 MachineFunction::iterator
MBBI(MBB
);
719 MF
->insert(MBBI
, LoopBB
);
720 MF
->insert(MBBI
, RestoreExecBB
);
721 MF
->insert(MBBI
, RemainderBB
);
723 LoopBB
->addSuccessor(RestoreExecBB
);
724 LoopBB
->addSuccessor(LoopBB
);
726 // Move the rest of the block into a new block.
727 RemainderBB
->transferSuccessorsAndUpdatePHIs(&MBB
);
728 RemainderBB
->splice(RemainderBB
->begin(), &MBB
, I
, MBB
.end());
730 MBB
.addSuccessor(LoopBB
);
731 RestoreExecBB
->addSuccessor(RemainderBB
);
733 B
.setInsertPt(*LoopBB
, LoopBB
->end());
735 B
.buildInstr(TargetOpcode::PHI
)
737 .addReg(InitSaveExecReg
)
742 for (auto Result
: zip(InitResultRegs
, ResultRegs
, PhiRegs
)) {
743 B
.buildInstr(TargetOpcode::G_PHI
)
744 .addDef(std::get
<2>(Result
))
745 .addReg(std::get
<0>(Result
)) // Initial value / implicit_def
747 .addReg(std::get
<1>(Result
)) // Mid-loop value.
751 // Move the instruction into the loop.
752 LoopBB
->splice(LoopBB
->end(), &MBB
, I
);
753 I
= std::prev(LoopBB
->end());
759 for (MachineOperand
&Op
: MI
.uses()) {
764 if (SGPROperandRegs
.count(Op
.getReg())) {
765 LLT OpTy
= MRI
.getType(Op
.getReg());
766 unsigned OpSize
= OpTy
.getSizeInBits();
768 // Can only do a readlane of 32-bit pieces.
770 // Avoid extra copies in the simple case of one 32-bit register.
771 Register CurrentLaneOpReg
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
772 MRI
.setType(CurrentLaneOpReg
, OpTy
);
774 constrainGenericRegister(Op
.getReg(), AMDGPU::VGPR_32RegClass
, MRI
);
775 // Read the next variant <- also loop target.
776 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
), CurrentLaneOpReg
)
777 .addReg(Op
.getReg());
779 Register NewCondReg
= MRI
.createVirtualRegister(&AMDGPU::SReg_64RegClass
);
780 bool First
= CondReg
== AMDGPU::NoRegister
;
782 CondReg
= NewCondReg
;
784 // Compare the just read M0 value to all possible Idx values.
785 B
.buildInstr(AMDGPU::V_CMP_EQ_U32_e64
)
787 .addReg(CurrentLaneOpReg
)
788 .addReg(Op
.getReg());
789 Op
.setReg(CurrentLaneOpReg
);
792 Register AndReg
= MRI
.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass
);
794 // If there are multiple operands to consider, and the conditions.
795 B
.buildInstr(AMDGPU::S_AND_B64
)
802 LLT S32
= LLT::scalar(32);
803 SmallVector
<Register
, 8> ReadlanePieces
;
805 // The compares can be done as 64-bit, but the extract needs to be done
808 bool Is64
= OpSize
% 64 == 0;
810 LLT UnmergeTy
= OpSize
% 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
811 unsigned CmpOp
= OpSize
% 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
812 : AMDGPU::V_CMP_EQ_U32_e64
;
814 // The compares can be done as 64-bit, but the extract needs to be done
817 // Insert the unmerge before the loop.
820 auto Unmerge
= B
.buildUnmerge(UnmergeTy
, Op
.getReg());
823 unsigned NumPieces
= Unmerge
->getNumOperands() - 1;
824 for (unsigned PieceIdx
= 0; PieceIdx
!= NumPieces
; ++PieceIdx
) {
825 Register UnmergePiece
= Unmerge
.getReg(PieceIdx
);
827 Register CurrentLaneOpReg
;
829 Register CurrentLaneOpRegLo
= MRI
.createGenericVirtualRegister(S32
);
830 Register CurrentLaneOpRegHi
= MRI
.createGenericVirtualRegister(S32
);
832 MRI
.setRegClass(UnmergePiece
, &AMDGPU::VReg_64RegClass
);
833 MRI
.setRegClass(CurrentLaneOpRegLo
, &AMDGPU::SReg_32_XM0RegClass
);
834 MRI
.setRegClass(CurrentLaneOpRegHi
, &AMDGPU::SReg_32_XM0RegClass
);
836 // Read the next variant <- also loop target.
837 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
),
839 .addReg(UnmergePiece
, 0, AMDGPU::sub0
);
841 // Read the next variant <- also loop target.
842 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
),
844 .addReg(UnmergePiece
, 0, AMDGPU::sub1
);
847 B
.buildMerge(LLT::scalar(64),
848 {CurrentLaneOpRegLo
, CurrentLaneOpRegHi
})
851 MRI
.setRegClass(CurrentLaneOpReg
, &AMDGPU::SReg_64_XEXECRegClass
);
853 if (OpTy
.getScalarSizeInBits() == 64) {
854 // If we need to produce a 64-bit element vector, so use the
856 ReadlanePieces
.push_back(CurrentLaneOpReg
);
858 // 32-bit element type.
859 ReadlanePieces
.push_back(CurrentLaneOpRegLo
);
860 ReadlanePieces
.push_back(CurrentLaneOpRegHi
);
863 CurrentLaneOpReg
= MRI
.createGenericVirtualRegister(LLT::scalar(32));
864 MRI
.setRegClass(UnmergePiece
, &AMDGPU::VGPR_32RegClass
);
865 MRI
.setRegClass(CurrentLaneOpReg
, &AMDGPU::SReg_32_XM0RegClass
);
867 // Read the next variant <- also loop target.
868 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
),
870 .addReg(UnmergePiece
);
871 ReadlanePieces
.push_back(CurrentLaneOpReg
);
875 = MRI
.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass
);
876 bool First
= CondReg
== AMDGPU::NoRegister
;
878 CondReg
= NewCondReg
;
882 .addReg(CurrentLaneOpReg
)
883 .addReg(UnmergePiece
);
887 = MRI
.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass
);
889 // If there are multiple operands to consider, and the conditions.
890 B
.buildInstr(AMDGPU::S_AND_B64
)
898 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
900 if (OpTy
.isVector()) {
901 auto Merge
= B
.buildBuildVector(OpTy
, ReadlanePieces
);
902 Op
.setReg(Merge
.getReg(0));
904 auto Merge
= B
.buildMerge(OpTy
, ReadlanePieces
);
905 Op
.setReg(Merge
.getReg(0));
908 MRI
.setRegBank(Op
.getReg(), getRegBank(AMDGPU::SGPRRegBankID
));
913 B
.setInsertPt(*LoopBB
, LoopBB
->end());
915 // Update EXEC, save the original EXEC value to VCC.
916 B
.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64
)
918 .addReg(CondReg
, RegState::Kill
);
920 MRI
.setSimpleHint(NewExec
, CondReg
);
922 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
923 B
.buildInstr(AMDGPU::S_XOR_B64_term
)
924 .addDef(AMDGPU::EXEC
)
925 .addReg(AMDGPU::EXEC
)
928 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
931 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
932 B
.buildInstr(AMDGPU::S_CBRANCH_EXECNZ
)
935 // Save the EXEC mask before the loop.
936 BuildMI(MBB
, MBB
.end(), DL
, TII
->get(AMDGPU::S_MOV_B64_term
), SaveExecReg
)
937 .addReg(AMDGPU::EXEC
);
939 // Restore the EXEC mask after the loop.
940 B
.setMBB(*RestoreExecBB
);
941 B
.buildInstr(AMDGPU::S_MOV_B64_term
)
942 .addDef(AMDGPU::EXEC
)
943 .addReg(SaveExecReg
);
945 // Restore the insert point before the original instruction.
946 B
.setInsertPt(MBB
, MBB
.end());
951 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
952 MachineInstr
&MI
, MachineRegisterInfo
&MRI
,
953 ArrayRef
<unsigned> OpIndices
) const {
954 MachineIRBuilder
B(MI
);
955 return executeInWaterfallLoop(B
, MI
, MRI
, OpIndices
);
958 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
959 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
960 MachineInstr
&MI
, MachineRegisterInfo
&MRI
, unsigned OpIdx
) const {
961 Register Reg
= MI
.getOperand(OpIdx
).getReg();
962 const RegisterBank
*Bank
= getRegBank(Reg
, MRI
, *TRI
);
963 if (Bank
!= &AMDGPU::VGPRRegBank
)
966 MachineIRBuilder
B(MI
);
967 Register SGPR
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
968 B
.buildInstr(AMDGPU::V_READFIRSTLANE_B32
)
972 const TargetRegisterClass
*Constrained
=
973 constrainGenericRegister(Reg
, AMDGPU::VGPR_32RegClass
, MRI
);
975 assert(Constrained
&& "Failed to constrain readfirstlane src reg");
977 MI
.getOperand(OpIdx
).setReg(SGPR
);
980 // When regbankselect repairs registers, it will insert a repair instruction
981 // which defines the repaired register. Then it calls applyMapping and expects
982 // that the targets will either delete or rewrite the originally wrote to the
983 // repaired registers. Beccause of this, we end up in a situation where
984 // we have 2 instructions defining the same registers.
985 static MachineInstr
*getOtherVRegDef(const MachineRegisterInfo
&MRI
,
987 const MachineInstr
&MI
) {
988 // Is there some way we can assert that there are exactly 2 def instructions?
989 for (MachineInstr
&Other
: MRI
.def_instructions(Reg
)) {
997 bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr
&MI
,
998 const AMDGPURegisterBankInfo::OperandsMapper
&OpdMapper
,
999 MachineRegisterInfo
&MRI
) const {
1000 Register DstReg
= MI
.getOperand(0).getReg();
1001 const LLT LoadTy
= MRI
.getType(DstReg
);
1002 unsigned LoadSize
= LoadTy
.getSizeInBits();
1003 const unsigned MaxNonSmrdLoadSize
= 128;
1004 // 128-bit loads are supported for all instruction types.
1005 if (LoadSize
<= MaxNonSmrdLoadSize
)
1008 SmallVector
<unsigned, 16> DefRegs(OpdMapper
.getVRegs(0));
1009 SmallVector
<unsigned, 1> SrcRegs(OpdMapper
.getVRegs(1));
1011 // If the pointer is an SGPR, we have nothing to do.
1012 if (SrcRegs
.empty())
1015 assert(LoadSize
% MaxNonSmrdLoadSize
== 0);
1017 // We want to get the repair instruction now, because it will help us
1018 // determine which instruction the legalizer inserts that will also
1020 MachineInstr
*RepairInst
= getOtherVRegDef(MRI
, DstReg
, MI
);
1022 // RegBankSelect only emits scalar types, so we need to reset the pointer
1023 // operand to a pointer type.
1024 Register BasePtrReg
= SrcRegs
[0];
1025 LLT PtrTy
= MRI
.getType(MI
.getOperand(1).getReg());
1026 MRI
.setType(BasePtrReg
, PtrTy
);
1028 MachineIRBuilder
B(MI
);
1030 unsigned SplitElts
=
1031 MaxNonSmrdLoadSize
/ LoadTy
.getScalarType().getSizeInBits();
1032 const LLT LoadSplitTy
= LLT::vector(SplitElts
, LoadTy
.getScalarType());
1033 ApplyRegBankMapping
O(MRI
, &AMDGPU::VGPRRegBank
);
1034 GISelObserverWrapper
Observer(&O
);
1035 B
.setChangeObserver(Observer
);
1036 LegalizerHelper
Helper(B
.getMF(), Observer
, B
);
1037 if (Helper
.fewerElementsVector(MI
, 0, LoadSplitTy
) != LegalizerHelper::Legalized
)
1040 // At this point, the legalizer has split the original load into smaller
1041 // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
1042 // that combines the outputs of the lower loads and writes it to DstReg.
1043 // The register bank selector has also added the RepairInst which writes to
1046 MachineInstr
*LegalizedInst
= getOtherVRegDef(MRI
, DstReg
, *RepairInst
);
1048 // Replace the output of the LegalizedInst with a temporary register, since
1049 // RepairInst already defines DstReg.
1050 Register TmpReg
= MRI
.createGenericVirtualRegister(MRI
.getType(DstReg
));
1051 LegalizedInst
->getOperand(0).setReg(TmpReg
);
1052 B
.setInsertPt(*RepairInst
->getParent(), RepairInst
);
1054 for (unsigned DefIdx
= 0, e
= DefRegs
.size(); DefIdx
!= e
; ++DefIdx
) {
1055 Register IdxReg
= MRI
.createGenericVirtualRegister(LLT::scalar(32));
1056 B
.buildConstant(IdxReg
, DefIdx
);
1057 MRI
.setRegBank(IdxReg
, getRegBank(AMDGPU::VGPRRegBankID
));
1058 B
.buildExtractVectorElement(DefRegs
[DefIdx
], TmpReg
, IdxReg
);
1061 MRI
.setRegBank(DstReg
, getRegBank(AMDGPU::VGPRRegBankID
));
1065 bool AMDGPURegisterBankInfo::applyMappingImage(
1066 MachineInstr
&MI
, const AMDGPURegisterBankInfo::OperandsMapper
&OpdMapper
,
1067 MachineRegisterInfo
&MRI
, int RsrcIdx
) const {
1068 const int NumDefs
= MI
.getNumExplicitDefs();
1070 // The reported argument index is relative to the IR intrinsic call arguments,
1071 // so we need to shift by the number of defs and the intrinsic ID.
1072 RsrcIdx
+= NumDefs
+ 1;
1074 // Insert copies to VGPR arguments.
1075 applyDefaultMapping(OpdMapper
);
1077 // Fixup any SGPR arguments.
1078 SmallVector
<unsigned, 4> SGPRIndexes
;
1079 for (int I
= NumDefs
, NumOps
= MI
.getNumOperands(); I
!= NumOps
; ++I
) {
1080 if (!MI
.getOperand(I
).isReg())
1083 // If this intrinsic has a sampler, it immediately follows rsrc.
1084 if (I
== RsrcIdx
|| I
== RsrcIdx
+ 1)
1085 SGPRIndexes
.push_back(I
);
1088 executeInWaterfallLoop(MI
, MRI
, SGPRIndexes
);
1092 // For cases where only a single copy is inserted for matching register banks.
1093 // Replace the register in the instruction operand
1094 static void substituteSimpleCopyRegs(
1095 const AMDGPURegisterBankInfo::OperandsMapper
&OpdMapper
, unsigned OpIdx
) {
1096 SmallVector
<unsigned, 1> SrcReg(OpdMapper
.getVRegs(OpIdx
));
1097 if (!SrcReg
.empty()) {
1098 assert(SrcReg
.size() == 1);
1099 OpdMapper
.getMI().getOperand(OpIdx
).setReg(SrcReg
[0]);
1103 /// Handle register layout difference for f16 images for some subtargets.
1104 Register
AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder
&B
,
1105 MachineRegisterInfo
&MRI
,
1106 Register Reg
) const {
1107 if (!Subtarget
.hasUnpackedD16VMem())
1110 const LLT S16
= LLT::scalar(16);
1111 LLT StoreVT
= MRI
.getType(Reg
);
1112 if (!StoreVT
.isVector() || StoreVT
.getElementType() != S16
)
1115 auto Unmerge
= B
.buildUnmerge(S16
, Reg
);
1118 SmallVector
<Register
, 4> WideRegs
;
1119 for (int I
= 0, E
= Unmerge
->getNumOperands() - 1; I
!= E
; ++I
)
1120 WideRegs
.push_back(Unmerge
.getReg(I
));
1122 const LLT S32
= LLT::scalar(32);
1123 int NumElts
= StoreVT
.getNumElements();
1125 return B
.buildMerge(LLT::vector(NumElts
, S32
), WideRegs
).getReg(0);
1128 static std::pair
<Register
, unsigned>
1129 getBaseWithConstantOffset(MachineRegisterInfo
&MRI
, Register Reg
) {
1131 if (mi_match(Reg
, MRI
, m_ICst(Const
)))
1132 return std::make_pair(Register(), Const
);
1135 if (mi_match(Reg
, MRI
, m_GAdd(m_Reg(Base
), m_ICst(Const
))))
1136 return std::make_pair(Base
, Const
);
1138 // TODO: Handle G_OR used for add case
1139 return std::make_pair(Reg
, 0);
1142 std::pair
<Register
, unsigned>
1143 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder
&B
,
1144 Register OrigOffset
) const {
1145 const unsigned MaxImm
= 4095;
1148 const LLT S32
= LLT::scalar(32);
1150 std::tie(BaseReg
, ImmOffset
) = getBaseWithConstantOffset(*B
.getMRI(),
1154 if (ImmOffset
!= 0) {
1155 // If the immediate value is too big for the immoffset field, put the value
1156 // and -4096 into the immoffset field so that the value that is copied/added
1157 // for the voffset field is a multiple of 4096, and it stands more chance
1158 // of being CSEd with the copy/add for another similar load/store.
1159 // However, do not do that rounding down to a multiple of 4096 if that is a
1160 // negative number, as it appears to be illegal to have a negative offset
1161 // in the vgpr, even if adding the immediate offset makes it positive.
1162 unsigned Overflow
= ImmOffset
& ~MaxImm
;
1163 ImmOffset
-= Overflow
;
1164 if ((int32_t)Overflow
< 0) {
1165 Overflow
+= ImmOffset
;
1170 if (Overflow
!= 0) {
1172 BaseReg
= B
.buildConstant(S32
, Overflow
).getReg(0);
1174 auto OverflowVal
= B
.buildConstant(S32
, Overflow
);
1175 BaseReg
= B
.buildAdd(S32
, BaseReg
, OverflowVal
).getReg(0);
1181 BaseReg
= B
.buildConstant(S32
, 0).getReg(0);
1183 return {BaseReg
, C1
};
1186 static bool isZero(Register Reg
, MachineRegisterInfo
&MRI
) {
1188 return mi_match(Reg
, MRI
, m_ICst(C
)) && C
== 0;
1191 static unsigned extractGLC(unsigned CachePolicy
) {
1192 return CachePolicy
& 1;
1195 static unsigned extractSLC(unsigned CachePolicy
) {
1196 return (CachePolicy
>> 1) & 1;
1199 static unsigned extractDLC(unsigned CachePolicy
) {
1200 return (CachePolicy
>> 2) & 1;
1204 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder
&B
,
1205 MachineInstr
&MI
) const {
1206 MachineRegisterInfo
&MRI
= *B
.getMRI();
1207 executeInWaterfallLoop(B
, MI
, MRI
, {2, 4});
1209 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1211 Register VData
= MI
.getOperand(1).getReg();
1212 LLT Ty
= MRI
.getType(VData
);
1214 int EltSize
= Ty
.getScalarSizeInBits();
1215 int Size
= Ty
.getSizeInBits();
1217 // FIXME: Broken integer truncstore.
1219 report_fatal_error("unhandled intrinsic store");
1221 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1222 const int MemSize
= (*MI
.memoperands_begin())->getSize();
1225 Register RSrc
= MI
.getOperand(2).getReg();
1226 Register VOffset
= MI
.getOperand(3).getReg();
1227 Register SOffset
= MI
.getOperand(4).getReg();
1228 unsigned CachePolicy
= MI
.getOperand(5).getImm();
1231 std::tie(VOffset
, ImmOffset
) = splitBufferOffsets(B
, VOffset
);
1233 const bool Offen
= !isZero(VOffset
, MRI
);
1235 unsigned Opc
= AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact
;
1236 switch (8 * MemSize
) {
1238 Opc
= Offen
? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact
:
1239 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact
;
1242 Opc
= Offen
? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact
:
1243 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact
;
1246 Opc
= Offen
? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact
:
1247 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact
;
1249 Opc
= AMDGPU::getMUBUFOpcode(Opc
, Size
/ 32);
1254 // Set the insertion point back to the instruction in case it was moved into a
1258 MachineInstrBuilder MIB
= B
.buildInstr(Opc
)
1262 MIB
.addUse(VOffset
);
1267 .addImm(extractGLC(CachePolicy
))
1268 .addImm(extractSLC(CachePolicy
))
1269 .addImm(0) // tfe: FIXME: Remove from inst
1270 .addImm(extractDLC(CachePolicy
))
1273 // FIXME: We need a way to report failure from applyMappingImpl.
1274 // Insert constrain copies before inserting the loop.
1275 if (!constrainSelectedInstRegOperands(*MIB
, *TII
, *TRI
, *this))
1276 report_fatal_error("failed to constrain selected store intrinsic");
1281 void AMDGPURegisterBankInfo::applyMappingImpl(
1282 const OperandsMapper
&OpdMapper
) const {
1283 MachineInstr
&MI
= OpdMapper
.getMI();
1284 unsigned Opc
= MI
.getOpcode();
1285 MachineRegisterInfo
&MRI
= OpdMapper
.getMRI();
1287 case AMDGPU::G_SELECT
: {
1288 Register DstReg
= MI
.getOperand(0).getReg();
1289 LLT DstTy
= MRI
.getType(DstReg
);
1290 if (DstTy
.getSizeInBits() != 64)
1293 LLT HalfTy
= getHalfSizedType(DstTy
);
1295 SmallVector
<Register
, 2> DefRegs(OpdMapper
.getVRegs(0));
1296 SmallVector
<Register
, 1> Src0Regs(OpdMapper
.getVRegs(1));
1297 SmallVector
<Register
, 2> Src1Regs(OpdMapper
.getVRegs(2));
1298 SmallVector
<Register
, 2> Src2Regs(OpdMapper
.getVRegs(3));
1300 // All inputs are SGPRs, nothing special to do.
1301 if (DefRegs
.empty()) {
1302 assert(Src1Regs
.empty() && Src2Regs
.empty());
1306 MachineIRBuilder
B(MI
);
1307 if (Src0Regs
.empty())
1308 Src0Regs
.push_back(MI
.getOperand(1).getReg());
1310 assert(Src0Regs
.size() == 1);
1313 if (Src1Regs
.empty())
1314 split64BitValueForMapping(B
, Src1Regs
, HalfTy
, MI
.getOperand(2).getReg());
1316 setRegsToType(MRI
, Src1Regs
, HalfTy
);
1319 if (Src2Regs
.empty())
1320 split64BitValueForMapping(B
, Src2Regs
, HalfTy
, MI
.getOperand(3).getReg());
1322 setRegsToType(MRI
, Src2Regs
, HalfTy
);
1324 setRegsToType(MRI
, DefRegs
, HalfTy
);
1326 B
.buildSelect(DefRegs
[0], Src0Regs
[0], Src1Regs
[0], Src2Regs
[0]);
1327 B
.buildSelect(DefRegs
[1], Src0Regs
[0], Src1Regs
[1], Src2Regs
[1]);
1329 MRI
.setRegBank(DstReg
, getRegBank(AMDGPU::VGPRRegBankID
));
1330 MI
.eraseFromParent();
1335 case AMDGPU::G_XOR
: {
1336 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
1337 // there is a VGPR input.
1338 Register DstReg
= MI
.getOperand(0).getReg();
1339 LLT DstTy
= MRI
.getType(DstReg
);
1340 if (DstTy
.getSizeInBits() != 64)
1343 LLT HalfTy
= getHalfSizedType(DstTy
);
1344 SmallVector
<Register
, 2> DefRegs(OpdMapper
.getVRegs(0));
1345 SmallVector
<Register
, 2> Src0Regs(OpdMapper
.getVRegs(1));
1346 SmallVector
<Register
, 2> Src1Regs(OpdMapper
.getVRegs(2));
1348 // All inputs are SGPRs, nothing special to do.
1349 if (DefRegs
.empty()) {
1350 assert(Src0Regs
.empty() && Src1Regs
.empty());
1354 assert(DefRegs
.size() == 2);
1355 assert(Src0Regs
.size() == Src1Regs
.size() &&
1356 (Src0Regs
.empty() || Src0Regs
.size() == 2));
1358 // Depending on where the source registers came from, the generic code may
1359 // have decided to split the inputs already or not. If not, we still need to
1360 // extract the values.
1361 MachineIRBuilder
B(MI
);
1363 if (Src0Regs
.empty())
1364 split64BitValueForMapping(B
, Src0Regs
, HalfTy
, MI
.getOperand(1).getReg());
1366 setRegsToType(MRI
, Src0Regs
, HalfTy
);
1368 if (Src1Regs
.empty())
1369 split64BitValueForMapping(B
, Src1Regs
, HalfTy
, MI
.getOperand(2).getReg());
1371 setRegsToType(MRI
, Src1Regs
, HalfTy
);
1373 setRegsToType(MRI
, DefRegs
, HalfTy
);
1377 .addUse(Src0Regs
[0])
1378 .addUse(Src1Regs
[0]);
1382 .addUse(Src0Regs
[1])
1383 .addUse(Src1Regs
[1]);
1385 MRI
.setRegBank(DstReg
, getRegBank(AMDGPU::VGPRRegBankID
));
1386 MI
.eraseFromParent();
1391 case AMDGPU::G_MUL
: {
1392 Register DstReg
= MI
.getOperand(0).getReg();
1393 LLT DstTy
= MRI
.getType(DstReg
);
1394 if (DstTy
!= LLT::scalar(16))
1397 const RegisterBank
*DstBank
= getRegBank(DstReg
, MRI
, *TRI
);
1398 if (DstBank
== &AMDGPU::VGPRRegBank
)
1401 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
1402 MachineFunction
*MF
= MI
.getParent()->getParent();
1403 MachineIRBuilder
B(MI
);
1404 ApplyRegBankMapping
ApplySALU(MRI
, &AMDGPU::SGPRRegBank
);
1405 GISelObserverWrapper
Observer(&ApplySALU
);
1406 LegalizerHelper
Helper(*MF
, Observer
, B
);
1408 if (Helper
.widenScalar(MI
, 0, LLT::scalar(32)) !=
1409 LegalizerHelper::Legalized
)
1410 llvm_unreachable("widen scalar should have succeeded");
1413 case AMDGPU::G_SMIN
:
1414 case AMDGPU::G_SMAX
:
1415 case AMDGPU::G_UMIN
:
1416 case AMDGPU::G_UMAX
: {
1417 Register DstReg
= MI
.getOperand(0).getReg();
1418 const RegisterBank
*DstBank
= getRegBank(DstReg
, MRI
, *TRI
);
1419 if (DstBank
== &AMDGPU::VGPRRegBank
)
1422 MachineFunction
*MF
= MI
.getParent()->getParent();
1423 MachineIRBuilder
B(MI
);
1424 ApplyRegBankMapping
ApplySALU(MRI
, &AMDGPU::SGPRRegBank
);
1425 GISelObserverWrapper
Observer(&ApplySALU
);
1426 LegalizerHelper
Helper(*MF
, Observer
, B
);
1428 // Turn scalar min/max into a compare and select.
1429 LLT Ty
= MRI
.getType(DstReg
);
1430 LLT S32
= LLT::scalar(32);
1431 LLT S16
= LLT::scalar(16);
1434 // Need to widen to s32, and expand as cmp + select.
1435 if (Helper
.widenScalar(MI
, 0, S32
) != LegalizerHelper::Legalized
)
1436 llvm_unreachable("widenScalar should have succeeded");
1438 // FIXME: This is relying on widenScalar leaving MI in place.
1439 if (Helper
.lower(MI
, 0, S32
) != LegalizerHelper::Legalized
)
1440 llvm_unreachable("lower should have succeeded");
1442 if (Helper
.lower(MI
, 0, Ty
) != LegalizerHelper::Legalized
)
1443 llvm_unreachable("lower should have succeeded");
1448 case AMDGPU::G_SEXT
:
1449 case AMDGPU::G_ZEXT
: {
1450 Register SrcReg
= MI
.getOperand(1).getReg();
1451 LLT SrcTy
= MRI
.getType(SrcReg
);
1452 bool Signed
= Opc
== AMDGPU::G_SEXT
;
1454 MachineIRBuilder
B(MI
);
1455 const RegisterBank
*SrcBank
= getRegBank(SrcReg
, MRI
, *TRI
);
1457 Register DstReg
= MI
.getOperand(0).getReg();
1458 LLT DstTy
= MRI
.getType(DstReg
);
1459 if (DstTy
.isScalar() &&
1460 SrcBank
!= &AMDGPU::SGPRRegBank
&&
1461 SrcBank
!= &AMDGPU::SCCRegBank
&&
1462 SrcBank
!= &AMDGPU::VCCRegBank
&&
1463 // FIXME: Should handle any type that round to s64 when irregular
1464 // breakdowns supported.
1465 DstTy
.getSizeInBits() == 64 &&
1466 SrcTy
.getSizeInBits() <= 32) {
1467 const LLT S32
= LLT::scalar(32);
1468 SmallVector
<Register
, 2> DefRegs(OpdMapper
.getVRegs(0));
1470 // Extend to 32-bit, and then extend the low half.
1472 // TODO: Should really be buildSExtOrCopy
1473 B
.buildSExtOrTrunc(DefRegs
[0], SrcReg
);
1475 // Replicate sign bit from 32-bit extended part.
1476 auto ShiftAmt
= B
.buildConstant(S32
, 31);
1477 MRI
.setRegBank(ShiftAmt
.getReg(0), *SrcBank
);
1478 B
.buildAShr(DefRegs
[1], DefRegs
[0], ShiftAmt
);
1480 B
.buildZExtOrTrunc(DefRegs
[0], SrcReg
);
1481 B
.buildConstant(DefRegs
[1], 0);
1484 MRI
.setRegBank(DstReg
, *SrcBank
);
1485 MI
.eraseFromParent();
1489 if (SrcTy
!= LLT::scalar(1))
1492 if (SrcBank
== &AMDGPU::SCCRegBank
|| SrcBank
== &AMDGPU::VCCRegBank
) {
1493 SmallVector
<Register
, 2> DefRegs(OpdMapper
.getVRegs(0));
1495 const RegisterBank
*DstBank
= SrcBank
== &AMDGPU::SCCRegBank
?
1496 &AMDGPU::SGPRRegBank
: &AMDGPU::VGPRRegBank
;
1498 unsigned DstSize
= DstTy
.getSizeInBits();
1499 // 64-bit select is SGPR only
1500 const bool UseSel64
= DstSize
> 32 &&
1501 SrcBank
->getID() == AMDGPU::SCCRegBankID
;
1503 // TODO: Should s16 select be legal?
1504 LLT SelType
= UseSel64
? LLT::scalar(64) : LLT::scalar(32);
1505 auto True
= B
.buildConstant(SelType
, Signed
? -1 : 1);
1506 auto False
= B
.buildConstant(SelType
, 0);
1508 MRI
.setRegBank(True
.getReg(0), *DstBank
);
1509 MRI
.setRegBank(False
.getReg(0), *DstBank
);
1510 MRI
.setRegBank(DstReg
, *DstBank
);
1512 if (DstSize
> 32 && SrcBank
->getID() != AMDGPU::SCCRegBankID
) {
1513 B
.buildSelect(DefRegs
[0], SrcReg
, True
, False
);
1514 B
.buildCopy(DefRegs
[1], DefRegs
[0]);
1515 } else if (DstSize
< 32) {
1516 auto Sel
= B
.buildSelect(SelType
, SrcReg
, True
, False
);
1517 MRI
.setRegBank(Sel
.getReg(0), *DstBank
);
1518 B
.buildTrunc(DstReg
, Sel
);
1520 B
.buildSelect(DstReg
, SrcReg
, True
, False
);
1523 MI
.eraseFromParent();
1527 // Fixup the case with an s1 src that isn't a condition register. Use shifts
1528 // instead of introducing a compare to avoid an unnecessary condition
1529 // register (and since there's no scalar 16-bit compares).
1530 auto Ext
= B
.buildAnyExt(DstTy
, SrcReg
);
1531 auto ShiftAmt
= B
.buildConstant(LLT::scalar(32), DstTy
.getSizeInBits() - 1);
1532 auto Shl
= B
.buildShl(DstTy
, Ext
, ShiftAmt
);
1534 if (MI
.getOpcode() == AMDGPU::G_SEXT
)
1535 B
.buildAShr(DstReg
, Shl
, ShiftAmt
);
1537 B
.buildLShr(DstReg
, Shl
, ShiftAmt
);
1539 MRI
.setRegBank(DstReg
, *SrcBank
);
1540 MRI
.setRegBank(Ext
.getReg(0), *SrcBank
);
1541 MRI
.setRegBank(ShiftAmt
.getReg(0), *SrcBank
);
1542 MRI
.setRegBank(Shl
.getReg(0), *SrcBank
);
1543 MI
.eraseFromParent();
1546 case AMDGPU::G_BUILD_VECTOR
:
1547 case AMDGPU::G_BUILD_VECTOR_TRUNC
: {
1548 Register DstReg
= MI
.getOperand(0).getReg();
1549 LLT DstTy
= MRI
.getType(DstReg
);
1550 if (DstTy
!= LLT::vector(2, 16))
1553 assert(MI
.getNumOperands() == 3 && empty(OpdMapper
.getVRegs(0)));
1554 substituteSimpleCopyRegs(OpdMapper
, 1);
1555 substituteSimpleCopyRegs(OpdMapper
, 2);
1557 const RegisterBank
*DstBank
= getRegBank(DstReg
, MRI
, *TRI
);
1558 if (DstBank
== &AMDGPU::SGPRRegBank
)
1559 break; // Can use S_PACK_* instructions.
1561 MachineIRBuilder
B(MI
);
1563 Register Lo
= MI
.getOperand(1).getReg();
1564 Register Hi
= MI
.getOperand(2).getReg();
1565 const LLT S32
= LLT::scalar(32);
1567 const RegisterBank
*BankLo
= getRegBank(Lo
, MRI
, *TRI
);
1568 const RegisterBank
*BankHi
= getRegBank(Hi
, MRI
, *TRI
);
1573 if (Opc
== AMDGPU::G_BUILD_VECTOR
) {
1574 ZextLo
= B
.buildZExt(S32
, Lo
).getReg(0);
1575 MRI
.setRegBank(ZextLo
, *BankLo
);
1577 Register ZextHi
= B
.buildZExt(S32
, Hi
).getReg(0);
1578 MRI
.setRegBank(ZextHi
, *BankHi
);
1580 auto ShiftAmt
= B
.buildConstant(S32
, 16);
1581 MRI
.setRegBank(ShiftAmt
.getReg(0), *BankHi
);
1583 ShiftHi
= B
.buildShl(S32
, ZextHi
, ShiftAmt
).getReg(0);
1584 MRI
.setRegBank(ShiftHi
, *BankHi
);
1586 Register MaskLo
= B
.buildConstant(S32
, 0xffff).getReg(0);
1587 MRI
.setRegBank(MaskLo
, *BankLo
);
1589 auto ShiftAmt
= B
.buildConstant(S32
, 16);
1590 MRI
.setRegBank(ShiftAmt
.getReg(0), *BankHi
);
1592 ShiftHi
= B
.buildShl(S32
, Hi
, ShiftAmt
).getReg(0);
1593 MRI
.setRegBank(ShiftHi
, *BankHi
);
1595 ZextLo
= B
.buildAnd(S32
, Lo
, MaskLo
).getReg(0);
1596 MRI
.setRegBank(ZextLo
, *BankLo
);
1599 auto Or
= B
.buildOr(S32
, ZextLo
, ShiftHi
);
1600 MRI
.setRegBank(Or
.getReg(0), *DstBank
);
1602 B
.buildBitcast(DstReg
, Or
);
1603 MI
.eraseFromParent();
1606 case AMDGPU::G_EXTRACT_VECTOR_ELT
:
1607 applyDefaultMapping(OpdMapper
);
1608 executeInWaterfallLoop(MI
, MRI
, { 2 });
1610 case AMDGPU::G_INTRINSIC
: {
1611 switch (MI
.getOperand(MI
.getNumExplicitDefs()).getIntrinsicID()) {
1612 case Intrinsic::amdgcn_s_buffer_load
: {
1613 // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
1614 executeInWaterfallLoop(MI
, MRI
, { 2, 3 });
1617 case Intrinsic::amdgcn_readlane
: {
1618 substituteSimpleCopyRegs(OpdMapper
, 2);
1620 assert(empty(OpdMapper
.getVRegs(0)));
1621 assert(empty(OpdMapper
.getVRegs(3)));
1623 // Make sure the index is an SGPR. It doesn't make sense to run this in a
1624 // waterfall loop, so assume it's a uniform value.
1625 constrainOpWithReadfirstlane(MI
, MRI
, 3); // Index
1628 case Intrinsic::amdgcn_writelane
: {
1629 assert(empty(OpdMapper
.getVRegs(0)));
1630 assert(empty(OpdMapper
.getVRegs(2)));
1631 assert(empty(OpdMapper
.getVRegs(3)));
1633 substituteSimpleCopyRegs(OpdMapper
, 4); // VGPR input val
1634 constrainOpWithReadfirstlane(MI
, MRI
, 2); // Source value
1635 constrainOpWithReadfirstlane(MI
, MRI
, 3); // Index
1643 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS
: {
1644 auto IntrID
= MI
.getIntrinsicID();
1646 case Intrinsic::amdgcn_buffer_load
: {
1647 executeInWaterfallLoop(MI
, MRI
, { 2 });
1650 case Intrinsic::amdgcn_ds_ordered_add
:
1651 case Intrinsic::amdgcn_ds_ordered_swap
: {
1652 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
1653 assert(empty(OpdMapper
.getVRegs(0)));
1654 substituteSimpleCopyRegs(OpdMapper
, 3);
1655 constrainOpWithReadfirstlane(MI
, MRI
, 2); // M0
1658 case Intrinsic::amdgcn_s_sendmsg
:
1659 case Intrinsic::amdgcn_s_sendmsghalt
: {
1660 // FIXME: Should this use a waterfall loop?
1661 constrainOpWithReadfirstlane(MI
, MRI
, 2); // M0
1664 case Intrinsic::amdgcn_raw_buffer_load
:
1665 case Intrinsic::amdgcn_raw_buffer_load_format
:
1666 case Intrinsic::amdgcn_raw_tbuffer_load
:
1667 case Intrinsic::amdgcn_raw_buffer_store
:
1668 case Intrinsic::amdgcn_raw_buffer_store_format
:
1669 case Intrinsic::amdgcn_raw_tbuffer_store
: {
1670 applyDefaultMapping(OpdMapper
);
1671 executeInWaterfallLoop(MI
, MRI
, {2, 4});
1674 case Intrinsic::amdgcn_struct_buffer_load
:
1675 case Intrinsic::amdgcn_struct_buffer_store
:
1676 case Intrinsic::amdgcn_struct_tbuffer_load
:
1677 case Intrinsic::amdgcn_struct_tbuffer_store
: {
1678 applyDefaultMapping(OpdMapper
);
1679 executeInWaterfallLoop(MI
, MRI
, {2, 5});
1683 if (const AMDGPU::RsrcIntrinsic
*RSrcIntrin
=
1684 AMDGPU::lookupRsrcIntrinsic(IntrID
)) {
1685 // Non-images can have complications from operands that allow both SGPR
1686 // and VGPR. For now it's too complicated to figure out the final opcode
1687 // to derive the register bank from the MCInstrDesc.
1688 if (RSrcIntrin
->IsImage
) {
1689 applyMappingImage(MI
, OpdMapper
, MRI
, RSrcIntrin
->RsrcArg
);
1699 case AMDGPU::G_LOAD
:
1700 case AMDGPU::G_ZEXTLOAD
:
1701 case AMDGPU::G_SEXTLOAD
: {
1702 if (applyMappingWideLoad(MI
, OpdMapper
, MRI
))
1710 return applyDefaultMapping(OpdMapper
);
1713 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr
&MI
) const {
1714 const MachineFunction
&MF
= *MI
.getParent()->getParent();
1715 const MachineRegisterInfo
&MRI
= MF
.getRegInfo();
1716 for (unsigned i
= 0, e
= MI
.getNumOperands();i
!= e
; ++i
) {
1717 if (!MI
.getOperand(i
).isReg())
1719 Register Reg
= MI
.getOperand(i
).getReg();
1720 if (const RegisterBank
*Bank
= getRegBank(Reg
, MRI
, *TRI
)) {
1721 if (Bank
->getID() == AMDGPU::VGPRRegBankID
)
1724 assert(Bank
->getID() == AMDGPU::SGPRRegBankID
||
1725 Bank
->getID() == AMDGPU::SCCRegBankID
);
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
    OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
  }
  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

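// Build a VALU mapping: the def and the register sources go to the VGPR bank,
// with 1-bit values going to VCC. The first non-def register source keeps
// whatever bank it already has; the remaining sources are forced to VGPR/VCC.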
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
  unsigned OpdIdx = 0;

  unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);

  if (MI.getOperand(OpdIdx).isIntrinsicID())
    OpdsMapping[OpdIdx++] = nullptr;

  Register Reg1 = MI.getOperand(OpdIdx).getReg();
  unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);

  unsigned DefaultBankID = Size1 == 1 ?
    AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
  unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);

  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);

  for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
    const MachineOperand &MO = MI.getOperand(OpdIdx);
    if (!MO.isReg())
      continue;

    unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
    OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

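// Map every register operand, defs and uses alike, to the VGPR bank. Used for
// operations whose operands must all live in vector registers (DS, DPP,
// atomics, and similar).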
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Op = MI.getOperand(I);
    if (!Op.isReg())
      continue;

    unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

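// Mapping for image resource intrinsics: the resource descriptor, and the
// sampler immediately following it if present, must be SGPRs; everything else
// is mapped to VGPRs. \p RsrcIdx is the rsrc argument index of the IR call.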
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
                                        const MachineInstr &MI,
                                        int RsrcIdx) const {
  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += MI.getNumExplicitDefs() + 1;

  const int NumOps = MI.getNumOperands();
  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);

  // TODO: Should packed/unpacked D16 difference be reported here as part of
  // the value mapping?
  for (int I = 0; I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    Register OpReg = MI.getOperand(I).getReg();
    unsigned Size = getSizeInBits(OpReg, MRI, *TRI);

    // FIXME: Probably need a new intrinsic register bank searchable table to
    // handle arbitrary intrinsics easily.

    // If this has a sampler, it immediately follows rsrc.
    const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;

    if (MustBeSGPR) {
      // This must be an SGPR, so we must report whatever it is as legal.
      unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
    } else {
      // Some operands must be VGPR, and these are easy to copy to.
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    }
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}

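// Choose the mapping for a generic load. A load that is known uniform,
// non-extending, at least 4-byte aligned, and not from LDS/GDS can be selected
// as a scalar (SMRD / S_LOAD) access, so both the result and the pointer stay
// in SGPRs. Anything else keeps the VGPR mapping and becomes a vector memory
// access. For example, a 4-byte aligned G_LOAD of an s32 through a uniform
// constant-address-space pointer would normally take the SGPR path here.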
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AS = PtrTy.getAddressSpace();
  unsigned PtrSize = PtrTy.getSizeInBits();

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  if (isInstrUniformNonExtLoadAlign4(MI) &&
      (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)) {
    // We have a uniform instruction so we want to use an SMRD load
    ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
  } else {
    ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;
  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
  return Mapping;

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
}

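// Return the ID of the bank currently assigned to \p Reg, through either a
// register bank or a register class, or \p Default if nothing has been
// assigned yet.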
unsigned
AMDGPURegisterBankInfo::getRegBankID(Register Reg,
                                     const MachineRegisterInfo &MRI,
                                     const TargetRegisterInfo &TRI,
                                     unsigned Default) const {

  const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
  return Bank ? Bank->getID() : Default;
}

static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
  return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
    AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
}

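// Per-operand helpers used by the intrinsic mappings below. getSGPROpMapping()
// reports the operand with whatever bank it currently has; the SGPR
// requirement is enforced later with readfirstlane or a waterfall loop.
// getVGPROpMapping() unconditionally requests the VGPR bank.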
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  // Lie and claim anything is legal, even though this needs to be an SGPR.
  // applyMapping will have to deal with it as a waterfall loop.
  unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(Bank, Size);
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}

/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR to SGPR copy to be generated is illegal.
///
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  if (MI.isRegSequence()) {
    // If any input is a VGPR, the result must be a VGPR. The default handling
    // assumes any copy between banks is legal.
    unsigned BankID = AMDGPU::SGPRRegBankID;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
      // It doesn't make sense to use vcc or scc banks here, so just ignore
      // them.
      if (OpBank != AMDGPU::SGPRRegBankID) {
        BankID = AMDGPU::VGPRRegBankID;
        break;
      }
    }
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }

  // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
  // properly.
  //
  // TODO: There are additional exec masking dependencies to analyze.
  if (MI.getOpcode() == TargetOpcode::G_PHI) {
    // TODO: Generate proper invalid bank enum.
    int ResultBank = -1;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      Register Reg = MI.getOperand(I).getReg();
      const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);

      // FIXME: Assuming VGPR for any undetermined inputs.
      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
        ResultBank = AMDGPU::VGPRRegBankID;
        break;
      }

      unsigned OpBank = Bank->getID();
      if (OpBank == AMDGPU::SCCRegBankID) {
        // There's only one SCC register, so a phi requires copying to SGPR.
        OpBank = AMDGPU::SGPRRegBankID;
      } else if (OpBank == AMDGPU::VCCRegBankID) {
        // vcc, sgpr -> vgpr
        if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
          ResultBank = AMDGPU::VGPRRegBankID;
          break;
        }
      }

      ResultBank = OpBank;
    }

    assert(ResultBank != -1);

    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    const ValueMapping &ValMap =
        getValueMapping(0, Size, getRegBank(ResultBank));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }

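  // Ask the generic implementation for a mapping first; it can usually derive
  // one from the operands' existing register banks or classes. Only fall back
  // to the explicit per-opcode handling below when that mapping is invalid.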
  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
  if (Mapping.isValid())
    return Mapping;

  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  switch (MI.getOpcode()) {
  default:
    return getInvalidInstructionMapping();

  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if (Size == 1) {
      const RegisterBank *DstBank
        = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);

      unsigned TargetBankID = -1;
      unsigned BankLHS = -1;
      unsigned BankRHS = -1;
      if (DstBank) {
        TargetBankID = DstBank->getID();
        if (DstBank == &AMDGPU::VCCRegBank) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else if (DstBank == &AMDGPU::SCCRegBank) {
          TargetBankID = AMDGPU::SCCRegBankID;
          BankLHS = AMDGPU::SGPRRegBankID;
          BankRHS = AMDGPU::SGPRRegBankID;
        } else {
          BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
          BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
        }
      } else {
        BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                               AMDGPU::VCCRegBankID);
        BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                               AMDGPU::VCCRegBankID);

        // Both inputs should be true booleans to produce a boolean result.
        if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
          TargetBankID = AMDGPU::VGPRRegBankID;
        } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
          TargetBankID = AMDGPU::SGPRRegBankID;
        } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
          // The operation must be done on a 32-bit register, but it will set
          // scc. The result type could interchangeably be SCC or SGPR, since
          // both values will be produced.
          TargetBankID = AMDGPU::SCCRegBankID;
          BankLHS = AMDGPU::SGPRRegBankID;
          BankRHS = AMDGPU::SGPRRegBankID;
        }
      }

      OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
      break;
    }

    if (Size == 64) {

      if (isSALUMapping(MI)) {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
        OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
      } else {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
        unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI /*, DefaultBankID*/);
        OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);

        unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI /*, DefaultBankID*/);
        OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
      }

      break;
    }

    LLVM_FALLTHROUGH;
  }
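  // Integer arithmetic, shifts, carries and min/max can run on either the
  // SALU or the VALU. Use the scalar mapping when every operand is already in
  // a scalar bank; otherwise fall through to the VALU mapping shared with the
  // FP opcodes below.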
  case AMDGPU::G_GEP:
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_SADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_SSUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX:
    if (isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    LLVM_FALLTHROUGH;

  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FPTOSI:
  case AMDGPU::G_FPTOUI:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_SITOFP:
  case AMDGPU::G_UITOFP:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FEXP2:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUND:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_UMULH:
  case AMDGPU::G_SMULH: {
    if (MF.getSubtarget<GCNSubtarget>().hasScalarMulHiInsts() &&
        isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  }
  case AMDGPU::G_IMPLICIT_DEF: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FCONSTANT:
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_INSERT: {
    unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
                      AMDGPU::VGPRRegBankID;
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
    OpdsMapping[3] = nullptr;
    break;
  }
  case AMDGPU::G_EXTRACT: {
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = nullptr;
    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == LLT::vector(2, 16)) {
      unsigned DstSize = DstTy.getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
      unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
      unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);

      OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
      break;
    }

    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_MERGE_VALUES:
  case AMDGPU::G_CONCAT_VECTORS: {
    unsigned Bank = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    // Op1 and Dst should use the same register bank.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_INTTOPTR:
  case AMDGPU::G_PTRTOINT:
  case AMDGPU::G_CTLZ:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ:
  case AMDGPU::G_CTTZ_ZERO_UNDEF:
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BSWAP:
  case AMDGPU::G_BITREVERSE:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FNEG: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_TRUNC: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned Bank = getRegBankID(Src, MRI, *TRI);
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ANYEXT: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);

    unsigned DstBank;
    const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
    assert(SrcBank);
    switch (SrcBank->getID()) {
    case AMDGPU::SCCRegBankID:
    case AMDGPU::SGPRRegBankID:
      DstBank = AMDGPU::SGPRRegBankID;
      break;
    default:
      DstBank = AMDGPU::VGPRRegBankID;
      break;
    }

    // TODO: Should anyext be split into 32-bit part as well?
    if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
      OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
    } else {
      // Scalar extend can use 64-bit BFE, but VGPRs require extending to
      // 32-bits, and then to 64.
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
                                                         SrcSize);
    }
    break;
  }
  case AMDGPU::G_FCMP: {
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_STORE: {
    assert(MI.getOperand(0).isReg());
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    // FIXME: We need to specify a different reg bank once scalar stores
    // are supported.
    const ValueMapping *ValMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    // FIXME: Depending on the type of store, the pointer could be in
    // the SGPR Reg bank.
    // FIXME: Pointer size should be based on the address space.
    const ValueMapping *PtrMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);

    OpdsMapping[0] = ValMapping;
    OpdsMapping[1] = PtrMapping;
    break;
  }
  case AMDGPU::G_ICMP: {
    auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);

    bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
                     Op3Bank == AMDGPU::SGPRRegBankID &&
                     (Size == 32 || (Size == 64 &&
                      (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
                      MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64()));

    unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
                            AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize);

    // The index can be either if the source vector is VGPR.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
                            AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
    unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize);

    // The index can be either if the source vector is VGPR.
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_UNMERGE_VALUES: {
    unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
                    AMDGPU::VGPRRegBankID;

    // Op1 and Dst should use the same register bank.
    // FIXME: Shouldn't this be the default? Why do we need to handle this?
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
    }
    break;
  }
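  // Intrinsics without side effects are mapped individually. Most plain ALU
  // intrinsics just take the default VALU mapping; the remaining ones spell
  // out the bank of each operand explicitly.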
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_ubfe:
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbit:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_wqm:
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_ds_swizzle:
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
      Register RSrc = MI.getOperand(2).getReg();   // SGPR
      Register Offset = MI.getOperand(3).getReg(); // SGPR/imm

      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
      unsigned Size3 = MRI.getType(Offset).getSizeInBits();

      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
      OpdsMapping[1] = nullptr; // intrinsic id

      // Lie and claim everything is legal, even though some need to be
      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
      OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
      OpdsMapping[4] = nullptr;
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(
        getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(
        getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
                                               Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
                                               Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
      unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These 2 must be SGPRs, but accept VGPRs; a readfirstlane will be
      // inserted.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    }
    break;
  }
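  // Intrinsics with side effects. Operands that ultimately must be SGPRs (M0
  // values, resource descriptors, sendmsg operands) are reported with whatever
  // bank they currently have; applyMappingImpl() later legalizes them with
  // readfirstlane or a waterfall loop.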
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume:
    case Intrinsic::amdgcn_ds_fadd:
    case Intrinsic::amdgcn_ds_fmin:
    case Intrinsic::amdgcn_ds_fmax:
    case Intrinsic::amdgcn_atomic_inc:
    case Intrinsic::amdgcn_atomic_dec:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[0] = nullptr; // IntrinsicID
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_buffer_load: {
      Register RSrc = MI.getOperand(2).getReg();   // SGPR
      Register VIndex = MI.getOperand(3).getReg(); // VGPR
      Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm

      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
      unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
      unsigned Size4 = MRI.getType(Offset).getSizeInBits();

      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
      OpdsMapping[1] = nullptr; // intrinsic id

      // Lie and claim everything is legal, even though some need to be
      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
      OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
      OpdsMapping[5] = nullptr;
      OpdsMapping[6] = nullptr;
      break;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf:
    case Intrinsic::amdgcn_init_exec: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    default:
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
        if (RSrcIntrin->IsImage)
          return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
      }

      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID);

    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    return getDefaultMappingAllVGPR(MI);
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SCCRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}