//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"
using namespace llvm;
using namespace MIPatternMatch;
namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      Register Reg = Op.getReg();
      if (MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      // FIXME: This might not be enough to detect when SCC should be used.
      if (MRI.getType(Reg) == LLT::scalar(1))
        RB = (NewBank == &AMDGPU::SGPRRegBank ?
              &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {}
};

} // anonymous namespace
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterBankInfo(),
      Subtarget(ST),
      TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static bool AlreadyInit = false;
  if (AlreadyInit)
    return;

  AlreadyInit = true;

  const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID);
  (void)RBSGPR;
  assert(&RBSGPR == &AMDGPU::SGPRRegBank);

  const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID);
  (void)RBVGPR;
  assert(&RBVGPR == &AMDGPU::VGPRRegBank);
}
unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      Src.getID() == AMDGPU::VGPRRegBankID) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SCCRegBankID ||
       Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  if (Dst.getID() == AMDGPU::SCCRegBankID &&
      Src.getID() == AMDGPU::VCCRegBankID)
    return std::numeric_limits<unsigned>::max();

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.
  //
  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}
const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
    const TargetRegisterClass &RC) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  return TRI->isSGPRClass(&RC) ? AMDGPU::SGPRRegBank : AMDGPU::VGPRRegBank;
}
template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // rsrc, voffset, offset
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load: {
    static const OpRegBankEntry<3> Table[4] = {
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Waterfall loop needed for rsrc. In the worst case this will execute
      // approximately an extra 10 * wavesize + 2 instructions.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
    };

    // rsrc, voffset, offset
    const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    static const OpRegBankEntry<3> Table[2] = {
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<2> Table[2] = {
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 2> RegSrcOpIdx = { { 1, 2 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}
static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
         AMDGPUInstrInfo::isUniformMMO(MMO);
}
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, { 0 }, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &SGPRMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&SGPRMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 10, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(
      3, 3, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SVMapping);

    // SGPR in LHS is slightly preferable, so make it VS more expensive than
    // SV.
    const InstructionMapping &VSMapping = getInstructionMapping(
      3, 4, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VSMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();
    LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
    if (isInstrUniformNonExtLoadAlign4(MI) &&
        (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
        2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 1, getOperandsMapping(
        {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
      2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_ICMP: {
    unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SVMapping);

    const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX: {
    static const OpRegBankEntry<3> Table[4] = {
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Scalar requires cmp+select, and extends if 16-bit.
      // FIXME: Should there be separate costs for 32 and 16-bit?
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}
/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}
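
// Return a type half as wide as \p Ty: half the element count for vectors,
// half the scalar width otherwise.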
static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getNumElements() % 2 == 0);
    return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
  }

  assert(Ty.getSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getSizeInBits() / 2);
}
/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity from comparing the values across lanes to
/// identify the unique values actually used.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineFunction *MF = MI.getParent()->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock::iterator I(MI);

  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() == AMDGPU::VGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  if (SGPROperandRegs.empty())
    return false;

  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;
  for (MachineOperand &Def : MI.defs()) {
    LLT ResTy = MRI.getType(Def.getReg());
    const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
    ResultRegs.push_back(Def.getReg());
    Register InitReg = B.buildUndef(ResTy).getReg(0);
    Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
    InitResultRegs.push_back(InitReg);
    PhiRegs.push_back(PhiReg);
    MRI.setRegBank(PhiReg, *DefBank);
    MRI.setRegBank(InitReg, *DefBank);
  }

  Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
  Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result)) // Mid-loop value.
      .addMBB(LoopBB);
  }

  // Move the instruction into the loop.
  LoopBB->splice(LoopBB->end(), &MBB, I);
  I = std::prev(LoopBB->end());

  Register CondReg;

  for (MachineOperand &Op : MI.uses()) {
    if (!Op.isReg())
      continue;

    if (SGPROperandRegs.count(Op.getReg())) {
      LLT OpTy = MRI.getType(Op.getReg());
      unsigned OpSize = OpTy.getSizeInBits();

      // Can only do a readlane of 32-bit pieces.
      if (OpSize == 32) {
        // Avoid extra copies in the simple case of one 32-bit register.
        Register CurrentLaneOpReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.setType(CurrentLaneOpReg, OpTy);

        constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
        // Read the next variant <- also loop target.
        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                CurrentLaneOpReg)
          .addReg(Op.getReg());

        Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
        bool First = CondReg == AMDGPU::NoRegister;
        if (First)
          CondReg = NewCondReg;

        // Compare the just read M0 value to all possible Idx values.
        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
          .addDef(NewCondReg)
          .addReg(CurrentLaneOpReg)
          .addReg(Op.getReg());
        Op.setReg(CurrentLaneOpReg);

        if (!First) {
          Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

          // If there are multiple operands to consider, AND the conditions.
          B.buildInstr(AMDGPU::S_AND_B64)
            .addDef(AndReg)
            .addReg(NewCondReg)
            .addReg(CondReg);
          CondReg = AndReg;
        }
      } else {
        LLT S32 = LLT::scalar(32);
        SmallVector<Register, 8> ReadlanePieces;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.

        bool Is64 = OpSize % 64 == 0;

        LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
        unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
                                          : AMDGPU::V_CMP_EQ_U32_e64;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.

        // Insert the unmerge before the loop.
        auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());

        unsigned NumPieces = Unmerge->getNumOperands() - 1;
        for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
          Register UnmergePiece = Unmerge.getReg(PieceIdx);

          Register CurrentLaneOpReg;
          if (Is64) {
            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegLo)
              .addReg(UnmergePiece, 0, AMDGPU::sub0);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegHi)
              .addReg(UnmergePiece, 0, AMDGPU::sub1);

            CurrentLaneOpReg =
              B.buildMerge(LLT::scalar(64),
                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
              .getReg(0);

            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

            if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, use the merged
              // pieces directly.
              ReadlanePieces.push_back(CurrentLaneOpReg);
            } else {
              // 32-bit element type.
              ReadlanePieces.push_back(CurrentLaneOpRegLo);
              ReadlanePieces.push_back(CurrentLaneOpRegHi);
            }
          } else {
            CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpReg)
              .addReg(UnmergePiece);
            ReadlanePieces.push_back(CurrentLaneOpReg);
          }

          Register NewCondReg
            = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          B.buildInstr(CmpOp)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(UnmergePiece);

          if (!First) {
            Register AndReg
              = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

            // If there are multiple operands to consider, AND the conditions.
            B.buildInstr(AMDGPU::S_AND_B64)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        }

        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
        // build_vector.
        if (OpTy.isVector()) {
          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        } else {
          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        }

        MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
      }
    }
  }

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(AMDGPU::S_XOR_B64_term)
    .addDef(AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0 instead?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
    .addReg(AMDGPU::EXEC);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(AMDGPU::S_MOV_B64_term)
    .addDef(AMDGPU::EXEC)
    .addReg(SaveExecReg);

  // Restore the insert point before the original instruction.
  B.setInsertPt(MBB, MBB.end());

  return true;
}
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}
// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank != &AMDGPU::VGPRRegBank)
    return;

  MachineIRBuilder B(MI);
  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
    .addDef(SGPR)
    .addReg(Reg);

  const TargetRegisterClass *Constrained =
      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(SGPR);
}
// When regbankselect repairs registers, it will insert a repair instruction
// which defines the repaired register. Then it calls applyMapping and expects
// that the targets will either delete or rewrite the instruction that
// originally wrote to the repaired registers. Because of this, we end up in a
// situation where we have 2 instructions defining the same registers.
static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
                                     Register Reg,
                                     const MachineInstr &MI) {
  // Is there some way we can assert that there are exactly 2 def instructions?
  for (MachineInstr &Other : MRI.def_instructions(Reg)) {
    if (&Other != &MI)
      return &Other;
  }

  return nullptr;
}
bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                                              MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;
  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));

  // If the pointer is an SGPR, we have nothing to do.
  if (SrcRegs.empty())
    return false;

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // We want to get the repair instruction now, because it will help us
  // determine which instruction the legalizer inserts that will also
  // write to DstReg.
  MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  MachineIRBuilder B(MI);

  unsigned SplitElts =
      MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
  const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
  ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
  GISelObserverWrapper Observer(&O);
  B.setChangeObserver(Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);
  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
    return false;

  // At this point, the legalizer has split the original load into smaller
  // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
  // that combines the outputs of the lower loads and writes it to DstReg.
  // The register bank selector has also added the RepairInst which writes to
  // DstReg.
  MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);

  // Replace the output of the LegalizedInst with a temporary register, since
  // RepairInst already defines DstReg.
  Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
  LegalizedInst->getOperand(0).setReg(TmpReg);
  B.setInsertPt(*RepairInst->getParent(), RepairInst);

  for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
    Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    B.buildConstant(IdxReg, DefIdx);
    MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
    B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
  }

  MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
  return true;
}
bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI, int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
  return true;
}
// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand.
static void substituteSimpleCopyRegs(
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
  if (!SrcReg.empty()) {
    assert(SrcReg.size() == 1);
    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
  }
}
/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register Reg) const {
  if (!Subtarget.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  LLT StoreVT = MRI.getType(Reg);
  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
    return Reg;

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(Unmerge.getReg(I));

  const LLT S32 = LLT::scalar(32);
  int NumElts = StoreVT.getNumElements();

  return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}
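
// Decompose \p Reg into a base register plus a constant offset. A bare
// constant yields an empty base register.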
static std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
  int64_t Const;
  if (mi_match(Reg, MRI, m_ICst(Const)))
    return std::make_pair(Register(), Const);

  Register Base;
  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
    return std::make_pair(Base, Const);

  // TODO: Handle G_OR used for add case.
  return std::make_pair(Reg, 0);
}
std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);

  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
                                                           OrigOffset);

  unsigned C1 = 0;
  if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, put the value
    // and -4096 into the immoffset field so that the value that is copied/added
    // for the voffset field is a multiple of 4096, and it stands more chance
    // of being CSEd with the copy/add for another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }

    C1 = ImmOffset;
    if (Overflow != 0) {
      if (!BaseReg)
        BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      else {
        auto OverflowVal = B.buildConstant(S32, Overflow);
        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
      }
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return {BaseReg, C1};
}
static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
  int64_t C;
  return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
}
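
// The cache policy operand packs glc in bit 0, slc in bit 1, and dlc in bit 2.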
static unsigned extractGLC(unsigned CachePolicy) {
  return CachePolicy & 1;
}

static unsigned extractSLC(unsigned CachePolicy) {
  return (CachePolicy >> 1) & 1;
}

static unsigned extractDLC(unsigned CachePolicy) {
  return (CachePolicy >> 2) & 1;
}
&B
,
1191 MachineInstr
&MI
) const {
1192 MachineRegisterInfo
&MRI
= *B
.getMRI();
1193 executeInWaterfallLoop(B
, MI
, MRI
, {2, 4});
1195 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1197 Register VData
= MI
.getOperand(1).getReg();
1198 LLT Ty
= MRI
.getType(VData
);
1200 int EltSize
= Ty
.getScalarSizeInBits();
1201 int Size
= Ty
.getSizeInBits();
1203 // FIXME: Broken integer truncstore.
1205 report_fatal_error("unhandled intrinsic store");
1207 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1208 const int MemSize
= (*MI
.memoperands_begin())->getSize();
1211 Register RSrc
= MI
.getOperand(2).getReg();
1212 Register VOffset
= MI
.getOperand(3).getReg();
1213 Register SOffset
= MI
.getOperand(4).getReg();
1214 unsigned CachePolicy
= MI
.getOperand(5).getImm();
1217 std::tie(VOffset
, ImmOffset
) = splitBufferOffsets(B
, VOffset
);
1219 const bool Offen
= !isZero(VOffset
, MRI
);
1221 unsigned Opc
= AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact
;
1222 switch (8 * MemSize
) {
1224 Opc
= Offen
? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact
:
1225 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact
;
1228 Opc
= Offen
? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact
:
1229 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact
;
1232 Opc
= Offen
? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact
:
1233 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact
;
1235 Opc
= AMDGPU::getMUBUFOpcode(Opc
, Size
/ 32);
1240 // Set the insertion point back to the instruction in case it was moved into a
1244 MachineInstrBuilder MIB
= B
.buildInstr(Opc
)
1248 MIB
.addUse(VOffset
);
1253 .addImm(extractGLC(CachePolicy
))
1254 .addImm(extractSLC(CachePolicy
))
1255 .addImm(0) // tfe: FIXME: Remove from inst
1256 .addImm(extractDLC(CachePolicy
))
1259 // FIXME: We need a way to report failure from applyMappingImpl.
1260 // Insert constrain copies before inserting the loop.
1261 if (!constrainSelectedInstRegOperands(*MIB
, *TII
, *TRI
, *this))
1262 report_fatal_error("failed to constrain selected store intrinsic");
void AMDGPURegisterBankInfo::applyMappingImpl(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  unsigned Opc = MI.getOpcode();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  switch (Opc) {
  case AMDGPU::G_SELECT: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src1Regs.empty() && Src2Regs.empty());
      break;
    }

    MachineIRBuilder B(MI);
    if (Src0Regs.empty())
      Src0Regs.push_back(MI.getOperand(1).getReg());
    else {
      assert(Src0Regs.size() == 1);
    }

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else {
      setRegsToType(MRI, Src1Regs, HalfTy);
    }

    if (Src2Regs.empty())
      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
    else
      setRegsToType(MRI, Src2Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
    B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);

    MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
    // there is a VGPR input.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);
    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src0Regs.empty() && Src1Regs.empty());
      break;
    }

    assert(DefRegs.size() == 2);
    assert(Src0Regs.size() == Src1Regs.size() &&
           (Src0Regs.empty() || Src0Regs.size() == 2));

    // Depending on where the source registers came from, the generic code may
    // have decided to split the inputs already or not. If not, we still need to
    // extract the values.
    MachineIRBuilder B(MI);

    if (Src0Regs.empty())
      split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
    else
      setRegsToType(MRI, Src0Regs, HalfTy);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildInstr(Opc)
      .addDef(DefRegs[0])
      .addUse(Src0Regs[0])
      .addUse(Src1Regs[0]);

    B.buildInstr(Opc)
      .addDef(DefRegs[1])
      .addUse(Src0Regs[1])
      .addUse(Src1Regs[1]);

    MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_MUL: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(16))
      break;

    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    MachineFunction *MF = MI.getParent()->getParent();
    MachineIRBuilder B(MI);
    ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
    GISelObserverWrapper Observer(&ApplySALU);
    LegalizerHelper Helper(*MF, Observer, B);

    if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
        LegalizerHelper::Legalized)
      llvm_unreachable("widen scalar should have succeeded");
    return;
  }
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    Register DstReg = MI.getOperand(0).getReg();
    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    MachineFunction *MF = MI.getParent()->getParent();
    MachineIRBuilder B(MI);
    ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
    GISelObserverWrapper Observer(&ApplySALU);
    LegalizerHelper Helper(*MF, Observer, B);

    // Turn scalar min/max into a compare and select.
    LLT Ty = MRI.getType(DstReg);
    LLT S32 = LLT::scalar(32);
    LLT S16 = LLT::scalar(16);

    if (Ty == S16) {
      // Need to widen to s32, and expand as cmp + select.
      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widenScalar should have succeeded");

      // FIXME: This is relying on widenScalar leaving MI in place.
      if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("lower should have succeeded");
    } else {
      if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized)
        llvm_unreachable("lower should have succeeded");
    }

    return;
  }
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    bool Signed = Opc == AMDGPU::G_SEXT;

    MachineIRBuilder B(MI);
    const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);

    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::SCCRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that round to s64 when irregular
        // breakdowns supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      const LLT S32 = LLT::scalar(32);
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      // Extend to 32-bit, and then extend the low half.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(DefRegs[0], SrcReg);

        // Replicate sign bit from 32-bit extended part.
        auto ShiftAmt = B.buildConstant(S32, 31);
        MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
        B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
      } else {
        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
        B.buildConstant(DefRegs[1], 0);
      }

      MRI.setRegBank(DstReg, *SrcBank);
      MI.eraseFromParent();
      return;
    }

    if (SrcTy != LLT::scalar(1))
      return;

    if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ?
        &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank;

      unsigned DstSize = DstTy.getSizeInBits();
      // 64-bit select is SGPR only
      const bool UseSel64 = DstSize > 32 &&
        SrcBank->getID() == AMDGPU::SCCRegBankID;

      // TODO: Should s16 select be legal?
      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
      auto True = B.buildConstant(SelType, Signed ? -1 : 1);
      auto False = B.buildConstant(SelType, 0);

      MRI.setRegBank(True.getReg(0), *DstBank);
      MRI.setRegBank(False.getReg(0), *DstBank);
      MRI.setRegBank(DstReg, *DstBank);

      if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) {
        B.buildSelect(DefRegs[0], SrcReg, True, False);
        B.buildCopy(DefRegs[1], DefRegs[0]);
      } else if (DstSize < 32) {
        auto Sel = B.buildSelect(SelType, SrcReg, True, False);
        MRI.setRegBank(Sel.getReg(0), *DstBank);
        B.buildTrunc(DstReg, Sel);
      } else {
        B.buildSelect(DstReg, SrcReg, True, False);
      }

      MI.eraseFromParent();
      return;
    }

    // Fixup the case with an s1 src that isn't a condition register. Use shifts
    // instead of introducing a compare to avoid an unnecessary condition
    // register (and since there's no scalar 16-bit compares).
    auto Ext = B.buildAnyExt(DstTy, SrcReg);
    auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
    auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);

    if (MI.getOpcode() == AMDGPU::G_SEXT)
      B.buildAShr(DstReg, Shl, ShiftAmt);
    else
      B.buildLShr(DstReg, Shl, ShiftAmt);

    MRI.setRegBank(DstReg, *SrcBank);
    MRI.setRegBank(Ext.getReg(0), *SrcBank);
    MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
    MRI.setRegBank(Shl.getReg(0), *SrcBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::vector(2, 16))
      break;

    assert(MI.getNumOperands() == 3 && empty(OpdMapper.getVRegs(0)));
    substituteSimpleCopyRegs(OpdMapper, 1);
    substituteSimpleCopyRegs(OpdMapper, 2);

    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
    if (DstBank == &AMDGPU::SGPRRegBank)
      break; // Can use S_PACK_* instructions.

    MachineIRBuilder B(MI);

    Register Lo = MI.getOperand(1).getReg();
    Register Hi = MI.getOperand(2).getReg();
    const LLT S32 = LLT::scalar(32);

    const RegisterBank *BankLo = getRegBank(Lo, MRI, *TRI);
    const RegisterBank *BankHi = getRegBank(Hi, MRI, *TRI);

    Register ZextLo;
    Register ShiftHi;

    if (Opc == AMDGPU::G_BUILD_VECTOR) {
      ZextLo = B.buildZExt(S32, Lo).getReg(0);
      MRI.setRegBank(ZextLo, *BankLo);

      Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
      MRI.setRegBank(ZextHi, *BankHi);

      auto ShiftAmt = B.buildConstant(S32, 16);
      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);

      ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
      MRI.setRegBank(ShiftHi, *BankHi);
    } else {
      Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
      MRI.setRegBank(MaskLo, *BankLo);

      auto ShiftAmt = B.buildConstant(S32, 16);
      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);

      ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
      MRI.setRegBank(ShiftHi, *BankHi);

      ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
      MRI.setRegBank(ZextLo, *BankLo);
    }

    auto Or = B.buildOr(S32, ZextLo, ShiftHi);
    MRI.setRegBank(Or.getReg(0), *DstBank);

    B.buildBitcast(DstReg, Or);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT:
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, { 2 });
    return;
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    case Intrinsic::amdgcn_s_buffer_load: {
      // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
      executeInWaterfallLoop(MI, MRI, { 2, 3 });
      return;
    }
    case Intrinsic::amdgcn_readlane: {
      substituteSimpleCopyRegs(OpdMapper, 2);

      assert(empty(OpdMapper.getVRegs(0)));
      assert(empty(OpdMapper.getVRegs(3)));

      // Make sure the index is an SGPR. It doesn't make sense to run this in a
      // waterfall loop, so assume it's a uniform value.
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_writelane: {
      assert(empty(OpdMapper.getVRegs(0)));
      assert(empty(OpdMapper.getVRegs(2)));
      assert(empty(OpdMapper.getVRegs(3)));

      substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
      constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    default:
      break;
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_buffer_load: {
      executeInWaterfallLoop(MI, MRI, { 2 });
      return;
    }
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
      assert(empty(OpdMapper.getVRegs(0)));
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // FIXME: Should this use a waterfall loop?
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_buffer_load_format:
    case Intrinsic::amdgcn_raw_tbuffer_load:
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      applyDefaultMapping(OpdMapper);
      executeInWaterfallLoop(MI, MRI, {2, 4});
      return;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      applyDefaultMapping(OpdMapper);
      executeInWaterfallLoop(MI, MRI, {2, 5});
      return;
    }
    default: {
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
        if (RSrcIntrin->IsImage) {
          applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
          return;
        }
      }

      break;
    }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD: {
    if (applyMappingWideLoad(MI, OpdMapper, MRI))
      return;
    break;
  }
  default:
    break;
  }

  return applyDefaultMapping(OpdMapper);
}
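
// Return true if every register operand of \p MI is already assigned to a
// scalar (SGPR or SCC) bank.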
bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    if (!MI.getOperand(i).isReg())
      continue;
    Register Reg = MI.getOperand(i).getReg();
    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
      if (Bank->getID() == AMDGPU::VGPRRegBankID)
        return false;

      assert(Bank->getID() == AMDGPU::SGPRRegBankID ||
             Bank->getID() == AMDGPU::SCCRegBankID);
    }
  }
  return true;
}
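
// Default mapping when everything can stay scalar: SGPRs for all operands,
// with 1-bit values placed in the SCC bank.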
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
    OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
  }
  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
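/// Default vector mapping: the result and inputs go to the VGPR bank (VCC for
/// 1-bit values); an intrinsic ID operand, if present, is left unmapped.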
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
  unsigned OpdIdx = 0;

  unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);

  if (MI.getOperand(OpdIdx).isIntrinsicID())
    OpdsMapping[OpdIdx++] = nullptr;

  Register Reg1 = MI.getOperand(OpdIdx).getReg();
  unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);

  unsigned DefaultBankID = Size1 == 1 ?
    AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
  unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);

  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);

  for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
    const MachineOperand &MO = MI.getOperand(OpdIdx);
    if (!MO.isReg())
      continue;

    unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
    OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
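/// Map every register operand of \p MI to the VGPR bank at its current size.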
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Op = MI.getOperand(I);
    if (!Op.isReg())
      continue;

    unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
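/// Build the mapping for an image intrinsic: the resource descriptor (and the
/// sampler immediately following it, if any) is reported with whatever bank it
/// currently has so it stays legal, while all other register operands are
/// mapped to VGPRs.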
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
                                        const MachineInstr &MI,
                                        int RsrcIdx) const {
  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += MI.getNumExplicitDefs() + 1;

  const int NumOps = MI.getNumOperands();
  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);

  // TODO: Should packed/unpacked D16 difference be reported here as part of
  // the value mapping?
  for (int I = 0; I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    Register OpReg = MI.getOperand(I).getReg();
    unsigned Size = getSizeInBits(OpReg, MRI, *TRI);

    // FIXME: Probably need a new intrinsic register bank searchable table to
    // handle arbitrary intrinsics easily.

    // If this has a sampler, it immediately follows rsrc.
    const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;

    if (MustBeSGPR) {
      // This must be an SGPR, so report whatever it currently is as legal.
      unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
    } else {
      // Some operands must be VGPR, and these are easy to copy to.
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    }
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}
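/// Choose the mapping for a generic load: uniform, non-extending loads that
/// are at least 4-byte aligned and not in the LDS/region address spaces can
/// use scalar (SMRD) loads on the SGPR bank; everything else goes to VGPRs.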
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AS = PtrTy.getAddressSpace();
  unsigned PtrSize = PtrTy.getSizeInBits();

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  if (isInstrUniformNonExtLoadAlign4(MI) &&
      (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)) {
    // We have a uniform instruction so we want to use an SMRD load.
    ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
  } else {
    ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;
  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
  return Mapping;
}
unsigned AMDGPURegisterBankInfo::getRegBankID(Register Reg,
                                              const MachineRegisterInfo &MRI,
                                              const TargetRegisterInfo &TRI,
                                              unsigned Default) const {
  const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
  return Bank ? Bank->getID() : Default;
}
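// Combine the banks of two sources: the result is only SGPR if both inputs
// are already SGPR; any other combination is treated as VGPR.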
static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
  return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
    AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
}
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  // Lie and claim anything is legal, even though this needs to be an SGPR;
  // applyMapping will have to deal with it as a waterfall loop.
  unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(Bank, Size);
}
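/// Map \p Reg to the VGPR bank at its current size; copies into VGPRs from any
/// other bank are always legal, so no constraint needs to be reported.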
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would require a
/// VGPR-to-SGPR copy to be generated is illegal.
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  if (MI.isRegSequence()) {
    // If any input is a VGPR, the result must be a VGPR. The default handling
    // assumes any copy between banks is legal.
    unsigned BankID = AMDGPU::SGPRRegBankID;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
      // It doesn't make sense to use vcc or scc banks here, so just ignore
      // them.
      if (OpBank != AMDGPU::SGPRRegBankID) {
        BankID = AMDGPU::VGPRRegBankID;
        break;
      }
    }
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }

  // The default handling is broken and doesn't handle illegal SGPR->VGPR
  // copies properly.
  //
  // TODO: There are additional exec masking dependencies to analyze.
  if (MI.getOpcode() == TargetOpcode::G_PHI) {
    // TODO: Generate proper invalid bank enum.
    int ResultBank = -1;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      Register Reg = MI.getOperand(I).getReg();
      const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);

      // FIXME: Assuming VGPR for any undetermined inputs.
      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
        ResultBank = AMDGPU::VGPRRegBankID;
        break;
      }

      unsigned OpBank = Bank->getID();
      if (OpBank == AMDGPU::SCCRegBankID) {
        // There's only one SCC register, so a phi requires copying to SGPR.
        OpBank = AMDGPU::SGPRRegBankID;
      } else if (OpBank == AMDGPU::VCCRegBankID) {
        // vcc, vcc -> vcc
        // vcc, sgpr -> vgpr
        if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
          ResultBank = AMDGPU::VGPRRegBankID;
          break;
        }
      }

      ResultBank = OpBank;
    }

    assert(ResultBank != -1);

    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    const ValueMapping &ValMap =
        getValueMapping(0, Size, getRegBank(ResultBank));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }
  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
  if (Mapping.isValid())
    return Mapping;

  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  switch (MI.getOpcode()) {
  default:
    return getInvalidInstructionMapping();
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if (Size == 1) {
      const RegisterBank *DstBank
        = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);

      unsigned TargetBankID = -1;
      unsigned BankLHS = -1;
      unsigned BankRHS = -1;
      if (DstBank) {
        TargetBankID = DstBank->getID();
        if (DstBank == &AMDGPU::VCCRegBank) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else if (DstBank == &AMDGPU::SCCRegBank) {
          TargetBankID = AMDGPU::SCCRegBankID;
          BankLHS = AMDGPU::SGPRRegBankID;
          BankRHS = AMDGPU::SGPRRegBankID;
        } else {
          BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
          BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
        }
      } else {
        BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                               AMDGPU::VCCRegBankID);
        BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                               AMDGPU::VCCRegBankID);

        // Both inputs should be true booleans to produce a boolean result.
        if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
          TargetBankID = AMDGPU::VGPRRegBankID;
        } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
          TargetBankID = AMDGPU::SGPRRegBankID;
        } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
          // The operation must be done on a 32-bit register, but it will set
          // scc. The result type could interchangeably be SCC or SGPR, since
          // both values will be produced.
          TargetBankID = AMDGPU::SCCRegBankID;
          BankLHS = AMDGPU::SGPRRegBankID;
          BankRHS = AMDGPU::SGPRRegBankID;
        }
      }

      OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
      break;
    }

    if (Size == 64) {
      if (isSALUMapping(MI)) {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
        OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
      } else {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
        unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
        OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);

        unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
        OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
      }

      break;
    }

    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_SADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_SSUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX:
    if (isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    LLVM_FALLTHROUGH;

  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FPTOSI:
  case AMDGPU::G_FPTOUI:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_SITOFP:
  case AMDGPU::G_UITOFP:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FEXP2:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUND:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_UMULH:
  case AMDGPU::G_SMULH: {
    if (MF.getSubtarget<GCNSubtarget>().hasScalarMulHiInsts() &&
        isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  }
  case AMDGPU::G_IMPLICIT_DEF: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FCONSTANT:
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_INSERT: {
    unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
                                          AMDGPU::VGPRRegBankID;
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
    OpdsMapping[3] = nullptr;
    break;
  }
  case AMDGPU::G_EXTRACT: {
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = nullptr;
    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == LLT::vector(2, 16)) {
      unsigned DstSize = DstTy.getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
      unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
      unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);

      OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
      break;
    }

    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_MERGE_VALUES:
  case AMDGPU::G_CONCAT_VECTORS: {
    unsigned Bank = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    // Op1 and Dst should use the same register bank.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_INTTOPTR:
  case AMDGPU::G_PTRTOINT:
  case AMDGPU::G_CTLZ:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ:
  case AMDGPU::G_CTTZ_ZERO_UNDEF:
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BSWAP:
  case AMDGPU::G_BITREVERSE:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FNEG: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_TRUNC: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned Bank = getRegBankID(Src, MRI, *TRI);
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ANYEXT: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);

    unsigned DstBank;
    const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
    assert(SrcBank);
    switch (SrcBank->getID()) {
    case AMDGPU::SCCRegBankID:
    case AMDGPU::SGPRRegBankID:
      DstBank = AMDGPU::SGPRRegBankID;
      break;
    default:
      DstBank = AMDGPU::VGPRRegBankID;
      break;
    }

    // TODO: Should anyext be split into 32-bit part as well?
    if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
      OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
    } else {
      // Scalar extend can use 64-bit BFE, but VGPRs require extending to
      // 32-bits, and then to 64.
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
                                                         SrcSize);
    }
    break;
  }
  case AMDGPU::G_FCMP: {
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_STORE: {
    assert(MI.getOperand(0).isReg());
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    // FIXME: We need to specify a different reg bank once scalar stores are
    // supported.
    const ValueMapping *ValMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    // FIXME: Depending on the type of store, the pointer could be in
    //        the SGPR Reg bank.
    // FIXME: Pointer size should be based on the address space.
    const ValueMapping *PtrMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);

    OpdsMapping[0] = ValMapping;
    OpdsMapping[1] = PtrMapping;
    break;
  }
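  // Integer compares may stay on the SALU (producing SCC) only when both
  // operands are uniform and the comparison is supported by s_cmp: any 32-bit
  // predicate, or 64-bit eq/ne on subtargets with scalar 64-bit compares.
  // Otherwise the result is a lane mask in VCC.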
  case AMDGPU::G_ICMP: {
    auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);

    bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
                     Op3Bank == AMDGPU::SGPRRegBankID &&
                     (Size == 32 || (Size == 64 &&
                      (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
                      MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64()));

    unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
                            AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize);

    // The index can be either if the source vector is VGPR.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
    unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize);

    // The index can be either if the source vector is VGPR.
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_UNMERGE_VALUES: {
    unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
                                        AMDGPU::VGPRRegBankID;

    // Op1 and Dst should use the same register bank.
    // FIXME: Shouldn't this be the default? Why do we need to handle this?
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
    }
    break;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_ubfe:
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbit:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_wqm:
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_ds_swizzle:
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
      Register RSrc = MI.getOperand(2).getReg();   // SGPR
      Register Offset = MI.getOperand(3).getReg(); // SGPR/imm

      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
      unsigned Size3 = MRI.getType(Offset).getSizeInBits();

      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
      OpdsMapping[1] = nullptr; // intrinsic id

      // Lie and claim everything is legal, even though some need to be
      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
      OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
      OpdsMapping[4] = nullptr;
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(
        getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(
        getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
                                               Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
                                               Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
      unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
      // to legalize.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume:
    case Intrinsic::amdgcn_ds_fadd:
    case Intrinsic::amdgcn_ds_fmin:
    case Intrinsic::amdgcn_ds_fmax:
    case Intrinsic::amdgcn_atomic_inc:
    case Intrinsic::amdgcn_atomic_dec:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[0] = nullptr; // IntrinsicID
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_buffer_load: {
      Register RSrc = MI.getOperand(2).getReg();   // SGPR
      Register VIndex = MI.getOperand(3).getReg(); // VGPR
      Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm

      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
      unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
      unsigned Size4 = MRI.getType(Offset).getSizeInBits();

      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
      OpdsMapping[1] = nullptr; // intrinsic id

      // Lie and claim everything is legal, even though some need to be
      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
      OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
      OpdsMapping[5] = nullptr;
      OpdsMapping[6] = nullptr;
      break;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    default:
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
        if (RSrcIntrin->IsImage)
          return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
      }

      return getInvalidInstructionMapping();
    }
    break;
  }
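  // A select stays scalar only when both source values and the condition are
  // uniform; the condition is then modeled in SCC. Any divergent input forces
  // the values into VGPRs and the condition into a VCC lane mask.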
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID);

    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    return getDefaultMappingAllVGPR(MI);
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SCCRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }
  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}