//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
14 #include "AMDGPURegisterBankInfo.h"
15 #include "AMDGPUInstrInfo.h"
16 #include "AMDGPUSubtarget.h"
17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/ADT/SmallSet.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
24 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
25 #include "llvm/CodeGen/TargetRegisterInfo.h"
26 #include "llvm/CodeGen/TargetSubtargetInfo.h"
27 #include "llvm/IR/Constants.h"
29 #define GET_TARGET_REGBANK_IMPL
30 #include "AMDGPUGenRegisterBank.inc"
32 // This file will be TableGen'ed at some point.
33 #include "AMDGPUGenRegisterBankInfo.def"
// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      Register Reg = Op.getReg();
      if (MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      // FIXME: This might not be enough to detect when SCC should be used.
      if (MRI.getType(Reg) == LLT::scalar(1))
        RB = (NewBank == &AMDGPU::SGPRRegBank ?
              &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {}
};

}
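// An illustrative usage sketch (the local names here are hypothetical; see
// applyMappingWideLoad and applyMappingImpl below for the real callers):
//
//   ApplyRegBankMapping Apply(MRI, &AMDGPU::VGPRRegBank);
//   GISelObserverWrapper Observer(&Apply);
//   LegalizerHelper Helper(B.getMF(), Observer, B);
//
// Instructions the helper creates are recorded in NewInsts, and any of their
// registers still lacking a class or bank are assigned to the requested bank
// when the observer is destroyed.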
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
    : AMDGPUGenRegisterBankInfo(),
      TRI(static_cast<const SIRegisterInfo*>(&TRI)) {

  // HACK: Until this is fully tablegen'd.
  static bool AlreadyInit = false;
  if (AlreadyInit)
    return;

  AlreadyInit = true;

  const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID);
  (void)RBSGPR;
  assert(&RBSGPR == &AMDGPU::SGPRRegBank);

  const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID);
  (void)RBVGPR;
  assert(&RBVGPR == &AMDGPU::VGPRRegBank);
}
unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      Src.getID() == AMDGPU::VGPRRegBankID) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SCCRegBankID ||
       Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  if (Dst.getID() == AMDGPU::SCCRegBankID &&
      Src.getID() == AMDGPU::VCCRegBankID)
    return std::numeric_limits<unsigned>::max();

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}
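// Note: returning std::numeric_limits<unsigned>::max() above effectively
// forbids RegBankSelect from repairing these cases with a plain COPY. For
// example, a VGPR value feeding an SGPR use has no single scalar equivalent
// when it is divergent, so such cases must instead be handled by an
// alternative mapping, a readfirstlane, or a waterfall loop (see below).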
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}
const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
    const TargetRegisterClass &RC) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  return TRI->isSGPRClass(&RC) ? AMDGPU::SGPRRegBank : AMDGPU::VGPRRegBank;
}
template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}
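// A sketch of the table form consumed above: each OpRegBankEntry row gives one
// candidate bank per source-register operand plus a relative cost. For
// example, for amdgcn_readlane below:
//
//   { { SGPRRegBankID, VGPRRegBankID, SGPRRegBankID }, 1 }  // legal as-is
//   { { SGPRRegBankID, VGPRRegBankID, VGPRRegBankID }, 2 }  // readfirstlane
//
// RegSrcOpIdx maps each column back to an operand index of MI.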
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value to write, lane select, old value
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load: {
    static const OpRegBankEntry<3> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Waterfall loop needed for rsrc. In the worst case this will execute
      // approximately an extra 10 * wavesize + 2 instructions.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
    };

    // rsrc, voffset, offset
    const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}
static bool isInstrUniform(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  return AMDGPUInstrInfo::isUniformMMO(MMO);
}
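// Uniformity is what allows a load to be mapped onto the scalar unit: a
// uniform load can take an SGPR pointer and select to a scalar (SMRD)
// instruction, while a divergent load must stay on the VALU with a VGPR
// pointer (see getInstrMappingForLoad below).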
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, { 0 }, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &SGPRMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&SGPRMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 10, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(
      3, 3, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SVMapping);

    // SGPR in LHS is slightly preferable, so make the VS mapping more
    // expensive than SV.
    const InstructionMapping &VSMapping = getInstructionMapping(
      3, 4, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VSMapping);
    break;
  }
  case TargetOpcode::G_LOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
    // FIXME: Should we be hard coding the size for these mappings?
    if (isInstrUniform(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
        2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 1, getOperandsMapping(
        {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
      2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_ICMP: {
    unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SVMapping);

    const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX: {
    static const OpRegBankEntry<3> Table[4] = {
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Scalar requires cmp+select, and extends if 16-bit.
      // FIXME: Should there be separate costs for 32 and 16-bit?
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}
/// Replace the current type each register in \p Regs has with \p NewTy.
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}
static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getNumElements() % 2 == 0);
    return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
  }

  assert(Ty.getSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getSizeInBits() / 2);
}
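// For example, s64 -> s32 and <4 x s32> -> <2 x s32>. A type with an odd
// element count such as <3 x s32> would trip the assertion above.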
/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity in that the loop compares the values across
/// lanes, so each unique value is only processed once.
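///
/// A sketch (not exact MIR) of the key instructions built below for a single
/// 32-bit operand:
///
///   %sgpr = V_READFIRSTLANE_B32 %vgpr_op     ; read value from first lane
///   %cond = V_CMP_EQ_U32_e64 %sgpr, %vgpr_op ; which lanes hold that value
///   S_AND_SAVEEXEC_B64 %cond                 ; limit exec to those lanes
///   EXEC = S_XOR_B64_term EXEC, ...          ; turn the handled lanes off
///   S_CBRANCH_EXECNZ <loop>                  ; repeat while lanes remain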
void AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineFunction *MF = MI.getParent()->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock::iterator I(MI);

  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  // Use a set to avoid extra readfirstlanes in the case where multiple
  // operands are the same register.
  SmallSet<Register, 4> SGPROperandRegs;
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() == AMDGPU::VGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }
  // No operands need to be replaced, so no need to loop.
  if (SGPROperandRegs.empty())
    return;

  MachineIRBuilder B(MI);
  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;
  for (MachineOperand &Def : MI.defs()) {
    LLT ResTy = MRI.getType(Def.getReg());
    const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
    ResultRegs.push_back(Def.getReg());
    Register InitReg = B.buildUndef(ResTy).getReg(0);
    Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
    InitResultRegs.push_back(InitReg);
    PhiRegs.push_back(PhiReg);
    MRI.setRegBank(PhiReg, *DefBank);
    MRI.setRegBank(InitReg, *DefBank);
  }
  Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
  Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);
  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result)) // Mid-loop value.
      .addMBB(LoopBB);
  }

  // Move the instruction into the loop.
  LoopBB->splice(LoopBB->end(), &MBB, I);
  I = std::prev(LoopBB->end());

  B.setInstr(*I);

  Register CondReg;
  for (MachineOperand &Op : MI.uses()) {
    if (!Op.isReg())
      continue;

    if (SGPROperandRegs.count(Op.getReg())) {
      LLT OpTy = MRI.getType(Op.getReg());
      unsigned OpSize = OpTy.getSizeInBits();

      // Can only do a readlane of 32-bit pieces.
      if (OpSize == 32) {
        // Avoid extra copies in the simple case of one 32-bit register.
        Register CurrentLaneOpReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.setType(CurrentLaneOpReg, OpTy);

        constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
        // Read the next variant <- also loop target.
        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                CurrentLaneOpReg)
          .addReg(Op.getReg());

        Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
        bool First = CondReg == AMDGPU::NoRegister;
        if (First)
          CondReg = NewCondReg;

        // Compare the value just read against the value in every active lane.
        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
          .addDef(NewCondReg)
          .addReg(CurrentLaneOpReg)
          .addReg(Op.getReg());
        Op.setReg(CurrentLaneOpReg);

        if (!First) {
          Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

          // If there are multiple operands to consider, AND the conditions
          // together.
          B.buildInstr(AMDGPU::S_AND_B64)
            .addDef(AndReg)
            .addReg(NewCondReg)
            .addReg(CondReg);
          CondReg = AndReg;
        }
      } else {
        LLT S32 = LLT::scalar(32);
        SmallVector<Register, 8> ReadlanePieces;

        // The compares can be done as 64-bit, but the extract needs to be
        // done in 32-bit pieces.

        bool Is64 = OpSize % 64 == 0;

        LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
        unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
                                          : AMDGPU::V_CMP_EQ_U32_e64;

        // Insert the unmerge before the loop.
        B.setMBB(MBB);
        auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
        B.setInstr(*I);

        unsigned NumPieces = Unmerge->getNumOperands() - 1;
        for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
          Register UnmergePiece = Unmerge.getReg(PieceIdx);

          Register CurrentLaneOpReg;
          if (Is64) {
            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegLo)
              .addReg(UnmergePiece, 0, AMDGPU::sub0);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegHi)
              .addReg(UnmergePiece, 0, AMDGPU::sub1);

            CurrentLaneOpReg =
              B.buildMerge(LLT::scalar(64),
                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
               .getReg(0);

            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

            if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, use the
              // merged pieces.
              ReadlanePieces.push_back(CurrentLaneOpReg);
            } else {
              // 32-bit element type.
              ReadlanePieces.push_back(CurrentLaneOpRegLo);
              ReadlanePieces.push_back(CurrentLaneOpRegHi);
            }
          } else {
            CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpReg)
              .addReg(UnmergePiece);
            ReadlanePieces.push_back(CurrentLaneOpReg);
          }

          Register NewCondReg
            = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          B.buildInstr(CmpOp)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(UnmergePiece);

          if (!First) {
            Register AndReg
              = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

            // If there are multiple operands to consider, AND the conditions
            // together.
            B.buildInstr(AMDGPU::S_AND_B64)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        }

        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
        // BUILD_VECTOR?
        if (OpTy.isVector()) {
          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        } else {
          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        }

        MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
      }
    }
  }
  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, saving the original EXEC value.
  B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(AMDGPU::S_XOR_B64_term)
    .addDef(AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
    .addReg(AMDGPU::EXEC);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(AMDGPU::S_MOV_B64_term)
    .addDef(AMDGPU::EXEC)
    .addReg(SaveExecReg);
}
// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank != &AMDGPU::VGPRRegBank)
    return;

  MachineIRBuilder B(MI);
  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
    .addDef(SGPR)
    .addReg(Reg);

  const TargetRegisterClass *Constrained =
      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(SGPR);
}
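// For example, the M0 operand of amdgcn_ds_ordered_add and the lane index of
// amdgcn_readlane must be SGPRs. When the selected mapping left such an
// operand in a VGPR, the readfirstlane built here assumes the value is
// wave-uniform instead of looping over lanes (see the callers in
// applyMappingImpl).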
// When regbankselect repairs registers, it will insert a repair instruction
// which defines the repaired register. Then it calls applyMapping and expects
// that the targets will either delete or rewrite the original instruction that
// wrote to the repaired registers. Because of this, we end up in a situation
// where we have two instructions defining the same registers.
static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
                                     Register Reg,
                                     const MachineInstr &MI) {
  // Is there some way we can assert that there are exactly 2 def instructions?
  for (MachineInstr &Other : MRI.def_instructions(Reg)) {
    if (&Other != &MI)
      return &Other;
  }

  return nullptr;
}
bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                                              MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;
  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));

  // If the pointer is an SGPR, we have nothing to do.
  if (SrcRegs.empty())
    return false;

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // We want to get the repair instruction now, because it will help us
  // determine which instruction the legalizer inserts that will also
  // write to DstReg.
  MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  MachineIRBuilder B(MI);

  unsigned SplitElts =
      MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
  const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
  ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
  GISelObserverWrapper Observer(&O);
  B.setChangeObserver(Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);
  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
    return false;

  // At this point, the legalizer has split the original load into smaller
  // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
  // that combines the outputs of the lower loads and writes it to DstReg.
  // The register bank selector has also added the RepairInst which writes to
  // DstReg.
  MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);

  // Replace the output of the LegalizedInst with a temporary register, since
  // RepairInst already defines DstReg.
  Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
  LegalizedInst->getOperand(0).setReg(TmpReg);
  B.setInsertPt(*RepairInst->getParent(), RepairInst);

  for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
    Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    B.buildConstant(IdxReg, DefIdx);
    MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
    B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
  }

  MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
  return true;
}
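// Roughly: a 256-bit <8 x s32> VGPR load is split by fewerElementsVector into
// two 128-bit <4 x s32> loads, and each element of the combined result is then
// forwarded to the corresponding repaired def register via
// G_EXTRACT_VECTOR_ELT.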
// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand.
static void substituteSimpleCopyRegs(
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
  if (!SrcReg.empty()) {
    assert(SrcReg.size() == 1);
    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
  }
}
void AMDGPURegisterBankInfo::applyMappingImpl(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  unsigned Opc = MI.getOpcode();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  switch (Opc) {
  case AMDGPU::G_SELECT: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src1Regs.empty() && Src2Regs.empty());
      break;
    }

    MachineIRBuilder B(MI);
    if (Src0Regs.empty())
      Src0Regs.push_back(MI.getOperand(1).getReg());
    else {
      assert(Src0Regs.size() == 1);
    }

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else {
      setRegsToType(MRI, Src1Regs, HalfTy);
    }

    if (Src2Regs.empty())
      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
    else
      setRegsToType(MRI, Src2Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
    B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);

    MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    // A 64-bit AND/OR/XOR is only available on the SALU, so split into 2
    // 32-bit ops if there is a VGPR input.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);
    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src0Regs.empty() && Src1Regs.empty());
      break;
    }

    assert(DefRegs.size() == 2);
    assert(Src0Regs.size() == Src1Regs.size() &&
           (Src0Regs.empty() || Src0Regs.size() == 2));

    // Depending on where the source registers came from, the generic code may
    // have decided to split the inputs already or not. If not, we still need to
    // extract the values.
    MachineIRBuilder B(MI);

    if (Src0Regs.empty())
      split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
    else
      setRegsToType(MRI, Src0Regs, HalfTy);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildInstr(Opc)
      .addDef(DefRegs[0])
      .addUse(Src0Regs[0])
      .addUse(Src1Regs[0]);

    B.buildInstr(Opc)
      .addDef(DefRegs[1])
      .addUse(Src0Regs[1])
      .addUse(Src1Regs[1]);

    MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_MUL: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(16))
      break;

    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    MachineFunction *MF = MI.getParent()->getParent();
    MachineIRBuilder B(MI);
    ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
    GISelObserverWrapper Observer(&ApplySALU);
    LegalizerHelper Helper(*MF, Observer, B);

    if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
        LegalizerHelper::Legalized)
      llvm_unreachable("widen scalar should have succeeded");
    return;
  }
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    Register DstReg = MI.getOperand(0).getReg();
    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    MachineFunction *MF = MI.getParent()->getParent();
    MachineIRBuilder B(MI);
    ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
    GISelObserverWrapper Observer(&ApplySALU);
    LegalizerHelper Helper(*MF, Observer, B);

    // Turn scalar min/max into a compare and select.
    LLT Ty = MRI.getType(DstReg);
    LLT S32 = LLT::scalar(32);
    LLT S16 = LLT::scalar(16);

    if (Ty == S16) {
      // Need to widen to s32, and expand as cmp + select.
      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widenScalar should have succeeded");

      // FIXME: This is relying on widenScalar leaving MI in place.
      if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("lower should have succeeded");
    } else {
      if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized)
        llvm_unreachable("lower should have succeeded");
    }

    return;
  }
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    bool Signed = Opc == AMDGPU::G_SEXT;

    MachineIRBuilder B(MI);
    const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);

    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::SCCRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that rounds to s64 when irregular
        // breakdowns are supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      const LLT S32 = LLT::scalar(32);
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      // Extend to 32-bit, and then extend the low half.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(DefRegs[0], SrcReg);

        // Replicate sign bit from 32-bit extended part.
        auto ShiftAmt = B.buildConstant(S32, 31);
        MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
        B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
      } else {
        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
        B.buildConstant(DefRegs[1], 0);
      }

      MRI.setRegBank(DstReg, *SrcBank);
      MI.eraseFromParent();
      return;
    }

    if (SrcTy != LLT::scalar(1))
      return;

    if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ?
        &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank;

      unsigned DstSize = DstTy.getSizeInBits();
      // 64-bit select is SGPR only
      const bool UseSel64 = DstSize > 32 &&
        SrcBank->getID() == AMDGPU::SCCRegBankID;

      // TODO: Should s16 select be legal?
      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
      auto True = B.buildConstant(SelType, Signed ? -1 : 1);
      auto False = B.buildConstant(SelType, 0);

      MRI.setRegBank(True.getReg(0), *DstBank);
      MRI.setRegBank(False.getReg(0), *DstBank);
      MRI.setRegBank(DstReg, *DstBank);

      if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) {
        B.buildSelect(DefRegs[0], SrcReg, True, False);
        B.buildCopy(DefRegs[1], DefRegs[0]);
      } else if (DstSize < 32) {
        auto Sel = B.buildSelect(SelType, SrcReg, True, False);
        MRI.setRegBank(Sel.getReg(0), *DstBank);
        B.buildTrunc(DstReg, Sel);
      } else {
        B.buildSelect(DstReg, SrcReg, True, False);
      }

      MI.eraseFromParent();
      return;
    }

    // Fixup the case with an s1 src that isn't a condition register. Use shifts
    // instead of introducing a compare to avoid an unnecessary condition
    // register (and since there are no scalar 16-bit compares).
    auto Ext = B.buildAnyExt(DstTy, SrcReg);
    auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
    auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);

    if (MI.getOpcode() == AMDGPU::G_SEXT)
      B.buildAShr(DstReg, Shl, ShiftAmt);
    else
      B.buildLShr(DstReg, Shl, ShiftAmt);

    MRI.setRegBank(DstReg, *SrcBank);
    MRI.setRegBank(Ext.getReg(0), *SrcBank);
    MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
    MRI.setRegBank(Shl.getReg(0), *SrcBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT:
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, { 2 });
    return;
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    case Intrinsic::amdgcn_s_buffer_load: {
      // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
      executeInWaterfallLoop(MI, MRI, { 2, 3 });
      return;
    }
    case Intrinsic::amdgcn_readlane: {
      substituteSimpleCopyRegs(OpdMapper, 2);

      assert(empty(OpdMapper.getVRegs(0)));
      assert(empty(OpdMapper.getVRegs(3)));

      // Make sure the index is an SGPR. It doesn't make sense to run this in a
      // waterfall loop, so assume it's a uniform value.
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_writelane: {
      assert(empty(OpdMapper.getVRegs(0)));
      assert(empty(OpdMapper.getVRegs(2)));
      assert(empty(OpdMapper.getVRegs(3)));

      substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
      constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    default:
      break;
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    case Intrinsic::amdgcn_buffer_load: {
      executeInWaterfallLoop(MI, MRI, { 2 });
      return;
    }
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
      assert(empty(OpdMapper.getVRegs(0)));
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // FIXME: Should this use a waterfall loop?
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    default:
      break;
    }
    break;
  }
  case AMDGPU::G_LOAD: {
    if (applyMappingWideLoad(MI, OpdMapper, MRI))
      return;
    break;
  }
  default:
    break;
  }

  return applyDefaultMapping(OpdMapper);
}
bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    if (!MI.getOperand(i).isReg())
      continue;
    Register Reg = MI.getOperand(i).getReg();
    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
      if (Bank->getID() == AMDGPU::VGPRRegBankID)
        return false;

      assert(Bank->getID() == AMDGPU::SGPRRegBankID ||
             Bank->getID() == AMDGPU::SCCRegBankID);
    }
  }
  return true;
}
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
    OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
  }
  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
  unsigned OpdIdx = 0;

  unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);

  if (MI.getOperand(OpdIdx).isIntrinsicID())
    OpdsMapping[OpdIdx++] = nullptr;

  Register Reg1 = MI.getOperand(OpdIdx).getReg();
  unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);

  unsigned DefaultBankID = Size1 == 1 ?
    AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
  unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);

  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);

  for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
    const MachineOperand &MO = MI.getOperand(OpdIdx);
    if (!MO.isReg())
      continue;

    unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
    OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Op = MI.getOperand(I);
    if (!Op.isReg())
      continue;

    unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
  unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  if (isInstrUniform(MI)) {
    // We have a uniform instruction so we want to use an SMRD load.
    ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
  } else {
    ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
    // FIXME: What would happen if we used SGPRRegBankID here?
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;
  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
  return Mapping;

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
}
unsigned
AMDGPURegisterBankInfo::getRegBankID(Register Reg,
                                     const MachineRegisterInfo &MRI,
                                     const TargetRegisterInfo &TRI,
                                     unsigned Default) const {

  const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
  return Bank ? Bank->getID() : Default;
}
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR-to-SGPR copy to be generated is illegal.
///
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  if (MI.isRegSequence()) {
    // If any input is a VGPR, the result must be a VGPR. The default handling
    // assumes any copy between banks is legal.
    unsigned BankID = AMDGPU::SGPRRegBankID;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
      // It doesn't make sense to use vcc or scc banks here, so just ignore
      // them.
      if (OpBank != AMDGPU::SGPRRegBankID) {
        BankID = AMDGPU::VGPRRegBankID;
        break;
      }
    }
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }
  // The default handling is broken and doesn't handle illegal SGPR->VGPR
  // copies properly.
  //
  // TODO: There are additional exec masking dependencies to analyze.
  if (MI.getOpcode() == TargetOpcode::G_PHI) {
    // TODO: Generate proper invalid bank enum.
    int ResultBank = -1;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      Register Reg = MI.getOperand(I).getReg();
      const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);

      // FIXME: Assuming VGPR for any undetermined inputs.
      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
        ResultBank = AMDGPU::VGPRRegBankID;
        break;
      }

      unsigned OpBank = Bank->getID();

      if (OpBank == AMDGPU::SCCRegBankID) {
        // There's only one SCC register, so a phi requires copying to SGPR.
        OpBank = AMDGPU::SGPRRegBankID;
      } else if (OpBank == AMDGPU::VCCRegBankID) {
        // vcc, vcc -> vcc
        // vcc, sgpr -> vgpr
        if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
          ResultBank = AMDGPU::VGPRRegBankID;
          break;
        }
      }

      ResultBank = OpBank;
    }

    assert(ResultBank != -1);

    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    const ValueMapping &ValMap =
      getValueMapping(0, Size, getRegBank(ResultBank));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }
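  // For example: an SCC input is treated as SGPR (there is only one physical
  // SCC, so a phi of it requires copying to an SGPR), a phi whose inputs are
  // all VCC stays in VCC, and any VGPR or undetermined input forces the
  // result to VGPR.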
  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
  if (Mapping.isValid())
    return Mapping;

  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  switch (MI.getOpcode()) {
  default:
    return getInvalidInstructionMapping();
1599 case AMDGPU::G_XOR
: {
1600 unsigned Size
= MRI
.getType(MI
.getOperand(0).getReg()).getSizeInBits();
1602 const RegisterBank
*DstBank
1603 = getRegBank(MI
.getOperand(0).getReg(), MRI
, *TRI
);
1605 unsigned TargetBankID
= -1;
1606 unsigned BankLHS
= -1;
1607 unsigned BankRHS
= -1;
1609 TargetBankID
= DstBank
->getID();
1610 if (DstBank
== &AMDGPU::VCCRegBank
) {
1611 TargetBankID
= AMDGPU::VCCRegBankID
;
1612 BankLHS
= AMDGPU::VCCRegBankID
;
1613 BankRHS
= AMDGPU::VCCRegBankID
;
1614 } else if (DstBank
== &AMDGPU::SCCRegBank
) {
1615 TargetBankID
= AMDGPU::SCCRegBankID
;
1616 BankLHS
= AMDGPU::SGPRRegBankID
;
1617 BankRHS
= AMDGPU::SGPRRegBankID
;
1619 BankLHS
= getRegBankID(MI
.getOperand(1).getReg(), MRI
, *TRI
,
1620 AMDGPU::SGPRRegBankID
);
1621 BankRHS
= getRegBankID(MI
.getOperand(2).getReg(), MRI
, *TRI
,
1622 AMDGPU::SGPRRegBankID
);
1625 BankLHS
= getRegBankID(MI
.getOperand(1).getReg(), MRI
, *TRI
,
1626 AMDGPU::VCCRegBankID
);
1627 BankRHS
= getRegBankID(MI
.getOperand(2).getReg(), MRI
, *TRI
,
1628 AMDGPU::VCCRegBankID
);
1630 // Both inputs should be true booleans to produce a boolean result.
1631 if (BankLHS
== AMDGPU::VGPRRegBankID
|| BankRHS
== AMDGPU::VGPRRegBankID
) {
1632 TargetBankID
= AMDGPU::VGPRRegBankID
;
1633 } else if (BankLHS
== AMDGPU::VCCRegBankID
|| BankRHS
== AMDGPU::VCCRegBankID
) {
1634 TargetBankID
= AMDGPU::VCCRegBankID
;
1635 BankLHS
= AMDGPU::VCCRegBankID
;
1636 BankRHS
= AMDGPU::VCCRegBankID
;
1637 } else if (BankLHS
== AMDGPU::SGPRRegBankID
&& BankRHS
== AMDGPU::SGPRRegBankID
) {
1638 TargetBankID
= AMDGPU::SGPRRegBankID
;
1639 } else if (BankLHS
== AMDGPU::SCCRegBankID
|| BankRHS
== AMDGPU::SCCRegBankID
) {
1640 // The operation must be done on a 32-bit register, but it will set
1641 // scc. The result type could interchangably be SCC or SGPR, since
1642 // both values will be produced.
1643 TargetBankID
= AMDGPU::SCCRegBankID
;
1644 BankLHS
= AMDGPU::SGPRRegBankID
;
1645 BankRHS
= AMDGPU::SGPRRegBankID
;
1649 OpdsMapping
[0] = AMDGPU::getValueMapping(TargetBankID
, Size
);
1650 OpdsMapping
[1] = AMDGPU::getValueMapping(BankLHS
, Size
);
1651 OpdsMapping
[2] = AMDGPU::getValueMapping(BankRHS
, Size
);

    if (Size == 64) {
      if (isSALUMapping(MI)) {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
        OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
      } else {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
        unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                      *TRI /*, DefaultBankID*/);
        OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);

        unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                      *TRI /*, DefaultBankID*/);
        OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
      }

      break;
    }

    LLVM_FALLTHROUGH;
  }

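  // The integer ops below map to the SALU when every operand is uniform
  // (e.g. a uniform 32-bit G_ADD can become s_add_i32); anything with a
  // divergent operand falls through to the default VALU mapping.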
  case AMDGPU::G_GEP:
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_SADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_SSUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE:
  case AMDGPU::G_UMULH:
  case AMDGPU::G_SMULH:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX:
    if (isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    LLVM_FALLTHROUGH;

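  // The SALU has no floating-point instructions, so the FP operations below
  // always take the VALU mapping.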
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FPTOSI:
  case AMDGPU::G_FPTOUI:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_SITOFP:
  case AMDGPU::G_UITOFP:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FEXP2:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUND:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_IMPLICIT_DEF: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FCONSTANT:
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_BLOCK_ADDR: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_INSERT: {
    unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
                                          AMDGPU::VGPRRegBankID;
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
    OpdsMapping[3] = nullptr;
    break;
  }
  case AMDGPU::G_EXTRACT: {
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = nullptr;
    break;
  }
  case AMDGPU::G_MERGE_VALUES:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_CONCAT_VECTORS: {
    unsigned Bank = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    // Op1 and Dst should use the same register bank.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_INTTOPTR:
  case AMDGPU::G_PTRTOINT:
  case AMDGPU::G_CTLZ:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ:
  case AMDGPU::G_CTTZ_ZERO_UNDEF:
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BSWAP:
  case AMDGPU::G_BITREVERSE:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FNEG: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_TRUNC: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned Bank = getRegBankID(Src, MRI, *TRI);
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ANYEXT: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);

    unsigned DstBank;
    const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
    assert(SrcBank);
    switch (SrcBank->getID()) {
    case AMDGPU::SCCRegBankID:
    case AMDGPU::SGPRRegBankID:
      DstBank = AMDGPU::SGPRRegBankID;
      break;
    default:
      DstBank = AMDGPU::VGPRRegBankID;
      break;
    }

    // TODO: Should anyext be split into 32-bit part as well?
    if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
      OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
    } else {
      // Scalar extend can use 64-bit BFE, but VGPRs require extending to
      // 32-bits, and then to 64.
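      // E.g. a uniform i32->i64 sext can presumably select to a single
      // 64-bit scalar bitfield extract (s_bfe_i64), while the VGPR path does
      // a 32-bit extend and then assembles the 64-bit result from two
      // halves; getValueMappingSGPR64Only encodes that split.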
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
                                                         SrcSize);
    }
    break;
  }
  case AMDGPU::G_FCMP: {
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_STORE: {
    assert(MI.getOperand(0).isReg());
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    // FIXME: We need to specify a different reg bank once scalar stores
    // are supported.
    const ValueMapping *ValMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    // FIXME: Depending on the type of store, the pointer could be in
    // the SGPR Reg bank.
    // FIXME: Pointer size should be based on the address space.
    const ValueMapping *PtrMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);

    OpdsMapping[0] = ValMapping;
    OpdsMapping[1] = PtrMapping;
    break;
  }
  case AMDGPU::G_ICMP: {
    auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);

    bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
                     Op3Bank == AMDGPU::SGPRRegBankID &&
                     (Size == 32 || (Size == 64 &&
                      (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
                      MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64()));
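
    // i.e. a uniform 32-bit compare can use s_cmp_* (writing SCC), while a
    // divergent compare becomes v_cmp_* (writing a lane mask to VCC). Scalar
    // 64-bit compares only exist for eq/ne, which is what the
    // hasScalarCompareEq64() check guards.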
    unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
                            AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize);

    // The index can be in either bank if the source vector is in VGPRs.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
                            AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
    unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize);

    // The index can be in either bank if the source vector is in VGPRs.
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_UNMERGE_VALUES: {
    unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
                                        AMDGPU::VGPRRegBankID;

    // Op1 and Dst should use the same register bank.
    // FIXME: Shouldn't this be the default? Why do we need to handle this?
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
    }
    break;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_ubfe:
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbit:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_wqm:
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
      Register RSrc = MI.getOperand(2).getReg();   // SGPR
      Register Offset = MI.getOperand(3).getReg(); // SGPR/imm

      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
      unsigned Size3 = MRI.getType(Offset).getSizeInBits();

      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
      OpdsMapping[1] = nullptr; // intrinsic id

      // Lie and claim everything is legal, even though some need to be
      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
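      // (A waterfall loop roughly works by peeling uniform values out of a
      // divergent operand: read the first active lane's value with
      // v_readfirstlane_b32, mask exec down to the lanes that match it, run
      // the scalar-only operation, then repeat until all lanes are handled.)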
      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
      OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
      OpdsMapping[4] = nullptr;
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(
        getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(
        getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
                                               Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
                                               Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
      unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
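      // If the index is nonetheless in a VGPR, applyMapping is expected to
      // legalize it afterwards (presumably with a readfirstlane or an
      // enclosing waterfall loop).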
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
      // to legalize.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume:
    case Intrinsic::amdgcn_ds_fadd:
    case Intrinsic::amdgcn_ds_fmin:
    case Intrinsic::amdgcn_ds_fmax:
    case Intrinsic::amdgcn_atomic_inc:
    case Intrinsic::amdgcn_atomic_dec:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[0] = nullptr; // IntrinsicID
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      OpdsMapping[0] = nullptr; // IntrinsicID
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_buffer_load: {
      Register RSrc = MI.getOperand(2).getReg();   // SGPR
      Register VIndex = MI.getOperand(3).getReg(); // VGPR
      Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm

      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
      unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
      unsigned Size4 = MRI.getType(Offset).getSizeInBits();

      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
      OpdsMapping[1] = nullptr; // intrinsic id

      // Lie and claim everything is legal, even though some need to be
      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
      OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
      OpdsMapping[5] = nullptr;
      OpdsMapping[6] = nullptr;
      break;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID);
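
    // At selection time, an SCC condition with SGPR values can become
    // s_cselect_b32, while a VCC condition selects v_cndmask_b32; the 64-bit
    // case below additionally splits the value operands into 32-bit halves
    // on the VGPR path.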
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }
  case AMDGPU::G_LOAD:
    return getInstrMappingForLoad(MI);

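  // The atomics below operate on memory through VGPR addresses and data, so
  // all register operands take the VGPR mapping.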
  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    return getDefaultMappingAllVGPR(MI);
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SCCRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}