[AMDGPU] Select VGPR versions of MFMA if possible
[llvm-project.git] / llvm / lib / Target / AMDGPU / AMDGPURegisterBankInfo.cpp
blob 56693805cc360b6b96304bb85b7011715414a5f0
1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks:
16 /// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
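///
/// As a rough illustrative sketch (not part of the original comment), the same
/// boolean value is represented differently on the two sides after
/// RegBankSelect:
///
///   %cond:sgpr(s32) = ...   ; scalar (SALU) boolean, widened to 32 bits
///   %cond:vcc(s1)   = ...   ; vector (VALU) boolean, one bit per lane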
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
53 /// instructions). The limit is on unique SGPRs, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. It is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
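///
/// For example (illustrative, assuming a single constant bus slot as on
/// pre-gfx10 targets):
///
///   v_fma_f32 v0, s0, s0, v1   ; ok: one unique SGPR, read twice
///   v_fma_f32 v0, s0, s1, v1   ; not encodable: two unique SGPR operands
///
/// so one of s0/s1 would have to be copied to a VGPR first.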
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
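///
/// A minimal sketch of the resulting mapping (illustrative only):
///
///   %0:sgpr(s32) = ...
///   %1:vgpr(s32) = ...
///   %2:vgpr(s32) = COPY %0(s32)      ; copy inserted for the SGPR operand
///   %3:vgpr(s32) = G_ADD %2, %1      ; VALU operation uses only VGPR inputs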
68 ///
69 //===----------------------------------------------------------------------===//
71 #include "AMDGPURegisterBankInfo.h"
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
80 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
81 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
82 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
83 #include "llvm/IR/IntrinsicsAMDGPU.h"
85 #define GET_TARGET_REGBANK_IMPL
86 #include "AMDGPUGenRegisterBank.inc"
88 // This file will be TableGen'ed at some point.
89 #include "AMDGPUGenRegisterBankInfo.def"
91 using namespace llvm;
92 using namespace MIPatternMatch;
94 namespace {
96 // Observer to apply a register bank to new registers created by LegalizerHelper.
97 class ApplyRegBankMapping final : public GISelChangeObserver {
98 private:
99 const AMDGPURegisterBankInfo &RBI;
100 MachineRegisterInfo &MRI;
101 const RegisterBank *NewBank;
102 SmallVector<MachineInstr *, 4> NewInsts;
104 public:
105 ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
106 MachineRegisterInfo &MRI_, const RegisterBank *RB)
107 : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
109 ~ApplyRegBankMapping() {
110 for (MachineInstr *MI : NewInsts)
111 applyBank(*MI);
114 /// Apply the mapped register bank to any registers without a set register class or bank.
115 void applyBank(MachineInstr &MI) {
116 const unsigned Opc = MI.getOpcode();
117 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
118 Opc == AMDGPU::G_SEXT) {
119 // LegalizerHelper wants to use the basic legalization artifacts when
120 // widening etc. We don't handle selection with vcc in artifact sources,
121 // so we need to use a select instead to handle these properly.
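// As an illustrative sketch of the rewrite performed below (operand banks
// shown for clarity), a vcc-sourced extension such as
//   %dst:vgpr(s32) = G_SEXT %src:vcc(s1)
// becomes
//   %neg1:vgpr(s32) = G_CONSTANT i32 -1
//   %zero:vgpr(s32) = G_CONSTANT i32 0
//   %dst:vgpr(s32) = G_SELECT %src:vcc(s1), %neg1, %zero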
122 Register DstReg = MI.getOperand(0).getReg();
123 Register SrcReg = MI.getOperand(1).getReg();
124 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
125 if (SrcBank == &AMDGPU::VCCRegBank) {
126 const LLT S32 = LLT::scalar(32);
127 assert(MRI.getType(SrcReg) == LLT::scalar(1));
128 assert(MRI.getType(DstReg) == S32);
129 assert(NewBank == &AMDGPU::VGPRRegBank);
131 // Replace the extension with a select, which really uses the boolean
132 // source.
133 MachineIRBuilder B(MI);
134 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
135 auto False = B.buildConstant(S32, 0);
136 B.buildSelect(DstReg, SrcReg, True, False);
137 MRI.setRegBank(True.getReg(0), *NewBank);
138 MRI.setRegBank(False.getReg(0), *NewBank);
139 MI.eraseFromParent();
142 assert(!MRI.getRegClassOrRegBank(DstReg));
143 MRI.setRegBank(DstReg, *NewBank);
144 return;
147 #ifndef NDEBUG
148 if (Opc == AMDGPU::G_TRUNC) {
149 Register DstReg = MI.getOperand(0).getReg();
150 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
151 assert(DstBank != &AMDGPU::VCCRegBank);
153 #endif
155 for (MachineOperand &Op : MI.operands()) {
156 if (!Op.isReg())
157 continue;
159 // We may see physical registers if building a real MI
160 Register Reg = Op.getReg();
161 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
162 continue;
164 const RegisterBank *RB = NewBank;
165 if (MRI.getType(Reg) == LLT::scalar(1)) {
166 assert(NewBank == &AMDGPU::VGPRRegBank &&
167 "s1 operands should only be used for vector bools");
168 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
169 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
170 "not expecting legalization artifacts here");
171 RB = &AMDGPU::VCCRegBank;
174 MRI.setRegBank(Reg, *RB);
178 void erasingInstr(MachineInstr &MI) override {}
180 void createdInstr(MachineInstr &MI) override {
181 // At this point, the instruction was just inserted and has no operands.
182 NewInsts.push_back(&MI);
185 void changingInstr(MachineInstr &MI) override {}
186 void changedInstr(MachineInstr &MI) override {
187 // FIXME: In principle we should probably add the instruction to NewInsts,
188 // but the way the LegalizerHelper uses the observer, we will always see the
189 // registers we need to set the regbank on also referenced in a new
190 // instruction.
195 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
196 : AMDGPUGenRegisterBankInfo(),
197 Subtarget(ST),
198 TRI(Subtarget.getRegisterInfo()),
199 TII(Subtarget.getInstrInfo()) {
201 // HACK: Until this is fully tablegen'd.
202 static llvm::once_flag InitializeRegisterBankFlag;
204 static auto InitializeRegisterBankOnce = [this]() {
205 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
206 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
207 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
208 (void)this;
211 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
214 static bool isVectorRegisterBank(const RegisterBank &Bank) {
215 unsigned BankID = Bank.getID();
216 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
219 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
220 const RegisterBank &Src,
221 unsigned Size) const {
222 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
223 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
224 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
225 return std::numeric_limits<unsigned>::max();
228 // Bool values are tricky, because the meaning is based on context. The SCC
229 // and VCC banks are for the natural scalar and vector conditions produced by
230 // a compare.
232 // Legalization doesn't know about the necessary context, so an s1 use may
233 // have been a truncate from an arbitrary value, in which case a copy (lowered
234 // as a compare with 0) needs to be inserted.
235 if (Size == 1 &&
236 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
237 (isVectorRegisterBank(Src) ||
238 Src.getID() == AMDGPU::SGPRRegBankID ||
239 Src.getID() == AMDGPU::VCCRegBankID))
240 return std::numeric_limits<unsigned>::max();
242 // There is no direct copy between AGPRs.
243 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
244 Src.getID() == AMDGPU::AGPRRegBankID)
245 return 4;
247 return RegisterBankInfo::copyCost(Dst, Src, Size);
250 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
251 const ValueMapping &ValMapping,
252 const RegisterBank *CurBank) const {
253 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
254 // VGPR.
255 // FIXME: Is there a better way to do this?
256 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
257 return 10; // This is expensive.
259 assert(ValMapping.NumBreakDowns == 2 &&
260 ValMapping.BreakDown[0].Length == 32 &&
261 ValMapping.BreakDown[0].StartIdx == 0 &&
262 ValMapping.BreakDown[1].Length == 32 &&
263 ValMapping.BreakDown[1].StartIdx == 32 &&
264 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
266 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
267 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
268 // want.
270 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
271 // alignment restrictions, but this probably isn't important.
272 return 1;
275 const RegisterBank &
276 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
277 LLT Ty) const {
278 if (&RC == &AMDGPU::SReg_1RegClass)
279 return AMDGPU::VCCRegBank;
281 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
282 // VCC-like use.
283 if (TRI->isSGPRClass(&RC)) {
284 // FIXME: This probably came from a copy from a physical register, which
285 // should be inferable from the copied to-type. We don't have many boolean
286 // physical register constraints so just assume a normal SGPR for now.
287 if (!Ty.isValid())
288 return AMDGPU::SGPRRegBank;
290 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
293 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
296 template <unsigned NumOps>
297 RegisterBankInfo::InstructionMappings
298 AMDGPURegisterBankInfo::addMappingFromTable(
299 const MachineInstr &MI, const MachineRegisterInfo &MRI,
300 const std::array<unsigned, NumOps> RegSrcOpIdx,
301 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
303 InstructionMappings AltMappings;
305 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
307 unsigned Sizes[NumOps];
308 for (unsigned I = 0; I < NumOps; ++I) {
309 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
310 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
313 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
314 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
315 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
318 // getInstrMapping's default mapping uses ID 1, so start at 2.
319 unsigned MappingID = 2;
320 for (const auto &Entry : Table) {
321 for (unsigned I = 0; I < NumOps; ++I) {
322 int OpIdx = RegSrcOpIdx[I];
323 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
326 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
327 getOperandsMapping(Operands),
328 Operands.size()));
331 return AltMappings;
334 RegisterBankInfo::InstructionMappings
335 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
336 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
337 switch (MI.getIntrinsicID()) {
338 case Intrinsic::amdgcn_readlane: {
339 static const OpRegBankEntry<3> Table[2] = {
340 // Perfectly legal.
341 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
343 // Need a readfirstlane for the index.
344 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
347 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
348 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
350 case Intrinsic::amdgcn_writelane: {
351 static const OpRegBankEntry<4> Table[4] = {
352 // Perfectly legal.
353 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
355 // Need readfirstlane of first op
356 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
358 // Need readfirstlane of second op
359 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
361 // Need readfirstlane of both ops
362 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
365 // dst, value, lane, old value
366 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
367 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
369 default:
370 return RegisterBankInfo::getInstrAlternativeMappings(MI);
374 RegisterBankInfo::InstructionMappings
375 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
376 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
378 switch (MI.getIntrinsicID()) {
379 case Intrinsic::amdgcn_s_buffer_load: {
380 static const OpRegBankEntry<2> Table[4] = {
381 // Perfectly legal.
382 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
384 // Only need 1 register in loop
385 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
387 // Have to waterfall the resource.
388 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
390 // Have to waterfall the resource, and the offset.
391 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
394 // rsrc, offset
395 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
396 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
398 case Intrinsic::amdgcn_ds_ordered_add:
399 case Intrinsic::amdgcn_ds_ordered_swap: {
400 // VGPR = M0, VGPR
401 static const OpRegBankEntry<3> Table[2] = {
402 // Perfectly legal.
403 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
405 // Need a readfirstlane for m0
406 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
409 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
410 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
412 case Intrinsic::amdgcn_s_sendmsg:
413 case Intrinsic::amdgcn_s_sendmsghalt: {
414 // FIXME: Should have no register for immediate
415 static const OpRegBankEntry<1> Table[2] = {
416 // Perfectly legal.
417 { { AMDGPU::SGPRRegBankID }, 1 },
419 // Need readlane
420 { { AMDGPU::VGPRRegBankID }, 3 }
423 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
424 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
426 default:
427 return RegisterBankInfo::getInstrAlternativeMappings(MI);
431 // FIXME: Returns uniform if there's no source value information. This is
432 // probably wrong.
433 static bool isScalarLoadLegal(const MachineInstr &MI) {
434 if (!MI.hasOneMemOperand())
435 return false;
437 const MachineMemOperand *MMO = *MI.memoperands_begin();
438 const unsigned AS = MMO->getAddrSpace();
439 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
440 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
441 // Require 4-byte alignment.
442 return MMO->getAlign() >= Align(4) &&
443 // Can't do a scalar atomic load.
444 !MMO->isAtomic() &&
445 // Don't use scalar loads for volatile accesses to non-constant address
446 // spaces.
447 (IsConst || !MMO->isVolatile()) &&
448 // Memory must be known constant, or not written before this load.
449 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
450 AMDGPUInstrInfo::isUniformMMO(MMO);
453 RegisterBankInfo::InstructionMappings
454 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
455 const MachineInstr &MI) const {
457 const MachineFunction &MF = *MI.getParent()->getParent();
458 const MachineRegisterInfo &MRI = MF.getRegInfo();
461 InstructionMappings AltMappings;
462 switch (MI.getOpcode()) {
463 case TargetOpcode::G_CONSTANT: {
464 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
465 if (Size == 1) {
466 static const OpRegBankEntry<1> Table[3] = {
467 { { AMDGPU::VGPRRegBankID }, 1 },
468 { { AMDGPU::SGPRRegBankID }, 1 },
469 { { AMDGPU::VCCRegBankID }, 1 }
472 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
475 LLVM_FALLTHROUGH;
477 case TargetOpcode::G_FCONSTANT:
478 case TargetOpcode::G_FRAME_INDEX:
479 case TargetOpcode::G_GLOBAL_VALUE: {
480 static const OpRegBankEntry<1> Table[2] = {
481 { { AMDGPU::VGPRRegBankID }, 1 },
482 { { AMDGPU::SGPRRegBankID }, 1 }
485 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
487 case TargetOpcode::G_AND:
488 case TargetOpcode::G_OR:
489 case TargetOpcode::G_XOR: {
490 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
492 if (Size == 1) {
493 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
494 const InstructionMapping &SCCMapping = getInstructionMapping(
495 1, 1, getOperandsMapping(
496 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
497 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
498 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
499 3); // Num Operands
500 AltMappings.push_back(&SCCMapping);
502 const InstructionMapping &VCCMapping0 = getInstructionMapping(
503 2, 1, getOperandsMapping(
504 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
505 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
506 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
507 3); // Num Operands
508 AltMappings.push_back(&VCCMapping0);
509 return AltMappings;
512 if (Size != 64)
513 break;
515 const InstructionMapping &SSMapping = getInstructionMapping(
516 1, 1, getOperandsMapping(
517 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
518 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
519 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
520 3); // Num Operands
521 AltMappings.push_back(&SSMapping);
523 const InstructionMapping &VVMapping = getInstructionMapping(
524 2, 2, getOperandsMapping(
525 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
526 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
527 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
528 3); // Num Operands
529 AltMappings.push_back(&VVMapping);
530 break;
532 case TargetOpcode::G_LOAD:
533 case TargetOpcode::G_ZEXTLOAD:
534 case TargetOpcode::G_SEXTLOAD: {
535 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
536 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
537 unsigned PtrSize = PtrTy.getSizeInBits();
538 unsigned AS = PtrTy.getAddressSpace();
540 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
541 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
542 isScalarLoadLegal(MI)) {
543 const InstructionMapping &SSMapping = getInstructionMapping(
544 1, 1, getOperandsMapping(
545 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
546 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
547 2); // Num Operands
548 AltMappings.push_back(&SSMapping);
551 const InstructionMapping &VVMapping = getInstructionMapping(
552 2, 1,
553 getOperandsMapping(
554 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
555 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
556 2); // Num Operands
557 AltMappings.push_back(&VVMapping);
559 // It may be possible to have a vgpr = load sgpr mapping here, because
560 // the mubuf instructions support this kind of load, but probably for only
561 // gfx7 and older. However, the addressing mode matching in the instruction
562 // selector should be able to do a better job of detecting and selecting
563 // these kinds of loads from the vgpr = load vgpr mapping.
565 return AltMappings;
568 case TargetOpcode::G_SELECT: {
569 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
570 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
571 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
572 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
573 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
574 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
575 4); // Num Operands
576 AltMappings.push_back(&SSMapping);
578 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
579 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
580 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
581 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
582 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
583 4); // Num Operands
584 AltMappings.push_back(&VVMapping);
586 return AltMappings;
588 case TargetOpcode::G_UADDE:
589 case TargetOpcode::G_USUBE:
590 case TargetOpcode::G_SADDE:
591 case TargetOpcode::G_SSUBE: {
592 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
593 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
594 getOperandsMapping(
595 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
596 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
597 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
598 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
599 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
600 5); // Num Operands
601 AltMappings.push_back(&SSMapping);
603 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
604 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
605 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
606 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
607 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
608 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
609 5); // Num Operands
610 AltMappings.push_back(&VVMapping);
611 return AltMappings;
613 case AMDGPU::G_BRCOND: {
614 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
616 // TODO: Change type to 32 for scalar
617 const InstructionMapping &SMapping = getInstructionMapping(
618 1, 1, getOperandsMapping(
619 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
620 2); // Num Operands
621 AltMappings.push_back(&SMapping);
623 const InstructionMapping &VMapping = getInstructionMapping(
624 1, 1, getOperandsMapping(
625 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
626 2); // Num Operands
627 AltMappings.push_back(&VMapping);
628 return AltMappings;
630 case AMDGPU::G_INTRINSIC:
631 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
632 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
633 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
634 default:
635 break;
637 return RegisterBankInfo::getInstrAlternativeMappings(MI);
640 void AMDGPURegisterBankInfo::split64BitValueForMapping(
641 MachineIRBuilder &B,
642 SmallVector<Register, 2> &Regs,
643 LLT HalfTy,
644 Register Reg) const {
645 assert(HalfTy.getSizeInBits() == 32);
646 MachineRegisterInfo *MRI = B.getMRI();
647 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
648 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
649 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
650 MRI->setRegBank(LoLHS, *Bank);
651 MRI->setRegBank(HiLHS, *Bank);
653 Regs.push_back(LoLHS);
654 Regs.push_back(HiLHS);
656 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
657 .addDef(LoLHS)
658 .addDef(HiLHS)
659 .addUse(Reg);
662 /// Replace the current type each register in \p Regs has with \p NewTy
663 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
664 LLT NewTy) {
665 for (Register Reg : Regs) {
666 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
667 MRI.setType(Reg, NewTy);
671 static LLT getHalfSizedType(LLT Ty) {
672 if (Ty.isVector()) {
673 assert(Ty.getElementCount().isKnownMultipleOf(2));
674 return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
675 Ty.getElementType());
678 assert(Ty.getScalarSizeInBits() % 2 == 0);
679 return LLT::scalar(Ty.getScalarSizeInBits() / 2);
682 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
683 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
684 /// execute the instruction for each unique combination of values in all lanes
685 /// in the wave. The block will be split such that the rest of the instructions are
686 /// moved to a new block.
688 /// Essentially performs this loop:
690 /// Save Execution Mask
691 /// For (Lane : Wavefront) {
692 /// Enable Lane, Disable all other lanes
693 /// SGPR = read SGPR value for current lane from VGPR
694 /// VGPRResult[Lane] = use_op SGPR
695 /// }
696 /// Restore Execution Mask
698 /// In practice the loop compares the value read from the first active lane
699 /// against all lanes, so each unique combination of values is only processed once.
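///
/// A rough sketch of the emitted control flow for a single 32-bit operand
/// (illustrative only; the code below is authoritative):
///
///   ; original block
///   s_saveexec = s_mov exec
///   loop:
///     s_val  = v_readfirstlane_b32 v_op
///     s_cond = v_cmp_eq_u32 s_val, v_op
///     s_new  = s_and_saveexec s_cond
///     ... original instruction(s), now using s_val ...
///     exec   = s_xor_term exec, s_new
///     SI_WATERFALL_LOOP loop
///   restore block:
///     exec   = s_mov_term s_saveexec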
700 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
701 MachineIRBuilder &B,
702 iterator_range<MachineBasicBlock::iterator> Range,
703 SmallSet<Register, 4> &SGPROperandRegs,
704 MachineRegisterInfo &MRI) const {
706 // Track use registers which have already been expanded with a readfirstlane
707 // sequence. This may have multiple uses if moving a sequence.
708 DenseMap<Register, Register> WaterfalledRegMap;
710 MachineBasicBlock &MBB = B.getMBB();
711 MachineFunction *MF = &B.getMF();
713 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
714 const unsigned WaveAndOpc = Subtarget.isWave32() ?
715 AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
716 const unsigned MovExecOpc =
717 Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
718 const unsigned MovExecTermOpc =
719 Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
721 const unsigned XorTermOpc = Subtarget.isWave32() ?
722 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
723 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
724 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
725 const unsigned ExecReg = Subtarget.isWave32() ?
726 AMDGPU::EXEC_LO : AMDGPU::EXEC;
728 #ifndef NDEBUG
729 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
730 #endif
732 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
733 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
735 // Don't bother using generic instructions/registers for the exec mask.
736 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
737 .addDef(InitSaveExecReg);
739 Register PhiExec = MRI.createVirtualRegister(WaveRC);
740 Register NewExec = MRI.createVirtualRegister(WaveRC);
742 // To insert the loop we need to split the block. Move everything after the
743 // instruction range to a new block, and move the range itself into a new loop block.
744 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
745 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
746 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
747 MachineFunction::iterator MBBI(MBB);
748 ++MBBI;
749 MF->insert(MBBI, LoopBB);
750 MF->insert(MBBI, RestoreExecBB);
751 MF->insert(MBBI, RemainderBB);
753 LoopBB->addSuccessor(RestoreExecBB);
754 LoopBB->addSuccessor(LoopBB);
756 // Move the rest of the block into a new block.
757 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
758 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
760 MBB.addSuccessor(LoopBB);
761 RestoreExecBB->addSuccessor(RemainderBB);
763 B.setInsertPt(*LoopBB, LoopBB->end());
765 B.buildInstr(TargetOpcode::PHI)
766 .addDef(PhiExec)
767 .addReg(InitSaveExecReg)
768 .addMBB(&MBB)
769 .addReg(NewExec)
770 .addMBB(LoopBB);
772 const DebugLoc &DL = B.getDL();
774 MachineInstr &FirstInst = *Range.begin();
776 // Move the instruction into the loop. Note we moved everything after
777 // Range.end() already into a new block, so Range.end() is no longer valid.
778 LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
780 // Figure out the iterator range after splicing the instructions.
781 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
782 auto NewEnd = LoopBB->end();
784 MachineBasicBlock::iterator I = Range.begin();
785 B.setInsertPt(*LoopBB, I);
787 Register CondReg;
789 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
791 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
792 for (MachineOperand &Op : MI.uses()) {
793 if (!Op.isReg() || Op.isDef())
794 continue;
796 Register OldReg = Op.getReg();
797 if (!SGPROperandRegs.count(OldReg))
798 continue;
800 // See if we already processed this register in another instruction in the
801 // sequence.
802 auto OldVal = WaterfalledRegMap.find(OldReg);
803 if (OldVal != WaterfalledRegMap.end()) {
804 Op.setReg(OldVal->second);
805 continue;
808 Register OpReg = Op.getReg();
809 LLT OpTy = MRI.getType(OpReg);
811 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
812 if (OpBank != &AMDGPU::VGPRRegBank) {
813 // Insert copy from AGPR to VGPR before the loop.
814 B.setMBB(MBB);
815 OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
816 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
817 B.setInstr(*I);
820 unsigned OpSize = OpTy.getSizeInBits();
822 // Can only do a readlane of 32-bit pieces.
823 if (OpSize == 32) {
824 // Avoid extra copies in the simple case of one 32-bit register.
825 Register CurrentLaneOpReg
826 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
827 MRI.setType(CurrentLaneOpReg, OpTy);
829 constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
830 // Read the next variant <- also loop target.
831 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
832 CurrentLaneOpReg)
833 .addReg(OpReg);
835 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
836 bool First = CondReg == AMDGPU::NoRegister;
837 if (First)
838 CondReg = NewCondReg;
840 // Compare the value just read against the value in every lane.
841 B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
842 .addDef(NewCondReg)
843 .addReg(CurrentLaneOpReg)
844 .addReg(OpReg);
845 Op.setReg(CurrentLaneOpReg);
847 if (!First) {
848 Register AndReg = MRI.createVirtualRegister(WaveRC);
850 // If there are multiple operands to consider, AND the conditions together.
851 B.buildInstr(WaveAndOpc)
852 .addDef(AndReg)
853 .addReg(NewCondReg)
854 .addReg(CondReg);
855 CondReg = AndReg;
857 } else {
858 LLT S32 = LLT::scalar(32);
859 SmallVector<Register, 8> ReadlanePieces;
861 // The compares can be done as 64-bit, but the extract needs to be done
862 // in 32-bit pieces.
864 bool Is64 = OpSize % 64 == 0;
866 unsigned UnmergeTySize = Is64 ? 64 : 32;
867 unsigned CmpOp =
868 Is64 ? AMDGPU::V_CMP_EQ_U64_e64 : AMDGPU::V_CMP_EQ_U32_e64;
870 // Insert the unmerge before the loop.
872 B.setMBB(MBB);
873 unsigned NumPieces = OpSize / UnmergeTySize;
874 SmallVector<Register, 8> UnmergePieces;
875 if (NumPieces == 1) {
876 UnmergePieces.push_back(OpReg);
877 } else {
878 LLT UnmergeTy = LLT::scalar(UnmergeTySize);
879 MachineInstrBuilder Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
880 for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx)
881 UnmergePieces.push_back(Unmerge.getReg(PieceIdx));
883 B.setInstr(*I);
885 for (Register UnmergePiece : UnmergePieces) {
886 Register CurrentLaneOpReg;
887 if (Is64) {
888 Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
889 Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
891 MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
892 MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
893 MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
895 // Read the next variant <- also loop target.
896 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
897 CurrentLaneOpRegLo)
898 .addReg(UnmergePiece, 0, AMDGPU::sub0);
900 // Read the next variant <- also loop target.
901 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
902 CurrentLaneOpRegHi)
903 .addReg(UnmergePiece, 0, AMDGPU::sub1);
905 CurrentLaneOpReg =
906 B.buildMerge(LLT::scalar(64),
907 {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
908 .getReg(0);
910 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
912 if (OpTy.getScalarSizeInBits() == 64) {
913 // If we need to produce a 64-bit element vector, use the
914 // merged pieces.
915 ReadlanePieces.push_back(CurrentLaneOpReg);
916 } else {
917 // 32-bit element type.
918 ReadlanePieces.push_back(CurrentLaneOpRegLo);
919 ReadlanePieces.push_back(CurrentLaneOpRegHi);
921 } else {
922 CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
923 MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
924 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
926 // Read the next variant <- also loop target.
927 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
928 CurrentLaneOpReg)
929 .addReg(UnmergePiece);
930 ReadlanePieces.push_back(CurrentLaneOpReg);
933 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
934 bool First = CondReg == AMDGPU::NoRegister;
935 if (First)
936 CondReg = NewCondReg;
938 B.buildInstr(CmpOp)
939 .addDef(NewCondReg)
940 .addReg(CurrentLaneOpReg)
941 .addReg(UnmergePiece);
943 if (!First) {
944 Register AndReg = MRI.createVirtualRegister(WaveRC);
946 // If there are multiple operands to consider, AND the conditions together.
947 B.buildInstr(WaveAndOpc)
948 .addDef(AndReg)
949 .addReg(NewCondReg)
950 .addReg(CondReg);
951 CondReg = AndReg;
955 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
956 // BUILD_VECTOR
957 if (OpTy.isVector()) {
958 auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
959 Op.setReg(Merge.getReg(0));
960 MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
961 } else if (ReadlanePieces.size() > 1) {
962 auto Merge = B.buildMerge(OpTy, ReadlanePieces);
963 Op.setReg(Merge.getReg(0));
964 MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
965 } else {
966 Op.setReg(ReadlanePieces[0]);
970 // Make sure we don't process this register again.
971 WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
975 // Update EXEC, save the original EXEC value to VCC.
976 B.buildInstr(AndSaveExecOpc)
977 .addDef(NewExec)
978 .addReg(CondReg, RegState::Kill);
980 MRI.setSimpleHint(NewExec, CondReg);
982 B.setInsertPt(*LoopBB, LoopBB->end());
984 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
985 B.buildInstr(XorTermOpc)
986 .addDef(ExecReg)
987 .addReg(ExecReg)
988 .addReg(NewExec);
990 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
991 // s_cbranch_scc0?
993 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
994 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
996 // Save the EXEC mask before the loop.
997 BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
998 .addReg(ExecReg);
1000 // Restore the EXEC mask after the loop.
1001 B.setMBB(*RestoreExecBB);
1002 B.buildInstr(MovExecTermOpc)
1003 .addDef(ExecReg)
1004 .addReg(SaveExecReg);
1006 // Set the insert point after the original instruction, so any new
1007 // instructions will be in the remainder.
1008 B.setInsertPt(*RemainderBB, RemainderBB->begin());
1010 return true;
1013 // Return any unique registers used by \p MI at \p OpIndices that need to be
1014 // handled in a waterfall loop. Returns these registers in \p
1015 // SGPROperandRegs. Returns true if there are any operands to handle and a
1016 // waterfall loop is necessary.
1017 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1018 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1019 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1020 for (unsigned Op : OpIndices) {
1021 assert(MI.getOperand(Op).isUse());
1022 Register Reg = MI.getOperand(Op).getReg();
1023 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1024 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
1025 SGPROperandRegs.insert(Reg);
1028 // No operands need to be replaced, so no need to loop.
1029 return !SGPROperandRegs.empty();
1032 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1033 MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1034 ArrayRef<unsigned> OpIndices) const {
1035 // Use a set to avoid extra readfirstlanes in the case where multiple operands
1036 // are the same register.
1037 SmallSet<Register, 4> SGPROperandRegs;
1039 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1040 return false;
1042 MachineBasicBlock::iterator I = MI.getIterator();
1043 return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1044 SGPROperandRegs, MRI);
1047 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1048 MachineInstr &MI, MachineRegisterInfo &MRI,
1049 ArrayRef<unsigned> OpIndices) const {
1050 MachineIRBuilder B(MI);
1051 return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1054 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
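// Illustrative sketch (not from the original comment): a VGPR operand %v that
// must be uniform is rewritten as
//   %s:sreg_32 = V_READFIRSTLANE_B32 %v:vgpr_32
// and the instruction then uses %s in place of %v.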
1055 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1056 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1057 Register Reg = MI.getOperand(OpIdx).getReg();
1058 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1059 if (Bank == &AMDGPU::SGPRRegBank)
1060 return;
1062 LLT Ty = MRI.getType(Reg);
1063 MachineIRBuilder B(MI);
1065 if (Bank != &AMDGPU::VGPRRegBank) {
1066 // We need to copy from AGPR to VGPR
1067 Reg = B.buildCopy(Ty, Reg).getReg(0);
1068 MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
1071 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1072 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1073 .addDef(SGPR)
1074 .addReg(Reg);
1076 MRI.setType(SGPR, Ty);
1078 const TargetRegisterClass *Constrained =
1079 constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1080 (void)Constrained;
1081 assert(Constrained && "Failed to constrain readfirstlane src reg");
1083 MI.getOperand(OpIdx).setReg(SGPR);
1086 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1087 /// rest will be in the remainder.
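/// e.g. (illustrative) splitUnequalType(<3 x s32>, 64) gives {<2 x s32>, s32},
/// and splitUnequalType(s96, 64) gives {s64, s32}.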
1088 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1089 unsigned TotalSize = Ty.getSizeInBits();
1090 if (!Ty.isVector())
1091 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1093 LLT EltTy = Ty.getElementType();
1094 unsigned EltSize = EltTy.getSizeInBits();
1095 assert(FirstSize % EltSize == 0);
1097 unsigned FirstPartNumElts = FirstSize / EltSize;
1098 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1100 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1101 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1104 static LLT widen96To128(LLT Ty) {
1105 if (!Ty.isVector())
1106 return LLT::scalar(128);
1108 LLT EltTy = Ty.getElementType();
1109 assert(128 % EltTy.getSizeInBits() == 0);
1110 return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1113 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1114 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1115 MachineRegisterInfo &MRI) const {
1116 Register DstReg = MI.getOperand(0).getReg();
1117 const LLT LoadTy = MRI.getType(DstReg);
1118 unsigned LoadSize = LoadTy.getSizeInBits();
1119 const unsigned MaxNonSmrdLoadSize = 128;
1121 const RegisterBank *DstBank =
1122 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1123 if (DstBank == &AMDGPU::SGPRRegBank) {
1124 // There are some special cases that we need to look at for 32 bit and 96
1125 // bit SGPR loads; otherwise we have nothing to do.
1126 if (LoadSize != 32 && LoadSize != 96)
1127 return false;
1129 MachineMemOperand *MMO = *MI.memoperands_begin();
1130 const unsigned MemSize = 8 * MMO->getSize();
1131 // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1132 // 32 bit. Check to see if we need to widen the memory access; 8 and 16 bit
1133 // scalar loads have a load size of 32 but a memory access size of less
1134 // than 32.
1135 if (LoadSize == 32 &&
1136 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1137 return false;
1139 Register PtrReg = MI.getOperand(1).getReg();
1141 ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1142 MachineIRBuilder B(MI, O);
1144 if (LoadSize == 32) {
1145 // This is an extending load from a sub-dword size. Widen the memory
1146 // access size to 4 bytes and clear the extra high bits appropriately
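// A rough sketch of the G_SEXTLOAD case handled below (illustrative):
//   %d:sgpr(s32) = G_SEXTLOAD %ptr :: (load (s8))
// becomes
//   %w:sgpr(s32) = G_LOAD %ptr :: (load (s32))
//   %d:sgpr(s32) = G_SEXT_INREG %w, 8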
1147 const LLT S32 = LLT::scalar(32);
1148 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1149 // Must extend the sign bit into higher bits for a G_SEXTLOAD
1150 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1151 B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1152 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1153 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1154 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1155 B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1156 } else
1157 // We do not need to touch the higher bits for regular loads.
1158 B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1159 } else {
1160 // 96-bit loads are only available for vector loads. We need to split this
1161 // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit load).
1162 if (MMO->getAlign() < Align(16)) {
1163 MachineFunction *MF = MI.getParent()->getParent();
1164 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1165 MachineIRBuilder B(MI, ApplyBank);
1166 LegalizerHelper Helper(*MF, ApplyBank, B);
1167 LLT Part64, Part32;
1168 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1169 if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1170 LegalizerHelper::Legalized)
1171 return false;
1172 return true;
1173 } else {
1174 LLT WiderTy = widen96To128(LoadTy);
1175 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1176 if (WiderTy.isScalar())
1177 B.buildTrunc(MI.getOperand(0), WideLoad);
1178 else {
1179 B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1180 WideLoad);
1185 MI.eraseFromParent();
1186 return true;
1189 // 128-bit loads are supported for all instruction types.
1190 if (LoadSize <= MaxNonSmrdLoadSize)
1191 return false;
1193 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1194 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1196 if (SrcRegs.empty())
1197 SrcRegs.push_back(MI.getOperand(1).getReg());
1199 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1201 // RegBankSelect only emits scalar types, so we need to reset the pointer
1202 // operand to a pointer type.
1203 Register BasePtrReg = SrcRegs[0];
1204 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1205 MRI.setType(BasePtrReg, PtrTy);
1207 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1208 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1209 ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1210 MachineIRBuilder B(MI, Observer);
1211 LegalizerHelper Helper(B.getMF(), Observer, B);
1213 if (LoadTy.isVector()) {
1214 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1215 return false;
1216 } else {
1217 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1218 return false;
1221 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1222 return true;
1225 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1226 MachineInstr &MI,
1227 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1228 MachineRegisterInfo &MRI) const {
1229 const MachineFunction &MF = *MI.getMF();
1230 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1231 const auto &TFI = *ST.getFrameLowering();
1233 // Guard in case the stack growth direction ever changes with scratch
1234 // instructions.
1235 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1236 return false;
1238 Register Dst = MI.getOperand(0).getReg();
1239 Register AllocSize = MI.getOperand(1).getReg();
1240 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1242 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1244 // TODO: Need to emit a wave reduction to get the maximum size.
1245 if (SizeBank != &AMDGPU::SGPRRegBank)
1246 return false;
1248 LLT PtrTy = MRI.getType(Dst);
1249 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1251 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1252 Register SPReg = Info->getStackPtrOffsetReg();
1253 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1254 MachineIRBuilder B(MI, ApplyBank);
1256 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1257 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1259 auto SPCopy = B.buildCopy(PtrTy, SPReg);
1260 if (Alignment > TFI.getStackAlign()) {
1261 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1262 B.buildMaskLowPtrBits(Dst, PtrAdd,
1263 Log2(Alignment) + ST.getWavefrontSizeLog2());
1264 } else {
1265 B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1268 MI.eraseFromParent();
1269 return true;
1272 bool AMDGPURegisterBankInfo::applyMappingImage(
1273 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1274 MachineRegisterInfo &MRI, int RsrcIdx) const {
1275 const int NumDefs = MI.getNumExplicitDefs();
1277 // The reported argument index is relative to the IR intrinsic call arguments,
1278 // so we need to shift by the number of defs and the intrinsic ID.
1279 RsrcIdx += NumDefs + 1;
1281 // Insert copies to VGPR arguments.
1282 applyDefaultMapping(OpdMapper);
1284 // Fixup any SGPR arguments.
1285 SmallVector<unsigned, 4> SGPRIndexes;
1286 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1287 if (!MI.getOperand(I).isReg())
1288 continue;
1290 // If this intrinsic has a sampler, it immediately follows rsrc.
1291 if (I == RsrcIdx || I == RsrcIdx + 1)
1292 SGPRIndexes.push_back(I);
1295 executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1296 return true;
1299 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1300 Register Reg) {
1301 MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1302 if (!Def)
1303 return Reg;
1305 // TODO: Guard against this being an implicit def
1306 return Def->getOperand(0).getReg();
1309 // Analyze a combined offset from an llvm.amdgcn.s.buffer.load intrinsic and
1310 // store the three offsets (voffset, soffset and instoffset).
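// A summary of the cases handled below (illustrative, not from the original
// comment):
//   - pure constant offset:     voffset = 0, soffset/instoffset from the split
//   - VGPR base + constant:     voffset = base, soffset = split constant
//   - SGPR base + constant:     voffset = 0, soffset = base
//   - G_ADD of SGPR and VGPR:   voffset = VGPR part, soffset = SGPR part
//   - anything else:            voffset = combined offset (in a VGPR), soffset = 0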
1311 static unsigned setBufferOffsets(MachineIRBuilder &B,
1312 const AMDGPURegisterBankInfo &RBI,
1313 Register CombinedOffset, Register &VOffsetReg,
1314 Register &SOffsetReg, int64_t &InstOffsetVal,
1315 Align Alignment) {
1316 const LLT S32 = LLT::scalar(32);
1317 MachineRegisterInfo *MRI = B.getMRI();
1319 if (Optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1320 uint32_t SOffset, ImmOffset;
1321 if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1322 Alignment)) {
1323 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1324 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1325 InstOffsetVal = ImmOffset;
1327 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1328 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1329 return SOffset + ImmOffset;
1333 Register Base;
1334 unsigned Offset;
1336 std::tie(Base, Offset) =
1337 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1339 uint32_t SOffset, ImmOffset;
1340 if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1341 &RBI.Subtarget, Alignment)) {
1342 if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1343 VOffsetReg = Base;
1344 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1345 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1346 InstOffsetVal = ImmOffset;
1347 return 0; // XXX - Why is this 0?
1350 // If we have SGPR base, we can use it for soffset.
1351 if (SOffset == 0) {
1352 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1353 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1354 SOffsetReg = Base;
1355 InstOffsetVal = ImmOffset;
1356 return 0; // XXX - Why is this 0?
1360 // Handle the variable sgpr + vgpr case.
1361 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1362 if (Add && (int)Offset >= 0) {
1363 Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1364 Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1366 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1367 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1369 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1370 VOffsetReg = Src0;
1371 SOffsetReg = Src1;
1372 return 0;
1375 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1376 VOffsetReg = Src1;
1377 SOffsetReg = Src0;
1378 return 0;
1382 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1383 // have an SGPR offset and a VGPR resource.
1384 if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1385 VOffsetReg = CombinedOffset;
1386 } else {
1387 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1388 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1391 SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1392 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1393 return 0;
1396 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1397 const OperandsMapper &OpdMapper) const {
1398 MachineInstr &MI = OpdMapper.getMI();
1399 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1401 const LLT S32 = LLT::scalar(32);
1402 Register Dst = MI.getOperand(0).getReg();
1403 LLT Ty = MRI.getType(Dst);
1405 const RegisterBank *RSrcBank =
1406 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1407 const RegisterBank *OffsetBank =
1408 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1409 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1410 OffsetBank == &AMDGPU::SGPRRegBank)
1411 return true; // Legal mapping
1413 // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1414 // here but don't have an MMO.
1416 unsigned LoadSize = Ty.getSizeInBits();
1417 int NumLoads = 1;
1418 if (LoadSize == 256 || LoadSize == 512) {
1419 NumLoads = LoadSize / 128;
1420 Ty = Ty.divide(NumLoads);
1423 // Use the alignment to ensure that the required offsets will fit into the
1424 // immediate offsets.
1425 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1427 MachineIRBuilder B(MI);
1428 MachineFunction &MF = B.getMF();
1430 Register SOffset;
1431 Register VOffset;
1432 int64_t ImmOffset = 0;
1434 unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1435 VOffset, SOffset, ImmOffset, Alignment);
1437 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1438 // can, but we need to track an MMO for that.
1439 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1440 const Align MemAlign(4); // FIXME: ABI type alignment?
1441 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1442 MachinePointerInfo(),
1443 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1444 MachineMemOperand::MOInvariant,
1445 MemSize, MemAlign);
1446 if (MMOOffset != 0)
1447 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1449 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1450 // assume that the buffer is unswizzled.
1452 Register RSrc = MI.getOperand(1).getReg();
1453 Register VIndex = B.buildConstant(S32, 0).getReg(0);
1454 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1456 SmallVector<Register, 4> LoadParts(NumLoads);
1458 MachineBasicBlock::iterator MII = MI.getIterator();
1459 MachineInstrSpan Span(MII, &B.getMBB());
1461 for (int i = 0; i < NumLoads; ++i) {
1462 if (NumLoads == 1) {
1463 LoadParts[i] = Dst;
1464 } else {
1465 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1466 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1469 MachineMemOperand *MMO = BaseMMO;
1470 if (i != 0)
1471 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1473 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1474 .addDef(LoadParts[i]) // vdata
1475 .addUse(RSrc) // rsrc
1476 .addUse(VIndex) // vindex
1477 .addUse(VOffset) // voffset
1478 .addUse(SOffset) // soffset
1479 .addImm(ImmOffset + 16 * i) // offset(imm)
1480 .addImm(0) // cachepolicy, swizzled buffer(imm)
1481 .addImm(0) // idxen(imm)
1482 .addMemOperand(MMO);
1485 // TODO: If only the resource is a VGPR, it may be better to execute the
1486 // scalar load in the waterfall loop if the resource is expected to frequently
1487 // be dynamically uniform.
1488 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1489 // Remove the original instruction to avoid potentially confusing the
1490 // waterfall loop logic.
1491 B.setInstr(*Span.begin());
1492 MI.eraseFromParent();
1494 SmallSet<Register, 4> OpsToWaterfall;
1496 OpsToWaterfall.insert(RSrc);
1497 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1498 OpsToWaterfall, MRI);
1501 if (NumLoads != 1) {
1502 if (Ty.isVector())
1503 B.buildConcatVectors(Dst, LoadParts);
1504 else
1505 B.buildMerge(Dst, LoadParts);
1508 // We removed the instruction earlier with a waterfall loop.
1509 if (RSrcBank == &AMDGPU::SGPRRegBank)
1510 MI.eraseFromParent();
1512 return true;
1515 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
1516 bool Signed) const {
1517 MachineInstr &MI = OpdMapper.getMI();
1518 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1520 // Insert basic copies
1521 applyDefaultMapping(OpdMapper);
1523 Register DstReg = MI.getOperand(0).getReg();
1524 LLT Ty = MRI.getType(DstReg);
1526 const LLT S32 = LLT::scalar(32);
1528 unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
1529 Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1530 Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1531 Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1533 const RegisterBank *DstBank =
1534 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1535 if (DstBank == &AMDGPU::VGPRRegBank) {
1536 if (Ty == S32)
1537 return true;
1539 // There are no 64-bit vgpr bitfield extract instructions, so the operation
1540 // is expanded to a sequence of instructions that implement it.
1541 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1542 MachineIRBuilder B(MI, ApplyBank);
1544 const LLT S64 = LLT::scalar(64);
1545 // Shift the source operand so that extracted bits start at bit 0.
1546 auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1547 : B.buildLShr(S64, SrcReg, OffsetReg);
1548 auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1550 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1551 // if the width is a constant.
1552 if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1553     // The width is a constant, so use the 32-bit bitfield extract instruction.
1554     // Depending on the width, operate on either the low or the high 32 bits.
1555 auto Zero = B.buildConstant(S32, 0);
1556 auto WidthImm = ConstWidth->Value.getZExtValue();
1557 if (WidthImm <= 32) {
1558 // Use bitfield extract on the lower 32-bit source, and then sign-extend
1559 // or clear the upper 32-bits.
1560 auto Extract =
1561 Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1562 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1563 auto Extend =
1564 Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1565 B.buildMerge(DstReg, {Extract, Extend});
1566 } else {
1567 // Use bitfield extract on upper 32-bit source, and combine with lower
1568 // 32-bit source.
1569 auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1570 auto Extract =
1571 Signed
1572 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1573 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1574 B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
1576 MI.eraseFromParent();
1577 return true;
1580 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1581 // operations.
1582 auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1583 auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1584 if (Signed)
1585 B.buildAShr(S64, SignBit, ExtShift);
1586 else
1587 B.buildLShr(S64, SignBit, ExtShift);
1588 MI.eraseFromParent();
1589 return true;
1592 // The scalar form packs the offset and width in a single operand.
1594 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1595 MachineIRBuilder B(MI, ApplyBank);
1597   // Clear the high bits of the offset so it can be inserted into bits [5:0].
1598 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1599 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1601   // Shifting left zeros out the low bits, so don't bother clamping the width input.
1602 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1604   // Pack the offset and width of the BFE into the format expected by
1605   // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0]
1606   // contain the offset and bits [22:16] contain the width.
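  // Illustrative example: an offset of 8 and a width of 16 pack as
  // (16 << 16) | 8 = 0x00100008.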
1607 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1609 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1610 // register class constraints.
1611 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1612 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1614 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1615 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1616 llvm_unreachable("failed to constrain BFE");
1618 MI.eraseFromParent();
1619 return true;
1622 // Return a suitable opcode for extending the operands of Opc when widening.
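// Operations that depend on the sign bit (arithmetic shift right, signed
// min/max) need the 16-bit value sign-extended; their unsigned counterparts
// need it zero-extended; everything else handled here only reads the low
// bits, so an anyext is sufficient.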
1623 static unsigned getExtendOp(unsigned Opc) {
1624 switch (Opc) {
1625 case TargetOpcode::G_ASHR:
1626 case TargetOpcode::G_SMIN:
1627 case TargetOpcode::G_SMAX:
1628 return TargetOpcode::G_SEXT;
1629 case TargetOpcode::G_LSHR:
1630 case TargetOpcode::G_UMIN:
1631 case TargetOpcode::G_UMAX:
1632 return TargetOpcode::G_ZEXT;
1633 default:
1634 return TargetOpcode::G_ANYEXT;
1638 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1639 // any illegal vector extend or unmerge operations.
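// For example, a <2 x s16> value laid out as 0xHHHHLLLL in a 32-bit register
// is returned as an extended low component (LLLL) and a shifted-down high
// component (HHHH).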
1640 static std::pair<Register, Register>
1641 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1642 const LLT S32 = LLT::scalar(32);
1643 auto Bitcast = B.buildBitcast(S32, Src);
1645 if (ExtOpcode == TargetOpcode::G_SEXT) {
1646 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1647 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1648 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1651 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1652 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1653 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1654 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1657 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1658 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1661 // For cases where only a single copy is inserted for matching register banks,
1662 // replace the register in the instruction operand.
1663 static bool substituteSimpleCopyRegs(
1664 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1665 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1666 if (!SrcReg.empty()) {
1667 assert(SrcReg.size() == 1);
1668 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1669 return true;
1672 return false;
1675 /// Handle register layout difference for f16 images for some subtargets.
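/// On subtargets with unpacked D16 VMEM, each 16-bit component is expected to
/// occupy its own 32-bit register, so a vector of s16 values is widened to one
/// 32-bit slot per element.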
1676 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1677 MachineRegisterInfo &MRI,
1678 Register Reg) const {
1679 if (!Subtarget.hasUnpackedD16VMem())
1680 return Reg;
1682 const LLT S16 = LLT::scalar(16);
1683 LLT StoreVT = MRI.getType(Reg);
1684 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1685 return Reg;
1687 auto Unmerge = B.buildUnmerge(S16, Reg);
1690 SmallVector<Register, 4> WideRegs;
1691 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1692 WideRegs.push_back(Unmerge.getReg(I));
1694 const LLT S32 = LLT::scalar(32);
1695 int NumElts = StoreVT.getNumElements();
1697 return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
1700 static std::pair<Register, unsigned>
1701 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1702 int64_t Const;
1703 if (mi_match(Reg, MRI, m_ICst(Const)))
1704 return std::make_pair(Register(), Const);
1706 Register Base;
1707 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1708 return std::make_pair(Base, Const);
1710 // TODO: Handle G_OR used for add case
1711 return std::make_pair(Reg, 0);
1714 std::pair<Register, unsigned>
1715 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1716 Register OrigOffset) const {
1717 const unsigned MaxImm = 4095;
1718 Register BaseReg;
1719 unsigned ImmOffset;
1720 const LLT S32 = LLT::scalar(32);
1722 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1723 OrigOffset);
1725 unsigned C1 = 0;
1726 if (ImmOffset != 0) {
1727     // If the immediate value is too big for the immoffset field, keep only the
1728     // low 12 bits in the immoffset field so that the overflow that is
1729     // copied/added into the voffset field is a multiple of 4096, and it stands
1730     // more chance of being CSEd with the copy/add for another similar load/store.
1731     // However, do not round down to a multiple of 4096 if the overflow would be
1732     // negative, as it appears to be illegal to have a negative offset in the
1733     // vgpr, even if adding the immediate offset makes it positive.
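    // Illustrative example: a constant offset of 5000 leaves 904 in the
    // immediate field and moves the 4096 overflow into the voffset register.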
1734 unsigned Overflow = ImmOffset & ~MaxImm;
1735 ImmOffset -= Overflow;
1736 if ((int32_t)Overflow < 0) {
1737 Overflow += ImmOffset;
1738 ImmOffset = 0;
1741 C1 = ImmOffset;
1742 if (Overflow != 0) {
1743 if (!BaseReg)
1744 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1745 else {
1746 auto OverflowVal = B.buildConstant(S32, Overflow);
1747 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1752 if (!BaseReg)
1753 BaseReg = B.buildConstant(S32, 0).getReg(0);
1755 return {BaseReg, C1};
1758 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1759 Register SrcReg) const {
1760 MachineRegisterInfo &MRI = *B.getMRI();
1761 LLT SrcTy = MRI.getType(SrcReg);
1762 if (SrcTy.getSizeInBits() == 32) {
1763 // Use a v_mov_b32 here to make the exec dependency explicit.
1764 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1765 .addDef(DstReg)
1766 .addUse(SrcReg);
1767 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1768 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1771 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1772 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1774 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1775 .addDef(TmpReg0)
1776 .addUse(SrcReg, 0, AMDGPU::sub0);
1777 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1778 .addDef(TmpReg1)
1779 .addUse(SrcReg, 0, AMDGPU::sub1);
1780 B.buildInstr(AMDGPU::REG_SEQUENCE)
1781 .addDef(DstReg)
1782 .addUse(TmpReg0)
1783 .addImm(AMDGPU::sub0)
1784 .addUse(TmpReg1)
1785 .addImm(AMDGPU::sub1);
1787 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1788 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1791 /// Utility function for pushing dynamic vector indexes with a constant offset
1792 /// into waterfall loops.
1793 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1794 MachineInstr &IdxUseInstr,
1795 unsigned OpIdx,
1796 unsigned ConstOffset) {
1797 MachineRegisterInfo &MRI = *B.getMRI();
1798 const LLT S32 = LLT::scalar(32);
1799 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1800 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1802 auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1804 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1805 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1806 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1807 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1810 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1811 /// original 32-bit source value (to be inserted in the low part of the combined
1812 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1813 /// value.
1814 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1815 Register Hi32Reg, Register Lo32Reg,
1816 unsigned ExtOpc,
1817 const RegisterBank &RegBank,
1818 bool IsBooleanSrc = false) {
1819 if (ExtOpc == AMDGPU::G_ZEXT) {
1820 B.buildConstant(Hi32Reg, 0);
1821 } else if (ExtOpc == AMDGPU::G_SEXT) {
1822 if (IsBooleanSrc) {
1823 // If we know the original source was an s1, the high half is the same as
1824 // the low.
1825 B.buildCopy(Hi32Reg, Lo32Reg);
1826 } else {
1827 // Replicate sign bit from 32-bit extended part.
1828 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1829 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1830 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1832 } else {
1833 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1834 B.buildUndef(Hi32Reg);
1838 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1839 MachineInstr &MI, MachineRegisterInfo &MRI,
1840 const OperandsMapper &OpdMapper) const {
1842 Register VecReg = MI.getOperand(1).getReg();
1843 Register Idx = MI.getOperand(2).getReg();
1845 const RegisterBank &IdxBank =
1846 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1848 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1850 LLT VecTy = MRI.getType(VecReg);
1851 unsigned EltSize = VecTy.getScalarSizeInBits();
1852 unsigned NumElem = VecTy.getNumElements();
1854 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1855 IsDivergentIdx))
1856 return false;
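  // Expand the dynamic extract into a chain of compare + select: start with
  // element 0 as the result and, for each remaining element, select it in
  // when the index compares equal.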
1858 MachineIRBuilder B(MI);
1859 LLT S32 = LLT::scalar(32);
1861 const RegisterBank &DstBank =
1862 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1863 const RegisterBank &SrcBank =
1864 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1866 const RegisterBank &CCBank =
1867 (DstBank == AMDGPU::SGPRRegBank &&
1868 SrcBank == AMDGPU::SGPRRegBank &&
1869 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1870 : AMDGPU::VCCRegBank;
1871 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1873 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1874 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1875 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1878 LLT EltTy = VecTy.getScalarType();
1879 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1880 unsigned NumLanes = DstRegs.size();
1881 if (!NumLanes)
1882 NumLanes = 1;
1883 else
1884 EltTy = MRI.getType(DstRegs[0]);
1886 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1887 SmallVector<Register, 2> Res(NumLanes);
1888 for (unsigned L = 0; L < NumLanes; ++L)
1889 Res[L] = UnmergeToEltTy.getReg(L);
1891 for (unsigned I = 1; I < NumElem; ++I) {
1892 auto IC = B.buildConstant(S32, I);
1893 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1894 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1895 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1897 for (unsigned L = 0; L < NumLanes; ++L) {
1898 auto S = B.buildSelect(EltTy, Cmp,
1899 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1901 for (unsigned N : { 0, 2, 3 })
1902 MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1904 Res[L] = S->getOperand(0).getReg();
1908 for (unsigned L = 0; L < NumLanes; ++L) {
1909 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1910 B.buildCopy(DstReg, Res[L]);
1911 MRI.setRegBank(DstReg, DstBank);
1914 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
1915 MI.eraseFromParent();
1917 return true;
1920 // Insert a cross regbank copy for a register if it already has a bank that
1921 // differs from the one we want to set.
1922 static Register constrainRegToBank(MachineRegisterInfo &MRI,
1923 MachineIRBuilder &B, Register &Reg,
1924 const RegisterBank &Bank) {
1925 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
1926 if (CurrBank && *CurrBank != Bank) {
1927 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
1928 MRI.setRegBank(Copy, Bank);
1929 return Copy;
1932 MRI.setRegBank(Reg, Bank);
1933 return Reg;
1936 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
1937 MachineInstr &MI, MachineRegisterInfo &MRI,
1938 const OperandsMapper &OpdMapper) const {
1940 Register VecReg = MI.getOperand(1).getReg();
1941 Register Idx = MI.getOperand(3).getReg();
1943 const RegisterBank &IdxBank =
1944 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
1946 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1948 LLT VecTy = MRI.getType(VecReg);
1949 unsigned EltSize = VecTy.getScalarSizeInBits();
1950 unsigned NumElem = VecTy.getNumElements();
1952 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1953 IsDivergentIdx))
1954 return false;
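  // Expand the dynamic insert into one compare + select per element: each
  // lane keeps its original value unless its index compares equal to the
  // insertion index, in which case the new value is selected.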
1956 MachineIRBuilder B(MI);
1957 LLT S32 = LLT::scalar(32);
1959 const RegisterBank &DstBank =
1960 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1961 const RegisterBank &SrcBank =
1962 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1963 const RegisterBank &InsBank =
1964 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1966 const RegisterBank &CCBank =
1967 (DstBank == AMDGPU::SGPRRegBank &&
1968 SrcBank == AMDGPU::SGPRRegBank &&
1969 InsBank == AMDGPU::SGPRRegBank &&
1970 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1971 : AMDGPU::VCCRegBank;
1972 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1974 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1975 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1976 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1979 LLT EltTy = VecTy.getScalarType();
1980 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
1981 unsigned NumLanes = InsRegs.size();
1982 if (!NumLanes) {
1983 NumLanes = 1;
1984 InsRegs.push_back(MI.getOperand(2).getReg());
1985 } else {
1986 EltTy = MRI.getType(InsRegs[0]);
1989 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1990 SmallVector<Register, 16> Ops(NumElem * NumLanes);
1992 for (unsigned I = 0; I < NumElem; ++I) {
1993 auto IC = B.buildConstant(S32, I);
1994 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1995 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1996 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1998 for (unsigned L = 0; L < NumLanes; ++L) {
1999 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2000 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2001 Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2003 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2004 MRI.setRegBank(Select, DstBank);
2006 Ops[I * NumLanes + L] = Select;
2010 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2011 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2012 B.buildBuildVector(MI.getOperand(0), Ops);
2013 } else {
2014 auto Vec = B.buildBuildVector(MergeTy, Ops);
2015 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2016 B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2019 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2020 MI.eraseFromParent();
2022 return true;
2025 void AMDGPURegisterBankInfo::applyMappingImpl(
2026 const OperandsMapper &OpdMapper) const {
2027 MachineInstr &MI = OpdMapper.getMI();
2028 unsigned Opc = MI.getOpcode();
2029 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2030 switch (Opc) {
2031 case AMDGPU::G_PHI: {
2032 Register DstReg = MI.getOperand(0).getReg();
2033 LLT DstTy = MRI.getType(DstReg);
2034 if (DstTy != LLT::scalar(1))
2035 break;
2037 const LLT S32 = LLT::scalar(32);
2038 const RegisterBank *DstBank =
2039 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2040 if (DstBank == &AMDGPU::VCCRegBank) {
2041 applyDefaultMapping(OpdMapper);
2042 // The standard handling only considers the result register bank for
2043 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2044 // produce an invalid copy. We can only copy with some kind of compare to
2045 // get a vector boolean result. Insert a register bank copy that will be
2046 // correctly lowered to a compare.
2047 MachineIRBuilder B(*MI.getParent()->getParent());
2049 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2050 Register SrcReg = MI.getOperand(I).getReg();
2051 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2053 if (SrcBank != &AMDGPU::VCCRegBank) {
2054 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2055 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2057 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2058 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2059 MI.getOperand(I).setReg(Copy.getReg(0));
2063 return;
2066 // Phi handling is strange and only considers the bank of the destination.
2067 substituteSimpleCopyRegs(OpdMapper, 0);
2069 // Promote SGPR/VGPR booleans to s32
2070 MachineFunction *MF = MI.getParent()->getParent();
2071 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2072 MachineIRBuilder B(MI, ApplyBank);
2073 LegalizerHelper Helper(*MF, ApplyBank, B);
2075 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2076 llvm_unreachable("widen scalar should have succeeded");
2078 return;
2080 case AMDGPU::G_ICMP:
2081 case AMDGPU::G_UADDO:
2082 case AMDGPU::G_USUBO:
2083 case AMDGPU::G_UADDE:
2084 case AMDGPU::G_SADDE:
2085 case AMDGPU::G_USUBE:
2086 case AMDGPU::G_SSUBE: {
2087 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2088 Register DstReg = MI.getOperand(BoolDstOp).getReg();
2090 const RegisterBank *DstBank =
2091 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2092 if (DstBank != &AMDGPU::SGPRRegBank)
2093 break;
2095 const bool HasCarryIn = MI.getNumOperands() == 5;
2097 // If this is a scalar compare, promote the result to s32, as the selection
2098 // will end up using a copy to a 32-bit vreg.
2099 const LLT S32 = LLT::scalar(32);
2100 Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2101 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2102 MI.getOperand(BoolDstOp).setReg(NewDstReg);
2103 MachineIRBuilder B(MI);
2105 if (HasCarryIn) {
2106 Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2107 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2108 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2109 MI.getOperand(4).setReg(NewSrcReg);
2112 MachineBasicBlock *MBB = MI.getParent();
2113 B.setInsertPt(*MBB, std::next(MI.getIterator()));
2115 // If we had a constrained VCC result register, a copy was inserted to VCC
2116 // from SGPR.
2117 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2118 if (DefRegs.empty())
2119 DefRegs.push_back(DstReg);
2120 B.buildTrunc(DefRegs[0], NewDstReg);
2121 return;
2123 case AMDGPU::G_SELECT: {
2124 Register DstReg = MI.getOperand(0).getReg();
2125 LLT DstTy = MRI.getType(DstReg);
2127 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2128 if (CondRegs.empty())
2129 CondRegs.push_back(MI.getOperand(1).getReg());
2130 else {
2131 assert(CondRegs.size() == 1);
2134 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2135 if (CondBank == &AMDGPU::SGPRRegBank) {
2136 MachineIRBuilder B(MI);
2137 const LLT S32 = LLT::scalar(32);
2138 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2139 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2141 MI.getOperand(1).setReg(NewCondReg);
2142 B.buildZExt(NewCondReg, CondRegs[0]);
2145 if (DstTy.getSizeInBits() != 64)
2146 break;
2148 MachineIRBuilder B(MI);
2149 LLT HalfTy = getHalfSizedType(DstTy);
2151 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2152 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2153 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2155 // All inputs are SGPRs, nothing special to do.
2156 if (DefRegs.empty()) {
2157 assert(Src1Regs.empty() && Src2Regs.empty());
2158 break;
2161 if (Src1Regs.empty())
2162 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2163 else {
2164 setRegsToType(MRI, Src1Regs, HalfTy);
2167 if (Src2Regs.empty())
2168 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2169 else
2170 setRegsToType(MRI, Src2Regs, HalfTy);
2172 setRegsToType(MRI, DefRegs, HalfTy);
2174 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2175 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2177 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2178 MI.eraseFromParent();
2179 return;
2181 case AMDGPU::G_BRCOND: {
2182 Register CondReg = MI.getOperand(0).getReg();
2183 // FIXME: Should use legalizer helper, but should change bool ext type.
2184 const RegisterBank *CondBank =
2185 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2187 if (CondBank == &AMDGPU::SGPRRegBank) {
2188 MachineIRBuilder B(MI);
2189 const LLT S32 = LLT::scalar(32);
2190 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2191 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2193 MI.getOperand(0).setReg(NewCondReg);
2194 B.buildZExt(NewCondReg, CondReg);
2195 return;
2198 break;
2200 case AMDGPU::G_AND:
2201 case AMDGPU::G_OR:
2202 case AMDGPU::G_XOR: {
2203 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2204 // there is a VGPR input.
2205 Register DstReg = MI.getOperand(0).getReg();
2206 LLT DstTy = MRI.getType(DstReg);
2208 if (DstTy.getSizeInBits() == 1) {
2209 const RegisterBank *DstBank =
2210 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2211 if (DstBank == &AMDGPU::VCCRegBank)
2212 break;
2214 MachineFunction *MF = MI.getParent()->getParent();
2215 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2216 MachineIRBuilder B(MI, ApplyBank);
2217 LegalizerHelper Helper(*MF, ApplyBank, B);
2219 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2220 LegalizerHelper::Legalized)
2221 llvm_unreachable("widen scalar should have succeeded");
2222 return;
2225 if (DstTy.getSizeInBits() != 64)
2226 break;
2228 LLT HalfTy = getHalfSizedType(DstTy);
2229 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2230 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2231 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2233 // All inputs are SGPRs, nothing special to do.
2234 if (DefRegs.empty()) {
2235 assert(Src0Regs.empty() && Src1Regs.empty());
2236 break;
2239 assert(DefRegs.size() == 2);
2240 assert(Src0Regs.size() == Src1Regs.size() &&
2241 (Src0Regs.empty() || Src0Regs.size() == 2));
2243 // Depending on where the source registers came from, the generic code may
2244 // have decided to split the inputs already or not. If not, we still need to
2245 // extract the values.
2246 MachineIRBuilder B(MI);
2248 if (Src0Regs.empty())
2249 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2250 else
2251 setRegsToType(MRI, Src0Regs, HalfTy);
2253 if (Src1Regs.empty())
2254 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2255 else
2256 setRegsToType(MRI, Src1Regs, HalfTy);
2258 setRegsToType(MRI, DefRegs, HalfTy);
2260 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2261 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2263 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2264 MI.eraseFromParent();
2265 return;
2267 case AMDGPU::G_ABS: {
2268 Register SrcReg = MI.getOperand(1).getReg();
2269 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2271 // There is no VALU abs instruction so we need to replace it with a sub and
2272 // max combination.
2273 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2274 MachineFunction *MF = MI.getParent()->getParent();
2275 ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2276 MachineIRBuilder B(MI, Apply);
2277 LegalizerHelper Helper(*MF, Apply, B);
2279 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2280 llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2281 return;
2283 LLVM_FALLTHROUGH;
2285 case AMDGPU::G_ADD:
2286 case AMDGPU::G_SUB:
2287 case AMDGPU::G_MUL:
2288 case AMDGPU::G_SHL:
2289 case AMDGPU::G_LSHR:
2290 case AMDGPU::G_ASHR:
2291 case AMDGPU::G_SMIN:
2292 case AMDGPU::G_SMAX:
2293 case AMDGPU::G_UMIN:
2294 case AMDGPU::G_UMAX: {
2295 Register DstReg = MI.getOperand(0).getReg();
2296 LLT DstTy = MRI.getType(DstReg);
2298 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2299 // Packed 16-bit operations need to be scalarized and promoted.
2300 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2301 break;
2303 const RegisterBank *DstBank =
2304 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2305 if (DstBank == &AMDGPU::VGPRRegBank)
2306 break;
2308 const LLT S32 = LLT::scalar(32);
2309 MachineBasicBlock *MBB = MI.getParent();
2310 MachineFunction *MF = MBB->getParent();
2311 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2312 MachineIRBuilder B(MI, ApplySALU);
2314 if (DstTy.isVector()) {
2315 Register WideSrc0Lo, WideSrc0Hi;
2316 Register WideSrc1Lo, WideSrc1Hi;
2318 unsigned ExtendOp = getExtendOp(MI.getOpcode());
2319 std::tie(WideSrc0Lo, WideSrc0Hi)
2320 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2321 std::tie(WideSrc1Lo, WideSrc1Hi)
2322 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2323 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2324 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2325 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2326 MI.eraseFromParent();
2327 } else {
2328 LegalizerHelper Helper(*MF, ApplySALU, B);
2330 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2331 llvm_unreachable("widen scalar should have succeeded");
2333 // FIXME: s16 shift amounts should be legal.
2334 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2335 Opc == AMDGPU::G_ASHR) {
2336 B.setInsertPt(*MBB, MI.getIterator());
2337 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2338 llvm_unreachable("widen scalar should have succeeded");
2342 return;
2344 case AMDGPU::G_SEXT_INREG: {
2345 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2346 if (SrcRegs.empty())
2347 break; // Nothing to repair
2349 const LLT S32 = LLT::scalar(32);
2350 MachineIRBuilder B(MI);
2351 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2352 GISelObserverWrapper Observer(&O);
2353 B.setChangeObserver(Observer);
2355 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2356 // we would need to further expand, and doesn't let us directly set the
2357 // result registers.
2358 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2360 int Amt = MI.getOperand(2).getImm();
2361 if (Amt <= 32) {
2362 if (Amt == 32) {
2363 // The low bits are unchanged.
2364 B.buildCopy(DstRegs[0], SrcRegs[0]);
2365 } else {
2366 // Extend in the low bits and propagate the sign bit to the high half.
2367 B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2370 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2371 } else {
2372 // The low bits are unchanged, and extend in the high bits.
2373 B.buildCopy(DstRegs[0], SrcRegs[0]);
2374 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2377 Register DstReg = MI.getOperand(0).getReg();
2378 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2379 MI.eraseFromParent();
2380 return;
2382 case AMDGPU::G_CTPOP:
2383 case AMDGPU::G_BITREVERSE: {
2384 const RegisterBank *DstBank =
2385 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2386 if (DstBank == &AMDGPU::SGPRRegBank)
2387 break;
2389 Register SrcReg = MI.getOperand(1).getReg();
2390 const LLT S32 = LLT::scalar(32);
2391 LLT Ty = MRI.getType(SrcReg);
2392 if (Ty == S32)
2393 break;
2395 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2396 MachineIRBuilder B(MI, ApplyVALU);
2398 MachineFunction &MF = B.getMF();
2399 LegalizerHelper Helper(MF, ApplyVALU, B);
2401 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2402 llvm_unreachable("narrowScalar should have succeeded");
2403 return;
2405 case AMDGPU::G_AMDGPU_FFBH_U32:
2406 case AMDGPU::G_AMDGPU_FFBL_B32:
2407 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2408 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2409 const RegisterBank *DstBank =
2410 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2411 if (DstBank == &AMDGPU::SGPRRegBank)
2412 break;
2414 Register SrcReg = MI.getOperand(1).getReg();
2415 const LLT S32 = LLT::scalar(32);
2416 LLT Ty = MRI.getType(SrcReg);
2417 if (Ty == S32)
2418 break;
2420 // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2421 // which return -1 when the input is zero:
2422 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2423 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2424 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2425 // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
2426 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2427 MachineIRBuilder B(MI, ApplyVALU);
2428 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2429 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2430 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2431 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2432 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2433 : Opc;
2434 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2435 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2436 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
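    // The zero-undef variants assume a non-zero input, so a plain add is
    // fine. The ffbh/ffbl forms must preserve the -1 result for a zero input,
    // so a saturating add keeps -1 + 32 pinned at -1.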
2437 unsigned AddOpc =
2438 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2439 ? AMDGPU::G_ADD
2440 : AMDGPU::G_UADDSAT;
2441 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2442 Register DstReg = MI.getOperand(0).getReg();
2443 B.buildUMin(DstReg, X, Y);
2444 MI.eraseFromParent();
2445 return;
2447 case AMDGPU::G_SEXT:
2448 case AMDGPU::G_ZEXT:
2449 case AMDGPU::G_ANYEXT: {
2450 Register SrcReg = MI.getOperand(1).getReg();
2451 LLT SrcTy = MRI.getType(SrcReg);
2452 const bool Signed = Opc == AMDGPU::G_SEXT;
2454 assert(empty(OpdMapper.getVRegs(1)));
2456 MachineIRBuilder B(MI);
2457 const RegisterBank *SrcBank =
2458 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2460 Register DstReg = MI.getOperand(0).getReg();
2461 LLT DstTy = MRI.getType(DstReg);
2462 if (DstTy.isScalar() &&
2463 SrcBank != &AMDGPU::SGPRRegBank &&
2464 SrcBank != &AMDGPU::VCCRegBank &&
2465 // FIXME: Should handle any type that round to s64 when irregular
2466 // breakdowns supported.
2467 DstTy.getSizeInBits() == 64 &&
2468 SrcTy.getSizeInBits() <= 32) {
2469 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2471       // Extend the source to 32 bits for the low half, then derive the high half from it.
2472 if (Signed) {
2473 // TODO: Should really be buildSExtOrCopy
2474 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2475 } else if (Opc == AMDGPU::G_ZEXT) {
2476 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2477 } else {
2478 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2481 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2482 MRI.setRegBank(DstReg, *SrcBank);
2483 MI.eraseFromParent();
2484 return;
2487 if (SrcTy != LLT::scalar(1))
2488 return;
2490     // It is not legal to have a legalization artifact with a VCC source. Rather
2491     // than introducing a copy, directly insert the select that such a copy
2492     // would be selected to.
2493 if (SrcBank == &AMDGPU::VCCRegBank) {
2494 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2496 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2498 unsigned DstSize = DstTy.getSizeInBits();
2499 // 64-bit select is SGPR only
2500 const bool UseSel64 = DstSize > 32 &&
2501 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2503 // TODO: Should s16 select be legal?
2504 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2505 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2506 auto False = B.buildConstant(SelType, 0);
2508 MRI.setRegBank(True.getReg(0), *DstBank);
2509 MRI.setRegBank(False.getReg(0), *DstBank);
2510 MRI.setRegBank(DstReg, *DstBank);
2512 if (DstSize > 32) {
2513 B.buildSelect(DefRegs[0], SrcReg, True, False);
2514 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2515 } else if (DstSize < 32) {
2516 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2517 MRI.setRegBank(Sel.getReg(0), *DstBank);
2518 B.buildTrunc(DstReg, Sel);
2519 } else {
2520 B.buildSelect(DstReg, SrcReg, True, False);
2523 MI.eraseFromParent();
2524 return;
2527 break;
2529 case AMDGPU::G_BUILD_VECTOR:
2530 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2531 Register DstReg = MI.getOperand(0).getReg();
2532 LLT DstTy = MRI.getType(DstReg);
2533 if (DstTy != LLT::fixed_vector(2, 16))
2534 break;
2536 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2537 substituteSimpleCopyRegs(OpdMapper, 1);
2538 substituteSimpleCopyRegs(OpdMapper, 2);
2540 const RegisterBank *DstBank =
2541 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2542 if (DstBank == &AMDGPU::SGPRRegBank)
2543 break; // Can use S_PACK_* instructions.
2545 MachineIRBuilder B(MI);
2547 Register Lo = MI.getOperand(1).getReg();
2548 Register Hi = MI.getOperand(2).getReg();
2549 const LLT S32 = LLT::scalar(32);
2551 const RegisterBank *BankLo =
2552 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2553 const RegisterBank *BankHi =
2554 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2556 Register ZextLo;
2557 Register ShiftHi;
2559 if (Opc == AMDGPU::G_BUILD_VECTOR) {
2560 ZextLo = B.buildZExt(S32, Lo).getReg(0);
2561 MRI.setRegBank(ZextLo, *BankLo);
2563 Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2564 MRI.setRegBank(ZextHi, *BankHi);
2566 auto ShiftAmt = B.buildConstant(S32, 16);
2567 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2569 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2570 MRI.setRegBank(ShiftHi, *BankHi);
2571 } else {
2572 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2573 MRI.setRegBank(MaskLo, *BankLo);
2575 auto ShiftAmt = B.buildConstant(S32, 16);
2576 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2578 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2579 MRI.setRegBank(ShiftHi, *BankHi);
2581 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2582 MRI.setRegBank(ZextLo, *BankLo);
2585 auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2586 MRI.setRegBank(Or.getReg(0), *DstBank);
2588 B.buildBitcast(DstReg, Or);
2589 MI.eraseFromParent();
2590 return;
2592 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2593 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2595 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2597 Register DstReg = MI.getOperand(0).getReg();
2598 Register SrcReg = MI.getOperand(1).getReg();
2600 const LLT S32 = LLT::scalar(32);
2601 LLT DstTy = MRI.getType(DstReg);
2602 LLT SrcTy = MRI.getType(SrcReg);
2604 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2605 return;
2607 MachineIRBuilder B(MI);
2609 const ValueMapping &DstMapping
2610 = OpdMapper.getInstrMapping().getOperandMapping(0);
2611 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2612 const RegisterBank *SrcBank =
2613 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2614 const RegisterBank *IdxBank =
2615 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2617 Register BaseIdxReg;
2618 unsigned ConstOffset;
2619 std::tie(BaseIdxReg, ConstOffset) =
2620 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2622 // See if the index is an add of a constant which will be foldable by moving
2623 // the base register of the index later if this is going to be executed in a
2624 // waterfall loop. This is essentially to reassociate the add of a constant
2625 // with the readfirstlane.
2626 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2627 ConstOffset > 0 &&
2628 ConstOffset < SrcTy.getNumElements();
2630 // Move the base register. We'll re-insert the add later.
2631 if (ShouldMoveIndexIntoLoop)
2632 MI.getOperand(2).setReg(BaseIdxReg);
2634 // If this is a VGPR result only because the index was a VGPR result, the
2635 // actual indexing will be done on the SGPR source vector, which will
2636 // produce a scalar result. We need to copy to the VGPR result inside the
2637 // waterfall loop.
2638 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2639 SrcBank == &AMDGPU::SGPRRegBank;
2640 if (DstRegs.empty()) {
2641 applyDefaultMapping(OpdMapper);
2643 executeInWaterfallLoop(MI, MRI, { 2 });
2645 if (NeedCopyToVGPR) {
2646 // We don't want a phi for this temporary reg.
2647 Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2648 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2649 MI.getOperand(0).setReg(TmpReg);
2650 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2652 // Use a v_mov_b32 here to make the exec dependency explicit.
2653 buildVCopy(B, DstReg, TmpReg);
2656 // Re-insert the constant offset add inside the waterfall loop.
2657 if (ShouldMoveIndexIntoLoop)
2658 reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2660 return;
2663 assert(DstTy.getSizeInBits() == 64);
2665 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2667 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2668 auto One = B.buildConstant(S32, 1);
2670 MachineBasicBlock::iterator MII = MI.getIterator();
2672 // Split the vector index into 32-bit pieces. Prepare to move all of the
2673 // new instructions into a waterfall loop if necessary.
2675 // Don't put the bitcast or constant in the loop.
2676 MachineInstrSpan Span(MII, &B.getMBB());
2678 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
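    // For example, extracting 64-bit element 3 reads 32-bit elements 6 and 7
    // of the bitcast vector.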
2679 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2680 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2682 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2683 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2685 MRI.setRegBank(DstReg, *DstBank);
2686 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2687 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2688 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2689 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2691 SmallSet<Register, 4> OpsToWaterfall;
2692 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2693 MI.eraseFromParent();
2694 return;
2697 // Remove the original instruction to avoid potentially confusing the
2698 // waterfall loop logic.
2699 B.setInstr(*Span.begin());
2700 MI.eraseFromParent();
2701 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2702 OpsToWaterfall, MRI);
2704 if (NeedCopyToVGPR) {
2705 MachineBasicBlock *LoopBB = Extract1->getParent();
2706 Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2707 Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2708 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2709 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2711 Extract0->getOperand(0).setReg(TmpReg0);
2712 Extract1->getOperand(0).setReg(TmpReg1);
2714 B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2716 buildVCopy(B, DstRegs[0], TmpReg0);
2717 buildVCopy(B, DstRegs[1], TmpReg1);
2720 if (ShouldMoveIndexIntoLoop)
2721 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2723 return;
2725 case AMDGPU::G_INSERT_VECTOR_ELT: {
2726 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2728 Register DstReg = MI.getOperand(0).getReg();
2729 LLT VecTy = MRI.getType(DstReg);
2731 assert(OpdMapper.getVRegs(0).empty());
2732 assert(OpdMapper.getVRegs(3).empty());
2734 if (substituteSimpleCopyRegs(OpdMapper, 1))
2735 MRI.setType(MI.getOperand(1).getReg(), VecTy);
2737 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2738 return;
2740 const RegisterBank *IdxBank =
2741 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2743 Register SrcReg = MI.getOperand(1).getReg();
2744 Register InsReg = MI.getOperand(2).getReg();
2745 LLT InsTy = MRI.getType(InsReg);
2746 (void)InsTy;
2748 Register BaseIdxReg;
2749 unsigned ConstOffset;
2750 std::tie(BaseIdxReg, ConstOffset) =
2751 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2753 // See if the index is an add of a constant which will be foldable by moving
2754 // the base register of the index later if this is going to be executed in a
2755 // waterfall loop. This is essentially to reassociate the add of a constant
2756 // with the readfirstlane.
2757 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2758 ConstOffset > 0 &&
2759 ConstOffset < VecTy.getNumElements();
2761 // Move the base register. We'll re-insert the add later.
2762 if (ShouldMoveIndexIntoLoop)
2763 MI.getOperand(3).setReg(BaseIdxReg);
2766 if (InsRegs.empty()) {
2767 executeInWaterfallLoop(MI, MRI, { 3 });
2769 // Re-insert the constant offset add inside the waterfall loop.
2770 if (ShouldMoveIndexIntoLoop) {
2771 MachineIRBuilder B(MI);
2772 reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2775 return;
2779 assert(InsTy.getSizeInBits() == 64);
2781 const LLT S32 = LLT::scalar(32);
2782 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2784 MachineIRBuilder B(MI);
2785 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2786 auto One = B.buildConstant(S32, 1);
2788 // Split the vector index into 32-bit pieces. Prepare to move all of the
2789 // new instructions into a waterfall loop if necessary.
2791 // Don't put the bitcast or constant in the loop.
2792 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2794 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2795 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2796 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2798 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2799 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2801 const RegisterBank *DstBank =
2802 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2803 const RegisterBank *SrcBank =
2804 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2805 const RegisterBank *InsSrcBank =
2806 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2808 MRI.setRegBank(InsReg, *InsSrcBank);
2809 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2810 MRI.setRegBank(InsLo.getReg(0), *DstBank);
2811 MRI.setRegBank(InsHi.getReg(0), *DstBank);
2812 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2813 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2814 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2817 SmallSet<Register, 4> OpsToWaterfall;
2818 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2819 B.setInsertPt(B.getMBB(), MI);
2820 B.buildBitcast(DstReg, InsHi);
2821 MI.eraseFromParent();
2822 return;
2825 B.setInstr(*Span.begin());
2826 MI.eraseFromParent();
2828 // Figure out the point after the waterfall loop before mangling the control
2829 // flow.
2830 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2831 OpsToWaterfall, MRI);
2833 // The insertion point is now right after the original instruction.
2835 // Keep the bitcast to the original vector type out of the loop. Doing this
2836     // saves an extra phi we don't need inside the loop.
2837 B.buildBitcast(DstReg, InsHi);
2839 // Re-insert the constant offset add inside the waterfall loop.
2840 if (ShouldMoveIndexIntoLoop)
2841 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2843 return;
2845 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2846 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2847 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2848 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2849 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2850 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2851 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2852 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2853 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2854 case AMDGPU::G_AMDGPU_BUFFER_STORE:
2855 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2856 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2857 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2858 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2859 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2860 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2861 applyDefaultMapping(OpdMapper);
2862 executeInWaterfallLoop(MI, MRI, {1, 4});
2863 return;
2865 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2866 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2867 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2868 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2869 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2870 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2871 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2872 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2873 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2874 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2875 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2876 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2877 applyDefaultMapping(OpdMapper);
2878 executeInWaterfallLoop(MI, MRI, {2, 5});
2879 return;
2881 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2882 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2883 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2884 applyDefaultMapping(OpdMapper);
2885 executeInWaterfallLoop(MI, MRI, {2, 5});
2886 return;
2888 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2889 applyDefaultMapping(OpdMapper);
2890 executeInWaterfallLoop(MI, MRI, {3, 6});
2891 return;
2893 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2894 applyMappingSBufferLoad(OpdMapper);
2895 return;
2897 case AMDGPU::G_INTRINSIC: {
2898 switch (MI.getIntrinsicID()) {
2899 case Intrinsic::amdgcn_readlane: {
2900 substituteSimpleCopyRegs(OpdMapper, 2);
2902 assert(OpdMapper.getVRegs(0).empty());
2903 assert(OpdMapper.getVRegs(3).empty());
2905 // Make sure the index is an SGPR. It doesn't make sense to run this in a
2906 // waterfall loop, so assume it's a uniform value.
2907 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2908 return;
2910 case Intrinsic::amdgcn_writelane: {
2911 assert(OpdMapper.getVRegs(0).empty());
2912 assert(OpdMapper.getVRegs(2).empty());
2913 assert(OpdMapper.getVRegs(3).empty());
2915 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2916 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2917 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2918 return;
2920 case Intrinsic::amdgcn_interp_p1:
2921 case Intrinsic::amdgcn_interp_p2:
2922 case Intrinsic::amdgcn_interp_mov:
2923 case Intrinsic::amdgcn_interp_p1_f16:
2924 case Intrinsic::amdgcn_interp_p2_f16: {
2925 applyDefaultMapping(OpdMapper);
2927 // Readlane for m0 value, which is always the last operand.
2928 // FIXME: Should this be a waterfall loop instead?
2929 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
2930 return;
2932 case Intrinsic::amdgcn_permlane16:
2933 case Intrinsic::amdgcn_permlanex16: {
2934 // Doing a waterfall loop over these wouldn't make any sense.
2935 substituteSimpleCopyRegs(OpdMapper, 2);
2936 substituteSimpleCopyRegs(OpdMapper, 3);
2937 constrainOpWithReadfirstlane(MI, MRI, 4);
2938 constrainOpWithReadfirstlane(MI, MRI, 5);
2939 return;
2941 case Intrinsic::amdgcn_sbfe:
2942 applyMappingBFE(OpdMapper, true);
2943 return;
2944 case Intrinsic::amdgcn_ubfe:
2945 applyMappingBFE(OpdMapper, false);
2946 return;
2947 case Intrinsic::amdgcn_ballot:
2948 // Use default handling and insert copy to vcc source.
2949 break;
2951 break;
2953 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2954 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
2955 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
2956 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
2957 const AMDGPU::RsrcIntrinsic *RSrcIntrin
2958 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
2959 assert(RSrcIntrin && RSrcIntrin->IsImage);
2960 // Non-images can have complications from operands that allow both SGPR
2961 // and VGPR. For now it's too complicated to figure out the final opcode
2962 // to derive the register bank from the MCInstrDesc.
2963 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
2964 return;
2966 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
2967 unsigned N = MI.getNumExplicitOperands() - 2;
2968 applyDefaultMapping(OpdMapper);
2969 executeInWaterfallLoop(MI, MRI, { N });
2970 return;
2972 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2973 auto IntrID = MI.getIntrinsicID();
2974 switch (IntrID) {
2975 case Intrinsic::amdgcn_ds_ordered_add:
2976 case Intrinsic::amdgcn_ds_ordered_swap: {
2977 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
2978 assert(OpdMapper.getVRegs(0).empty());
2979 substituteSimpleCopyRegs(OpdMapper, 3);
2980 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2981 return;
2983 case Intrinsic::amdgcn_ds_gws_init:
2984 case Intrinsic::amdgcn_ds_gws_barrier:
2985 case Intrinsic::amdgcn_ds_gws_sema_br: {
2986       // Only the first lane executes, so readfirstlane is safe.
2987 substituteSimpleCopyRegs(OpdMapper, 1);
2988 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2989 return;
2991 case Intrinsic::amdgcn_ds_gws_sema_v:
2992 case Intrinsic::amdgcn_ds_gws_sema_p:
2993 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
2995       // Only the first lane executes, so readfirstlane is safe.
2995 constrainOpWithReadfirstlane(MI, MRI, 1); // M0
2996 return;
2998 case Intrinsic::amdgcn_ds_append:
2999 case Intrinsic::amdgcn_ds_consume: {
3000 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3001 return;
3003 case Intrinsic::amdgcn_s_sendmsg:
3004 case Intrinsic::amdgcn_s_sendmsghalt: {
3005 // FIXME: Should this use a waterfall loop?
3006 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3007 return;
3009 case Intrinsic::amdgcn_s_setreg: {
3010 constrainOpWithReadfirstlane(MI, MRI, 2);
3011 return;
3013 default: {
3014 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3015 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3016 // Non-images can have complications from operands that allow both SGPR
3017 // and VGPR. For now it's too complicated to figure out the final opcode
3018 // to derive the register bank from the MCInstrDesc.
3019 if (RSrcIntrin->IsImage) {
3020 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3021 return;
3025 break;
3028 break;
3030 case AMDGPU::G_SI_CALL: {
3031 // Use a set to avoid extra readfirstlanes in the case where multiple
3032 // operands are the same register.
3033 SmallSet<Register, 4> SGPROperandRegs;
3035 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3036 break;
3038     // Move all copies to physical SGPRs that are used by the call instruction
3039     // into the loop block. Search backwards from the call for these copies,
3040     // stopping at the ADJCALLSTACKUP.
3041 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3042 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3044 // Move all non-copies before the copies, so that a complete range can be
3045 // moved into the waterfall loop.
3046 SmallVector<MachineInstr *, 4> NonCopyInstrs;
3047 // Count of NonCopyInstrs found until the current LastCopy.
3048 unsigned NonCopyInstrsLen = 0;
3049 MachineBasicBlock::iterator Start(&MI);
3050 MachineBasicBlock::iterator LastCopy = Start;
3051 MachineBasicBlock *MBB = MI.getParent();
3052 const SIMachineFunctionInfo *Info =
3053 MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3054 while (Start->getOpcode() != FrameSetupOpcode) {
3055 --Start;
3056 bool IsCopy = false;
3057 if (Start->getOpcode() == AMDGPU::COPY) {
3058 auto &Dst = Start->getOperand(0);
3059 if (Dst.isReg()) {
3060 Register Reg = Dst.getReg();
3061 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3062 IsCopy = true;
3063 } else {
3064 // Also move the copy from the scratch rsrc descriptor into the loop
3065 // to allow it to be optimized away.
3066 auto &Src = Start->getOperand(1);
3067 if (Src.isReg()) {
3068 Reg = Src.getReg();
3069 IsCopy = Info->getScratchRSrcReg() == Reg;
3075 if (IsCopy) {
3076 LastCopy = Start;
3077 NonCopyInstrsLen = NonCopyInstrs.size();
3078 } else {
3079 NonCopyInstrs.push_back(&*Start);
3082 NonCopyInstrs.resize(NonCopyInstrsLen);
3084 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3085 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3087 Start = LastCopy;
3089 // Do the same for copies after the loop
3090 NonCopyInstrs.clear();
3091 NonCopyInstrsLen = 0;
3092 MachineBasicBlock::iterator End(&MI);
3093 LastCopy = End;
3094 while (End->getOpcode() != FrameDestroyOpcode) {
3095 ++End;
3096 bool IsCopy = false;
3097 if (End->getOpcode() == AMDGPU::COPY) {
3098 auto &Src = End->getOperand(1);
3099 if (Src.isReg()) {
3100 Register Reg = Src.getReg();
3101 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3105 if (IsCopy) {
3106 LastCopy = End;
3107 NonCopyInstrsLen = NonCopyInstrs.size();
3108 } else {
3109 NonCopyInstrs.push_back(&*End);
3112 NonCopyInstrs.resize(NonCopyInstrsLen);
3114 End = LastCopy;
3115 ++LastCopy;
3116 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3117 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3120 ++End;
3121 MachineIRBuilder B(*Start);
3122 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
3123 break;
3125 case AMDGPU::G_LOAD:
3126 case AMDGPU::G_ZEXTLOAD:
3127 case AMDGPU::G_SEXTLOAD: {
3128 if (applyMappingLoad(MI, OpdMapper, MRI))
3129 return;
3130 break;
3132 case AMDGPU::G_DYN_STACKALLOC:
3133 applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3134 return;
3135 case AMDGPU::G_SBFX:
3136 applyMappingBFE(OpdMapper, /*Signed*/ true);
3137 return;
3138 case AMDGPU::G_UBFX:
3139 applyMappingBFE(OpdMapper, /*Signed*/ false);
3140 return;
3141 default:
3142 break;
3145 return applyDefaultMapping(OpdMapper);
3148 // vgpr, sgpr -> vgpr
3149 // vgpr, agpr -> vgpr
3150 // agpr, agpr -> agpr
3151 // agpr, sgpr -> vgpr
3152 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3153 if (RB0 == AMDGPU::InvalidRegBankID)
3154 return RB1;
3155 if (RB1 == AMDGPU::InvalidRegBankID)
3156 return RB0;
3158 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3159 return AMDGPU::SGPRRegBankID;
3161 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3162 return AMDGPU::AGPRRegBankID;
3164 return AMDGPU::VGPRRegBankID;
3167 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3168 if (RB0 == AMDGPU::InvalidRegBankID)
3169 return RB1;
3170 if (RB1 == AMDGPU::InvalidRegBankID)
3171 return RB0;
3173 // vcc, vcc -> vcc
3174 // vcc, sgpr -> vcc
3175 // vcc, vgpr -> vcc
3176 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3177 return AMDGPU::VCCRegBankID;
3179   // Neither bank is vcc at this point, so fall back to the generic bank union.
3180 return regBankUnion(RB0, RB1);
3183 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3184 const MachineInstr &MI) const {
3185 unsigned RegBank = AMDGPU::InvalidRegBankID;
3187 for (const MachineOperand &MO : MI.operands()) {
3188 if (!MO.isReg())
3189 continue;
3190 Register Reg = MO.getReg();
3191 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3192 RegBank = regBankUnion(RegBank, Bank->getID());
3193 if (RegBank == AMDGPU::VGPRRegBankID)
3194 break;
3198 return RegBank;
3201 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3202 const MachineFunction &MF = *MI.getParent()->getParent();
3203 const MachineRegisterInfo &MRI = MF.getRegInfo();
3204 for (const MachineOperand &MO : MI.operands()) {
3205 if (!MO.isReg())
3206 continue;
3207 Register Reg = MO.getReg();
3208 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3209 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3210 return false;
3213 return true;
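// Used when isSALUMapping has shown that all register operands are already
// SGPRs: each operand is simply given the SGPR bank at its size.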
3216 const RegisterBankInfo::InstructionMapping &
3217 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3218 const MachineFunction &MF = *MI.getParent()->getParent();
3219 const MachineRegisterInfo &MRI = MF.getRegInfo();
3220 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3222 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3223 const MachineOperand &SrcOp = MI.getOperand(i);
3224 if (!SrcOp.isReg())
3225 continue;
3227 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3228 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3230 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3231 MI.getNumOperands());
3234 const RegisterBankInfo::InstructionMapping &
3235 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3236 const MachineFunction &MF = *MI.getParent()->getParent();
3237 const MachineRegisterInfo &MRI = MF.getRegInfo();
3238 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3240 // Even though we technically could use SGPRs, this would require knowledge of
3241 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3243 // TODO: Unary ops are trivially OK, so accept SGPRs?
3244 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3245 const MachineOperand &Src = MI.getOperand(i);
3246 if (!Src.isReg())
3247 continue;
3249 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3250 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3251 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3254 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3255 MI.getNumOperands());
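// Unlike getDefaultMappingVOP above, s1 values are not special-cased to the
// vcc bank here; every register operand, whatever its size, is forced to VGPR.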
3258 const RegisterBankInfo::InstructionMapping &
3259 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3260 const MachineFunction &MF = *MI.getParent()->getParent();
3261 const MachineRegisterInfo &MRI = MF.getRegInfo();
3262 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3264 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3265 const MachineOperand &Op = MI.getOperand(I);
3266 if (!Op.isReg())
3267 continue;
3269 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3270 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3273 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3274 MI.getNumOperands());
3277 const RegisterBankInfo::InstructionMapping &
3278 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3279 const MachineInstr &MI,
3280 int RsrcIdx) const {
3281 // The reported argument index is relative to the IR intrinsic call arguments,
3282 // so we need to shift by the number of defs and the intrinsic ID.
3283 RsrcIdx += MI.getNumExplicitDefs() + 1;
3285 const int NumOps = MI.getNumOperands();
3286 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3288 // TODO: Should packed/unpacked D16 difference be reported here as part of
3289 // the value mapping?
3290 for (int I = 0; I != NumOps; ++I) {
3291 if (!MI.getOperand(I).isReg())
3292 continue;
3294 Register OpReg = MI.getOperand(I).getReg();
3295 // We replace some dead address operands with $noreg
3296 if (!OpReg)
3297 continue;
3299 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3301 // FIXME: Probably need a new intrinsic register bank searchable table to
3302 // handle arbitrary intrinsics easily.
3304 // If this has a sampler, it immediately follows rsrc.
3305 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3307 if (MustBeSGPR) {
3308 // If this must be an SGPR, report whatever bank it currently has as legal.
3309 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3310 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3311 } else {
3312 // Some operands must be VGPR, and these are easy to copy to.
3313 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3317 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3320 /// Return the mapping for a pointer argument.
3321 const RegisterBankInfo::ValueMapping *
3322 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3323 Register PtrReg) const {
3324 LLT PtrTy = MRI.getType(PtrReg);
3325 unsigned Size = PtrTy.getSizeInBits();
3326 if (Subtarget.useFlatForGlobal() ||
3327 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3328 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3330 // If we're using MUBUF instructions for global memory, an SGPR base register
3331 // is possible. Otherwise this needs to be a VGPR.
3332 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3333 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3336 const RegisterBankInfo::InstructionMapping &
3337 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3339 const MachineFunction &MF = *MI.getParent()->getParent();
3340 const MachineRegisterInfo &MRI = MF.getRegInfo();
3341 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3342 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3343 Register PtrReg = MI.getOperand(1).getReg();
3344 LLT PtrTy = MRI.getType(PtrReg);
3345 unsigned AS = PtrTy.getAddressSpace();
3346 unsigned PtrSize = PtrTy.getSizeInBits();
3348 const ValueMapping *ValMapping;
3349 const ValueMapping *PtrMapping;
3351 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3353 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3354 if (isScalarLoadLegal(MI)) {
3355 // We have a uniform instruction so we want to use an SMRD load
3356 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3357 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3358 } else {
3359 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3361 // If we're using MUBUF instructions for global memory, an SGPR base
3362 // register is possible. Otherwise this needs to be a VGPR.
3363 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3364 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3366 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3368 } else {
3369 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3370 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3373 OpdsMapping[0] = ValMapping;
3374 OpdsMapping[1] = PtrMapping;
3375 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3376 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3377 return Mapping;
3379 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3380 // handle that during instruction selection?
3383 unsigned
3384 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3385 const MachineRegisterInfo &MRI,
3386 unsigned Default) const {
3387 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3388 return Bank ? Bank->getID() : Default;
3391 const RegisterBankInfo::ValueMapping *
3392 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3393 const MachineRegisterInfo &MRI,
3394 const TargetRegisterInfo &TRI) const {
3395 // Lie and claim anything is legal, even though this needs to be an SGPR;
3396 // applyMapping will have to deal with it as a waterfall loop.
3397 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3398 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3399 return AMDGPU::getValueMapping(Bank, Size);
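// VGPR operands are the easy case: any bank can be copied into a VGPR, so
// always report the VGPR bank at the operand's size.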
3402 const RegisterBankInfo::ValueMapping *
3403 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3404 const MachineRegisterInfo &MRI,
3405 const TargetRegisterInfo &TRI) const {
3406 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3407 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3410 const RegisterBankInfo::ValueMapping *
3411 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3412 const MachineRegisterInfo &MRI,
3413 const TargetRegisterInfo &TRI) const {
3414 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3415 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3419 /// This function must return a legal mapping, because
3420 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3421 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3422 /// VGPR-to-SGPR copy to be generated is illegal.
3424 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3425 // legal. These will be dealt with in applyMappingImpl.
3427 const RegisterBankInfo::InstructionMapping &
3428 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3429 const MachineFunction &MF = *MI.getParent()->getParent();
3430 const MachineRegisterInfo &MRI = MF.getRegInfo();
3432 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3433 // The default logic bothers to analyze impossible alternative mappings. We
3434 // want the most straightforward mapping, so just directly handle this.
3435 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3436 *TRI);
3437 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3438 *TRI);
3439 assert(SrcBank && "src bank should have been assigned already");
3440 if (!DstBank)
3441 DstBank = SrcBank;
3443 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3444 if (cannotCopy(*DstBank, *SrcBank, Size))
3445 return getInvalidInstructionMapping();
3447 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3448 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3449 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3450 OpdsMapping[0] = &ValMap;
3451 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3452 OpdsMapping[1] = &ValMap;
3454 return getInstructionMapping(
3455 1, /*Cost*/ 1,
3456 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3459 if (MI.isRegSequence()) {
3460 // If any input is a VGPR, the result must be a VGPR. The default handling
3461 // assumes any copy between banks is legal.
3462 unsigned BankID = AMDGPU::SGPRRegBankID;
3464 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3465 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3466 // It doesn't make sense to use vcc or scc banks here, so just ignore
3467 // them.
3468 if (OpBank != AMDGPU::SGPRRegBankID) {
3469 BankID = AMDGPU::VGPRRegBankID;
3470 break;
3473 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3475 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3476 return getInstructionMapping(
3477 1, /*Cost*/ 1,
3478 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3481 // The default handling is broken and doesn't handle illegal VGPR->SGPR copies
3482 // properly.
3484 // TODO: There are additional exec masking dependencies to analyze.
3485 if (MI.getOpcode() == TargetOpcode::G_PHI) {
3486 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3487 Register DstReg = MI.getOperand(0).getReg();
3489 // Sometimes the result may have already been assigned a bank.
3490 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3491 ResultBank = DstBank->getID();
3493 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3494 Register Reg = MI.getOperand(I).getReg();
3495 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3497 // FIXME: Assuming VGPR for any undetermined inputs.
3498 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3499 ResultBank = AMDGPU::VGPRRegBankID;
3500 break;
3503 // FIXME: Need to promote SGPR case to s32
3504 unsigned OpBank = Bank->getID();
3505 ResultBank = regBankBoolUnion(ResultBank, OpBank);
3508 assert(ResultBank != AMDGPU::InvalidRegBankID);
3510 unsigned Size = MRI.getType(DstReg).getSizeInBits();
3512 const ValueMapping &ValMap =
3513 getValueMapping(0, Size, getRegBank(ResultBank));
3514 return getInstructionMapping(
3515 1, /*Cost*/ 1,
3516 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3519 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3520 if (Mapping.isValid())
3521 return Mapping;
3523 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3525 switch (MI.getOpcode()) {
3526 default:
3527 return getInvalidInstructionMapping();
3529 case AMDGPU::G_AND:
3530 case AMDGPU::G_OR:
3531 case AMDGPU::G_XOR: {
3532 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3533 if (Size == 1) {
3534 const RegisterBank *DstBank
3535 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3537 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3538 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3539 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3540 if (DstBank) {
3541 TargetBankID = DstBank->getID();
3542 if (DstBank == &AMDGPU::VCCRegBank) {
3543 TargetBankID = AMDGPU::VCCRegBankID;
3544 BankLHS = AMDGPU::VCCRegBankID;
3545 BankRHS = AMDGPU::VCCRegBankID;
3546 } else {
3547 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3548 AMDGPU::SGPRRegBankID);
3549 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3550 AMDGPU::SGPRRegBankID);
3552 } else {
3553 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3554 AMDGPU::VCCRegBankID);
3555 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3556 AMDGPU::VCCRegBankID);
3558 // Both inputs should be true booleans to produce a boolean result.
3559 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3560 TargetBankID = AMDGPU::VGPRRegBankID;
3561 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3562 TargetBankID = AMDGPU::VCCRegBankID;
3563 BankLHS = AMDGPU::VCCRegBankID;
3564 BankRHS = AMDGPU::VCCRegBankID;
3565 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3566 TargetBankID = AMDGPU::SGPRRegBankID;
3570 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3571 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3572 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3573 break;
3576 if (Size == 64) {
3578 if (isSALUMapping(MI)) {
3579 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3580 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3581 } else {
3582 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3583 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3584 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3586 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3587 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3590 break;
3593 LLVM_FALLTHROUGH;
3595 case AMDGPU::G_PTR_ADD:
3596 case AMDGPU::G_PTRMASK:
3597 case AMDGPU::G_ADD:
3598 case AMDGPU::G_SUB:
3599 case AMDGPU::G_MUL:
3600 case AMDGPU::G_SHL:
3601 case AMDGPU::G_LSHR:
3602 case AMDGPU::G_ASHR:
3603 case AMDGPU::G_UADDO:
3604 case AMDGPU::G_USUBO:
3605 case AMDGPU::G_UADDE:
3606 case AMDGPU::G_SADDE:
3607 case AMDGPU::G_USUBE:
3608 case AMDGPU::G_SSUBE:
3609 case AMDGPU::G_SMIN:
3610 case AMDGPU::G_SMAX:
3611 case AMDGPU::G_UMIN:
3612 case AMDGPU::G_UMAX:
3613 case AMDGPU::G_ABS:
3614 case AMDGPU::G_SHUFFLE_VECTOR:
3615 case AMDGPU::G_SBFX:
3616 case AMDGPU::G_UBFX:
3617 if (isSALUMapping(MI))
3618 return getDefaultMappingSOP(MI);
3619 LLVM_FALLTHROUGH;
3621 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3622 case AMDGPU::G_SSUBSAT:
3623 case AMDGPU::G_UADDSAT:
3624 case AMDGPU::G_USUBSAT:
3625 case AMDGPU::G_FADD:
3626 case AMDGPU::G_FSUB:
3627 case AMDGPU::G_FPTOSI:
3628 case AMDGPU::G_FPTOUI:
3629 case AMDGPU::G_FMUL:
3630 case AMDGPU::G_FMA:
3631 case AMDGPU::G_FMAD:
3632 case AMDGPU::G_FSQRT:
3633 case AMDGPU::G_FFLOOR:
3634 case AMDGPU::G_FCEIL:
3635 case AMDGPU::G_FRINT:
3636 case AMDGPU::G_SITOFP:
3637 case AMDGPU::G_UITOFP:
3638 case AMDGPU::G_FPTRUNC:
3639 case AMDGPU::G_FPEXT:
3640 case AMDGPU::G_FEXP2:
3641 case AMDGPU::G_FLOG2:
3642 case AMDGPU::G_FMINNUM:
3643 case AMDGPU::G_FMAXNUM:
3644 case AMDGPU::G_FMINNUM_IEEE:
3645 case AMDGPU::G_FMAXNUM_IEEE:
3646 case AMDGPU::G_FCANONICALIZE:
3647 case AMDGPU::G_INTRINSIC_TRUNC:
3648 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3649 case AMDGPU::G_FSHR: // TODO: Expand for scalar
3650 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3651 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3652 case AMDGPU::G_AMDGPU_RCP_IFLAG:
3653 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3654 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3655 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3656 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3657 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3658 case AMDGPU::G_AMDGPU_SMED3:
3659 return getDefaultMappingVOP(MI);
3660 case AMDGPU::G_UMULH:
3661 case AMDGPU::G_SMULH: {
3662 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3663 return getDefaultMappingSOP(MI);
3664 return getDefaultMappingVOP(MI);
3666 case AMDGPU::G_IMPLICIT_DEF: {
3667 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3668 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3669 break;
3671 case AMDGPU::G_FCONSTANT:
3672 case AMDGPU::G_CONSTANT:
3673 case AMDGPU::G_GLOBAL_VALUE:
3674 case AMDGPU::G_BLOCK_ADDR:
3675 case AMDGPU::G_READCYCLECOUNTER: {
3676 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3677 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3678 break;
3680 case AMDGPU::G_FRAME_INDEX: {
3681 // TODO: This should be the same as other constants, but eliminateFrameIndex
3682 // currently assumes VALU uses.
3683 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3684 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3685 break;
3687 case AMDGPU::G_DYN_STACKALLOC: {
3688 // Result is always uniform, and a wave reduction is needed for the source.
3689 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3690 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3691 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3692 break;
3694 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
3695 // This case is weird because we expect a physical register in the source,
3696 // but need to set a bank anyway.
3698 // We could select the result to SGPR or VGPR, but for the one current use
3699 // it's more practical to always use VGPR.
3700 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3701 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3702 break;
3704 case AMDGPU::G_INSERT: {
3705 unsigned BankID = getMappingType(MRI, MI);
3706 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3707 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3708 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3709 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3710 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3711 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3712 OpdsMapping[3] = nullptr;
3713 break;
3715 case AMDGPU::G_EXTRACT: {
3716 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3717 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3718 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3719 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3720 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3721 OpdsMapping[2] = nullptr;
3722 break;
3724 case AMDGPU::G_BUILD_VECTOR:
3725 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
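// A v2s16 build_vector keeps each source in whatever bank it already has and
// gives the result the union of the two; other types fall through to the
// generic merge handling below.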
3726 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3727 if (DstTy == LLT::fixed_vector(2, 16)) {
3728 unsigned DstSize = DstTy.getSizeInBits();
3729 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3730 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3731 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3732 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3734 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3735 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3736 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3737 break;
3740 LLVM_FALLTHROUGH;
3742 case AMDGPU::G_MERGE_VALUES:
3743 case AMDGPU::G_CONCAT_VECTORS: {
3744 unsigned Bank = getMappingType(MRI, MI);
3745 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3746 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3748 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3749 // Op1 and Dst should use the same register bank.
3750 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3751 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3752 break;
3754 case AMDGPU::G_BITREVERSE:
3755 case AMDGPU::G_BITCAST:
3756 case AMDGPU::G_INTTOPTR:
3757 case AMDGPU::G_PTRTOINT:
3758 case AMDGPU::G_FABS:
3759 case AMDGPU::G_FNEG: {
3760 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3761 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3762 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3763 break;
3765 case AMDGPU::G_AMDGPU_FFBH_U32:
3766 case AMDGPU::G_AMDGPU_FFBL_B32:
3767 case AMDGPU::G_CTLZ_ZERO_UNDEF:
3768 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
3769 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3770 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3771 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3772 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
3773 break;
3775 case AMDGPU::G_CTPOP: {
3776 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3777 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3778 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3780 // This should really be getValueMappingSGPR64Only, but allowing the generic
3781 // code to handle the register split just makes using LegalizerHelper more
3782 // difficult.
3783 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3784 break;
3786 case AMDGPU::G_TRUNC: {
3787 Register Dst = MI.getOperand(0).getReg();
3788 Register Src = MI.getOperand(1).getReg();
3789 unsigned Bank = getRegBankID(Src, MRI);
3790 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3791 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3792 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3793 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3794 break;
3796 case AMDGPU::G_ZEXT:
3797 case AMDGPU::G_SEXT:
3798 case AMDGPU::G_ANYEXT:
3799 case AMDGPU::G_SEXT_INREG: {
3800 Register Dst = MI.getOperand(0).getReg();
3801 Register Src = MI.getOperand(1).getReg();
3802 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3803 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3805 unsigned DstBank;
3806 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3807 assert(SrcBank);
3808 switch (SrcBank->getID()) {
3809 case AMDGPU::SGPRRegBankID:
3810 DstBank = AMDGPU::SGPRRegBankID;
3811 break;
3812 default:
3813 DstBank = AMDGPU::VGPRRegBankID;
3814 break;
3817 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3818 // 32-bits, and then to 64.
3819 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3820 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3821 SrcSize);
3822 break;
3824 case AMDGPU::G_FCMP: {
3825 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3826 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3827 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3828 OpdsMapping[1] = nullptr; // Predicate Operand.
3829 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3830 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3831 break;
3833 case AMDGPU::G_STORE: {
3834 assert(MI.getOperand(0).isReg());
3835 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3837 // FIXME: We need to specify a different reg bank once scalar stores are
3838 // supported.
3839 const ValueMapping *ValMapping =
3840 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3841 OpdsMapping[0] = ValMapping;
3842 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3843 break;
3845 case AMDGPU::G_ICMP: {
3846 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3847 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3849 // See if the result register has already been constrained to vcc, which may
3850 // happen due to control flow intrinsic lowering.
3851 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3852 AMDGPU::SGPRRegBankID);
3853 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3854 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
3856 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3857 Op2Bank == AMDGPU::SGPRRegBankID &&
3858 Op3Bank == AMDGPU::SGPRRegBankID &&
3859 (Size == 32 || (Size == 64 &&
3860 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3861 Subtarget.hasScalarCompareEq64()));
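// If SCC cannot be used, the compare has to go to the VALU: the sources
// become VGPRs and the result is a lane mask in the vcc bank.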
3863 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3864 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3866 // TODO: Use 32-bit for scalar output size.
3867 // SCC results will need to be copied to a 32-bit SGPR virtual register.
3868 const unsigned ResultSize = 1;
3870 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3871 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3872 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3873 break;
3875 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3876 // A VGPR index can be handled with a waterfall loop when indexing an SGPR vector.
3877 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3878 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3879 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3880 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3881 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3882 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3884 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3885 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3887 // The index can be in either bank if the source vector is a VGPR.
3888 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3889 break;
3891 case AMDGPU::G_INSERT_VECTOR_ELT: {
3892 unsigned OutputBankID = isSALUMapping(MI) ?
3893 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3895 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3896 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3897 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3898 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3899 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
3901 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3902 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3904 // This is a weird case, because we need to break down the mapping based on
3905 // the register bank of a different operand.
3906 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3907 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3908 InsertSize);
3909 } else {
3910 assert(InsertSize == 32 || InsertSize == 64);
3911 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3914 // The index can be in either bank if the source vector is a VGPR.
3915 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3916 break;
3918 case AMDGPU::G_UNMERGE_VALUES: {
3919 unsigned Bank = getMappingType(MRI, MI);
3921 // Op1 and Dst should use the same register bank.
3922 // FIXME: Shouldn't this be the default? Why do we need to handle this?
3923 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3924 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3925 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3927 break;
3929 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3930 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3931 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3932 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3933 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3934 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3935 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3936 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3937 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3938 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3939 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3940 case AMDGPU::G_AMDGPU_BUFFER_STORE:
3941 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3942 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3943 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3944 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3945 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3947 // rsrc
3948 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3950 // vindex
3951 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3953 // voffset
3954 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3956 // soffset
3957 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3959 // Any remaining operands are immediates and were correctly null
3960 // initialized.
3961 break;
3963 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3964 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3965 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3966 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3967 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3968 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3969 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3970 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3971 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3972 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3973 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3974 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3975 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3976 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3977 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3978 // vdata_out
3979 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3981 // vdata_in
3982 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3984 // rsrc
3985 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3987 // vindex
3988 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3990 // voffset
3991 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3993 // soffset
3994 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3996 // Any remaining operands are immediates and were correctly null
3997 // initialized.
3998 break;
4000 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4001 // vdata_out
4002 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4004 // vdata_in
4005 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4007 // cmp
4008 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4010 // rsrc
4011 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4013 // vindex
4014 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4016 // voffset
4017 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4019 // soffset
4020 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4022 // Any remaining operands are immediates and were correctly null
4023 // initialized.
4024 break;
4026 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4027 // Lie and claim everything is legal, even though some need to be
4028 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4029 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4030 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4032 // We need to convert this to a MUBUF if either the resource or the offset
4033 // is a VGPR.
4034 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4035 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4036 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4038 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4039 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4040 break;
4042 case AMDGPU::G_INTRINSIC: {
4043 switch (MI.getIntrinsicID()) {
4044 default:
4045 return getInvalidInstructionMapping();
4046 case Intrinsic::amdgcn_div_fmas:
4047 case Intrinsic::amdgcn_div_fixup:
4048 case Intrinsic::amdgcn_trig_preop:
4049 case Intrinsic::amdgcn_sin:
4050 case Intrinsic::amdgcn_cos:
4051 case Intrinsic::amdgcn_log_clamp:
4052 case Intrinsic::amdgcn_rcp:
4053 case Intrinsic::amdgcn_rcp_legacy:
4054 case Intrinsic::amdgcn_sqrt:
4055 case Intrinsic::amdgcn_rsq:
4056 case Intrinsic::amdgcn_rsq_legacy:
4057 case Intrinsic::amdgcn_rsq_clamp:
4058 case Intrinsic::amdgcn_fmul_legacy:
4059 case Intrinsic::amdgcn_fma_legacy:
4060 case Intrinsic::amdgcn_ldexp:
4061 case Intrinsic::amdgcn_frexp_mant:
4062 case Intrinsic::amdgcn_frexp_exp:
4063 case Intrinsic::amdgcn_fract:
4064 case Intrinsic::amdgcn_cvt_pkrtz:
4065 case Intrinsic::amdgcn_cvt_pknorm_i16:
4066 case Intrinsic::amdgcn_cvt_pknorm_u16:
4067 case Intrinsic::amdgcn_cvt_pk_i16:
4068 case Intrinsic::amdgcn_cvt_pk_u16:
4069 case Intrinsic::amdgcn_fmed3:
4070 case Intrinsic::amdgcn_cubeid:
4071 case Intrinsic::amdgcn_cubema:
4072 case Intrinsic::amdgcn_cubesc:
4073 case Intrinsic::amdgcn_cubetc:
4074 case Intrinsic::amdgcn_sffbh:
4075 case Intrinsic::amdgcn_fmad_ftz:
4076 case Intrinsic::amdgcn_mbcnt_lo:
4077 case Intrinsic::amdgcn_mbcnt_hi:
4078 case Intrinsic::amdgcn_mul_u24:
4079 case Intrinsic::amdgcn_mul_i24:
4080 case Intrinsic::amdgcn_mulhi_u24:
4081 case Intrinsic::amdgcn_mulhi_i24:
4082 case Intrinsic::amdgcn_lerp:
4083 case Intrinsic::amdgcn_sad_u8:
4084 case Intrinsic::amdgcn_msad_u8:
4085 case Intrinsic::amdgcn_sad_hi_u8:
4086 case Intrinsic::amdgcn_sad_u16:
4087 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4088 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4089 case Intrinsic::amdgcn_mqsad_u32_u8:
4090 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4091 case Intrinsic::amdgcn_alignbyte:
4092 case Intrinsic::amdgcn_perm:
4093 case Intrinsic::amdgcn_fdot2:
4094 case Intrinsic::amdgcn_sdot2:
4095 case Intrinsic::amdgcn_udot2:
4096 case Intrinsic::amdgcn_sdot4:
4097 case Intrinsic::amdgcn_udot4:
4098 case Intrinsic::amdgcn_sdot8:
4099 case Intrinsic::amdgcn_udot8:
4100 return getDefaultMappingVOP(MI);
4101 case Intrinsic::amdgcn_sbfe:
4102 case Intrinsic::amdgcn_ubfe:
4103 if (isSALUMapping(MI))
4104 return getDefaultMappingSOP(MI);
4105 return getDefaultMappingVOP(MI);
4106 case Intrinsic::amdgcn_ds_swizzle:
4107 case Intrinsic::amdgcn_ds_permute:
4108 case Intrinsic::amdgcn_ds_bpermute:
4109 case Intrinsic::amdgcn_update_dpp:
4110 case Intrinsic::amdgcn_mov_dpp8:
4111 case Intrinsic::amdgcn_mov_dpp:
4112 case Intrinsic::amdgcn_strict_wwm:
4113 case Intrinsic::amdgcn_wwm:
4114 case Intrinsic::amdgcn_strict_wqm:
4115 case Intrinsic::amdgcn_wqm:
4116 case Intrinsic::amdgcn_softwqm:
4117 case Intrinsic::amdgcn_set_inactive:
4118 return getDefaultMappingAllVGPR(MI);
4119 case Intrinsic::amdgcn_kernarg_segment_ptr:
4120 case Intrinsic::amdgcn_s_getpc:
4121 case Intrinsic::amdgcn_groupstaticsize:
4122 case Intrinsic::amdgcn_reloc_constant:
4123 case Intrinsic::returnaddress: {
4124 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4125 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4126 break;
4128 case Intrinsic::amdgcn_wqm_vote: {
4129 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4130 OpdsMapping[0] = OpdsMapping[2]
4131 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4132 break;
4134 case Intrinsic::amdgcn_ps_live: {
4135 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4136 break;
4138 case Intrinsic::amdgcn_div_scale: {
4139 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4140 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4141 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4142 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4144 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4145 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4146 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4147 break;
4149 case Intrinsic::amdgcn_class: {
4150 Register Src0Reg = MI.getOperand(2).getReg();
4151 Register Src1Reg = MI.getOperand(3).getReg();
4152 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4153 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4154 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4155 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4156 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4157 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4158 break;
4160 case Intrinsic::amdgcn_icmp:
4161 case Intrinsic::amdgcn_fcmp: {
4162 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4163 // This is not VCCRegBank because this is not used in boolean contexts.
4164 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4165 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4166 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4167 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4168 break;
4170 case Intrinsic::amdgcn_readlane: {
4171 // This must be an SGPR, but accept a VGPR.
4172 Register IdxReg = MI.getOperand(3).getReg();
4173 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4174 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4175 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4176 LLVM_FALLTHROUGH;
4178 case Intrinsic::amdgcn_readfirstlane: {
4179 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4180 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4181 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4182 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4183 break;
4185 case Intrinsic::amdgcn_writelane: {
4186 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4187 Register SrcReg = MI.getOperand(2).getReg();
4188 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4189 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4190 Register IdxReg = MI.getOperand(3).getReg();
4191 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4192 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4193 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4195 // These two must be SGPRs, but accept VGPRs; a readfirstlane will be
4196 // inserted to legalize them.
4197 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4198 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4199 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4200 break;
4202 case Intrinsic::amdgcn_if_break: {
4203 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4204 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4205 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4206 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4207 break;
4209 case Intrinsic::amdgcn_permlane16:
4210 case Intrinsic::amdgcn_permlanex16: {
4211 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4212 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4213 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4214 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4215 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4216 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4217 break;
4219 case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4220 case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4221 case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4222 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4223 case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4224 case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4225 case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4226 case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4227 case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4228 case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4229 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4230 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4231 case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4232 case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4233 case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4234 case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4235 case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4236 case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4237 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4238 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4239 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4240 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4241 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4242 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4243 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4244 case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4245 case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
4246 // Default for MAI intrinsics.
4247 // srcC can also be an immediate which can be folded later.
4248 // FIXME: Should we eventually add an alternative mapping with AGPR src
4249 // for srcA/srcB?
4251 // vdst, srcA, srcB, srcC
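// If the function is known not to need AGPRs, map vdst and srcC to VGPRs so
// the VGPR (non-AGPR) forms of the MFMA instructions can be selected; srcA
// and srcB are plain VGPR operands either way.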
4252 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4253 OpdsMapping[0] =
4254 Info->mayNeedAGPRs()
4255 ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4256 : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4257 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4258 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4259 OpdsMapping[4] =
4260 Info->mayNeedAGPRs()
4261 ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4262 : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4263 break;
4265 case Intrinsic::amdgcn_interp_p1:
4266 case Intrinsic::amdgcn_interp_p2:
4267 case Intrinsic::amdgcn_interp_mov:
4268 case Intrinsic::amdgcn_interp_p1_f16:
4269 case Intrinsic::amdgcn_interp_p2_f16: {
4270 const int M0Idx = MI.getNumOperands() - 1;
4271 Register M0Reg = MI.getOperand(M0Idx).getReg();
4272 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4273 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4275 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4276 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4277 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4279 // This must be an SGPR, but take whatever the original bank is and fix it
4280 // later.
4281 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4282 break;
4284 case Intrinsic::amdgcn_ballot: {
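// The ballot result is a wave-wide mask held in SGPRs, while the source is a
// per-lane boolean and therefore uses the vcc bank.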
4285 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4286 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4287 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4288 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4289 break;
4292 break;
4294 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4295 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4296 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4297 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4298 auto IntrID = MI.getIntrinsicID();
4299 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4300 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4301 // Non-images can have complications from operands that allow both SGPR
4302 // and VGPR. For now it's too complicated to figure out the final opcode
4303 // to derive the register bank from the MCInstrDesc.
4304 assert(RSrcIntrin->IsImage);
4305 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4307 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4308 unsigned N = MI.getNumExplicitOperands() - 2;
4309 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4310 OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4311 if (N == 3) {
4312 // Sequential form: all operands combined into VGPR256/VGPR512
4313 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4314 if (Size > 256)
4315 Size = 512;
4316 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4317 } else {
4318 // NSA form
4319 for (unsigned I = 2; I < N; ++I)
4320 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4322 break;
4324 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4325 auto IntrID = MI.getIntrinsicID();
4326 switch (IntrID) {
4327 case Intrinsic::amdgcn_s_getreg:
4328 case Intrinsic::amdgcn_s_memtime:
4329 case Intrinsic::amdgcn_s_memrealtime:
4330 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
4331 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4332 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4333 break;
4335 case Intrinsic::amdgcn_global_atomic_fadd:
4336 case Intrinsic::amdgcn_global_atomic_csub:
4337 case Intrinsic::amdgcn_global_atomic_fmin:
4338 case Intrinsic::amdgcn_global_atomic_fmax:
4339 case Intrinsic::amdgcn_flat_atomic_fadd:
4340 case Intrinsic::amdgcn_flat_atomic_fmin:
4341 case Intrinsic::amdgcn_flat_atomic_fmax:
4342 return getDefaultMappingAllVGPR(MI);
4343 case Intrinsic::amdgcn_ds_ordered_add:
4344 case Intrinsic::amdgcn_ds_ordered_swap: {
4345 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4346 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4347 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4348 AMDGPU::SGPRRegBankID);
4349 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4350 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4351 break;
4353 case Intrinsic::amdgcn_ds_append:
4354 case Intrinsic::amdgcn_ds_consume: {
4355 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4356 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4357 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4358 break;
4360 case Intrinsic::amdgcn_exp_compr:
4361 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4362 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4363 break;
4364 case Intrinsic::amdgcn_exp:
4365 // FIXME: Could we support packed types here?
4366 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4367 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4368 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4369 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4370 break;
4371 case Intrinsic::amdgcn_s_sendmsg:
4372 case Intrinsic::amdgcn_s_sendmsghalt: {
4373 // This must be an SGPR, but accept a VGPR.
4374 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4375 AMDGPU::SGPRRegBankID);
4376 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4377 break;
4379 case Intrinsic::amdgcn_s_setreg: {
4380 // This must be an SGPR, but accept a VGPR.
4381 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4382 AMDGPU::SGPRRegBankID);
4383 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4384 break;
4386 case Intrinsic::amdgcn_end_cf: {
4387 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4388 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4389 break;
4391 case Intrinsic::amdgcn_else: {
4392 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4393 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4394 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4395 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4396 break;
4398 case Intrinsic::amdgcn_live_mask: {
4399 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4400 break;
4402 case Intrinsic::amdgcn_wqm_demote:
4403 case Intrinsic::amdgcn_kill: {
4404 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4405 break;
4407 case Intrinsic::amdgcn_raw_buffer_load:
4408 case Intrinsic::amdgcn_raw_tbuffer_load: {
4409 // FIXME: Should make intrinsic ID the last operand of the instruction,
4410 // then this would be the same as store
4411 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4412 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4413 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4414 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4415 break;
4417 case Intrinsic::amdgcn_raw_buffer_store:
4418 case Intrinsic::amdgcn_raw_buffer_store_format:
4419 case Intrinsic::amdgcn_raw_tbuffer_store: {
4420 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4421 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4422 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4423 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4424 break;
4426 case Intrinsic::amdgcn_struct_buffer_load:
4427 case Intrinsic::amdgcn_struct_tbuffer_load: {
4428 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4429 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4430 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4431 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4432 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4433 break;
4435 case Intrinsic::amdgcn_struct_buffer_store:
4436 case Intrinsic::amdgcn_struct_tbuffer_store: {
4437 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4438 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4439 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4440 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4441 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4442 break;
4444 case Intrinsic::amdgcn_init_exec_from_input: {
4445 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4446 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4447 break;
4449 case Intrinsic::amdgcn_ds_gws_init:
4450 case Intrinsic::amdgcn_ds_gws_barrier:
4451 case Intrinsic::amdgcn_ds_gws_sema_br: {
4452 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4454 // This must be an SGPR, but accept a VGPR.
4455 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4456 AMDGPU::SGPRRegBankID);
4457 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4458 break;
4460 case Intrinsic::amdgcn_ds_gws_sema_v:
4461 case Intrinsic::amdgcn_ds_gws_sema_p:
4462 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4463 // This must be an SGPR, but accept a VGPR.
4464 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4465 AMDGPU::SGPRRegBankID);
4466 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4467 break;
4469 default:
4470 return getInvalidInstructionMapping();
4472 break;
4474 case AMDGPU::G_SELECT: {
4475 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4476 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4477 AMDGPU::SGPRRegBankID);
4478 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
4479 AMDGPU::SGPRRegBankID);
4480 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4481 Op3Bank == AMDGPU::SGPRRegBankID;
4483 unsigned CondBankDefault = SGPRSrcs ?
4484 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4485 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4486 CondBankDefault);
4487 if (CondBank == AMDGPU::SGPRRegBankID)
4488 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4489 else if (CondBank == AMDGPU::VGPRRegBankID)
4490 CondBank = AMDGPU::VCCRegBankID;
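// The select stays on the SALU only when both value operands and the
// condition are scalar; any divergent input forces a VALU select with a vcc
// condition.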
4492 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4493 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4495 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4497 // TODO: Should report 32-bit for scalar condition type.
4498 if (Size == 64) {
4499 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4500 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4501 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4502 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4503 } else {
4504 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4505 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4506 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4507 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4510 break;
4513 case AMDGPU::G_SI_CALL: {
4514 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4515 // Lie and claim everything is legal, even though some need to be
4516 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4517 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4519 // Allow anything for implicit arguments
4520 for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
4521 if (MI.getOperand(I).isReg()) {
4522 Register Reg = MI.getOperand(I).getReg();
4523 auto OpBank = getRegBankID(Reg, MRI);
4524 unsigned Size = getSizeInBits(Reg, MRI, *TRI);
4525 OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
4528 break;
4530 case AMDGPU::G_LOAD:
4531 case AMDGPU::G_ZEXTLOAD:
4532 case AMDGPU::G_SEXTLOAD:
4533 return getInstrMappingForLoad(MI);
4535 case AMDGPU::G_ATOMICRMW_XCHG:
4536 case AMDGPU::G_ATOMICRMW_ADD:
4537 case AMDGPU::G_ATOMICRMW_SUB:
4538 case AMDGPU::G_ATOMICRMW_AND:
4539 case AMDGPU::G_ATOMICRMW_OR:
4540 case AMDGPU::G_ATOMICRMW_XOR:
4541 case AMDGPU::G_ATOMICRMW_MAX:
4542 case AMDGPU::G_ATOMICRMW_MIN:
4543 case AMDGPU::G_ATOMICRMW_UMAX:
4544 case AMDGPU::G_ATOMICRMW_UMIN:
4545 case AMDGPU::G_ATOMICRMW_FADD:
4546 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
4547 case AMDGPU::G_AMDGPU_ATOMIC_INC:
4548 case AMDGPU::G_AMDGPU_ATOMIC_DEC:
4549 case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
4550 case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
4551 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4552 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4553 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4554 break;
4556 case AMDGPU::G_ATOMIC_CMPXCHG: {
4557 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4558 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4559 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4560 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4561 break;
4563 case AMDGPU::G_BRCOND: {
4564 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4565 AMDGPU::SGPRRegBankID);
4566 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
4567 if (Bank != AMDGPU::SGPRRegBankID)
4568 Bank = AMDGPU::VCCRegBankID;
4570 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
4571 break;
4575 return getInstructionMapping(/*ID*/1, /*Cost*/1,
4576 getOperandsMapping(OpdsMapping),
4577 MI.getNumOperands());