llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks:
16 /// VGPR (vector) and SGPR (scalar). Additionally the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
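/// (The exact expansion used here is sketched in the pseudo code comment above
/// executeInWaterfallLoop, later in this file.)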
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
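///
/// As an illustrative sketch (not exhaustive) of the regbank-legal forms, a
/// divergent compare feeding a select keeps its s1 condition in the VCC bank:
///   %4:vcc(s1) = G_ICMP intpred(eq), %0:vgpr(s32), %1:vgpr(s32)
///   %5:vgpr(s32) = G_SELECT %4:vcc(s1), %2:vgpr(s32), %3:vgpr(s32)
/// while the equivalent uniform code has its boolean widened to a 32-bit SGPR
/// value:
///   %4:sgpr(s32) = G_ICMP intpred(eq), %0:sgpr(s32), %1:sgpr(s32)
///   %5:sgpr(s32) = G_SELECT %4:sgpr(s32), %2:sgpr(s32), %3:sgpr(s32)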
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. It is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
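///
/// For example (illustrative), with a single constant bus read, something like
///   V_ADD_F32 %dst, %src0, %src1
/// may use SGPRs for both %src0 and %src1 only if they are the same SGPR;
/// otherwise one of them must first be copied to a VGPR.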
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
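///
/// For example (an illustrative sketch), a divergent add with one SGPR input
///   %2:vgpr(s32) = G_ADD %0:sgpr(s32), %1:vgpr(s32)
/// is made regbank legal by inserting a copy for the SGPR operand:
///   %3:vgpr(s32) = COPY %0:sgpr(s32)
///   %2:vgpr(s32) = G_ADD %3:vgpr(s32), %1:vgpr(s32)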
68 ///
69 //===----------------------------------------------------------------------===//
71 #include "AMDGPURegisterBankInfo.h"
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
80 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
81 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
82 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
83 #include "llvm/IR/IntrinsicsAMDGPU.h"
85 #define GET_TARGET_REGBANK_IMPL
86 #include "AMDGPUGenRegisterBank.inc"
88 // This file will be TableGen'ed at some point.
89 #include "AMDGPUGenRegisterBankInfo.def"
91 using namespace llvm;
92 using namespace MIPatternMatch;
94 namespace {
96 // Observer to apply a register bank to new registers created by LegalizerHelper.
97 class ApplyRegBankMapping final : public GISelChangeObserver {
98 private:
99 const AMDGPURegisterBankInfo &RBI;
100 MachineRegisterInfo &MRI;
101 const RegisterBank *NewBank;
102 SmallVector<MachineInstr *, 4> NewInsts;
104 public:
105 ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
106 MachineRegisterInfo &MRI_, const RegisterBank *RB)
107 : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
109 ~ApplyRegBankMapping() {
110 for (MachineInstr *MI : NewInsts)
111 applyBank(*MI);
114 /// Set any registers that don't have a set register class or bank to SALU.
115 void applyBank(MachineInstr &MI) {
116 const unsigned Opc = MI.getOpcode();
117 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
118 Opc == AMDGPU::G_SEXT) {
119 // LegalizerHelper wants to use the basic legalization artifacts when
120 // widening etc. We don't handle selection with vcc in artifact sources,
121 // so we need to use a select instead to handle these properly.
122 Register DstReg = MI.getOperand(0).getReg();
123 Register SrcReg = MI.getOperand(1).getReg();
124 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
125 if (SrcBank == &AMDGPU::VCCRegBank) {
126 const LLT S32 = LLT::scalar(32);
127 assert(MRI.getType(SrcReg) == LLT::scalar(1));
128 assert(MRI.getType(DstReg) == S32);
129 assert(NewBank == &AMDGPU::VGPRRegBank);
131 // Replace the extension with a select, which really uses the boolean
132 // source.
133 MachineIRBuilder B(MI);
134 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
135 auto False = B.buildConstant(S32, 0);
136 B.buildSelect(DstReg, SrcReg, True, False);
137 MRI.setRegBank(True.getReg(0), *NewBank);
138 MRI.setRegBank(False.getReg(0), *NewBank);
139 MI.eraseFromParent();
142 assert(!MRI.getRegClassOrRegBank(DstReg));
143 MRI.setRegBank(DstReg, *NewBank);
144 return;
147 #ifndef NDEBUG
148 if (Opc == AMDGPU::G_TRUNC) {
149 Register DstReg = MI.getOperand(0).getReg();
150 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
151 assert(DstBank != &AMDGPU::VCCRegBank);
153 #endif
155 for (MachineOperand &Op : MI.operands()) {
156 if (!Op.isReg())
157 continue;
159 // We may see physical registers if building a real MI
160 Register Reg = Op.getReg();
161 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
162 continue;
164 const RegisterBank *RB = NewBank;
165 if (MRI.getType(Reg) == LLT::scalar(1)) {
166 assert(NewBank == &AMDGPU::VGPRRegBank &&
167 "s1 operands should only be used for vector bools");
168 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
169 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
170 "not expecting legalization artifacts here");
171 RB = &AMDGPU::VCCRegBank;
174 MRI.setRegBank(Reg, *RB);
178 void erasingInstr(MachineInstr &MI) override {}
180 void createdInstr(MachineInstr &MI) override {
181 // At this point, the instruction was just inserted and has no operands.
182 NewInsts.push_back(&MI);
185 void changingInstr(MachineInstr &MI) override {}
186 void changedInstr(MachineInstr &MI) override {
187 // FIXME: In principle we should probably add the instruction to NewInsts,
188 // but the way the LegalizerHelper uses the observer, we will always see the
189 // registers we need to set the regbank on also referenced in a new
190 // instruction.
195 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
196 : AMDGPUGenRegisterBankInfo(),
197 Subtarget(ST),
198 TRI(Subtarget.getRegisterInfo()),
199 TII(Subtarget.getInstrInfo()) {
201 // HACK: Until this is fully tablegen'd.
202 static llvm::once_flag InitializeRegisterBankFlag;
204 static auto InitializeRegisterBankOnce = [this]() {
205 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
206 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
207 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
208 (void)this;
211 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
214 static bool isVectorRegisterBank(const RegisterBank &Bank) {
215 unsigned BankID = Bank.getID();
216 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
219 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
220 const RegisterBank &Src,
221 unsigned Size) const {
222 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
223 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
224 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
225 return std::numeric_limits<unsigned>::max();
228 // Bool values are tricky, because the meaning is based on context. The SCC
229 // and VCC banks are for the natural scalar and vector conditions produced by
230 // a compare.
232 // Legalization doesn't know about the necessary context, so an s1 use may
233 // have been a truncate from an arbitrary value, in which case a copy (lowered
234 // as a compare with 0) needs to be inserted.
235 if (Size == 1 &&
236 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
237 (isVectorRegisterBank(Src) ||
238 Src.getID() == AMDGPU::SGPRRegBankID ||
239 Src.getID() == AMDGPU::VCCRegBankID))
240 return std::numeric_limits<unsigned>::max();
242 // There is no direct copy between AGPRs.
243 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
244 Src.getID() == AMDGPU::AGPRRegBankID)
245 return 4;
247 return RegisterBankInfo::copyCost(Dst, Src, Size);
250 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
251 const ValueMapping &ValMapping,
252 const RegisterBank *CurBank) const {
253 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
254 // VGPR.
255 // FIXME: Is there a better way to do this?
256 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
257 return 10; // This is expensive.
259 assert(ValMapping.NumBreakDowns == 2 &&
260 ValMapping.BreakDown[0].Length == 32 &&
261 ValMapping.BreakDown[0].StartIdx == 0 &&
262 ValMapping.BreakDown[1].Length == 32 &&
263 ValMapping.BreakDown[1].StartIdx == 32 &&
264 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
266 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
267 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
268 // want.
270 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
271 // alignment restrictions, but this probably isn't important.
272 return 1;
275 const RegisterBank &
276 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
277 LLT Ty) const {
278 if (&RC == &AMDGPU::SReg_1RegClass)
279 return AMDGPU::VCCRegBank;
281 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
282 // VCC-like use.
283 if (TRI->isSGPRClass(&RC)) {
284 // FIXME: This probably came from a copy from a physical register, which
285 // should be inferable from the copied-to type. We don't have many boolean
286 // physical register constraints so just assume a normal SGPR for now.
287 if (!Ty.isValid())
288 return AMDGPU::SGPRRegBank;
290 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
293 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
296 template <unsigned NumOps>
297 RegisterBankInfo::InstructionMappings
298 AMDGPURegisterBankInfo::addMappingFromTable(
299 const MachineInstr &MI, const MachineRegisterInfo &MRI,
300 const std::array<unsigned, NumOps> RegSrcOpIdx,
301 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
303 InstructionMappings AltMappings;
305 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
307 unsigned Sizes[NumOps];
308 for (unsigned I = 0; I < NumOps; ++I) {
309 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
310 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
313 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
314 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
315 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
318 // getInstrMapping's default mapping uses ID 1, so start at 2.
319 unsigned MappingID = 2;
320 for (const auto &Entry : Table) {
321 for (unsigned I = 0; I < NumOps; ++I) {
322 int OpIdx = RegSrcOpIdx[I];
323 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
326 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
327 getOperandsMapping(Operands),
328 Operands.size()));
331 return AltMappings;
334 RegisterBankInfo::InstructionMappings
335 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
336 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
337 switch (MI.getIntrinsicID()) {
338 case Intrinsic::amdgcn_readlane: {
339 static const OpRegBankEntry<3> Table[2] = {
340 // Perfectly legal.
341 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
343 // Need a readfirstlane for the index.
344 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
347 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
348 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
350 case Intrinsic::amdgcn_writelane: {
351 static const OpRegBankEntry<4> Table[4] = {
352 // Perfectly legal.
353 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
355 // Need readfirstlane of first op
356 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
358 // Need readfirstlane of second op
359 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
361 // Need readfirstlane of both ops
362 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
365 // dst, value, lane select, old value (vdst_in)
366 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
367 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
369 default:
370 return RegisterBankInfo::getInstrAlternativeMappings(MI);
374 RegisterBankInfo::InstructionMappings
375 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
376 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
378 switch (MI.getIntrinsicID()) {
379 case Intrinsic::amdgcn_s_buffer_load: {
380 static const OpRegBankEntry<2> Table[4] = {
381 // Perfectly legal.
382 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
384 // Only need 1 register in loop
385 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
387 // Have to waterfall the resource.
388 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
390 // Have to waterfall the resource, and the offset.
391 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
394 // rsrc, offset
395 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
396 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
398 case Intrinsic::amdgcn_ds_ordered_add:
399 case Intrinsic::amdgcn_ds_ordered_swap: {
400 // VGPR = M0, VGPR
401 static const OpRegBankEntry<3> Table[2] = {
402 // Perfectly legal.
403 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
405 // Need a readfirstlane for m0
406 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
409 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
410 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
412 case Intrinsic::amdgcn_s_sendmsg:
413 case Intrinsic::amdgcn_s_sendmsghalt: {
414 // FIXME: Should have no register for immediate
415 static const OpRegBankEntry<1> Table[2] = {
416 // Perfectly legal.
417 { { AMDGPU::SGPRRegBankID }, 1 },
419 // Need readlane
420 { { AMDGPU::VGPRRegBankID }, 3 }
423 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
424 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
426 default:
427 return RegisterBankInfo::getInstrAlternativeMappings(MI);
431 static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
432 const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
433 return I && I->getMetadata("amdgpu.noclobber");
436 // FIXME: Returns uniform if there's no source value information. This is
437 // probably wrong.
438 static bool isScalarLoadLegal(const MachineInstr &MI) {
439 if (!MI.hasOneMemOperand())
440 return false;
442 const MachineMemOperand *MMO = *MI.memoperands_begin();
443 const unsigned AS = MMO->getAddrSpace();
444 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
445 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
446 // Require 4-byte alignment.
447 return MMO->getAlign() >= Align(4) &&
448 // Can't do a scalar atomic load.
449 !MMO->isAtomic() &&
450 // Don't use scalar loads for volatile accesses to non-constant address
451 // spaces.
452 (IsConst || !MMO->isVolatile()) &&
453 // Memory must be known constant, or not written before this load.
454 (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
455 AMDGPUInstrInfo::isUniformMMO(MMO);
458 RegisterBankInfo::InstructionMappings
459 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
460 const MachineInstr &MI) const {
462 const MachineFunction &MF = *MI.getParent()->getParent();
463 const MachineRegisterInfo &MRI = MF.getRegInfo();
466 InstructionMappings AltMappings;
467 switch (MI.getOpcode()) {
468 case TargetOpcode::G_CONSTANT: {
469 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
470 if (Size == 1) {
471 static const OpRegBankEntry<1> Table[3] = {
472 { { AMDGPU::VGPRRegBankID }, 1 },
473 { { AMDGPU::SGPRRegBankID }, 1 },
474 { { AMDGPU::VCCRegBankID }, 1 }
477 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
480 LLVM_FALLTHROUGH;
482 case TargetOpcode::G_FCONSTANT:
483 case TargetOpcode::G_FRAME_INDEX:
484 case TargetOpcode::G_GLOBAL_VALUE: {
485 static const OpRegBankEntry<1> Table[2] = {
486 { { AMDGPU::VGPRRegBankID }, 1 },
487 { { AMDGPU::SGPRRegBankID }, 1 }
490 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
492 case TargetOpcode::G_AND:
493 case TargetOpcode::G_OR:
494 case TargetOpcode::G_XOR: {
495 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
497 if (Size == 1) {
498 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
499 const InstructionMapping &SCCMapping = getInstructionMapping(
500 1, 1, getOperandsMapping(
501 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
502 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
503 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
504 3); // Num Operands
505 AltMappings.push_back(&SCCMapping);
507 const InstructionMapping &VCCMapping0 = getInstructionMapping(
508 2, 1, getOperandsMapping(
509 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
510 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
511 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
512 3); // Num Operands
513 AltMappings.push_back(&VCCMapping0);
514 return AltMappings;
517 if (Size != 64)
518 break;
520 const InstructionMapping &SSMapping = getInstructionMapping(
521 1, 1, getOperandsMapping(
522 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
523 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
524 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
525 3); // Num Operands
526 AltMappings.push_back(&SSMapping);
528 const InstructionMapping &VVMapping = getInstructionMapping(
529 2, 2, getOperandsMapping(
530 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
531 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
532 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
533 3); // Num Operands
534 AltMappings.push_back(&VVMapping);
535 break;
537 case TargetOpcode::G_LOAD:
538 case TargetOpcode::G_ZEXTLOAD:
539 case TargetOpcode::G_SEXTLOAD: {
540 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
541 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
542 unsigned PtrSize = PtrTy.getSizeInBits();
543 unsigned AS = PtrTy.getAddressSpace();
545 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
546 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
547 isScalarLoadLegal(MI)) {
548 const InstructionMapping &SSMapping = getInstructionMapping(
549 1, 1, getOperandsMapping(
550 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
551 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
552 2); // Num Operands
553 AltMappings.push_back(&SSMapping);
556 const InstructionMapping &VVMapping = getInstructionMapping(
557 2, 1,
558 getOperandsMapping(
559 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
560 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
561 2); // Num Operands
562 AltMappings.push_back(&VVMapping);
564 // It may be possible to have a vgpr = load sgpr mapping here, because
565 // the mubuf instructions support this kind of load, but probably for only
566 // gfx7 and older. However, the addressing mode matching in the instruction
567 // selector should be able to do a better job of detecting and selecting
568 // these kinds of loads from the vgpr = load vgpr mapping.
570 return AltMappings;
573 case TargetOpcode::G_SELECT: {
574 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
575 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
576 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
577 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
578 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
579 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
580 4); // Num Operands
581 AltMappings.push_back(&SSMapping);
583 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
584 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
585 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
586 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
587 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
588 4); // Num Operands
589 AltMappings.push_back(&VVMapping);
591 return AltMappings;
593 case TargetOpcode::G_UADDE:
594 case TargetOpcode::G_USUBE:
595 case TargetOpcode::G_SADDE:
596 case TargetOpcode::G_SSUBE: {
597 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
598 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
599 getOperandsMapping(
600 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
601 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
602 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
603 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
604 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
605 5); // Num Operands
606 AltMappings.push_back(&SSMapping);
608 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
609 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
610 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
611 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
612 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
613 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
614 5); // Num Operands
615 AltMappings.push_back(&VVMapping);
616 return AltMappings;
618 case AMDGPU::G_BRCOND: {
619 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
621 // TODO: Change type to 32 for scalar
622 const InstructionMapping &SMapping = getInstructionMapping(
623 1, 1, getOperandsMapping(
624 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
625 2); // Num Operands
626 AltMappings.push_back(&SMapping);
628 const InstructionMapping &VMapping = getInstructionMapping(
629 1, 1, getOperandsMapping(
630 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
631 2); // Num Operands
632 AltMappings.push_back(&VMapping);
633 return AltMappings;
635 case AMDGPU::G_INTRINSIC:
636 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
637 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
638 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
639 default:
640 break;
642 return RegisterBankInfo::getInstrAlternativeMappings(MI);
645 void AMDGPURegisterBankInfo::split64BitValueForMapping(
646 MachineIRBuilder &B,
647 SmallVector<Register, 2> &Regs,
648 LLT HalfTy,
649 Register Reg) const {
650 assert(HalfTy.getSizeInBits() == 32);
651 MachineRegisterInfo *MRI = B.getMRI();
652 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
653 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
654 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
655 MRI->setRegBank(LoLHS, *Bank);
656 MRI->setRegBank(HiLHS, *Bank);
658 Regs.push_back(LoLHS);
659 Regs.push_back(HiLHS);
661 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
662 .addDef(LoLHS)
663 .addDef(HiLHS)
664 .addUse(Reg);
667 /// Replace the current type each register in \p Regs has with \p NewTy
668 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
669 LLT NewTy) {
670 for (Register Reg : Regs) {
671 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
672 MRI.setType(Reg, NewTy);
676 static LLT getHalfSizedType(LLT Ty) {
677 if (Ty.isVector()) {
678 assert(Ty.getElementCount().isKnownMultipleOf(2));
679 return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
680 Ty.getElementType());
683 assert(Ty.getScalarSizeInBits() % 2 == 0);
684 return LLT::scalar(Ty.getScalarSizeInBits() / 2);
687 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
688 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
689 /// execute the instruction for each unique combination of values in all lanes
690 /// in the wave. The block will be split such that rest of the instructions are
691 /// moved to a new block.
693 /// Essentially performs this loop:
695 /// Save Execution Mask
696 /// For (Lane : Wavefront) {
697 /// Enable Lane, Disable all other lanes
698 /// SGPR = read SGPR value for current lane from VGPR
699 /// VGPRResult[Lane] = use_op SGPR
700 /// }
701 /// Restore Execution Mask
703 /// There is additional complexity in that comparisons are used to identify the
704 /// unique values, so all lanes sharing a value are handled in one iteration.
705 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
706 MachineIRBuilder &B,
707 iterator_range<MachineBasicBlock::iterator> Range,
708 SmallSet<Register, 4> &SGPROperandRegs,
709 MachineRegisterInfo &MRI) const {
710 SmallVector<Register, 4> ResultRegs;
711 SmallVector<Register, 4> InitResultRegs;
712 SmallVector<Register, 4> PhiRegs;
714 // Track use registers which have already been expanded with a readfirstlane
715 // sequence. This may have multiple uses if moving a sequence.
716 DenseMap<Register, Register> WaterfalledRegMap;
718 MachineBasicBlock &MBB = B.getMBB();
719 MachineFunction *MF = &B.getMF();
721 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
722 const unsigned WaveAndOpc = Subtarget.isWave32() ?
723 AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
724 const unsigned MovTermOpc = Subtarget.isWave32() ?
725 AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
726 const unsigned XorTermOpc = Subtarget.isWave32() ?
727 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
728 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
729 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
730 const unsigned ExecReg = Subtarget.isWave32() ?
731 AMDGPU::EXEC_LO : AMDGPU::EXEC;
733 #ifndef NDEBUG
734 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
735 #endif
737 for (MachineInstr &MI : Range) {
738 for (MachineOperand &Def : MI.defs()) {
739 if (MRI.use_nodbg_empty(Def.getReg()))
740 continue;
742 LLT ResTy = MRI.getType(Def.getReg());
743 const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
744 ResultRegs.push_back(Def.getReg());
745 Register InitReg = B.buildUndef(ResTy).getReg(0);
746 Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
747 InitResultRegs.push_back(InitReg);
748 PhiRegs.push_back(PhiReg);
749 MRI.setRegBank(PhiReg, *DefBank);
750 MRI.setRegBank(InitReg, *DefBank);
754 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
755 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
757 // Don't bother using generic instructions/registers for the exec mask.
758 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
759 .addDef(InitSaveExecReg);
761 Register PhiExec = MRI.createVirtualRegister(WaveRC);
762 Register NewExec = MRI.createVirtualRegister(WaveRC);
764 // To insert the loop we need to split the block: everything after the range
765 // moves to a new remainder block, and a new empty loop block holds the range.
766 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
767 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
768 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
769 MachineFunction::iterator MBBI(MBB);
770 ++MBBI;
771 MF->insert(MBBI, LoopBB);
772 MF->insert(MBBI, RestoreExecBB);
773 MF->insert(MBBI, RemainderBB);
775 LoopBB->addSuccessor(RestoreExecBB);
776 LoopBB->addSuccessor(LoopBB);
778 // Move the rest of the block into a new block.
779 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
780 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
782 MBB.addSuccessor(LoopBB);
783 RestoreExecBB->addSuccessor(RemainderBB);
785 B.setInsertPt(*LoopBB, LoopBB->end());
787 B.buildInstr(TargetOpcode::PHI)
788 .addDef(PhiExec)
789 .addReg(InitSaveExecReg)
790 .addMBB(&MBB)
791 .addReg(NewExec)
792 .addMBB(LoopBB);
794 for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
795 B.buildInstr(TargetOpcode::G_PHI)
796 .addDef(std::get<2>(Result))
797 .addReg(std::get<0>(Result)) // Initial value / implicit_def
798 .addMBB(&MBB)
799 .addReg(std::get<1>(Result)) // Mid-loop value.
800 .addMBB(LoopBB);
803 const DebugLoc &DL = B.getDL();
805 MachineInstr &FirstInst = *Range.begin();
807 // Move the instruction into the loop. Note we moved everything after
808 // Range.end() already into a new block, so Range.end() is no longer valid.
809 LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
811 // Figure out the iterator range after splicing the instructions.
812 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
813 auto NewEnd = LoopBB->end();
815 MachineBasicBlock::iterator I = Range.begin();
816 B.setInsertPt(*LoopBB, I);
818 Register CondReg;
820 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
822 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
823 for (MachineOperand &Op : MI.uses()) {
824 if (!Op.isReg() || Op.isDef())
825 continue;
827 Register OldReg = Op.getReg();
828 if (!SGPROperandRegs.count(OldReg))
829 continue;
831 // See if we already processed this register in another instruction in the
832 // sequence.
833 auto OldVal = WaterfalledRegMap.find(OldReg);
834 if (OldVal != WaterfalledRegMap.end()) {
835 Op.setReg(OldVal->second);
836 continue;
839 Register OpReg = Op.getReg();
840 LLT OpTy = MRI.getType(OpReg);
842 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
843 if (OpBank != &AMDGPU::VGPRRegBank) {
844 // Insert copy from AGPR to VGPR before the loop.
845 B.setMBB(MBB);
846 OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
847 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
848 B.setInstr(*I);
851 unsigned OpSize = OpTy.getSizeInBits();
853 // Can only do a readlane of 32-bit pieces.
854 if (OpSize == 32) {
855 // Avoid extra copies in the simple case of one 32-bit register.
856 Register CurrentLaneOpReg
857 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
858 MRI.setType(CurrentLaneOpReg, OpTy);
860 constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
861 // Read the next variant <- also loop target.
862 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
863 CurrentLaneOpReg)
864 .addReg(OpReg);
866 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
867 bool First = CondReg == AMDGPU::NoRegister;
868 if (First)
869 CondReg = NewCondReg;
871 // Compare the value just read against this operand's value in every lane.
872 B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
873 .addDef(NewCondReg)
874 .addReg(CurrentLaneOpReg)
875 .addReg(OpReg);
876 Op.setReg(CurrentLaneOpReg);
878 if (!First) {
879 Register AndReg = MRI.createVirtualRegister(WaveRC);
881 // If there are multiple operands to consider, AND the conditions together.
882 B.buildInstr(WaveAndOpc)
883 .addDef(AndReg)
884 .addReg(NewCondReg)
885 .addReg(CondReg);
886 CondReg = AndReg;
888 } else {
889 LLT S32 = LLT::scalar(32);
890 SmallVector<Register, 8> ReadlanePieces;
892 // The compares can be done as 64-bit, but the extract needs to be done
893 // in 32-bit pieces.
895 bool Is64 = OpSize % 64 == 0;
897 LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
898 unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
899 : AMDGPU::V_CMP_EQ_U32_e64;
904 // Insert the unmerge before the loop.
906 B.setMBB(MBB);
907 auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
908 B.setInstr(*I);
910 unsigned NumPieces = Unmerge->getNumOperands() - 1;
911 for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
912 Register UnmergePiece = Unmerge.getReg(PieceIdx);
914 Register CurrentLaneOpReg;
915 if (Is64) {
916 Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
917 Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
919 MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
920 MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
921 MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
923 // Read the next variant <- also loop target.
924 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
925 CurrentLaneOpRegLo)
926 .addReg(UnmergePiece, 0, AMDGPU::sub0);
928 // Read the next variant <- also loop target.
929 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
930 CurrentLaneOpRegHi)
931 .addReg(UnmergePiece, 0, AMDGPU::sub1);
933 CurrentLaneOpReg =
934 B.buildMerge(LLT::scalar(64),
935 {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
936 .getReg(0);
938 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
940 if (OpTy.getScalarSizeInBits() == 64) {
941 // We need to produce a 64-bit element vector, so use the
942 // merged pieces.
943 ReadlanePieces.push_back(CurrentLaneOpReg);
944 } else {
945 // 32-bit element type.
946 ReadlanePieces.push_back(CurrentLaneOpRegLo);
947 ReadlanePieces.push_back(CurrentLaneOpRegHi);
949 } else {
950 CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
951 MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
952 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
954 // Read the next variant <- also loop target.
955 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
956 CurrentLaneOpReg)
957 .addReg(UnmergePiece);
958 ReadlanePieces.push_back(CurrentLaneOpReg);
961 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
962 bool First = CondReg == AMDGPU::NoRegister;
963 if (First)
964 CondReg = NewCondReg;
966 B.buildInstr(CmpOp)
967 .addDef(NewCondReg)
968 .addReg(CurrentLaneOpReg)
969 .addReg(UnmergePiece);
971 if (!First) {
972 Register AndReg = MRI.createVirtualRegister(WaveRC);
974 // If there are multiple operands to consider, AND the conditions together.
975 B.buildInstr(WaveAndOpc)
976 .addDef(AndReg)
977 .addReg(NewCondReg)
978 .addReg(CondReg);
979 CondReg = AndReg;
983 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
984 // BUILD_VECTOR
985 if (OpTy.isVector()) {
986 auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
987 Op.setReg(Merge.getReg(0));
988 } else {
989 auto Merge = B.buildMerge(OpTy, ReadlanePieces);
990 Op.setReg(Merge.getReg(0));
993 MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
996 // Make sure we don't re-process this register again.
997 WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
1001 B.setInsertPt(*LoopBB, LoopBB->end());
1003 // Update EXEC, save the original EXEC value to VCC.
1004 B.buildInstr(AndSaveExecOpc)
1005 .addDef(NewExec)
1006 .addReg(CondReg, RegState::Kill);
1008 MRI.setSimpleHint(NewExec, CondReg);
1010 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1011 B.buildInstr(XorTermOpc)
1012 .addDef(ExecReg)
1013 .addReg(ExecReg)
1014 .addReg(NewExec);
1016 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1017 // s_cbranch_scc0?
1019 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1020 B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
1021 .addMBB(LoopBB);
1023 // Save the EXEC mask before the loop.
1024 BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
1025 .addReg(ExecReg);
1027 // Restore the EXEC mask after the loop.
1028 B.setMBB(*RestoreExecBB);
1029 B.buildInstr(MovTermOpc)
1030 .addDef(ExecReg)
1031 .addReg(SaveExecReg);
1033 // Set the insert point after the original instruction, so any new
1034 // instructions will be in the remainder.
1035 B.setInsertPt(*RemainderBB, RemainderBB->begin());
1037 return true;
1040 // Return any unique registers used by \p MI at \p OpIndices that need to be
1041 // handled in a waterfall loop. Returns these registers in \p
1042 // SGPROperandRegs. Returns true if there are any operands to handle and a
1043 // waterfall loop is necessary.
1044 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1045 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1046 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1047 for (unsigned Op : OpIndices) {
1048 assert(MI.getOperand(Op).isUse());
1049 Register Reg = MI.getOperand(Op).getReg();
1050 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1051 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
1052 SGPROperandRegs.insert(Reg);
1055 // No operands need to be replaced, so no need to loop.
1056 return !SGPROperandRegs.empty();
1059 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1060 MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1061 ArrayRef<unsigned> OpIndices) const {
1062 // Use a set to avoid extra readfirstlanes in the case where multiple operands
1063 // are the same register.
1064 SmallSet<Register, 4> SGPROperandRegs;
1066 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1067 return false;
1069 MachineBasicBlock::iterator I = MI.getIterator();
1070 return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1071 SGPROperandRegs, MRI);
1074 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1075 MachineInstr &MI, MachineRegisterInfo &MRI,
1076 ArrayRef<unsigned> OpIndices) const {
1077 MachineIRBuilder B(MI);
1078 return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1081 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1082 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1083 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1084 Register Reg = MI.getOperand(OpIdx).getReg();
1085 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1086 if (Bank == &AMDGPU::SGPRRegBank)
1087 return;
1089 LLT Ty = MRI.getType(Reg);
1090 MachineIRBuilder B(MI);
1092 if (Bank != &AMDGPU::VGPRRegBank) {
1093 // We need to copy from AGPR to VGPR
1094 Reg = B.buildCopy(Ty, Reg).getReg(0);
1095 MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
1098 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1099 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1100 .addDef(SGPR)
1101 .addReg(Reg);
1103 MRI.setType(SGPR, Ty);
1105 const TargetRegisterClass *Constrained =
1106 constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1107 (void)Constrained;
1108 assert(Constrained && "Failed to constrain readfirstlane src reg");
1110 MI.getOperand(OpIdx).setReg(SGPR);
1113 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1114 /// rest will be in the remainder.
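/// For example (illustrative): splitUnequalType(<3 x s32>, 64) gives
/// {<2 x s32>, s32}, and splitUnequalType(s96, 64) gives {s64, s32}.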
1115 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1116 unsigned TotalSize = Ty.getSizeInBits();
1117 if (!Ty.isVector())
1118 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1120 LLT EltTy = Ty.getElementType();
1121 unsigned EltSize = EltTy.getSizeInBits();
1122 assert(FirstSize % EltSize == 0);
1124 unsigned FirstPartNumElts = FirstSize / EltSize;
1125 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1127 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1128 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1131 static LLT widen96To128(LLT Ty) {
1132 if (!Ty.isVector())
1133 return LLT::scalar(128);
1135 LLT EltTy = Ty.getElementType();
1136 assert(128 % EltTy.getSizeInBits() == 0);
1137 return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1140 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1141 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1142 MachineRegisterInfo &MRI) const {
1143 Register DstReg = MI.getOperand(0).getReg();
1144 const LLT LoadTy = MRI.getType(DstReg);
1145 unsigned LoadSize = LoadTy.getSizeInBits();
1146 const unsigned MaxNonSmrdLoadSize = 128;
1148 const RegisterBank *DstBank =
1149 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1150 if (DstBank == &AMDGPU::SGPRRegBank) {
1151 // There are some special cases that we need to look at for 32 bit and 96
1152 // bit SGPR loads; otherwise we have nothing to do.
1153 if (LoadSize != 32 && LoadSize != 96)
1154 return false;
1156 MachineMemOperand *MMO = *MI.memoperands_begin();
1157 const unsigned MemSize = 8 * MMO->getSize();
1158 // Scalar loads of size 8 or 16 bits with proper alignment may be widened to
1159 // 32 bits. Check whether we need to widen the memory access: 8- and 16-bit
1160 // scalar loads have a load size of 32 but a memory access size of less
1161 // than 32.
1162 if (LoadSize == 32 &&
1163 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1164 return false;
1166 Register PtrReg = MI.getOperand(1).getReg();
1168 ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1169 MachineIRBuilder B(MI, O);
1171 if (LoadSize == 32) {
1172 // This is an extending load from a sub-dword size. Widen the memory
1173 // access size to 4 bytes and clear the extra high bits appropriately
1174 const LLT S32 = LLT::scalar(32);
1175 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1176 // Must extend the sign bit into higher bits for a G_SEXTLOAD
1177 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1178 B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1179 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1180 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1181 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1182 B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1183 } else
1184 // We do not need to touch the higher bits for regular loads.
1185 B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1186 } else {
1187 // 96-bit loads are only available for vector loads. We need to split this
1188 // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit load).
1189 if (MMO->getAlign() < Align(16)) {
1190 LLT Part64, Part32;
1191 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1192 auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
1193 auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
1195 auto Undef = B.buildUndef(LoadTy);
1196 auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
1197 B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
1198 } else {
1199 LLT WiderTy = widen96To128(LoadTy);
1200 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1201 B.buildExtract(MI.getOperand(0), WideLoad, 0);
1205 MI.eraseFromParent();
1206 return true;
1209 // 128-bit loads are supported for all instruction types.
1210 if (LoadSize <= MaxNonSmrdLoadSize)
1211 return false;
1213 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1214 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1216 if (SrcRegs.empty())
1217 SrcRegs.push_back(MI.getOperand(1).getReg());
1219 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1221 // RegBankSelect only emits scalar types, so we need to reset the pointer
1222 // operand to a pointer type.
1223 Register BasePtrReg = SrcRegs[0];
1224 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1225 MRI.setType(BasePtrReg, PtrTy);
1227 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1228 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1229 ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1230 MachineIRBuilder B(MI, Observer);
1231 LegalizerHelper Helper(B.getMF(), Observer, B);
1233 if (LoadTy.isVector()) {
1234 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1235 return false;
1236 } else {
1237 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1238 return false;
1241 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1242 return true;
1245 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1246 MachineInstr &MI,
1247 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1248 MachineRegisterInfo &MRI) const {
1249 const MachineFunction &MF = *MI.getMF();
1250 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1251 const auto &TFI = *ST.getFrameLowering();
1253 // Guard in case the stack growth direction ever changes with scratch
1254 // instructions.
1255 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1256 return false;
1258 Register Dst = MI.getOperand(0).getReg();
1259 Register AllocSize = MI.getOperand(1).getReg();
1260 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1262 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1264 // TODO: Need to emit a wave reduction to get the maximum size.
1265 if (SizeBank != &AMDGPU::SGPRRegBank)
1266 return false;
1268 LLT PtrTy = MRI.getType(Dst);
1269 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1271 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1272 Register SPReg = Info->getStackPtrOffsetReg();
1273 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1274 MachineIRBuilder B(MI, ApplyBank);
1276 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1277 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1279 auto SPCopy = B.buildCopy(PtrTy, SPReg);
1280 if (Alignment > TFI.getStackAlign()) {
1281 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1282 B.buildMaskLowPtrBits(Dst, PtrAdd,
1283 Log2(Alignment) + ST.getWavefrontSizeLog2());
1284 } else {
1285 B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1288 MI.eraseFromParent();
1289 return true;
1292 bool AMDGPURegisterBankInfo::applyMappingImage(
1293 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1294 MachineRegisterInfo &MRI, int RsrcIdx) const {
1295 const int NumDefs = MI.getNumExplicitDefs();
1297 // The reported argument index is relative to the IR intrinsic call arguments,
1298 // so we need to shift by the number of defs and the intrinsic ID.
1299 RsrcIdx += NumDefs + 1;
1301 // Insert copies to VGPR arguments.
1302 applyDefaultMapping(OpdMapper);
1304 // Fixup any SGPR arguments.
1305 SmallVector<unsigned, 4> SGPRIndexes;
1306 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1307 if (!MI.getOperand(I).isReg())
1308 continue;
1310 // If this intrinsic has a sampler, it immediately follows rsrc.
1311 if (I == RsrcIdx || I == RsrcIdx + 1)
1312 SGPRIndexes.push_back(I);
1315 executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1316 return true;
1319 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1320 Register Reg) {
1321 MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1322 if (!Def)
1323 return Reg;
1325 // TODO: Guard against this being an implicit def
1326 return Def->getOperand(0).getReg();
1329 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1330 // the three offsets (voffset, soffset and instoffset)
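// The intent (roughly) is that voffset + soffset + instoffset adds up to the
// original combined offset; the return value is the constant offset to fold
// into the MMO, when known.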
1331 static unsigned setBufferOffsets(MachineIRBuilder &B,
1332 const AMDGPURegisterBankInfo &RBI,
1333 Register CombinedOffset, Register &VOffsetReg,
1334 Register &SOffsetReg, int64_t &InstOffsetVal,
1335 Align Alignment) {
1336 const LLT S32 = LLT::scalar(32);
1337 MachineRegisterInfo *MRI = B.getMRI();
1339 if (Optional<int64_t> Imm = getConstantVRegSExtVal(CombinedOffset, *MRI)) {
1340 uint32_t SOffset, ImmOffset;
1341 if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1342 Alignment)) {
1343 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1344 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1345 InstOffsetVal = ImmOffset;
1347 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1348 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1349 return SOffset + ImmOffset;
1353 Register Base;
1354 unsigned Offset;
1356 std::tie(Base, Offset) =
1357 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1359 uint32_t SOffset, ImmOffset;
1360 if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1361 &RBI.Subtarget, Alignment)) {
1362 if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1363 VOffsetReg = Base;
1364 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1365 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1366 InstOffsetVal = ImmOffset;
1367 return 0; // XXX - Why is this 0?
1370 // If we have SGPR base, we can use it for soffset.
1371 if (SOffset == 0) {
1372 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1373 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1374 SOffsetReg = Base;
1375 InstOffsetVal = ImmOffset;
1376 return 0; // XXX - Why is this 0?
1380 // Handle the variable sgpr + vgpr case.
1381 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1382 if (Add && (int)Offset >= 0) {
1383 Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1384 Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1386 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1387 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1389 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1390 VOffsetReg = Src0;
1391 SOffsetReg = Src1;
1392 return 0;
1395 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1396 VOffsetReg = Src1;
1397 SOffsetReg = Src0;
1398 return 0;
1402 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1403 // have an SGPR offset and a VGPR resource.
1404 if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1405 VOffsetReg = CombinedOffset;
1406 } else {
1407 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1408 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1411 SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1412 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1413 return 0;
1416 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1417 const OperandsMapper &OpdMapper) const {
1418 MachineInstr &MI = OpdMapper.getMI();
1419 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1421 const LLT S32 = LLT::scalar(32);
1422 Register Dst = MI.getOperand(0).getReg();
1423 LLT Ty = MRI.getType(Dst);
1425 const RegisterBank *RSrcBank =
1426 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1427 const RegisterBank *OffsetBank =
1428 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1429 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1430 OffsetBank == &AMDGPU::SGPRRegBank)
1431 return true; // Legal mapping
1433 // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1434 // here but don't have an MMO.
1436 unsigned LoadSize = Ty.getSizeInBits();
1437 int NumLoads = 1;
1438 if (LoadSize == 256 || LoadSize == 512) {
1439 NumLoads = LoadSize / 128;
1440 Ty = Ty.divide(NumLoads);
1443 // Use the alignment to ensure that the required offsets will fit into the
1444 // immediate offsets.
1445 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1447 MachineIRBuilder B(MI);
1448 MachineFunction &MF = B.getMF();
1450 Register SOffset;
1451 Register VOffset;
1452 int64_t ImmOffset = 0;
1454 unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1455 VOffset, SOffset, ImmOffset, Alignment);
1457 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1458 // can, but we need to track an MMO for that.
1459 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1460 const Align MemAlign(4); // FIXME: ABI type alignment?
1461 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1462 MachinePointerInfo(),
1463 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1464 MachineMemOperand::MOInvariant,
1465 MemSize, MemAlign);
1466 if (MMOOffset != 0)
1467 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1469 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1470 // assume that the buffer is unswizzled.
1472 Register RSrc = MI.getOperand(1).getReg();
1473 Register VIndex = B.buildConstant(S32, 0).getReg(0);
1474 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1476 SmallVector<Register, 4> LoadParts(NumLoads);
1478 MachineBasicBlock::iterator MII = MI.getIterator();
1479 MachineInstrSpan Span(MII, &B.getMBB());
1481 for (int i = 0; i < NumLoads; ++i) {
1482 if (NumLoads == 1) {
1483 LoadParts[i] = Dst;
1484 } else {
1485 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1486 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1489 MachineMemOperand *MMO = BaseMMO;
1490 if (i != 0)
1491 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1493 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1494 .addDef(LoadParts[i]) // vdata
1495 .addUse(RSrc) // rsrc
1496 .addUse(VIndex) // vindex
1497 .addUse(VOffset) // voffset
1498 .addUse(SOffset) // soffset
1499 .addImm(ImmOffset + 16 * i) // offset(imm)
1500 .addImm(0) // cachepolicy, swizzled buffer(imm)
1501 .addImm(0) // idxen(imm)
1502 .addMemOperand(MMO);
1505 // TODO: If only the resource is a VGPR, it may be better to execute the
1506 // scalar load in the waterfall loop if the resource is expected to frequently
1507 // be dynamically uniform.
1508 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1509 // Remove the original instruction to avoid potentially confusing the
1510 // waterfall loop logic.
1511 B.setInstr(*Span.begin());
1512 MI.eraseFromParent();
1514 SmallSet<Register, 4> OpsToWaterfall;
1516 OpsToWaterfall.insert(RSrc);
1517 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1518 OpsToWaterfall, MRI);
1521 if (NumLoads != 1) {
1522 if (Ty.isVector())
1523 B.buildConcatVectors(Dst, LoadParts);
1524 else
1525 B.buildMerge(Dst, LoadParts);
1528 // We removed the instruction earlier with a waterfall loop.
1529 if (RSrcBank == &AMDGPU::SGPRRegBank)
1530 MI.eraseFromParent();
1532 return true;
1535 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
1536 bool Signed) const {
1537 MachineInstr &MI = OpdMapper.getMI();
1538 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1540 // Insert basic copies
1541 applyDefaultMapping(OpdMapper);
1543 Register DstReg = MI.getOperand(0).getReg();
1544 LLT Ty = MRI.getType(DstReg);
1546 const LLT S32 = LLT::scalar(32);
1548 unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
1549 Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1550 Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1551 Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1553 const RegisterBank *DstBank =
1554 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1555 if (DstBank == &AMDGPU::VGPRRegBank) {
1556 if (Ty == S32)
1557 return true;
1559 // There is no 64-bit VGPR bitfield extract instruction, so the operation
1560 // is expanded to a sequence of instructions that implement it.
1561 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1562 MachineIRBuilder B(MI, ApplyBank);
1564 const LLT S64 = LLT::scalar(64);
1565 // Shift the source operand so that extracted bits start at bit 0.
1566 auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1567 : B.buildLShr(S64, SrcReg, OffsetReg);
1568 auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1570 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1571 // if the width is a constant.
1572 if (auto ConstWidth = getConstantVRegValWithLookThrough(WidthReg, MRI)) {
1573 // Use the 32-bit bitfield extract instructions. Depending on the width,
1574 // operate on either the low or the high 32 bits.
1575 auto Zero = B.buildConstant(S32, 0);
1576 auto WidthImm = ConstWidth->Value.getZExtValue();
1577 if (WidthImm <= 32) {
1578 // Use bitfield extract on the lower 32-bit source, and then sign-extend
1579 // or clear the upper 32-bits.
1580 auto Extract =
1581 Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1582 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1583 auto Extend =
1584 Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1585 B.buildMerge(DstReg, {Extract, Extend});
1586 } else {
1587 // Use bitfield extract on upper 32-bit source, and combine with lower
1588 // 32-bit source.
1589 auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1590 auto Extract =
1591 Signed
1592 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1593 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1594 B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
1596 MI.eraseFromParent();
1597 return true;
1600 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1601 // operations.
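// Illustrative example (not from the original source): if at run time the
// width register holds 20 and the offset register 8, this computes
// ((Src >> 8) << 44) >> 44, where the final shift is arithmetic for the signed
// form (replicating the sign of the extracted field) and logical for the
// unsigned form (clearing the upper 44 bits).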
1602 auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1603 auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1604 if (Signed)
1605 B.buildAShr(S64, SignBit, ExtShift);
1606 else
1607 B.buildLShr(S64, SignBit, ExtShift);
1608 MI.eraseFromParent();
1609 return true;
1612 // The scalar form packs the offset and width in a single operand.
1614 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1615 MachineIRBuilder B(MI, ApplyBank);
1617 // Ensure the high bits are clear to insert the offset.
1618 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1619 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1621 // Zeros out the low bits, so don't bother clamping the input value.
1622 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1624 // Transformation function, pack the offset and width of a BFE into
1625 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1626 // source, bits [5:0] contain the offset and bits [22:16] the width.
1627 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
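// Worked example (illustrative, not part of the original code): for an offset
// of 8 and a width of 16, the merged operand is (8 & 0x3f) | (16 << 16) =
// 0x00100008, i.e. bits [5:0] = 8 and bits [22:16] = 16, as S_BFE expects.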
1629 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1630 // register class constraints.
1631 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1632 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1634 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1635 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1636 llvm_unreachable("failed to constrain BFE");
1638 MI.eraseFromParent();
1639 return true;
1642 // Return a suitable opcode for extending the operands of Opc when widening.
1643 static unsigned getExtendOp(unsigned Opc) {
1644 switch (Opc) {
1645 case TargetOpcode::G_ASHR:
1646 case TargetOpcode::G_SMIN:
1647 case TargetOpcode::G_SMAX:
1648 return TargetOpcode::G_SEXT;
1649 case TargetOpcode::G_LSHR:
1650 case TargetOpcode::G_UMIN:
1651 case TargetOpcode::G_UMAX:
1652 return TargetOpcode::G_ZEXT;
1653 default:
1654 return TargetOpcode::G_ANYEXT;
1658 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1659 // any illegal vector extend or unmerge operations.
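// For example (illustrative): if the <2 x s16> source bitcasts to 0xAAAABBBB,
// zero-extension yields lo = 0x0000BBBB and hi = 0x0000AAAA; the signed form
// instead uses G_SEXT_INREG for the low half and an arithmetic shift by 16 for
// the high half.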
1660 static std::pair<Register, Register>
1661 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1662 const LLT S32 = LLT::scalar(32);
1663 auto Bitcast = B.buildBitcast(S32, Src);
1665 if (ExtOpcode == TargetOpcode::G_SEXT) {
1666 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1667 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1668 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1671 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1672 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1673 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1674 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1677 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1678 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1681 // For cases where only a single copy is inserted for matching register banks,
1682 // replace the register in the instruction operand with it.
1683 static bool substituteSimpleCopyRegs(
1684 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1685 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1686 if (!SrcReg.empty()) {
1687 assert(SrcReg.size() == 1);
1688 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1689 return true;
1692 return false;
1695 /// Handle register layout difference for f16 images for some subtargets.
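/// Illustrative sketch of the unpacked-D16 layout handled below: a <4 x s16>
/// data value is unmerged into four s16 pieces and rebuilt as a <4 x s32>
/// vector, so each half-word occupies its own 32-bit element; packed-D16
/// subtargets return the register unchanged.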
1696 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1697 MachineRegisterInfo &MRI,
1698 Register Reg) const {
1699 if (!Subtarget.hasUnpackedD16VMem())
1700 return Reg;
1702 const LLT S16 = LLT::scalar(16);
1703 LLT StoreVT = MRI.getType(Reg);
1704 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1705 return Reg;
1707 auto Unmerge = B.buildUnmerge(S16, Reg);
1710 SmallVector<Register, 4> WideRegs;
1711 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1712 WideRegs.push_back(Unmerge.getReg(I));
1714 const LLT S32 = LLT::scalar(32);
1715 int NumElts = StoreVT.getNumElements();
1717 return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
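// If Reg is a constant or an add of a base register and a constant, return the
// pair {base, constant}. For example (illustrative): %r = G_ADD %base, 16
// yields {%base, 16}, a bare G_CONSTANT 42 yields {Register(), 42}, and
// anything else yields {Reg, 0}.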
1720 static std::pair<Register, unsigned>
1721 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1722 int64_t Const;
1723 if (mi_match(Reg, MRI, m_ICst(Const)))
1724 return std::make_pair(Register(), Const);
1726 Register Base;
1727 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1728 return std::make_pair(Base, Const);
1730 // TODO: Handle G_OR used for add case
1731 return std::make_pair(Reg, 0);
1734 std::pair<Register, unsigned>
1735 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1736 Register OrigOffset) const {
1737 const unsigned MaxImm = 4095;
1738 Register BaseReg;
1739 unsigned ImmOffset;
1740 const LLT S32 = LLT::scalar(32);
1742 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1743 OrigOffset);
1745 unsigned C1 = 0;
1746 if (ImmOffset != 0) {
1747 // If the immediate value is too big for the immoffset field, keep only the
1748 // low 12 bits in the immoffset field so that the value copied/added for the
1749 // voffset field is a multiple of 4096, and it stands a better chance of
1750 // being CSEd with the copy/add for another similar load/store.
1751 // However, do not do that rounding down to a multiple of 4096 if that is a
1752 // negative number, as it appears to be illegal to have a negative offset
1753 // in the vgpr, even if adding the immediate offset makes it positive.
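// Worked example (illustrative): an original offset of 4100 splits into an
// immediate offset of 4 plus a voffset addend of 4096, while a negative offset
// such as -8 keeps 0 in the immoffset field and leaves the whole value in the
// voffset addend.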
1754 unsigned Overflow = ImmOffset & ~MaxImm;
1755 ImmOffset -= Overflow;
1756 if ((int32_t)Overflow < 0) {
1757 Overflow += ImmOffset;
1758 ImmOffset = 0;
1761 C1 = ImmOffset;
1762 if (Overflow != 0) {
1763 if (!BaseReg)
1764 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1765 else {
1766 auto OverflowVal = B.buildConstant(S32, Overflow);
1767 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1772 if (!BaseReg)
1773 BaseReg = B.buildConstant(S32, 0).getReg(0);
1775 return {BaseReg, C1};
1778 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1779 int64_t C;
1780 return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1783 static unsigned extractCPol(unsigned CachePolicy) {
1784 return CachePolicy & AMDGPU::CPol::ALL;
1787 static unsigned extractSWZ(unsigned CachePolicy) {
1788 return (CachePolicy >> 3) & 1;
1792 MachineInstr *
1793 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1794 MachineInstr &MI) const {
1795 MachineRegisterInfo &MRI = *B.getMRI();
1796 executeInWaterfallLoop(B, MI, MRI, {2, 4});
1798 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1800 Register VData = MI.getOperand(1).getReg();
1801 LLT Ty = MRI.getType(VData);
1803 int EltSize = Ty.getScalarSizeInBits();
1804 int Size = Ty.getSizeInBits();
1806 // FIXME: Broken integer truncstore.
1807 if (EltSize != 32)
1808 report_fatal_error("unhandled intrinsic store");
1810 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1811 const int MemSize = (*MI.memoperands_begin())->getSize();
1814 Register RSrc = MI.getOperand(2).getReg();
1815 Register VOffset = MI.getOperand(3).getReg();
1816 Register SOffset = MI.getOperand(4).getReg();
1817 unsigned CachePolicy = MI.getOperand(5).getImm();
1819 unsigned ImmOffset;
1820 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1822 const bool Offen = !isZero(VOffset, MRI);
1824 unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1825 switch (8 * MemSize) {
1826 case 8:
1827 Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1828 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1829 break;
1830 case 16:
1831 Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1832 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1833 break;
1834 default:
1835 Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1836 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1837 if (Size > 32)
1838 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1839 break;
1843 // Set the insertion point back to the instruction in case it was moved into a
1844 // loop.
1845 B.setInstr(MI);
1847 MachineInstrBuilder MIB = B.buildInstr(Opc)
1848 .addUse(VData);
1850 if (Offen)
1851 MIB.addUse(VOffset);
1853 MIB.addUse(RSrc)
1854 .addUse(SOffset)
1855 .addImm(ImmOffset)
1856 .addImm(extractCPol(CachePolicy))
1857 .addImm(0) // tfe: FIXME: Remove from inst
1858 .addImm(extractSWZ(CachePolicy))
1859 .cloneMemRefs(MI);
1861 // FIXME: We need a way to report failure from applyMappingImpl.
1862 // Insert constrain copies before inserting the loop.
1863 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1864 report_fatal_error("failed to constrain selected store intrinsic");
1866 return MIB;
1869 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1870 Register SrcReg) const {
1871 MachineRegisterInfo &MRI = *B.getMRI();
1872 LLT SrcTy = MRI.getType(SrcReg);
1873 if (SrcTy.getSizeInBits() == 32) {
1874 // Use a v_mov_b32 here to make the exec dependency explicit.
1875 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1876 .addDef(DstReg)
1877 .addUse(SrcReg);
1878 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1879 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1882 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1883 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1885 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1886 .addDef(TmpReg0)
1887 .addUse(SrcReg, 0, AMDGPU::sub0);
1888 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1889 .addDef(TmpReg1)
1890 .addUse(SrcReg, 0, AMDGPU::sub1);
1891 B.buildInstr(AMDGPU::REG_SEQUENCE)
1892 .addDef(DstReg)
1893 .addUse(TmpReg0)
1894 .addImm(AMDGPU::sub0)
1895 .addUse(TmpReg1)
1896 .addImm(AMDGPU::sub1);
1898 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1899 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1902 /// Utility function for pushing dynamic vector indexes with a constant offset
1903 /// into waterfall loops.
1904 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1905 MachineInstr &IdxUseInstr,
1906 unsigned OpIdx,
1907 unsigned ConstOffset) {
1908 MachineRegisterInfo &MRI = *B.getMRI();
1909 const LLT S32 = LLT::scalar(32);
1910 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1911 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1913 auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1915 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1916 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1917 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1918 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1921 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1922 /// original 32-bit source value (to be inserted in the low part of the combined
1923 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1924 /// value.
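/// For example (illustrative): sign-extending a Lo32 value of 0x80000000
/// writes 0xFFFFFFFF into Hi32Reg via an arithmetic shift by 31 (or simply
/// copies Lo32Reg when the source is known to be a boolean), zero-extension
/// writes 0, and any-extension leaves Hi32Reg undefined.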
1925 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1926 Register Hi32Reg, Register Lo32Reg,
1927 unsigned ExtOpc,
1928 const RegisterBank &RegBank,
1929 bool IsBooleanSrc = false) {
1930 if (ExtOpc == AMDGPU::G_ZEXT) {
1931 B.buildConstant(Hi32Reg, 0);
1932 } else if (ExtOpc == AMDGPU::G_SEXT) {
1933 if (IsBooleanSrc) {
1934 // If we know the original source was an s1, the high half is the same as
1935 // the low.
1936 B.buildCopy(Hi32Reg, Lo32Reg);
1937 } else {
1938 // Replicate sign bit from 32-bit extended part.
1939 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1940 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1941 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1943 } else {
1944 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1945 B.buildUndef(Hi32Reg);
1949 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1950 MachineInstr &MI, MachineRegisterInfo &MRI,
1951 const OperandsMapper &OpdMapper) const {
1953 Register VecReg = MI.getOperand(1).getReg();
1954 Register Idx = MI.getOperand(2).getReg();
1956 const RegisterBank &IdxBank =
1957 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1959 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1961 LLT VecTy = MRI.getType(VecReg);
1962 unsigned EltSize = VecTy.getScalarSizeInBits();
1963 unsigned NumElem = VecTy.getNumElements();
1965 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1966 IsDivergentIdx))
1967 return false;
1969 MachineIRBuilder B(MI);
1970 LLT S32 = LLT::scalar(32);
1972 const RegisterBank &DstBank =
1973 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1974 const RegisterBank &SrcBank =
1975 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1977 const RegisterBank &CCBank =
1978 (DstBank == AMDGPU::SGPRRegBank &&
1979 SrcBank == AMDGPU::SGPRRegBank &&
1980 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1981 : AMDGPU::VCCRegBank;
1982 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1984 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1985 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1986 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1989 LLT EltTy = VecTy.getScalarType();
1990 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1991 unsigned NumLanes = DstRegs.size();
1992 if (!NumLanes)
1993 NumLanes = 1;
1994 else
1995 EltTy = MRI.getType(DstRegs[0]);
1997 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1998 SmallVector<Register, 2> Res(NumLanes);
1999 for (unsigned L = 0; L < NumLanes; ++L)
2000 Res[L] = UnmergeToEltTy.getReg(L);
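// Illustrative expansion (not from the original source): for a two-element
// vector this builds Res = select(Idx == 1, Elt1, Elt0), i.e. a linear chain
// of compare+select pairs seeded with element 0 and walking elements 1..N-1.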
2002 for (unsigned I = 1; I < NumElem; ++I) {
2003 auto IC = B.buildConstant(S32, I);
2004 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2005 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2006 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2008 for (unsigned L = 0; L < NumLanes; ++L) {
2009 auto S = B.buildSelect(EltTy, Cmp,
2010 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
2012 for (unsigned N : { 0, 2, 3 })
2013 MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
2015 Res[L] = S->getOperand(0).getReg();
2019 for (unsigned L = 0; L < NumLanes; ++L) {
2020 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
2021 B.buildCopy(DstReg, Res[L]);
2022 MRI.setRegBank(DstReg, DstBank);
2025 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2026 MI.eraseFromParent();
2028 return true;
2031 // Insert a cross regbank copy for a register if it already has a bank that
2032 // differs from the one we want to set.
2033 static Register constrainRegToBank(MachineRegisterInfo &MRI,
2034 MachineIRBuilder &B, Register &Reg,
2035 const RegisterBank &Bank) {
2036 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2037 if (CurrBank && *CurrBank != Bank) {
2038 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2039 MRI.setRegBank(Copy, Bank);
2040 return Copy;
2043 MRI.setRegBank(Reg, Bank);
2044 return Reg;
2047 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2048 MachineInstr &MI, MachineRegisterInfo &MRI,
2049 const OperandsMapper &OpdMapper) const {
2051 Register VecReg = MI.getOperand(1).getReg();
2052 Register Idx = MI.getOperand(3).getReg();
2054 const RegisterBank &IdxBank =
2055 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2057 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2059 LLT VecTy = MRI.getType(VecReg);
2060 unsigned EltSize = VecTy.getScalarSizeInBits();
2061 unsigned NumElem = VecTy.getNumElements();
2063 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2064 IsDivergentIdx))
2065 return false;
2067 MachineIRBuilder B(MI);
2068 LLT S32 = LLT::scalar(32);
2070 const RegisterBank &DstBank =
2071 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2072 const RegisterBank &SrcBank =
2073 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2074 const RegisterBank &InsBank =
2075 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2077 const RegisterBank &CCBank =
2078 (DstBank == AMDGPU::SGPRRegBank &&
2079 SrcBank == AMDGPU::SGPRRegBank &&
2080 InsBank == AMDGPU::SGPRRegBank &&
2081 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2082 : AMDGPU::VCCRegBank;
2083 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2085 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2086 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2087 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2090 LLT EltTy = VecTy.getScalarType();
2091 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2092 unsigned NumLanes = InsRegs.size();
2093 if (!NumLanes) {
2094 NumLanes = 1;
2095 InsRegs.push_back(MI.getOperand(2).getReg());
2096 } else {
2097 EltTy = MRI.getType(InsRegs[0]);
2100 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2101 SmallVector<Register, 16> Ops(NumElem * NumLanes);
2103 for (unsigned I = 0; I < NumElem; ++I) {
2104 auto IC = B.buildConstant(S32, I);
2105 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2106 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2107 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2109 for (unsigned L = 0; L < NumLanes; ++L) {
2110 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2111 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2112 Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2114 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2115 MRI.setRegBank(Select, DstBank);
2117 Ops[I * NumLanes + L] = Select;
2121 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2122 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2123 B.buildBuildVector(MI.getOperand(0), Ops);
2124 } else {
2125 auto Vec = B.buildBuildVector(MergeTy, Ops);
2126 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2127 B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2130 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2131 MI.eraseFromParent();
2133 return true;
2136 void AMDGPURegisterBankInfo::applyMappingImpl(
2137 const OperandsMapper &OpdMapper) const {
2138 MachineInstr &MI = OpdMapper.getMI();
2139 unsigned Opc = MI.getOpcode();
2140 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2141 switch (Opc) {
2142 case AMDGPU::G_PHI: {
2143 Register DstReg = MI.getOperand(0).getReg();
2144 LLT DstTy = MRI.getType(DstReg);
2145 if (DstTy != LLT::scalar(1))
2146 break;
2148 const LLT S32 = LLT::scalar(32);
2149 const RegisterBank *DstBank =
2150 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2151 if (DstBank == &AMDGPU::VCCRegBank) {
2152 applyDefaultMapping(OpdMapper);
2153 // The standard handling only considers the result register bank for
2154 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2155 // produce an invalid copy. We can only copy with some kind of compare to
2156 // get a vector boolean result. Insert a register bank copy that will be
2157 // correctly lowered to a compare.
2158 MachineIRBuilder B(*MI.getParent()->getParent());
2160 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2161 Register SrcReg = MI.getOperand(I).getReg();
2162 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2164 if (SrcBank != &AMDGPU::VCCRegBank) {
2165 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2166 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2168 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2169 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2170 MI.getOperand(I).setReg(Copy.getReg(0));
2174 return;
2177 // Phi handling is strange and only considers the bank of the destination.
2178 substituteSimpleCopyRegs(OpdMapper, 0);
2180 // Promote SGPR/VGPR booleans to s32
2181 MachineFunction *MF = MI.getParent()->getParent();
2182 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2183 MachineIRBuilder B(MI, ApplyBank);
2184 LegalizerHelper Helper(*MF, ApplyBank, B);
2186 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2187 llvm_unreachable("widen scalar should have succeeded");
2189 return;
2191 case AMDGPU::G_ICMP:
2192 case AMDGPU::G_UADDO:
2193 case AMDGPU::G_USUBO:
2194 case AMDGPU::G_UADDE:
2195 case AMDGPU::G_SADDE:
2196 case AMDGPU::G_USUBE:
2197 case AMDGPU::G_SSUBE: {
2198 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2199 Register DstReg = MI.getOperand(BoolDstOp).getReg();
2201 const RegisterBank *DstBank =
2202 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2203 if (DstBank != &AMDGPU::SGPRRegBank)
2204 break;
2206 const bool HasCarryIn = MI.getNumOperands() == 5;
2208 // If this is a scalar compare, promote the result to s32, as the selection
2209 // will end up using a copy to a 32-bit vreg.
2210 const LLT S32 = LLT::scalar(32);
2211 Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2212 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2213 MI.getOperand(BoolDstOp).setReg(NewDstReg);
2214 MachineIRBuilder B(MI);
2216 if (HasCarryIn) {
2217 Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2218 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2219 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2220 MI.getOperand(4).setReg(NewSrcReg);
2223 MachineBasicBlock *MBB = MI.getParent();
2224 B.setInsertPt(*MBB, std::next(MI.getIterator()));
2226 // If we had a constrained VCC result register, a copy was inserted to VCC
2227 // from SGPR.
2228 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2229 if (DefRegs.empty())
2230 DefRegs.push_back(DstReg);
2231 B.buildTrunc(DefRegs[0], NewDstReg);
2232 return;
2234 case AMDGPU::G_SELECT: {
2235 Register DstReg = MI.getOperand(0).getReg();
2236 LLT DstTy = MRI.getType(DstReg);
2238 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2239 if (CondRegs.empty())
2240 CondRegs.push_back(MI.getOperand(1).getReg());
2241 else {
2242 assert(CondRegs.size() == 1);
2245 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2246 if (CondBank == &AMDGPU::SGPRRegBank) {
2247 MachineIRBuilder B(MI);
2248 const LLT S32 = LLT::scalar(32);
2249 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2250 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2252 MI.getOperand(1).setReg(NewCondReg);
2253 B.buildZExt(NewCondReg, CondRegs[0]);
2256 if (DstTy.getSizeInBits() != 64)
2257 break;
2259 MachineIRBuilder B(MI);
2260 LLT HalfTy = getHalfSizedType(DstTy);
2262 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2263 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2264 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2266 // All inputs are SGPRs, nothing special to do.
2267 if (DefRegs.empty()) {
2268 assert(Src1Regs.empty() && Src2Regs.empty());
2269 break;
2272 if (Src1Regs.empty())
2273 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2274 else {
2275 setRegsToType(MRI, Src1Regs, HalfTy);
2278 if (Src2Regs.empty())
2279 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2280 else
2281 setRegsToType(MRI, Src2Regs, HalfTy);
2283 setRegsToType(MRI, DefRegs, HalfTy);
2285 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2286 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2288 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2289 MI.eraseFromParent();
2290 return;
2292 case AMDGPU::G_BRCOND: {
2293 Register CondReg = MI.getOperand(0).getReg();
2294 // FIXME: Should use legalizer helper, but should change bool ext type.
2295 const RegisterBank *CondBank =
2296 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2298 if (CondBank == &AMDGPU::SGPRRegBank) {
2299 MachineIRBuilder B(MI);
2300 const LLT S32 = LLT::scalar(32);
2301 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2302 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2304 MI.getOperand(0).setReg(NewCondReg);
2305 B.buildZExt(NewCondReg, CondReg);
2306 return;
2309 break;
2311 case AMDGPU::G_AND:
2312 case AMDGPU::G_OR:
2313 case AMDGPU::G_XOR: {
2314 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2315 // there is a VGPR input.
2316 Register DstReg = MI.getOperand(0).getReg();
2317 LLT DstTy = MRI.getType(DstReg);
2319 if (DstTy.getSizeInBits() == 1) {
2320 const RegisterBank *DstBank =
2321 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2322 if (DstBank == &AMDGPU::VCCRegBank)
2323 break;
2325 MachineFunction *MF = MI.getParent()->getParent();
2326 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2327 MachineIRBuilder B(MI, ApplyBank);
2328 LegalizerHelper Helper(*MF, ApplyBank, B);
2330 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2331 LegalizerHelper::Legalized)
2332 llvm_unreachable("widen scalar should have succeeded");
2333 return;
2336 if (DstTy.getSizeInBits() != 64)
2337 break;
2339 LLT HalfTy = getHalfSizedType(DstTy);
2340 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2341 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2342 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2344 // All inputs are SGPRs, nothing special to do.
2345 if (DefRegs.empty()) {
2346 assert(Src0Regs.empty() && Src1Regs.empty());
2347 break;
2350 assert(DefRegs.size() == 2);
2351 assert(Src0Regs.size() == Src1Regs.size() &&
2352 (Src0Regs.empty() || Src0Regs.size() == 2));
2354 // Depending on where the source registers came from, the generic code may
2355 // have decided to split the inputs already or not. If not, we still need to
2356 // extract the values.
2357 MachineIRBuilder B(MI);
2359 if (Src0Regs.empty())
2360 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2361 else
2362 setRegsToType(MRI, Src0Regs, HalfTy);
2364 if (Src1Regs.empty())
2365 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2366 else
2367 setRegsToType(MRI, Src1Regs, HalfTy);
2369 setRegsToType(MRI, DefRegs, HalfTy);
2371 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2372 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2374 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2375 MI.eraseFromParent();
2376 return;
2378 case AMDGPU::G_ABS: {
2379 Register SrcReg = MI.getOperand(1).getReg();
2380 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2382 // There is no VALU abs instruction so we need to replace it with a sub and
2383 // max combination.
2384 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2385 MachineFunction *MF = MI.getParent()->getParent();
2386 ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2387 MachineIRBuilder B(MI, Apply);
2388 LegalizerHelper Helper(*MF, Apply, B);
2390 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2391 llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2392 return;
2394 LLVM_FALLTHROUGH;
2396 case AMDGPU::G_ADD:
2397 case AMDGPU::G_SUB:
2398 case AMDGPU::G_MUL:
2399 case AMDGPU::G_SHL:
2400 case AMDGPU::G_LSHR:
2401 case AMDGPU::G_ASHR:
2402 case AMDGPU::G_SMIN:
2403 case AMDGPU::G_SMAX:
2404 case AMDGPU::G_UMIN:
2405 case AMDGPU::G_UMAX: {
2406 Register DstReg = MI.getOperand(0).getReg();
2407 LLT DstTy = MRI.getType(DstReg);
2409 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2410 // Packed 16-bit operations need to be scalarized and promoted.
2411 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2412 break;
2414 const RegisterBank *DstBank =
2415 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2416 if (DstBank == &AMDGPU::VGPRRegBank)
2417 break;
2419 const LLT S32 = LLT::scalar(32);
2420 MachineBasicBlock *MBB = MI.getParent();
2421 MachineFunction *MF = MBB->getParent();
2422 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2423 MachineIRBuilder B(MI, ApplySALU);
2425 if (DstTy.isVector()) {
2426 Register WideSrc0Lo, WideSrc0Hi;
2427 Register WideSrc1Lo, WideSrc1Hi;
2429 unsigned ExtendOp = getExtendOp(MI.getOpcode());
2430 std::tie(WideSrc0Lo, WideSrc0Hi)
2431 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2432 std::tie(WideSrc1Lo, WideSrc1Hi)
2433 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2434 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2435 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2436 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2437 MI.eraseFromParent();
2438 } else {
2439 LegalizerHelper Helper(*MF, ApplySALU, B);
2441 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2442 llvm_unreachable("widen scalar should have succeeded");
2444 // FIXME: s16 shift amounts should be legal.
2445 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2446 Opc == AMDGPU::G_ASHR) {
2447 B.setInsertPt(*MBB, MI.getIterator());
2448 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2449 llvm_unreachable("widen scalar should have succeeded");
2453 return;
2455 case AMDGPU::G_SEXT_INREG: {
2456 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2457 if (SrcRegs.empty())
2458 break; // Nothing to repair
2460 const LLT S32 = LLT::scalar(32);
2461 MachineIRBuilder B(MI);
2462 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2463 GISelObserverWrapper Observer(&O);
2464 B.setChangeObserver(Observer);
2466 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2467 // we would need to further expand, and doesn't let us directly set the
2468 // result registers.
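// Sketch of the manual expansion below (illustrative): G_SEXT_INREG %x(s64), 8
// on VGPRs becomes DstLo = G_SEXT_INREG SrcLo, 8 and DstHi = G_ASHR DstLo, 31;
// amounts above 32 instead copy the low half unchanged and sign-extend within
// the high half.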
2469 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2471 int Amt = MI.getOperand(2).getImm();
2472 if (Amt <= 32) {
2473 if (Amt == 32) {
2474 // The low bits are unchanged.
2475 B.buildCopy(DstRegs[0], SrcRegs[0]);
2476 } else {
2477 // Extend in the low bits and propagate the sign bit to the high half.
2478 B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2481 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2482 } else {
2483 // The low bits are unchanged, and extend in the high bits.
2484 B.buildCopy(DstRegs[0], SrcRegs[0]);
2485 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2488 Register DstReg = MI.getOperand(0).getReg();
2489 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2490 MI.eraseFromParent();
2491 return;
2493 case AMDGPU::G_CTPOP:
2494 case AMDGPU::G_BITREVERSE: {
2495 const RegisterBank *DstBank =
2496 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2497 if (DstBank == &AMDGPU::SGPRRegBank)
2498 break;
2500 Register SrcReg = MI.getOperand(1).getReg();
2501 const LLT S32 = LLT::scalar(32);
2502 LLT Ty = MRI.getType(SrcReg);
2503 if (Ty == S32)
2504 break;
2506 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2507 MachineIRBuilder B(MI, ApplyVALU);
2509 MachineFunction &MF = B.getMF();
2510 LegalizerHelper Helper(MF, ApplyVALU, B);
2512 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2513 llvm_unreachable("narrowScalar should have succeeded");
2514 return;
2516 case AMDGPU::G_AMDGPU_FFBH_U32:
2517 case AMDGPU::G_AMDGPU_FFBL_B32:
2518 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2519 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2520 const RegisterBank *DstBank =
2521 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2522 if (DstBank == &AMDGPU::SGPRRegBank)
2523 break;
2525 Register SrcReg = MI.getOperand(1).getReg();
2526 const LLT S32 = LLT::scalar(32);
2527 LLT Ty = MRI.getType(SrcReg);
2528 if (Ty == S32)
2529 break;
2531 // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2532 // which return -1 when the input is zero:
2533 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2534 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2535 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2536 // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
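// Numeric check (illustrative): cttz_zero_undef of hi:lo = 0x1:0x0 expands to
// umin(ffbl(0x1) + 32, ffbl(0x0)) = umin(32, 0xffffffff) = 32.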
2537 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2538 MachineIRBuilder B(MI, ApplyVALU);
2539 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2540 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2541 ? AMDGPU::G_AMDGPU_FFBH_U32
2542 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2543 ? AMDGPU::G_AMDGPU_FFBL_B32
2544 : Opc;
2545 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2546 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2547 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2548 unsigned AddOpc =
2549 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2550 ? AMDGPU::G_ADD
2551 : AMDGPU::G_UADDSAT;
2552 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2553 Register DstReg = MI.getOperand(0).getReg();
2554 B.buildUMin(DstReg, X, Y);
2555 MI.eraseFromParent();
2556 return;
2558 case AMDGPU::G_SEXT:
2559 case AMDGPU::G_ZEXT:
2560 case AMDGPU::G_ANYEXT: {
2561 Register SrcReg = MI.getOperand(1).getReg();
2562 LLT SrcTy = MRI.getType(SrcReg);
2563 const bool Signed = Opc == AMDGPU::G_SEXT;
2565 assert(empty(OpdMapper.getVRegs(1)));
2567 MachineIRBuilder B(MI);
2568 const RegisterBank *SrcBank =
2569 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2571 Register DstReg = MI.getOperand(0).getReg();
2572 LLT DstTy = MRI.getType(DstReg);
2573 if (DstTy.isScalar() &&
2574 SrcBank != &AMDGPU::SGPRRegBank &&
2575 SrcBank != &AMDGPU::VCCRegBank &&
2576 // FIXME: Should handle any type that rounds to s64 when irregular
2577 // breakdowns are supported.
2578 DstTy.getSizeInBits() == 64 &&
2579 SrcTy.getSizeInBits() <= 32) {
2580 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2582 // Extend to 32-bit, and then extend the low half.
2583 if (Signed) {
2584 // TODO: Should really be buildSExtOrCopy
2585 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2586 } else if (Opc == AMDGPU::G_ZEXT) {
2587 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2588 } else {
2589 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2592 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2593 MRI.setRegBank(DstReg, *SrcBank);
2594 MI.eraseFromParent();
2595 return;
2598 if (SrcTy != LLT::scalar(1))
2599 return;
2601 // It is not legal to have a legalization artifact with a VCC source. Rather
2602 // than introducing a copy, insert the select we would have to select the
2603 // copy to.
2604 if (SrcBank == &AMDGPU::VCCRegBank) {
2605 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2607 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2609 unsigned DstSize = DstTy.getSizeInBits();
2610 // 64-bit select is SGPR only
2611 const bool UseSel64 = DstSize > 32 &&
2612 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2614 // TODO: Should s16 select be legal?
2615 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2616 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2617 auto False = B.buildConstant(SelType, 0);
2619 MRI.setRegBank(True.getReg(0), *DstBank);
2620 MRI.setRegBank(False.getReg(0), *DstBank);
2621 MRI.setRegBank(DstReg, *DstBank);
2623 if (DstSize > 32) {
2624 B.buildSelect(DefRegs[0], SrcReg, True, False);
2625 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2626 } else if (DstSize < 32) {
2627 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2628 MRI.setRegBank(Sel.getReg(0), *DstBank);
2629 B.buildTrunc(DstReg, Sel);
2630 } else {
2631 B.buildSelect(DstReg, SrcReg, True, False);
2634 MI.eraseFromParent();
2635 return;
2638 break;
2640 case AMDGPU::G_BUILD_VECTOR:
2641 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2642 Register DstReg = MI.getOperand(0).getReg();
2643 LLT DstTy = MRI.getType(DstReg);
2644 if (DstTy != LLT::fixed_vector(2, 16))
2645 break;
2647 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2648 substituteSimpleCopyRegs(OpdMapper, 1);
2649 substituteSimpleCopyRegs(OpdMapper, 2);
2651 const RegisterBank *DstBank =
2652 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2653 if (DstBank == &AMDGPU::SGPRRegBank)
2654 break; // Can use S_PACK_* instructions.
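// Sketch of the VGPR path below (illustrative): G_BUILD_VECTOR lo, hi becomes
// bitcast(zext(lo) | (zext(hi) << 16)), while the TRUNC variant masks lo with
// 0xffff and shifts hi by 16 without first clearing its high bits.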
2656 MachineIRBuilder B(MI);
2658 Register Lo = MI.getOperand(1).getReg();
2659 Register Hi = MI.getOperand(2).getReg();
2660 const LLT S32 = LLT::scalar(32);
2662 const RegisterBank *BankLo =
2663 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2664 const RegisterBank *BankHi =
2665 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2667 Register ZextLo;
2668 Register ShiftHi;
2670 if (Opc == AMDGPU::G_BUILD_VECTOR) {
2671 ZextLo = B.buildZExt(S32, Lo).getReg(0);
2672 MRI.setRegBank(ZextLo, *BankLo);
2674 Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2675 MRI.setRegBank(ZextHi, *BankHi);
2677 auto ShiftAmt = B.buildConstant(S32, 16);
2678 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2680 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2681 MRI.setRegBank(ShiftHi, *BankHi);
2682 } else {
2683 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2684 MRI.setRegBank(MaskLo, *BankLo);
2686 auto ShiftAmt = B.buildConstant(S32, 16);
2687 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2689 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2690 MRI.setRegBank(ShiftHi, *BankHi);
2692 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2693 MRI.setRegBank(ZextLo, *BankLo);
2696 auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2697 MRI.setRegBank(Or.getReg(0), *DstBank);
2699 B.buildBitcast(DstReg, Or);
2700 MI.eraseFromParent();
2701 return;
2703 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2704 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2706 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2708 Register DstReg = MI.getOperand(0).getReg();
2709 Register SrcReg = MI.getOperand(1).getReg();
2711 const LLT S32 = LLT::scalar(32);
2712 LLT DstTy = MRI.getType(DstReg);
2713 LLT SrcTy = MRI.getType(SrcReg);
2715 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2716 return;
2718 MachineIRBuilder B(MI);
2720 const ValueMapping &DstMapping
2721 = OpdMapper.getInstrMapping().getOperandMapping(0);
2722 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2723 const RegisterBank *SrcBank =
2724 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2725 const RegisterBank *IdxBank =
2726 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2728 Register BaseIdxReg;
2729 unsigned ConstOffset;
2730 std::tie(BaseIdxReg, ConstOffset) =
2731 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2733 // See if the index is an add of a constant which will be foldable by moving
2734 // the base register of the index later if this is going to be executed in a
2735 // waterfall loop. This is essentially to reassociate the add of a constant
2736 // with the readfirstlane.
2737 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2738 ConstOffset > 0 &&
2739 ConstOffset < SrcTy.getNumElements();
2741 // Move the base register. We'll re-insert the add later.
2742 if (ShouldMoveIndexIntoLoop)
2743 MI.getOperand(2).setReg(BaseIdxReg);
2745 // If this is a VGPR result only because the index was a VGPR result, the
2746 // actual indexing will be done on the SGPR source vector, which will
2747 // produce a scalar result. We need to copy to the VGPR result inside the
2748 // waterfall loop.
2749 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2750 SrcBank == &AMDGPU::SGPRRegBank;
2751 if (DstRegs.empty()) {
2752 applyDefaultMapping(OpdMapper);
2754 executeInWaterfallLoop(MI, MRI, { 2 });
2756 if (NeedCopyToVGPR) {
2757 // We don't want a phi for this temporary reg.
2758 Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2759 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2760 MI.getOperand(0).setReg(TmpReg);
2761 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2763 // Use a v_mov_b32 here to make the exec dependency explicit.
2764 buildVCopy(B, DstReg, TmpReg);
2767 // Re-insert the constant offset add inside the waterfall loop.
2768 if (ShouldMoveIndexIntoLoop)
2769 reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2771 return;
2774 assert(DstTy.getSizeInBits() == 64);
2776 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2778 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2779 auto One = B.buildConstant(S32, 1);
2781 MachineBasicBlock::iterator MII = MI.getIterator();
2783 // Split the vector index into 32-bit pieces. Prepare to move all of the
2784 // new instructions into a waterfall loop if necessary.
2786 // Don't put the bitcast or constant in the loop.
2787 MachineInstrSpan Span(MII, &B.getMBB());
2789 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2790 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2791 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2793 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2794 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2796 MRI.setRegBank(DstReg, *DstBank);
2797 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2798 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2799 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2800 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2802 SmallSet<Register, 4> OpsToWaterfall;
2803 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2804 MI.eraseFromParent();
2805 return;
2808 // Remove the original instruction to avoid potentially confusing the
2809 // waterfall loop logic.
2810 B.setInstr(*Span.begin());
2811 MI.eraseFromParent();
2812 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2813 OpsToWaterfall, MRI);
2815 if (NeedCopyToVGPR) {
2816 MachineBasicBlock *LoopBB = Extract1->getParent();
2817 Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2818 Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2819 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2820 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2822 Extract0->getOperand(0).setReg(TmpReg0);
2823 Extract1->getOperand(0).setReg(TmpReg1);
2825 B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2827 buildVCopy(B, DstRegs[0], TmpReg0);
2828 buildVCopy(B, DstRegs[1], TmpReg1);
2831 if (ShouldMoveIndexIntoLoop)
2832 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2834 return;
2836 case AMDGPU::G_INSERT_VECTOR_ELT: {
2837 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2839 Register DstReg = MI.getOperand(0).getReg();
2840 LLT VecTy = MRI.getType(DstReg);
2842 assert(OpdMapper.getVRegs(0).empty());
2843 assert(OpdMapper.getVRegs(3).empty());
2845 if (substituteSimpleCopyRegs(OpdMapper, 1))
2846 MRI.setType(MI.getOperand(1).getReg(), VecTy);
2848 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2849 return;
2851 const RegisterBank *IdxBank =
2852 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2854 Register SrcReg = MI.getOperand(1).getReg();
2855 Register InsReg = MI.getOperand(2).getReg();
2856 LLT InsTy = MRI.getType(InsReg);
2857 (void)InsTy;
2859 Register BaseIdxReg;
2860 unsigned ConstOffset;
2861 std::tie(BaseIdxReg, ConstOffset) =
2862 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2864 // See if the index is an add of a constant which will be foldable by moving
2865 // the base register of the index later if this is going to be executed in a
2866 // waterfall loop. This is essentially to reassociate the add of a constant
2867 // with the readfirstlane.
2868 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2869 ConstOffset > 0 &&
2870 ConstOffset < VecTy.getNumElements();
2872 // Move the base register. We'll re-insert the add later.
2873 if (ShouldMoveIndexIntoLoop)
2874 MI.getOperand(3).setReg(BaseIdxReg);
2877 if (InsRegs.empty()) {
2878 executeInWaterfallLoop(MI, MRI, { 3 });
2880 // Re-insert the constant offset add inside the waterfall loop.
2881 if (ShouldMoveIndexIntoLoop) {
2882 MachineIRBuilder B(MI);
2883 reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2886 return;
2890 assert(InsTy.getSizeInBits() == 64);
2892 const LLT S32 = LLT::scalar(32);
2893 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2895 MachineIRBuilder B(MI);
2896 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2897 auto One = B.buildConstant(S32, 1);
2899 // Split the vector index into 32-bit pieces. Prepare to move all of the
2900 // new instructions into a waterfall loop if necessary.
2902 // Don't put the bitcast or constant in the loop.
2903 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2905 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2906 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2907 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2909 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2910 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2912 const RegisterBank *DstBank =
2913 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2914 const RegisterBank *SrcBank =
2915 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2916 const RegisterBank *InsSrcBank =
2917 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2919 MRI.setRegBank(InsReg, *InsSrcBank);
2920 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2921 MRI.setRegBank(InsLo.getReg(0), *DstBank);
2922 MRI.setRegBank(InsHi.getReg(0), *DstBank);
2923 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2924 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2925 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2928 SmallSet<Register, 4> OpsToWaterfall;
2929 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2930 B.setInsertPt(B.getMBB(), MI);
2931 B.buildBitcast(DstReg, InsHi);
2932 MI.eraseFromParent();
2933 return;
2936 B.setInstr(*Span.begin());
2937 MI.eraseFromParent();
2939 // Figure out the point after the waterfall loop before mangling the control
2940 // flow.
2941 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2942 OpsToWaterfall, MRI);
2944 // The insertion point is now right after the original instruction.
2946 // Keep the bitcast to the original vector type out of the loop. Doing this
2947 // saves an extra phi we don't need inside the loop.
2948 B.buildBitcast(DstReg, InsHi);
2950 // Re-insert the constant offset add inside the waterfall loop.
2951 if (ShouldMoveIndexIntoLoop)
2952 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2954 return;
2956 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2957 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2958 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2959 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2960 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2961 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2962 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2963 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2964 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2965 case AMDGPU::G_AMDGPU_BUFFER_STORE:
2966 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2967 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2968 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2969 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2970 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2971 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2972 applyDefaultMapping(OpdMapper);
2973 executeInWaterfallLoop(MI, MRI, {1, 4});
2974 return;
2976 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2977 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2978 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2979 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2980 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2981 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2982 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2983 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2984 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2985 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2986 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2987 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2988 applyDefaultMapping(OpdMapper);
2989 executeInWaterfallLoop(MI, MRI, {2, 5});
2990 return;
2992 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2993 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2994 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2995 applyDefaultMapping(OpdMapper);
2996 executeInWaterfallLoop(MI, MRI, {2, 5});
2997 return;
2999 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3000 applyDefaultMapping(OpdMapper);
3001 executeInWaterfallLoop(MI, MRI, {3, 6});
3002 return;
3004 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3005 applyMappingSBufferLoad(OpdMapper);
3006 return;
3008 case AMDGPU::G_INTRINSIC: {
3009 switch (MI.getIntrinsicID()) {
3010 case Intrinsic::amdgcn_readlane: {
3011 substituteSimpleCopyRegs(OpdMapper, 2);
3013 assert(OpdMapper.getVRegs(0).empty());
3014 assert(OpdMapper.getVRegs(3).empty());
3016 // Make sure the index is an SGPR. It doesn't make sense to run this in a
3017 // waterfall loop, so assume it's a uniform value.
3018 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
3019 return;
3021 case Intrinsic::amdgcn_writelane: {
3022 assert(OpdMapper.getVRegs(0).empty());
3023 assert(OpdMapper.getVRegs(2).empty());
3024 assert(OpdMapper.getVRegs(3).empty());
3026 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
3027 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
3028 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
3029 return;
3031 case Intrinsic::amdgcn_interp_p1:
3032 case Intrinsic::amdgcn_interp_p2:
3033 case Intrinsic::amdgcn_interp_mov:
3034 case Intrinsic::amdgcn_interp_p1_f16:
3035 case Intrinsic::amdgcn_interp_p2_f16: {
3036 applyDefaultMapping(OpdMapper);
3038 // Readlane for m0 value, which is always the last operand.
3039 // FIXME: Should this be a waterfall loop instead?
3040 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3041 return;
3043 case Intrinsic::amdgcn_permlane16:
3044 case Intrinsic::amdgcn_permlanex16: {
3045 // Doing a waterfall loop over these wouldn't make any sense.
3046 substituteSimpleCopyRegs(OpdMapper, 2);
3047 substituteSimpleCopyRegs(OpdMapper, 3);
3048 constrainOpWithReadfirstlane(MI, MRI, 4);
3049 constrainOpWithReadfirstlane(MI, MRI, 5);
3050 return;
3052 case Intrinsic::amdgcn_sbfe:
3053 applyMappingBFE(OpdMapper, true);
3054 return;
3055 case Intrinsic::amdgcn_ubfe:
3056 applyMappingBFE(OpdMapper, false);
3057 return;
3058 case Intrinsic::amdgcn_ballot:
3059 // Use default handling and insert copy to vcc source.
3060 break;
3062 break;
3064 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3065 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3066 const AMDGPU::RsrcIntrinsic *RSrcIntrin
3067 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3068 assert(RSrcIntrin && RSrcIntrin->IsImage);
3069 // Non-images can have complications from operands that allow both SGPR
3070 // and VGPR. For now it's too complicated to figure out the final opcode
3071 // to derive the register bank from the MCInstrDesc.
3072 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3073 return;
3075 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3076 unsigned N = MI.getNumExplicitOperands() - 2;
3077 applyDefaultMapping(OpdMapper);
3078 executeInWaterfallLoop(MI, MRI, { N });
3079 return;
3081 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3082 auto IntrID = MI.getIntrinsicID();
3083 switch (IntrID) {
3084 case Intrinsic::amdgcn_ds_ordered_add:
3085 case Intrinsic::amdgcn_ds_ordered_swap: {
3086 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3087 assert(OpdMapper.getVRegs(0).empty());
3088 substituteSimpleCopyRegs(OpdMapper, 3);
3089 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3090 return;
3092 case Intrinsic::amdgcn_ds_gws_init:
3093 case Intrinsic::amdgcn_ds_gws_barrier:
3094 case Intrinsic::amdgcn_ds_gws_sema_br: {
3095 // Only the first lane executes, so readfirstlane is safe.
3096 substituteSimpleCopyRegs(OpdMapper, 1);
3097 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3098 return;
3100 case Intrinsic::amdgcn_ds_gws_sema_v:
3101 case Intrinsic::amdgcn_ds_gws_sema_p:
3102 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3103 // Only the first lane executes, so readfirstlane is safe.
3104 constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3105 return;
3107 case Intrinsic::amdgcn_ds_append:
3108 case Intrinsic::amdgcn_ds_consume: {
3109 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3110 return;
3112 case Intrinsic::amdgcn_s_sendmsg:
3113 case Intrinsic::amdgcn_s_sendmsghalt: {
3114 // FIXME: Should this use a waterfall loop?
3115 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3116 return;
3118 case Intrinsic::amdgcn_s_setreg: {
3119 constrainOpWithReadfirstlane(MI, MRI, 2);
3120 return;
3122 default: {
3123 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3124 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3125 // Non-images can have complications from operands that allow both SGPR
3126 // and VGPR. For now it's too complicated to figure out the final opcode
3127 // to derive the register bank from the MCInstrDesc.
3128 if (RSrcIntrin->IsImage) {
3129 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3130 return;
3134 break;
3137 break;
3139 case AMDGPU::G_LOAD:
3140 case AMDGPU::G_ZEXTLOAD:
3141 case AMDGPU::G_SEXTLOAD: {
3142 if (applyMappingLoad(MI, OpdMapper, MRI))
3143 return;
3144 break;
3146 case AMDGPU::G_DYN_STACKALLOC:
3147 applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3148 return;
3149 case AMDGPU::G_SBFX:
3150 applyMappingBFE(OpdMapper, /*Signed*/ true);
3151 return;
3152 case AMDGPU::G_UBFX:
3153 applyMappingBFE(OpdMapper, /*Signed*/ false);
3154 return;
3155 default:
3156 break;
3159 return applyDefaultMapping(OpdMapper);
3162 // vgpr, sgpr -> vgpr
3163 // vgpr, agpr -> vgpr
3164 // agpr, agpr -> agpr
3165 // agpr, sgpr -> vgpr
3166 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3167 if (RB0 == AMDGPU::InvalidRegBankID)
3168 return RB1;
3169 if (RB1 == AMDGPU::InvalidRegBankID)
3170 return RB0;
3172 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3173 return AMDGPU::SGPRRegBankID;
3175 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3176 return AMDGPU::AGPRRegBankID;
3178 return AMDGPU::VGPRRegBankID;
3181 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3182 if (RB0 == AMDGPU::InvalidRegBankID)
3183 return RB1;
3184 if (RB1 == AMDGPU::InvalidRegBankID)
3185 return RB0;
3187 // vcc, vcc -> vcc
3188 // vcc, sgpr -> vcc
3189 // vcc, vgpr -> vcc
3190 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3191 return AMDGPU::VCCRegBankID;
3193 // Any remaining (non-vcc) combination is handled by regBankUnion.
3194 return regBankUnion(RB0, RB1);
3197 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3198 const MachineInstr &MI) const {
3199 unsigned RegBank = AMDGPU::InvalidRegBankID;
3201 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3202 if (!MI.getOperand(i).isReg())
3203 continue;
3204 Register Reg = MI.getOperand(i).getReg();
3205 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3206 RegBank = regBankUnion(RegBank, Bank->getID());
3207 if (RegBank == AMDGPU::VGPRRegBankID)
3208 break;
3212 return RegBank;
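/// Return true if every register operand of \p MI that already has an assigned
/// bank is an SGPR, i.e. the instruction could be mapped entirely to the SALU.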
3215 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3216 const MachineFunction &MF = *MI.getParent()->getParent();
3217 const MachineRegisterInfo &MRI = MF.getRegInfo();
3218 for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
3219 if (!MI.getOperand(i).isReg())
3220 continue;
3221 Register Reg = MI.getOperand(i).getReg();
3222 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3223 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3224 return false;
3227 return true;
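/// Build a mapping that places every register operand in the SGPR bank.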
3230 const RegisterBankInfo::InstructionMapping &
3231 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3232 const MachineFunction &MF = *MI.getParent()->getParent();
3233 const MachineRegisterInfo &MRI = MF.getRegInfo();
3234 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3236 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3237 const MachineOperand &SrcOp = MI.getOperand(i);
3238 if (!SrcOp.isReg())
3239 continue;
3241 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3242 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3244 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3245 MI.getNumOperands());
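/// Build a mapping that places s1 (boolean) operands in VCC and every other
/// register operand in VGPRs.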
3248 const RegisterBankInfo::InstructionMapping &
3249 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3250 const MachineFunction &MF = *MI.getParent()->getParent();
3251 const MachineRegisterInfo &MRI = MF.getRegInfo();
3252 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3254 // Even though we technically could use SGPRs, this would require knowledge of
3255 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3257 // TODO: Unary ops are trivially OK, so accept SGPRs?
3258 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3259 const MachineOperand &Src = MI.getOperand(i);
3260 if (!Src.isReg())
3261 continue;
3263 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3264 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3265 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3268 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3269 MI.getNumOperands());
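/// Build a mapping that places every register operand, regardless of size, in
/// the VGPR bank.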
3272 const RegisterBankInfo::InstructionMapping &
3273 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3274 const MachineFunction &MF = *MI.getParent()->getParent();
3275 const MachineRegisterInfo &MRI = MF.getRegInfo();
3276 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3278 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3279 const MachineOperand &Op = MI.getOperand(I);
3280 if (!Op.isReg())
3281 continue;
3283 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3284 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3287 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3288 MI.getNumOperands());
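/// Build the mapping for an image intrinsic. The resource descriptor and the
/// operand immediately after it (the sampler, when present) keep whatever bank
/// they already have, since they must ultimately be SGPRs; all other register
/// operands are mapped to VGPRs.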
3291 const RegisterBankInfo::InstructionMapping &
3292 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3293 const MachineInstr &MI,
3294 int RsrcIdx) const {
3295 // The reported argument index is relative to the IR intrinsic call arguments,
3296 // so we need to shift by the number of defs and the intrinsic ID.
3297 RsrcIdx += MI.getNumExplicitDefs() + 1;
3299 const int NumOps = MI.getNumOperands();
3300 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3302 // TODO: Should packed/unpacked D16 difference be reported here as part of
3303 // the value mapping?
3304 for (int I = 0; I != NumOps; ++I) {
3305 if (!MI.getOperand(I).isReg())
3306 continue;
3308 Register OpReg = MI.getOperand(I).getReg();
3309 // We replace some dead address operands with $noreg
3310 if (!OpReg)
3311 continue;
3313 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3315 // FIXME: Probably need a new intrinsic register bank searchable table to
3316 // handle arbitrary intrinsics easily.
3318 // If this has a sampler, it immediately follows rsrc.
3319 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3321 if (MustBeSGPR) {
3322 // This must be an SGPR, so report whatever bank it currently has as legal.
3323 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3324 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3325 } else {
3326 // Some operands must be VGPR, and these are easy to copy to.
3327 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3331 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3334 /// Return the mapping for a pointer argument.
3335 const RegisterBankInfo::ValueMapping *
3336 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3337 Register PtrReg) const {
3338 LLT PtrTy = MRI.getType(PtrReg);
3339 unsigned Size = PtrTy.getSizeInBits();
3340 if (Subtarget.useFlatForGlobal() ||
3341 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3342 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3344 // If we're using MUBUF instructions for global memory, an SGPR base register
3345 // is possible. Otherwise this needs to be a VGPR.
3346 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3347 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
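/// Build the mapping for a generic load. A uniform flat/global pointer that is
/// legal for a scalar (SMRD) load gets an all-SGPR mapping; otherwise the result
/// is a VGPR, and the pointer may still be an SGPR base when MUBUF addressing is
/// used for global memory.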
3350 const RegisterBankInfo::InstructionMapping &
3351 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3353 const MachineFunction &MF = *MI.getParent()->getParent();
3354 const MachineRegisterInfo &MRI = MF.getRegInfo();
3355 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3356 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3357 Register PtrReg = MI.getOperand(1).getReg();
3358 LLT PtrTy = MRI.getType(PtrReg);
3359 unsigned AS = PtrTy.getAddressSpace();
3360 unsigned PtrSize = PtrTy.getSizeInBits();
3362 const ValueMapping *ValMapping;
3363 const ValueMapping *PtrMapping;
3365 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3367 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3368 if (isScalarLoadLegal(MI)) {
3369 // We have a uniform instruction, so we want to use an SMRD load.
3370 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3371 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3372 } else {
3373 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3375 // If we're using MUBUF instructions for global memory, an SGPR base
3376 // register is possible. Otherwise this needs to be a VGPR.
3377 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3378 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3380 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3382 } else {
3383 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3384 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3387 OpdsMapping[0] = ValMapping;
3388 OpdsMapping[1] = PtrMapping;
3389 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3390 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3391 return Mapping;
3393 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3394 // handle that during instruction selection?
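/// Return the ID of the bank already assigned to \p Reg, or \p Default if no
/// bank has been assigned yet.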
3397 unsigned
3398 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3399 const MachineRegisterInfo &MRI,
3400 unsigned Default) const {
3401 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3402 return Bank ? Bank->getID() : Default;
3405 const RegisterBankInfo::ValueMapping *
3406 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3407 const MachineRegisterInfo &MRI,
3408 const TargetRegisterInfo &TRI) const {
3409 // Lie and claim anything is legal, even though this needs to be an SGPR;
3410 // applyMapping will have to deal with it as a waterfall loop.
3411 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3412 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3413 return AMDGPU::getValueMapping(Bank, Size);
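/// Unconditionally map \p Reg to the VGPR bank.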
3416 const RegisterBankInfo::ValueMapping *
3417 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3418 const MachineRegisterInfo &MRI,
3419 const TargetRegisterInfo &TRI) const {
3420 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3421 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
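/// Unconditionally map \p Reg to the AGPR bank.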
3424 const RegisterBankInfo::ValueMapping *
3425 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3426 const MachineRegisterInfo &MRI,
3427 const TargetRegisterInfo &TRI) const {
3428 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3429 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3433 /// This function must return a legal mapping, because
3434 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3435 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3436 /// VGPR to SGPR copy to be generated is illegal.
3438 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3439 // legal. These will be dealt with in applyMappingImpl.
3441 const RegisterBankInfo::InstructionMapping &
3442 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3443 const MachineFunction &MF = *MI.getParent()->getParent();
3444 const MachineRegisterInfo &MRI = MF.getRegInfo();
3446 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3447 // The default logic wastes time analyzing impossible alternative mappings. We
3448 // want the most straightforward mapping, so just handle this directly.
3449 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3450 *TRI);
3451 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3452 *TRI);
3453 assert(SrcBank && "src bank should have been assigned already");
3454 if (!DstBank)
3455 DstBank = SrcBank;
3457 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3458 if (cannotCopy(*DstBank, *SrcBank, Size))
3459 return getInvalidInstructionMapping();
3461 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3462 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3463 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3464 OpdsMapping[0] = &ValMap;
3465 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3466 OpdsMapping[1] = &ValMap;
3468 return getInstructionMapping(
3469 1, /*Cost*/ 1,
3470 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3473 if (MI.isRegSequence()) {
3474 // If any input is a VGPR, the result must be a VGPR. The default handling
3475 // assumes any copy between banks is legal.
3476 unsigned BankID = AMDGPU::SGPRRegBankID;
3478 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3479 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3480 // It doesn't make sense to use vcc or scc banks here, so just ignore
3481 // them.
3482 if (OpBank != AMDGPU::SGPRRegBankID) {
3483 BankID = AMDGPU::VGPRRegBankID;
3484 break;
3487 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3489 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3490 return getInstructionMapping(
3491 1, /*Cost*/ 1,
3492 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3495 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3496 // properly.
3498 // TODO: There are additional exec masking dependencies to analyze.
3499 if (MI.getOpcode() == TargetOpcode::G_PHI) {
3500 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3501 Register DstReg = MI.getOperand(0).getReg();
3503 // Sometimes the result may have already been assigned a bank.
3504 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3505 ResultBank = DstBank->getID();
3507 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3508 Register Reg = MI.getOperand(I).getReg();
3509 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3511 // FIXME: Assuming VGPR for any undetermined inputs.
3512 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3513 ResultBank = AMDGPU::VGPRRegBankID;
3514 break;
3517 // FIXME: Need to promote SGPR case to s32
3518 unsigned OpBank = Bank->getID();
3519 ResultBank = regBankBoolUnion(ResultBank, OpBank);
3522 assert(ResultBank != AMDGPU::InvalidRegBankID);
3524 unsigned Size = MRI.getType(DstReg).getSizeInBits();
3526 const ValueMapping &ValMap =
3527 getValueMapping(0, Size, getRegBank(ResultBank));
3528 return getInstructionMapping(
3529 1, /*Cost*/ 1,
3530 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3533 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3534 if (Mapping.isValid())
3535 return Mapping;
3537 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3539 switch (MI.getOpcode()) {
3540 default:
3541 return getInvalidInstructionMapping();
3543 case AMDGPU::G_AND:
3544 case AMDGPU::G_OR:
3545 case AMDGPU::G_XOR: {
3546 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3547 if (Size == 1) {
3548 const RegisterBank *DstBank
3549 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
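// Decide whether this boolean op stays a divergent VCC op, remains a scalar
// SGPR op, or degrades to a plain VGPR op, based on the banks already assigned
// to the result and the inputs.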
3551 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3552 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3553 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3554 if (DstBank) {
3555 TargetBankID = DstBank->getID();
3556 if (DstBank == &AMDGPU::VCCRegBank) {
3557 TargetBankID = AMDGPU::VCCRegBankID;
3558 BankLHS = AMDGPU::VCCRegBankID;
3559 BankRHS = AMDGPU::VCCRegBankID;
3560 } else {
3561 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3562 AMDGPU::SGPRRegBankID);
3563 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3564 AMDGPU::SGPRRegBankID);
3566 } else {
3567 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3568 AMDGPU::VCCRegBankID);
3569 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3570 AMDGPU::VCCRegBankID);
3572 // Both inputs should be true booleans to produce a boolean result.
3573 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3574 TargetBankID = AMDGPU::VGPRRegBankID;
3575 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3576 TargetBankID = AMDGPU::VCCRegBankID;
3577 BankLHS = AMDGPU::VCCRegBankID;
3578 BankRHS = AMDGPU::VCCRegBankID;
3579 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3580 TargetBankID = AMDGPU::SGPRRegBankID;
3584 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3585 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3586 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3587 break;
3590 if (Size == 64) {
3592 if (isSALUMapping(MI)) {
3593 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3594 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3595 } else {
3596 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3597 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3598 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3600 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3601 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3604 break;
3607 LLVM_FALLTHROUGH;
3609 case AMDGPU::G_PTR_ADD:
3610 case AMDGPU::G_PTRMASK:
3611 case AMDGPU::G_ADD:
3612 case AMDGPU::G_SUB:
3613 case AMDGPU::G_MUL:
3614 case AMDGPU::G_SHL:
3615 case AMDGPU::G_LSHR:
3616 case AMDGPU::G_ASHR:
3617 case AMDGPU::G_UADDO:
3618 case AMDGPU::G_USUBO:
3619 case AMDGPU::G_UADDE:
3620 case AMDGPU::G_SADDE:
3621 case AMDGPU::G_USUBE:
3622 case AMDGPU::G_SSUBE:
3623 case AMDGPU::G_SMIN:
3624 case AMDGPU::G_SMAX:
3625 case AMDGPU::G_UMIN:
3626 case AMDGPU::G_UMAX:
3627 case AMDGPU::G_ABS:
3628 case AMDGPU::G_SHUFFLE_VECTOR:
3629 case AMDGPU::G_SBFX:
3630 case AMDGPU::G_UBFX:
3631 if (isSALUMapping(MI))
3632 return getDefaultMappingSOP(MI);
3633 LLVM_FALLTHROUGH;
3635 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3636 case AMDGPU::G_SSUBSAT:
3637 case AMDGPU::G_UADDSAT:
3638 case AMDGPU::G_USUBSAT:
3639 case AMDGPU::G_FADD:
3640 case AMDGPU::G_FSUB:
3641 case AMDGPU::G_FPTOSI:
3642 case AMDGPU::G_FPTOUI:
3643 case AMDGPU::G_FMUL:
3644 case AMDGPU::G_FMA:
3645 case AMDGPU::G_FMAD:
3646 case AMDGPU::G_FSQRT:
3647 case AMDGPU::G_FFLOOR:
3648 case AMDGPU::G_FCEIL:
3649 case AMDGPU::G_FRINT:
3650 case AMDGPU::G_SITOFP:
3651 case AMDGPU::G_UITOFP:
3652 case AMDGPU::G_FPTRUNC:
3653 case AMDGPU::G_FPEXT:
3654 case AMDGPU::G_FEXP2:
3655 case AMDGPU::G_FLOG2:
3656 case AMDGPU::G_FMINNUM:
3657 case AMDGPU::G_FMAXNUM:
3658 case AMDGPU::G_FMINNUM_IEEE:
3659 case AMDGPU::G_FMAXNUM_IEEE:
3660 case AMDGPU::G_FCANONICALIZE:
3661 case AMDGPU::G_INTRINSIC_TRUNC:
3662 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3663 case AMDGPU::G_FSHR: // TODO: Expand for scalar
3664 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3665 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3666 case AMDGPU::G_AMDGPU_RCP_IFLAG:
3667 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3668 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3669 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3670 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3671 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3672 case AMDGPU::G_AMDGPU_SMED3:
3673 return getDefaultMappingVOP(MI);
3674 case AMDGPU::G_UMULH:
3675 case AMDGPU::G_SMULH: {
3676 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3677 return getDefaultMappingSOP(MI);
3678 return getDefaultMappingVOP(MI);
3680 case AMDGPU::G_IMPLICIT_DEF: {
3681 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3682 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3683 break;
3685 case AMDGPU::G_FCONSTANT:
3686 case AMDGPU::G_CONSTANT:
3687 case AMDGPU::G_GLOBAL_VALUE:
3688 case AMDGPU::G_BLOCK_ADDR:
3689 case AMDGPU::G_READCYCLECOUNTER: {
3690 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3691 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3692 break;
3694 case AMDGPU::G_FRAME_INDEX: {
3695 // TODO: This should be the same as other constants, but eliminateFrameIndex
3696 // currently assumes VALU uses.
3697 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3698 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3699 break;
3701 case AMDGPU::G_DYN_STACKALLOC: {
3702 // Result is always uniform, and a wave reduction is needed for the source.
3703 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3704 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3705 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3706 break;
3708 case AMDGPU::G_INSERT: {
3709 unsigned BankID = getMappingType(MRI, MI);
3710 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3711 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3712 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3713 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3714 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3715 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3716 OpdsMapping[3] = nullptr;
3717 break;
3719 case AMDGPU::G_EXTRACT: {
3720 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3721 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3722 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3723 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3724 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3725 OpdsMapping[2] = nullptr;
3726 break;
3728 case AMDGPU::G_BUILD_VECTOR:
3729 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3730 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3731 if (DstTy == LLT::fixed_vector(2, 16)) {
3732 unsigned DstSize = DstTy.getSizeInBits();
3733 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3734 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3735 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3736 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3738 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3739 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3740 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3741 break;
3744 LLVM_FALLTHROUGH;
3746 case AMDGPU::G_MERGE_VALUES:
3747 case AMDGPU::G_CONCAT_VECTORS: {
3748 unsigned Bank = getMappingType(MRI, MI);
3749 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3750 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3752 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3753 // Op1 and Dst should use the same register bank.
3754 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3755 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3756 break;
3758 case AMDGPU::G_BITREVERSE:
3759 case AMDGPU::G_BITCAST:
3760 case AMDGPU::G_INTTOPTR:
3761 case AMDGPU::G_PTRTOINT:
3762 case AMDGPU::G_FABS:
3763 case AMDGPU::G_FNEG: {
3764 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3765 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3766 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3767 break;
3769 case AMDGPU::G_AMDGPU_FFBH_U32:
3770 case AMDGPU::G_AMDGPU_FFBL_B32:
3771 case AMDGPU::G_CTLZ_ZERO_UNDEF:
3772 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
3773 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3774 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3775 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3776 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
3777 break;
3779 case AMDGPU::G_CTPOP: {
3780 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3781 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3782 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3784 // This should really be getValueMappingSGPR64Only, but allowing the generic
3785 // code to handle the register split just makes using LegalizerHelper more
3786 // difficult.
3787 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3788 break;
3790 case AMDGPU::G_TRUNC: {
3791 Register Dst = MI.getOperand(0).getReg();
3792 Register Src = MI.getOperand(1).getReg();
3793 unsigned Bank = getRegBankID(Src, MRI);
3794 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3795 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3796 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3797 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3798 break;
3800 case AMDGPU::G_ZEXT:
3801 case AMDGPU::G_SEXT:
3802 case AMDGPU::G_ANYEXT:
3803 case AMDGPU::G_SEXT_INREG: {
3804 Register Dst = MI.getOperand(0).getReg();
3805 Register Src = MI.getOperand(1).getReg();
3806 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3807 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3809 unsigned DstBank;
3810 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3811 assert(SrcBank);
3812 switch (SrcBank->getID()) {
3813 case AMDGPU::SGPRRegBankID:
3814 DstBank = AMDGPU::SGPRRegBankID;
3815 break;
3816 default:
3817 DstBank = AMDGPU::VGPRRegBankID;
3818 break;
3821 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3822 // 32-bits, and then to 64.
3823 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3824 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3825 SrcSize);
3826 break;
3828 case AMDGPU::G_FCMP: {
3829 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3830 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3831 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3832 OpdsMapping[1] = nullptr; // Predicate Operand.
3833 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3834 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3835 break;
3837 case AMDGPU::G_STORE: {
3838 assert(MI.getOperand(0).isReg());
3839 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3841 // FIXME: We need to specify a different reg bank once scalar stores are
3842 // supported.
3843 const ValueMapping *ValMapping =
3844 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3845 OpdsMapping[0] = ValMapping;
3846 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3847 break;
3849 case AMDGPU::G_ICMP: {
3850 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3851 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3853 // See if the result register has already been constrained to vcc, which may
3854 // happen due to control flow intrinsic lowering.
3855 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3856 AMDGPU::SGPRRegBankID);
3857 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3858 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
3860 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3861 Op2Bank == AMDGPU::SGPRRegBankID &&
3862 Op3Bank == AMDGPU::SGPRRegBankID &&
3863 (Size == 32 || (Size == 64 &&
3864 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3865 Subtarget.hasScalarCompareEq64()));
3867 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3868 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3870 // TODO: Use 32-bit for scalar output size.
3871 // SCC results will need to be copied to a 32-bit SGPR virtual register.
3872 const unsigned ResultSize = 1;
3874 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3875 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3876 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3877 break;
3879 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3880 // A VGPR index can be handled with a waterfall loop when indexing an SGPR vector.
3881 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3882 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3883 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3884 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3885 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3886 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3888 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3889 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3891 // The index can be in either bank if the source vector is VGPR.
3892 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3893 break;
3895 case AMDGPU::G_INSERT_VECTOR_ELT: {
3896 unsigned OutputBankID = isSALUMapping(MI) ?
3897 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3899 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3900 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3901 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3902 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3903 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
3905 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3906 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3908 // This is a weird case, because we need to break down the mapping based on
3909 // the register bank of a different operand.
3910 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3911 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3912 InsertSize);
3913 } else {
3914 assert(InsertSize == 32 || InsertSize == 64);
3915 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3918 // The index can be in either bank if the source vector is VGPR.
3919 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3920 break;
3922 case AMDGPU::G_UNMERGE_VALUES: {
3923 unsigned Bank = getMappingType(MRI, MI);
3925 // Op1 and Dst should use the same register bank.
3926 // FIXME: Shouldn't this be the default? Why do we need to handle this?
3927 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3928 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3929 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3931 break;
3933 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3934 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3935 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3936 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3937 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3938 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3939 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3940 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3941 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3942 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3943 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3944 case AMDGPU::G_AMDGPU_BUFFER_STORE:
3945 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3946 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3947 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3948 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3949 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3951 // rsrc
3952 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3954 // vindex
3955 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3957 // voffset
3958 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3960 // soffset
3961 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3963 // Any remaining operands are immediates and were correctly null
3964 // initialized.
3965 break;
3967 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3968 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3969 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3970 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3971 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3972 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3973 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3974 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3975 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3976 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3977 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3978 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3979 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3980 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3981 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3982 // vdata_out
3983 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3985 // vdata_in
3986 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3988 // rsrc
3989 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3991 // vindex
3992 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3994 // voffset
3995 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3997 // soffset
3998 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4000 // Any remaining operands are immediates and were correctly null
4001 // initialized.
4002 break;
4004 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4005 // vdata_out
4006 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4008 // vdata_in
4009 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4011 // cmp
4012 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4014 // rsrc
4015 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4017 // vindex
4018 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4020 // voffset
4021 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4023 // soffset
4024 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4026 // Any remaining operands are immediates and were correctly null
4027 // initialized.
4028 break;
4030 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4031 // Lie and claim everything is legal, even though some need to be
4032 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4033 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4034 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4036 // We need to convert this to a MUBUF if either the resource or the offset is
4037 // VGPR.
4038 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4039 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4040 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4042 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4043 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4044 break;
4046 case AMDGPU::G_INTRINSIC: {
4047 switch (MI.getIntrinsicID()) {
4048 default:
4049 return getInvalidInstructionMapping();
4050 case Intrinsic::amdgcn_div_fmas:
4051 case Intrinsic::amdgcn_div_fixup:
4052 case Intrinsic::amdgcn_trig_preop:
4053 case Intrinsic::amdgcn_sin:
4054 case Intrinsic::amdgcn_cos:
4055 case Intrinsic::amdgcn_log_clamp:
4056 case Intrinsic::amdgcn_rcp:
4057 case Intrinsic::amdgcn_rcp_legacy:
4058 case Intrinsic::amdgcn_sqrt:
4059 case Intrinsic::amdgcn_rsq:
4060 case Intrinsic::amdgcn_rsq_legacy:
4061 case Intrinsic::amdgcn_rsq_clamp:
4062 case Intrinsic::amdgcn_fmul_legacy:
4063 case Intrinsic::amdgcn_fma_legacy:
4064 case Intrinsic::amdgcn_ldexp:
4065 case Intrinsic::amdgcn_frexp_mant:
4066 case Intrinsic::amdgcn_frexp_exp:
4067 case Intrinsic::amdgcn_fract:
4068 case Intrinsic::amdgcn_cvt_pkrtz:
4069 case Intrinsic::amdgcn_cvt_pknorm_i16:
4070 case Intrinsic::amdgcn_cvt_pknorm_u16:
4071 case Intrinsic::amdgcn_cvt_pk_i16:
4072 case Intrinsic::amdgcn_cvt_pk_u16:
4073 case Intrinsic::amdgcn_fmed3:
4074 case Intrinsic::amdgcn_cubeid:
4075 case Intrinsic::amdgcn_cubema:
4076 case Intrinsic::amdgcn_cubesc:
4077 case Intrinsic::amdgcn_cubetc:
4078 case Intrinsic::amdgcn_sffbh:
4079 case Intrinsic::amdgcn_fmad_ftz:
4080 case Intrinsic::amdgcn_mbcnt_lo:
4081 case Intrinsic::amdgcn_mbcnt_hi:
4082 case Intrinsic::amdgcn_mul_u24:
4083 case Intrinsic::amdgcn_mul_i24:
4084 case Intrinsic::amdgcn_lerp:
4085 case Intrinsic::amdgcn_sad_u8:
4086 case Intrinsic::amdgcn_msad_u8:
4087 case Intrinsic::amdgcn_sad_hi_u8:
4088 case Intrinsic::amdgcn_sad_u16:
4089 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4090 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4091 case Intrinsic::amdgcn_mqsad_u32_u8:
4092 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4093 case Intrinsic::amdgcn_alignbit:
4094 case Intrinsic::amdgcn_alignbyte:
4095 case Intrinsic::amdgcn_perm:
4096 case Intrinsic::amdgcn_fdot2:
4097 case Intrinsic::amdgcn_sdot2:
4098 case Intrinsic::amdgcn_udot2:
4099 case Intrinsic::amdgcn_sdot4:
4100 case Intrinsic::amdgcn_udot4:
4101 case Intrinsic::amdgcn_sdot8:
4102 case Intrinsic::amdgcn_udot8:
4103 return getDefaultMappingVOP(MI);
4104 case Intrinsic::amdgcn_sbfe:
4105 case Intrinsic::amdgcn_ubfe:
4106 if (isSALUMapping(MI))
4107 return getDefaultMappingSOP(MI);
4108 return getDefaultMappingVOP(MI);
4109 case Intrinsic::amdgcn_ds_swizzle:
4110 case Intrinsic::amdgcn_ds_permute:
4111 case Intrinsic::amdgcn_ds_bpermute:
4112 case Intrinsic::amdgcn_update_dpp:
4113 case Intrinsic::amdgcn_mov_dpp8:
4114 case Intrinsic::amdgcn_mov_dpp:
4115 case Intrinsic::amdgcn_strict_wwm:
4116 case Intrinsic::amdgcn_wwm:
4117 case Intrinsic::amdgcn_strict_wqm:
4118 case Intrinsic::amdgcn_wqm:
4119 case Intrinsic::amdgcn_softwqm:
4120 case Intrinsic::amdgcn_set_inactive:
4121 return getDefaultMappingAllVGPR(MI);
4122 case Intrinsic::amdgcn_kernarg_segment_ptr:
4123 case Intrinsic::amdgcn_s_getpc:
4124 case Intrinsic::amdgcn_groupstaticsize:
4125 case Intrinsic::amdgcn_reloc_constant:
4126 case Intrinsic::returnaddress: {
4127 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4128 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4129 break;
4131 case Intrinsic::amdgcn_wqm_vote: {
4132 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4133 OpdsMapping[0] = OpdsMapping[2]
4134 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4135 break;
4137 case Intrinsic::amdgcn_ps_live: {
4138 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4139 break;
4141 case Intrinsic::amdgcn_div_scale: {
4142 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4143 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4144 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4145 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4147 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4148 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4149 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4150 break;
4152 case Intrinsic::amdgcn_class: {
4153 Register Src0Reg = MI.getOperand(2).getReg();
4154 Register Src1Reg = MI.getOperand(3).getReg();
4155 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4156 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4157 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4158 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4159 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4160 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4161 break;
4163 case Intrinsic::amdgcn_icmp:
4164 case Intrinsic::amdgcn_fcmp: {
4165 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4166 // This is not VCCRegBank because this is not used in boolean contexts.
4167 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4168 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4169 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4170 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4171 break;
4173 case Intrinsic::amdgcn_readlane: {
4174 // This must be an SGPR, but accept a VGPR.
4175 Register IdxReg = MI.getOperand(3).getReg();
4176 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4177 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4178 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4179 LLVM_FALLTHROUGH;
4181 case Intrinsic::amdgcn_readfirstlane: {
4182 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4183 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4184 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4185 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4186 break;
4188 case Intrinsic::amdgcn_writelane: {
4189 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4190 Register SrcReg = MI.getOperand(2).getReg();
4191 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4192 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4193 Register IdxReg = MI.getOperand(3).getReg();
4194 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4195 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4196 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4198 // These two must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4199 // to legalize them.
4200 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4201 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4202 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4203 break;
4205 case Intrinsic::amdgcn_if_break: {
4206 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4207 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4208 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4209 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4210 break;
4212 case Intrinsic::amdgcn_permlane16:
4213 case Intrinsic::amdgcn_permlanex16: {
4214 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4215 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4216 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4217 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4218 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4219 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4220 break;
4222 case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4223 case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4224 case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4225 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4226 case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4227 case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4228 case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4229 case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4230 case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4231 case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4232 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4233 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4234 case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4235 case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4236 case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4237 case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4238 case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4239 case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4240 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4241 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4242 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4243 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4244 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4245 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4246 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4247 case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4248 case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
4249 // Default for MAI intrinsics.
4250 // srcC can also be an immediate which can be folded later.
4251 // FIXME: Should we eventually add an alternative mapping with AGPR src
4252 // for srcA/srcB?
4254 // vdst, srcA, srcB, srcC
4255 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4256 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4257 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4258 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4259 break;
4261 case Intrinsic::amdgcn_interp_p1:
4262 case Intrinsic::amdgcn_interp_p2:
4263 case Intrinsic::amdgcn_interp_mov:
4264 case Intrinsic::amdgcn_interp_p1_f16:
4265 case Intrinsic::amdgcn_interp_p2_f16: {
4266 const int M0Idx = MI.getNumOperands() - 1;
4267 Register M0Reg = MI.getOperand(M0Idx).getReg();
4268 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4269 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4271 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4272 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4273 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4275 // Must be SGPR, but we must take whatever the original bank is and fix it
4276 // later.
4277 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4278 break;
4280 case Intrinsic::amdgcn_ballot: {
4281 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4282 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4283 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4284 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4285 break;
4288 break;
4290 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4291 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
4292 auto IntrID = MI.getIntrinsicID();
4293 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4294 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4295 // Non-images can have complications from operands that allow both SGPR
4296 // and VGPR. For now it's too complicated to figure out the final opcode
4297 // to derive the register bank from the MCInstrDesc.
4298 assert(RSrcIntrin->IsImage);
4299 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4301 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4302 unsigned N = MI.getNumExplicitOperands() - 2;
4303 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4304 OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4305 if (N == 3) {
4306 // Sequential form: all operands combined into VGPR256/VGPR512
4307 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4308 if (Size > 256)
4309 Size = 512;
4310 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4311 } else {
4312 // NSA form
4313 for (unsigned I = 2; I < N; ++I)
4314 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4316 break;
4318 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4319 auto IntrID = MI.getIntrinsicID();
4320 switch (IntrID) {
4321 case Intrinsic::amdgcn_s_getreg:
4322 case Intrinsic::amdgcn_s_memtime:
4323 case Intrinsic::amdgcn_s_memrealtime:
4324 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
4325 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4326 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4327 break;
4329 case Intrinsic::amdgcn_global_atomic_fadd:
4330 case Intrinsic::amdgcn_global_atomic_csub:
4331 case Intrinsic::amdgcn_global_atomic_fmin:
4332 case Intrinsic::amdgcn_global_atomic_fmax:
4333 case Intrinsic::amdgcn_flat_atomic_fadd:
4334 case Intrinsic::amdgcn_flat_atomic_fmin:
4335 case Intrinsic::amdgcn_flat_atomic_fmax:
4336 return getDefaultMappingAllVGPR(MI);
4337 case Intrinsic::amdgcn_ds_ordered_add:
4338 case Intrinsic::amdgcn_ds_ordered_swap: {
4339 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4340 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4341 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4342 AMDGPU::SGPRRegBankID);
4343 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4344 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4345 break;
4347 case Intrinsic::amdgcn_ds_append:
4348 case Intrinsic::amdgcn_ds_consume: {
4349 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4350 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4351 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4352 break;
4354 case Intrinsic::amdgcn_exp_compr:
4355 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4356 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4357 break;
4358 case Intrinsic::amdgcn_exp:
4359 // FIXME: Could we support packed types here?
4360 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4361 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4362 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4363 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4364 break;
4365 case Intrinsic::amdgcn_s_sendmsg:
4366 case Intrinsic::amdgcn_s_sendmsghalt: {
4367 // This must be an SGPR, but accept a VGPR.
4368 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4369 AMDGPU::SGPRRegBankID);
4370 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4371 break;
4373 case Intrinsic::amdgcn_s_setreg: {
4374 // This must be an SGPR, but accept a VGPR.
4375 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4376 AMDGPU::SGPRRegBankID);
4377 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4378 break;
4380 case Intrinsic::amdgcn_end_cf: {
4381 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4382 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4383 break;
4385 case Intrinsic::amdgcn_else: {
4386 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4387 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4388 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4389 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4390 break;
4392 case Intrinsic::amdgcn_live_mask: {
4393 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4394 break;
4396 case Intrinsic::amdgcn_wqm_demote:
4397 case Intrinsic::amdgcn_kill: {
4398 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4399 break;
4401 case Intrinsic::amdgcn_raw_buffer_load:
4402 case Intrinsic::amdgcn_raw_tbuffer_load: {
4403 // FIXME: Should make intrinsic ID the last operand of the instruction,
4404 // then this would be the same as store
4405 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4406 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4407 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4408 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4409 break;
4411 case Intrinsic::amdgcn_raw_buffer_store:
4412 case Intrinsic::amdgcn_raw_buffer_store_format:
4413 case Intrinsic::amdgcn_raw_tbuffer_store: {
4414 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4415 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4416 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4417 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4418 break;
4420 case Intrinsic::amdgcn_struct_buffer_load:
4421 case Intrinsic::amdgcn_struct_tbuffer_load: {
4422 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4423 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4424 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4425 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4426 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4427 break;
4429 case Intrinsic::amdgcn_struct_buffer_store:
4430 case Intrinsic::amdgcn_struct_tbuffer_store: {
4431 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4432 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4433 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4434 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4435 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4436 break;
4438 case Intrinsic::amdgcn_init_exec_from_input: {
4439 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4440 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4441 break;
4443 case Intrinsic::amdgcn_ds_gws_init:
4444 case Intrinsic::amdgcn_ds_gws_barrier:
4445 case Intrinsic::amdgcn_ds_gws_sema_br: {
4446 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4448 // This must be an SGPR, but accept a VGPR.
4449 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4450 AMDGPU::SGPRRegBankID);
4451 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4452 break;
4454 case Intrinsic::amdgcn_ds_gws_sema_v:
4455 case Intrinsic::amdgcn_ds_gws_sema_p:
4456 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4457 // This must be an SGPR, but accept a VGPR.
4458 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4459 AMDGPU::SGPRRegBankID);
4460 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4461 break;
4463 default:
4464 return getInvalidInstructionMapping();
4466 break;
4468 case AMDGPU::G_SELECT: {
4469 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4470 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4471 AMDGPU::SGPRRegBankID);
4472 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
4473 AMDGPU::SGPRRegBankID);
4474 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4475 Op3Bank == AMDGPU::SGPRRegBankID;
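// Only use a scalar select when both value operands are SGPRs and the
// condition is also a scalar boolean; any VGPR value or VCC condition forces
// the VALU form.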
4477 unsigned CondBankDefault = SGPRSrcs ?
4478 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4479 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4480 CondBankDefault);
4481 if (CondBank == AMDGPU::SGPRRegBankID)
4482 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4483 else if (CondBank == AMDGPU::VGPRRegBankID)
4484 CondBank = AMDGPU::VCCRegBankID;
4486 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4487 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4489 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4491 // TODO: Should report 32-bit for scalar condition type.
4492 if (Size == 64) {
4493 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4494 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4495 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4496 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4497 } else {
4498 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4499 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4500 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4501 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4504 break;
4507 case AMDGPU::G_LOAD:
4508 case AMDGPU::G_ZEXTLOAD:
4509 case AMDGPU::G_SEXTLOAD:
4510 return getInstrMappingForLoad(MI);
4512 case AMDGPU::G_ATOMICRMW_XCHG:
4513 case AMDGPU::G_ATOMICRMW_ADD:
4514 case AMDGPU::G_ATOMICRMW_SUB:
4515 case AMDGPU::G_ATOMICRMW_AND:
4516 case AMDGPU::G_ATOMICRMW_OR:
4517 case AMDGPU::G_ATOMICRMW_XOR:
4518 case AMDGPU::G_ATOMICRMW_MAX:
4519 case AMDGPU::G_ATOMICRMW_MIN:
4520 case AMDGPU::G_ATOMICRMW_UMAX:
4521 case AMDGPU::G_ATOMICRMW_UMIN:
4522 case AMDGPU::G_ATOMICRMW_FADD:
4523 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
4524 case AMDGPU::G_AMDGPU_ATOMIC_INC:
4525 case AMDGPU::G_AMDGPU_ATOMIC_DEC:
4526 case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
4527 case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
4528 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4529 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4530 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4531 break;
4533 case AMDGPU::G_ATOMIC_CMPXCHG: {
4534 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4535 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4536 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4537 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4538 break;
4540 case AMDGPU::G_BRCOND: {
4541 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4542 AMDGPU::SGPRRegBankID);
4543 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
4544 if (Bank != AMDGPU::SGPRRegBankID)
4545 Bank = AMDGPU::VCCRegBankID;
4547 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
4548 break;
4552 return getInstructionMapping(/*ID*/1, /*Cost*/1,
4553 getOperandsMapping(OpdsMapping),
4554 MI.getNumOperands());