[InstCombine] Signed saturation patterns
[llvm-core.git] / lib / Target / AMDGPU / AMDGPURegisterBankInfo.cpp
blob4d78188b3dc36522a655b711b3e7d5fa6f571997
1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPURegisterBankInfo.h"
15 #include "AMDGPUInstrInfo.h"
16 #include "AMDGPUSubtarget.h"
17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
21 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
22 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
24 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
25 #include "llvm/CodeGen/TargetRegisterInfo.h"
26 #include "llvm/CodeGen/TargetSubtargetInfo.h"
27 #include "llvm/IR/Constants.h"
29 #define GET_TARGET_REGBANK_IMPL
30 #include "AMDGPUGenRegisterBank.inc"
32 // This file will be TableGen'ed at some point.
33 #include "AMDGPUGenRegisterBankInfo.def"
35 using namespace llvm;
36 using namespace MIPatternMatch;
38 namespace {
40 // Observer to apply a register bank to new registers created by LegalizerHelper.
41 class ApplyRegBankMapping final : public GISelChangeObserver {
42 private:
43 MachineRegisterInfo &MRI;
44 const RegisterBank *NewBank;
45 SmallVector<MachineInstr *, 4> NewInsts;
47 public:
48 ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
49 : MRI(MRI_), NewBank(RB) {}
51 ~ApplyRegBankMapping() {
52 for (MachineInstr *MI : NewInsts)
53 applyBank(*MI);
56 /// Set any registers that don't have a set register class or bank to SALU.
57 void applyBank(MachineInstr &MI) {
58 for (MachineOperand &Op : MI.operands()) {
59 if (!Op.isReg())
60 continue;
62 Register Reg = Op.getReg();
63 if (MRI.getRegClassOrRegBank(Reg))
64 continue;
66 const RegisterBank *RB = NewBank;
67 // FIXME: This might not be enough to detect when SCC should be used.
68 if (MRI.getType(Reg) == LLT::scalar(1))
69 RB = (NewBank == &AMDGPU::SGPRRegBank ?
70 &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);
72 MRI.setRegBank(Reg, *RB);
76 void erasingInstr(MachineInstr &MI) override {}
78 void createdInstr(MachineInstr &MI) override {
79 // At this point, the instruction was just inserted and has no operands.
80 NewInsts.push_back(&MI);
83 void changingInstr(MachineInstr &MI) override {}
84 void changedInstr(MachineInstr &MI) override {}
88 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
89 : AMDGPUGenRegisterBankInfo(),
90 Subtarget(ST),
91 TRI(Subtarget.getRegisterInfo()),
92 TII(Subtarget.getInstrInfo()) {
94 // HACK: Until this is fully tablegen'd.
95 static bool AlreadyInit = false;
96 if (AlreadyInit)
97 return;
99 AlreadyInit = true;
101 const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID);
102 (void)RBSGPR;
103 assert(&RBSGPR == &AMDGPU::SGPRRegBank);
105 const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID);
106 (void)RBVGPR;
107 assert(&RBVGPR == &AMDGPU::VGPRRegBank);
111 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
112 const RegisterBank &Src,
113 unsigned Size) const {
114 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
115 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
116 Src.getID() == AMDGPU::VGPRRegBankID) {
117 return std::numeric_limits<unsigned>::max();
120 // Bool values are tricky, because the meaning is based on context. The SCC
121 // and VCC banks are for the natural scalar and vector conditions produced by
122 // a compare.
124 // Legalization doesn't know about the necessary context, so an s1 use may
125 // have been a truncate from an arbitrary value, in which case a copy (lowered
126 // as a compare with 0) needs to be inserted.
127 if (Size == 1 &&
128 (Dst.getID() == AMDGPU::SCCRegBankID ||
129 Dst.getID() == AMDGPU::SGPRRegBankID) &&
130 (Src.getID() == AMDGPU::SGPRRegBankID ||
131 Src.getID() == AMDGPU::VGPRRegBankID ||
132 Src.getID() == AMDGPU::VCCRegBankID))
133 return std::numeric_limits<unsigned>::max();
135 if (Dst.getID() == AMDGPU::SCCRegBankID &&
136 Src.getID() == AMDGPU::VCCRegBankID)
137 return std::numeric_limits<unsigned>::max();
139 return RegisterBankInfo::copyCost(Dst, Src, Size);
142 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
143 const ValueMapping &ValMapping,
144 const RegisterBank *CurBank) const {
145 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
146 // VGPR.
147 // FIXME: Is there a better way to do this?
148 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
149 return 10; // This is expensive.
151 assert(ValMapping.NumBreakDowns == 2 &&
152 ValMapping.BreakDown[0].Length == 32 &&
153 ValMapping.BreakDown[0].StartIdx == 0 &&
154 ValMapping.BreakDown[1].Length == 32 &&
155 ValMapping.BreakDown[1].StartIdx == 32 &&
156 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
158 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
159 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
160 // want.
162 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
163 // alignment restrictions, but this probably isn't important.
164 return 1;
167 const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
168 const TargetRegisterClass &RC) const {
169 if (&RC == &AMDGPU::SReg_1RegClass)
170 return AMDGPU::VCCRegBank;
172 return TRI->isSGPRClass(&RC) ? AMDGPU::SGPRRegBank : AMDGPU::VGPRRegBank;
175 template <unsigned NumOps>
176 RegisterBankInfo::InstructionMappings
177 AMDGPURegisterBankInfo::addMappingFromTable(
178 const MachineInstr &MI, const MachineRegisterInfo &MRI,
179 const std::array<unsigned, NumOps> RegSrcOpIdx,
180 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
182 InstructionMappings AltMappings;
184 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
186 unsigned Sizes[NumOps];
187 for (unsigned I = 0; I < NumOps; ++I) {
188 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
189 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
192 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
193 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
194 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
197 // getInstrMapping's default mapping uses ID 1, so start at 2.
198 unsigned MappingID = 2;
199 for (const auto &Entry : Table) {
200 for (unsigned I = 0; I < NumOps; ++I) {
201 int OpIdx = RegSrcOpIdx[I];
202 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
205 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
206 getOperandsMapping(Operands),
207 Operands.size()));
210 return AltMappings;
213 RegisterBankInfo::InstructionMappings
214 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
215 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
216 switch (MI.getIntrinsicID()) {
217 case Intrinsic::amdgcn_readlane: {
218 static const OpRegBankEntry<3> Table[2] = {
219 // Perfectly legal.
220 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
222 // Need a readfirstlane for the index.
223 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
226 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
227 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
229 case Intrinsic::amdgcn_writelane: {
230 static const OpRegBankEntry<4> Table[4] = {
231 // Perfectly legal.
232 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
234 // Need readfirstlane of first op
235 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
237 // Need readfirstlane of second op
238 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
240 // Need readfirstlane of both ops
241 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
244 // rsrc, voffset, offset
245 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
246 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
248 default:
249 return RegisterBankInfo::getInstrAlternativeMappings(MI);
253 RegisterBankInfo::InstructionMappings
254 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
255 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
257 switch (MI.getIntrinsicID()) {
258 case Intrinsic::amdgcn_buffer_load: {
259 static const OpRegBankEntry<3> Table[4] = {
260 // Perfectly legal.
261 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
262 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
264 // Waterfall loop needed for rsrc. In the worst case this will execute
265 // approximately an extra 10 * wavesize + 2 instructions.
266 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
267 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
270 // rsrc, voffset, offset
271 const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
272 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
274 case Intrinsic::amdgcn_s_buffer_load: {
275 static const OpRegBankEntry<2> Table[4] = {
276 // Perfectly legal.
277 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
279 // Only need 1 register in loop
280 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
282 // Have to waterfall the resource.
283 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
285 // Have to waterfall the resource, and the offset.
286 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
289 // rsrc, offset
290 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
291 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
293 case Intrinsic::amdgcn_ds_ordered_add:
294 case Intrinsic::amdgcn_ds_ordered_swap: {
295 // VGPR = M0, VGPR
296 static const OpRegBankEntry<3> Table[2] = {
297 // Perfectly legal.
298 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
300 // Need a readfirstlane for m0
301 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
304 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
305 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
307 case Intrinsic::amdgcn_s_sendmsg:
308 case Intrinsic::amdgcn_s_sendmsghalt: {
309 // FIXME: Should have no register for immediate
310 static const OpRegBankEntry<1> Table[2] = {
311 // Perfectly legal.
312 { { AMDGPU::SGPRRegBankID }, 1 },
314 // Need readlane
315 { { AMDGPU::VGPRRegBankID }, 3 }
318 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
319 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
321 default:
322 return RegisterBankInfo::getInstrAlternativeMappings(MI);
326 // FIXME: Returns uniform if there's no source value information. This is
327 // probably wrong.
328 static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI) {
329 if (!MI.hasOneMemOperand())
330 return false;
332 const MachineMemOperand *MMO = *MI.memoperands_begin();
333 return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
334 AMDGPUInstrInfo::isUniformMMO(MMO);
337 RegisterBankInfo::InstructionMappings
338 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
339 const MachineInstr &MI) const {
341 const MachineFunction &MF = *MI.getParent()->getParent();
342 const MachineRegisterInfo &MRI = MF.getRegInfo();
345 InstructionMappings AltMappings;
346 switch (MI.getOpcode()) {
347 case TargetOpcode::G_CONSTANT: {
348 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
349 if (Size == 1) {
350 static const OpRegBankEntry<1> Table[4] = {
351 { { AMDGPU::VGPRRegBankID }, 1 },
352 { { AMDGPU::SGPRRegBankID }, 1 },
353 { { AMDGPU::VCCRegBankID }, 1 },
354 { { AMDGPU::SCCRegBankID }, 1 }
357 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
360 LLVM_FALLTHROUGH;
362 case TargetOpcode::G_FCONSTANT:
363 case TargetOpcode::G_FRAME_INDEX:
364 case TargetOpcode::G_GLOBAL_VALUE: {
365 static const OpRegBankEntry<1> Table[2] = {
366 { { AMDGPU::VGPRRegBankID }, 1 },
367 { { AMDGPU::SGPRRegBankID }, 1 }
370 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
372 case TargetOpcode::G_AND:
373 case TargetOpcode::G_OR:
374 case TargetOpcode::G_XOR: {
375 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
377 if (Size == 1) {
378 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
379 const InstructionMapping &SCCMapping = getInstructionMapping(
380 1, 1, getOperandsMapping(
381 {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
382 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
383 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
384 3); // Num Operands
385 AltMappings.push_back(&SCCMapping);
387 const InstructionMapping &SGPRMapping = getInstructionMapping(
388 1, 1, getOperandsMapping(
389 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
390 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
391 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
392 3); // Num Operands
393 AltMappings.push_back(&SGPRMapping);
395 const InstructionMapping &VCCMapping0 = getInstructionMapping(
396 2, 10, getOperandsMapping(
397 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
398 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
399 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
400 3); // Num Operands
401 AltMappings.push_back(&VCCMapping0);
402 return AltMappings;
405 if (Size != 64)
406 break;
408 const InstructionMapping &SSMapping = getInstructionMapping(
409 1, 1, getOperandsMapping(
410 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
411 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
412 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
413 3); // Num Operands
414 AltMappings.push_back(&SSMapping);
416 const InstructionMapping &VVMapping = getInstructionMapping(
417 2, 2, getOperandsMapping(
418 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
419 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
420 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
421 3); // Num Operands
422 AltMappings.push_back(&VVMapping);
424 const InstructionMapping &SVMapping = getInstructionMapping(
425 3, 3, getOperandsMapping(
426 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
427 AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
428 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
429 3); // Num Operands
430 AltMappings.push_back(&SVMapping);
432 // SGPR in LHS is slightly preferrable, so make it VS more expensive than
433 // SV.
434 const InstructionMapping &VSMapping = getInstructionMapping(
435 3, 4, getOperandsMapping(
436 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
437 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
438 AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
439 3); // Num Operands
440 AltMappings.push_back(&VSMapping);
441 break;
443 case TargetOpcode::G_LOAD:
444 case TargetOpcode::G_ZEXTLOAD:
445 case TargetOpcode::G_SEXTLOAD: {
446 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
447 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
448 unsigned PtrSize = PtrTy.getSizeInBits();
449 unsigned AS = PtrTy.getAddressSpace();
450 LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
451 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
452 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
453 isInstrUniformNonExtLoadAlign4(MI)) {
454 const InstructionMapping &SSMapping = getInstructionMapping(
455 1, 1, getOperandsMapping(
456 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
457 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
458 2); // Num Operands
459 AltMappings.push_back(&SSMapping);
462 const InstructionMapping &VVMapping = getInstructionMapping(
463 2, 1, getOperandsMapping(
464 {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
465 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
466 2); // Num Operands
467 AltMappings.push_back(&VVMapping);
469 // It may be possible to have a vgpr = load sgpr mapping here, because
470 // the mubuf instructions support this kind of load, but probably for only
471 // gfx7 and older. However, the addressing mode matching in the instruction
472 // selector should be able to do a better job of detecting and selecting
473 // these kinds of loads from the vgpr = load vgpr mapping.
475 return AltMappings;
478 case TargetOpcode::G_ICMP: {
479 unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
480 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
481 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
482 nullptr, // Predicate operand.
483 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
484 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
485 4); // Num Operands
486 AltMappings.push_back(&SSMapping);
488 const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
489 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
490 nullptr, // Predicate operand.
491 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
492 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
493 4); // Num Operands
494 AltMappings.push_back(&SVMapping);
496 const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
497 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
498 nullptr, // Predicate operand.
499 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
500 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
501 4); // Num Operands
502 AltMappings.push_back(&VSMapping);
504 const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
505 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
506 nullptr, // Predicate operand.
507 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
508 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
509 4); // Num Operands
510 AltMappings.push_back(&VVMapping);
512 return AltMappings;
514 case TargetOpcode::G_SELECT: {
515 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
516 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
517 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
518 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
519 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
520 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
521 4); // Num Operands
522 AltMappings.push_back(&SSMapping);
524 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
525 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
526 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
527 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
528 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
529 4); // Num Operands
530 AltMappings.push_back(&VVMapping);
532 return AltMappings;
534 case TargetOpcode::G_SMIN:
535 case TargetOpcode::G_SMAX:
536 case TargetOpcode::G_UMIN:
537 case TargetOpcode::G_UMAX: {
538 static const OpRegBankEntry<3> Table[4] = {
539 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
540 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
541 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
543 // Scalar requires cmp+select, and extends if 16-bit.
544 // FIXME: Should there be separate costs for 32 and 16-bit
545 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
548 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
549 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
551 case TargetOpcode::G_UADDE:
552 case TargetOpcode::G_USUBE:
553 case TargetOpcode::G_SADDE:
554 case TargetOpcode::G_SSUBE: {
555 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
556 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
557 getOperandsMapping(
558 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
559 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
560 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
561 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
562 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
563 5); // Num Operands
564 AltMappings.push_back(&SSMapping);
566 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
567 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
568 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
569 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
570 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
571 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
572 5); // Num Operands
573 AltMappings.push_back(&VVMapping);
574 return AltMappings;
576 case AMDGPU::G_BRCOND: {
577 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
579 const InstructionMapping &SMapping = getInstructionMapping(
580 1, 1, getOperandsMapping(
581 {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
582 2); // Num Operands
583 AltMappings.push_back(&SMapping);
585 const InstructionMapping &VMapping = getInstructionMapping(
586 1, 1, getOperandsMapping(
587 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
588 2); // Num Operands
589 AltMappings.push_back(&VMapping);
590 return AltMappings;
592 case AMDGPU::G_INTRINSIC:
593 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
594 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
595 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
596 default:
597 break;
599 return RegisterBankInfo::getInstrAlternativeMappings(MI);
602 void AMDGPURegisterBankInfo::split64BitValueForMapping(
603 MachineIRBuilder &B,
604 SmallVector<Register, 2> &Regs,
605 LLT HalfTy,
606 Register Reg) const {
607 assert(HalfTy.getSizeInBits() == 32);
608 MachineRegisterInfo *MRI = B.getMRI();
609 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
610 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
611 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
612 MRI->setRegBank(LoLHS, *Bank);
613 MRI->setRegBank(HiLHS, *Bank);
615 Regs.push_back(LoLHS);
616 Regs.push_back(HiLHS);
618 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
619 .addDef(LoLHS)
620 .addDef(HiLHS)
621 .addUse(Reg);
624 /// Replace the current type each register in \p Regs has with \p NewTy
625 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
626 LLT NewTy) {
627 for (Register Reg : Regs) {
628 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
629 MRI.setType(Reg, NewTy);
633 static LLT getHalfSizedType(LLT Ty) {
634 if (Ty.isVector()) {
635 assert(Ty.getNumElements() % 2 == 0);
636 return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
639 assert(Ty.getSizeInBits() % 2 == 0);
640 return LLT::scalar(Ty.getSizeInBits() / 2);
643 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
644 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
645 /// execute the instruction for each unique combination of values in all lanes
646 /// in the wave. The block will be split such that rest of the instructions are
647 /// moved to a new block.
649 /// Essentially performs this loop:
651 /// Save Execution Mask
652 /// For (Lane : Wavefront) {
653 /// Enable Lane, Disable all other lanes
654 /// SGPR = read SGPR value for current lane from VGPR
655 /// VGPRResult[Lane] = use_op SGPR
656 /// }
657 /// Restore Execution Mask
659 /// There is additional complexity to try for compare values to identify the
660 /// unique values used.
661 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
662 MachineIRBuilder &B,
663 iterator_range<MachineBasicBlock::iterator> Range,
664 SmallSet<Register, 4> &SGPROperandRegs,
665 MachineRegisterInfo &MRI) const {
666 SmallVector<Register, 4> ResultRegs;
667 SmallVector<Register, 4> InitResultRegs;
668 SmallVector<Register, 4> PhiRegs;
670 MachineBasicBlock &MBB = B.getMBB();
671 MachineFunction *MF = &B.getMF();
673 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
674 const unsigned WaveAndOpc = Subtarget.isWave32() ?
675 AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
676 const unsigned MovTermOpc = Subtarget.isWave32() ?
677 AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
678 const unsigned XorTermOpc = Subtarget.isWave32() ?
679 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
680 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
681 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
682 const unsigned ExecReg = Subtarget.isWave32() ?
683 AMDGPU::EXEC_LO : AMDGPU::EXEC;
685 for (MachineInstr &MI : Range) {
686 for (MachineOperand &Def : MI.defs()) {
687 LLT ResTy = MRI.getType(Def.getReg());
688 const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
689 ResultRegs.push_back(Def.getReg());
690 Register InitReg = B.buildUndef(ResTy).getReg(0);
691 Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
692 InitResultRegs.push_back(InitReg);
693 PhiRegs.push_back(PhiReg);
694 MRI.setRegBank(PhiReg, *DefBank);
695 MRI.setRegBank(InitReg, *DefBank);
699 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
700 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
702 // Don't bother using generic instructions/registers for the exec mask.
703 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
704 .addDef(InitSaveExecReg);
706 Register PhiExec = MRI.createVirtualRegister(WaveRC);
707 Register NewExec = MRI.createVirtualRegister(WaveRC);
709 // To insert the loop we need to split the block. Move everything before this
710 // point to a new block, and insert a new empty block before this instruction.
711 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
712 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
713 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
714 MachineFunction::iterator MBBI(MBB);
715 ++MBBI;
716 MF->insert(MBBI, LoopBB);
717 MF->insert(MBBI, RestoreExecBB);
718 MF->insert(MBBI, RemainderBB);
720 LoopBB->addSuccessor(RestoreExecBB);
721 LoopBB->addSuccessor(LoopBB);
723 // Move the rest of the block into a new block.
724 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
725 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
727 MBB.addSuccessor(LoopBB);
728 RestoreExecBB->addSuccessor(RemainderBB);
730 B.setInsertPt(*LoopBB, LoopBB->end());
732 B.buildInstr(TargetOpcode::PHI)
733 .addDef(PhiExec)
734 .addReg(InitSaveExecReg)
735 .addMBB(&MBB)
736 .addReg(NewExec)
737 .addMBB(LoopBB);
739 for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
740 B.buildInstr(TargetOpcode::G_PHI)
741 .addDef(std::get<2>(Result))
742 .addReg(std::get<0>(Result)) // Initial value / implicit_def
743 .addMBB(&MBB)
744 .addReg(std::get<1>(Result)) // Mid-loop value.
745 .addMBB(LoopBB);
748 const DebugLoc &DL = B.getDL();
750 // Figure out the iterator range after splicing the instructions.
751 auto NewBegin = std::prev(LoopBB->end());
753 // Move the instruction into the loop. Note we moved everything after
754 // Range.end() already into a new block, so Range.end() is no longer valid.
755 LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
757 auto NewEnd = LoopBB->end();
759 MachineBasicBlock::iterator I = Range.begin();
760 B.setInsertPt(*LoopBB, I);
762 Register CondReg;
764 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
765 for (MachineOperand &Op : MI.uses()) {
766 if (!Op.isReg() || Op.isDef())
767 continue;
769 if (SGPROperandRegs.count(Op.getReg())) {
770 LLT OpTy = MRI.getType(Op.getReg());
771 unsigned OpSize = OpTy.getSizeInBits();
773 // Can only do a readlane of 32-bit pieces.
774 if (OpSize == 32) {
775 // Avoid extra copies in the simple case of one 32-bit register.
776 Register CurrentLaneOpReg
777 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
778 MRI.setType(CurrentLaneOpReg, OpTy);
780 constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
781 // Read the next variant <- also loop target.
782 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
783 CurrentLaneOpReg)
784 .addReg(Op.getReg());
786 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
787 bool First = CondReg == AMDGPU::NoRegister;
788 if (First)
789 CondReg = NewCondReg;
791 // Compare the just read M0 value to all possible Idx values.
792 B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
793 .addDef(NewCondReg)
794 .addReg(CurrentLaneOpReg)
795 .addReg(Op.getReg());
796 Op.setReg(CurrentLaneOpReg);
798 if (!First) {
799 Register AndReg = MRI.createVirtualRegister(WaveRC);
801 // If there are multiple operands to consider, and the conditions.
802 B.buildInstr(WaveAndOpc)
803 .addDef(AndReg)
804 .addReg(NewCondReg)
805 .addReg(CondReg);
806 CondReg = AndReg;
808 } else {
809 LLT S32 = LLT::scalar(32);
810 SmallVector<Register, 8> ReadlanePieces;
812 // The compares can be done as 64-bit, but the extract needs to be done
813 // in 32-bit pieces.
815 bool Is64 = OpSize % 64 == 0;
817 LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
818 unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
819 : AMDGPU::V_CMP_EQ_U32_e64;
821 // The compares can be done as 64-bit, but the extract needs to be done
822 // in 32-bit pieces.
824 // Insert the unmerge before the loop.
826 B.setMBB(MBB);
827 auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
828 B.setInstr(*I);
830 unsigned NumPieces = Unmerge->getNumOperands() - 1;
831 for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
832 Register UnmergePiece = Unmerge.getReg(PieceIdx);
834 Register CurrentLaneOpReg;
835 if (Is64) {
836 Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
837 Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
839 MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
840 MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
841 MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
843 // Read the next variant <- also loop target.
844 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
845 CurrentLaneOpRegLo)
846 .addReg(UnmergePiece, 0, AMDGPU::sub0);
848 // Read the next variant <- also loop target.
849 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
850 CurrentLaneOpRegHi)
851 .addReg(UnmergePiece, 0, AMDGPU::sub1);
853 CurrentLaneOpReg =
854 B.buildMerge(LLT::scalar(64),
855 {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
856 .getReg(0);
858 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
860 if (OpTy.getScalarSizeInBits() == 64) {
861 // If we need to produce a 64-bit element vector, so use the
862 // merged pieces
863 ReadlanePieces.push_back(CurrentLaneOpReg);
864 } else {
865 // 32-bit element type.
866 ReadlanePieces.push_back(CurrentLaneOpRegLo);
867 ReadlanePieces.push_back(CurrentLaneOpRegHi);
869 } else {
870 CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
871 MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
872 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
874 // Read the next variant <- also loop target.
875 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
876 CurrentLaneOpReg)
877 .addReg(UnmergePiece);
878 ReadlanePieces.push_back(CurrentLaneOpReg);
881 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
882 bool First = CondReg == AMDGPU::NoRegister;
883 if (First)
884 CondReg = NewCondReg;
886 B.buildInstr(CmpOp)
887 .addDef(NewCondReg)
888 .addReg(CurrentLaneOpReg)
889 .addReg(UnmergePiece);
891 if (!First) {
892 Register AndReg = MRI.createVirtualRegister(WaveRC);
894 // If there are multiple operands to consider, and the conditions.
895 B.buildInstr(WaveAndOpc)
896 .addDef(AndReg)
897 .addReg(NewCondReg)
898 .addReg(CondReg);
899 CondReg = AndReg;
903 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
904 // BUILD_VECTOR
905 if (OpTy.isVector()) {
906 auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
907 Op.setReg(Merge.getReg(0));
908 } else {
909 auto Merge = B.buildMerge(OpTy, ReadlanePieces);
910 Op.setReg(Merge.getReg(0));
913 MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
919 B.setInsertPt(*LoopBB, LoopBB->end());
921 // Update EXEC, save the original EXEC value to VCC.
922 B.buildInstr(AndSaveExecOpc)
923 .addDef(NewExec)
924 .addReg(CondReg, RegState::Kill);
926 MRI.setSimpleHint(NewExec, CondReg);
928 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
929 B.buildInstr(XorTermOpc)
930 .addDef(ExecReg)
931 .addReg(ExecReg)
932 .addReg(NewExec);
934 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
935 // s_cbranch_scc0?
937 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
938 B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
939 .addMBB(LoopBB);
941 // Save the EXEC mask before the loop.
942 BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
943 .addReg(ExecReg);
945 // Restore the EXEC mask after the loop.
946 B.setMBB(*RestoreExecBB);
947 B.buildInstr(MovTermOpc)
948 .addDef(ExecReg)
949 .addReg(SaveExecReg);
951 // Restore the insert point before the original instruction.
952 B.setInsertPt(MBB, MBB.end());
954 return true;
957 // Return any unique registers used by \p MI at \p OpIndices that need to be
958 // handled in a waterfall loop. Returns these registers in \p
959 // SGPROperandRegs. Returns true if there are any operansd to handle and a
960 // waterfall loop is necessary.
961 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
962 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
963 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
964 for (unsigned Op : OpIndices) {
965 assert(MI.getOperand(Op).isUse());
966 Register Reg = MI.getOperand(Op).getReg();
967 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
968 if (OpBank->getID() == AMDGPU::VGPRRegBankID)
969 SGPROperandRegs.insert(Reg);
972 // No operands need to be replaced, so no need to loop.
973 return !SGPROperandRegs.empty();
976 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
977 MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
978 ArrayRef<unsigned> OpIndices) const {
979 // Use a set to avoid extra readfirstlanes in the case where multiple operands
980 // are the same register.
981 SmallSet<Register, 4> SGPROperandRegs;
983 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
984 return false;
986 MachineBasicBlock::iterator I = MI.getIterator();
987 return executeInWaterfallLoop(B, make_range(I, std::next(I)),
988 SGPROperandRegs, MRI);
991 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
992 MachineInstr &MI, MachineRegisterInfo &MRI,
993 ArrayRef<unsigned> OpIndices) const {
994 MachineIRBuilder B(MI);
995 return executeInWaterfallLoop(B, MI, MRI, OpIndices);
998 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
999 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1000 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1001 Register Reg = MI.getOperand(OpIdx).getReg();
1002 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1003 if (Bank != &AMDGPU::VGPRRegBank)
1004 return;
1006 MachineIRBuilder B(MI);
1007 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
1008 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1009 .addDef(SGPR)
1010 .addReg(Reg);
1012 const TargetRegisterClass *Constrained =
1013 constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1014 (void)Constrained;
1015 assert(Constrained && "Failed to constrain readfirstlane src reg");
1017 MI.getOperand(OpIdx).setReg(SGPR);
1020 // When regbankselect repairs registers, it will insert a repair instruction
1021 // which defines the repaired register. Then it calls applyMapping and expects
1022 // that the targets will either delete or rewrite the originally wrote to the
1023 // repaired registers. Beccause of this, we end up in a situation where
1024 // we have 2 instructions defining the same registers.
1025 static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
1026 Register Reg,
1027 const MachineInstr &MI) {
1028 // Is there some way we can assert that there are exactly 2 def instructions?
1029 for (MachineInstr &Other : MRI.def_instructions(Reg)) {
1030 if (&Other != &MI)
1031 return &Other;
1034 return nullptr;
1037 bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
1038 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1039 MachineRegisterInfo &MRI) const {
1040 Register DstReg = MI.getOperand(0).getReg();
1041 const LLT LoadTy = MRI.getType(DstReg);
1042 unsigned LoadSize = LoadTy.getSizeInBits();
1043 const unsigned MaxNonSmrdLoadSize = 128;
1044 // 128-bit loads are supported for all instruction types.
1045 if (LoadSize <= MaxNonSmrdLoadSize)
1046 return false;
1048 SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
1049 SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
1051 // If the pointer is an SGPR, we have nothing to do.
1052 if (SrcRegs.empty()) {
1053 Register PtrReg = MI.getOperand(1).getReg();
1054 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
1055 if (PtrBank == &AMDGPU::SGPRRegBank)
1056 return false;
1057 SrcRegs.push_back(PtrReg);
1060 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1062 // We want to get the repair instruction now, because it will help us
1063 // determine which instruction the legalizer inserts that will also
1064 // write to DstReg.
1065 MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);
1067 // RegBankSelect only emits scalar types, so we need to reset the pointer
1068 // operand to a pointer type.
1069 Register BasePtrReg = SrcRegs[0];
1070 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1071 MRI.setType(BasePtrReg, PtrTy);
1073 MachineIRBuilder B(MI);
1075 unsigned SplitElts =
1076 MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
1077 const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
1078 ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
1079 GISelObserverWrapper Observer(&O);
1080 B.setChangeObserver(Observer);
1081 LegalizerHelper Helper(B.getMF(), Observer, B);
1082 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1083 return false;
1085 // At this point, the legalizer has split the original load into smaller
1086 // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
1087 // that combines the outputs of the lower loads and writes it to DstReg.
1088 // The register bank selector has also added the RepairInst which writes to
1089 // DstReg as well.
1091 MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);
1093 // Replace the output of the LegalizedInst with a temporary register, since
1094 // RepairInst already defines DstReg.
1095 Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
1096 LegalizedInst->getOperand(0).setReg(TmpReg);
1097 B.setInsertPt(*RepairInst->getParent(), RepairInst);
1099 for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
1100 Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
1101 B.buildConstant(IdxReg, DefIdx);
1102 MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
1103 B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
1106 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
1107 return true;
1110 bool AMDGPURegisterBankInfo::applyMappingImage(
1111 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1112 MachineRegisterInfo &MRI, int RsrcIdx) const {
1113 const int NumDefs = MI.getNumExplicitDefs();
1115 // The reported argument index is relative to the IR intrinsic call arguments,
1116 // so we need to shift by the number of defs and the intrinsic ID.
1117 RsrcIdx += NumDefs + 1;
1119 // Insert copies to VGPR arguments.
1120 applyDefaultMapping(OpdMapper);
1122 // Fixup any SGPR arguments.
1123 SmallVector<unsigned, 4> SGPRIndexes;
1124 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1125 if (!MI.getOperand(I).isReg())
1126 continue;
1128 // If this intrinsic has a sampler, it immediately follows rsrc.
1129 if (I == RsrcIdx || I == RsrcIdx + 1)
1130 SGPRIndexes.push_back(I);
1133 executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1134 return true;
1137 // For cases where only a single copy is inserted for matching register banks.
1138 // Replace the register in the instruction operand
1139 static void substituteSimpleCopyRegs(
1140 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1141 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1142 if (!SrcReg.empty()) {
1143 assert(SrcReg.size() == 1);
1144 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1148 /// Handle register layout difference for f16 images for some subtargets.
1149 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1150 MachineRegisterInfo &MRI,
1151 Register Reg) const {
1152 if (!Subtarget.hasUnpackedD16VMem())
1153 return Reg;
1155 const LLT S16 = LLT::scalar(16);
1156 LLT StoreVT = MRI.getType(Reg);
1157 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1158 return Reg;
1160 auto Unmerge = B.buildUnmerge(S16, Reg);
1163 SmallVector<Register, 4> WideRegs;
1164 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1165 WideRegs.push_back(Unmerge.getReg(I));
1167 const LLT S32 = LLT::scalar(32);
1168 int NumElts = StoreVT.getNumElements();
1170 return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1173 static std::pair<Register, unsigned>
1174 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1175 int64_t Const;
1176 if (mi_match(Reg, MRI, m_ICst(Const)))
1177 return std::make_pair(Register(), Const);
1179 Register Base;
1180 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1181 return std::make_pair(Base, Const);
1183 // TODO: Handle G_OR used for add case
1184 return std::make_pair(Reg, 0);
1187 std::pair<Register, unsigned>
1188 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1189 Register OrigOffset) const {
1190 const unsigned MaxImm = 4095;
1191 Register BaseReg;
1192 unsigned ImmOffset;
1193 const LLT S32 = LLT::scalar(32);
1195 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1196 OrigOffset);
1198 unsigned C1 = 0;
1199 if (ImmOffset != 0) {
1200 // If the immediate value is too big for the immoffset field, put the value
1201 // and -4096 into the immoffset field so that the value that is copied/added
1202 // for the voffset field is a multiple of 4096, and it stands more chance
1203 // of being CSEd with the copy/add for another similar load/store.
1204 // However, do not do that rounding down to a multiple of 4096 if that is a
1205 // negative number, as it appears to be illegal to have a negative offset
1206 // in the vgpr, even if adding the immediate offset makes it positive.
1207 unsigned Overflow = ImmOffset & ~MaxImm;
1208 ImmOffset -= Overflow;
1209 if ((int32_t)Overflow < 0) {
1210 Overflow += ImmOffset;
1211 ImmOffset = 0;
1214 C1 = ImmOffset;
1215 if (Overflow != 0) {
1216 if (!BaseReg)
1217 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1218 else {
1219 auto OverflowVal = B.buildConstant(S32, Overflow);
1220 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1225 if (!BaseReg)
1226 BaseReg = B.buildConstant(S32, 0).getReg(0);
1228 return {BaseReg, C1};
1231 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1232 int64_t C;
1233 return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
// Returns bit 0 of \p CachePolicy (0 or 1); selectStoreIntrinsic passes it as
// the GLC immediate of the selected buffer store.
static unsigned extractGLC(unsigned CachePolicy) {
  return CachePolicy & 1;
}
// Returns bit 1 of \p CachePolicy (0 or 1); selectStoreIntrinsic passes it as
// the SLC immediate of the selected buffer store.
static unsigned extractSLC(unsigned CachePolicy) {
  return (CachePolicy >> 1) & 1;
}
// Returns bit 2 of \p CachePolicy (0 or 1); selectStoreIntrinsic passes it as
// the DLC immediate of the selected buffer store.
static unsigned extractDLC(unsigned CachePolicy) {
  return (CachePolicy >> 2) & 1;
}
1248 MachineInstr *
1249 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1250 MachineInstr &MI) const {
1251 MachineRegisterInfo &MRI = *B.getMRI();
1252 executeInWaterfallLoop(B, MI, MRI, {2, 4});
1254 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1256 Register VData = MI.getOperand(1).getReg();
1257 LLT Ty = MRI.getType(VData);
1259 int EltSize = Ty.getScalarSizeInBits();
1260 int Size = Ty.getSizeInBits();
1262 // FIXME: Broken integer truncstore.
1263 if (EltSize != 32)
1264 report_fatal_error("unhandled intrinsic store");
1266 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1267 const int MemSize = (*MI.memoperands_begin())->getSize();
1270 Register RSrc = MI.getOperand(2).getReg();
1271 Register VOffset = MI.getOperand(3).getReg();
1272 Register SOffset = MI.getOperand(4).getReg();
1273 unsigned CachePolicy = MI.getOperand(5).getImm();
1275 unsigned ImmOffset;
1276 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1278 const bool Offen = !isZero(VOffset, MRI);
1280 unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1281 switch (8 * MemSize) {
1282 case 8:
1283 Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1284 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1285 break;
1286 case 16:
1287 Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1288 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1289 break;
1290 default:
1291 Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1292 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1293 if (Size > 32)
1294 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1295 break;
1299 // Set the insertion point back to the instruction in case it was moved into a
1300 // loop.
1301 B.setInstr(MI);
1303 MachineInstrBuilder MIB = B.buildInstr(Opc)
1304 .addUse(VData);
1306 if (Offen)
1307 MIB.addUse(VOffset);
1309 MIB.addUse(RSrc)
1310 .addUse(SOffset)
1311 .addImm(ImmOffset)
1312 .addImm(extractGLC(CachePolicy))
1313 .addImm(extractSLC(CachePolicy))
1314 .addImm(0) // tfe: FIXME: Remove from inst
1315 .addImm(extractDLC(CachePolicy))
1316 .cloneMemRefs(MI);
1318 // FIXME: We need a way to report failure from applyMappingImpl.
1319 // Insert constrain copies before inserting the loop.
1320 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1321 report_fatal_error("failed to constrain selected store intrinsic");
1323 return MIB;
1326 void AMDGPURegisterBankInfo::applyMappingImpl(
1327 const OperandsMapper &OpdMapper) const {
1328 MachineInstr &MI = OpdMapper.getMI();
1329 unsigned Opc = MI.getOpcode();
1330 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1331 switch (Opc) {
1332 case AMDGPU::G_SELECT: {
1333 Register DstReg = MI.getOperand(0).getReg();
1334 LLT DstTy = MRI.getType(DstReg);
1335 if (DstTy.getSizeInBits() != 64)
1336 break;
1338 LLT HalfTy = getHalfSizedType(DstTy);
1340 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1341 SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1));
1342 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1343 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
1345 // All inputs are SGPRs, nothing special to do.
1346 if (DefRegs.empty()) {
1347 assert(Src1Regs.empty() && Src2Regs.empty());
1348 break;
1351 MachineIRBuilder B(MI);
1352 if (Src0Regs.empty())
1353 Src0Regs.push_back(MI.getOperand(1).getReg());
1354 else {
1355 assert(Src0Regs.size() == 1);
1358 if (Src1Regs.empty())
1359 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1360 else {
1361 setRegsToType(MRI, Src1Regs, HalfTy);
1364 if (Src2Regs.empty())
1365 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
1366 else
1367 setRegsToType(MRI, Src2Regs, HalfTy);
1369 setRegsToType(MRI, DefRegs, HalfTy);
1371 B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
1372 B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);
1374 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
1375 MI.eraseFromParent();
1376 return;
1378 case AMDGPU::G_AND:
1379 case AMDGPU::G_OR:
1380 case AMDGPU::G_XOR: {
1381 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
1382 // there is a VGPR input.
1383 Register DstReg = MI.getOperand(0).getReg();
1384 LLT DstTy = MRI.getType(DstReg);
1385 if (DstTy.getSizeInBits() != 64)
1386 break;
1388 LLT HalfTy = getHalfSizedType(DstTy);
1389 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1390 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
1391 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1393 // All inputs are SGPRs, nothing special to do.
1394 if (DefRegs.empty()) {
1395 assert(Src0Regs.empty() && Src1Regs.empty());
1396 break;
1399 assert(DefRegs.size() == 2);
1400 assert(Src0Regs.size() == Src1Regs.size() &&
1401 (Src0Regs.empty() || Src0Regs.size() == 2));
1403 // Depending on where the source registers came from, the generic code may
1404 // have decided to split the inputs already or not. If not, we still need to
1405 // extract the values.
1406 MachineIRBuilder B(MI);
1408 if (Src0Regs.empty())
1409 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
1410 else
1411 setRegsToType(MRI, Src0Regs, HalfTy);
1413 if (Src1Regs.empty())
1414 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1415 else
1416 setRegsToType(MRI, Src1Regs, HalfTy);
1418 setRegsToType(MRI, DefRegs, HalfTy);
1420 B.buildInstr(Opc)
1421 .addDef(DefRegs[0])
1422 .addUse(Src0Regs[0])
1423 .addUse(Src1Regs[0]);
1425 B.buildInstr(Opc)
1426 .addDef(DefRegs[1])
1427 .addUse(Src0Regs[1])
1428 .addUse(Src1Regs[1]);
1430 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
1431 MI.eraseFromParent();
1432 return;
1434 case AMDGPU::G_ADD:
1435 case AMDGPU::G_SUB:
1436 case AMDGPU::G_MUL: {
1437 Register DstReg = MI.getOperand(0).getReg();
1438 LLT DstTy = MRI.getType(DstReg);
1439 if (DstTy != LLT::scalar(16))
1440 break;
1442 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
1443 if (DstBank == &AMDGPU::VGPRRegBank)
1444 break;
1446 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
1447 MachineFunction *MF = MI.getParent()->getParent();
1448 MachineIRBuilder B(MI);
1449 ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
1450 GISelObserverWrapper Observer(&ApplySALU);
1451 LegalizerHelper Helper(*MF, Observer, B);
1453 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
1454 LegalizerHelper::Legalized)
1455 llvm_unreachable("widen scalar should have succeeded");
1456 return;
1458 case AMDGPU::G_SMIN:
1459 case AMDGPU::G_SMAX:
1460 case AMDGPU::G_UMIN:
1461 case AMDGPU::G_UMAX: {
1462 Register DstReg = MI.getOperand(0).getReg();
1463 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
1464 if (DstBank == &AMDGPU::VGPRRegBank)
1465 break;
1467 MachineFunction *MF = MI.getParent()->getParent();
1468 MachineIRBuilder B(MI);
1469 ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
1470 GISelObserverWrapper Observer(&ApplySALU);
1471 LegalizerHelper Helper(*MF, Observer, B);
1473 // Turn scalar min/max into a compare and select.
1474 LLT Ty = MRI.getType(DstReg);
1475 LLT S32 = LLT::scalar(32);
1476 LLT S16 = LLT::scalar(16);
1478 if (Ty == S16) {
1479 // Need to widen to s32, and expand as cmp + select.
1480 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
1481 llvm_unreachable("widenScalar should have succeeded");
1483 // FIXME: This is relying on widenScalar leaving MI in place.
1484 if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized)
1485 llvm_unreachable("lower should have succeeded");
1486 } else {
1487 if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized)
1488 llvm_unreachable("lower should have succeeded");
1491 return;
1493 case AMDGPU::G_SEXT:
1494 case AMDGPU::G_ZEXT: {
1495 Register SrcReg = MI.getOperand(1).getReg();
1496 LLT SrcTy = MRI.getType(SrcReg);
1497 bool Signed = Opc == AMDGPU::G_SEXT;
1499 MachineIRBuilder B(MI);
1500 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
1502 Register DstReg = MI.getOperand(0).getReg();
1503 LLT DstTy = MRI.getType(DstReg);
1504 if (DstTy.isScalar() &&
1505 SrcBank != &AMDGPU::SGPRRegBank &&
1506 SrcBank != &AMDGPU::SCCRegBank &&
1507 SrcBank != &AMDGPU::VCCRegBank &&
1508 // FIXME: Should handle any type that round to s64 when irregular
1509 // breakdowns supported.
1510 DstTy.getSizeInBits() == 64 &&
1511 SrcTy.getSizeInBits() <= 32) {
1512 const LLT S32 = LLT::scalar(32);
1513 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1515 // Extend to 32-bit, and then extend the low half.
1516 if (Signed) {
1517 // TODO: Should really be buildSExtOrCopy
1518 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
1520 // Replicate sign bit from 32-bit extended part.
1521 auto ShiftAmt = B.buildConstant(S32, 31);
1522 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
1523 B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
1524 } else {
1525 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
1526 B.buildConstant(DefRegs[1], 0);
1529 MRI.setRegBank(DstReg, *SrcBank);
1530 MI.eraseFromParent();
1531 return;
1534 if (SrcTy != LLT::scalar(1))
1535 return;
1537 if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) {
1538 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1540 const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ?
1541 &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank;
1543 unsigned DstSize = DstTy.getSizeInBits();
1544 // 64-bit select is SGPR only
1545 const bool UseSel64 = DstSize > 32 &&
1546 SrcBank->getID() == AMDGPU::SCCRegBankID;
1548 // TODO: Should s16 select be legal?
1549 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
1550 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
1551 auto False = B.buildConstant(SelType, 0);
1553 MRI.setRegBank(True.getReg(0), *DstBank);
1554 MRI.setRegBank(False.getReg(0), *DstBank);
1555 MRI.setRegBank(DstReg, *DstBank);
1557 if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) {
1558 B.buildSelect(DefRegs[0], SrcReg, True, False);
1559 B.buildCopy(DefRegs[1], DefRegs[0]);
1560 } else if (DstSize < 32) {
1561 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
1562 MRI.setRegBank(Sel.getReg(0), *DstBank);
1563 B.buildTrunc(DstReg, Sel);
1564 } else {
1565 B.buildSelect(DstReg, SrcReg, True, False);
1568 MI.eraseFromParent();
1569 return;
1572 // Fixup the case with an s1 src that isn't a condition register. Use shifts
1573 // instead of introducing a compare to avoid an unnecessary condition
1574 // register (and since there's no scalar 16-bit compares).
1575 auto Ext = B.buildAnyExt(DstTy, SrcReg);
1576 auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
1577 auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);
1579 if (MI.getOpcode() == AMDGPU::G_SEXT)
1580 B.buildAShr(DstReg, Shl, ShiftAmt);
1581 else
1582 B.buildLShr(DstReg, Shl, ShiftAmt);
1584 MRI.setRegBank(DstReg, *SrcBank);
1585 MRI.setRegBank(Ext.getReg(0), *SrcBank);
1586 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
1587 MRI.setRegBank(Shl.getReg(0), *SrcBank);
1588 MI.eraseFromParent();
1589 return;
1591 case AMDGPU::G_BUILD_VECTOR:
1592 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
1593 Register DstReg = MI.getOperand(0).getReg();
1594 LLT DstTy = MRI.getType(DstReg);
1595 if (DstTy != LLT::vector(2, 16))
1596 break;
1598 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
1599 substituteSimpleCopyRegs(OpdMapper, 1);
1600 substituteSimpleCopyRegs(OpdMapper, 2);
1602 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
1603 if (DstBank == &AMDGPU::SGPRRegBank)
1604 break; // Can use S_PACK_* instructions.
1606 MachineIRBuilder B(MI);
1608 Register Lo = MI.getOperand(1).getReg();
1609 Register Hi = MI.getOperand(2).getReg();
1610 const LLT S32 = LLT::scalar(32);
1612 const RegisterBank *BankLo = getRegBank(Lo, MRI, *TRI);
1613 const RegisterBank *BankHi = getRegBank(Hi, MRI, *TRI);
1615 Register ZextLo;
1616 Register ShiftHi;
1618 if (Opc == AMDGPU::G_BUILD_VECTOR) {
1619 ZextLo = B.buildZExt(S32, Lo).getReg(0);
1620 MRI.setRegBank(ZextLo, *BankLo);
1622 Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
1623 MRI.setRegBank(ZextHi, *BankHi);
1625 auto ShiftAmt = B.buildConstant(S32, 16);
1626 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
1628 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
1629 MRI.setRegBank(ShiftHi, *BankHi);
1630 } else {
1631 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
1632 MRI.setRegBank(MaskLo, *BankLo);
1634 auto ShiftAmt = B.buildConstant(S32, 16);
1635 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
1637 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
1638 MRI.setRegBank(ShiftHi, *BankHi);
1640 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
1641 MRI.setRegBank(ZextLo, *BankLo);
1644 auto Or = B.buildOr(S32, ZextLo, ShiftHi);
1645 MRI.setRegBank(Or.getReg(0), *DstBank);
1647 B.buildBitcast(DstReg, Or);
1648 MI.eraseFromParent();
1649 return;
1651 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
1652 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1654 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
1656 if (DstRegs.empty()) {
1657 applyDefaultMapping(OpdMapper);
1658 executeInWaterfallLoop(MI, MRI, { 2 });
1659 return;
1662 Register DstReg = MI.getOperand(0).getReg();
1663 Register SrcReg = MI.getOperand(1).getReg();
1664 Register IdxReg = MI.getOperand(2).getReg();
1665 LLT DstTy = MRI.getType(DstReg);
1666 (void)DstTy;
1668 assert(DstTy.getSizeInBits() == 64);
1670 LLT SrcTy = MRI.getType(SrcReg);
1671 const LLT S32 = LLT::scalar(32);
1672 LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
1674 MachineIRBuilder B(MI);
1675 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
1676 auto One = B.buildConstant(S32, 1);
1678 // Split the vector index into 32-bit pieces. Prepare to move all of the
1679 // new instructions into a waterfall loop if necessary.
1681 // Don't put the bitcast or constant in the loop.
1682 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
1684 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
1685 auto IdxLo = B.buildShl(S32, IdxReg, One);
1686 auto IdxHi = B.buildAdd(S32, IdxLo, One);
1687 B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
1688 B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
1690 const ValueMapping &DstMapping
1691 = OpdMapper.getInstrMapping().getOperandMapping(0);
1693 // FIXME: Should be getting from mapping or not?
1694 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
1695 MRI.setRegBank(DstReg, *DstMapping.BreakDown[0].RegBank);
1696 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
1697 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
1698 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
1699 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
1701 SmallSet<Register, 4> OpsToWaterfall;
1702 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
1703 MI.eraseFromParent();
1704 return;
1707 // Remove the original instruction to avoid potentially confusing the
1708 // waterfall loop logic.
1709 B.setInstr(*Span.begin());
1710 MI.eraseFromParent();
1711 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1712 OpsToWaterfall, MRI);
1713 return;
1715 case AMDGPU::G_INSERT_VECTOR_ELT: {
1716 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
1718 assert(OpdMapper.getVRegs(0).empty());
1719 assert(OpdMapper.getVRegs(1).empty());
1720 assert(OpdMapper.getVRegs(3).empty());
1722 if (InsRegs.empty()) {
1723 applyDefaultMapping(OpdMapper);
1724 executeInWaterfallLoop(MI, MRI, { 3 });
1725 return;
1728 Register DstReg = MI.getOperand(0).getReg();
1729 Register SrcReg = MI.getOperand(1).getReg();
1730 Register InsReg = MI.getOperand(2).getReg();
1731 Register IdxReg = MI.getOperand(3).getReg();
1732 LLT SrcTy = MRI.getType(SrcReg);
1733 LLT InsTy = MRI.getType(InsReg);
1734 (void)InsTy;
1736 assert(InsTy.getSizeInBits() == 64);
1738 const LLT S32 = LLT::scalar(32);
1739 LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
1741 MachineIRBuilder B(MI);
1742 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
1743 auto One = B.buildConstant(S32, 1);
1745 // Split the vector index into 32-bit pieces. Prepare to move all of the
1746 // new instructions into a waterfall loop if necessary.
1748 // Don't put the bitcast or constant in the loop.
1749 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
1751 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
1752 auto IdxLo = B.buildShl(S32, IdxReg, One);
1753 auto IdxHi = B.buildAdd(S32, IdxLo, One);
1755 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
1756 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
1757 B.buildBitcast(DstReg, InsHi);
1759 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
1760 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
1761 const RegisterBank *InsSrcBank = getRegBank(InsReg, MRI, *TRI);
1763 MRI.setRegBank(InsReg, *InsSrcBank);
1764 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
1765 MRI.setRegBank(InsLo.getReg(0), *DstBank);
1766 MRI.setRegBank(InsHi.getReg(0), *DstBank);
1767 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
1768 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
1769 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
1772 SmallSet<Register, 4> OpsToWaterfall;
1773 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
1774 MI.eraseFromParent();
1775 return;
1778 B.setInstr(*Span.begin());
1779 MI.eraseFromParent();
1781 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1782 OpsToWaterfall, MRI);
1783 return;
1785 case AMDGPU::G_INTRINSIC: {
1786 switch (MI.getIntrinsicID()) {
1787 case Intrinsic::amdgcn_s_buffer_load: {
1788 // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
1789 executeInWaterfallLoop(MI, MRI, { 2, 3 });
1790 return;
1792 case Intrinsic::amdgcn_readlane: {
1793 substituteSimpleCopyRegs(OpdMapper, 2);
1795 assert(OpdMapper.getVRegs(0).empty());
1796 assert(OpdMapper.getVRegs(3).empty());
1798 // Make sure the index is an SGPR. It doesn't make sense to run this in a
1799 // waterfall loop, so assume it's a uniform value.
1800 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
1801 return;
1803 case Intrinsic::amdgcn_writelane: {
1804 assert(OpdMapper.getVRegs(0).empty());
1805 assert(OpdMapper.getVRegs(2).empty());
1806 assert(OpdMapper.getVRegs(3).empty());
1808 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
1809 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
1810 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
1811 return;
1813 default:
1814 break;
1816 break;
1818 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
1819 auto IntrID = MI.getIntrinsicID();
1820 switch (IntrID) {
1821 case Intrinsic::amdgcn_buffer_load: {
1822 executeInWaterfallLoop(MI, MRI, { 2 });
1823 return;
1825 case Intrinsic::amdgcn_ds_ordered_add:
1826 case Intrinsic::amdgcn_ds_ordered_swap: {
1827 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
1828 assert(OpdMapper.getVRegs(0).empty());
1829 substituteSimpleCopyRegs(OpdMapper, 3);
1830 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
1831 return;
1833 case Intrinsic::amdgcn_ds_gws_init:
1834 case Intrinsic::amdgcn_ds_gws_barrier:
1835 case Intrinsic::amdgcn_ds_gws_sema_br: {
1836 // Only the first lane executes, so readfirstlane is safe.
1837 substituteSimpleCopyRegs(OpdMapper, 1);
1838 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
1839 return;
1841 case Intrinsic::amdgcn_ds_gws_sema_v:
1842 case Intrinsic::amdgcn_ds_gws_sema_p:
1843 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1844 // Only the first lane executes, so readfirstlane is safe.
1845 constrainOpWithReadfirstlane(MI, MRI, 1); // M0
1846 return;
1848 case Intrinsic::amdgcn_s_sendmsg:
1849 case Intrinsic::amdgcn_s_sendmsghalt: {
1850 // FIXME: Should this use a waterfall loop?
1851 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
1852 return;
1854 case Intrinsic::amdgcn_raw_buffer_load:
1855 case Intrinsic::amdgcn_raw_buffer_load_format:
1856 case Intrinsic::amdgcn_raw_tbuffer_load:
1857 case Intrinsic::amdgcn_raw_buffer_store:
1858 case Intrinsic::amdgcn_raw_buffer_store_format:
1859 case Intrinsic::amdgcn_raw_tbuffer_store: {
1860 applyDefaultMapping(OpdMapper);
1861 executeInWaterfallLoop(MI, MRI, {2, 4});
1862 return;
1864 case Intrinsic::amdgcn_struct_buffer_load:
1865 case Intrinsic::amdgcn_struct_buffer_store:
1866 case Intrinsic::amdgcn_struct_tbuffer_load:
1867 case Intrinsic::amdgcn_struct_tbuffer_store: {
1868 applyDefaultMapping(OpdMapper);
1869 executeInWaterfallLoop(MI, MRI, {2, 5});
1870 return;
1872 default: {
1873 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1874 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
1875 // Non-images can have complications from operands that allow both SGPR
1876 // and VGPR. For now it's too complicated to figure out the final opcode
1877 // to derive the register bank from the MCInstrDesc.
1878 if (RSrcIntrin->IsImage) {
1879 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
1880 return;
1884 break;
1887 break;
1889 case AMDGPU::G_LOAD:
1890 case AMDGPU::G_ZEXTLOAD:
1891 case AMDGPU::G_SEXTLOAD: {
1892 if (applyMappingWideLoad(MI, OpdMapper, MRI))
1893 return;
1894 break;
1896 default:
1897 break;
1900 return applyDefaultMapping(OpdMapper);
1903 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
1904 const MachineFunction &MF = *MI.getParent()->getParent();
1905 const MachineRegisterInfo &MRI = MF.getRegInfo();
1906 for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
1907 if (!MI.getOperand(i).isReg())
1908 continue;
1909 Register Reg = MI.getOperand(i).getReg();
1910 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
1911 if (Bank->getID() == AMDGPU::VGPRRegBankID)
1912 return false;
1914 assert(Bank->getID() == AMDGPU::SGPRRegBankID ||
1915 Bank->getID() == AMDGPU::SCCRegBankID);
1918 return true;
1921 const RegisterBankInfo::InstructionMapping &
1922 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
1923 const MachineFunction &MF = *MI.getParent()->getParent();
1924 const MachineRegisterInfo &MRI = MF.getRegInfo();
1925 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1927 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
1928 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
1929 unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
1930 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
1932 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1933 MI.getNumOperands());
1936 const RegisterBankInfo::InstructionMapping &
1937 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
1938 const MachineFunction &MF = *MI.getParent()->getParent();
1939 const MachineRegisterInfo &MRI = MF.getRegInfo();
1940 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1941 unsigned OpdIdx = 0;
1943 unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1944 OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
1946 if (MI.getOperand(OpdIdx).isIntrinsicID())
1947 OpdsMapping[OpdIdx++] = nullptr;
1949 Register Reg1 = MI.getOperand(OpdIdx).getReg();
1950 unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
1952 unsigned DefaultBankID = Size1 == 1 ?
1953 AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
1954 unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);
1956 OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
1958 for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
1959 const MachineOperand &MO = MI.getOperand(OpdIdx);
1960 if (!MO.isReg())
1961 continue;
1963 unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
1964 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
1965 OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
1968 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1969 MI.getNumOperands());
1972 const RegisterBankInfo::InstructionMapping &
1973 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
1974 const MachineFunction &MF = *MI.getParent()->getParent();
1975 const MachineRegisterInfo &MRI = MF.getRegInfo();
1976 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1978 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1979 const MachineOperand &Op = MI.getOperand(I);
1980 if (!Op.isReg())
1981 continue;
1983 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
1984 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
1987 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1988 MI.getNumOperands());
1991 const RegisterBankInfo::InstructionMapping &
1992 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
1993 const MachineInstr &MI,
1994 int RsrcIdx) const {
1995 // The reported argument index is relative to the IR intrinsic call arguments,
1996 // so we need to shift by the number of defs and the intrinsic ID.
1997 RsrcIdx += MI.getNumExplicitDefs() + 1;
1999 const int NumOps = MI.getNumOperands();
2000 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
2002 // TODO: Should packed/unpacked D16 difference be reported here as part of
2003 // the value mapping?
2004 for (int I = 0; I != NumOps; ++I) {
2005 if (!MI.getOperand(I).isReg())
2006 continue;
2008 Register OpReg = MI.getOperand(I).getReg();
2009 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
2011 // FIXME: Probably need a new intrinsic register bank searchable table to
2012 // handle arbitrary intrinsics easily.
2014 // If this has a sampler, it immediately follows rsrc.
2015 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
2017 if (MustBeSGPR) {
2018 // If this must be an SGPR, so we must report whatever it is as legal.
2019 unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2020 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
2021 } else {
2022 // Some operands must be VGPR, and these are easy to copy to.
2023 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2027 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
2030 const RegisterBankInfo::InstructionMapping &
2031 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
2033 const MachineFunction &MF = *MI.getParent()->getParent();
2034 const MachineRegisterInfo &MRI = MF.getRegInfo();
2035 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
2036 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2037 LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
2038 Register PtrReg = MI.getOperand(1).getReg();
2039 LLT PtrTy = MRI.getType(PtrReg);
2040 unsigned AS = PtrTy.getAddressSpace();
2041 unsigned PtrSize = PtrTy.getSizeInBits();
2043 const ValueMapping *ValMapping;
2044 const ValueMapping *PtrMapping;
2046 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
2048 if (PtrBank == &AMDGPU::SGPRRegBank &&
2049 (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
2050 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
2051 isInstrUniformNonExtLoadAlign4(MI)) {
2052 // We have a uniform instruction so we want to use an SMRD load
2053 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2054 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
2055 } else {
2056 ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
2057 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
2060 OpdsMapping[0] = ValMapping;
2061 OpdsMapping[1] = PtrMapping;
2062 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
2063 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
2064 return Mapping;
2066 // FIXME: Do we want to add a mapping for FLAT load, or should we just
2067 // handle that during instruction selection?
2070 unsigned
2071 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
2072 const MachineRegisterInfo &MRI,
2073 const TargetRegisterInfo &TRI,
2074 unsigned Default) const {
2076 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
2077 return Bank ? Bank->getID() : Default;
2081 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
2082 return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
2083 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2086 const RegisterBankInfo::ValueMapping *
2087 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
2088 const MachineRegisterInfo &MRI,
2089 const TargetRegisterInfo &TRI) const {
2090 // Lie and claim anything is legal, even though this needs to be an SGPR
2091 // applyMapping will have to deal with it as a waterfall loop.
2092 unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
2093 unsigned Size = getSizeInBits(Reg, MRI, TRI);
2094 return AMDGPU::getValueMapping(Bank, Size);
2097 const RegisterBankInfo::ValueMapping *
2098 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
2099 const MachineRegisterInfo &MRI,
2100 const TargetRegisterInfo &TRI) const {
2101 unsigned Size = getSizeInBits(Reg, MRI, TRI);
2102 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2106 /// This function must return a legal mapping, because
2107 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
2108 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
2109 /// VGPR to SGPR copy to be generated is illegal.
2111 const RegisterBankInfo::InstructionMapping &
2112 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
2113 const MachineFunction &MF = *MI.getParent()->getParent();
2114 const MachineRegisterInfo &MRI = MF.getRegInfo();
2116 if (MI.isRegSequence()) {
2117 // If any input is a VGPR, the result must be a VGPR. The default handling
2118 // assumes any copy between banks is legal.
2119 unsigned BankID = AMDGPU::SGPRRegBankID;
2121 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2122 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
2123 // It doesn't make sense to use vcc or scc banks here, so just ignore
2124 // them.
2125 if (OpBank != AMDGPU::SGPRRegBankID) {
2126 BankID = AMDGPU::VGPRRegBankID;
2127 break;
2130 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2132 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
2133 return getInstructionMapping(
2134 1, /*Cost*/ 1,
2135 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
2138 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
2139 // properly.
2141 // TODO: There are additional exec masking dependencies to analyze.
2142 if (MI.getOpcode() == TargetOpcode::G_PHI) {
2143 // TODO: Generate proper invalid bank enum.
2144 int ResultBank = -1;
2146 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2147 Register Reg = MI.getOperand(I).getReg();
2148 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
2150 // FIXME: Assuming VGPR for any undetermined inputs.
2151 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
2152 ResultBank = AMDGPU::VGPRRegBankID;
2153 break;
2156 unsigned OpBank = Bank->getID();
2157 // scc, scc -> sgpr
2158 if (OpBank == AMDGPU::SCCRegBankID) {
2159 // There's only one SCC register, so a phi requires copying to SGPR.
2160 OpBank = AMDGPU::SGPRRegBankID;
2161 } else if (OpBank == AMDGPU::VCCRegBankID) {
2162 // vcc, vcc -> vcc
2163 // vcc, sgpr -> vgpr
2164 if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
2165 ResultBank = AMDGPU::VGPRRegBankID;
2166 break;
2170 ResultBank = OpBank;
2173 assert(ResultBank != -1);
2175 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2177 const ValueMapping &ValMap =
2178 getValueMapping(0, Size, getRegBank(ResultBank));
2179 return getInstructionMapping(
2180 1, /*Cost*/ 1,
2181 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
2184 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
2185 if (Mapping.isValid())
2186 return Mapping;
2188 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2190 switch (MI.getOpcode()) {
2191 default:
2192 return getInvalidInstructionMapping();
2194 case AMDGPU::G_AND:
2195 case AMDGPU::G_OR:
2196 case AMDGPU::G_XOR: {
2197 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2198 if (Size == 1) {
2199 const RegisterBank *DstBank
2200 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
2202 unsigned TargetBankID = -1;
2203 unsigned BankLHS = -1;
2204 unsigned BankRHS = -1;
2205 if (DstBank) {
2206 TargetBankID = DstBank->getID();
2207 if (DstBank == &AMDGPU::VCCRegBank) {
2208 TargetBankID = AMDGPU::VCCRegBankID;
2209 BankLHS = AMDGPU::VCCRegBankID;
2210 BankRHS = AMDGPU::VCCRegBankID;
2211 } else if (DstBank == &AMDGPU::SCCRegBank) {
2212 TargetBankID = AMDGPU::SCCRegBankID;
2213 BankLHS = AMDGPU::SGPRRegBankID;
2214 BankRHS = AMDGPU::SGPRRegBankID;
2215 } else {
2216 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2217 AMDGPU::SGPRRegBankID);
2218 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2219 AMDGPU::SGPRRegBankID);
2221 } else {
2222 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2223 AMDGPU::VCCRegBankID);
2224 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2225 AMDGPU::VCCRegBankID);
2227 // Both inputs should be true booleans to produce a boolean result.
2228 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
2229 TargetBankID = AMDGPU::VGPRRegBankID;
2230 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
2231 TargetBankID = AMDGPU::VCCRegBankID;
2232 BankLHS = AMDGPU::VCCRegBankID;
2233 BankRHS = AMDGPU::VCCRegBankID;
2234 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
2235 TargetBankID = AMDGPU::SGPRRegBankID;
2236 } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
2237 // The operation must be done on a 32-bit register, but it will set
2238 // scc. The result type could interchangably be SCC or SGPR, since
2239 // both values will be produced.
2240 TargetBankID = AMDGPU::SCCRegBankID;
2241 BankLHS = AMDGPU::SGPRRegBankID;
2242 BankRHS = AMDGPU::SGPRRegBankID;
2246 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
2247 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
2248 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
2249 break;
2252 if (Size == 64) {
2254 if (isSALUMapping(MI)) {
2255 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
2256 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
2257 } else {
2258 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
2259 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
2260 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
2262 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
2263 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
2266 break;
2269 LLVM_FALLTHROUGH;
2271 case AMDGPU::G_GEP:
2272 case AMDGPU::G_ADD:
2273 case AMDGPU::G_SUB:
2274 case AMDGPU::G_MUL:
2275 case AMDGPU::G_SHL:
2276 case AMDGPU::G_LSHR:
2277 case AMDGPU::G_ASHR:
2278 case AMDGPU::G_UADDO:
2279 case AMDGPU::G_USUBO:
2280 case AMDGPU::G_UADDE:
2281 case AMDGPU::G_SADDE:
2282 case AMDGPU::G_USUBE:
2283 case AMDGPU::G_SSUBE:
2284 case AMDGPU::G_SMIN:
2285 case AMDGPU::G_SMAX:
2286 case AMDGPU::G_UMIN:
2287 case AMDGPU::G_UMAX:
2288 if (isSALUMapping(MI))
2289 return getDefaultMappingSOP(MI);
2290 LLVM_FALLTHROUGH;
2292 case AMDGPU::G_FADD:
2293 case AMDGPU::G_FSUB:
2294 case AMDGPU::G_FPTOSI:
2295 case AMDGPU::G_FPTOUI:
2296 case AMDGPU::G_FMUL:
2297 case AMDGPU::G_FMA:
2298 case AMDGPU::G_FMAD:
2299 case AMDGPU::G_FSQRT:
2300 case AMDGPU::G_FFLOOR:
2301 case AMDGPU::G_FCEIL:
2302 case AMDGPU::G_FRINT:
2303 case AMDGPU::G_SITOFP:
2304 case AMDGPU::G_UITOFP:
2305 case AMDGPU::G_FPTRUNC:
2306 case AMDGPU::G_FPEXT:
2307 case AMDGPU::G_FEXP2:
2308 case AMDGPU::G_FLOG2:
2309 case AMDGPU::G_FMINNUM:
2310 case AMDGPU::G_FMAXNUM:
2311 case AMDGPU::G_FMINNUM_IEEE:
2312 case AMDGPU::G_FMAXNUM_IEEE:
2313 case AMDGPU::G_FCANONICALIZE:
2314 case AMDGPU::G_INTRINSIC_TRUNC:
2315 case AMDGPU::G_INTRINSIC_ROUND:
2316 case AMDGPU::G_AMDGPU_FFBH_U32:
2317 return getDefaultMappingVOP(MI);
2318 case AMDGPU::G_UMULH:
2319 case AMDGPU::G_SMULH: {
2320 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
2321 return getDefaultMappingSOP(MI);
2322 return getDefaultMappingVOP(MI);
2324 case AMDGPU::G_IMPLICIT_DEF: {
2325 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2326 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2327 break;
2329 case AMDGPU::G_FCONSTANT:
2330 case AMDGPU::G_CONSTANT:
2331 case AMDGPU::G_GLOBAL_VALUE:
2332 case AMDGPU::G_BLOCK_ADDR: {
2333 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2334 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2335 break;
2337 case AMDGPU::G_FRAME_INDEX: {
2338 // TODO: This should be the same as other constants, but eliminateFrameIndex
2339 // currently assumes VALU uses.
2340 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2341 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2342 break;
2344 case AMDGPU::G_INSERT: {
2345 unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
2346 AMDGPU::VGPRRegBankID;
2347 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2348 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2349 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
2350 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
2351 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
2352 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
2353 OpdsMapping[3] = nullptr;
2354 break;
2356 case AMDGPU::G_EXTRACT: {
2357 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2358 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2359 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2360 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
2361 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
2362 OpdsMapping[2] = nullptr;
2363 break;
2365 case AMDGPU::G_BUILD_VECTOR:
2366 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2367 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
2368 if (DstTy == LLT::vector(2, 16)) {
2369 unsigned DstSize = DstTy.getSizeInBits();
2370 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2371 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2372 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2373 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
2375 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
2376 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
2377 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
2378 break;
2381 LLVM_FALLTHROUGH;
2383 case AMDGPU::G_MERGE_VALUES:
2384 case AMDGPU::G_CONCAT_VECTORS: {
2385 unsigned Bank = isSALUMapping(MI) ?
2386 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2387 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2388 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2390 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
2391 // Op1 and Dst should use the same register bank.
2392 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
2393 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
2394 break;
2396 case AMDGPU::G_BITCAST:
2397 case AMDGPU::G_INTTOPTR:
2398 case AMDGPU::G_PTRTOINT:
2399 case AMDGPU::G_CTLZ:
2400 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2401 case AMDGPU::G_CTTZ:
2402 case AMDGPU::G_CTTZ_ZERO_UNDEF:
2403 case AMDGPU::G_CTPOP:
2404 case AMDGPU::G_BSWAP:
2405 case AMDGPU::G_BITREVERSE:
2406 case AMDGPU::G_FABS:
2407 case AMDGPU::G_FNEG: {
2408 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2409 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2410 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
2411 break;
2413 case AMDGPU::G_TRUNC: {
2414 Register Dst = MI.getOperand(0).getReg();
2415 Register Src = MI.getOperand(1).getReg();
2416 unsigned Bank = getRegBankID(Src, MRI, *TRI);
2417 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
2418 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
2419 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
2420 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
2421 break;
2423 case AMDGPU::G_ZEXT:
2424 case AMDGPU::G_SEXT:
2425 case AMDGPU::G_ANYEXT: {
2426 Register Dst = MI.getOperand(0).getReg();
2427 Register Src = MI.getOperand(1).getReg();
2428 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
2429 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
2431 unsigned DstBank;
2432 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
2433 assert(SrcBank);
2434 switch (SrcBank->getID()) {
2435 case AMDGPU::SCCRegBankID:
2436 case AMDGPU::SGPRRegBankID:
2437 DstBank = AMDGPU::SGPRRegBankID;
2438 break;
2439 default:
2440 DstBank = AMDGPU::VGPRRegBankID;
2441 break;
2444 // TODO: Should anyext be split into 32-bit part as well?
2445 if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
2446 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
2447 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
2448 } else {
2449 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
2450 // 32-bits, and then to 64.
2451 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
2452 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
2453 SrcSize);
2455 break;
2457 case AMDGPU::G_FCMP: {
2458 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2459 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2460 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2461 OpdsMapping[1] = nullptr; // Predicate Operand.
2462 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
2463 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2464 break;
2466 case AMDGPU::G_STORE: {
2467 assert(MI.getOperand(0).isReg());
2468 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2469 // FIXME: We need to specify a different reg bank once scalar stores
2470 // are supported.
2471 const ValueMapping *ValMapping =
2472 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2473 // FIXME: Depending on the type of store, the pointer could be in
2474 // the SGPR Reg bank.
2475 // FIXME: Pointer size should be based on the address space.
2476 const ValueMapping *PtrMapping =
2477 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
2479 OpdsMapping[0] = ValMapping;
2480 OpdsMapping[1] = PtrMapping;
2481 break;
2484 case AMDGPU::G_ICMP: {
2485 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
2486 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2487 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2488 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2490 bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
2491 Op3Bank == AMDGPU::SGPRRegBankID &&
2492 (Size == 32 || (Size == 64 &&
2493 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
2494 Subtarget.hasScalarCompareEq64()));
2496 unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2498 OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
2499 OpdsMapping[1] = nullptr; // Predicate Operand.
2500 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
2501 OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
2502 break;
2504 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2505 // VGPR index can be used for waterfall when indexing a SGPR vector.
2506 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2507 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2508 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2509 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2510 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2511 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
2513 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
2514 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
2516 // The index can be either if the source vector is VGPR.
2517 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2518 break;
2520 case AMDGPU::G_INSERT_VECTOR_ELT: {
2521 unsigned OutputBankID = isSALUMapping(MI) ?
2522 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2524 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2525 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2526 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
2527 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2528 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(),
2529 MRI, *TRI);
2530 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2532 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
2533 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize);
2534 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID,
2535 InsertSize);
2537 // The index can be either if the source vector is VGPR.
2538 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
2539 break;
2541 case AMDGPU::G_UNMERGE_VALUES: {
2542 unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
2543 AMDGPU::VGPRRegBankID;
2545 // Op1 and Dst should use the same register bank.
2546 // FIXME: Shouldn't this be the default? Why do we need to handle this?
2547 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2548 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
2549 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
2551 break;
2553 case AMDGPU::G_INTRINSIC: {
2554 switch (MI.getIntrinsicID()) {
2555 default:
2556 return getInvalidInstructionMapping();
2557 case Intrinsic::amdgcn_div_fmas:
2558 case Intrinsic::amdgcn_trig_preop:
2559 case Intrinsic::amdgcn_sin:
2560 case Intrinsic::amdgcn_cos:
2561 case Intrinsic::amdgcn_log_clamp:
2562 case Intrinsic::amdgcn_rcp:
2563 case Intrinsic::amdgcn_rcp_legacy:
2564 case Intrinsic::amdgcn_rsq:
2565 case Intrinsic::amdgcn_rsq_legacy:
2566 case Intrinsic::amdgcn_rsq_clamp:
2567 case Intrinsic::amdgcn_ldexp:
2568 case Intrinsic::amdgcn_frexp_mant:
2569 case Intrinsic::amdgcn_frexp_exp:
2570 case Intrinsic::amdgcn_fract:
2571 case Intrinsic::amdgcn_cvt_pkrtz:
2572 case Intrinsic::amdgcn_cvt_pknorm_i16:
2573 case Intrinsic::amdgcn_cvt_pknorm_u16:
2574 case Intrinsic::amdgcn_cvt_pk_i16:
2575 case Intrinsic::amdgcn_cvt_pk_u16:
2576 case Intrinsic::amdgcn_fmed3:
2577 case Intrinsic::amdgcn_cubeid:
2578 case Intrinsic::amdgcn_cubema:
2579 case Intrinsic::amdgcn_cubesc:
2580 case Intrinsic::amdgcn_cubetc:
2581 case Intrinsic::amdgcn_sffbh:
2582 case Intrinsic::amdgcn_fmad_ftz:
2583 case Intrinsic::amdgcn_mbcnt_lo:
2584 case Intrinsic::amdgcn_mbcnt_hi:
2585 case Intrinsic::amdgcn_ubfe:
2586 case Intrinsic::amdgcn_sbfe:
2587 case Intrinsic::amdgcn_mul_u24:
2588 case Intrinsic::amdgcn_mul_i24:
2589 case Intrinsic::amdgcn_lerp:
2590 case Intrinsic::amdgcn_sad_u8:
2591 case Intrinsic::amdgcn_msad_u8:
2592 case Intrinsic::amdgcn_sad_hi_u8:
2593 case Intrinsic::amdgcn_sad_u16:
2594 case Intrinsic::amdgcn_qsad_pk_u16_u8:
2595 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
2596 case Intrinsic::amdgcn_mqsad_u32_u8:
2597 case Intrinsic::amdgcn_cvt_pk_u8_f32:
2598 case Intrinsic::amdgcn_alignbit:
2599 case Intrinsic::amdgcn_alignbyte:
2600 case Intrinsic::amdgcn_fdot2:
2601 case Intrinsic::amdgcn_sdot2:
2602 case Intrinsic::amdgcn_udot2:
2603 case Intrinsic::amdgcn_sdot4:
2604 case Intrinsic::amdgcn_udot4:
2605 case Intrinsic::amdgcn_sdot8:
2606 case Intrinsic::amdgcn_udot8:
2607 case Intrinsic::amdgcn_wwm:
2608 case Intrinsic::amdgcn_wqm:
2609 return getDefaultMappingVOP(MI);
2610 case Intrinsic::amdgcn_ds_swizzle:
2611 case Intrinsic::amdgcn_ds_permute:
2612 case Intrinsic::amdgcn_ds_bpermute:
2613 case Intrinsic::amdgcn_update_dpp:
2614 return getDefaultMappingAllVGPR(MI);
2615 case Intrinsic::amdgcn_kernarg_segment_ptr:
2616 case Intrinsic::amdgcn_s_getpc:
2617 case Intrinsic::amdgcn_groupstaticsize: {
2618 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2619 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2620 break;
2622 case Intrinsic::amdgcn_wqm_vote: {
2623 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2624 OpdsMapping[0] = OpdsMapping[2]
2625 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
2626 break;
2628 case Intrinsic::amdgcn_s_buffer_load: {
2629 // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
2630 Register RSrc = MI.getOperand(2).getReg(); // SGPR
2631 Register Offset = MI.getOperand(3).getReg(); // SGPR/imm
2633 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2634 unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
2635 unsigned Size3 = MRI.getType(Offset).getSizeInBits();
2637 unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
2638 unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
2640 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
2641 OpdsMapping[1] = nullptr; // intrinsic id
2643 // Lie and claim everything is legal, even though some need to be
2644 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
2645 OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
2646 OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
2647 OpdsMapping[4] = nullptr;
2648 break;
2650 case Intrinsic::amdgcn_div_scale: {
2651 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2652 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2653 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
2654 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
2656 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
2657 OpdsMapping[3] = AMDGPU::getValueMapping(
2658 getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
2659 OpdsMapping[4] = AMDGPU::getValueMapping(
2660 getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
2662 break;
2664 case Intrinsic::amdgcn_class: {
2665 Register Src0Reg = MI.getOperand(2).getReg();
2666 Register Src1Reg = MI.getOperand(3).getReg();
2667 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
2668 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
2669 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2670 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
2671 OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
2672 Src0Size);
2673 OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
2674 Src1Size);
2675 break;
2677 case Intrinsic::amdgcn_icmp:
2678 case Intrinsic::amdgcn_fcmp: {
2679 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2680 // This is not VCCRegBank because this is not used in boolean contexts.
2681 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
2682 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2683 unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2684 unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2685 OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
2686 OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
2687 break;
2689 case Intrinsic::amdgcn_readlane: {
2690 // This must be an SGPR, but accept a VGPR.
2691 Register IdxReg = MI.getOperand(3).getReg();
2692 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
2693 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2694 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2695 LLVM_FALLTHROUGH;
2697 case Intrinsic::amdgcn_readfirstlane: {
2698 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2699 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2700 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
2701 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
2702 break;
2704 case Intrinsic::amdgcn_writelane: {
2705 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2706 Register SrcReg = MI.getOperand(2).getReg();
2707 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
2708 unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2709 Register IdxReg = MI.getOperand(3).getReg();
2710 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
2711 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2712 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
2714 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
2715 // to legalize.
2716 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
2717 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2718 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
2719 break;
2721 case Intrinsic::amdgcn_if_break: {
2722 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2723 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2724 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2725 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2726 break;
2729 break;
2731 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2732 auto IntrID = MI.getIntrinsicID();
2733 switch (IntrID) {
2734 case Intrinsic::amdgcn_s_getreg:
2735 case Intrinsic::amdgcn_s_memtime:
2736 case Intrinsic::amdgcn_s_memrealtime:
2737 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
2738 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2739 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2740 break;
2742 case Intrinsic::amdgcn_ds_append:
2743 case Intrinsic::amdgcn_ds_consume:
2744 case Intrinsic::amdgcn_ds_fadd:
2745 case Intrinsic::amdgcn_ds_fmin:
2746 case Intrinsic::amdgcn_ds_fmax:
2747 case Intrinsic::amdgcn_atomic_inc:
2748 case Intrinsic::amdgcn_atomic_dec:
2749 return getDefaultMappingAllVGPR(MI);
2750 case Intrinsic::amdgcn_ds_ordered_add:
2751 case Intrinsic::amdgcn_ds_ordered_swap: {
2752 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2753 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
2754 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2755 AMDGPU::SGPRRegBankID);
2756 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
2757 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2758 break;
2760 case Intrinsic::amdgcn_exp_compr:
2761 OpdsMapping[0] = nullptr; // IntrinsicID
2762 // FIXME: These are immediate values which can't be read from registers.
2763 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2764 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2765 // FIXME: Could we support packed types here?
2766 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2767 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2768 // FIXME: These are immediate values which can't be read from registers.
2769 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2770 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2771 break;
2772 case Intrinsic::amdgcn_exp:
2773 // FIXME: Could we support packed types here?
2774 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2775 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2776 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2777 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2778 break;
2779 case Intrinsic::amdgcn_buffer_load: {
2780 Register RSrc = MI.getOperand(2).getReg(); // SGPR
2781 Register VIndex = MI.getOperand(3).getReg(); // VGPR
2782 Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm
2784 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2785 unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
2786 unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
2787 unsigned Size4 = MRI.getType(Offset).getSizeInBits();
2789 unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
2790 unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
2792 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
2793 OpdsMapping[1] = nullptr; // intrinsic id
2795 // Lie and claim everything is legal, even though some need to be
2796 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
2797 OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
2798 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
2799 OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
2800 OpdsMapping[5] = nullptr;
2801 OpdsMapping[6] = nullptr;
2802 break;
2804 case Intrinsic::amdgcn_s_sendmsg:
2805 case Intrinsic::amdgcn_s_sendmsghalt: {
2806 // This must be an SGPR, but accept a VGPR.
2807 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2808 AMDGPU::SGPRRegBankID);
2809 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
2810 break;
2812 case Intrinsic::amdgcn_end_cf:
2813 case Intrinsic::amdgcn_init_exec: {
2814 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2815 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2816 break;
2818 case Intrinsic::amdgcn_else: {
2819 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2820 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2821 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
2822 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
2823 break;
2825 case Intrinsic::amdgcn_kill: {
2826 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2827 break;
2829 case Intrinsic::amdgcn_raw_buffer_load:
2830 case Intrinsic::amdgcn_raw_tbuffer_load: {
2831 // FIXME: Should make intrinsic ID the last operand of the instruction,
2832 // then this would be the same as store
2833 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
2834 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2835 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2836 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2837 break;
2839 case Intrinsic::amdgcn_raw_buffer_store:
2840 case Intrinsic::amdgcn_raw_buffer_store_format:
2841 case Intrinsic::amdgcn_raw_tbuffer_store: {
2842 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
2843 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2844 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2845 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2846 break;
2848 case Intrinsic::amdgcn_struct_buffer_load:
2849 case Intrinsic::amdgcn_struct_tbuffer_load: {
2850 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
2851 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2852 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2853 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2854 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
2855 break;
2857 case Intrinsic::amdgcn_struct_buffer_store:
2858 case Intrinsic::amdgcn_struct_tbuffer_store: {
2859 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
2860 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2861 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2862 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2863 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
2864 break;
2866 case Intrinsic::amdgcn_init_exec_from_input: {
2867 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2868 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2869 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2870 break;
2872 case Intrinsic::amdgcn_ds_gws_init:
2873 case Intrinsic::amdgcn_ds_gws_barrier:
2874 case Intrinsic::amdgcn_ds_gws_sema_br: {
2875 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2877 // This must be an SGPR, but accept a VGPR.
2878 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2879 AMDGPU::SGPRRegBankID);
2880 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
2881 break;
2883 case Intrinsic::amdgcn_ds_gws_sema_v:
2884 case Intrinsic::amdgcn_ds_gws_sema_p:
2885 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
2886 // This must be an SGPR, but accept a VGPR.
2887 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2888 AMDGPU::SGPRRegBankID);
2889 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
2890 break;
2892 default:
2893 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
2894 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
2895 // Non-images can have complications from operands that allow both SGPR
2896 // and VGPR. For now it's too complicated to figure out the final opcode
2897 // to derive the register bank from the MCInstrDesc.
2898 if (RSrcIntrin->IsImage)
2899 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
2902 return getInvalidInstructionMapping();
2904 break;
2906 case AMDGPU::G_SELECT: {
2907 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2908 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2909 AMDGPU::SGPRRegBankID);
2910 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
2911 AMDGPU::SGPRRegBankID);
2912 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
2913 Op3Bank == AMDGPU::SGPRRegBankID;
2915 unsigned CondBankDefault = SGPRSrcs ?
2916 AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2917 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2918 CondBankDefault);
2919 if (CondBank == AMDGPU::SGPRRegBankID)
2920 CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2921 else if (CondBank == AMDGPU::VGPRRegBankID)
2922 CondBank = AMDGPU::VCCRegBankID;
2924 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ?
2925 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2927 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID);
2929 if (Size == 64) {
2930 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2931 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
2932 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2933 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2934 } else {
2935 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
2936 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
2937 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
2938 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
2941 break;
2944 case AMDGPU::G_LOAD:
2945 case AMDGPU::G_ZEXTLOAD:
2946 case AMDGPU::G_SEXTLOAD:
2947 return getInstrMappingForLoad(MI);
2949 case AMDGPU::G_ATOMICRMW_XCHG:
2950 case AMDGPU::G_ATOMICRMW_ADD:
2951 case AMDGPU::G_ATOMICRMW_SUB:
2952 case AMDGPU::G_ATOMICRMW_AND:
2953 case AMDGPU::G_ATOMICRMW_OR:
2954 case AMDGPU::G_ATOMICRMW_XOR:
2955 case AMDGPU::G_ATOMICRMW_MAX:
2956 case AMDGPU::G_ATOMICRMW_MIN:
2957 case AMDGPU::G_ATOMICRMW_UMAX:
2958 case AMDGPU::G_ATOMICRMW_UMIN:
2959 case AMDGPU::G_ATOMICRMW_FADD:
2960 case AMDGPU::G_ATOMIC_CMPXCHG: {
2961 return getDefaultMappingAllVGPR(MI);
2963 case AMDGPU::G_BRCOND: {
2964 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
2965 AMDGPU::SGPRRegBankID);
2966 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
2967 if (Bank != AMDGPU::SCCRegBankID)
2968 Bank = AMDGPU::VCCRegBankID;
2970 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
2971 break;
2975 return getInstructionMapping(/*ID*/1, /*Cost*/1,
2976 getOperandsMapping(OpdsMapping),
2977 MI.getNumOperands());