1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPURegisterBankInfo.h"
15 #include "AMDGPUInstrInfo.h"
16 #include "AMDGPUSubtarget.h"
17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/ADT/SmallSet.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
25 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
26 #include "llvm/CodeGen/TargetRegisterInfo.h"
27 #include "llvm/CodeGen/TargetSubtargetInfo.h"
28 #include "llvm/IR/Constants.h"
30 #define GET_TARGET_REGBANK_IMPL
31 #include "AMDGPUGenRegisterBank.inc"
33 // This file will be TableGen'ed at some point.
34 #include "AMDGPUGenRegisterBankInfo.def"
36 using namespace llvm;
37 using namespace MIPatternMatch;
39 namespace {
41 // Observer to apply a register bank to new registers created by LegalizerHelper.
42 class ApplyRegBankMapping final : public GISelChangeObserver {
43 private:
44 MachineRegisterInfo &MRI;
45 const RegisterBank *NewBank;
46 SmallVector<MachineInstr *, 4> NewInsts;
48 public:
49 ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
50 : MRI(MRI_), NewBank(RB) {}
52 ~ApplyRegBankMapping() {
53 for (MachineInstr *MI : NewInsts)
54 applyBank(*MI);
57 /// Set any registers that don't have a set register class or bank to SALU.
58 void applyBank(MachineInstr &MI) {
59 for (MachineOperand &Op : MI.operands()) {
60 if (!Op.isReg())
61 continue;
63 Register Reg = Op.getReg();
64 if (MRI.getRegClassOrRegBank(Reg))
65 continue;
67 const RegisterBank *RB = NewBank;
68 // FIXME: This might not be enough to detect when SCC should be used.
69 if (MRI.getType(Reg) == LLT::scalar(1))
70 RB = (NewBank == &AMDGPU::SGPRRegBank ?
71 &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);
73 MRI.setRegBank(Reg, *RB);
77 void erasingInstr(MachineInstr &MI) override {}
79 void createdInstr(MachineInstr &MI) override {
80 // At this point, the instruction was just inserted and has no operands.
81 NewInsts.push_back(&MI);
84 void changingInstr(MachineInstr &MI) override {}
85 void changedInstr(MachineInstr &MI) override {}
89 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
90 : AMDGPUGenRegisterBankInfo(),
91 Subtarget(ST),
92 TRI(Subtarget.getRegisterInfo()),
93 TII(Subtarget.getInstrInfo()) {
95 // HACK: Until this is fully tablegen'd.
96 static bool AlreadyInit = false;
97 if (AlreadyInit)
98 return;
100 AlreadyInit = true;
102 const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID);
103 (void)RBSGPR;
104 assert(&RBSGPR == &AMDGPU::SGPRRegBank);
106 const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID);
107 (void)RBVGPR;
108 assert(&RBVGPR == &AMDGPU::VGPRRegBank);
112 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
113 const RegisterBank &Src,
114 unsigned Size) const {
115 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
116 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
117 Src.getID() == AMDGPU::VGPRRegBankID) {
118 return std::numeric_limits<unsigned>::max();
121 // Bool values are tricky, because the meaning is based on context. The SCC
122 // and VCC banks are for the natural scalar and vector conditions produced by
123 // a compare.
125 // Legalization doesn't know about the necessary context, so an s1 use may
126 // have been a truncate from an arbitrary value, in which case a copy (lowered
127 // as a compare with 0) needs to be inserted.
128 if (Size == 1 &&
129 (Dst.getID() == AMDGPU::SCCRegBankID ||
130 Dst.getID() == AMDGPU::SGPRRegBankID) &&
131 (Src.getID() == AMDGPU::SGPRRegBankID ||
132 Src.getID() == AMDGPU::VGPRRegBankID ||
133 Src.getID() == AMDGPU::VCCRegBankID))
134 return std::numeric_limits<unsigned>::max();
136 if (Dst.getID() == AMDGPU::SCCRegBankID &&
137 Src.getID() == AMDGPU::VCCRegBankID)
138 return std::numeric_limits<unsigned>::max();
140 return RegisterBankInfo::copyCost(Dst, Src, Size);
143 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
144 const ValueMapping &ValMapping,
145 const RegisterBank *CurBank) const {
146 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
147 // VGPR.
148 // FIXME: Is there a better way to do this?
149 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
150 return 10; // This is expensive.
152 assert(ValMapping.NumBreakDowns == 2 &&
153 ValMapping.BreakDown[0].Length == 32 &&
154 ValMapping.BreakDown[0].StartIdx == 0 &&
155 ValMapping.BreakDown[1].Length == 32 &&
156 ValMapping.BreakDown[1].StartIdx == 32 &&
157 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
159 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
160 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
161 // want.
163 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
164 // alignment restrictions, but this probably isn't important.
165 return 1;
168 const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
169 const TargetRegisterClass &RC) const {
170 if (&RC == &AMDGPU::SReg_1RegClass)
171 return AMDGPU::VCCRegBank;
173 return TRI->isSGPRClass(&RC) ? AMDGPU::SGPRRegBank : AMDGPU::VGPRRegBank;
176 template <unsigned NumOps>
177 RegisterBankInfo::InstructionMappings
178 AMDGPURegisterBankInfo::addMappingFromTable(
179 const MachineInstr &MI, const MachineRegisterInfo &MRI,
180 const std::array<unsigned, NumOps> RegSrcOpIdx,
181 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
183 InstructionMappings AltMappings;
185 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
187 unsigned Sizes[NumOps];
188 for (unsigned I = 0; I < NumOps; ++I) {
189 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
190 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
193 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
194 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
195 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
198 // getInstrMapping's default mapping uses ID 1, so start at 2.
199 unsigned MappingID = 2;
200 for (const auto &Entry : Table) {
201 for (unsigned I = 0; I < NumOps; ++I) {
202 int OpIdx = RegSrcOpIdx[I];
203 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
206 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
207 getOperandsMapping(Operands),
208 Operands.size()));
211 return AltMappings;
214 RegisterBankInfo::InstructionMappings
215 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
216 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
217 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
218 case Intrinsic::amdgcn_readlane: {
219 static const OpRegBankEntry<3> Table[2] = {
220 // Perfectly legal.
221 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
223 // Need a readfirstlane for the index.
224 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
227 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
228 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
230 case Intrinsic::amdgcn_writelane: {
231 static const OpRegBankEntry<4> Table[4] = {
232 // Perfectly legal.
233 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
235 // Need readfirstlane of first op
236 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
238 // Need readfirstlane of second op
239 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
241 // Need readfirstlane of both ops
242 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
245 // rsrc, voffset, offset
246 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
247 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
249 default:
250 return RegisterBankInfo::getInstrAlternativeMappings(MI);
254 RegisterBankInfo::InstructionMappings
255 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
256 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
258 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
259 case Intrinsic::amdgcn_buffer_load: {
260 static const OpRegBankEntry<3> Table[4] = {
261 // Perfectly legal.
262 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
263 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
265 // Waterfall loop needed for rsrc. In the worst case this will execute
266 // approximately an extra 10 * wavesize + 2 instructions.
267 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
268 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
271 // rsrc, voffset, offset
272 const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
273 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
275 case Intrinsic::amdgcn_s_buffer_load: {
276 static const OpRegBankEntry<2> Table[4] = {
277 // Perfectly legal.
278 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
280 // Only need 1 register in loop
281 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
283 // Have to waterfall the resource.
284 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
286 // Have to waterfall the resource, and the offset.
287 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
290 // rsrc, offset
291 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
292 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
294 case Intrinsic::amdgcn_ds_ordered_add:
295 case Intrinsic::amdgcn_ds_ordered_swap: {
296 // VGPR = M0, VGPR
297 static const OpRegBankEntry<3> Table[2] = {
298 // Perfectly legal.
299 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
301 // Need a readfirstlane for m0
302 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
305 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
306 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
308 case Intrinsic::amdgcn_s_sendmsg:
309 case Intrinsic::amdgcn_s_sendmsghalt: {
310 // FIXME: Should have no register for immediate
311 static const OpRegBankEntry<2> Table[2] = {
312 // Perfectly legal.
313 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
315 // Need readlane
316 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
319 const std::array<unsigned, 2> RegSrcOpIdx = { { 1, 2 } };
320 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
322 default:
323 return RegisterBankInfo::getInstrAlternativeMappings(MI);
327 static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI) {
328 if (!MI.hasOneMemOperand())
329 return false;
331 const MachineMemOperand *MMO = *MI.memoperands_begin();
332 return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
333 AMDGPUInstrInfo::isUniformMMO(MMO);
336 RegisterBankInfo::InstructionMappings
337 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
338 const MachineInstr &MI) const {
340 const MachineFunction &MF = *MI.getParent()->getParent();
341 const MachineRegisterInfo &MRI = MF.getRegInfo();
344 InstructionMappings AltMappings;
345 switch (MI.getOpcode()) {
346 case TargetOpcode::G_CONSTANT: {
347 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
348 if (Size == 1) {
349 static const OpRegBankEntry<1> Table[4] = {
350 { { AMDGPU::VGPRRegBankID }, 1 },
351 { { AMDGPU::SGPRRegBankID }, 1 },
352 { { AMDGPU::VCCRegBankID }, 1 },
353 { { AMDGPU::SCCRegBankID }, 1 }
356 return addMappingFromTable<1>(MI, MRI, { 0 }, Table);
359 LLVM_FALLTHROUGH;
361 case TargetOpcode::G_FCONSTANT:
362 case TargetOpcode::G_FRAME_INDEX:
363 case TargetOpcode::G_GLOBAL_VALUE: {
364 static const OpRegBankEntry<1> Table[2] = {
365 { { AMDGPU::VGPRRegBankID }, 1 },
366 { { AMDGPU::SGPRRegBankID }, 1 }
369 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
371 case TargetOpcode::G_AND:
372 case TargetOpcode::G_OR:
373 case TargetOpcode::G_XOR: {
374 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
376 if (Size == 1) {
377 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
378 const InstructionMapping &SCCMapping = getInstructionMapping(
379 1, 1, getOperandsMapping(
380 {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
381 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
382 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
383 3); // Num Operands
384 AltMappings.push_back(&SCCMapping);
386 const InstructionMapping &SGPRMapping = getInstructionMapping(
387 1, 1, getOperandsMapping(
388 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
389 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
390 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
391 3); // Num Operands
392 AltMappings.push_back(&SGPRMapping);
394 const InstructionMapping &VCCMapping0 = getInstructionMapping(
395 2, 10, getOperandsMapping(
396 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
397 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
398 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
399 3); // Num Operands
400 AltMappings.push_back(&VCCMapping0);
401 return AltMappings;
404 if (Size != 64)
405 break;
407 const InstructionMapping &SSMapping = getInstructionMapping(
408 1, 1, getOperandsMapping(
409 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
410 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
411 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
412 3); // Num Operands
413 AltMappings.push_back(&SSMapping);
415 const InstructionMapping &VVMapping = getInstructionMapping(
416 2, 2, getOperandsMapping(
417 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
418 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
419 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
420 3); // Num Operands
421 AltMappings.push_back(&VVMapping);
423 const InstructionMapping &SVMapping = getInstructionMapping(
424 3, 3, getOperandsMapping(
425 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
426 AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
427 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
428 3); // Num Operands
429 AltMappings.push_back(&SVMapping);
431 // SGPR in LHS is slightly preferable, so make VS more expensive than
432 // SV.
433 const InstructionMapping &VSMapping = getInstructionMapping(
434 3, 4, getOperandsMapping(
435 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
436 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
437 AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
438 3); // Num Operands
439 AltMappings.push_back(&VSMapping);
440 break;
442 case TargetOpcode::G_LOAD:
443 case TargetOpcode::G_ZEXTLOAD:
444 case TargetOpcode::G_SEXTLOAD: {
445 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
446 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
447 unsigned PtrSize = PtrTy.getSizeInBits();
448 unsigned AS = PtrTy.getAddressSpace();
449 LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
450 if (isInstrUniformNonExtLoadAlign4(MI) &&
451 (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)) {
452 const InstructionMapping &SSMapping = getInstructionMapping(
453 1, 1, getOperandsMapping(
454 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
455 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
456 2); // Num Operands
457 AltMappings.push_back(&SSMapping);
460 const InstructionMapping &VVMapping = getInstructionMapping(
461 2, 1, getOperandsMapping(
462 {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
463 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
464 2); // Num Operands
465 AltMappings.push_back(&VVMapping);
467 // It may be possible to have a vgpr = load sgpr mapping here, because
468 // the mubuf instructions support this kind of load, but probably only for
469 // gfx7 and older. However, the addressing mode matching in the instruction
470 // selector should be able to do a better job of detecting and selecting
471 // these kinds of loads from the vgpr = load vgpr mapping.
473 return AltMappings;
476 case TargetOpcode::G_ICMP: {
477 unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
478 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
479 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
480 nullptr, // Predicate operand.
481 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
482 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
483 4); // Num Operands
484 AltMappings.push_back(&SSMapping);
486 const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
487 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
488 nullptr, // Predicate operand.
489 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
490 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
491 4); // Num Operands
492 AltMappings.push_back(&SVMapping);
494 const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
495 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
496 nullptr, // Predicate operand.
497 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
498 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
499 4); // Num Operands
500 AltMappings.push_back(&VSMapping);
502 const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
503 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
504 nullptr, // Predicate operand.
505 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
506 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
507 4); // Num Operands
508 AltMappings.push_back(&VVMapping);
510 return AltMappings;
512 case TargetOpcode::G_SELECT: {
513 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
514 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
515 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
516 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
517 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
518 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
519 4); // Num Operands
520 AltMappings.push_back(&SSMapping);
522 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
523 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
524 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
525 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
526 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
527 4); // Num Operands
528 AltMappings.push_back(&VVMapping);
530 return AltMappings;
532 case TargetOpcode::G_SMIN:
533 case TargetOpcode::G_SMAX:
534 case TargetOpcode::G_UMIN:
535 case TargetOpcode::G_UMAX: {
536 static const OpRegBankEntry<3> Table[4] = {
537 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
538 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
539 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
541 // Scalar requires cmp+select, and extends if 16-bit.
542 // FIXME: Should there be separate costs for 32 and 16-bit?
543 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
546 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
547 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
549 case TargetOpcode::G_UADDE:
550 case TargetOpcode::G_USUBE:
551 case TargetOpcode::G_SADDE:
552 case TargetOpcode::G_SSUBE: {
553 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
554 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
555 getOperandsMapping(
556 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
557 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
558 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
559 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
560 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
561 5); // Num Operands
562 AltMappings.push_back(&SSMapping);
564 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
565 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
566 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
567 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
568 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
569 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
570 5); // Num Operands
571 AltMappings.push_back(&VVMapping);
572 return AltMappings;
574 case AMDGPU::G_BRCOND: {
575 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
577 const InstructionMapping &SMapping = getInstructionMapping(
578 1, 1, getOperandsMapping(
579 {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
580 2); // Num Operands
581 AltMappings.push_back(&SMapping);
583 const InstructionMapping &VMapping = getInstructionMapping(
584 1, 1, getOperandsMapping(
585 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
586 2); // Num Operands
587 AltMappings.push_back(&VMapping);
588 return AltMappings;
590 case AMDGPU::G_INTRINSIC:
591 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
592 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
593 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
594 default:
595 break;
597 return RegisterBankInfo::getInstrAlternativeMappings(MI);
600 void AMDGPURegisterBankInfo::split64BitValueForMapping(
601 MachineIRBuilder &B,
602 SmallVector<Register, 2> &Regs,
603 LLT HalfTy,
604 Register Reg) const {
605 assert(HalfTy.getSizeInBits() == 32);
606 MachineRegisterInfo *MRI = B.getMRI();
607 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
608 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
609 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
610 MRI->setRegBank(LoLHS, *Bank);
611 MRI->setRegBank(HiLHS, *Bank);
613 Regs.push_back(LoLHS);
614 Regs.push_back(HiLHS);
616 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
617 .addDef(LoLHS)
618 .addDef(HiLHS)
619 .addUse(Reg);
622 /// Replace the current type of each register in \p Regs with \p NewTy
623 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
624 LLT NewTy) {
625 for (Register Reg : Regs) {
626 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
627 MRI.setType(Reg, NewTy);
631 static LLT getHalfSizedType(LLT Ty) {
632 if (Ty.isVector()) {
633 assert(Ty.getNumElements() % 2 == 0);
634 return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
637 assert(Ty.getSizeInBits() % 2 == 0);
638 return LLT::scalar(Ty.getSizeInBits() / 2);
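// Concrete cases of the helper above, for reference: a <4 x s32> becomes
// <2 x s32>, and an s64 becomes s32.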
641 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
642 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
643 /// execute the instruction for each unique combination of values in all lanes
644 /// in the wave. The block will be split such that the rest of the instructions are
645 /// moved to a new block.
647 /// Essentially performs this loop:
649 /// Save Execution Mask
650 /// For (Lane : Wavefront) {
651 /// Enable Lane, Disable all other lanes
652 /// SGPR = read SGPR value for current lane from VGPR
653 /// VGPRResult[Lane] = use_op SGPR
654 /// }
655 /// Restore Execution Mask
657 /// There is additional complexity from comparing the operand values in order
658 /// to identify the unique values used.
659 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
660 MachineIRBuilder &B,
661 MachineInstr &MI,
662 MachineRegisterInfo &MRI,
663 ArrayRef<unsigned> OpIndices) const {
664 MachineFunction *MF = MI.getParent()->getParent();
665 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
666 const SIInstrInfo *TII = ST.getInstrInfo();
667 MachineBasicBlock::iterator I(MI);
669 MachineBasicBlock &MBB = *MI.getParent();
670 const DebugLoc &DL = MI.getDebugLoc();
672 // Use a set to avoid extra readfirstlanes in the case where multiple operands
673 // are the same register.
674 SmallSet<Register, 4> SGPROperandRegs;
675 for (unsigned Op : OpIndices) {
676 assert(MI.getOperand(Op).isUse());
677 Register Reg = MI.getOperand(Op).getReg();
678 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
679 if (OpBank->getID() == AMDGPU::VGPRRegBankID)
680 SGPROperandRegs.insert(Reg);
683 // No operands need to be replaced, so no need to loop.
684 if (SGPROperandRegs.empty())
685 return false;
687 SmallVector<Register, 4> ResultRegs;
688 SmallVector<Register, 4> InitResultRegs;
689 SmallVector<Register, 4> PhiRegs;
690 for (MachineOperand &Def : MI.defs()) {
691 LLT ResTy = MRI.getType(Def.getReg());
692 const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
693 ResultRegs.push_back(Def.getReg());
694 Register InitReg = B.buildUndef(ResTy).getReg(0);
695 Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
696 InitResultRegs.push_back(InitReg);
697 PhiRegs.push_back(PhiReg);
698 MRI.setRegBank(PhiReg, *DefBank);
699 MRI.setRegBank(InitReg, *DefBank);
702 Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
703 Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
705 // Don't bother using generic instructions/registers for the exec mask.
706 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
707 .addDef(InitSaveExecReg);
709 Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
710 Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
712 // To insert the loop we need to split the block. Move everything from this
713 // point onward to a new block, and insert the new loop blocks before it.
714 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
715 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
716 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
717 MachineFunction::iterator MBBI(MBB);
718 ++MBBI;
719 MF->insert(MBBI, LoopBB);
720 MF->insert(MBBI, RestoreExecBB);
721 MF->insert(MBBI, RemainderBB);
723 LoopBB->addSuccessor(RestoreExecBB);
724 LoopBB->addSuccessor(LoopBB);
726 // Move the rest of the block into a new block.
727 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
728 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
730 MBB.addSuccessor(LoopBB);
731 RestoreExecBB->addSuccessor(RemainderBB);
733 B.setInsertPt(*LoopBB, LoopBB->end());
735 B.buildInstr(TargetOpcode::PHI)
736 .addDef(PhiExec)
737 .addReg(InitSaveExecReg)
738 .addMBB(&MBB)
739 .addReg(NewExec)
740 .addMBB(LoopBB);
742 for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
743 B.buildInstr(TargetOpcode::G_PHI)
744 .addDef(std::get<2>(Result))
745 .addReg(std::get<0>(Result)) // Initial value / implicit_def
746 .addMBB(&MBB)
747 .addReg(std::get<1>(Result)) // Mid-loop value.
748 .addMBB(LoopBB);
751 // Move the instruction into the loop.
752 LoopBB->splice(LoopBB->end(), &MBB, I);
753 I = std::prev(LoopBB->end());
755 B.setInstr(*I);
757 Register CondReg;
759 for (MachineOperand &Op : MI.uses()) {
760 if (!Op.isReg())
761 continue;
763 assert(!Op.isDef());
764 if (SGPROperandRegs.count(Op.getReg())) {
765 LLT OpTy = MRI.getType(Op.getReg());
766 unsigned OpSize = OpTy.getSizeInBits();
768 // Can only do a readlane of 32-bit pieces.
769 if (OpSize == 32) {
770 // Avoid extra copies in the simple case of one 32-bit register.
771 Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
772 MRI.setType(CurrentLaneOpReg, OpTy);
774 constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
775 // Read the next variant <- also loop target.
776 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg)
777 .addReg(Op.getReg());
779 Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
780 bool First = CondReg == AMDGPU::NoRegister;
781 if (First)
782 CondReg = NewCondReg;
784 // Compare the just-read value against the operand's value in each lane.
785 B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
786 .addDef(NewCondReg)
787 .addReg(CurrentLaneOpReg)
788 .addReg(Op.getReg());
789 Op.setReg(CurrentLaneOpReg);
791 if (!First) {
792 Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
794 // If there are multiple operands to consider, AND the conditions together.
795 B.buildInstr(AMDGPU::S_AND_B64)
796 .addDef(AndReg)
797 .addReg(NewCondReg)
798 .addReg(CondReg);
799 CondReg = AndReg;
801 } else {
802 LLT S32 = LLT::scalar(32);
803 SmallVector<Register, 8> ReadlanePieces;
805 // The compares can be done as 64-bit, but the extract needs to be done
806 // in 32-bit pieces.
808 bool Is64 = OpSize % 64 == 0;
810 LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
811 unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
812 : AMDGPU::V_CMP_EQ_U32_e64;
814 // The compares can be done as 64-bit, but the extract needs to be done
815 // in 32-bit pieces.
817 // Insert the unmerge before the loop.
819 B.setMBB(MBB);
820 auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
821 B.setInstr(*I);
823 unsigned NumPieces = Unmerge->getNumOperands() - 1;
824 for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
825 Register UnmergePiece = Unmerge.getReg(PieceIdx);
827 Register CurrentLaneOpReg;
828 if (Is64) {
829 Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
830 Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
832 MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
833 MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
834 MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
836 // Read the next variant <- also loop target.
837 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
838 CurrentLaneOpRegLo)
839 .addReg(UnmergePiece, 0, AMDGPU::sub0);
841 // Read the next variant <- also loop target.
842 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
843 CurrentLaneOpRegHi)
844 .addReg(UnmergePiece, 0, AMDGPU::sub1);
846 CurrentLaneOpReg =
847 B.buildMerge(LLT::scalar(64),
848 {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
849 .getReg(0);
851 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
853 if (OpTy.getScalarSizeInBits() == 64) {
854 // If we need to produce a 64-bit element vector, use the
855 // merged pieces.
856 ReadlanePieces.push_back(CurrentLaneOpReg);
857 } else {
858 // 32-bit element type.
859 ReadlanePieces.push_back(CurrentLaneOpRegLo);
860 ReadlanePieces.push_back(CurrentLaneOpRegHi);
862 } else {
863 CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
864 MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
865 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
867 // Read the next variant <- also loop target.
868 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
869 CurrentLaneOpReg)
870 .addReg(UnmergePiece);
871 ReadlanePieces.push_back(CurrentLaneOpReg);
874 Register NewCondReg
875 = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
876 bool First = CondReg == AMDGPU::NoRegister;
877 if (First)
878 CondReg = NewCondReg;
880 B.buildInstr(CmpOp)
881 .addDef(NewCondReg)
882 .addReg(CurrentLaneOpReg)
883 .addReg(UnmergePiece);
885 if (!First) {
886 Register AndReg
887 = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
889 // If there are multiple operands to consider, AND the conditions together.
890 B.buildInstr(AMDGPU::S_AND_B64)
891 .addDef(AndReg)
892 .addReg(NewCondReg)
893 .addReg(CondReg);
894 CondReg = AndReg;
898 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
899 // BUILD_VECTOR
900 if (OpTy.isVector()) {
901 auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
902 Op.setReg(Merge.getReg(0));
903 } else {
904 auto Merge = B.buildMerge(OpTy, ReadlanePieces);
905 Op.setReg(Merge.getReg(0));
908 MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
913 B.setInsertPt(*LoopBB, LoopBB->end());
915 // Update EXEC, save the original EXEC value to VCC.
916 B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
917 .addDef(NewExec)
918 .addReg(CondReg, RegState::Kill);
920 MRI.setSimpleHint(NewExec, CondReg);
922 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
923 B.buildInstr(AMDGPU::S_XOR_B64_term)
924 .addDef(AMDGPU::EXEC)
925 .addReg(AMDGPU::EXEC)
926 .addReg(NewExec);
928 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
929 // s_cbranch_scc0?
931 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
932 B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
933 .addMBB(LoopBB);
935 // Save the EXEC mask before the loop.
936 BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
937 .addReg(AMDGPU::EXEC);
939 // Restore the EXEC mask after the loop.
940 B.setMBB(*RestoreExecBB);
941 B.buildInstr(AMDGPU::S_MOV_B64_term)
942 .addDef(AMDGPU::EXEC)
943 .addReg(SaveExecReg);
945 // Restore the insert point before the original instruction.
946 B.setInsertPt(MBB, MBB.end());
948 return true;
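// For reference, the control flow produced above is: the original block falls
// through to LoopBB; LoopBB branches back to itself while EXEC is still
// non-zero, then falls through to RestoreExecBB, which restores EXEC and falls
// through to RemainderBB (the tail of the original block).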
951 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
952 MachineInstr &MI, MachineRegisterInfo &MRI,
953 ArrayRef<unsigned> OpIndices) const {
954 MachineIRBuilder B(MI);
955 return executeInWaterfallLoop(B, MI, MRI, OpIndices);
958 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
959 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
960 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
961 Register Reg = MI.getOperand(OpIdx).getReg();
962 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
963 if (Bank != &AMDGPU::VGPRRegBank)
964 return;
966 MachineIRBuilder B(MI);
967 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
968 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
969 .addDef(SGPR)
970 .addReg(Reg);
972 const TargetRegisterClass *Constrained =
973 constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
974 (void)Constrained;
975 assert(Constrained && "Failed to constrain readfirstlane src reg");
977 MI.getOperand(OpIdx).setReg(SGPR);
980 // When regbankselect repairs registers, it will insert a repair instruction
981 // which defines the repaired register. Then it calls applyMapping and expects
982 // that the targets will either delete or rewrite the instruction that
983 // originally wrote to the repaired registers. Because of this, we end up in
984 // a situation where we have 2 instructions defining the same registers.
985 static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
986 Register Reg,
987 const MachineInstr &MI) {
988 // Is there some way we can assert that there are exactly 2 def instructions?
989 for (MachineInstr &Other : MRI.def_instructions(Reg)) {
990 if (&Other != &MI)
991 return &Other;
994 return nullptr;
997 bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
998 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
999 MachineRegisterInfo &MRI) const {
1000 Register DstReg = MI.getOperand(0).getReg();
1001 const LLT LoadTy = MRI.getType(DstReg);
1002 unsigned LoadSize = LoadTy.getSizeInBits();
1003 const unsigned MaxNonSmrdLoadSize = 128;
1004 // 128-bit loads are supported for all instruction types.
1005 if (LoadSize <= MaxNonSmrdLoadSize)
1006 return false;
1008 SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
1009 SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
1011 // If the pointer is an SGPR, we have nothing to do.
1012 if (SrcRegs.empty())
1013 return false;
1015 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1017 // We want to get the repair instruction now, because it will help us
1018 // determine which instruction the legalizer inserts that will also
1019 // write to DstReg.
1020 MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);
1022 // RegBankSelect only emits scalar types, so we need to reset the pointer
1023 // operand to a pointer type.
1024 Register BasePtrReg = SrcRegs[0];
1025 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1026 MRI.setType(BasePtrReg, PtrTy);
1028 MachineIRBuilder B(MI);
1030 unsigned SplitElts =
1031 MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
1032 const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
1033 ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
1034 GISelObserverWrapper Observer(&O);
1035 B.setChangeObserver(Observer);
1036 LegalizerHelper Helper(B.getMF(), Observer, B);
1037 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1038 return false;
1040 // At this point, the legalizer has split the original load into smaller
1041 // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
1042 // that combines the outputs of the lower loads and writes it to DstReg.
1043 // The register bank selector has also added the RepairInst which writes to
1044 // DstReg as well.
1046 MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);
1048 // Replace the output of the LegalizedInst with a temporary register, since
1049 // RepairInst already defines DstReg.
1050 Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
1051 LegalizedInst->getOperand(0).setReg(TmpReg);
1052 B.setInsertPt(*RepairInst->getParent(), RepairInst);
1054 for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
1055 Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
1056 B.buildConstant(IdxReg, DefIdx);
1057 MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
1058 B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
1061 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
1062 return true;
1065 bool AMDGPURegisterBankInfo::applyMappingImage(
1066 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1067 MachineRegisterInfo &MRI, int RsrcIdx) const {
1068 const int NumDefs = MI.getNumExplicitDefs();
1070 // The reported argument index is relative to the IR intrinsic call arguments,
1071 // so we need to shift by the number of defs and the intrinsic ID.
1072 RsrcIdx += NumDefs + 1;
1074 // Insert copies to VGPR arguments.
1075 applyDefaultMapping(OpdMapper);
1077 // Fixup any SGPR arguments.
1078 SmallVector<unsigned, 4> SGPRIndexes;
1079 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1080 if (!MI.getOperand(I).isReg())
1081 continue;
1083 // If this intrinsic has a sampler, it immediately follows rsrc.
1084 if (I == RsrcIdx || I == RsrcIdx + 1)
1085 SGPRIndexes.push_back(I);
1088 executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1089 return true;
1092 // For cases where only a single copy is inserted for matching register banks,
1093 // replace the register in the instruction operand.
1094 static void substituteSimpleCopyRegs(
1095 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1096 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1097 if (!SrcReg.empty()) {
1098 assert(SrcReg.size() == 1);
1099 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1103 /// Handle register layout difference for f16 images for some subtargets.
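/// As a rough illustration of the rewrite below: on subtargets with unpacked
/// D16 VMEM, a packed <4 x s16> data value is unmerged into four s16 pieces
/// and rebuilt as a <4 x s32>, so each 16-bit element occupies its own 32-bit
/// register.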
1104 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1105 MachineRegisterInfo &MRI,
1106 Register Reg) const {
1107 if (!Subtarget.hasUnpackedD16VMem())
1108 return Reg;
1110 const LLT S16 = LLT::scalar(16);
1111 LLT StoreVT = MRI.getType(Reg);
1112 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1113 return Reg;
1115 auto Unmerge = B.buildUnmerge(S16, Reg);
1118 SmallVector<Register, 4> WideRegs;
1119 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1120 WideRegs.push_back(Unmerge.getReg(I));
1122 const LLT S32 = LLT::scalar(32);
1123 int NumElts = StoreVT.getNumElements();
1125 return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1128 static std::pair<Register, unsigned>
1129 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1130 int64_t Const;
1131 if (mi_match(Reg, MRI, m_ICst(Const)))
1132 return std::make_pair(Register(), Const);
1134 Register Base;
1135 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1136 return std::make_pair(Base, Const);
1138 // TODO: Handle G_OR used for add case
1139 return std::make_pair(Reg, 0);
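// For reference: a plain G_CONSTANT (e.g. 42) decomposes to {Register(), 42},
// a G_ADD of a base register and a constant 16 decomposes to {base, 16}, and
// anything else is returned unchanged as {Reg, 0}.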
1142 std::pair<Register, unsigned>
1143 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1144 Register OrigOffset) const {
1145 const unsigned MaxImm = 4095;
1146 Register BaseReg;
1147 unsigned ImmOffset;
1148 const LLT S32 = LLT::scalar(32);
1150 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1151 OrigOffset);
1153 unsigned C1 = 0;
1154 if (ImmOffset != 0) {
1155 // If the immediate value is too big for the immoffset field, put the value
1156 // and -4096 into the immoffset field so that the value that is copied/added
1157 // for the voffset field is a multiple of 4096, and it stands more chance
1158 // of being CSEd with the copy/add for another similar load/store.
1159 // However, do not do that rounding down to a multiple of 4096 if that is a
1160 // negative number, as it appears to be illegal to have a negative offset
1161 // in the vgpr, even if adding the immediate offset makes it positive.
1162 unsigned Overflow = ImmOffset & ~MaxImm;
1163 ImmOffset -= Overflow;
1164 if ((int32_t)Overflow < 0) {
1165 Overflow += ImmOffset;
1166 ImmOffset = 0;
1169 C1 = ImmOffset;
1170 if (Overflow != 0) {
1171 if (!BaseReg)
1172 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1173 else {
1174 auto OverflowVal = B.buildConstant(S32, Overflow);
1175 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1180 if (!BaseReg)
1181 BaseReg = B.buildConstant(S32, 0).getReg(0);
1183 return {BaseReg, C1};
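// Worked example of the split above, with illustrative numbers: an incoming
// constant offset of 5000 gives Overflow = 5000 & ~4095 = 4096 and
// ImmOffset = 904, so the voffset base register holds 4096 and the returned
// immediate is 904. An offset that already fits, e.g. 100, stays entirely in
// the immediate and the voffset base is a constant 0.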
1186 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1187 int64_t C;
1188 return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1191 static unsigned extractGLC(unsigned CachePolicy) {
1192 return CachePolicy & 1;
1195 static unsigned extractSLC(unsigned CachePolicy) {
1196 return (CachePolicy >> 1) & 1;
1199 static unsigned extractDLC(unsigned CachePolicy) {
1200 return (CachePolicy >> 2) & 1;
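// The cache policy operand packs the bits extracted above as:
// bit 0 = glc, bit 1 = slc, bit 2 = dlc.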
1203 MachineInstr *
1204 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1205 MachineInstr &MI) const {
1206 MachineRegisterInfo &MRI = *B.getMRI();
1207 executeInWaterfallLoop(B, MI, MRI, {2, 4});
1209 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1211 Register VData = MI.getOperand(1).getReg();
1212 LLT Ty = MRI.getType(VData);
1214 int EltSize = Ty.getScalarSizeInBits();
1215 int Size = Ty.getSizeInBits();
1217 // FIXME: Broken integer truncstore.
1218 if (EltSize != 32)
1219 report_fatal_error("unhandled intrinsic store");
1221 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1222 const int MemSize = (*MI.memoperands_begin())->getSize();
1225 Register RSrc = MI.getOperand(2).getReg();
1226 Register VOffset = MI.getOperand(3).getReg();
1227 Register SOffset = MI.getOperand(4).getReg();
1228 unsigned CachePolicy = MI.getOperand(5).getImm();
1230 unsigned ImmOffset;
1231 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1233 const bool Offen = !isZero(VOffset, MRI);
1235 unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1236 switch (8 * MemSize) {
1237 case 8:
1238 Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1239 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1240 break;
1241 case 16:
1242 Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1243 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1244 break;
1245 default:
1246 Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1247 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1248 if (Size > 32)
1249 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1250 break;
1254 // Set the insertion point back to the instruction in case it was moved into a
1255 // loop.
1256 B.setInstr(MI);
1258 MachineInstrBuilder MIB = B.buildInstr(Opc)
1259 .addUse(VData);
1261 if (Offen)
1262 MIB.addUse(VOffset);
1264 MIB.addUse(RSrc)
1265 .addUse(SOffset)
1266 .addImm(ImmOffset)
1267 .addImm(extractGLC(CachePolicy))
1268 .addImm(extractSLC(CachePolicy))
1269 .addImm(0) // tfe: FIXME: Remove from inst
1270 .addImm(extractDLC(CachePolicy))
1271 .cloneMemRefs(MI);
1273 // FIXME: We need a way to report failure from applyMappingImpl.
1274 // Insert constrain copies before inserting the loop.
1275 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1276 report_fatal_error("failed to constrain selected store intrinsic");
1278 return MIB;
1281 void AMDGPURegisterBankInfo::applyMappingImpl(
1282 const OperandsMapper &OpdMapper) const {
1283 MachineInstr &MI = OpdMapper.getMI();
1284 unsigned Opc = MI.getOpcode();
1285 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1286 switch (Opc) {
1287 case AMDGPU::G_SELECT: {
1288 Register DstReg = MI.getOperand(0).getReg();
1289 LLT DstTy = MRI.getType(DstReg);
1290 if (DstTy.getSizeInBits() != 64)
1291 break;
1293 LLT HalfTy = getHalfSizedType(DstTy);
1295 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1296 SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1));
1297 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1298 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
1300 // All inputs are SGPRs, nothing special to do.
1301 if (DefRegs.empty()) {
1302 assert(Src1Regs.empty() && Src2Regs.empty());
1303 break;
1306 MachineIRBuilder B(MI);
1307 if (Src0Regs.empty())
1308 Src0Regs.push_back(MI.getOperand(1).getReg());
1309 else {
1310 assert(Src0Regs.size() == 1);
1313 if (Src1Regs.empty())
1314 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1315 else {
1316 setRegsToType(MRI, Src1Regs, HalfTy);
1319 if (Src2Regs.empty())
1320 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
1321 else
1322 setRegsToType(MRI, Src2Regs, HalfTy);
1324 setRegsToType(MRI, DefRegs, HalfTy);
1326 B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
1327 B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);
1329 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
1330 MI.eraseFromParent();
1331 return;
1333 case AMDGPU::G_AND:
1334 case AMDGPU::G_OR:
1335 case AMDGPU::G_XOR: {
1336 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
1337 // there is a VGPR input.
1338 Register DstReg = MI.getOperand(0).getReg();
1339 LLT DstTy = MRI.getType(DstReg);
1340 if (DstTy.getSizeInBits() != 64)
1341 break;
1343 LLT HalfTy = getHalfSizedType(DstTy);
1344 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1345 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
1346 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1348 // All inputs are SGPRs, nothing special to do.
1349 if (DefRegs.empty()) {
1350 assert(Src0Regs.empty() && Src1Regs.empty());
1351 break;
1354 assert(DefRegs.size() == 2);
1355 assert(Src0Regs.size() == Src1Regs.size() &&
1356 (Src0Regs.empty() || Src0Regs.size() == 2));
1358 // Depending on where the source registers came from, the generic code may
1359 // have decided to split the inputs already or not. If not, we still need to
1360 // extract the values.
1361 MachineIRBuilder B(MI);
1363 if (Src0Regs.empty())
1364 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
1365 else
1366 setRegsToType(MRI, Src0Regs, HalfTy);
1368 if (Src1Regs.empty())
1369 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1370 else
1371 setRegsToType(MRI, Src1Regs, HalfTy);
1373 setRegsToType(MRI, DefRegs, HalfTy);
1375 B.buildInstr(Opc)
1376 .addDef(DefRegs[0])
1377 .addUse(Src0Regs[0])
1378 .addUse(Src1Regs[0]);
1380 B.buildInstr(Opc)
1381 .addDef(DefRegs[1])
1382 .addUse(Src0Regs[1])
1383 .addUse(Src1Regs[1]);
1385 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
1386 MI.eraseFromParent();
1387 return;
1389 case AMDGPU::G_ADD:
1390 case AMDGPU::G_SUB:
1391 case AMDGPU::G_MUL: {
1392 Register DstReg = MI.getOperand(0).getReg();
1393 LLT DstTy = MRI.getType(DstReg);
1394 if (DstTy != LLT::scalar(16))
1395 break;
1397 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
1398 if (DstBank == &AMDGPU::VGPRRegBank)
1399 break;
1401 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
1402 MachineFunction *MF = MI.getParent()->getParent();
1403 MachineIRBuilder B(MI);
1404 ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
1405 GISelObserverWrapper Observer(&ApplySALU);
1406 LegalizerHelper Helper(*MF, Observer, B);
1408 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
1409 LegalizerHelper::Legalized)
1410 llvm_unreachable("widen scalar should have succeeded");
1411 return;
1413 case AMDGPU::G_SMIN:
1414 case AMDGPU::G_SMAX:
1415 case AMDGPU::G_UMIN:
1416 case AMDGPU::G_UMAX: {
1417 Register DstReg = MI.getOperand(0).getReg();
1418 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
1419 if (DstBank == &AMDGPU::VGPRRegBank)
1420 break;
1422 MachineFunction *MF = MI.getParent()->getParent();
1423 MachineIRBuilder B(MI);
1424 ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
1425 GISelObserverWrapper Observer(&ApplySALU);
1426 LegalizerHelper Helper(*MF, Observer, B);
1428 // Turn scalar min/max into a compare and select.
1429 LLT Ty = MRI.getType(DstReg);
1430 LLT S32 = LLT::scalar(32);
1431 LLT S16 = LLT::scalar(16);
1433 if (Ty == S16) {
1434 // Need to widen to s32, and expand as cmp + select.
1435 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
1436 llvm_unreachable("widenScalar should have succeeded");
1438 // FIXME: This is relying on widenScalar leaving MI in place.
1439 if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized)
1440 llvm_unreachable("lower should have succeeded");
1441 } else {
1442 if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized)
1443 llvm_unreachable("lower should have succeeded");
1446 return;
1448 case AMDGPU::G_SEXT:
1449 case AMDGPU::G_ZEXT: {
1450 Register SrcReg = MI.getOperand(1).getReg();
1451 LLT SrcTy = MRI.getType(SrcReg);
1452 bool Signed = Opc == AMDGPU::G_SEXT;
1454 MachineIRBuilder B(MI);
1455 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
1457 Register DstReg = MI.getOperand(0).getReg();
1458 LLT DstTy = MRI.getType(DstReg);
1459 if (DstTy.isScalar() &&
1460 SrcBank != &AMDGPU::SGPRRegBank &&
1461 SrcBank != &AMDGPU::SCCRegBank &&
1462 SrcBank != &AMDGPU::VCCRegBank &&
1463 // FIXME: Should handle any type that rounds to s64 when irregular
1464 // breakdowns are supported.
1465 DstTy.getSizeInBits() == 64 &&
1466 SrcTy.getSizeInBits() <= 32) {
1467 const LLT S32 = LLT::scalar(32);
1468 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1470 // Extend to 32-bit, and then extend the low half.
1471 if (Signed) {
1472 // TODO: Should really be buildSExtOrCopy
1473 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
1475 // Replicate sign bit from 32-bit extended part.
1476 auto ShiftAmt = B.buildConstant(S32, 31);
1477 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
1478 B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
1479 } else {
1480 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
1481 B.buildConstant(DefRegs[1], 0);
1484 MRI.setRegBank(DstReg, *SrcBank);
1485 MI.eraseFromParent();
1486 return;
1489 if (SrcTy != LLT::scalar(1))
1490 return;
1492 if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) {
1493 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1495 const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ?
1496 &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank;
1498 unsigned DstSize = DstTy.getSizeInBits();
1499 // 64-bit select is SGPR only
1500 const bool UseSel64 = DstSize > 32 &&
1501 SrcBank->getID() == AMDGPU::SCCRegBankID;
1503 // TODO: Should s16 select be legal?
1504 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
1505 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
1506 auto False = B.buildConstant(SelType, 0);
1508 MRI.setRegBank(True.getReg(0), *DstBank);
1509 MRI.setRegBank(False.getReg(0), *DstBank);
1510 MRI.setRegBank(DstReg, *DstBank);
1512 if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) {
1513 B.buildSelect(DefRegs[0], SrcReg, True, False);
1514 B.buildCopy(DefRegs[1], DefRegs[0]);
1515 } else if (DstSize < 32) {
1516 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
1517 MRI.setRegBank(Sel.getReg(0), *DstBank);
1518 B.buildTrunc(DstReg, Sel);
1519 } else {
1520 B.buildSelect(DstReg, SrcReg, True, False);
1523 MI.eraseFromParent();
1524 return;
1527 // Fixup the case with an s1 src that isn't a condition register. Use shifts
1528 // instead of introducing a compare to avoid an unnecessary condition
1529 // register (and since there are no scalar 16-bit compares).
1530 auto Ext = B.buildAnyExt(DstTy, SrcReg);
1531 auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
1532 auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);
1534 if (MI.getOpcode() == AMDGPU::G_SEXT)
1535 B.buildAShr(DstReg, Shl, ShiftAmt);
1536 else
1537 B.buildLShr(DstReg, Shl, ShiftAmt);
1539 MRI.setRegBank(DstReg, *SrcBank);
1540 MRI.setRegBank(Ext.getReg(0), *SrcBank);
1541 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
1542 MRI.setRegBank(Shl.getReg(0), *SrcBank);
1543 MI.eraseFromParent();
1544 return;
1546 case AMDGPU::G_BUILD_VECTOR:
1547 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
1548 Register DstReg = MI.getOperand(0).getReg();
1549 LLT DstTy = MRI.getType(DstReg);
1550 if (DstTy != LLT::vector(2, 16))
1551 break;
1553 assert(MI.getNumOperands() == 3 && empty(OpdMapper.getVRegs(0)));
1554 substituteSimpleCopyRegs(OpdMapper, 1);
1555 substituteSimpleCopyRegs(OpdMapper, 2);
1557 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
1558 if (DstBank == &AMDGPU::SGPRRegBank)
1559 break; // Can use S_PACK_* instructions.
1561 MachineIRBuilder B(MI);
1563 Register Lo = MI.getOperand(1).getReg();
1564 Register Hi = MI.getOperand(2).getReg();
1565 const LLT S32 = LLT::scalar(32);
1567 const RegisterBank *BankLo = getRegBank(Lo, MRI, *TRI);
1568 const RegisterBank *BankHi = getRegBank(Hi, MRI, *TRI);
1570 Register ZextLo;
1571 Register ShiftHi;
1573 if (Opc == AMDGPU::G_BUILD_VECTOR) {
1574 ZextLo = B.buildZExt(S32, Lo).getReg(0);
1575 MRI.setRegBank(ZextLo, *BankLo);
1577 Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
1578 MRI.setRegBank(ZextHi, *BankHi);
1580 auto ShiftAmt = B.buildConstant(S32, 16);
1581 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
1583 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
1584 MRI.setRegBank(ShiftHi, *BankHi);
1585 } else {
1586 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
1587 MRI.setRegBank(MaskLo, *BankLo);
1589 auto ShiftAmt = B.buildConstant(S32, 16);
1590 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
1592 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
1593 MRI.setRegBank(ShiftHi, *BankHi);
1595 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
1596 MRI.setRegBank(ZextLo, *BankLo);
1599 auto Or = B.buildOr(S32, ZextLo, ShiftHi);
1600 MRI.setRegBank(Or.getReg(0), *DstBank);
1602 B.buildBitcast(DstReg, Or);
1603 MI.eraseFromParent();
1604 return;
1606 case AMDGPU::G_EXTRACT_VECTOR_ELT:
1607 applyDefaultMapping(OpdMapper);
1608 executeInWaterfallLoop(MI, MRI, { 2 });
1609 return;
1610 case AMDGPU::G_INTRINSIC: {
1611 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1612 case Intrinsic::amdgcn_s_buffer_load: {
1613 // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
1614 executeInWaterfallLoop(MI, MRI, { 2, 3 });
1615 return;
1617 case Intrinsic::amdgcn_readlane: {
1618 substituteSimpleCopyRegs(OpdMapper, 2);
1620 assert(empty(OpdMapper.getVRegs(0)));
1621 assert(empty(OpdMapper.getVRegs(3)));
1623 // Make sure the index is an SGPR. It doesn't make sense to run this in a
1624 // waterfall loop, so assume it's a uniform value.
1625 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
1626 return;
1628 case Intrinsic::amdgcn_writelane: {
1629 assert(empty(OpdMapper.getVRegs(0)));
1630 assert(empty(OpdMapper.getVRegs(2)));
1631 assert(empty(OpdMapper.getVRegs(3)));
1633 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
1634 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
1635 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
1636 return;
1638 default:
1639 break;
1641 break;
1643 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
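// Several of these intrinsics have operands that must end up in SGPRs (rsrc
// descriptors, soffset, M0 values). When such an operand was assigned a
// VGPR, either wrap the instruction in a waterfall loop over that operand
// index, or force the value uniform with a readfirstlane, depending on the
// intrinsic.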
1644 auto IntrID = MI.getIntrinsicID();
1645 switch (IntrID) {
1646 case Intrinsic::amdgcn_buffer_load: {
1647 executeInWaterfallLoop(MI, MRI, { 2 });
1648 return;
1650 case Intrinsic::amdgcn_ds_ordered_add:
1651 case Intrinsic::amdgcn_ds_ordered_swap: {
1652 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
1653 assert(empty(OpdMapper.getVRegs(0)));
1654 substituteSimpleCopyRegs(OpdMapper, 3);
1655 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
1656 return;
1658 case Intrinsic::amdgcn_s_sendmsg:
1659 case Intrinsic::amdgcn_s_sendmsghalt: {
1660 // FIXME: Should this use a waterfall loop?
1661 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
1662 return;
1664 case Intrinsic::amdgcn_raw_buffer_load:
1665 case Intrinsic::amdgcn_raw_buffer_load_format:
1666 case Intrinsic::amdgcn_raw_tbuffer_load:
1667 case Intrinsic::amdgcn_raw_buffer_store:
1668 case Intrinsic::amdgcn_raw_buffer_store_format:
1669 case Intrinsic::amdgcn_raw_tbuffer_store: {
1670 applyDefaultMapping(OpdMapper);
1671 executeInWaterfallLoop(MI, MRI, {2, 4});
1672 return;
1674 case Intrinsic::amdgcn_struct_buffer_load:
1675 case Intrinsic::amdgcn_struct_buffer_store:
1676 case Intrinsic::amdgcn_struct_tbuffer_load:
1677 case Intrinsic::amdgcn_struct_tbuffer_store: {
1678 applyDefaultMapping(OpdMapper);
1679 executeInWaterfallLoop(MI, MRI, {2, 5});
1680 return;
1682 default: {
1683 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1684 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
1685 // Non-images can have complications from operands that allow both SGPR
1686 // and VGPR. For now it's too complicated to figure out the final opcode
1687 // to derive the register bank from the MCInstrDesc.
1688 if (RSrcIntrin->IsImage) {
1689 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
1690 return;
1694 break;
1697 break;
1699 case AMDGPU::G_LOAD:
1700 case AMDGPU::G_ZEXTLOAD:
1701 case AMDGPU::G_SEXTLOAD: {
1702 if (applyMappingWideLoad(MI, OpdMapper, MRI))
1703 return;
1704 break;
1706 default:
1707 break;
1710 return applyDefaultMapping(OpdMapper);
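// Returns true if no register operand of MI has been assigned to the VGPR
// bank, i.e. the instruction can be mapped entirely onto the scalar unit.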
1713 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
1714 const MachineFunction &MF = *MI.getParent()->getParent();
1715 const MachineRegisterInfo &MRI = MF.getRegInfo();
1716 for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
1717 if (!MI.getOperand(i).isReg())
1718 continue;
1719 Register Reg = MI.getOperand(i).getReg();
1720 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
1721 if (Bank->getID() == AMDGPU::VGPRRegBankID)
1722 return false;
1724 assert(Bank->getID() == AMDGPU::SGPRRegBankID ||
1725 Bank->getID() == AMDGPU::SCCRegBankID);
1728 return true;
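// Default mapping for instructions that run on the SALU: every operand goes
// in the SGPR bank, with 1-bit values mapped to SCC.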
1731 const RegisterBankInfo::InstructionMapping &
1732 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
1733 const MachineFunction &MF = *MI.getParent()->getParent();
1734 const MachineRegisterInfo &MRI = MF.getRegInfo();
1735 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1737 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
1738 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
1739 unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
1740 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
1742 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1743 MI.getNumOperands());
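// Default mapping for instructions that run on the VALU: the result is a
// VGPR, 1-bit values use the VCC bank, and the intrinsic ID operand (if any)
// is left unmapped.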
1746 const RegisterBankInfo::InstructionMapping &
1747 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
1748 const MachineFunction &MF = *MI.getParent()->getParent();
1749 const MachineRegisterInfo &MRI = MF.getRegInfo();
1750 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1751 unsigned OpdIdx = 0;
1753 unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1754 OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
1756 if (MI.getOperand(OpdIdx).isIntrinsicID())
1757 OpdsMapping[OpdIdx++] = nullptr;
1759 Register Reg1 = MI.getOperand(OpdIdx).getReg();
1760 unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
1762 unsigned DefaultBankID = Size1 == 1 ?
1763 AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
1764 unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);
1766 OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
1768 for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
1769 const MachineOperand &MO = MI.getOperand(OpdIdx);
1770 if (!MO.isReg())
1771 continue;
1773 unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
1774 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
1775 OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
1778 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1779 MI.getNumOperands());
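// Map every register operand to the VGPR bank regardless of its current
// assignment; typically used for memory and DS operations that have no
// scalar form.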
1782 const RegisterBankInfo::InstructionMapping &
1783 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
1784 const MachineFunction &MF = *MI.getParent()->getParent();
1785 const MachineRegisterInfo &MRI = MF.getRegInfo();
1786 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1788 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1789 const MachineOperand &Op = MI.getOperand(I);
1790 if (!Op.isReg())
1791 continue;
1793 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
1794 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
1797 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1798 MI.getNumOperands());
1801 const RegisterBankInfo::InstructionMapping &
1802 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
1803 const MachineInstr &MI,
1804 int RsrcIdx) const {
1805 // The reported argument index is relative to the IR intrinsic call arguments,
1806 // so we need to shift by the number of defs and the intrinsic ID.
1807 RsrcIdx += MI.getNumExplicitDefs() + 1;
1809 const int NumOps = MI.getNumOperands();
1810 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
1812 // TODO: Should packed/unpacked D16 difference be reported here as part of
1813 // the value mapping?
1814 for (int I = 0; I != NumOps; ++I) {
1815 if (!MI.getOperand(I).isReg())
1816 continue;
1818 Register OpReg = MI.getOperand(I).getReg();
1819 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
1821 // FIXME: Probably need a new intrinsic register bank searchable table to
1822 // handle arbitrary intrinsics easily.
1824 // If this has a sampler, it immediately follows rsrc.
1825 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
1827 if (MustBeSGPR) {
1828 // This must be an SGPR, but report whatever bank it currently has as legal.
1829 unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
1830 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
1831 } else {
1832 // Some operands must be VGPR, and these are easy to copy to.
1833 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
1837 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
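// Choose between a scalar (SMRD) and a vector load mapping. Uniform,
// sufficiently aligned, non-extending loads outside the LDS/region address
// spaces can keep both the value and the pointer in SGPRs; everything else
// is mapped to VGPRs.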
1840 const RegisterBankInfo::InstructionMapping &
1841 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
1843 const MachineFunction &MF = *MI.getParent()->getParent();
1844 const MachineRegisterInfo &MRI = MF.getRegInfo();
1845 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1846 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1847 LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
1848 Register PtrReg = MI.getOperand(1).getReg();
1849 LLT PtrTy = MRI.getType(PtrReg);
1850 unsigned AS = PtrTy.getAddressSpace();
1851 unsigned PtrSize = PtrTy.getSizeInBits();
1853 const ValueMapping *ValMapping;
1854 const ValueMapping *PtrMapping;
1856 if (isInstrUniformNonExtLoadAlign4(MI) &&
1857 (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)) {
1858 // We have a uniform instruction, so we want to use an SMRD load.
1859 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
1860 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
1861 } else {
1862 ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
1863 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
1866 OpdsMapping[0] = ValMapping;
1867 OpdsMapping[1] = PtrMapping;
1868 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
1869 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
1870 return Mapping;
1872 // FIXME: Do we want to add a mapping for FLAT load, or should we just
1873 // handle that during instruction selection?
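// Return the ID of the bank currently assigned to Reg (or implied by its
// register class), or Default if it has neither.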
1876 unsigned
1877 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
1878 const MachineRegisterInfo &MRI,
1879 const TargetRegisterInfo &TRI,
1880 unsigned Default) const {
1882 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
1883 return Bank ? Bank->getID() : Default;
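// Two SGPR inputs combine to an SGPR result; any other combination is VGPR.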
1887 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
1888 return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
1889 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
1892 const RegisterBankInfo::ValueMapping *
1893 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
1894 const MachineRegisterInfo &MRI,
1895 const TargetRegisterInfo &TRI) const {
1896 // Lie and claim anything is legal, even though this needs to be an SGPR;
1897 // applyMapping will have to deal with it as a waterfall loop.
1898 unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
1899 unsigned Size = getSizeInBits(Reg, MRI, TRI);
1900 return AMDGPU::getValueMapping(Bank, Size);
1903 const RegisterBankInfo::ValueMapping *
1904 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
1905 const MachineRegisterInfo &MRI,
1906 const TargetRegisterInfo &TRI) const {
1907 unsigned Size = getSizeInBits(Reg, MRI, TRI);
1908 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
1912 /// This function must return a legal mapping, because
1913 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
1914 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
1915 /// VGPR to SGPR copy to be generated is illegal.
1917 const RegisterBankInfo::InstructionMapping &
1918 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
1919 const MachineFunction &MF = *MI.getParent()->getParent();
1920 const MachineRegisterInfo &MRI = MF.getRegInfo();
1922 if (MI.isRegSequence()) {
1923 // If any input is a VGPR, the result must be a VGPR. The default handling
1924 // assumes any copy between banks is legal.
1925 unsigned BankID = AMDGPU::SGPRRegBankID;
1927 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
1928 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
1929 // It doesn't make sense to use vcc or scc banks here, so just ignore
1930 // them.
1931 if (OpBank != AMDGPU::SGPRRegBankID) {
1932 BankID = AMDGPU::VGPRRegBankID;
1933 break;
1936 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1938 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
1939 return getInstructionMapping(
1940 1, /*Cost*/ 1,
1941 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
1944 // The default handling is broken and doesn't handle illegal VGPR->SGPR copies
1945 // properly.
1947 // TODO: There are additional exec masking dependencies to analyze.
1948 if (MI.getOpcode() == TargetOpcode::G_PHI) {
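// Combine the incoming banks into a result bank, roughly:
//   all SGPR/SCC inputs            -> SGPR result
//   all VCC inputs                 -> VCC result
//   VCC mixed with SGPR            -> VGPR result
//   any VGPR or unassigned input   -> VGPR result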
1949 // TODO: Generate proper invalid bank enum.
1950 int ResultBank = -1;
1952 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
1953 Register Reg = MI.getOperand(I).getReg();
1954 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1956 // FIXME: Assuming VGPR for any undetermined inputs.
1957 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
1958 ResultBank = AMDGPU::VGPRRegBankID;
1959 break;
1962 unsigned OpBank = Bank->getID();
1963 // scc, scc -> sgpr
1964 if (OpBank == AMDGPU::SCCRegBankID) {
1965 // There's only one SCC register, so a phi requires copying to SGPR.
1966 OpBank = AMDGPU::SGPRRegBankID;
1967 } else if (OpBank == AMDGPU::VCCRegBankID) {
1968 // vcc, vcc -> vcc
1969 // vcc, sgpr -> vgpr
1970 if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
1971 ResultBank = AMDGPU::VGPRRegBankID;
1972 break;
1976 ResultBank = OpBank;
1979 assert(ResultBank != -1);
1981 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1983 const ValueMapping &ValMap =
1984 getValueMapping(0, Size, getRegBank(ResultBank));
1985 return getInstructionMapping(
1986 1, /*Cost*/ 1,
1987 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
1990 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
1991 if (Mapping.isValid())
1992 return Mapping;
1994 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1996 switch (MI.getOpcode()) {
1997 default:
1998 return getInvalidInstructionMapping();
2000 case AMDGPU::G_AND:
2001 case AMDGPU::G_OR:
2002 case AMDGPU::G_XOR: {
2003 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2004 if (Size == 1) {
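// For 1-bit logical ops the mapping follows the condition banks: a VGPR
// input forces a VGPR mapping, a VCC input (or destination) forces VCC, and
// purely scalar boolean inputs stay on SCC/SGPR.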
2005 const RegisterBank *DstBank
2006 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
2008 unsigned TargetBankID = -1;
2009 unsigned BankLHS = -1;
2010 unsigned BankRHS = -1;
2011 if (DstBank) {
2012 TargetBankID = DstBank->getID();
2013 if (DstBank == &AMDGPU::VCCRegBank) {
2014 TargetBankID = AMDGPU::VCCRegBankID;
2015 BankLHS = AMDGPU::VCCRegBankID;
2016 BankRHS = AMDGPU::VCCRegBankID;
2017 } else if (DstBank == &AMDGPU::SCCRegBank) {
2018 TargetBankID = AMDGPU::SCCRegBankID;
2019 BankLHS = AMDGPU::SGPRRegBankID;
2020 BankRHS = AMDGPU::SGPRRegBankID;
2021 } else {
2022 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2023 AMDGPU::SGPRRegBankID);
2024 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2025 AMDGPU::SGPRRegBankID);
2027 } else {
2028 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2029 AMDGPU::VCCRegBankID);
2030 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2031 AMDGPU::VCCRegBankID);
2033 // Both inputs should be true booleans to produce a boolean result.
2034 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
2035 TargetBankID = AMDGPU::VGPRRegBankID;
2036 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
2037 TargetBankID = AMDGPU::VCCRegBankID;
2038 BankLHS = AMDGPU::VCCRegBankID;
2039 BankRHS = AMDGPU::VCCRegBankID;
2040 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
2041 TargetBankID = AMDGPU::SGPRRegBankID;
2042 } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
2043 // The operation must be done on a 32-bit register, but it will set
2044 // scc. The result type could interchangeably be SCC or SGPR, since
2045 // both values will be produced.
2046 TargetBankID = AMDGPU::SCCRegBankID;
2047 BankLHS = AMDGPU::SGPRRegBankID;
2048 BankRHS = AMDGPU::SGPRRegBankID;
2052 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
2053 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
2054 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
2055 break;
2058 if (Size == 64) {
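// 64-bit logical ops can be a single SALU instruction, but on the VALU they
// have to be split into two 32-bit halves, which is what the
// getValueMappingSGPR64Only mapping is intended to express.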
2060 if (isSALUMapping(MI)) {
2061 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
2062 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
2063 } else {
2064 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
2065 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
2066 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
2068 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
2069 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
2072 break;
2075 LLVM_FALLTHROUGH;
2077 case AMDGPU::G_GEP:
2078 case AMDGPU::G_ADD:
2079 case AMDGPU::G_SUB:
2080 case AMDGPU::G_MUL:
2081 case AMDGPU::G_SHL:
2082 case AMDGPU::G_LSHR:
2083 case AMDGPU::G_ASHR:
2084 case AMDGPU::G_UADDO:
2085 case AMDGPU::G_SADDO:
2086 case AMDGPU::G_USUBO:
2087 case AMDGPU::G_SSUBO:
2088 case AMDGPU::G_UADDE:
2089 case AMDGPU::G_SADDE:
2090 case AMDGPU::G_USUBE:
2091 case AMDGPU::G_SSUBE:
2092 case AMDGPU::G_SMIN:
2093 case AMDGPU::G_SMAX:
2094 case AMDGPU::G_UMIN:
2095 case AMDGPU::G_UMAX:
2096 if (isSALUMapping(MI))
2097 return getDefaultMappingSOP(MI);
2098 LLVM_FALLTHROUGH;
2100 case AMDGPU::G_FADD:
2101 case AMDGPU::G_FSUB:
2102 case AMDGPU::G_FPTOSI:
2103 case AMDGPU::G_FPTOUI:
2104 case AMDGPU::G_FMUL:
2105 case AMDGPU::G_FMA:
2106 case AMDGPU::G_FMAD:
2107 case AMDGPU::G_FSQRT:
2108 case AMDGPU::G_FFLOOR:
2109 case AMDGPU::G_FCEIL:
2110 case AMDGPU::G_FRINT:
2111 case AMDGPU::G_SITOFP:
2112 case AMDGPU::G_UITOFP:
2113 case AMDGPU::G_FPTRUNC:
2114 case AMDGPU::G_FPEXT:
2115 case AMDGPU::G_FEXP2:
2116 case AMDGPU::G_FLOG2:
2117 case AMDGPU::G_FMINNUM:
2118 case AMDGPU::G_FMAXNUM:
2119 case AMDGPU::G_FMINNUM_IEEE:
2120 case AMDGPU::G_FMAXNUM_IEEE:
2121 case AMDGPU::G_FCANONICALIZE:
2122 case AMDGPU::G_INTRINSIC_TRUNC:
2123 case AMDGPU::G_INTRINSIC_ROUND:
2124 return getDefaultMappingVOP(MI);
2125 case AMDGPU::G_UMULH:
2126 case AMDGPU::G_SMULH: {
2127 if (MF.getSubtarget<GCNSubtarget>().hasScalarMulHiInsts() &&
2128 isSALUMapping(MI))
2129 return getDefaultMappingSOP(MI);
2130 return getDefaultMappingVOP(MI);
2132 case AMDGPU::G_IMPLICIT_DEF: {
2133 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2134 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2135 break;
2137 case AMDGPU::G_FCONSTANT:
2138 case AMDGPU::G_CONSTANT:
2139 case AMDGPU::G_FRAME_INDEX:
2140 case AMDGPU::G_GLOBAL_VALUE:
2141 case AMDGPU::G_BLOCK_ADDR: {
2142 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2143 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2144 break;
2146 case AMDGPU::G_INSERT: {
2147 unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
2148 AMDGPU::VGPRRegBankID;
2149 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2150 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2151 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
2152 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
2153 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
2154 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
2155 OpdsMapping[3] = nullptr;
2156 break;
2158 case AMDGPU::G_EXTRACT: {
2159 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2160 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2161 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2162 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
2163 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
2164 OpdsMapping[2] = nullptr;
2165 break;
2167 case AMDGPU::G_BUILD_VECTOR:
2168 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2169 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
2170 if (DstTy == LLT::vector(2, 16)) {
2171 unsigned DstSize = DstTy.getSizeInBits();
2172 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2173 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2174 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2175 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
2177 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
2178 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
2179 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
2180 break;
2183 LLVM_FALLTHROUGH;
2185 case AMDGPU::G_MERGE_VALUES:
2186 case AMDGPU::G_CONCAT_VECTORS: {
2187 unsigned Bank = isSALUMapping(MI) ?
2188 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2189 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2190 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2192 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
2193 // Op1 and Dst should use the same register bank.
2194 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
2195 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
2196 break;
2198 case AMDGPU::G_BITCAST:
2199 case AMDGPU::G_INTTOPTR:
2200 case AMDGPU::G_PTRTOINT:
2201 case AMDGPU::G_CTLZ:
2202 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2203 case AMDGPU::G_CTTZ:
2204 case AMDGPU::G_CTTZ_ZERO_UNDEF:
2205 case AMDGPU::G_CTPOP:
2206 case AMDGPU::G_BSWAP:
2207 case AMDGPU::G_BITREVERSE:
2208 case AMDGPU::G_FABS:
2209 case AMDGPU::G_FNEG: {
2210 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2211 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2212 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
2213 break;
2215 case AMDGPU::G_TRUNC: {
2216 Register Dst = MI.getOperand(0).getReg();
2217 Register Src = MI.getOperand(1).getReg();
2218 unsigned Bank = getRegBankID(Src, MRI, *TRI);
2219 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
2220 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
2221 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
2222 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
2223 break;
2225 case AMDGPU::G_ZEXT:
2226 case AMDGPU::G_SEXT:
2227 case AMDGPU::G_ANYEXT: {
2228 Register Dst = MI.getOperand(0).getReg();
2229 Register Src = MI.getOperand(1).getReg();
2230 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
2231 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
2233 unsigned DstBank;
2234 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
2235 assert(SrcBank);
2236 switch (SrcBank->getID()) {
2237 case AMDGPU::SCCRegBankID:
2238 case AMDGPU::SGPRRegBankID:
2239 DstBank = AMDGPU::SGPRRegBankID;
2240 break;
2241 default:
2242 DstBank = AMDGPU::VGPRRegBankID;
2243 break;
2246 // TODO: Should anyext be split into a 32-bit part as well?
2247 if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
2248 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
2249 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
2250 } else {
2251 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
2252 // 32 bits, and then to 64.
2253 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
2254 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
2255 SrcSize);
2257 break;
2259 case AMDGPU::G_FCMP: {
2260 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2261 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2262 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2263 OpdsMapping[1] = nullptr; // Predicate Operand.
2264 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
2265 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2266 break;
2268 case AMDGPU::G_STORE: {
2269 assert(MI.getOperand(0).isReg());
2270 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2271 // FIXME: We need to specify a different reg bank once scalar stores
2272 // are supported.
2273 const ValueMapping *ValMapping =
2274 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2275 // FIXME: Depending on the type of store, the pointer could be in
2276 // the SGPR Reg bank.
2277 // FIXME: Pointer size should be based on the address space.
2278 const ValueMapping *PtrMapping =
2279 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
2281 OpdsMapping[0] = ValMapping;
2282 OpdsMapping[1] = PtrMapping;
2283 break;
2286 case AMDGPU::G_ICMP: {
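// A compare can run on the SALU (producing SCC) only when both sources are
// SGPRs and the type is 32-bit, or 64-bit for eq/ne on subtargets with
// scalar 64-bit compares; anything else produces a VCC lane mask.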
2287 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
2288 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2289 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2290 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2292 bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
2293 Op3Bank == AMDGPU::SGPRRegBankID &&
2294 (Size == 32 || (Size == 64 &&
2295 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
2296 MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64()));
2298 unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2300 OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
2301 OpdsMapping[1] = nullptr; // Predicate Operand.
2302 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
2303 OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
2304 break;
2306 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2307 unsigned OutputBankID = isSALUMapping(MI) ?
2308 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2309 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2310 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2311 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2313 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
2314 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
2316 // The index can be in either bank if the source vector is a VGPR.
2317 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2318 break;
2320 case AMDGPU::G_INSERT_VECTOR_ELT: {
2321 unsigned OutputBankID = isSALUMapping(MI) ?
2322 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2324 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2325 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2326 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
2327 unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2328 unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2330 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
2331 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
2332 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize);
2334 // The index can be in either bank if the source vector is a VGPR.
2335 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2336 break;
2338 case AMDGPU::G_UNMERGE_VALUES: {
2339 unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
2340 AMDGPU::VGPRRegBankID;
2342 // Op1 and Dst should use the same register bank.
2343 // FIXME: Shouldn't this be the default? Why do we need to handle this?
2344 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2345 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
2346 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
2348 break;
2350 case AMDGPU::G_INTRINSIC: {
2351 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
2352 default:
2353 return getInvalidInstructionMapping();
2354 case Intrinsic::amdgcn_div_fmas:
2355 case Intrinsic::amdgcn_trig_preop:
2356 case Intrinsic::amdgcn_sin:
2357 case Intrinsic::amdgcn_cos:
2358 case Intrinsic::amdgcn_log_clamp:
2359 case Intrinsic::amdgcn_rcp:
2360 case Intrinsic::amdgcn_rcp_legacy:
2361 case Intrinsic::amdgcn_rsq:
2362 case Intrinsic::amdgcn_rsq_legacy:
2363 case Intrinsic::amdgcn_rsq_clamp:
2364 case Intrinsic::amdgcn_ldexp:
2365 case Intrinsic::amdgcn_frexp_mant:
2366 case Intrinsic::amdgcn_frexp_exp:
2367 case Intrinsic::amdgcn_fract:
2368 case Intrinsic::amdgcn_cvt_pkrtz:
2369 case Intrinsic::amdgcn_cvt_pknorm_i16:
2370 case Intrinsic::amdgcn_cvt_pknorm_u16:
2371 case Intrinsic::amdgcn_cvt_pk_i16:
2372 case Intrinsic::amdgcn_cvt_pk_u16:
2373 case Intrinsic::amdgcn_fmed3:
2374 case Intrinsic::amdgcn_cubeid:
2375 case Intrinsic::amdgcn_cubema:
2376 case Intrinsic::amdgcn_cubesc:
2377 case Intrinsic::amdgcn_cubetc:
2378 case Intrinsic::amdgcn_sffbh:
2379 case Intrinsic::amdgcn_fmad_ftz:
2380 case Intrinsic::amdgcn_mbcnt_lo:
2381 case Intrinsic::amdgcn_mbcnt_hi:
2382 case Intrinsic::amdgcn_ubfe:
2383 case Intrinsic::amdgcn_sbfe:
2384 case Intrinsic::amdgcn_lerp:
2385 case Intrinsic::amdgcn_sad_u8:
2386 case Intrinsic::amdgcn_msad_u8:
2387 case Intrinsic::amdgcn_sad_hi_u8:
2388 case Intrinsic::amdgcn_sad_u16:
2389 case Intrinsic::amdgcn_qsad_pk_u16_u8:
2390 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
2391 case Intrinsic::amdgcn_mqsad_u32_u8:
2392 case Intrinsic::amdgcn_cvt_pk_u8_f32:
2393 case Intrinsic::amdgcn_alignbit:
2394 case Intrinsic::amdgcn_alignbyte:
2395 case Intrinsic::amdgcn_fdot2:
2396 case Intrinsic::amdgcn_sdot2:
2397 case Intrinsic::amdgcn_udot2:
2398 case Intrinsic::amdgcn_sdot4:
2399 case Intrinsic::amdgcn_udot4:
2400 case Intrinsic::amdgcn_sdot8:
2401 case Intrinsic::amdgcn_udot8:
2402 case Intrinsic::amdgcn_wwm:
2403 case Intrinsic::amdgcn_wqm:
2404 return getDefaultMappingVOP(MI);
2405 case Intrinsic::amdgcn_ds_swizzle:
2406 case Intrinsic::amdgcn_ds_permute:
2407 case Intrinsic::amdgcn_ds_bpermute:
2408 case Intrinsic::amdgcn_update_dpp:
2409 return getDefaultMappingAllVGPR(MI);
2410 case Intrinsic::amdgcn_kernarg_segment_ptr:
2411 case Intrinsic::amdgcn_s_getpc:
2412 case Intrinsic::amdgcn_groupstaticsize: {
2413 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2414 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2415 break;
2417 case Intrinsic::amdgcn_wqm_vote: {
2418 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2419 OpdsMapping[0] = OpdsMapping[2]
2420 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
2421 break;
2423 case Intrinsic::amdgcn_s_buffer_load: {
2424 // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
2425 Register RSrc = MI.getOperand(2).getReg(); // SGPR
2426 Register Offset = MI.getOperand(3).getReg(); // SGPR/imm
2428 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2429 unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
2430 unsigned Size3 = MRI.getType(Offset).getSizeInBits();
2432 unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
2433 unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
2435 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
2436 OpdsMapping[1] = nullptr; // intrinsic id
2438 // Lie and claim everything is legal, even though some need to be
2439 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
2440 OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
2441 OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
2442 OpdsMapping[4] = nullptr;
2443 break;
2445 case Intrinsic::amdgcn_div_scale: {
2446 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2447 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2448 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
2449 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
2451 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
2452 OpdsMapping[3] = AMDGPU::getValueMapping(
2453 getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
2454 OpdsMapping[4] = AMDGPU::getValueMapping(
2455 getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
2457 break;
2459 case Intrinsic::amdgcn_class: {
2460 Register Src0Reg = MI.getOperand(2).getReg();
2461 Register Src1Reg = MI.getOperand(3).getReg();
2462 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
2463 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
2464 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2465 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
2466 OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
2467 Src0Size);
2468 OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
2469 Src1Size);
2470 break;
2472 case Intrinsic::amdgcn_icmp:
2473 case Intrinsic::amdgcn_fcmp: {
2474 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2475 // This is not VCCRegBank because this is not used in boolean contexts.
2476 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
2477 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2478 unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2479 unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2480 OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
2481 OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
2482 break;
2484 case Intrinsic::amdgcn_readlane: {
2485 // This must be an SGPR, but accept a VGPR.
2486 Register IdxReg = MI.getOperand(3).getReg();
2487 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
2488 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2489 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2490 LLVM_FALLTHROUGH;
2492 case Intrinsic::amdgcn_readfirstlane: {
2493 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2494 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2495 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
2496 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
2497 break;
2499 case Intrinsic::amdgcn_writelane: {
2500 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2501 Register SrcReg = MI.getOperand(2).getReg();
2502 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
2503 unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2504 Register IdxReg = MI.getOperand(3).getReg();
2505 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
2506 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2507 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
2509 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
2510 // to legalize them.
2511 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
2512 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2513 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
2514 break;
2516 case Intrinsic::amdgcn_if_break: {
2517 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2518 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2519 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2520 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2521 break;
2524 break;
2526 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2527 auto IntrID = MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID();
2528 switch (IntrID) {
2529 case Intrinsic::amdgcn_s_getreg:
2530 case Intrinsic::amdgcn_s_memtime:
2531 case Intrinsic::amdgcn_s_memrealtime:
2532 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
2533 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2534 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2535 break;
2537 case Intrinsic::amdgcn_ds_append:
2538 case Intrinsic::amdgcn_ds_consume:
2539 case Intrinsic::amdgcn_ds_fadd:
2540 case Intrinsic::amdgcn_ds_fmin:
2541 case Intrinsic::amdgcn_ds_fmax:
2542 case Intrinsic::amdgcn_atomic_inc:
2543 case Intrinsic::amdgcn_atomic_dec:
2544 return getDefaultMappingAllVGPR(MI);
2545 case Intrinsic::amdgcn_ds_ordered_add:
2546 case Intrinsic::amdgcn_ds_ordered_swap: {
2547 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2548 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
2549 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2550 AMDGPU::SGPRRegBankID);
2551 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
2552 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2553 break;
2555 case Intrinsic::amdgcn_exp_compr:
2556 OpdsMapping[0] = nullptr; // IntrinsicID
2557 // FIXME: These are immediate values which can't be read from registers.
2558 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2559 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2560 // FIXME: Could we support packed types here?
2561 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2562 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2563 // FIXME: These are immediate values which can't be read from registers.
2564 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2565 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2566 break;
2567 case Intrinsic::amdgcn_exp:
2568 // FIXME: Could we support packed types here?
2569 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2570 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2571 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2572 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2573 break;
2574 case Intrinsic::amdgcn_buffer_load: {
2575 Register RSrc = MI.getOperand(2).getReg(); // SGPR
2576 Register VIndex = MI.getOperand(3).getReg(); // VGPR
2577 Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm
2579 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2580 unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
2581 unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
2582 unsigned Size4 = MRI.getType(Offset).getSizeInBits();
2584 unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
2585 unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
2587 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
2588 OpdsMapping[1] = nullptr; // intrinsic id
2590 // Lie and claim everything is legal, even though some need to be
2591 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
2592 OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
2593 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
2594 OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
2595 OpdsMapping[5] = nullptr;
2596 OpdsMapping[6] = nullptr;
2597 break;
2599 case Intrinsic::amdgcn_s_sendmsg:
2600 case Intrinsic::amdgcn_s_sendmsghalt: {
2601 // This must be an SGPR, but accept a VGPR.
2602 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2603 AMDGPU::SGPRRegBankID);
2604 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
2605 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
2606 break;
2608 case Intrinsic::amdgcn_end_cf:
2609 case Intrinsic::amdgcn_init_exec: {
2610 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2611 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2612 break;
2614 case Intrinsic::amdgcn_else: {
2615 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2616 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2617 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
2618 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
2619 break;
2621 case Intrinsic::amdgcn_kill: {
2622 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2623 break;
2625 case Intrinsic::amdgcn_raw_buffer_load:
2626 case Intrinsic::amdgcn_raw_tbuffer_load: {
2627 // FIXME: Should make intrinsic ID the last operand of the instruction,
2628 // then this would be the same as a store.
2629 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
2630 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2631 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2632 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2633 break;
2635 case Intrinsic::amdgcn_raw_buffer_store:
2636 case Intrinsic::amdgcn_raw_buffer_store_format:
2637 case Intrinsic::amdgcn_raw_tbuffer_store: {
2638 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
2639 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2640 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2641 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2642 break;
2644 case Intrinsic::amdgcn_struct_buffer_load:
2645 case Intrinsic::amdgcn_struct_tbuffer_load: {
2646 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
2647 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2648 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2649 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2650 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
2651 break;
2653 case Intrinsic::amdgcn_struct_buffer_store:
2654 case Intrinsic::amdgcn_struct_tbuffer_store: {
2655 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
2656 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2657 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2658 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2659 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
2660 break;
2662 case Intrinsic::amdgcn_init_exec_from_input: {
2663 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2664 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2665 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2666 break;
2668 default:
2669 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
2670 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
2671 // Non-images can have complications from operands that allow both SGPR
2672 // and VGPR. For now it's too complicated to figure out the final opcode
2673 // to derive the register bank from the MCInstrDesc.
2674 if (RSrcIntrin->IsImage)
2675 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
2678 return getInvalidInstructionMapping();
2680 break;
2682 case AMDGPU::G_SELECT: {
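// A select can stay on the scalar unit only when both values are SGPRs and
// the condition is (or can become) SCC; otherwise the condition becomes a
// VCC lane mask and the result is mapped to VGPRs, with 64-bit selects split
// into 32-bit halves on the VALU.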
2683 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2684 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2685 AMDGPU::SGPRRegBankID);
2686 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
2687 AMDGPU::SGPRRegBankID);
2688 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
2689 Op3Bank == AMDGPU::SGPRRegBankID;
2691 unsigned CondBankDefault = SGPRSrcs ?
2692 AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2693 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2694 CondBankDefault);
2695 if (CondBank == AMDGPU::SGPRRegBankID)
2696 CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2697 else if (CondBank == AMDGPU::VGPRRegBankID)
2698 CondBank = AMDGPU::VCCRegBankID;
2700 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ?
2701 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2703 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID);
2705 if (Size == 64) {
2706 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2707 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
2708 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2709 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2710 } else {
2711 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
2712 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
2713 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
2714 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
2717 break;
2720 case AMDGPU::G_LOAD:
2721 case AMDGPU::G_ZEXTLOAD:
2722 case AMDGPU::G_SEXTLOAD:
2723 return getInstrMappingForLoad(MI);
2725 case AMDGPU::G_ATOMICRMW_XCHG:
2726 case AMDGPU::G_ATOMICRMW_ADD:
2727 case AMDGPU::G_ATOMICRMW_SUB:
2728 case AMDGPU::G_ATOMICRMW_AND:
2729 case AMDGPU::G_ATOMICRMW_OR:
2730 case AMDGPU::G_ATOMICRMW_XOR:
2731 case AMDGPU::G_ATOMICRMW_MAX:
2732 case AMDGPU::G_ATOMICRMW_MIN:
2733 case AMDGPU::G_ATOMICRMW_UMAX:
2734 case AMDGPU::G_ATOMICRMW_UMIN:
2735 case AMDGPU::G_ATOMICRMW_FADD:
2736 case AMDGPU::G_ATOMIC_CMPXCHG: {
2737 return getDefaultMappingAllVGPR(MI);
2739 case AMDGPU::G_BRCOND: {
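// Only SCC can be used directly as a uniform branch condition; any other
// boolean is treated as a divergent VCC condition.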
2740 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
2741 AMDGPU::SGPRRegBankID);
2742 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
2743 if (Bank != AMDGPU::SCCRegBankID)
2744 Bank = AMDGPU::VCCRegBankID;
2746 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
2747 break;
2751 return getInstructionMapping(/*ID*/1, /*Cost*/1,
2752 getOperandsMapping(OpdsMapping),
2753 MI.getNumOperands());