lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (blob 0032d04686277f17173fe468d677cb6e9fbebf82)
1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPURegisterBankInfo.h"
15 #include "AMDGPUInstrInfo.h"
16 #include "AMDGPUSubtarget.h"
17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/ADT/SmallSet.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
25 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
26 #include "llvm/CodeGen/TargetRegisterInfo.h"
27 #include "llvm/CodeGen/TargetSubtargetInfo.h"
28 #include "llvm/IR/Constants.h"
30 #define GET_TARGET_REGBANK_IMPL
31 #include "AMDGPUGenRegisterBank.inc"
33 // This file will be TableGen'ed at some point.
34 #include "AMDGPUGenRegisterBankInfo.def"
36 using namespace llvm;
37 using namespace MIPatternMatch;
39 namespace {
41 // Observer to apply a register bank to new registers created by LegalizerHelper.
42 class ApplyRegBankMapping final : public GISelChangeObserver {
43 private:
44 MachineRegisterInfo &MRI;
45 const RegisterBank *NewBank;
46 SmallVector<MachineInstr *, 4> NewInsts;
48 public:
49 ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
50 : MRI(MRI_), NewBank(RB) {}
52 ~ApplyRegBankMapping() {
53 for (MachineInstr *MI : NewInsts)
54 applyBank(*MI);
57 /// Assign the new register bank to any registers that don't already have a register class or bank set.
58 void applyBank(MachineInstr &MI) {
59 for (MachineOperand &Op : MI.operands()) {
60 if (!Op.isReg())
61 continue;
63 Register Reg = Op.getReg();
64 if (MRI.getRegClassOrRegBank(Reg))
65 continue;
67 const RegisterBank *RB = NewBank;
68 // FIXME: This might not be enough to detect when SCC should be used.
69 if (MRI.getType(Reg) == LLT::scalar(1))
70 RB = (NewBank == &AMDGPU::SGPRRegBank ?
71 &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);
73 MRI.setRegBank(Reg, *RB);
77 void erasingInstr(MachineInstr &MI) override {}
79 void createdInstr(MachineInstr &MI) override {
80 // At this point, the instruction was just inserted and has no operands.
81 NewInsts.push_back(&MI);
84 void changingInstr(MachineInstr &MI) override {}
85 void changedInstr(MachineInstr &MI) override {}
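// A minimal usage sketch (mirroring applyMappingWideLoad and the 16-bit SALU
// promotion in applyMappingImpl below): wrap the observer and hand it to
// LegalizerHelper so every register the helper creates picks up the intended
// bank when the observer is destroyed.
//
//   ApplyRegBankMapping ApplyVALU(MRI, &AMDGPU::VGPRRegBank);
//   GISelObserverWrapper Observer(&ApplyVALU);
//   B.setChangeObserver(Observer);
//   LegalizerHelper Helper(B.getMF(), Observer, B);
//   // ... run a legalization action through Helper ...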
89 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
90 : AMDGPUGenRegisterBankInfo(),
91 Subtarget(ST),
92 TRI(Subtarget.getRegisterInfo()),
93 TII(Subtarget.getInstrInfo()) {
95 // HACK: Until this is fully tablegen'd.
96 static bool AlreadyInit = false;
97 if (AlreadyInit)
98 return;
100 AlreadyInit = true;
102 const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID);
103 (void)RBSGPR;
104 assert(&RBSGPR == &AMDGPU::SGPRRegBank);
106 const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID);
107 (void)RBVGPR;
108 assert(&RBVGPR == &AMDGPU::VGPRRegBank);
112 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
113 const RegisterBank &Src,
114 unsigned Size) const {
115 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
116 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
117 Src.getID() == AMDGPU::VGPRRegBankID) {
118 return std::numeric_limits<unsigned>::max();
121 // Bool values are tricky, because the meaning is based on context. The SCC
122 // and VCC banks are for the natural scalar and vector conditions produced by
123 // a compare.
125 // Legalization doesn't know about the necessary context, so an s1 use may
126 // have been a truncate from an arbitrary value, in which case a copy (lowered
127 // as a compare with 0) needs to be inserted.
128 if (Size == 1 &&
129 (Dst.getID() == AMDGPU::SCCRegBankID ||
130 Dst.getID() == AMDGPU::SGPRRegBankID) &&
131 (Src.getID() == AMDGPU::SGPRRegBankID ||
132 Src.getID() == AMDGPU::VGPRRegBankID ||
133 Src.getID() == AMDGPU::VCCRegBankID))
134 return std::numeric_limits<unsigned>::max();
136 if (Dst.getID() == AMDGPU::SCCRegBankID &&
137 Src.getID() == AMDGPU::VCCRegBankID)
138 return std::numeric_limits<unsigned>::max();
140 return RegisterBankInfo::copyCost(Dst, Src, Size);
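// Returning the maximum cost effectively forbids repairing with a plain copy:
// copying a potentially divergent VGPR value into an SGPR is not generally
// legal, so RegBankSelect is steered toward mappings that keep such values in
// VGPRs (or toward the waterfall-loop alternatives below).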
143 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
144 const ValueMapping &ValMapping,
145 const RegisterBank *CurBank) const {
146 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
147 // VGPR.
148 // FIXME: Is there a better way to do this?
149 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
150 return 10; // This is expensive.
152 assert(ValMapping.NumBreakDowns == 2 &&
153 ValMapping.BreakDown[0].Length == 32 &&
154 ValMapping.BreakDown[0].StartIdx == 0 &&
155 ValMapping.BreakDown[1].Length == 32 &&
156 ValMapping.BreakDown[1].StartIdx == 32 &&
157 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
159 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
160 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
161 // want.
163 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
164 // alignment restrictions, but this probably isn't important.
165 return 1;
168 const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
169 const TargetRegisterClass &RC) const {
170 if (&RC == &AMDGPU::SReg_1RegClass)
171 return AMDGPU::VCCRegBank;
173 return TRI->isSGPRClass(&RC) ? AMDGPU::SGPRRegBank : AMDGPU::VGPRRegBank;
176 template <unsigned NumOps>
177 RegisterBankInfo::InstructionMappings
178 AMDGPURegisterBankInfo::addMappingFromTable(
179 const MachineInstr &MI, const MachineRegisterInfo &MRI,
180 const std::array<unsigned, NumOps> RegSrcOpIdx,
181 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
183 InstructionMappings AltMappings;
185 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
187 unsigned Sizes[NumOps];
188 for (unsigned I = 0; I < NumOps; ++I) {
189 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
190 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
193 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
194 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
195 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
198 // getInstrMapping's default mapping uses ID 1, so start at 2.
199 unsigned MappingID = 2;
200 for (const auto &Entry : Table) {
201 for (unsigned I = 0; I < NumOps; ++I) {
202 int OpIdx = RegSrcOpIdx[I];
203 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
206 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
207 getOperandsMapping(Operands),
208 Operands.size()));
211 return AltMappings;
214 RegisterBankInfo::InstructionMappings
215 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
216 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
217 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
218 case Intrinsic::amdgcn_readlane: {
219 static const OpRegBankEntry<3> Table[2] = {
220 // Perfectly legal.
221 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
223 // Need a readfirstlane for the index.
224 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
227 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
228 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
230 case Intrinsic::amdgcn_writelane: {
231 static const OpRegBankEntry<4> Table[4] = {
232 // Perfectly legal.
233 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
235 // Need readfirstlane of first op
236 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
238 // Need readfirstlane of second op
239 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
241 // Need readfirstlane of both ops
242 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
245 // dst, value to write, lane select, original (pass-through) value
246 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
247 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
249 default:
250 return RegisterBankInfo::getInstrAlternativeMappings(MI);
254 RegisterBankInfo::InstructionMappings
255 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
256 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
258 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
259 case Intrinsic::amdgcn_buffer_load: {
260 static const OpRegBankEntry<3> Table[4] = {
261 // Perfectly legal.
262 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
263 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
265 // Waterfall loop needed for rsrc. In the worst case this will execute
266 // approximately an extra 10 * wavesize + 2 instructions.
267 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
268 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
271 // rsrc, voffset, offset
272 const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
273 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
275 case Intrinsic::amdgcn_s_buffer_load: {
276 static const OpRegBankEntry<2> Table[4] = {
277 // Perfectly legal.
278 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
280 // Only need 1 register in loop
281 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
283 // Have to waterfall the resource.
284 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
286 // Have to waterfall the resource, and the offset.
287 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
290 // rsrc, offset
291 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
292 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
294 case Intrinsic::amdgcn_ds_ordered_add:
295 case Intrinsic::amdgcn_ds_ordered_swap: {
296 // VGPR = M0, VGPR
297 static const OpRegBankEntry<3> Table[2] = {
298 // Perfectly legal.
299 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
301 // Need a readfirstlane for m0
302 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
305 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
306 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
308 case Intrinsic::amdgcn_s_sendmsg:
309 case Intrinsic::amdgcn_s_sendmsghalt: {
310 // FIXME: Should have no register for immediate
311 static const OpRegBankEntry<2> Table[2] = {
312 // Perfectly legal.
313 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
315 // Need readlane
316 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
319 const std::array<unsigned, 2> RegSrcOpIdx = { { 1, 2 } };
320 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
322 default:
323 return RegisterBankInfo::getInstrAlternativeMappings(MI);
327 static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI) {
328 if (!MI.hasOneMemOperand())
329 return false;
331 const MachineMemOperand *MMO = *MI.memoperands_begin();
332 return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
333 AMDGPUInstrInfo::isUniformMMO(MMO);
336 RegisterBankInfo::InstructionMappings
337 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
338 const MachineInstr &MI) const {
340 const MachineFunction &MF = *MI.getParent()->getParent();
341 const MachineRegisterInfo &MRI = MF.getRegInfo();
344 InstructionMappings AltMappings;
345 switch (MI.getOpcode()) {
346 case TargetOpcode::G_CONSTANT:
347 case TargetOpcode::G_FCONSTANT:
348 case TargetOpcode::G_FRAME_INDEX:
349 case TargetOpcode::G_GLOBAL_VALUE: {
350 static const OpRegBankEntry<1> Table[2] = {
351 { { AMDGPU::VGPRRegBankID }, 1 },
352 { { AMDGPU::SGPRRegBankID }, 1 }
355 return addMappingFromTable<1>(MI, MRI, { 0 }, Table);
357 case TargetOpcode::G_AND:
358 case TargetOpcode::G_OR:
359 case TargetOpcode::G_XOR: {
360 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
362 if (Size == 1) {
363 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
364 const InstructionMapping &SCCMapping = getInstructionMapping(
365 1, 1, getOperandsMapping(
366 {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
367 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
368 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
369 3); // Num Operands
370 AltMappings.push_back(&SCCMapping);
372 const InstructionMapping &SGPRMapping = getInstructionMapping(
373 1, 1, getOperandsMapping(
374 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
375 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
376 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
377 3); // Num Operands
378 AltMappings.push_back(&SGPRMapping);
380 const InstructionMapping &VCCMapping0 = getInstructionMapping(
381 2, 10, getOperandsMapping(
382 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
383 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
384 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
385 3); // Num Operands
386 AltMappings.push_back(&VCCMapping0);
387 return AltMappings;
390 if (Size != 64)
391 break;
393 const InstructionMapping &SSMapping = getInstructionMapping(
394 1, 1, getOperandsMapping(
395 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
396 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
397 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
398 3); // Num Operands
399 AltMappings.push_back(&SSMapping);
401 const InstructionMapping &VVMapping = getInstructionMapping(
402 2, 2, getOperandsMapping(
403 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
404 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
405 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
406 3); // Num Operands
407 AltMappings.push_back(&VVMapping);
409 const InstructionMapping &SVMapping = getInstructionMapping(
410 3, 3, getOperandsMapping(
411 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
412 AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
413 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
414 3); // Num Operands
415 AltMappings.push_back(&SVMapping);
417 // SGPR in LHS is slightly preferable, so make VS more expensive than
418 // SV.
419 const InstructionMapping &VSMapping = getInstructionMapping(
420 3, 4, getOperandsMapping(
421 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
422 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
423 AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
424 3); // Num Operands
425 AltMappings.push_back(&VSMapping);
426 break;
428 case TargetOpcode::G_LOAD:
429 case TargetOpcode::G_ZEXTLOAD:
430 case TargetOpcode::G_SEXTLOAD: {
431 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
432 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
433 unsigned PtrSize = PtrTy.getSizeInBits();
434 unsigned AS = PtrTy.getAddressSpace();
435 LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
436 if (isInstrUniformNonExtLoadAlign4(MI) &&
437 (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)) {
438 const InstructionMapping &SSMapping = getInstructionMapping(
439 1, 1, getOperandsMapping(
440 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
441 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
442 2); // Num Operands
443 AltMappings.push_back(&SSMapping);
446 const InstructionMapping &VVMapping = getInstructionMapping(
447 2, 1, getOperandsMapping(
448 {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
449 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
450 2); // Num Operands
451 AltMappings.push_back(&VVMapping);
453 // It may be possible to have a vgpr = load sgpr mapping here, because
454 // the mubuf instructions support this kind of load, but probably only for
455 // gfx7 and older. However, the addressing mode matching in the instruction
456 // selector should be able to do a better job of detecting and selecting
457 // these kinds of loads from the vgpr = load vgpr mapping.
459 return AltMappings;
462 case TargetOpcode::G_ICMP: {
463 unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
464 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
465 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
466 nullptr, // Predicate operand.
467 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
468 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
469 4); // Num Operands
470 AltMappings.push_back(&SSMapping);
472 const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
473 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
474 nullptr, // Predicate operand.
475 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
476 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
477 4); // Num Operands
478 AltMappings.push_back(&SVMapping);
480 const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
481 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
482 nullptr, // Predicate operand.
483 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
484 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
485 4); // Num Operands
486 AltMappings.push_back(&VSMapping);
488 const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
489 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
490 nullptr, // Predicate operand.
491 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
492 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
493 4); // Num Operands
494 AltMappings.push_back(&VVMapping);
496 return AltMappings;
498 case TargetOpcode::G_SELECT: {
499 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
500 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
501 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
502 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
503 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
504 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
505 4); // Num Operands
506 AltMappings.push_back(&SSMapping);
508 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
509 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
510 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
511 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
512 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
513 4); // Num Operands
514 AltMappings.push_back(&VVMapping);
516 return AltMappings;
518 case TargetOpcode::G_SMIN:
519 case TargetOpcode::G_SMAX:
520 case TargetOpcode::G_UMIN:
521 case TargetOpcode::G_UMAX: {
522 static const OpRegBankEntry<3> Table[4] = {
523 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
524 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
525 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
527 // Scalar requires cmp+select, and extends if 16-bit.
528 // FIXME: Should there be separate costs for 32 and 16-bit
529 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
532 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
533 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
535 case TargetOpcode::G_UADDE:
536 case TargetOpcode::G_USUBE:
537 case TargetOpcode::G_SADDE:
538 case TargetOpcode::G_SSUBE: {
539 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
540 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
541 getOperandsMapping(
542 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
543 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
544 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
545 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
546 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
547 5); // Num Operands
548 AltMappings.push_back(&SSMapping);
550 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
551 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
552 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
553 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
554 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
555 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
556 5); // Num Operands
557 AltMappings.push_back(&VVMapping);
558 return AltMappings;
560 case AMDGPU::G_BRCOND: {
561 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
563 const InstructionMapping &SMapping = getInstructionMapping(
564 1, 1, getOperandsMapping(
565 {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
566 2); // Num Operands
567 AltMappings.push_back(&SMapping);
569 const InstructionMapping &VMapping = getInstructionMapping(
570 1, 1, getOperandsMapping(
571 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
572 2); // Num Operands
573 AltMappings.push_back(&VMapping);
574 return AltMappings;
576 case AMDGPU::G_INTRINSIC:
577 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
578 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
579 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
580 default:
581 break;
583 return RegisterBankInfo::getInstrAlternativeMappings(MI);
586 void AMDGPURegisterBankInfo::split64BitValueForMapping(
587 MachineIRBuilder &B,
588 SmallVector<Register, 2> &Regs,
589 LLT HalfTy,
590 Register Reg) const {
591 assert(HalfTy.getSizeInBits() == 32);
592 MachineRegisterInfo *MRI = B.getMRI();
593 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
594 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
595 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
596 MRI->setRegBank(LoLHS, *Bank);
597 MRI->setRegBank(HiLHS, *Bank);
599 Regs.push_back(LoLHS);
600 Regs.push_back(HiLHS);
602 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
603 .addDef(LoLHS)
604 .addDef(HiLHS)
605 .addUse(Reg);
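// For example, the 64-bit G_AND/G_OR/G_XOR and G_SELECT handling in
// applyMappingImpl below calls this to break a 64-bit input into two 32-bit
// halves (via G_UNMERGE_VALUES) so the operation can be emitted as two 32-bit
// instructions.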
608 /// Replace the current type each register in \p Regs has with \p NewTy
609 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
610 LLT NewTy) {
611 for (Register Reg : Regs) {
612 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
613 MRI.setType(Reg, NewTy);
617 static LLT getHalfSizedType(LLT Ty) {
618 if (Ty.isVector()) {
619 assert(Ty.getNumElements() % 2 == 0);
620 return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
623 assert(Ty.getSizeInBits() % 2 == 0);
624 return LLT::scalar(Ty.getSizeInBits() / 2);
627 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
628 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
629 /// execute the instruction for each unique combination of values in all lanes
630 /// in the wave. The block will be split such that the rest of the instructions
631 /// are moved to a new block.
633 /// Essentially performs this loop:
635 /// Save Execution Mask
636 /// For (Lane : Wavefront) {
637 /// Enable Lane, Disable all other lanes
638 /// SGPR = read SGPR value for current lane from VGPR
639 /// VGPRResult[Lane] = use_op SGPR
640 /// }
641 /// Restore Execution Mask
643 /// There is additional complexity in comparing the values to identify the
644 /// unique values used.
645 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
646 MachineIRBuilder &B,
647 MachineInstr &MI,
648 MachineRegisterInfo &MRI,
649 ArrayRef<unsigned> OpIndices) const {
650 MachineFunction *MF = MI.getParent()->getParent();
651 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
652 const SIInstrInfo *TII = ST.getInstrInfo();
653 MachineBasicBlock::iterator I(MI);
655 MachineBasicBlock &MBB = *MI.getParent();
656 const DebugLoc &DL = MI.getDebugLoc();
658 // Use a set to avoid extra readfirstlanes in the case where multiple operands
659 // are the same register.
660 SmallSet<Register, 4> SGPROperandRegs;
661 for (unsigned Op : OpIndices) {
662 assert(MI.getOperand(Op).isUse());
663 Register Reg = MI.getOperand(Op).getReg();
664 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
665 if (OpBank->getID() == AMDGPU::VGPRRegBankID)
666 SGPROperandRegs.insert(Reg);
669 // No operands need to be replaced, so no need to loop.
670 if (SGPROperandRegs.empty())
671 return false;
673 SmallVector<Register, 4> ResultRegs;
674 SmallVector<Register, 4> InitResultRegs;
675 SmallVector<Register, 4> PhiRegs;
676 for (MachineOperand &Def : MI.defs()) {
677 LLT ResTy = MRI.getType(Def.getReg());
678 const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
679 ResultRegs.push_back(Def.getReg());
680 Register InitReg = B.buildUndef(ResTy).getReg(0);
681 Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
682 InitResultRegs.push_back(InitReg);
683 PhiRegs.push_back(PhiReg);
684 MRI.setRegBank(PhiReg, *DefBank);
685 MRI.setRegBank(InitReg, *DefBank);
688 Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
689 Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
691 // Don't bother using generic instructions/registers for the exec mask.
692 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
693 .addDef(InitSaveExecReg);
695 Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
696 Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
698 // To insert the loop we need to split the block. Move everything after this
699 // point to a new block, and insert a new empty block before this instruction.
700 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
701 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
702 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
703 MachineFunction::iterator MBBI(MBB);
704 ++MBBI;
705 MF->insert(MBBI, LoopBB);
706 MF->insert(MBBI, RestoreExecBB);
707 MF->insert(MBBI, RemainderBB);
709 LoopBB->addSuccessor(RestoreExecBB);
710 LoopBB->addSuccessor(LoopBB);
712 // Move the rest of the block into a new block.
713 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
714 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
716 MBB.addSuccessor(LoopBB);
717 RestoreExecBB->addSuccessor(RemainderBB);
719 B.setInsertPt(*LoopBB, LoopBB->end());
721 B.buildInstr(TargetOpcode::PHI)
722 .addDef(PhiExec)
723 .addReg(InitSaveExecReg)
724 .addMBB(&MBB)
725 .addReg(NewExec)
726 .addMBB(LoopBB);
728 for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
729 B.buildInstr(TargetOpcode::G_PHI)
730 .addDef(std::get<2>(Result))
731 .addReg(std::get<0>(Result)) // Initial value / implicit_def
732 .addMBB(&MBB)
733 .addReg(std::get<1>(Result)) // Mid-loop value.
734 .addMBB(LoopBB);
737 // Move the instruction into the loop.
738 LoopBB->splice(LoopBB->end(), &MBB, I);
739 I = std::prev(LoopBB->end());
741 B.setInstr(*I);
743 Register CondReg;
745 for (MachineOperand &Op : MI.uses()) {
746 if (!Op.isReg())
747 continue;
749 assert(!Op.isDef());
750 if (SGPROperandRegs.count(Op.getReg())) {
751 LLT OpTy = MRI.getType(Op.getReg());
752 unsigned OpSize = OpTy.getSizeInBits();
754 // Can only do a readlane of 32-bit pieces.
755 if (OpSize == 32) {
756 // Avoid extra copies in the simple case of one 32-bit register.
757 Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
758 MRI.setType(CurrentLaneOpReg, OpTy);
760 constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
761 // Read the next variant <- also loop target.
762 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg)
763 .addReg(Op.getReg());
765 Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
766 bool First = CondReg == AMDGPU::NoRegister;
767 if (First)
768 CondReg = NewCondReg;
770 // Compare the just read M0 value to all possible Idx values.
771 B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
772 .addDef(NewCondReg)
773 .addReg(CurrentLaneOpReg)
774 .addReg(Op.getReg());
775 Op.setReg(CurrentLaneOpReg);
777 if (!First) {
778 Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
780 // If there are multiple operands to consider, AND the conditions together.
781 B.buildInstr(AMDGPU::S_AND_B64)
782 .addDef(AndReg)
783 .addReg(NewCondReg)
784 .addReg(CondReg);
785 CondReg = AndReg;
787 } else {
788 LLT S32 = LLT::scalar(32);
789 SmallVector<Register, 8> ReadlanePieces;
791 // The compares can be done as 64-bit, but the extract needs to be done
792 // in 32-bit pieces.
794 bool Is64 = OpSize % 64 == 0;
796 LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
797 unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
798 : AMDGPU::V_CMP_EQ_U32_e64;
803 // Insert the unmerge before the loop.
805 B.setMBB(MBB);
806 auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
807 B.setInstr(*I);
809 unsigned NumPieces = Unmerge->getNumOperands() - 1;
810 for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
811 Register UnmergePiece = Unmerge.getReg(PieceIdx);
813 Register CurrentLaneOpReg;
814 if (Is64) {
815 Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
816 Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
818 MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
819 MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
820 MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
822 // Read the next variant <- also loop target.
823 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
824 CurrentLaneOpRegLo)
825 .addReg(UnmergePiece, 0, AMDGPU::sub0);
827 // Read the next variant <- also loop target.
828 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
829 CurrentLaneOpRegHi)
830 .addReg(UnmergePiece, 0, AMDGPU::sub1);
832 CurrentLaneOpReg =
833 B.buildMerge(LLT::scalar(64),
834 {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
835 .getReg(0);
837 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
839 if (OpTy.getScalarSizeInBits() == 64) {
840 // If we need to produce a 64-bit element vector, use the
841 // merged pieces.
842 ReadlanePieces.push_back(CurrentLaneOpReg);
843 } else {
844 // 32-bit element type.
845 ReadlanePieces.push_back(CurrentLaneOpRegLo);
846 ReadlanePieces.push_back(CurrentLaneOpRegHi);
848 } else {
849 CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
850 MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
851 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
853 // Read the next variant <- also loop target.
854 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
855 CurrentLaneOpReg)
856 .addReg(UnmergePiece);
857 ReadlanePieces.push_back(CurrentLaneOpReg);
860 Register NewCondReg
861 = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
862 bool First = CondReg == AMDGPU::NoRegister;
863 if (First)
864 CondReg = NewCondReg;
866 B.buildInstr(CmpOp)
867 .addDef(NewCondReg)
868 .addReg(CurrentLaneOpReg)
869 .addReg(UnmergePiece);
871 if (!First) {
872 Register AndReg
873 = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
875 // If there are multiple operands to consider, AND the conditions together.
876 B.buildInstr(AMDGPU::S_AND_B64)
877 .addDef(AndReg)
878 .addReg(NewCondReg)
879 .addReg(CondReg);
880 CondReg = AndReg;
884 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
885 // BUILD_VECTOR
886 if (OpTy.isVector()) {
887 auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
888 Op.setReg(Merge.getReg(0));
889 } else {
890 auto Merge = B.buildMerge(OpTy, ReadlanePieces);
891 Op.setReg(Merge.getReg(0));
894 MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
899 B.setInsertPt(*LoopBB, LoopBB->end());
901 // Update EXEC, save the original EXEC value to VCC.
902 B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
903 .addDef(NewExec)
904 .addReg(CondReg, RegState::Kill);
906 MRI.setSimpleHint(NewExec, CondReg);
908 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
909 B.buildInstr(AMDGPU::S_XOR_B64_term)
910 .addDef(AMDGPU::EXEC)
911 .addReg(AMDGPU::EXEC)
912 .addReg(NewExec);
914 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
915 // s_cbranch_scc0?
917 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
918 B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
919 .addMBB(LoopBB);
921 // Save the EXEC mask before the loop.
922 BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
923 .addReg(AMDGPU::EXEC);
925 // Restore the EXEC mask after the loop.
926 B.setMBB(*RestoreExecBB);
927 B.buildInstr(AMDGPU::S_MOV_B64_term)
928 .addDef(AMDGPU::EXEC)
929 .addReg(SaveExecReg);
931 // Restore the insert point before the original instruction.
932 B.setInsertPt(MBB, MBB.end());
934 return true;
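// Roughly, the resulting control flow looks like this (block names match the
// locals above):
//
//   MBB:           ...original code before MI...
//                  SaveExecReg = S_MOV_B64_term exec          ; save EXEC
//   LoopBB:        PHIs for exec / result values,
//                  V_READFIRSTLANE_B32 + compares of the VGPR operands,
//                  MI itself (now using the scalarized operands),
//                  S_AND_SAVEEXEC_B64 / S_XOR_B64_term exec /
//                  S_CBRANCH_EXECNZ LoopBB
//   RestoreExecBB: exec = S_MOV_B64_term SaveExecReg          ; restore EXEC
//   RemainderBB:   ...instructions that originally followed MI...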
937 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
938 MachineInstr &MI, MachineRegisterInfo &MRI,
939 ArrayRef<unsigned> OpIndices) const {
940 MachineIRBuilder B(MI);
941 return executeInWaterfallLoop(B, MI, MRI, OpIndices);
944 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
945 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
946 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
947 Register Reg = MI.getOperand(OpIdx).getReg();
948 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
949 if (Bank != &AMDGPU::VGPRRegBank)
950 return;
952 MachineIRBuilder B(MI);
953 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
954 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
955 .addDef(SGPR)
956 .addReg(Reg);
958 const TargetRegisterClass *Constrained =
959 constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
960 (void)Constrained;
961 assert(Constrained && "Failed to constrain readfirstlane src reg");
963 MI.getOperand(OpIdx).setReg(SGPR);
966 // When regbankselect repairs registers, it will insert a repair instruction
967 // which defines the repaired register. Then it calls applyMapping and expects
968 // that the targets will either delete or rewrite the instruction that originally
969 // wrote to the repaired registers. Because of this, we end up in a situation
970 // where we have 2 instructions defining the same registers.
971 static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
972 Register Reg,
973 const MachineInstr &MI) {
974 // Is there some way we can assert that there are exactly 2 def instructions?
975 for (MachineInstr &Other : MRI.def_instructions(Reg)) {
976 if (&Other != &MI)
977 return &Other;
980 return nullptr;
983 bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
984 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
985 MachineRegisterInfo &MRI) const {
986 Register DstReg = MI.getOperand(0).getReg();
987 const LLT LoadTy = MRI.getType(DstReg);
988 unsigned LoadSize = LoadTy.getSizeInBits();
989 const unsigned MaxNonSmrdLoadSize = 128;
990 // 128-bit loads are supported for all instruction types.
991 if (LoadSize <= MaxNonSmrdLoadSize)
992 return false;
994 SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
995 SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
997 // If the pointer is an SGPR, we have nothing to do.
998 if (SrcRegs.empty())
999 return false;
1001 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1003 // We want to get the repair instruction now, because it will help us
1004 // determine which instruction the legalizer inserts that will also
1005 // write to DstReg.
1006 MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);
1008 // RegBankSelect only emits scalar types, so we need to reset the pointer
1009 // operand to a pointer type.
1010 Register BasePtrReg = SrcRegs[0];
1011 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1012 MRI.setType(BasePtrReg, PtrTy);
1014 MachineIRBuilder B(MI);
1016 unsigned SplitElts =
1017 MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
1018 const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
1019 ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
1020 GISelObserverWrapper Observer(&O);
1021 B.setChangeObserver(Observer);
1022 LegalizerHelper Helper(B.getMF(), Observer, B);
1023 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1024 return false;
1026 // At this point, the legalizer has split the original load into smaller
1027 // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
1028 // that combines the outputs of the lower loads and writes it to DstReg.
1029 // The register bank selector has also added the RepairInst which writes to
1030 // DstReg as well.
1032 MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);
1034 // Replace the output of the LegalizedInst with a temporary register, since
1035 // RepairInst already defines DstReg.
1036 Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
1037 LegalizedInst->getOperand(0).setReg(TmpReg);
1038 B.setInsertPt(*RepairInst->getParent(), RepairInst);
1040 for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
1041 Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
1042 B.buildConstant(IdxReg, DefIdx);
1043 MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
1044 B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
1047 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
1048 return true;
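// Illustrative example: a 256-bit result (say <8 x s32>) with a VGPR-repaired
// destination is split here into 128-bit (<4 x s32>) loads, and each of the
// 32-bit registers RegBankSelect broke the destination into is then filled
// with a G_EXTRACT_VECTOR_ELT from the recombined result, inserted just before
// the repair instruction.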
1051 bool AMDGPURegisterBankInfo::applyMappingImage(
1052 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1053 MachineRegisterInfo &MRI, int RsrcIdx) const {
1054 const int NumDefs = MI.getNumExplicitDefs();
1056 // The reported argument index is relative to the IR intrinsic call arguments,
1057 // so we need to shift by the number of defs and the intrinsic ID.
1058 RsrcIdx += NumDefs + 1;
1060 // Insert copies to VGPR arguments.
1061 applyDefaultMapping(OpdMapper);
1063 // Fixup any SGPR arguments.
1064 SmallVector<unsigned, 4> SGPRIndexes;
1065 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1066 if (!MI.getOperand(I).isReg())
1067 continue;
1069 // If this intrinsic has a sampler, it immediately follows rsrc.
1070 if (I == RsrcIdx || I == RsrcIdx + 1)
1071 SGPRIndexes.push_back(I);
1074 executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1075 return true;
1078 // For cases where only a single copy is inserted for matching register banks,
1079 // replace the register in the instruction operand.
1080 static void substituteSimpleCopyRegs(
1081 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1082 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1083 if (!SrcReg.empty()) {
1084 assert(SrcReg.size() == 1);
1085 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1089 /// Handle register layout difference for f16 images for some subtargets.
1090 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1091 MachineRegisterInfo &MRI,
1092 Register Reg) const {
1093 if (!Subtarget.hasUnpackedD16VMem())
1094 return Reg;
1096 const LLT S16 = LLT::scalar(16);
1097 LLT StoreVT = MRI.getType(Reg);
1098 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1099 return Reg;
1101 auto Unmerge = B.buildUnmerge(S16, Reg);
1104 SmallVector<Register, 4> WideRegs;
1105 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1106 WideRegs.push_back(Unmerge.getReg(I));
1108 const LLT S32 = LLT::scalar(32);
1109 int NumElts = StoreVT.getNumElements();
1111 return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
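// Example on an unpacked-D16 subtarget: a <4 x s16> store value is unmerged
// into its four s16 elements and rebuilt as a <4 x s32>, so each half-word
// element ends up occupying its own 32-bit register.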
1114 static std::pair<Register, unsigned>
1115 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1116 int64_t Const;
1117 if (mi_match(Reg, MRI, m_ICst(Const)))
1118 return std::make_pair(Register(), Const);
1120 Register Base;
1121 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1122 return std::make_pair(Base, Const);
1124 // TODO: Handle G_OR used for add case
1125 return std::make_pair(Reg, 0);
1128 std::pair<Register, unsigned>
1129 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1130 Register OrigOffset) const {
1131 const unsigned MaxImm = 4095;
1132 Register BaseReg;
1133 unsigned ImmOffset;
1134 const LLT S32 = LLT::scalar(32);
1136 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1137 OrigOffset);
1139 unsigned C1 = 0;
1140 if (ImmOffset != 0) {
1141 // If the immediate value is too big for the immoffset field, put the value
1142 // and -4096 into the immoffset field so that the value that is copied/added
1143 // for the voffset field is a multiple of 4096, and it stands more chance
1144 // of being CSEd with the copy/add for another similar load/store.
1145 // However, do not do that rounding down to a multiple of 4096 if that is a
1146 // negative number, as it appears to be illegal to have a negative offset
1147 // in the vgpr, even if adding the immediate offset makes it positive.
1148 unsigned Overflow = ImmOffset & ~MaxImm;
1149 ImmOffset -= Overflow;
1150 if ((int32_t)Overflow < 0) {
1151 Overflow += ImmOffset;
1152 ImmOffset = 0;
1155 C1 = ImmOffset;
1156 if (Overflow != 0) {
1157 if (!BaseReg)
1158 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1159 else {
1160 auto OverflowVal = B.buildConstant(S32, Overflow);
1161 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1166 if (!BaseReg)
1167 BaseReg = B.buildConstant(S32, 0).getReg(0);
1169 return {BaseReg, C1};
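// Worked example: a constant offset of 4100 exceeds MaxImm (4095), so the
// overflow 4100 & ~4095 = 4096 is materialized into the base register and the
// remaining 4 stays in the immediate field, i.e. the result is
// {constant 4096, 4}.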
1172 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1173 int64_t C;
1174 return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1177 static unsigned extractGLC(unsigned CachePolicy) {
1178 return CachePolicy & 1;
1181 static unsigned extractSLC(unsigned CachePolicy) {
1182 return (CachePolicy >> 1) & 1;
1185 static unsigned extractDLC(unsigned CachePolicy) {
1186 return (CachePolicy >> 2) & 1;
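// The cache policy immediate packs glc in bit 0, slc in bit 1 and dlc in
// bit 2; the three helpers above just pick those bits apart.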
1189 MachineInstr *
1190 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1191 MachineInstr &MI) const {
1192 MachineRegisterInfo &MRI = *B.getMRI();
1193 executeInWaterfallLoop(B, MI, MRI, {2, 4});
1195 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1197 Register VData = MI.getOperand(1).getReg();
1198 LLT Ty = MRI.getType(VData);
1200 int EltSize = Ty.getScalarSizeInBits();
1201 int Size = Ty.getSizeInBits();
1203 // FIXME: Broken integer truncstore.
1204 if (EltSize != 32)
1205 report_fatal_error("unhandled intrinsic store");
1207 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1208 const int MemSize = (*MI.memoperands_begin())->getSize();
1211 Register RSrc = MI.getOperand(2).getReg();
1212 Register VOffset = MI.getOperand(3).getReg();
1213 Register SOffset = MI.getOperand(4).getReg();
1214 unsigned CachePolicy = MI.getOperand(5).getImm();
1216 unsigned ImmOffset;
1217 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1219 const bool Offen = !isZero(VOffset, MRI);
1221 unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1222 switch (8 * MemSize) {
1223 case 8:
1224 Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1225 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1226 break;
1227 case 16:
1228 Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1229 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1230 break;
1231 default:
1232 Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1233 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1234 if (Size > 32)
1235 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1236 break;
1240 // Set the insertion point back to the instruction in case it was moved into a
1241 // loop.
1242 B.setInstr(MI);
1244 MachineInstrBuilder MIB = B.buildInstr(Opc)
1245 .addUse(VData);
1247 if (Offen)
1248 MIB.addUse(VOffset);
1250 MIB.addUse(RSrc)
1251 .addUse(SOffset)
1252 .addImm(ImmOffset)
1253 .addImm(extractGLC(CachePolicy))
1254 .addImm(extractSLC(CachePolicy))
1255 .addImm(0) // tfe: FIXME: Remove from inst
1256 .addImm(extractDLC(CachePolicy))
1257 .cloneMemRefs(MI);
1259 // FIXME: We need a way to report failure from applyMappingImpl.
1260 // Insert constrain copies before inserting the loop.
1261 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1262 report_fatal_error("failed to constrain selected store intrinsic");
1264 return MIB;
1267 void AMDGPURegisterBankInfo::applyMappingImpl(
1268 const OperandsMapper &OpdMapper) const {
1269 MachineInstr &MI = OpdMapper.getMI();
1270 unsigned Opc = MI.getOpcode();
1271 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1272 switch (Opc) {
1273 case AMDGPU::G_SELECT: {
1274 Register DstReg = MI.getOperand(0).getReg();
1275 LLT DstTy = MRI.getType(DstReg);
1276 if (DstTy.getSizeInBits() != 64)
1277 break;
1279 LLT HalfTy = getHalfSizedType(DstTy);
1281 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1282 SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1));
1283 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1284 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
1286 // All inputs are SGPRs, nothing special to do.
1287 if (DefRegs.empty()) {
1288 assert(Src1Regs.empty() && Src2Regs.empty());
1289 break;
1292 MachineIRBuilder B(MI);
1293 if (Src0Regs.empty())
1294 Src0Regs.push_back(MI.getOperand(1).getReg());
1295 else {
1296 assert(Src0Regs.size() == 1);
1299 if (Src1Regs.empty())
1300 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1301 else {
1302 setRegsToType(MRI, Src1Regs, HalfTy);
1305 if (Src2Regs.empty())
1306 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
1307 else
1308 setRegsToType(MRI, Src2Regs, HalfTy);
1310 setRegsToType(MRI, DefRegs, HalfTy);
1312 B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
1313 B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);
1315 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
1316 MI.eraseFromParent();
1317 return;
1319 case AMDGPU::G_AND:
1320 case AMDGPU::G_OR:
1321 case AMDGPU::G_XOR: {
1322 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
1323 // there is a VGPR input.
1324 Register DstReg = MI.getOperand(0).getReg();
1325 LLT DstTy = MRI.getType(DstReg);
1326 if (DstTy.getSizeInBits() != 64)
1327 break;
1329 LLT HalfTy = getHalfSizedType(DstTy);
1330 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1331 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
1332 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1334 // All inputs are SGPRs, nothing special to do.
1335 if (DefRegs.empty()) {
1336 assert(Src0Regs.empty() && Src1Regs.empty());
1337 break;
1340 assert(DefRegs.size() == 2);
1341 assert(Src0Regs.size() == Src1Regs.size() &&
1342 (Src0Regs.empty() || Src0Regs.size() == 2));
1344 // Depending on where the source registers came from, the generic code may
1345 // have decided to split the inputs already or not. If not, we still need to
1346 // extract the values.
1347 MachineIRBuilder B(MI);
1349 if (Src0Regs.empty())
1350 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
1351 else
1352 setRegsToType(MRI, Src0Regs, HalfTy);
1354 if (Src1Regs.empty())
1355 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1356 else
1357 setRegsToType(MRI, Src1Regs, HalfTy);
1359 setRegsToType(MRI, DefRegs, HalfTy);
1361 B.buildInstr(Opc)
1362 .addDef(DefRegs[0])
1363 .addUse(Src0Regs[0])
1364 .addUse(Src1Regs[0]);
1366 B.buildInstr(Opc)
1367 .addDef(DefRegs[1])
1368 .addUse(Src0Regs[1])
1369 .addUse(Src1Regs[1]);
1371 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
1372 MI.eraseFromParent();
1373 return;
1375 case AMDGPU::G_ADD:
1376 case AMDGPU::G_SUB:
1377 case AMDGPU::G_MUL: {
1378 Register DstReg = MI.getOperand(0).getReg();
1379 LLT DstTy = MRI.getType(DstReg);
1380 if (DstTy != LLT::scalar(16))
1381 break;
1383 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
1384 if (DstBank == &AMDGPU::VGPRRegBank)
1385 break;
1387 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
1388 MachineFunction *MF = MI.getParent()->getParent();
1389 MachineIRBuilder B(MI);
1390 ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
1391 GISelObserverWrapper Observer(&ApplySALU);
1392 LegalizerHelper Helper(*MF, Observer, B);
1394 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
1395 LegalizerHelper::Legalized)
1396 llvm_unreachable("widen scalar should have succeeded");
1397 return;
1399 case AMDGPU::G_SMIN:
1400 case AMDGPU::G_SMAX:
1401 case AMDGPU::G_UMIN:
1402 case AMDGPU::G_UMAX: {
1403 Register DstReg = MI.getOperand(0).getReg();
1404 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
1405 if (DstBank == &AMDGPU::VGPRRegBank)
1406 break;
1408 MachineFunction *MF = MI.getParent()->getParent();
1409 MachineIRBuilder B(MI);
1410 ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
1411 GISelObserverWrapper Observer(&ApplySALU);
1412 LegalizerHelper Helper(*MF, Observer, B);
1414 // Turn scalar min/max into a compare and select.
1415 LLT Ty = MRI.getType(DstReg);
1416 LLT S32 = LLT::scalar(32);
1417 LLT S16 = LLT::scalar(16);
1419 if (Ty == S16) {
1420 // Need to widen to s32, and expand as cmp + select.
1421 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
1422 llvm_unreachable("widenScalar should have succeeded");
1424 // FIXME: This is relying on widenScalar leaving MI in place.
1425 if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized)
1426 llvm_unreachable("lower should have succeeded");
1427 } else {
1428 if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized)
1429 llvm_unreachable("lower should have succeeded");
1432 return;
1434 case AMDGPU::G_SEXT:
1435 case AMDGPU::G_ZEXT: {
1436 Register SrcReg = MI.getOperand(1).getReg();
1437 LLT SrcTy = MRI.getType(SrcReg);
1438 bool Signed = Opc == AMDGPU::G_SEXT;
1440 MachineIRBuilder B(MI);
1441 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
1443 Register DstReg = MI.getOperand(0).getReg();
1444 LLT DstTy = MRI.getType(DstReg);
1445 if (DstTy.isScalar() &&
1446 SrcBank != &AMDGPU::SGPRRegBank &&
1447 SrcBank != &AMDGPU::SCCRegBank &&
1448 SrcBank != &AMDGPU::VCCRegBank &&
1449 // FIXME: Should handle any type that rounds to s64 once irregular
1450 // breakdowns are supported.
1451 DstTy.getSizeInBits() == 64 &&
1452 SrcTy.getSizeInBits() <= 32) {
1453 const LLT S32 = LLT::scalar(32);
1454 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1456 // Extend to 32-bit, and then extend the low half.
1457 if (Signed) {
1458 // TODO: Should really be buildSExtOrCopy
1459 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
1461 // Replicate sign bit from 32-bit extended part.
1462 auto ShiftAmt = B.buildConstant(S32, 31);
1463 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
1464 B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
1465 } else {
1466 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
1467 B.buildConstant(DefRegs[1], 0);
1470 MRI.setRegBank(DstReg, *SrcBank);
1471 MI.eraseFromParent();
1472 return;
1475 if (SrcTy != LLT::scalar(1))
1476 return;
1478 if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) {
1479 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1481 const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ?
1482 &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank;
1484 unsigned DstSize = DstTy.getSizeInBits();
1485 // 64-bit select is SGPR only
1486 const bool UseSel64 = DstSize > 32 &&
1487 SrcBank->getID() == AMDGPU::SCCRegBankID;
1489 // TODO: Should s16 select be legal?
1490 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
1491 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
1492 auto False = B.buildConstant(SelType, 0);
1494 MRI.setRegBank(True.getReg(0), *DstBank);
1495 MRI.setRegBank(False.getReg(0), *DstBank);
1496 MRI.setRegBank(DstReg, *DstBank);
1498 if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) {
1499 B.buildSelect(DefRegs[0], SrcReg, True, False);
1500 B.buildCopy(DefRegs[1], DefRegs[0]);
1501 } else if (DstSize < 32) {
1502 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
1503 MRI.setRegBank(Sel.getReg(0), *DstBank);
1504 B.buildTrunc(DstReg, Sel);
1505 } else {
1506 B.buildSelect(DstReg, SrcReg, True, False);
1509 MI.eraseFromParent();
1510 return;
1513 // Fixup the case with an s1 src that isn't a condition register. Use shifts
1514 // instead of introducing a compare to avoid an unnecessary condition
1515 // register (and since there are no scalar 16-bit compares).
1516 auto Ext = B.buildAnyExt(DstTy, SrcReg);
1517 auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
1518 auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);
1520 if (MI.getOpcode() == AMDGPU::G_SEXT)
1521 B.buildAShr(DstReg, Shl, ShiftAmt);
1522 else
1523 B.buildLShr(DstReg, Shl, ShiftAmt);
1525 MRI.setRegBank(DstReg, *SrcBank);
1526 MRI.setRegBank(Ext.getReg(0), *SrcBank);
1527 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
1528 MRI.setRegBank(Shl.getReg(0), *SrcBank);
1529 MI.eraseFromParent();
1530 return;
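// Example (illustrative register names): G_SEXT of a non-condition s1 into
// s16 becomes
//   %ext:_(s16) = G_ANYEXT %src(s1)
//   %shl:_(s16) = G_SHL %ext, 15
//   %dst:_(s16) = G_ASHR %shl, 15
// which replicates bit 0 of the source across the whole result.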
1532 case AMDGPU::G_BUILD_VECTOR:
1533 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
1534 Register DstReg = MI.getOperand(0).getReg();
1535 LLT DstTy = MRI.getType(DstReg);
1536 if (DstTy != LLT::vector(2, 16))
1537 break;
1539 assert(MI.getNumOperands() == 3 && empty(OpdMapper.getVRegs(0)));
1540 substituteSimpleCopyRegs(OpdMapper, 1);
1541 substituteSimpleCopyRegs(OpdMapper, 2);
1543 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
1544 if (DstBank == &AMDGPU::SGPRRegBank)
1545 break; // Can use S_PACK_* instructions.
1547 MachineIRBuilder B(MI);
1549 Register Lo = MI.getOperand(1).getReg();
1550 Register Hi = MI.getOperand(2).getReg();
1551 const LLT S32 = LLT::scalar(32);
1553 const RegisterBank *BankLo = getRegBank(Lo, MRI, *TRI);
1554 const RegisterBank *BankHi = getRegBank(Hi, MRI, *TRI);
1556 Register ZextLo;
1557 Register ShiftHi;
1559 if (Opc == AMDGPU::G_BUILD_VECTOR) {
1560 ZextLo = B.buildZExt(S32, Lo).getReg(0);
1561 MRI.setRegBank(ZextLo, *BankLo);
1563 Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
1564 MRI.setRegBank(ZextHi, *BankHi);
1566 auto ShiftAmt = B.buildConstant(S32, 16);
1567 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
1569 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
1570 MRI.setRegBank(ShiftHi, *BankHi);
1571 } else {
1572 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
1573 MRI.setRegBank(MaskLo, *BankLo);
1575 auto ShiftAmt = B.buildConstant(S32, 16);
1576 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
1578 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
1579 MRI.setRegBank(ShiftHi, *BankHi);
1581 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
1582 MRI.setRegBank(ZextLo, *BankLo);
1585 auto Or = B.buildOr(S32, ZextLo, ShiftHi);
1586 MRI.setRegBank(Or.getReg(0), *DstBank);
1588 B.buildBitcast(DstReg, Or);
1589 MI.eraseFromParent();
1590 return;
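// Example for the G_BUILD_VECTOR path on VGPRs (illustrative register names):
//   %zlo:_(s32) = G_ZEXT %lo(s16)
//   %zhi:_(s32) = G_ZEXT %hi(s16)
//   %shl:_(s32) = G_SHL %zhi, 16
//   %or:_(s32)  = G_OR %zlo, %shl
//   %dst:_(<2 x s16>) = G_BITCAST %or(s32)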
1592 case AMDGPU::G_EXTRACT_VECTOR_ELT:
1593 applyDefaultMapping(OpdMapper);
1594 executeInWaterfallLoop(MI, MRI, { 2 });
1595 return;
1596 case AMDGPU::G_INTRINSIC: {
1597 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1598 case Intrinsic::amdgcn_s_buffer_load: {
1599 // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
1600 executeInWaterfallLoop(MI, MRI, { 2, 3 });
1601 return;
1603 case Intrinsic::amdgcn_readlane: {
1604 substituteSimpleCopyRegs(OpdMapper, 2);
1606 assert(empty(OpdMapper.getVRegs(0)));
1607 assert(empty(OpdMapper.getVRegs(3)));
1609 // Make sure the index is an SGPR. It doesn't make sense to run this in a
1610 // waterfall loop, so assume it's a uniform value.
1611 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
1612 return;
1614 case Intrinsic::amdgcn_writelane: {
1615 assert(empty(OpdMapper.getVRegs(0)));
1616 assert(empty(OpdMapper.getVRegs(2)));
1617 assert(empty(OpdMapper.getVRegs(3)));
1619 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
1620 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
1621 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
1622 return;
1624 default:
1625 break;
1627 break;
1629 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
1630 auto IntrID = MI.getIntrinsicID();
1631 switch (IntrID) {
1632 case Intrinsic::amdgcn_buffer_load: {
1633 executeInWaterfallLoop(MI, MRI, { 2 });
1634 return;
1636 case Intrinsic::amdgcn_ds_ordered_add:
1637 case Intrinsic::amdgcn_ds_ordered_swap: {
1638 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
1639 assert(empty(OpdMapper.getVRegs(0)));
1640 substituteSimpleCopyRegs(OpdMapper, 3);
1641 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
1642 return;
1644 case Intrinsic::amdgcn_s_sendmsg:
1645 case Intrinsic::amdgcn_s_sendmsghalt: {
1646 // FIXME: Should this use a waterfall loop?
1647 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
1648 return;
1650 case Intrinsic::amdgcn_raw_buffer_load:
1651 case Intrinsic::amdgcn_raw_buffer_load_format:
1652 case Intrinsic::amdgcn_raw_tbuffer_load:
1653 case Intrinsic::amdgcn_raw_buffer_store:
1654 case Intrinsic::amdgcn_raw_buffer_store_format:
1655 case Intrinsic::amdgcn_raw_tbuffer_store: {
1656 applyDefaultMapping(OpdMapper);
1657 executeInWaterfallLoop(MI, MRI, {2, 4});
1658 return;
1660 case Intrinsic::amdgcn_struct_buffer_load:
1661 case Intrinsic::amdgcn_struct_buffer_store:
1662 case Intrinsic::amdgcn_struct_tbuffer_load:
1663 case Intrinsic::amdgcn_struct_tbuffer_store: {
1664 applyDefaultMapping(OpdMapper);
1665 executeInWaterfallLoop(MI, MRI, {2, 5});
1666 return;
1668 default: {
1669 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1670 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
1671 // Non-images can have complications from operands that allow both SGPR
1672 // and VGPR. For now it's too complicated to figure out the final opcode
1673 // to derive the register bank from the MCInstrDesc.
1674 if (RSrcIntrin->IsImage) {
1675 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
1676 return;
1680 break;
1683 break;
1685 case AMDGPU::G_LOAD:
1686 case AMDGPU::G_ZEXTLOAD:
1687 case AMDGPU::G_SEXTLOAD: {
1688 if (applyMappingWideLoad(MI, OpdMapper, MRI))
1689 return;
1690 break;
1692 default:
1693 break;
1696 return applyDefaultMapping(OpdMapper);
1699 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
1700 const MachineFunction &MF = *MI.getParent()->getParent();
1701 const MachineRegisterInfo &MRI = MF.getRegInfo();
1702 for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
1703 if (!MI.getOperand(i).isReg())
1704 continue;
1705 Register Reg = MI.getOperand(i).getReg();
1706 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
1707 if (Bank->getID() == AMDGPU::VGPRRegBankID)
1708 return false;
1710 assert(Bank->getID() == AMDGPU::SGPRRegBankID ||
1711 Bank->getID() == AMDGPU::SCCRegBankID);
1714 return true;
1717 const RegisterBankInfo::InstructionMapping &
1718 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
1719 const MachineFunction &MF = *MI.getParent()->getParent();
1720 const MachineRegisterInfo &MRI = MF.getRegInfo();
1721 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1723 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
1724 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
1725 unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
1726 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
1728 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1729 MI.getNumOperands());
1732 const RegisterBankInfo::InstructionMapping &
1733 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
1734 const MachineFunction &MF = *MI.getParent()->getParent();
1735 const MachineRegisterInfo &MRI = MF.getRegInfo();
1736 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1737 unsigned OpdIdx = 0;
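// The def is always mapped to a VGPR. Source operands default to VGPR as
// well, except 1-bit values, which use the VCC bank; the first source keeps
// its existing bank assignment if it already has one.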
1739 unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1740 OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
1742 if (MI.getOperand(OpdIdx).isIntrinsicID())
1743 OpdsMapping[OpdIdx++] = nullptr;
1745 Register Reg1 = MI.getOperand(OpdIdx).getReg();
1746 unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
1748 unsigned DefaultBankID = Size1 == 1 ?
1749 AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
1750 unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);
1752 OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
1754 for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
1755 const MachineOperand &MO = MI.getOperand(OpdIdx);
1756 if (!MO.isReg())
1757 continue;
1759 unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
1760 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
1761 OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
1764 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1765 MI.getNumOperands());
1768 const RegisterBankInfo::InstructionMapping &
1769 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
1770 const MachineFunction &MF = *MI.getParent()->getParent();
1771 const MachineRegisterInfo &MRI = MF.getRegInfo();
1772 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1774 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1775 const MachineOperand &Op = MI.getOperand(I);
1776 if (!Op.isReg())
1777 continue;
1779 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
1780 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
1783 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1784 MI.getNumOperands());
1787 const RegisterBankInfo::InstructionMapping &
1788 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
1789 const MachineInstr &MI,
1790 int RsrcIdx) const {
1791 // The reported argument index is relative to the IR intrinsic call arguments,
1792 // so we need to shift by the number of defs and the intrinsic ID.
1793 RsrcIdx += MI.getNumExplicitDefs() + 1;
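// E.g. with a single def, IR argument index 1 becomes machine operand 3
// (the def, then the intrinsic ID, then the arguments).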
1795 const int NumOps = MI.getNumOperands();
1796 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
1798 // TODO: Should packed/unpacked D16 difference be reported here as part of
1799 // the value mapping?
1800 for (int I = 0; I != NumOps; ++I) {
1801 if (!MI.getOperand(I).isReg())
1802 continue;
1804 Register OpReg = MI.getOperand(I).getReg();
1805 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
1807 // FIXME: Probably need a new intrinsic register bank searchable table to
1808 // handle arbitrary intrinsics easily.
1810 // If this has a sampler, it immediately follows rsrc.
1811 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
1813 if (MustBeSGPR) {
1814 // If this must be an SGPR, we must report whatever it is as legal.
1815 unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
1816 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
1817 } else {
1818 // Some operands must be VGPR, and these are easy to copy to.
1819 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
1823 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
1826 const RegisterBankInfo::InstructionMapping &
1827 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
1829 const MachineFunction &MF = *MI.getParent()->getParent();
1830 const MachineRegisterInfo &MRI = MF.getRegInfo();
1831 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1832 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1833 LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
1834 Register PtrReg = MI.getOperand(1).getReg();
1835 LLT PtrTy = MRI.getType(PtrReg);
1836 unsigned AS = PtrTy.getAddressSpace();
1837 unsigned PtrSize = PtrTy.getSizeInBits();
1839 const ValueMapping *ValMapping;
1840 const ValueMapping *PtrMapping;
1842 if (isInstrUniformNonExtLoadAlign4(MI) &&
1843 (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)) {
1844 // We have a uniform instruction, so we want to use an SMRD load.
1845 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
1846 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
1847 } else {
1848 ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
1849 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
1852 OpdsMapping[0] = ValMapping;
1853 OpdsMapping[1] = PtrMapping;
1854 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
1855 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
1856 return Mapping;
1858 // FIXME: Do we want to add a mapping for FLAT load, or should we just
1859 // handle that during instruction selection?
1862 unsigned
1863 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
1864 const MachineRegisterInfo &MRI,
1865 const TargetRegisterInfo &TRI,
1866 unsigned Default) const {
1868 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
1869 return Bank ? Bank->getID() : Default;
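// Union of two register banks for a combined result: SGPR only if both inputs
// are SGPR, otherwise VGPR.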
1873 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
1874 return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
1875 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
1878 const RegisterBankInfo::ValueMapping *
1879 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
1880 const MachineRegisterInfo &MRI,
1881 const TargetRegisterInfo &TRI) const {
1882 // Lie and claim anything is legal, even though this needs to be an SGPR.
1883 // applyMapping will have to deal with it as a waterfall loop.
1884 unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
1885 unsigned Size = getSizeInBits(Reg, MRI, TRI);
1886 return AMDGPU::getValueMapping(Bank, Size);
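/// Operands that must be VGPRs are easy to copy to from any bank, so
/// unconditionally report the VGPR bank at the register's size.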
1889 const RegisterBankInfo::ValueMapping *
1890 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
1891 const MachineRegisterInfo &MRI,
1892 const TargetRegisterInfo &TRI) const {
1893 unsigned Size = getSizeInBits(Reg, MRI, TRI);
1894 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
1898 /// This function must return a legal mapping, because
1899 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
1900 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
1901 /// VGPR to SGPR copy to be generated is illegal.
1903 const RegisterBankInfo::InstructionMapping &
1904 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
1905 const MachineFunction &MF = *MI.getParent()->getParent();
1906 const MachineRegisterInfo &MRI = MF.getRegInfo();
1908 if (MI.isRegSequence()) {
1909 // If any input is a VGPR, the result must be a VGPR. The default handling
1910 // assumes any copy between banks is legal.
1911 unsigned BankID = AMDGPU::SGPRRegBankID;
1913 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
1914 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
1915 // It doesn't make sense to use vcc or scc banks here, so just ignore
1916 // them.
1917 if (OpBank != AMDGPU::SGPRRegBankID) {
1918 BankID = AMDGPU::VGPRRegBankID;
1919 break;
1922 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1924 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
1925 return getInstructionMapping(
1926 1, /*Cost*/ 1,
1927 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
1930 // The default handling is broken and doesn't handle illegal VGPR->SGPR copies
1931 // properly.
1933 // TODO: There are additional exec masking dependencies to analyze.
1934 if (MI.getOpcode() == TargetOpcode::G_PHI) {
1935 // TODO: Generate proper invalid bank enum.
1936 int ResultBank = -1;
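// Merge the incoming banks. Any VGPR (or unknown) input forces a VGPR result;
// SCC inputs are widened to SGPR since there is only one SCC register; mixing
// VCC with a non-VCC bank falls back to VGPR.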
1938 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
1939 Register Reg = MI.getOperand(I).getReg();
1940 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1942 // FIXME: Assuming VGPR for any undetermined inputs.
1943 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
1944 ResultBank = AMDGPU::VGPRRegBankID;
1945 break;
1948 unsigned OpBank = Bank->getID();
1949 // scc, scc -> sgpr
1950 if (OpBank == AMDGPU::SCCRegBankID) {
1951 // There's only one SCC register, so a phi requires copying to SGPR.
1952 OpBank = AMDGPU::SGPRRegBankID;
1953 } else if (OpBank == AMDGPU::VCCRegBankID) {
1954 // vcc, vcc -> vcc
1955 // vcc, sgpr -> vgpr
1956 if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
1957 ResultBank = AMDGPU::VGPRRegBankID;
1958 break;
1962 ResultBank = OpBank;
1965 assert(ResultBank != -1);
1967 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1969 const ValueMapping &ValMap =
1970 getValueMapping(0, Size, getRegBank(ResultBank));
1971 return getInstructionMapping(
1972 1, /*Cost*/ 1,
1973 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
1976 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
1977 if (Mapping.isValid())
1978 return Mapping;
1980 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1982 switch (MI.getOpcode()) {
1983 default:
1984 return getInvalidInstructionMapping();
1986 case AMDGPU::G_AND:
1987 case AMDGPU::G_OR:
1988 case AMDGPU::G_XOR: {
1989 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1990 if (Size == 1) {
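// Choose a consistent boolean bank. A known VCC or SCC destination dictates
// the operand banks; otherwise the operand banks decide: any VGPR input
// forces VGPR, any VCC input forces VCC, all-SGPR inputs stay scalar, and
// SCC inputs are done as 32-bit SGPR ops that also set scc.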
1991 const RegisterBank *DstBank
1992 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
1994 unsigned TargetBankID = -1;
1995 unsigned BankLHS = -1;
1996 unsigned BankRHS = -1;
1997 if (DstBank) {
1998 TargetBankID = DstBank->getID();
1999 if (DstBank == &AMDGPU::VCCRegBank) {
2000 TargetBankID = AMDGPU::VCCRegBankID;
2001 BankLHS = AMDGPU::VCCRegBankID;
2002 BankRHS = AMDGPU::VCCRegBankID;
2003 } else if (DstBank == &AMDGPU::SCCRegBank) {
2004 TargetBankID = AMDGPU::SCCRegBankID;
2005 BankLHS = AMDGPU::SGPRRegBankID;
2006 BankRHS = AMDGPU::SGPRRegBankID;
2007 } else {
2008 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2009 AMDGPU::SGPRRegBankID);
2010 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2011 AMDGPU::SGPRRegBankID);
2013 } else {
2014 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2015 AMDGPU::VCCRegBankID);
2016 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2017 AMDGPU::VCCRegBankID);
2019 // Both inputs should be true booleans to produce a boolean result.
2020 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
2021 TargetBankID = AMDGPU::VGPRRegBankID;
2022 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
2023 TargetBankID = AMDGPU::VCCRegBankID;
2024 BankLHS = AMDGPU::VCCRegBankID;
2025 BankRHS = AMDGPU::VCCRegBankID;
2026 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
2027 TargetBankID = AMDGPU::SGPRRegBankID;
2028 } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
2029 // The operation must be done on a 32-bit register, but it will set
2030 // scc. The result type could interchangeably be SCC or SGPR, since
2031 // both values will be produced.
2032 TargetBankID = AMDGPU::SCCRegBankID;
2033 BankLHS = AMDGPU::SGPRRegBankID;
2034 BankRHS = AMDGPU::SGPRRegBankID;
2038 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
2039 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
2040 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
2041 break;
2044 if (Size == 64) {
2046 if (isSALUMapping(MI)) {
2047 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
2048 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
2049 } else {
2050 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
2051 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
2052 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
2054 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
2055 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
2058 break;
2061 LLVM_FALLTHROUGH;
2063 case AMDGPU::G_GEP:
2064 case AMDGPU::G_ADD:
2065 case AMDGPU::G_SUB:
2066 case AMDGPU::G_MUL:
2067 case AMDGPU::G_SHL:
2068 case AMDGPU::G_LSHR:
2069 case AMDGPU::G_ASHR:
2070 case AMDGPU::G_UADDO:
2071 case AMDGPU::G_SADDO:
2072 case AMDGPU::G_USUBO:
2073 case AMDGPU::G_SSUBO:
2074 case AMDGPU::G_UADDE:
2075 case AMDGPU::G_SADDE:
2076 case AMDGPU::G_USUBE:
2077 case AMDGPU::G_SSUBE:
2078 case AMDGPU::G_SMIN:
2079 case AMDGPU::G_SMAX:
2080 case AMDGPU::G_UMIN:
2081 case AMDGPU::G_UMAX:
2082 if (isSALUMapping(MI))
2083 return getDefaultMappingSOP(MI);
2084 LLVM_FALLTHROUGH;
2086 case AMDGPU::G_FADD:
2087 case AMDGPU::G_FSUB:
2088 case AMDGPU::G_FPTOSI:
2089 case AMDGPU::G_FPTOUI:
2090 case AMDGPU::G_FMUL:
2091 case AMDGPU::G_FMA:
2092 case AMDGPU::G_FMAD:
2093 case AMDGPU::G_FSQRT:
2094 case AMDGPU::G_FFLOOR:
2095 case AMDGPU::G_FCEIL:
2096 case AMDGPU::G_FRINT:
2097 case AMDGPU::G_SITOFP:
2098 case AMDGPU::G_UITOFP:
2099 case AMDGPU::G_FPTRUNC:
2100 case AMDGPU::G_FPEXT:
2101 case AMDGPU::G_FEXP2:
2102 case AMDGPU::G_FLOG2:
2103 case AMDGPU::G_FMINNUM:
2104 case AMDGPU::G_FMAXNUM:
2105 case AMDGPU::G_FMINNUM_IEEE:
2106 case AMDGPU::G_FMAXNUM_IEEE:
2107 case AMDGPU::G_FCANONICALIZE:
2108 case AMDGPU::G_INTRINSIC_TRUNC:
2109 case AMDGPU::G_INTRINSIC_ROUND:
2110 return getDefaultMappingVOP(MI);
2111 case AMDGPU::G_UMULH:
2112 case AMDGPU::G_SMULH: {
2113 if (MF.getSubtarget<GCNSubtarget>().hasScalarMulHiInsts() &&
2114 isSALUMapping(MI))
2115 return getDefaultMappingSOP(MI);
2116 return getDefaultMappingVOP(MI);
2118 case AMDGPU::G_IMPLICIT_DEF: {
2119 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2120 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2121 break;
2123 case AMDGPU::G_FCONSTANT:
2124 case AMDGPU::G_CONSTANT:
2125 case AMDGPU::G_FRAME_INDEX:
2126 case AMDGPU::G_GLOBAL_VALUE:
2127 case AMDGPU::G_BLOCK_ADDR: {
2128 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2129 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2130 break;
2132 case AMDGPU::G_INSERT: {
2133 unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
2134 AMDGPU::VGPRRegBankID;
2135 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2136 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2137 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
2138 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
2139 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
2140 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
2141 OpdsMapping[3] = nullptr;
2142 break;
2144 case AMDGPU::G_EXTRACT: {
2145 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2146 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2147 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2148 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
2149 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
2150 OpdsMapping[2] = nullptr;
2151 break;
2153 case AMDGPU::G_BUILD_VECTOR:
2154 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2155 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
2156 if (DstTy == LLT::vector(2, 16)) {
2157 unsigned DstSize = DstTy.getSizeInBits();
2158 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2159 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2160 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2161 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
2163 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
2164 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
2165 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
2166 break;
2169 LLVM_FALLTHROUGH;
2171 case AMDGPU::G_MERGE_VALUES:
2172 case AMDGPU::G_CONCAT_VECTORS: {
2173 unsigned Bank = isSALUMapping(MI) ?
2174 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2175 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2176 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2178 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
2179 // Op1 and Dst should use the same register bank.
2180 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
2181 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
2182 break;
2184 case AMDGPU::G_BITCAST:
2185 case AMDGPU::G_INTTOPTR:
2186 case AMDGPU::G_PTRTOINT:
2187 case AMDGPU::G_CTLZ:
2188 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2189 case AMDGPU::G_CTTZ:
2190 case AMDGPU::G_CTTZ_ZERO_UNDEF:
2191 case AMDGPU::G_CTPOP:
2192 case AMDGPU::G_BSWAP:
2193 case AMDGPU::G_BITREVERSE:
2194 case AMDGPU::G_FABS:
2195 case AMDGPU::G_FNEG: {
2196 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2197 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2198 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
2199 break;
2201 case AMDGPU::G_TRUNC: {
2202 Register Dst = MI.getOperand(0).getReg();
2203 Register Src = MI.getOperand(1).getReg();
2204 unsigned Bank = getRegBankID(Src, MRI, *TRI);
2205 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
2206 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
2207 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
2208 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
2209 break;
2211 case AMDGPU::G_ZEXT:
2212 case AMDGPU::G_SEXT:
2213 case AMDGPU::G_ANYEXT: {
2214 Register Dst = MI.getOperand(0).getReg();
2215 Register Src = MI.getOperand(1).getReg();
2216 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
2217 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
2219 unsigned DstBank;
2220 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
2221 assert(SrcBank);
2222 switch (SrcBank->getID()) {
2223 case AMDGPU::SCCRegBankID:
2224 case AMDGPU::SGPRRegBankID:
2225 DstBank = AMDGPU::SGPRRegBankID;
2226 break;
2227 default:
2228 DstBank = AMDGPU::VGPRRegBankID;
2229 break;
2232 // TODO: Should anyext be split into 32-bit pieces as well?
2233 if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
2234 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
2235 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
2236 } else {
2237 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
2238 // 32-bits, and then to 64.
2239 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
2240 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
2241 SrcSize);
2243 break;
2245 case AMDGPU::G_FCMP: {
2246 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2247 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2248 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2249 OpdsMapping[1] = nullptr; // Predicate Operand.
2250 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
2251 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2252 break;
2254 case AMDGPU::G_STORE: {
2255 assert(MI.getOperand(0).isReg());
2256 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2257 // FIXME: We need to specify a different reg bank once scalar stores
2258 // are supported.
2259 const ValueMapping *ValMapping =
2260 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2261 // FIXME: Depending on the type of store, the pointer could be in
2262 // the SGPR Reg bank.
2263 // FIXME: Pointer size should be based on the address space.
2264 const ValueMapping *PtrMapping =
2265 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
2267 OpdsMapping[0] = ValMapping;
2268 OpdsMapping[1] = PtrMapping;
2269 break;
2272 case AMDGPU::G_ICMP: {
2273 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
2274 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2275 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2276 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
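// A compare can be done on the SALU (producing SCC) only if both operands are
// SGPRs and it is a 32-bit compare, or a 64-bit eq/ne on subtargets with
// scalar 64-bit compares.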
2278 bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
2279 Op3Bank == AMDGPU::SGPRRegBankID &&
2280 (Size == 32 || (Size == 64 &&
2281 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
2282 MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64()));
2284 unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2286 OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
2287 OpdsMapping[1] = nullptr; // Predicate Operand.
2288 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
2289 OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
2290 break;
2292 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2293 unsigned OutputBankID = isSALUMapping(MI) ?
2294 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2295 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2296 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2297 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2299 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
2300 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
2302 // The index can be in either bank if the source vector is in a VGPR.
2303 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2304 break;
2306 case AMDGPU::G_INSERT_VECTOR_ELT: {
2307 unsigned OutputBankID = isSALUMapping(MI) ?
2308 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2310 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2311 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2312 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
2313 unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2314 unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2316 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
2317 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
2318 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize);
2320 // The index can be in either bank if the source vector is in a VGPR.
2321 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2322 break;
2324 case AMDGPU::G_UNMERGE_VALUES: {
2325 unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
2326 AMDGPU::VGPRRegBankID;
2328 // Op1 and Dst should use the same register bank.
2329 // FIXME: Shouldn't this be the default? Why do we need to handle this?
2330 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2331 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
2332 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
2334 break;
2336 case AMDGPU::G_INTRINSIC: {
2337 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
2338 default:
2339 return getInvalidInstructionMapping();
2340 case Intrinsic::amdgcn_div_fmas:
2341 case Intrinsic::amdgcn_trig_preop:
2342 case Intrinsic::amdgcn_sin:
2343 case Intrinsic::amdgcn_cos:
2344 case Intrinsic::amdgcn_log_clamp:
2345 case Intrinsic::amdgcn_rcp:
2346 case Intrinsic::amdgcn_rcp_legacy:
2347 case Intrinsic::amdgcn_rsq:
2348 case Intrinsic::amdgcn_rsq_legacy:
2349 case Intrinsic::amdgcn_rsq_clamp:
2350 case Intrinsic::amdgcn_ldexp:
2351 case Intrinsic::amdgcn_frexp_mant:
2352 case Intrinsic::amdgcn_frexp_exp:
2353 case Intrinsic::amdgcn_fract:
2354 case Intrinsic::amdgcn_cvt_pkrtz:
2355 case Intrinsic::amdgcn_cvt_pknorm_i16:
2356 case Intrinsic::amdgcn_cvt_pknorm_u16:
2357 case Intrinsic::amdgcn_cvt_pk_i16:
2358 case Intrinsic::amdgcn_cvt_pk_u16:
2359 case Intrinsic::amdgcn_fmed3:
2360 case Intrinsic::amdgcn_cubeid:
2361 case Intrinsic::amdgcn_cubema:
2362 case Intrinsic::amdgcn_cubesc:
2363 case Intrinsic::amdgcn_cubetc:
2364 case Intrinsic::amdgcn_sffbh:
2365 case Intrinsic::amdgcn_fmad_ftz:
2366 case Intrinsic::amdgcn_mbcnt_lo:
2367 case Intrinsic::amdgcn_mbcnt_hi:
2368 case Intrinsic::amdgcn_ubfe:
2369 case Intrinsic::amdgcn_sbfe:
2370 case Intrinsic::amdgcn_lerp:
2371 case Intrinsic::amdgcn_sad_u8:
2372 case Intrinsic::amdgcn_msad_u8:
2373 case Intrinsic::amdgcn_sad_hi_u8:
2374 case Intrinsic::amdgcn_sad_u16:
2375 case Intrinsic::amdgcn_qsad_pk_u16_u8:
2376 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
2377 case Intrinsic::amdgcn_mqsad_u32_u8:
2378 case Intrinsic::amdgcn_cvt_pk_u8_f32:
2379 case Intrinsic::amdgcn_alignbit:
2380 case Intrinsic::amdgcn_alignbyte:
2381 case Intrinsic::amdgcn_fdot2:
2382 case Intrinsic::amdgcn_sdot2:
2383 case Intrinsic::amdgcn_udot2:
2384 case Intrinsic::amdgcn_sdot4:
2385 case Intrinsic::amdgcn_udot4:
2386 case Intrinsic::amdgcn_sdot8:
2387 case Intrinsic::amdgcn_udot8:
2388 case Intrinsic::amdgcn_wwm:
2389 case Intrinsic::amdgcn_wqm:
2390 return getDefaultMappingVOP(MI);
2391 case Intrinsic::amdgcn_ds_swizzle:
2392 case Intrinsic::amdgcn_ds_permute:
2393 case Intrinsic::amdgcn_ds_bpermute:
2394 case Intrinsic::amdgcn_update_dpp:
2395 return getDefaultMappingAllVGPR(MI);
2396 case Intrinsic::amdgcn_kernarg_segment_ptr:
2397 case Intrinsic::amdgcn_s_getpc:
2398 case Intrinsic::amdgcn_groupstaticsize: {
2399 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2400 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2401 break;
2403 case Intrinsic::amdgcn_wqm_vote: {
2404 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2405 OpdsMapping[0] = OpdsMapping[2]
2406 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
2407 break;
2409 case Intrinsic::amdgcn_s_buffer_load: {
2410 // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
2411 Register RSrc = MI.getOperand(2).getReg(); // SGPR
2412 Register Offset = MI.getOperand(3).getReg(); // SGPR/imm
2414 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2415 unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
2416 unsigned Size3 = MRI.getType(Offset).getSizeInBits();
2418 unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
2419 unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
2421 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
2422 OpdsMapping[1] = nullptr; // intrinsic id
2424 // Lie and claim everything is legal, even though some need to be
2425 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
2426 OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
2427 OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
2428 OpdsMapping[4] = nullptr;
2429 break;
2431 case Intrinsic::amdgcn_div_scale: {
2432 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2433 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2434 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
2435 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
2437 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
2438 OpdsMapping[3] = AMDGPU::getValueMapping(
2439 getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
2440 OpdsMapping[4] = AMDGPU::getValueMapping(
2441 getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
2443 break;
2445 case Intrinsic::amdgcn_class: {
2446 Register Src0Reg = MI.getOperand(2).getReg();
2447 Register Src1Reg = MI.getOperand(3).getReg();
2448 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
2449 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
2450 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2451 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
2452 OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
2453 Src0Size);
2454 OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
2455 Src1Size);
2456 break;
2458 case Intrinsic::amdgcn_icmp:
2459 case Intrinsic::amdgcn_fcmp: {
2460 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2461 // This is not VCCRegBank because this is not used in boolean contexts.
2462 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
2463 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2464 unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2465 unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2466 OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
2467 OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
2468 break;
2470 case Intrinsic::amdgcn_readlane: {
2471 // This must be an SGPR, but accept a VGPR.
2472 Register IdxReg = MI.getOperand(3).getReg();
2473 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
2474 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2475 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2476 LLVM_FALLTHROUGH;
2478 case Intrinsic::amdgcn_readfirstlane: {
2479 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2480 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2481 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
2482 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
2483 break;
2485 case Intrinsic::amdgcn_writelane: {
2486 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2487 Register SrcReg = MI.getOperand(2).getReg();
2488 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
2489 unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2490 Register IdxReg = MI.getOperand(3).getReg();
2491 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
2492 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2493 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
2495 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
2496 // to legalize them.
2497 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
2498 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2499 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
2500 break;
2502 case Intrinsic::amdgcn_if_break: {
2503 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2504 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2505 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2506 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2507 break;
2510 break;
2512 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2513 auto IntrID = MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID();
2514 switch (IntrID) {
2515 case Intrinsic::amdgcn_s_getreg:
2516 case Intrinsic::amdgcn_s_memtime:
2517 case Intrinsic::amdgcn_s_memrealtime:
2518 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
2519 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2520 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2521 break;
2523 case Intrinsic::amdgcn_ds_append:
2524 case Intrinsic::amdgcn_ds_consume:
2525 case Intrinsic::amdgcn_ds_fadd:
2526 case Intrinsic::amdgcn_ds_fmin:
2527 case Intrinsic::amdgcn_ds_fmax:
2528 case Intrinsic::amdgcn_atomic_inc:
2529 case Intrinsic::amdgcn_atomic_dec:
2530 return getDefaultMappingAllVGPR(MI);
2531 case Intrinsic::amdgcn_ds_ordered_add:
2532 case Intrinsic::amdgcn_ds_ordered_swap: {
2533 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2534 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
2535 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2536 AMDGPU::SGPRRegBankID);
2537 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
2538 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2539 break;
2541 case Intrinsic::amdgcn_exp_compr:
2542 OpdsMapping[0] = nullptr; // IntrinsicID
2543 // FIXME: These are immediate values which can't be read from registers.
2544 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2545 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2546 // FIXME: Could we support packed types here?
2547 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2548 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2549 // FIXME: These are immediate values which can't be read from registers.
2550 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2551 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2552 break;
2553 case Intrinsic::amdgcn_exp:
2554 // FIXME: Could we support packed types here?
2555 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2556 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2557 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2558 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2559 break;
2560 case Intrinsic::amdgcn_buffer_load: {
2561 Register RSrc = MI.getOperand(2).getReg(); // SGPR
2562 Register VIndex = MI.getOperand(3).getReg(); // VGPR
2563 Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm
2565 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2566 unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
2567 unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
2568 unsigned Size4 = MRI.getType(Offset).getSizeInBits();
2570 unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
2571 unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
2573 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
2574 OpdsMapping[1] = nullptr; // intrinsic id
2576 // Lie and claim everything is legal, even though some need to be
2577 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
2578 OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
2579 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
2580 OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
2581 OpdsMapping[5] = nullptr;
2582 OpdsMapping[6] = nullptr;
2583 break;
2585 case Intrinsic::amdgcn_s_sendmsg:
2586 case Intrinsic::amdgcn_s_sendmsghalt: {
2587 // This must be an SGPR, but accept a VGPR.
2588 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2589 AMDGPU::SGPRRegBankID);
2590 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
2591 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
2592 break;
2594 case Intrinsic::amdgcn_end_cf: {
2595 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2596 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2597 break;
2599 case Intrinsic::amdgcn_else: {
2600 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2601 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2602 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
2603 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
2604 break;
2606 case Intrinsic::amdgcn_kill: {
2607 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2608 break;
2610 case Intrinsic::amdgcn_raw_buffer_load:
2611 case Intrinsic::amdgcn_raw_tbuffer_load: {
2612 // FIXME: Should make intrinsic ID the last operand of the instruction,
2613 // then this would be the same as store
2614 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
2615 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2616 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2617 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2618 break;
2620 case Intrinsic::amdgcn_raw_buffer_store:
2621 case Intrinsic::amdgcn_raw_buffer_store_format:
2622 case Intrinsic::amdgcn_raw_tbuffer_store: {
2623 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
2624 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2625 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2626 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2627 break;
2629 case Intrinsic::amdgcn_struct_buffer_load:
2630 case Intrinsic::amdgcn_struct_tbuffer_load: {
2631 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
2632 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2633 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2634 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2635 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
2636 break;
2638 case Intrinsic::amdgcn_struct_buffer_store:
2639 case Intrinsic::amdgcn_struct_tbuffer_store: {
2640 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
2641 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2642 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2643 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2644 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
2645 break;
2647 default:
2648 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
2649 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
2650 // Non-images can have complications from operands that allow both SGPR
2651 // and VGPR. For now it's too complicated to figure out the final opcode
2652 // to derive the register bank from the MCInstrDesc.
2653 if (RSrcIntrin->IsImage)
2654 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
2657 return getInvalidInstructionMapping();
2659 break;
2661 case AMDGPU::G_SELECT: {
2662 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2663 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2664 AMDGPU::SGPRRegBankID);
2665 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
2666 AMDGPU::SGPRRegBankID);
2667 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
2668 Op3Bank == AMDGPU::SGPRRegBankID;
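// The select can stay on the SALU only when both value inputs are SGPRs and
// the condition ends up in SCC; a VCC condition or a VGPR input makes the
// result a VGPR.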
2670 unsigned CondBankDefault = SGPRSrcs ?
2671 AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2672 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2673 CondBankDefault);
2674 if (CondBank == AMDGPU::SGPRRegBankID)
2675 CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2676 else if (CondBank == AMDGPU::VGPRRegBankID)
2677 CondBank = AMDGPU::VCCRegBankID;
2679 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ?
2680 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2682 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID);
2684 if (Size == 64) {
2685 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2686 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
2687 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2688 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2689 } else {
2690 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
2691 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
2692 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
2693 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
2696 break;
2699 case AMDGPU::G_LOAD:
2700 case AMDGPU::G_ZEXTLOAD:
2701 case AMDGPU::G_SEXTLOAD:
2702 return getInstrMappingForLoad(MI);
2704 case AMDGPU::G_ATOMICRMW_XCHG:
2705 case AMDGPU::G_ATOMICRMW_ADD:
2706 case AMDGPU::G_ATOMICRMW_SUB:
2707 case AMDGPU::G_ATOMICRMW_AND:
2708 case AMDGPU::G_ATOMICRMW_OR:
2709 case AMDGPU::G_ATOMICRMW_XOR:
2710 case AMDGPU::G_ATOMICRMW_MAX:
2711 case AMDGPU::G_ATOMICRMW_MIN:
2712 case AMDGPU::G_ATOMICRMW_UMAX:
2713 case AMDGPU::G_ATOMICRMW_UMIN:
2714 case AMDGPU::G_ATOMICRMW_FADD:
2715 case AMDGPU::G_ATOMIC_CMPXCHG: {
2716 return getDefaultMappingAllVGPR(MI);
2718 case AMDGPU::G_BRCOND: {
2719 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
2720 AMDGPU::SGPRRegBankID);
2721 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
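// An SCC condition can use a scalar branch; anything else is treated as a
// VCC lane mask.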
2722 if (Bank != AMDGPU::SCCRegBankID)
2723 Bank = AMDGPU::VCCRegBankID;
2725 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
2726 break;
2730 return getInstructionMapping(/*ID*/1, /*Cost*/1,
2731 getOperandsMapping(OpdsMapping),
2732 MI.getNumOperands());