//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      Register Reg = Op.getReg();
      if (MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      // FIXME: This might not be enough to detect when SCC should be used.
      if (MRI.getType(Reg) == LLT::scalar(1))
        RB = (NewBank == &AMDGPU::SGPRRegBank ?
              &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {}
};

} // anonymous namespace
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
    : AMDGPUGenRegisterBankInfo(),
      TRI(static_cast<const SIRegisterInfo*>(&TRI)) {

  // HACK: Until this is fully tablegen'd.
  static bool AlreadyInit = false;
  if (AlreadyInit)
    return;

  AlreadyInit = true;

  const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID);
  (void)RBSGPR;
  assert(&RBSGPR == &AMDGPU::SGPRRegBank);

  const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID);
  (void)RBVGPR;
  assert(&RBVGPR == &AMDGPU::VGPRRegBank);
}
unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      Src.getID() == AMDGPU::VGPRRegBankID) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SCCRegBankID ||
       Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  if (Dst.getID() == AMDGPU::SCCRegBankID &&
      Src.getID() == AMDGPU::VCCRegBankID)
    return std::numeric_limits<unsigned>::max();

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.
  //
  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}
const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
    const TargetRegisterClass &RC) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  return TRI->isSGPRClass(&RC) ? AMDGPU::SGPRRegBank : AMDGPU::VGPRRegBank;
}
template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
  const MachineInstr &MI, const MachineRegisterInfo &MRI,
  const std::array<unsigned, NumOps> RegSrcOpIdx,
  ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane index, old value
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load: {
    static const OpRegBankEntry<3> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Waterfall loop needed for rsrc. In the worst case this will execute
      // approximately an extra 10 * wavesize + 2 instructions.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
    };

    // rsrc, voffset, offset
    const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}
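// Check whether a load's single memory operand is known to be uniform (the
// same address in every lane), which makes it a candidate for a scalar (SMRD)
// load.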
static bool isInstrUniform(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  return AMDGPUInstrInfo::isUniformMMO(MMO);
}
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, { 0 }, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &SGPRMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&SGPRMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 10, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(
      3, 3, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SVMapping);

    // SGPR in LHS is slightly preferable, so make it VS more expensive than
    // SV.
    const InstructionMapping &VSMapping = getInstructionMapping(
      3, 4, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VSMapping);
    break;
  }
  case TargetOpcode::G_LOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
    // FIXME: Should we be hard coding the size for these mappings?
    if (isInstrUniform(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
        2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 1, getOperandsMapping(
        {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
      2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_ICMP: {
    unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SVMapping);

    const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX: {
    static const OpRegBankEntry<3> Table[4] = {
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Scalar requires cmp+select, and extends if 16-bit.
      // FIXME: Should there be separate costs for 32 and 16-bit
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
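/// Split the 64-bit value \p Reg into two 32-bit halves with
/// G_UNMERGE_VALUES, appending the new registers to \p Regs and assigning
/// them the same register bank as \p Reg.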
void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}
/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}
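/// Return the type with half the width of \p Ty: half the element count for
/// vectors, half the scalar size otherwise.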
static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getNumElements() % 2 == 0);
    return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
  }

  assert(Ty.getSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getSizeInBits() / 2);
}
/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity to try to compare the values in order to
/// identify the unique values used.
void AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineFunction *MF = MI.getParent()->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock::iterator I(MI);

  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  // Use a set to avoid extra readfirstlanes in the case where multiple
  // operands are the same register.
  SmallSet<Register, 4> SGPROperandRegs;
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() == AMDGPU::VGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  if (SGPROperandRegs.empty())
    return;
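  // Give each result an undef initial value from the preheader and a phi in
  // the loop header, so uses of the result always see a register defined on
  // every path.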
  MachineIRBuilder B(MI);
  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;
  for (MachineOperand &Def : MI.defs()) {
    LLT ResTy = MRI.getType(Def.getReg());
    const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
    ResultRegs.push_back(Def.getReg());
    Register InitReg = B.buildUndef(ResTy).getReg(0);
    Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
    InitResultRegs.push_back(InitReg);
    PhiRegs.push_back(PhiReg);
    MRI.setRegBank(PhiReg, *DefBank);
    MRI.setRegBank(InitReg, *DefBank);
  }

  Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
  Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());
  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result)) // Mid-loop value.
      .addMBB(LoopBB);
  }

  // Move the instruction into the loop.
  LoopBB->splice(LoopBB->end(), &MBB, I);
  I = std::prev(LoopBB->end());

  B.setInstr(*I);
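  // CondReg accumulates, across all of the SGPR-constrained operands, the
  // comparison of each lane's value with the value chosen by readfirstlane for
  // this iteration.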
  Register CondReg;

  for (MachineOperand &Op : MI.uses()) {
    if (!Op.isReg())
      continue;

    assert(!Op.isDef());
    if (SGPROperandRegs.count(Op.getReg())) {
      LLT OpTy = MRI.getType(Op.getReg());
      unsigned OpSize = OpTy.getSizeInBits();

      // Can only do a readlane of 32-bit pieces.
      if (OpSize == 32) {
        // Avoid extra copies in the simple case of one 32-bit register.
        Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.setType(CurrentLaneOpReg, OpTy);

        constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
        // Read the next variant <- also loop target.
        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg)
          .addReg(Op.getReg());

        Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
        bool First = CondReg == AMDGPU::NoRegister;
        if (First)
          CondReg = NewCondReg;

        // Compare the just read M0 value to all possible Idx values.
        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
          .addDef(NewCondReg)
          .addReg(CurrentLaneOpReg)
          .addReg(Op.getReg());
        Op.setReg(CurrentLaneOpReg);

        if (!First) {
          Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

          // If there are multiple operands to consider, AND the conditions
          // together.
          B.buildInstr(AMDGPU::S_AND_B64)
            .addDef(AndReg)
            .addReg(NewCondReg)
            .addReg(CondReg);
          CondReg = AndReg;
        }
      } else {
        LLT S32 = LLT::scalar(32);
        SmallVector<Register, 8> ReadlanePieces;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.

        bool Is64 = OpSize % 64 == 0;

        LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
        unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
                                          : AMDGPU::V_CMP_EQ_U32_e64;

        // Insert the unmerge before the loop.
        B.setMBB(MBB);
        auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
        B.setInstr(*I);

        unsigned NumPieces = Unmerge->getNumOperands() - 1;
        for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
          Register UnmergePiece = Unmerge.getReg(PieceIdx);

          Register CurrentLaneOpReg;
          if (Is64) {
            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegLo)
              .addReg(UnmergePiece, 0, AMDGPU::sub0);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegHi)
              .addReg(UnmergePiece, 0, AMDGPU::sub1);

            CurrentLaneOpReg =
              B.buildMerge(LLT::scalar(64),
                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
              .getReg(0);

            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

            if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, use the
              // merged pieces.
              ReadlanePieces.push_back(CurrentLaneOpReg);
            } else {
              // 32-bit element type.
              ReadlanePieces.push_back(CurrentLaneOpRegLo);
              ReadlanePieces.push_back(CurrentLaneOpRegHi);
            }
          } else {
            CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpReg)
              .addReg(UnmergePiece);
            ReadlanePieces.push_back(CurrentLaneOpReg);
          }

          Register NewCondReg
            = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          B.buildInstr(CmpOp)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(UnmergePiece);

          if (!First) {
            Register AndReg
              = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

            // If there are multiple operands to consider, AND the conditions
            // together.
            B.buildInstr(AMDGPU::S_AND_B64)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        }

        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
        // BUILD_VECTOR
        if (OpTy.isVector()) {
          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        } else {
          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        }

        MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
      }
    }
  }
  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(AMDGPU::S_XOR_B64_term)
    .addDef(AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
    .addReg(AMDGPU::EXEC);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(AMDGPU::S_MOV_B64_term)
    .addDef(AMDGPU::EXEC)
    .addReg(SaveExecReg);
}
// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank != &AMDGPU::VGPRRegBank)
    return;

  MachineIRBuilder B(MI);
  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
    .addDef(SGPR)
    .addReg(Reg);

  const TargetRegisterClass *Constrained =
      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(SGPR);
}
// When regbankselect repairs registers, it will insert a repair instruction
// which defines the repaired register. Then it calls applyMapping and expects
// that the targets will either delete or rewrite the original instructions
// that wrote to the repaired registers. Because of this, we end up in a
// situation where we have 2 instructions defining the same registers.
static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
                                     Register Reg,
                                     const MachineInstr &MI) {
  // Is there some way we can assert that there are exactly 2 def instructions?
  for (MachineInstr &Other : MRI.def_instructions(Reg)) {
    if (&Other != &MI)
      return &Other;
  }

  return nullptr;
}
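// Split a VGPR load that is wider than the widest supported non-scalar load
// (128 bits) into 128-bit pieces using the legalizer, then rebuild the wide
// result from the pieces. Returns true if the instruction was changed.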
bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                                              MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;
  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));

  // If the pointer is an SGPR, we have nothing to do.
  if (SrcRegs.empty())
    return false;

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // We want to get the repair instruction now, because it will help us
  // determine which instruction the legalizer inserts that will also
  // write to DstReg.
  MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  MachineIRBuilder B(MI);

  unsigned SplitElts =
      MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
  const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
  ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
  GISelObserverWrapper Observer(&O);
  B.setChangeObserver(Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);
  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
    return false;

  // At this point, the legalizer has split the original load into smaller
  // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
  // that combines the outputs of the lower loads and writes it to DstReg.
  // The register bank selector has also added the RepairInst which writes to
  // DstReg as well.

  MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);

  // Replace the output of the LegalizedInst with a temporary register, since
  // RepairInst already defines DstReg.
  Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
  LegalizedInst->getOperand(0).setReg(TmpReg);
  B.setInsertPt(*RepairInst->getParent(), RepairInst);

  for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
    Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    B.buildConstant(IdxReg, DefIdx);
    MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
    B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
  }

  MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
  return true;
}
// For cases where only a single copy is inserted for matching register banks,
// replace the register in the instruction operand.
static void substituteSimpleCopyRegs(
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
  if (!SrcReg.empty()) {
    assert(SrcReg.size() == 1);
    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
  }
}
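// Apply mapping decisions that need more than the default copy insertion,
// such as splitting 64-bit operations into 32-bit halves, promoting 16-bit
// scalar ops, or wrapping an instruction in a waterfall loop.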
void AMDGPURegisterBankInfo::applyMappingImpl(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  unsigned Opc = MI.getOpcode();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  switch (Opc) {
  case AMDGPU::G_SELECT: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src1Regs.empty() && Src2Regs.empty());
      break;
    }

    MachineIRBuilder B(MI);
    if (Src0Regs.empty())
      Src0Regs.push_back(MI.getOperand(1).getReg());
    else {
      assert(Src0Regs.size() == 1);
    }

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else {
      setRegsToType(MRI, Src1Regs, HalfTy);
    }

    if (Src2Regs.empty())
      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
    else
      setRegsToType(MRI, Src2Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
    B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);

    MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
    // there is a VGPR input.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);
    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src0Regs.empty() && Src1Regs.empty());
      break;
    }

    assert(DefRegs.size() == 2);
    assert(Src0Regs.size() == Src1Regs.size() &&
           (Src0Regs.empty() || Src0Regs.size() == 2));

    // Depending on where the source registers came from, the generic code may
    // have decided to split the inputs already or not. If not, we still need to
    // extract the values.
    MachineIRBuilder B(MI);

    if (Src0Regs.empty())
      split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
    else
      setRegsToType(MRI, Src0Regs, HalfTy);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildInstr(Opc)
      .addDef(DefRegs[0])
      .addUse(Src0Regs[0])
      .addUse(Src1Regs[0]);

    B.buildInstr(Opc)
      .addDef(DefRegs[1])
      .addUse(Src0Regs[1])
      .addUse(Src1Regs[1]);

    MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(16))
      break;

    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    MachineFunction *MF = MI.getParent()->getParent();
    MachineIRBuilder B(MI);
    ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
    GISelObserverWrapper Observer(&ApplySALU);
    LegalizerHelper Helper(*MF, Observer, B);

    if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
        LegalizerHelper::Legalized)
      llvm_unreachable("widen scalar should have succeeded");
    return;
  }
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    Register DstReg = MI.getOperand(0).getReg();
    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    MachineFunction *MF = MI.getParent()->getParent();
    MachineIRBuilder B(MI);
    ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
    GISelObserverWrapper Observer(&ApplySALU);
    LegalizerHelper Helper(*MF, Observer, B);

    // Turn scalar min/max into a compare and select.
    LLT Ty = MRI.getType(DstReg);
    LLT S32 = LLT::scalar(32);
    LLT S16 = LLT::scalar(16);

    if (Ty == S16) {
      // Need to widen to s32, and expand as cmp + select.
      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widenScalar should have succeeded");

      // FIXME: This is relying on widenScalar leaving MI in place.
      if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("lower should have succeeded");
    } else {
      if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized)
        llvm_unreachable("lower should have succeeded");
    }

    return;
  }
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    bool Signed = Opc == AMDGPU::G_SEXT;

    MachineIRBuilder B(MI);
    const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);

    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::SCCRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that rounds to s64 when irregular
        // breakdowns are supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      const LLT S32 = LLT::scalar(32);
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      // Extend to 32-bit, and then extend the low half.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(DefRegs[0], SrcReg);

        // Replicate sign bit from 32-bit extended part.
        auto ShiftAmt = B.buildConstant(S32, 31);
        MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
        B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
      } else {
        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
        B.buildConstant(DefRegs[1], 0);
      }

      MRI.setRegBank(DstReg, *SrcBank);
      MI.eraseFromParent();
      return;
    }

    if (SrcTy != LLT::scalar(1))
      return;

    if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ?
        &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank;

      unsigned DstSize = DstTy.getSizeInBits();
      // 64-bit select is SGPR only
      const bool UseSel64 = DstSize > 32 &&
        SrcBank->getID() == AMDGPU::SCCRegBankID;

      // TODO: Should s16 select be legal?
      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
      auto True = B.buildConstant(SelType, Signed ? -1 : 1);
      auto False = B.buildConstant(SelType, 0);

      MRI.setRegBank(True.getReg(0), *DstBank);
      MRI.setRegBank(False.getReg(0), *DstBank);
      MRI.setRegBank(DstReg, *DstBank);

      if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) {
        B.buildSelect(DefRegs[0], SrcReg, True, False);
        B.buildCopy(DefRegs[1], DefRegs[0]);
      } else if (DstSize < 32) {
        auto Sel = B.buildSelect(SelType, SrcReg, True, False);
        MRI.setRegBank(Sel.getReg(0), *DstBank);
        B.buildTrunc(DstReg, Sel);
      } else {
        B.buildSelect(DstReg, SrcReg, True, False);
      }

      MI.eraseFromParent();
      return;
    }

    // Fix up the case with an s1 source that isn't a condition register. Use
    // shifts instead of introducing a compare to avoid an unnecessary condition
    // register (and since there are no scalar 16-bit compares).
    auto Ext = B.buildAnyExt(DstTy, SrcReg);
    auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
    auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);

    if (MI.getOpcode() == AMDGPU::G_SEXT)
      B.buildAShr(DstReg, Shl, ShiftAmt);
    else
      B.buildLShr(DstReg, Shl, ShiftAmt);

    MRI.setRegBank(DstReg, *SrcBank);
    MRI.setRegBank(Ext.getReg(0), *SrcBank);
    MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
    MRI.setRegBank(Shl.getReg(0), *SrcBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT:
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, { 2 });
    return;
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    case Intrinsic::amdgcn_s_buffer_load: {
      // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
      executeInWaterfallLoop(MI, MRI, { 2, 3 });
      return;
    }
    case Intrinsic::amdgcn_readlane: {
      substituteSimpleCopyRegs(OpdMapper, 2);

      assert(empty(OpdMapper.getVRegs(0)));
      assert(empty(OpdMapper.getVRegs(3)));

      // Make sure the index is an SGPR. It doesn't make sense to run this in a
      // waterfall loop, so assume it's a uniform value.
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_writelane: {
      assert(empty(OpdMapper.getVRegs(0)));
      assert(empty(OpdMapper.getVRegs(2)));
      assert(empty(OpdMapper.getVRegs(3)));

      substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
      constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    default:
      break;
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    case Intrinsic::amdgcn_buffer_load: {
      executeInWaterfallLoop(MI, MRI, { 2 });
      return;
    }
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
      assert(empty(OpdMapper.getVRegs(0)));
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // FIXME: Should this use a waterfall loop?
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    default:
      break;
    }
    break;
  }
  case AMDGPU::G_LOAD: {
    if (applyMappingWideLoad(MI, OpdMapper, MRI))
      return;
    break;
  }
  default:
    break;
  }

  return applyDefaultMapping(OpdMapper);
}
bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    if (!MI.getOperand(i).isReg())
      continue;
    Register Reg = MI.getOperand(i).getReg();
    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
      if (Bank->getID() == AMDGPU::VGPRRegBankID)
        return false;

      assert(Bank->getID() == AMDGPU::SGPRRegBankID ||
             Bank->getID() == AMDGPU::SCCRegBankID);
    }
  }
  return true;
}
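// Produce a mapping that keeps every operand on the SALU, using SCC for s1
// values and SGPRs for everything else.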
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
    OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
  }
  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
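// Produce a VALU mapping: the result goes in a VGPR; inputs go in VGPRs, with
// s1 inputs in VCC; an intrinsic ID operand, if present, is left unmapped.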
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
  unsigned OpdIdx = 0;

  unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);

  if (MI.getOperand(OpdIdx).isIntrinsicID())
    OpdsMapping[OpdIdx++] = nullptr;

  Register Reg1 = MI.getOperand(OpdIdx).getReg();
  unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);

  unsigned DefaultBankID = Size1 == 1 ?
    AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
  unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);

  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);

  for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
    const MachineOperand &MO = MI.getOperand(OpdIdx);
    if (!MO.isReg())
      continue;

    unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
    OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
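// Produce a mapping that puts every register operand in a VGPR, regardless of
// size.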
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Op = MI.getOperand(I);
    if (!Op.isReg())
      continue;

    unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
  unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  if (isInstrUniform(MI)) {
    // We have a uniform instruction so we want to use an SMRD load
    ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
  } else {
    ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
    // FIXME: What would happen if we used SGPRRegBankID here?
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;
  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
  return Mapping;

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
}
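/// Return the ID of the register bank \p Reg is currently assigned to, or
/// \p Default if it has not been assigned a bank yet.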
unsigned
AMDGPURegisterBankInfo::getRegBankID(Register Reg,
                                     const MachineRegisterInfo &MRI,
                                     const TargetRegisterInfo &TRI,
                                     unsigned Default) const {

  const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
  return Bank ? Bank->getID() : Default;
}

/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR-to-SGPR copy to be generated is illegal.
///
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  if (MI.isRegSequence()) {
    // If any input is a VGPR, the result must be a VGPR. The default handling
    // assumes any copy between banks is legal.
    unsigned BankID = AMDGPU::SGPRRegBankID;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
      // It doesn't make sense to use vcc or scc banks here, so just ignore
      // them.
      if (OpBank != AMDGPU::SGPRRegBankID) {
        BankID = AMDGPU::VGPRRegBankID;
        break;
      }
    }
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }

  // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
  // properly.
  //
  // TODO: There are additional exec masking dependencies to analyze.
  if (MI.getOpcode() == TargetOpcode::G_PHI) {
    // TODO: Generate proper invalid bank enum.
    int ResultBank = -1;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      Register Reg = MI.getOperand(I).getReg();
      const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);

      // FIXME: Assuming VGPR for any undetermined inputs.
      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
        ResultBank = AMDGPU::VGPRRegBankID;
        break;
      }

      unsigned OpBank = Bank->getID();
      // scc, scc -> sgpr
      if (OpBank == AMDGPU::SCCRegBankID) {
        // There's only one SCC register, so a phi requires copying to SGPR.
        OpBank = AMDGPU::SGPRRegBankID;
      } else if (OpBank == AMDGPU::VCCRegBankID) {
        // vcc, vcc -> vcc
        // vcc, sgpr -> vgpr
        if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
          ResultBank = AMDGPU::VGPRRegBankID;
          break;
        }
      }

      ResultBank = OpBank;
    }

    assert(ResultBank != -1);

    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    const ValueMapping &ValMap =
        getValueMapping(0, Size, getRegBank(ResultBank));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }

  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
  if (Mapping.isValid())
    return Mapping;

  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  switch (MI.getOpcode()) {
  default:
    return getInvalidInstructionMapping();
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if (Size == 1) {
      const RegisterBank *DstBank
        = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);

      unsigned TargetBankID = -1;
      unsigned BankLHS = -1;
      unsigned BankRHS = -1;
      if (DstBank) {
        TargetBankID = DstBank->getID();
        if (DstBank == &AMDGPU::VCCRegBank) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else if (DstBank == &AMDGPU::SCCRegBank) {
          TargetBankID = AMDGPU::SCCRegBankID;
          BankLHS = AMDGPU::SGPRRegBankID;
          BankRHS = AMDGPU::SGPRRegBankID;
        } else {
          BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
          BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
        }
      } else {
        BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                               AMDGPU::VCCRegBankID);
        BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                               AMDGPU::VCCRegBankID);
      }
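
      // A sketch of the join rules applied below: any VGPR input forces the
      // whole operation to the VALU; a vcc input keeps it as a wave-mask
      // AND/OR/XOR in vcc; all-SGPR inputs can use a 32-bit SALU op (e.g.
      // s_and_b32), which also defines SCC as a side effect, hence the SCC
      // result bank.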
      // Both inputs should be true booleans to produce a boolean result.
      if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
        TargetBankID = AMDGPU::VGPRRegBankID;
      } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
        TargetBankID = AMDGPU::VCCRegBankID;
        BankLHS = AMDGPU::VCCRegBankID;
        BankRHS = AMDGPU::VCCRegBankID;
      } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
        TargetBankID = AMDGPU::SGPRRegBankID;
      } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
        // The operation must be done on a 32-bit register, but it will set
        // scc. The result type could interchangeably be SCC or SGPR, since
        // both values will be produced.
        TargetBankID = AMDGPU::SCCRegBankID;
        BankLHS = AMDGPU::SGPRRegBankID;
        BankRHS = AMDGPU::SGPRRegBankID;
      }

      OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
      break;
    }
    if (Size == 64) {
      if (isSALUMapping(MI)) {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
        OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
      } else {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
        unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
        OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);

        unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
        OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
      }

      break;
    }

    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_GEP:
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_SADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_SSUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE:
  case AMDGPU::G_UMULH:
  case AMDGPU::G_SMULH:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX:
    if (isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    LLVM_FALLTHROUGH;

  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FPTOSI:
  case AMDGPU::G_FPTOUI:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_SITOFP:
  case AMDGPU::G_UITOFP:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FEXP2:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUND:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_IMPLICIT_DEF: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FCONSTANT:
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_BLOCK_ADDR: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_INSERT: {
    unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
                                          AMDGPU::VGPRRegBankID;
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
    OpdsMapping[3] = nullptr;
    break;
  }
  case AMDGPU::G_EXTRACT: {
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = nullptr;
    break;
  }
  case AMDGPU::G_MERGE_VALUES:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_CONCAT_VECTORS: {
    unsigned Bank = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    // Op1 and Dst should use the same register bank.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_INTTOPTR:
  case AMDGPU::G_PTRTOINT:
  case AMDGPU::G_CTLZ:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ:
  case AMDGPU::G_CTTZ_ZERO_UNDEF:
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BSWAP:
  case AMDGPU::G_BITREVERSE:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FNEG: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_TRUNC: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned Bank = getRegBankID(Src, MRI, *TRI);
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ANYEXT: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);

    unsigned DstBank;
    const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
    assert(SrcBank);
    switch (SrcBank->getID()) {
    case AMDGPU::SCCRegBankID:
    case AMDGPU::SGPRRegBankID:
      DstBank = AMDGPU::SGPRRegBankID;
      break;
    default:
      DstBank = AMDGPU::VGPRRegBankID;
      break;
    }

    // TODO: Should anyext be split into 32-bit part as well?
    if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
      OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
    } else {
      // Scalar extend can use 64-bit BFE, but VGPRs require extending to
      // 32-bits, and then to 64.
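      // (Illustrative, not the exact selected sequence: an SGPR sext from
      // s32 to s64 can be a single s_bfe_i64, while the VGPR path extends to
      // 32 bits first and then derives the high half, e.g. with a
      // v_ashrrev_i32 by 31.)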
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
                                                         SrcSize);
    }
    break;
  }
  case AMDGPU::G_FCMP: {
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_STORE: {
    assert(MI.getOperand(0).isReg());
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    // FIXME: We need to specify a different reg bank once scalar stores
    // are supported.
    const ValueMapping *ValMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    // FIXME: Depending on the type of store, the pointer could be in
    // the SGPR Reg bank.
    // FIXME: Pointer size should be based on the address space.
    const ValueMapping *PtrMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);

    OpdsMapping[0] = ValMapping;
    OpdsMapping[1] = PtrMapping;
    break;
  }
  case AMDGPU::G_ICMP: {
    auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
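
    // The compare can only stay on the SALU, producing SCC, when both inputs
    // are SGPRs and the subtarget has a scalar compare of this width. As a
    // sketch: 32-bit compares have s_cmp_* forms, but a scalar 64-bit eq/ne
    // is only available when hasScalarCompareEq64() is true; everything else
    // becomes a VALU compare writing a vcc wave mask.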
    bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
                     Op3Bank == AMDGPU::SGPRRegBankID &&
                     (Size == 32 || (Size == 64 &&
                      (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
                       MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64()));

    unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize);

    // The index can be in either bank if the source vector is a VGPR.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
    unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize);

    // The index can be in either bank if the source vector is a VGPR.
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_UNMERGE_VALUES: {
    unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
      AMDGPU::VGPRRegBankID;

    // Op1 and Dst should use the same register bank.
    // FIXME: Shouldn't this be the default? Why do we need to handle this?
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
    }
    break;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_ubfe:
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbit:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_wqm:
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
      Register RSrc = MI.getOperand(2).getReg();   // SGPR
      Register Offset = MI.getOperand(3).getReg(); // SGPR/imm

      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
      unsigned Size3 = MRI.getType(Offset).getSizeInBits();

      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
      OpdsMapping[1] = nullptr; // intrinsic id

      // Lie and claim everything is legal, even though some need to be
      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
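      // (A sketch of the waterfall idea, not the exact expansion: loop with
      // v_readfirstlane_b32 to pick a candidate value, compare it against
      // each lane's value, run the scalar operation for the matching lanes
      // under a saved/masked exec, and repeat until every lane is serviced.)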
      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
      OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
      OpdsMapping[4] = nullptr;
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(
        getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(
        getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);

      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
                                               Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
                                               Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
      unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These two must be SGPRs, but accept VGPRs; a readfirstlane will be
      // inserted to legalize them.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume:
    case Intrinsic::amdgcn_ds_fadd:
    case Intrinsic::amdgcn_ds_fmin:
    case Intrinsic::amdgcn_ds_fmax:
    case Intrinsic::amdgcn_atomic_inc:
    case Intrinsic::amdgcn_atomic_dec:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[0] = nullptr; // IntrinsicID
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      OpdsMapping[0] = nullptr; // IntrinsicID
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_buffer_load: {
      Register RSrc = MI.getOperand(2).getReg();   // SGPR
      Register VIndex = MI.getOperand(3).getReg(); // VGPR
      Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm

      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
      unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
      unsigned Size4 = MRI.getType(Offset).getSizeInBits();

      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
      OpdsMapping[1] = nullptr; // intrinsic id

      // Lie and claim everything is legal, even though some need to be
      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
      OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
      OpdsMapping[5] = nullptr;
      OpdsMapping[6] = nullptr;
      break;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;
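
    // A sketch of the intended lowering: an all-SGPR select with an SCC
    // condition can become s_cselect_b32/b64; otherwise the condition is
    // normalized to a vcc mask and the select becomes v_cndmask_b32, with
    // 64-bit values handled as two 32-bit halves (which is what the
    // SGPR64Only mappings below express).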
    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID);

    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }
  case AMDGPU::G_LOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    return getDefaultMappingAllVGPR(MI);
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SCCRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}