//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

static cl::opt<bool> AllowRiskySelect(
  "amdgpu-global-isel-risky-select",
  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
  cl::init(false),
  cl::ReallyHidden);

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget
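
// Note: AMDGPUGenGlobalISel.inc is the TableGen-emitted matcher; including it
// under GET_GLOBALISEL_IMPL provides the selectImpl() used as the first-chance
// selector below. The AMDGPUSubtarget -> GCNSubtarget #define above makes the
// emitted predicate code bind to the GCN subtarget class.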

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
                                        CodeGenCoverage &CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
           Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.RemoveOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Optional<ValueAndVReg> ConstVal =
        getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true);
      if (ConstVal) {
        unsigned MovOpc =
          STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        unsigned AndOpc =
          TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
        BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
            .addImm(1)
            .addReg(SrcReg);
        BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
            .addImm(0)
            .addReg(MaskedReg);
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
            TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }

  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);
  if (DefTy == LLT::scalar(1)) {
    if (!AllowRiskySelect) {
      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
  }

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
            .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());
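
  // Illustrative split (value assumed for the example): for Imm =
  // 0x1122334455667788, sub0 selects the low half 0x55667788 and sub1 the
  // high half 0x11223344.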

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true, // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
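      // The VOP3 (_e64) form also carries a trailing clamp operand and an
      // implicit use of EXEC, neither of which the generic G_ADD/G_SUB had;
      // append both here (clamp = 0, i.e. no clamping).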
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  // Recombine the two 32-bit halves into the 64-bit destination.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
  MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
      IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();
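
  // For the carry-in variants (G_UADDE/G_USUBE), materialize the incoming
  // carry bit in SCC first so that the S_ADDC_U32/S_SUBB_U32 selected below
  // consumes it.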
  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
    .addReg(AMDGPU::SCC);

  if (!MRI->getRegClassOrNull(Dst1Reg))
    MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
  MachineInstr &MI) const {
  if (selectImpl(MI, *CoverageInfo))
    return true;

  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != V2S16)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
    return false;

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI->getType(Src0) != S32)
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  auto ConstSrc1 =
      getConstantVRegValWithLookThrough(Src1, *MRI, true, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getConstantVRegValWithLookThrough(Src0, *MRI, true, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
, &MI
, DL
, TII
.get(AMDGPU::S_MOV_B32
), Dst
)
623 .addImm(Lo16
| (Hi16
<< 16));
624 MI
.eraseFromParent();
625 return RBI
.constrainGenericRegister(Dst
, AMDGPU::SReg_32RegClass
, *MRI
);
629 // TODO: This should probably be a combine somewhere
630 // (build_vector_trunc $src0, undef -> copy $src0
631 MachineInstr
*Src1Def
= getDefIgnoringCopies(Src1
, *MRI
);
632 if (Src1Def
&& Src1Def
->getOpcode() == AMDGPU::G_IMPLICIT_DEF
) {
633 MI
.setDesc(TII
.get(AMDGPU::COPY
));
635 return RBI
.constrainGenericRegister(Dst
, AMDGPU::SReg_32RegClass
, *MRI
) &&
636 RBI
.constrainGenericRegister(Src0
, AMDGPU::SReg_32RegClass
, *MRI
);
642 // With multiple uses of the shift, this will duplicate the shift and
643 // increase register pressure.
645 // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
646 // => (S_PACK_HH_B32_B16 $src0, $src1)
647 // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
648 // => (S_PACK_LH_B32_B16 $src0, $src1)
649 // (build_vector_trunc $src0, $src1)
650 // => (S_PACK_LL_B32_B16 $src0, $src1)
652 bool Shift0
= mi_match(
653 Src0
, *MRI
, m_OneUse(m_GLShr(m_Reg(ShiftSrc0
), m_SpecificICst(16))));
655 bool Shift1
= mi_match(
656 Src1
, *MRI
, m_OneUse(m_GLShr(m_Reg(ShiftSrc1
), m_SpecificICst(16))));
658 unsigned Opc
= AMDGPU::S_PACK_LL_B32_B16
;
659 if (Shift0
&& Shift1
) {
660 Opc
= AMDGPU::S_PACK_HH_B32_B16
;
661 MI
.getOperand(1).setReg(ShiftSrc0
);
662 MI
.getOperand(2).setReg(ShiftSrc1
);
664 Opc
= AMDGPU::S_PACK_LH_B32_B16
;
665 MI
.getOperand(2).setReg(ShiftSrc1
);
666 } else if (Shift0
&& ConstSrc1
&& ConstSrc1
->Value
== 0) {
667 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
668 auto MIB
= BuildMI(*BB
, &MI
, DL
, TII
.get(AMDGPU::S_LSHR_B32
), Dst
)
672 MI
.eraseFromParent();
673 return constrainSelectedInstRegOperands(*MIB
, TII
, TRI
, RBI
);
676 MI
.setDesc(TII
.get(Opc
));
677 return constrainSelectedInstRegOperands(MI
, TII
, TRI
, RBI
);

bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
    TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
  const TargetRegisterClass *Src1RC =
    TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register OffsetReg = MI.getOperand(2).getReg();
  Register WidthReg = MI.getOperand(3).getReg();

  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
         "scalar BFX instructions are expanded in regbankselect");
  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
         "64-bit vector BFX instructions are expanded in regbankselect");

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
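  // V_BFE_U32/V_BFE_I32 extract WidthReg bits of SrcReg starting at bit
  // OffsetReg, zero- or sign-extending the field into the 32-bit result.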
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
                 .addReg(SrcReg)
                 .addReg(OffsetReg)
                 .addReg(WidthReg);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  Optional<ValueAndVReg> ConstSelect =
    getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    Optional<ValueAndVReg> ConstVal =
      getConstantVRegValWithLookThrough(Val, *MRI, true, true);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addImm(0)     // $src0_modifiers
    .addUse(Src0)  // $src0
    .addImm(0)     // $src1_modifiers
    .addUse(Denom) // $src1
    .addImm(0)     // $src2_modifiers
    .addUse(Numer) // $src2
    .addImm(0)     // $clamp
    .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
    return selectIntrinsicIcmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}
979 static int getV_CMPOpcode(CmpInst::Predicate P
, unsigned Size
) {
980 if (Size
!= 32 && Size
!= 64)
984 llvm_unreachable("Unknown condition code!");
985 case CmpInst::ICMP_NE
:
986 return Size
== 32 ? AMDGPU::V_CMP_NE_U32_e64
: AMDGPU::V_CMP_NE_U64_e64
;
987 case CmpInst::ICMP_EQ
:
988 return Size
== 32 ? AMDGPU::V_CMP_EQ_U32_e64
: AMDGPU::V_CMP_EQ_U64_e64
;
989 case CmpInst::ICMP_SGT
:
990 return Size
== 32 ? AMDGPU::V_CMP_GT_I32_e64
: AMDGPU::V_CMP_GT_I64_e64
;
991 case CmpInst::ICMP_SGE
:
992 return Size
== 32 ? AMDGPU::V_CMP_GE_I32_e64
: AMDGPU::V_CMP_GE_I64_e64
;
993 case CmpInst::ICMP_SLT
:
994 return Size
== 32 ? AMDGPU::V_CMP_LT_I32_e64
: AMDGPU::V_CMP_LT_I64_e64
;
995 case CmpInst::ICMP_SLE
:
996 return Size
== 32 ? AMDGPU::V_CMP_LE_I32_e64
: AMDGPU::V_CMP_LE_I64_e64
;
997 case CmpInst::ICMP_UGT
:
998 return Size
== 32 ? AMDGPU::V_CMP_GT_U32_e64
: AMDGPU::V_CMP_GT_U64_e64
;
999 case CmpInst::ICMP_UGE
:
1000 return Size
== 32 ? AMDGPU::V_CMP_GE_U32_e64
: AMDGPU::V_CMP_GE_U64_e64
;
1001 case CmpInst::ICMP_ULT
:
1002 return Size
== 32 ? AMDGPU::V_CMP_LT_U32_e64
: AMDGPU::V_CMP_LT_U64_e64
;
1003 case CmpInst::ICMP_ULE
:
1004 return Size
== 32 ? AMDGPU::V_CMP_LE_U32_e64
: AMDGPU::V_CMP_LE_U64_e64
;
1008 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P
,
1009 unsigned Size
) const {
1011 if (!STI
.hasScalarCompareEq64())
1015 case CmpInst::ICMP_NE
:
1016 return AMDGPU::S_CMP_LG_U64
;
1017 case CmpInst::ICMP_EQ
:
1018 return AMDGPU::S_CMP_EQ_U64
;
1028 case CmpInst::ICMP_NE
:
1029 return AMDGPU::S_CMP_LG_U32
;
1030 case CmpInst::ICMP_EQ
:
1031 return AMDGPU::S_CMP_EQ_U32
;
1032 case CmpInst::ICMP_SGT
:
1033 return AMDGPU::S_CMP_GT_I32
;
1034 case CmpInst::ICMP_SGE
:
1035 return AMDGPU::S_CMP_GE_I32
;
1036 case CmpInst::ICMP_SLT
:
1037 return AMDGPU::S_CMP_LT_I32
;
1038 case CmpInst::ICMP_SLE
:
1039 return AMDGPU::S_CMP_LE_I32
;
1040 case CmpInst::ICMP_UGT
:
1041 return AMDGPU::S_CMP_GT_U32
;
1042 case CmpInst::ICMP_UGE
:
1043 return AMDGPU::S_CMP_GE_U32
;
1044 case CmpInst::ICMP_ULT
:
1045 return AMDGPU::S_CMP_LT_U32
;
1046 case CmpInst::ICMP_ULE
:
1047 return AMDGPU::S_CMP_LE_U32
;
1049 llvm_unreachable("Unknown condition code!");
1053 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr
&I
) const {
1054 MachineBasicBlock
*BB
= I
.getParent();
1055 const DebugLoc
&DL
= I
.getDebugLoc();
1057 Register SrcReg
= I
.getOperand(2).getReg();
1058 unsigned Size
= RBI
.getSizeInBits(SrcReg
, *MRI
, TRI
);
1060 auto Pred
= (CmpInst::Predicate
)I
.getOperand(1).getPredicate();
1062 Register CCReg
= I
.getOperand(0).getReg();
1063 if (!isVCC(CCReg
, *MRI
)) {
1064 int Opcode
= getS_CMPOpcode(Pred
, Size
);
1067 MachineInstr
*ICmp
= BuildMI(*BB
, &I
, DL
, TII
.get(Opcode
))
1068 .add(I
.getOperand(2))
1069 .add(I
.getOperand(3));
1070 BuildMI(*BB
, &I
, DL
, TII
.get(AMDGPU::COPY
), CCReg
)
1071 .addReg(AMDGPU::SCC
);
1073 constrainSelectedInstRegOperands(*ICmp
, TII
, TRI
, RBI
) &&
1074 RBI
.constrainGenericRegister(CCReg
, AMDGPU::SReg_32RegClass
, *MRI
);
1075 I
.eraseFromParent();
1079 int Opcode
= getV_CMPOpcode(Pred
, Size
);
1083 MachineInstr
*ICmp
= BuildMI(*BB
, &I
, DL
, TII
.get(Opcode
),
1084 I
.getOperand(0).getReg())
1085 .add(I
.getOperand(2))
1086 .add(I
.getOperand(3));
1087 RBI
.constrainGenericRegister(ICmp
->getOperand(0).getReg(),
1088 *TRI
.getBoolRC(), *MRI
);
1089 bool Ret
= constrainSelectedInstRegOperands(*ICmp
, TII
, TRI
, RBI
);
1090 I
.eraseFromParent();
1094 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr
&I
) const {
1095 Register Dst
= I
.getOperand(0).getReg();
1096 if (isVCC(Dst
, *MRI
))
1099 if (MRI
->getType(Dst
).getSizeInBits() != STI
.getWavefrontSize())
1102 MachineBasicBlock
*BB
= I
.getParent();
1103 const DebugLoc
&DL
= I
.getDebugLoc();
1104 Register SrcReg
= I
.getOperand(2).getReg();
1105 unsigned Size
= RBI
.getSizeInBits(SrcReg
, *MRI
, TRI
);
1106 auto Pred
= static_cast<CmpInst::Predicate
>(I
.getOperand(4).getImm());
1108 int Opcode
= getV_CMPOpcode(Pred
, Size
);
1112 MachineInstr
*ICmp
= BuildMI(*BB
, &I
, DL
, TII
.get(Opcode
), Dst
)
1113 .add(I
.getOperand(2))
1114 .add(I
.getOperand(3));
1115 RBI
.constrainGenericRegister(ICmp
->getOperand(0).getReg(), *TRI
.getBoolRC(),
1117 bool Ret
= constrainSelectedInstRegOperands(*ICmp
, TII
, TRI
, RBI
);
1118 I
.eraseFromParent();
1122 bool AMDGPUInstructionSelector::selectBallot(MachineInstr
&I
) const {
1123 MachineBasicBlock
*BB
= I
.getParent();
1124 const DebugLoc
&DL
= I
.getDebugLoc();
1125 Register DstReg
= I
.getOperand(0).getReg();
1126 const unsigned Size
= MRI
->getType(DstReg
).getSizeInBits();
1127 const bool Is64
= Size
== 64;
1129 if (Size
!= STI
.getWavefrontSize())
1132 Optional
<ValueAndVReg
> Arg
=
1133 getConstantVRegValWithLookThrough(I
.getOperand(2).getReg(), *MRI
, true);
1135 if (Arg
.hasValue()) {
1136 const int64_t Value
= Arg
.getValue().Value
.getSExtValue();
1138 unsigned Opcode
= Is64
? AMDGPU::S_MOV_B64
: AMDGPU::S_MOV_B32
;
1139 BuildMI(*BB
, &I
, DL
, TII
.get(Opcode
), DstReg
).addImm(0);
1140 } else if (Value
== -1) { // all ones
1141 Register SrcReg
= Is64
? AMDGPU::EXEC
: AMDGPU::EXEC_LO
;
1142 BuildMI(*BB
, &I
, DL
, TII
.get(AMDGPU::COPY
), DstReg
).addReg(SrcReg
);
1146 Register SrcReg
= I
.getOperand(2).getReg();
1147 BuildMI(*BB
, &I
, DL
, TII
.get(AMDGPU::COPY
), DstReg
).addReg(SrcReg
);
1150 I
.eraseFromParent();
1154 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr
&I
) const {
1155 Register DstReg
= I
.getOperand(0).getReg();
1156 const RegisterBank
*DstBank
= RBI
.getRegBank(DstReg
, *MRI
, TRI
);
1157 const TargetRegisterClass
*DstRC
=
1158 TRI
.getRegClassForSizeOnBank(32, *DstBank
, *MRI
);
1159 if (!DstRC
|| !RBI
.constrainGenericRegister(DstReg
, *DstRC
, *MRI
))
1162 const bool IsVALU
= DstBank
->getID() == AMDGPU::VGPRRegBankID
;
1164 Module
*M
= MF
->getFunction().getParent();
1165 const MDNode
*Metadata
= I
.getOperand(2).getMetadata();
1166 auto SymbolName
= cast
<MDString
>(Metadata
->getOperand(0))->getString();
1167 auto RelocSymbol
= cast
<GlobalVariable
>(
1168 M
->getOrInsertGlobal(SymbolName
, Type::getInt32Ty(M
->getContext())));
1170 MachineBasicBlock
*BB
= I
.getParent();
1171 BuildMI(*BB
, &I
, I
.getDebugLoc(),
1172 TII
.get(IsVALU
? AMDGPU::V_MOV_B32_e32
: AMDGPU::S_MOV_B32
), DstReg
)
1173 .addGlobalAddress(RelocSymbol
, 0, SIInstrInfo::MO_ABS32_LO
);
1175 I
.eraseFromParent();
1179 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr
&I
) const {
1180 Triple::OSType OS
= MF
->getTarget().getTargetTriple().getOS();
1182 Register DstReg
= I
.getOperand(0).getReg();
1183 const RegisterBank
*DstRB
= RBI
.getRegBank(DstReg
, *MRI
, TRI
);
1184 unsigned Mov
= DstRB
->getID() == AMDGPU::SGPRRegBankID
?
1185 AMDGPU::S_MOV_B32
: AMDGPU::V_MOV_B32_e32
;
1187 MachineBasicBlock
*MBB
= I
.getParent();
1188 const DebugLoc
&DL
= I
.getDebugLoc();
1190 auto MIB
= BuildMI(*MBB
, &I
, DL
, TII
.get(Mov
), DstReg
);
1192 if (OS
== Triple::AMDHSA
|| OS
== Triple::AMDPAL
) {
1193 const SIMachineFunctionInfo
*MFI
= MF
->getInfo
<SIMachineFunctionInfo
>();
1194 MIB
.addImm(MFI
->getLDSSize());
1196 Module
*M
= MF
->getFunction().getParent();
1197 const GlobalValue
*GV
1198 = Intrinsic::getDeclaration(M
, Intrinsic::amdgcn_groupstaticsize
);
1199 MIB
.addGlobalAddress(GV
, 0, SIInstrInfo::MO_ABS32_LO
);
1202 I
.eraseFromParent();
1203 return constrainSelectedInstRegOperands(*MIB
, TII
, TRI
, RBI
);
1206 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr
&I
) const {
1207 MachineBasicBlock
*MBB
= I
.getParent();
1208 MachineFunction
&MF
= *MBB
->getParent();
1209 const DebugLoc
&DL
= I
.getDebugLoc();
1211 MachineOperand
&Dst
= I
.getOperand(0);
1212 Register DstReg
= Dst
.getReg();
1213 unsigned Depth
= I
.getOperand(2).getImm();
1215 const TargetRegisterClass
*RC
1216 = TRI
.getConstrainedRegClassForOperand(Dst
, *MRI
);
1217 if (!RC
->hasSubClassEq(&AMDGPU::SGPR_64RegClass
) ||
1218 !RBI
.constrainGenericRegister(DstReg
, *RC
, *MRI
))
1221 // Check for kernel and shader functions
1223 MF
.getInfo
<SIMachineFunctionInfo
>()->isEntryFunction()) {
1224 BuildMI(*MBB
, &I
, DL
, TII
.get(AMDGPU::S_MOV_B64
), DstReg
)
1226 I
.eraseFromParent();
1230 MachineFrameInfo
&MFI
= MF
.getFrameInfo();
1231 // There is a call to @llvm.returnaddress in this function
1232 MFI
.setReturnAddressIsTaken(true);
1234 // Get the return address reg and mark it as an implicit live-in
1235 Register ReturnAddrReg
= TRI
.getReturnAddressReg(MF
);
1236 Register LiveIn
= getFunctionLiveInPhysReg(MF
, TII
, ReturnAddrReg
,
1237 AMDGPU::SReg_64RegClass
);
1238 BuildMI(*MBB
, &I
, DL
, TII
.get(AMDGPU::COPY
), DstReg
)
1240 I
.eraseFromParent();
1244 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr
&MI
) const {
1245 // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
1246 // SelectionDAG uses for wave32 vs wave64.
1247 MachineBasicBlock
*BB
= MI
.getParent();
1248 BuildMI(*BB
, &MI
, MI
.getDebugLoc(), TII
.get(AMDGPU::SI_END_CF
))
1249 .add(MI
.getOperand(1));
1251 Register Reg
= MI
.getOperand(1).getReg();
1252 MI
.eraseFromParent();
1254 if (!MRI
->getRegClassOrNull(Reg
))
1255 MRI
->setRegClass(Reg
, TRI
.getWaveMaskRegClass());
1259 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1260 MachineInstr
&MI
, Intrinsic::ID IntrID
) const {
1261 MachineBasicBlock
*MBB
= MI
.getParent();
1262 MachineFunction
*MF
= MBB
->getParent();
1263 const DebugLoc
&DL
= MI
.getDebugLoc();
1265 unsigned IndexOperand
= MI
.getOperand(7).getImm();
1266 bool WaveRelease
= MI
.getOperand(8).getImm() != 0;
1267 bool WaveDone
= MI
.getOperand(9).getImm() != 0;
1269 if (WaveDone
&& !WaveRelease
)
1270 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1272 unsigned OrderedCountIndex
= IndexOperand
& 0x3f;
1273 IndexOperand
&= ~0x3f;
1274 unsigned CountDw
= 0;
1276 if (STI
.getGeneration() >= AMDGPUSubtarget::GFX10
) {
1277 CountDw
= (IndexOperand
>> 24) & 0xf;
1278 IndexOperand
&= ~(0xf << 24);
1280 if (CountDw
< 1 || CountDw
> 4) {
1282 "ds_ordered_count: dword count must be between 1 and 4");
1287 report_fatal_error("ds_ordered_count: bad index operand");
1289 unsigned Instruction
= IntrID
== Intrinsic::amdgcn_ds_ordered_add
? 0 : 1;
1290 unsigned ShaderType
= SIInstrInfo::getDSShaderTypeValue(*MF
);
1292 unsigned Offset0
= OrderedCountIndex
<< 2;
1293 unsigned Offset1
= WaveRelease
| (WaveDone
<< 1) | (ShaderType
<< 2) |
1296 if (STI
.getGeneration() >= AMDGPUSubtarget::GFX10
)
1297 Offset1
|= (CountDw
- 1) << 6;
1299 unsigned Offset
= Offset0
| (Offset1
<< 8);
1301 Register M0Val
= MI
.getOperand(2).getReg();
1302 BuildMI(*MBB
, &MI
, DL
, TII
.get(AMDGPU::COPY
), AMDGPU::M0
)
1305 Register DstReg
= MI
.getOperand(0).getReg();
1306 Register ValReg
= MI
.getOperand(3).getReg();
1307 MachineInstrBuilder DS
=
1308 BuildMI(*MBB
, &MI
, DL
, TII
.get(AMDGPU::DS_ORDERED_COUNT
), DstReg
)
1313 if (!RBI
.constrainGenericRegister(M0Val
, AMDGPU::SReg_32RegClass
, *MRI
))
1316 bool Ret
= constrainSelectedInstRegOperands(*DS
, TII
, TRI
, RBI
);
1317 MI
.eraseFromParent();
1321 static unsigned gwsIntrinToOpcode(unsigned IntrID
) {
1323 case Intrinsic::amdgcn_ds_gws_init
:
1324 return AMDGPU::DS_GWS_INIT
;
1325 case Intrinsic::amdgcn_ds_gws_barrier
:
1326 return AMDGPU::DS_GWS_BARRIER
;
1327 case Intrinsic::amdgcn_ds_gws_sema_v
:
1328 return AMDGPU::DS_GWS_SEMA_V
;
1329 case Intrinsic::amdgcn_ds_gws_sema_br
:
1330 return AMDGPU::DS_GWS_SEMA_BR
;
1331 case Intrinsic::amdgcn_ds_gws_sema_p
:
1332 return AMDGPU::DS_GWS_SEMA_P
;
1333 case Intrinsic::amdgcn_ds_gws_sema_release_all
:
1334 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL
;
1336 llvm_unreachable("not a gws intrinsic");
1340 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr
&MI
,
1341 Intrinsic::ID IID
) const {
1342 if (IID
== Intrinsic::amdgcn_ds_gws_sema_release_all
&&
1343 !STI
.hasGWSSemaReleaseAll())
1346 // intrinsic ID, vsrc, offset
1347 const bool HasVSrc
= MI
.getNumOperands() == 3;
1348 assert(HasVSrc
|| MI
.getNumOperands() == 2);
1350 Register BaseOffset
= MI
.getOperand(HasVSrc
? 2 : 1).getReg();
1351 const RegisterBank
*OffsetRB
= RBI
.getRegBank(BaseOffset
, *MRI
, TRI
);
1352 if (OffsetRB
->getID() != AMDGPU::SGPRRegBankID
)
1355 MachineInstr
*OffsetDef
= getDefIgnoringCopies(BaseOffset
, *MRI
);
1360 MachineBasicBlock
*MBB
= MI
.getParent();
1361 const DebugLoc
&DL
= MI
.getDebugLoc();
1363 MachineInstr
*Readfirstlane
= nullptr;
1365 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1366 // incoming offset, in case there's an add of a constant. We'll have to put it
1368 if (OffsetDef
->getOpcode() == AMDGPU::V_READFIRSTLANE_B32
) {
1369 Readfirstlane
= OffsetDef
;
1370 BaseOffset
= OffsetDef
->getOperand(1).getReg();
1371 OffsetDef
= getDefIgnoringCopies(BaseOffset
, *MRI
);
1374 if (OffsetDef
->getOpcode() == AMDGPU::G_CONSTANT
) {
1375 // If we have a constant offset, try to use the 0 in m0 as the base.
1376 // TODO: Look into changing the default m0 initialization value. If the
1377 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1378 // the immediate offset.
1380 ImmOffset
= OffsetDef
->getOperand(1).getCImm()->getZExtValue();
1381 BuildMI(*MBB
, &MI
, DL
, TII
.get(AMDGPU::S_MOV_B32
), AMDGPU::M0
)
1384 std::tie(BaseOffset
, ImmOffset
) =
1385 AMDGPU::getBaseWithConstantOffset(*MRI
, BaseOffset
);
1387 if (Readfirstlane
) {
1388 // We have the constant offset now, so put the readfirstlane back on the
1389 // variable component.
1390 if (!RBI
.constrainGenericRegister(BaseOffset
, AMDGPU::VGPR_32RegClass
, *MRI
))
1393 Readfirstlane
->getOperand(1).setReg(BaseOffset
);
1394 BaseOffset
= Readfirstlane
->getOperand(0).getReg();
1396 if (!RBI
.constrainGenericRegister(BaseOffset
,
1397 AMDGPU::SReg_32RegClass
, *MRI
))
1401 Register M0Base
= MRI
->createVirtualRegister(&AMDGPU::SReg_32RegClass
);
1402 BuildMI(*MBB
, &MI
, DL
, TII
.get(AMDGPU::S_LSHL_B32
), M0Base
)
1406 BuildMI(*MBB
, &MI
, DL
, TII
.get(AMDGPU::COPY
), AMDGPU::M0
)
1410 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1411 // offset field) % 64. Some versions of the programming guide omit the m0
1412 // part, or claim it's from offset 0.
1413 auto MIB
= BuildMI(*MBB
, &MI
, DL
, TII
.get(gwsIntrinToOpcode(IID
)));
1416 Register VSrc
= MI
.getOperand(1).getReg();
1418 if (STI
.needsAlignedVGPRs()) {
1419 // Add implicit aligned super-reg to force alignment on the data operand.
1420 Register Undef
= MRI
->createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
1421 BuildMI(*MBB
, &*MIB
, DL
, TII
.get(AMDGPU::IMPLICIT_DEF
), Undef
);
1423 MRI
->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass
);
1424 BuildMI(*MBB
, &*MIB
, DL
, TII
.get(AMDGPU::REG_SEQUENCE
), NewVR
)
1425 .addReg(VSrc
, 0, MI
.getOperand(1).getSubReg())
1426 .addImm(AMDGPU::sub0
)
1428 .addImm(AMDGPU::sub1
);
1429 MIB
.addReg(NewVR
, 0, AMDGPU::sub0
);
1430 MIB
.addReg(NewVR
, RegState::Implicit
);
1435 if (!RBI
.constrainGenericRegister(VSrc
, AMDGPU::VGPR_32RegClass
, *MRI
))
1439 MIB
.addImm(ImmOffset
)
1442 MI
.eraseFromParent();
1446 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr
&MI
,
1447 bool IsAppend
) const {
1448 Register PtrBase
= MI
.getOperand(2).getReg();
1449 LLT PtrTy
= MRI
->getType(PtrBase
);
1450 bool IsGDS
= PtrTy
.getAddressSpace() == AMDGPUAS::REGION_ADDRESS
;
1453 std::tie(PtrBase
, Offset
) = selectDS1Addr1OffsetImpl(MI
.getOperand(2));
1455 // TODO: Should this try to look through readfirstlane like GWS?
1456 if (!isDSOffsetLegal(PtrBase
, Offset
)) {
1457 PtrBase
= MI
.getOperand(2).getReg();
1461 MachineBasicBlock
*MBB
= MI
.getParent();
1462 const DebugLoc
&DL
= MI
.getDebugLoc();
1463 const unsigned Opc
= IsAppend
? AMDGPU::DS_APPEND
: AMDGPU::DS_CONSUME
;
1465 BuildMI(*MBB
, &MI
, DL
, TII
.get(AMDGPU::COPY
), AMDGPU::M0
)
1467 if (!RBI
.constrainGenericRegister(PtrBase
, AMDGPU::SReg_32RegClass
, *MRI
))
1470 auto MIB
= BuildMI(*MBB
, &MI
, DL
, TII
.get(Opc
), MI
.getOperand(0).getReg())
1472 .addImm(IsGDS
? -1 : 0)
1474 MI
.eraseFromParent();
1475 return constrainSelectedInstRegOperands(*MIB
, TII
, TRI
, RBI
);
1478 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr
&MI
) const {
1479 if (TM
.getOptLevel() > CodeGenOpt::None
) {
1480 unsigned WGSize
= STI
.getFlatWorkGroupSizes(MF
->getFunction()).second
;
1481 if (WGSize
<= STI
.getWavefrontSize()) {
1482 MachineBasicBlock
*MBB
= MI
.getParent();
1483 const DebugLoc
&DL
= MI
.getDebugLoc();
1484 BuildMI(*MBB
, &MI
, DL
, TII
.get(AMDGPU::WAVE_BARRIER
));
1485 MI
.eraseFromParent();
1489 return selectImpl(MI
, *CoverageInfo
);
1492 static bool parseTexFail(uint64_t TexFailCtrl
, bool &TFE
, bool &LWE
,
1497 TFE
= (TexFailCtrl
& 0x1) ? 1 : 0;
1498 TexFailCtrl
&= ~(uint64_t)0x1;
1499 LWE
= (TexFailCtrl
& 0x2) ? 1 : 0;
1500 TexFailCtrl
&= ~(uint64_t)0x2;
1502 return TexFailCtrl
== 0;
1505 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1506 MachineInstr
&MI
, const AMDGPU::ImageDimIntrinsicInfo
*Intr
) const {
1507 MachineBasicBlock
*MBB
= MI
.getParent();
1508 const DebugLoc
&DL
= MI
.getDebugLoc();
1510 const AMDGPU::MIMGBaseOpcodeInfo
*BaseOpcode
=
1511 AMDGPU::getMIMGBaseOpcodeInfo(Intr
->BaseOpcode
);
1513 const AMDGPU::MIMGDimInfo
*DimInfo
= AMDGPU::getMIMGDimInfo(Intr
->Dim
);
1514 const AMDGPU::MIMGLZMappingInfo
*LZMappingInfo
=
1515 AMDGPU::getMIMGLZMappingInfo(Intr
->BaseOpcode
);
1516 const AMDGPU::MIMGMIPMappingInfo
*MIPMappingInfo
=
1517 AMDGPU::getMIMGMIPMappingInfo(Intr
->BaseOpcode
);
1518 unsigned IntrOpcode
= Intr
->BaseOpcode
;
1519 const bool IsGFX10Plus
= AMDGPU::isGFX10Plus(STI
);
1521 const unsigned ArgOffset
= MI
.getNumExplicitDefs() + 1;
1523 Register VDataIn
, VDataOut
;
1525 int NumVDataDwords
= -1;
1529 if (!BaseOpcode
->Sampler
)
1532 Unorm
= MI
.getOperand(ArgOffset
+ Intr
->UnormIndex
).getImm() != 0;
1536 bool IsTexFail
= false;
1537 if (!parseTexFail(MI
.getOperand(ArgOffset
+ Intr
->TexFailCtrlIndex
).getImm(),
1538 TFE
, LWE
, IsTexFail
))
1541 const int Flags
= MI
.getOperand(ArgOffset
+ Intr
->NumArgs
).getImm();
1542 const bool IsA16
= (Flags
& 1) != 0;
1543 const bool IsG16
= (Flags
& 2) != 0;
1545 // A16 implies 16 bit gradients if subtarget doesn't support G16
1546 if (IsA16
&& !STI
.hasG16() && !IsG16
)
1550 unsigned DMaskLanes
= 0;
1552 if (BaseOpcode
->Atomic
) {
1553 VDataOut
= MI
.getOperand(0).getReg();
1554 VDataIn
= MI
.getOperand(2).getReg();
1555 LLT Ty
= MRI
->getType(VDataIn
);
1557 // Be careful to allow atomic swap on 16-bit element vectors.
1558 const bool Is64Bit
= BaseOpcode
->AtomicX2
?
1559 Ty
.getSizeInBits() == 128 :
1560 Ty
.getSizeInBits() == 64;
1562 if (BaseOpcode
->AtomicX2
) {
1563 assert(MI
.getOperand(3).getReg() == AMDGPU::NoRegister
);
1565 DMask
= Is64Bit
? 0xf : 0x3;
1566 NumVDataDwords
= Is64Bit
? 4 : 2;
1568 DMask
= Is64Bit
? 0x3 : 0x1;
1569 NumVDataDwords
= Is64Bit
? 2 : 1;
1572 DMask
= MI
.getOperand(ArgOffset
+ Intr
->DMaskIndex
).getImm();
1573 DMaskLanes
= BaseOpcode
->Gather4
? 4 : countPopulation(DMask
);
1575 // One memoperand is mandatory, except for getresinfo.
1576 // FIXME: Check this in verifier.
1577 if (!MI
.memoperands_empty()) {
1578 const MachineMemOperand
*MMO
= *MI
.memoperands_begin();
1580 // Infer d16 from the memory size, as the register type will be mangled by
1581 // unpacked subtargets, or by TFE.
1582 IsD16
= ((8 * MMO
->getSize()) / DMaskLanes
) < 32;
1585 if (BaseOpcode
->Store
) {
1586 VDataIn
= MI
.getOperand(1).getReg();
1587 VDataTy
= MRI
->getType(VDataIn
);
1588 NumVDataDwords
= (VDataTy
.getSizeInBits() + 31) / 32;
1590 VDataOut
= MI
.getOperand(0).getReg();
1591 VDataTy
= MRI
->getType(VDataOut
);
1592 NumVDataDwords
= DMaskLanes
;
1594 if (IsD16
&& !STI
.hasUnpackedD16VMem())
1595 NumVDataDwords
= (DMaskLanes
+ 1) / 2;
1599 // Optimize _L to _LZ when _L is zero
1600 if (LZMappingInfo
) {
1601 // The legalizer replaced the register with an immediate 0 if we need to
1602 // change the opcode.
1603 const MachineOperand
&Lod
= MI
.getOperand(ArgOffset
+ Intr
->LodIndex
);
1605 assert(Lod
.getImm() == 0);
1606 IntrOpcode
= LZMappingInfo
->LZ
; // set new opcode to _lz variant of _l
1610 // Optimize _mip away, when 'lod' is zero
1611 if (MIPMappingInfo
) {
1612 const MachineOperand
&Lod
= MI
.getOperand(ArgOffset
+ Intr
->MipIndex
);
1614 assert(Lod
.getImm() == 0);
1615 IntrOpcode
= MIPMappingInfo
->NONMIP
; // set new opcode to variant without _mip
1620 if (IsG16
&& !IsA16
) {
1621 const AMDGPU::MIMGG16MappingInfo
*G16MappingInfo
=
1622 AMDGPU::getMIMGG16MappingInfo(Intr
->BaseOpcode
);
1623 assert(G16MappingInfo
);
1624 IntrOpcode
= G16MappingInfo
->G16
; // set opcode to variant with _g16
1627 // TODO: Check this in verifier.
1628 assert((!IsTexFail
|| DMaskLanes
>= 1) && "should have legalized this");
1630 unsigned CPol
= MI
.getOperand(ArgOffset
+ Intr
->CachePolicyIndex
).getImm();
1631 if (BaseOpcode
->Atomic
)
1632 CPol
|= AMDGPU::CPol::GLC
; // TODO no-return optimization
1633 if (CPol
& ~AMDGPU::CPol::ALL
)
1636 int NumVAddrRegs
= 0;
1637 int NumVAddrDwords
= 0;
1638 for (unsigned I
= Intr
->VAddrStart
; I
< Intr
->VAddrEnd
; I
++) {
1639 // Skip the $noregs and 0s inserted during legalization.
1640 MachineOperand
&AddrOp
= MI
.getOperand(ArgOffset
+ I
);
1641 if (!AddrOp
.isReg())
1642 continue; // XXX - Break?
1644 Register Addr
= AddrOp
.getReg();
1649 NumVAddrDwords
+= (MRI
->getType(Addr
).getSizeInBits() + 31) / 32;
  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // operand.
  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;

  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
    return false;
  }

  if (IsTexFail)
    ++NumVDataDwords;

  int Opcode = -1;
  if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
    .cloneMemRefs(MI);

  if (VDataOut) {
    if (BaseOpcode->AtomicX2) {
      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;

      Register TmpReg = MRI->createVirtualRegister(
        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;

      MIB.addDef(TmpReg);
      if (!MRI->use_empty(VDataOut)) {
        BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
            .addReg(TmpReg, RegState::Kill, SubReg);
      }
    } else {
      MIB.addDef(VDataOut); // vdata output
    }
  }

  if (VDataIn)
    MIB.addReg(VDataIn); // vdata input

  for (int I = 0; I != NumVAddrRegs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
    if (SrcOp.isReg()) {
      assert(SrcOp.getReg() != 0);
      MIB.addReg(SrcOp.getReg());
    }
  }

  MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
  if (BaseOpcode->Sampler)
    MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());

  MIB.addImm(DMask); // dmask

  if (IsGFX10Plus)
    MIB.addImm(DimInfo->Encoding);
  MIB.addImm(Unorm);

  MIB.addImm(CPol);
  MIB.addImm(IsA16 &&  // a16 or r128
             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
  if (IsGFX10Plus)
    MIB.addImm(IsA16 ? -1 : 0);

  MIB.addImm(TFE); // tfe
  MIB.addImm(LWE); // lwe
  if (!IsGFX10Plus)
    MIB.addImm(DimInfo->DA ? -1 : 0);
  if (BaseOpcode->HasD16)
    MIB.addImm(IsD16 ? -1 : 0);

  if (IsTexFail) {
    // An image load instruction with TFE/LWE only conditionally writes to its
    // result registers. Initialize them to zero so that we always get well
    // defined result values.
    assert(VDataOut && !VDataIn);
    Register Tied = MRI->cloneVirtualRegister(VDataOut);
    Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
      .addImm(0);
    auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
    if (STI.usePRTStrictNull()) {
      // With enable-prt-strict-null enabled, initialize all result registers to
      // zero.
      auto RegSeq =
          BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
      for (auto Sub : Parts)
        RegSeq.addReg(Zero).addImm(Sub);
    } else {
      // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
      // result register.
      Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
      auto RegSeq =
          BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
      for (auto Sub : Parts.drop_back(1))
        RegSeq.addReg(Undef).addImm(Sub);
      RegSeq.addReg(Zero).addImm(Parts.back());
    }

    MIB.addReg(Tied, RegState::Implicit);
    MIB->tieOperands(0, MIB->getNumOperands() - 1);
  }

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
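
// Select intrinsics with side effects that need manual handling; anything not
// listed in the switch below is handed back to the TableGen-generated
// selector (selectImpl).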
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  case Intrinsic::amdgcn_s_barrier:
    return selectSBarrier(I);
  case Intrinsic::amdgcn_global_atomic_fadd:
    return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
  default:
    return selectImpl(I, *CoverageInfo);
  }
}
bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
            .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class that we use
    // to represent it. So we need to manually set the register class here.
    if (!MRI->getRegClassOrNull(CCReg))
      MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
            .add(I.getOperand(2))
            .add(I.getOperand(3));

    bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
               constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return Ret;
  }

  // Wide VGPR select should have been split in RegBankSelect.
  if (Size > 32)
    return false;

  MachineInstr *Select =
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .add(I.getOperand(3))
        .addImm(0)
        .add(I.getOperand(2))
        .add(I.getOperand(1));

  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}
static int sizeToSubRegIndex(unsigned Size) {
  switch (Size) {
  case 32:
    return AMDGPU::sub0;
  case 64:
    return AMDGPU::sub0_sub1;
  case 96:
    return AMDGPU::sub0_sub1_sub2;
  case 128:
    return AMDGPU::sub0_sub1_sub2_sub3;
  case 256:
    return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
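  // Sizes without a dedicated case are rounded up to the next power of two,
  // e.g. a 48-bit value maps to the 64-bit index sub0_sub1.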
  default:
    if (Size < 32)
      return AMDGPU::sub0;
    if (Size > 256)
      return -1;
    return sizeToSubRegIndex(PowerOf2Ceil(Size));
  }
}
bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
  const LLT S1 = LLT::scalar(1);

  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *DstRB;
  if (DstTy == S1) {
    // This is a special case. We don't treat s1 for legalization artifacts as
    // vcc booleans.
    DstRB = SrcRB;
  } else {
    DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
    if (SrcRB != DstRB)
      return false;
  }

  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;

  unsigned DstSize = DstTy.getSizeInBits();
  unsigned SrcSize = SrcTy.getSizeInBits();

  const TargetRegisterClass *SrcRC
    = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
  const TargetRegisterClass *DstRC
    = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
  if (!SrcRC || !DstRC)
    return false;

  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
    return false;
  }

  if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
    MachineBasicBlock *MBB = I.getParent();
    const DebugLoc &DL = I.getDebugLoc();

    Register LoReg = MRI->createVirtualRegister(DstRC);
    Register HiReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
      .addReg(SrcReg, 0, AMDGPU::sub0);
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
      .addReg(SrcReg, 0, AMDGPU::sub1);

    if (IsVALU && STI.hasSDWA()) {
      // Write the low 16-bits of the high element into the high 16-bits of the
      // low element.
      MachineInstr *MovSDWA =
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
        .addImm(0)                             // $src0_modifiers
        .addReg(HiReg)                         // $src0
        .addImm(0)                             // $clamp
        .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
        .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
        .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
        .addReg(LoReg, RegState::Implicit);
      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
    } else {
      Register TmpReg0 = MRI->createVirtualRegister(DstRC);
      Register TmpReg1 = MRI->createVirtualRegister(DstRC);
      Register ImmReg = MRI->createVirtualRegister(DstRC);
      if (IsVALU) {
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
          .addImm(16)
          .addReg(HiReg);
      } else {
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
          .addReg(HiReg)
          .addImm(16);
      }

      unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
      unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
      unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;

      BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
        .addImm(0xffff);
      BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
        .addReg(LoReg)
        .addReg(ImmReg);
      BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
        .addReg(TmpReg0)
        .addReg(TmpReg1);
    }

    I.eraseFromParent();
    return true;
  }

  if (!DstTy.isScalar())
    return false;

  if (SrcSize > 32) {
    int SubRegIdx = sizeToSubRegIndex(DstSize);
    if (SubRegIdx == -1)
      return false;

    // Deal with weird cases where the class only partially supports the subreg
    // index.
    const TargetRegisterClass *SrcWithSubRC
      = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
    if (!SrcWithSubRC)
      return false;

    if (SrcWithSubRC != SrcRC) {
      if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
        return false;
    }

    I.getOperand(1).setSubReg(SubRegIdx);
  }

  I.setDesc(TII.get(TargetOpcode::COPY));
  return true;
}
/// \returns true if a bitmask for \p Size bits will be an inline immediate.
static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
  Mask = maskTrailingOnes<unsigned>(Size);
  int SignedMask = static_cast<int>(Mask);
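  // The mask is an inline immediate when its signed value fits in [-16, 64]:
  // Size == 4 gives 0xf (15, inline) and Size == 32 gives -1 (inline), while
  // Size == 16 gives 0xffff (65535, not inline).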
  return SignedMask >= -16 && SignedMask <= 64;
}
// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
  Register Reg, const MachineRegisterInfo &MRI,
  const TargetRegisterInfo &TRI) const {
  const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
    return RB;

  // Ignore the type, since we don't use vcc in artifacts.
  if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
    return &RBI.getRegBankFromRegClass(*RC, LLT());
  return nullptr;
}
bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
  bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
  bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock &MBB = *I.getParent();
  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();

  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
    I.getOperand(2).getImm() : SrcTy.getSizeInBits();
  const unsigned DstSize = DstTy.getSizeInBits();
  if (!DstTy.isScalar())
    return false;

  // Artifact casts should never use vcc.
  const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);

  // FIXME: This should probably be illegal and split earlier.
  if (I.getOpcode() == AMDGPU::G_ANYEXT) {
    if (DstSize <= 32)
      return selectCOPY(I);

    const TargetRegisterClass *SrcRC =
        TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
    const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
    const TargetRegisterClass *DstRC =
        TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);

    Register UndefReg = MRI->createVirtualRegister(SrcRC);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(SrcReg)
      .addImm(AMDGPU::sub0)
      .addReg(UndefReg)
      .addImm(AMDGPU::sub1);
    I.eraseFromParent();

    return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
           RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
  }

  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
    // 64-bit should have been split up in RegBankSelect

    // Try to use an and with a mask if it will save code size.
    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      MachineInstr *ExtI =
        BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
        .addImm(Mask)
        .addReg(SrcReg);
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
    }

    const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
    MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
      .addReg(SrcReg)
      .addImm(0)        // Offset
      .addImm(SrcSize); // Width
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
  }

  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
    const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
      AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
    if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
      return false;

    if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
      const unsigned SextOpc = SrcSize == 8 ?
        AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
      BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
        .addReg(SrcReg);
      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
    }

    const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
    const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
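    // e.g. a 16-bit extend from bit 0 uses the immediate
    // (16 << 16) | 0 = 0x100000.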
    if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
      // We need a 64-bit register source, but the high bits don't matter.
      Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
      unsigned SubReg = InReg ? AMDGPU::sub0 : 0;

      BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
        .addReg(SrcReg, 0, SubReg)
        .addImm(AMDGPU::sub0)
        .addReg(UndefReg)
        .addImm(AMDGPU::sub1);

      BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
        .addReg(ExtReg)
        .addImm(SrcSize << 16);

      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
    }

    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
        .addReg(SrcReg)
        .addImm(Mask);
    } else {
      BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
        .addReg(SrcReg)
        .addImm(SrcSize << 16);
    }

    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
  }

  return false;
}
bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineOperand &ImmOp = I.getOperand(1);
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = MRI->getType(DstReg).getSizeInBits();

  // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
  if (ImmOp.isFPImm()) {
    const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
    ImmOp.ChangeToImmediate(Imm.getZExtValue());
  } else if (ImmOp.isCImm()) {
    ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
  } else {
    llvm_unreachable("Not supported by g_constants");
  }

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;

  unsigned Opcode;
  if (DstRB->getID() == AMDGPU::VCCRegBankID) {
    Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  } else {
    Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

    // We should never produce s1 values on banks other than VCC. If the user of
    // this already constrained the register, we may incorrectly think it's VCC
    // if it wasn't originally.
    if (Size == 1)
      return false;
  }

  if (Size != 64) {
    I.setDesc(TII.get(Opcode));
    I.addImplicitDefUseOperands(*MF);
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  const DebugLoc &DL = I.getDebugLoc();

  APInt Imm(Size, I.getOperand(1).getImm());

  MachineInstr *ResInst;
  if (IsSgpr && TII.isInlineConstant(Imm)) {
    ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
      .addImm(I.getOperand(1).getImm());
  } else {
*RC
= IsSgpr
?
2187 &AMDGPU::SReg_32RegClass
: &AMDGPU::VGPR_32RegClass
;
2188 Register LoReg
= MRI
->createVirtualRegister(RC
);
2189 Register HiReg
= MRI
->createVirtualRegister(RC
);
2191 BuildMI(*BB
, &I
, DL
, TII
.get(Opcode
), LoReg
)
2192 .addImm(Imm
.trunc(32).getZExtValue());
2194 BuildMI(*BB
, &I
, DL
, TII
.get(Opcode
), HiReg
)
2195 .addImm(Imm
.ashr(32).getZExtValue());
2197 ResInst
= BuildMI(*BB
, &I
, DL
, TII
.get(AMDGPU::REG_SEQUENCE
), DstReg
)
2199 .addImm(AMDGPU::sub0
)
2201 .addImm(AMDGPU::sub1
);
2204 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2205 // work for target independent opcodes
2206 I
.eraseFromParent();
2207 const TargetRegisterClass
*DstRC
=
2208 TRI
.getConstrainedRegClassForOperand(ResInst
->getOperand(0), *MRI
);
2211 return RBI
.constrainGenericRegister(DstReg
, *DstRC
, *MRI
);
bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
  // Only manually handle the f64 SGPR case.
  //
  // FIXME: This is a workaround for 2.5 different tablegen problems. Because
  // the bit ops theoretically have a second result due to the implicit def of
  // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
  // that is easy by disabling the check. The result works, but uses a
  // nonsensical sreg32orlds_and_sreg_1 regclass.
  //
  // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
  // the variadic REG_SEQUENCE operands.

  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
      MRI->getType(Dst) != LLT::scalar(64))
    return false;

  Register Src = MI.getOperand(1).getReg();
  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
  if (Fabs)
    Src = Fabs->getOperand(1).getReg();

  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(Src, 0, AMDGPU::sub0);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(Src, 0, AMDGPU::sub1);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
    .addImm(0x80000000);

  // Set or toggle sign bit.
  unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);
  MI.eraseFromParent();
  return true;
}
// FIXME: This is a workaround for the same tablegen problems as G_FNEG.
bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
      MRI->getType(Dst) != LLT::scalar(64))
    return false;

  Register Src = MI.getOperand(1).getReg();
  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
    return false;

  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(Src, 0, AMDGPU::sub0);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(Src, 0, AMDGPU::sub1);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
    .addImm(0x7fffffff);

  // Clear sign bit.
  // TODO: Should this use S_BITSET0_*?
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);

  MI.eraseFromParent();
  return true;
}
static bool isConstant(const MachineInstr &MI) {
  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}
void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
  const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {

  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return;

  GEPInfo GEPInfo(*PtrMI);

  for (unsigned i = 1; i != 3; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (i == 2 && isConstant(*OpDef)) {
      // TODO: Could handle constant base + variable offset, but a combine
      // probably should have commuted it.
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}
bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
  return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
}
bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}
bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}
void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
      STI.ldsRequiresM0Init()) {
    MachineBasicBlock *BB = I.getParent();

    // If DS instructions require M0 initialization, insert it before selecting.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(-1);
  }
}
bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
  MachineInstr &I) const {
  if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) {
    const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
    unsigned AS = PtrTy.getAddressSpace();
    if (AS == AMDGPUAS::GLOBAL_ADDRESS)
      return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2));
  }

  initM0(I);
  return selectImpl(I, *CoverageInfo);
}
// TODO: No rtn optimization.
bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
  MachineInstr &MI) const {
  Register PtrReg = MI.getOperand(1).getReg();
  const LLT PtrTy = MRI->getType(PtrReg);
  if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
      STI.useFlatForGlobal())
    return selectImpl(MI, *CoverageInfo);

  Register DstReg = MI.getOperand(0).getReg();
  const LLT Ty = MRI->getType(DstReg);
  const bool Is64 = Ty.getSizeInBits() == 64;
  const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  Register TmpReg = MRI->createVirtualRegister(
    Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  Register VAddr, RSrcReg, SOffset;
  int64_t Offset = 0;

  unsigned Opcode;
  if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
    Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
                    AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
  } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
                                   RSrcReg, SOffset, Offset)) {
    Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
                    AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
  } else {
    return selectImpl(MI, *CoverageInfo);
  }

  auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
    .addReg(MI.getOperand(2).getReg());

  if (VAddr)
    MIB.addReg(VAddr);

  MIB.addReg(RSrcReg);
  if (SOffset)
    MIB.addReg(SOffset);
  else
    MIB.addImm(0);

  MIB.addImm(Offset);
  MIB.addImm(AMDGPU::CPol::GLC);
  MIB.cloneMemRefs(MI);

  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
    .addReg(TmpReg, RegState::Kill, SubReg);

  MI.eraseFromParent();

  MRI->setRegClass(
    DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
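
// Return whether Reg is known to hold the result of a VALU comparison
// (possibly combined through COPY/AND/OR/XOR or amdgcn.class), i.e. a proper
// wave mask.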
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
  if (Reg.isPhysical())
    return false;

  MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
  const unsigned Opcode = MI.getOpcode();

  if (Opcode == AMDGPU::COPY)
    return isVCmpResult(MI.getOperand(1).getReg(), MRI);

  if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
      Opcode == AMDGPU::G_XOR)
    return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
           isVCmpResult(MI.getOperand(2).getReg(), MRI);

  if (Opcode == TargetOpcode::G_INTRINSIC)
    return MI.getIntrinsicID() == Intrinsic::amdgcn_class;

  return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
}
bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineOperand &CondOp = I.getOperand(0);
  Register CondReg = CondOp.getReg();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned BrOpcode;
  Register CondPhysReg;
  const TargetRegisterClass *ConstrainRC;

  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
  // whether the branch is uniform when selecting the instruction. In
  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
  // RegBankSelect knows what it's doing if the branch condition is scc, even
  // though it currently does not.
  if (!isVCC(CondReg, *MRI)) {
    if (MRI->getType(CondReg) != LLT::scalar(32))
      return false;

    CondPhysReg = AMDGPU::SCC;
    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
    ConstrainRC = &AMDGPU::SReg_32RegClass;
  } else {
    // FIXME: Should scc->vcc copies and with exec?

    // Unless the value of CondReg is a result of a V_CMP* instruction then we
    // need to insert an and with exec.
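    // (VALU compares only set bits for active lanes, so their results are
    // already masked by exec; other s1 values may contain stale bits in
    // inactive lanes.)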
    if (!isVCmpResult(CondReg, *MRI)) {
      const bool Is64 = STI.isWave64();
      const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
      const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;

      Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
      BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
          .addReg(CondReg)
          .addReg(Exec);
      CondReg = TmpReg;
    }

    CondPhysReg = TRI.getVCC();
    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
    ConstrainRC = TRI.getBoolRC();
  }

  if (!MRI->getRegClassOrNull(CondReg))
    MRI->setRegClass(CondReg, ConstrainRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
    .addReg(CondReg);
  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
    .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}
bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
  MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
  if (IsVGPR)
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  return RBI.constrainGenericRegister(
    DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
}
bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  Register MaskReg = I.getOperand(2).getReg();
  LLT Ty = MRI->getType(DstReg);
  LLT MaskTy = MRI->getType(MaskReg);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  if (DstRB != SrcRB) // Should only happen for hand written MIR.
    return false;

  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
  const TargetRegisterClass &RegRC
    = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;

  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
                                                                  *MRI);
  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
                                                                  *MRI);
  const TargetRegisterClass *MaskRC =
      TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  if (Ty.getSizeInBits() == 32) {
    assert(MaskTy.getSizeInBits() == 32 &&
           "ptrmask should have been narrowed during legalize");

    BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
      .addReg(SrcReg)
      .addReg(MaskReg);
    I.eraseFromParent();
    return true;
  }

  Register HiReg = MRI->createVirtualRegister(&RegRC);
  Register LoReg = MRI->createVirtualRegister(&RegRC);

  // Extract the subregisters from the source pointer.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(SrcReg, 0, AMDGPU::sub0);
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(SrcReg, 0, AMDGPU::sub1);

  Register MaskedLo, MaskedHi;

  // Try to avoid emitting a bit operation when we only need to touch half of
  // the 64-bit pointer.
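  // For example, a mask that only clears the low 12 bits leaves every bit of
  // the high half known to be one, so the high 32 bits reduce to a plain copy.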
  APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);

  const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
  const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
  if ((MaskOnes & MaskLo32) == MaskLo32) {
    // If all the bits in the low half are 1, we only need a copy for it.
    MaskedLo = LoReg;
  } else {
    // Extract the mask subregister and apply the and.
    Register MaskLo = MRI->createVirtualRegister(&RegRC);
    MaskedLo = MRI->createVirtualRegister(&RegRC);

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
      .addReg(MaskReg, 0, AMDGPU::sub0);
    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
      .addReg(LoReg)
      .addReg(MaskLo);
  }

  if ((MaskOnes & MaskHi32) == MaskHi32) {
    // If all the bits in the high half are 1, we only need a copy for it.
    MaskedHi = HiReg;
  } else {
    Register MaskHi = MRI->createVirtualRegister(&RegRC);
    MaskedHi = MRI->createVirtualRegister(&RegRC);

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
      .addReg(MaskReg, 0, AMDGPU::sub1);
    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
      .addReg(HiReg)
      .addReg(MaskHi);
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(MaskedLo)
    .addImm(AMDGPU::sub0)
    .addReg(MaskedHi)
    .addImm(AMDGPU::sub1);
  I.eraseFromParent();
  return true;
}
/// Return the register to use for the index value, and the subregister to use
/// for the indirectly accessed register.
static std::pair<Register, unsigned>
computeIndirectRegIndex(MachineRegisterInfo &MRI,
                        const SIRegisterInfo &TRI,
                        const TargetRegisterClass *SuperRC,
                        Register IdxReg,
                        unsigned EltSize) {
  Register IdxBaseReg;
  int Offset;

  std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
  if (IdxBaseReg == AMDGPU::NoRegister) {
    // This will happen if the index is a known constant. This should ordinarily
    // be legalized out, but handle it as a register just in case.
    assert(Offset == 0);
    IdxBaseReg = IdxReg;
  }

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (static_cast<unsigned>(Offset) >= SubRegs.size())
    return std::make_pair(IdxReg, SubRegs[0]);
  return std::make_pair(IdxBaseReg, SubRegs[Offset]);
}
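
// Select a dynamic vector extract with an SGPR index, using S_MOVRELS /
// V_MOVRELS or the GPR indexing mode pseudos depending on the source bank.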
bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
  MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register IdxReg = MI.getOperand(2).getReg();

  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  // The index must be scalar. If it wasn't RegBankSelect should have moved this
  // into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
                                                                  *MRI);
  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
                                                                  *MRI);
  if (!SrcRC || !DstRC)
    return false;
  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const bool Is64 = DstTy.getSizeInBits() == 64;

  unsigned SubReg;
  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
                                                     DstTy.getSizeInBits() / 8);

  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
    if (DstTy.getSizeInBits() != 32 && !Is64)
      return false;

    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);

    unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
    BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
      .addReg(SrcReg, 0, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
    return false;

  if (!STI.useVGPRIndexMode()) {
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
      .addReg(SrcReg, 0, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  const MCInstrDesc &GPRIDXDesc =
      TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
      .addReg(SrcReg)
      .addReg(IdxReg)
      .addImm(SubReg);

  MI.eraseFromParent();
  return true;
}
// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
  MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register VecReg = MI.getOperand(1).getReg();
  Register ValReg = MI.getOperand(2).getReg();
  Register IdxReg = MI.getOperand(3).getReg();

  LLT VecTy = MRI->getType(DstReg);
  LLT ValTy = MRI->getType(ValReg);
  unsigned VecSize = VecTy.getSizeInBits();
  unsigned ValSize = ValTy.getSizeInBits();

  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  assert(VecTy.getElementType() == ValTy);

  // The index must be scalar. If it wasn't RegBankSelect should have moved this
  // into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
                                                                  *MRI);
  const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
                                                                  *MRI);

  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
    return false;

  unsigned SubReg;
  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
                                                     ValSize / 8);

  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
                         STI.useVGPRIndexMode();

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  if (!IndexMode) {
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);

    const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
        VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
    BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
      .addReg(VecReg)
      .addReg(ValReg)
      .addImm(SubReg);
    MI.eraseFromParent();
    return true;
  }

  const MCInstrDesc &GPRIDXDesc =
      TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
      .addReg(VecReg)
      .addReg(ValReg)
      .addReg(IdxReg)
      .addImm(SubReg);

  MI.eraseFromParent();
  return true;
}
static bool isZeroOrUndef(int X) {
  return X == 0 || X == -1;
}

static bool isOneOrUndef(int X) {
  return X == 1 || X == -1;
}

static bool isZeroOrOneOrUndef(int X) {
  return X == 0 || X == 1 || X == -1;
}

// Normalize a VOP3P shuffle mask to refer to the low/high half of a single
// source operand.
static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
                                   ArrayRef<int> Mask) {
  NewMask[0] = Mask[0];
  NewMask[1] = Mask[1];
  if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
    return Src0;

  assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
  assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);

  // Shift the mask inputs to be 0/1;
  NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
  NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
  return Src1;
}
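
// For example, a <2,3> mask reads both halves of Src1; it is rewritten to
// <0,1> and Src1 is returned as the single source vector.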
// This is only legal with VOP3P instructions as an aid to op_sel matching.
bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
  MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();

  const LLT V2S16 = LLT::fixed_vector(2, 16);
  if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
    return false;

  if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
    return false;

  assert(ShufMask.size() == 2);
  assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
  const TargetRegisterClass &RC = IsVALU ?
    AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;

  // Handle the degenerate case which should have folded out.
  if (ShufMask[0] == -1 && ShufMask[1] == -1) {
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);

    MI.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, RC, *MRI);
  }

  // A legal VOP3P mask only reads one of the sources.
  int Mask[2];
  Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
      !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
    return false;

  // TODO: This also should have been folded out
  if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
      .addReg(SrcVec);

    MI.eraseFromParent();
    return true;
  }

  if (Mask[0] == 1 && Mask[1] == -1) {
    if (IsVALU) {
      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
        .addImm(16)
        .addReg(SrcVec);
    } else {
      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
        .addReg(SrcVec)
        .addImm(16);
    }
  } else if (Mask[0] == -1 && Mask[1] == 0) {
    if (IsVALU) {
      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
        .addImm(16)
        .addReg(SrcVec);
    } else {
      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
        .addReg(SrcVec)
        .addImm(16);
    }
  } else if (Mask[0] == 0 && Mask[1] == 0) {
    if (IsVALU) {
      // Write low half of the register into the high half.
      MachineInstr *MovSDWA =
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
        .addImm(0)                             // $src0_modifiers
        .addReg(SrcVec)                        // $src0
        .addImm(0)                             // $clamp
        .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
        .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
        .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
        .addReg(SrcVec, RegState::Implicit);
      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
    } else {
      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
        .addReg(SrcVec)
        .addReg(SrcVec);
    }
  } else if (Mask[0] == 1 && Mask[1] == 1) {
    if (IsVALU) {
      // Write high half of the register into the low half.
      MachineInstr *MovSDWA =
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
        .addImm(0)                             // $src0_modifiers
        .addReg(SrcVec)                        // $src0
        .addImm(0)                             // $clamp
        .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
        .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
        .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
        .addReg(SrcVec, RegState::Implicit);
      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
    } else {
      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
        .addReg(SrcVec)
        .addReg(SrcVec);
    }
  } else if (Mask[0] == 1 && Mask[1] == 0) {
    if (IsVALU) {
      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg)
        .addReg(SrcVec)
        .addReg(SrcVec)
        .addImm(16);
    } else {
      Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
        .addReg(SrcVec)
        .addImm(16);
      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
        .addReg(TmpReg)
        .addReg(SrcVec);
    }
  } else
    llvm_unreachable("all shuffle masks should be handled");

  MI.eraseFromParent();
  return true;
}
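
// Manually select buffer atomic fadd for targets without the return variants
// (pre-gfx90a); only the no-return form is supported and any use of the
// result is diagnosed as an error.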
bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
  MachineInstr &MI) const {

  if (STI.hasGFX90AInsts())
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
    Function &F = MBB->getParent()->getFunction();
    DiagnosticInfoUnsupported
      NoFpRet(F, "return versions of fp atomics not supported",
              MI.getDebugLoc(), DS_Error);
    F.getContext().diagnose(NoFpRet);
    return false;
  }

  // FIXME: This is only needed because tablegen requires number of dst operands
  // in match and replace pattern to be the same. Otherwise patterns can be
  // exported from SDag path.
  MachineOperand &VDataIn = MI.getOperand(1);
  MachineOperand &VIndex = MI.getOperand(3);
  MachineOperand &VOffset = MI.getOperand(4);
  MachineOperand &SOffset = MI.getOperand(5);
  int16_t Offset = MI.getOperand(6).getImm();

  bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
  bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);

  unsigned Opcode;
  if (HasVOffset) {
    Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
                       : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
  } else {
    Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
                       : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
  }

  if (MRI->getType(VDataIn.getReg()).isVector()) {
    switch (Opcode) {
    case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
      Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
      break;
    case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
      Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
      break;
    case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
      Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
      break;
    case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
      Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
      break;
    }
  }

  auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
  I.add(VDataIn);

  if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
      Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
    Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
    BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
      .addReg(VIndex.getReg())
      .addImm(AMDGPU::sub0)
      .addReg(VOffset.getReg())
      .addImm(AMDGPU::sub1);

    I.addReg(IdxReg);
  } else if (HasVIndex) {
    I.add(VIndex);
  } else if (HasVOffset) {
    I.add(VOffset);
  }

  I.add(MI.getOperand(2)); // rsrc
  I.add(SOffset);
  I.addImm(Offset);
  I.addImm(MI.getOperand(7).getImm()); // cpol
  I.cloneMemRefs(MI);

  MI.eraseFromParent();

  return true;
}
bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
  MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const {

  if (STI.hasGFX90AInsts()) {
    // gfx90a adds return versions of the global atomic fadd instructions so no
    // special handling is required.
    return selectImpl(MI, *CoverageInfo);
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
    Function &F = MBB->getParent()->getFunction();
    DiagnosticInfoUnsupported
      NoFpRet(F, "return versions of fp atomics not supported",
              MI.getDebugLoc(), DS_Error);
    F.getContext().diagnose(NoFpRet);
    return false;
  }

  // FIXME: This is only needed because tablegen requires number of dst operands
  // in match and replace pattern to be the same. Otherwise patterns can be
  // exported from SDag path.
  auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal);

  Register Data = DataOp.getReg();
  const unsigned Opc = MRI->getType(Data).isVector() ?
    AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
    .addReg(Addr.first)
    .addReg(Data)
    .addImm(Addr.second)
    .addImm(0) // cpol
    .cloneMemRefs(MI);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
  MI.setDesc(TII.get(MI.getOperand(1).getImm()));
  MI.RemoveOperand(1);
  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
  return true;
}
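
// Top-level entry point: dispatch each generic opcode either to the
// TableGen-generated selectImpl or to one of the manual handlers above.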
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
  if (I.isPHI())
    return selectPHI(I);

  if (!I.isPreISelOpcode()) {
    if (I.isCopy())
      return selectCOPY(I);
    return true;
  }

  switch (I.getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_AND_OR_XOR(I);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_ADD_SUB(I);
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return selectG_UADDO_USUBO_UADDE_USUBE(I);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_PTRTOINT:
    return selectCOPY(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
    return selectG_CONSTANT(I);
  case TargetOpcode::G_FNEG:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_FNEG(I);
  case TargetOpcode::G_FABS:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_FABS(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_MERGE_VALUES:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_CONCAT_VECTORS:
    return selectG_MERGE_VALUES(I);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectG_UNMERGE_VALUES(I);
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
    return selectG_BUILD_VECTOR_TRUNC(I);
  case TargetOpcode::G_PTR_ADD:
    return selectG_PTR_ADD(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_FREEZE:
    return selectCOPY(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
    return selectG_INTRINSIC(I);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
  case TargetOpcode::G_ICMP:
    if (selectG_ICMP(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
  case TargetOpcode::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
    return selectG_LOAD_STORE_ATOMICRMW(I);
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
    return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
  case TargetOpcode::G_SEXT_INREG:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_SZA_EXT(I);
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_GLOBAL_VALUE:
    return selectG_GLOBAL_VALUE(I);
  case TargetOpcode::G_PTRMASK:
    return selectG_PTRMASK(I);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return selectG_EXTRACT_VECTOR_ELT(I);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return selectG_INSERT_VECTOR_ELT(I);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return selectG_SHUFFLE_VECTOR(I);
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
    const AMDGPU::ImageDimIntrinsicInfo *Intr
      = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
    assert(Intr && "not an image intrinsic with image pseudo");
    return selectImageIntrinsic(I, Intr);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
    return selectBVHIntrinsic(I);
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
    return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
  case AMDGPU::G_SBFX:
  case AMDGPU::G_UBFX:
    return selectG_SBFX_UBFX(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
  return false;
}
3257 InstructionSelector::ComplexRendererFns
3258 AMDGPUInstructionSelector::selectVCSRC(MachineOperand
&Root
) const {
3260 [=](MachineInstrBuilder
&MIB
) { MIB
.add(Root
); }
3265 std::pair
<Register
, unsigned>
3266 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand
&Root
,
3267 bool AllowAbs
) const {
3268 Register Src
= Root
.getReg();
3269 Register OrigSrc
= Src
;
3271 MachineInstr
*MI
= getDefIgnoringCopies(Src
, *MRI
);
3273 if (MI
&& MI
->getOpcode() == AMDGPU::G_FNEG
) {
3274 Src
= MI
->getOperand(1).getReg();
3275 Mods
|= SISrcMods::NEG
;
3276 MI
= getDefIgnoringCopies(Src
, *MRI
);
3279 if (AllowAbs
&& MI
&& MI
->getOpcode() == AMDGPU::G_FABS
) {
3280 Src
= MI
->getOperand(1).getReg();
3281 Mods
|= SISrcMods::ABS
;
3285 RBI
.getRegBank(Src
, *MRI
, TRI
)->getID() != AMDGPU::VGPRRegBankID
) {
3286 MachineInstr
*UseMI
= Root
.getParent();
3288 // If we looked through copies to find source modifiers on an SGPR operand,
3289 // we now have an SGPR register source. To avoid potentially violating the
3290 // constant bus restriction, we need to insert a copy to a VGPR.
3291 Register VGPRSrc
= MRI
->cloneVirtualRegister(OrigSrc
);
3292 BuildMI(*UseMI
->getParent(), UseMI
, UseMI
->getDebugLoc(),
3293 TII
.get(AMDGPU::COPY
), VGPRSrc
)
3298 return std::make_pair(Src
, Mods
);
3302 /// This will select either an SGPR or VGPR operand and will save us from
3303 /// having to write an extra tablegen pattern.
3304 InstructionSelector::ComplexRendererFns
3305 AMDGPUInstructionSelector::selectVSRC0(MachineOperand
&Root
) const {
3307 [=](MachineInstrBuilder
&MIB
) { MIB
.add(Root
); }
3311 InstructionSelector::ComplexRendererFns
3312 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand
&Root
) const {
3315 std::tie(Src
, Mods
) = selectVOP3ModsImpl(Root
);
3318 [=](MachineInstrBuilder
&MIB
) { MIB
.addReg(Src
); },
3319 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(Mods
); }, // src0_mods
3320 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(0); }, // clamp
3321 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(0); } // omod
3325 InstructionSelector::ComplexRendererFns
3326 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand
&Root
) const {
3329 std::tie(Src
, Mods
) = selectVOP3ModsImpl(Root
, /* AllowAbs */ false);
3332 [=](MachineInstrBuilder
&MIB
) { MIB
.addReg(Src
); },
3333 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(Mods
); }, // src0_mods
3334 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(0); }, // clamp
3335 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(0); } // omod
3339 InstructionSelector::ComplexRendererFns
3340 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand
&Root
) const {
3342 [=](MachineInstrBuilder
&MIB
) { MIB
.add(Root
); },
3343 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(0); }, // clamp
3344 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(0); } // omod
3348 InstructionSelector::ComplexRendererFns
3349 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand
&Root
) const {
3352 std::tie(Src
, Mods
) = selectVOP3ModsImpl(Root
);
3355 [=](MachineInstrBuilder
&MIB
) { MIB
.addReg(Src
); },
3356 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(Mods
); } // src_mods
3360 InstructionSelector::ComplexRendererFns
3361 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand
&Root
) const {
3364 std::tie(Src
, Mods
) = selectVOP3ModsImpl(Root
, /* AllowAbs */ false);
3367 [=](MachineInstrBuilder
&MIB
) { MIB
.addReg(Src
); },
3368 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(Mods
); } // src_mods
3372 InstructionSelector::ComplexRendererFns
3373 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand
&Root
) const {
3374 Register Reg
= Root
.getReg();
3375 const MachineInstr
*Def
= getDefIgnoringCopies(Reg
, *MRI
);
3376 if (Def
&& (Def
->getOpcode() == AMDGPU::G_FNEG
||
3377 Def
->getOpcode() == AMDGPU::G_FABS
))
3380 [=](MachineInstrBuilder
&MIB
) { MIB
.addReg(Reg
); },
3384 std::pair
<Register
, unsigned>
3385 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3386 Register Src
, const MachineRegisterInfo
&MRI
) const {
3388 MachineInstr
*MI
= MRI
.getVRegDef(Src
);
3390 if (MI
&& MI
->getOpcode() == AMDGPU::G_FNEG
&&
3391 // It's possible to see an f32 fneg here, but unlikely.
3392 // TODO: Treat f32 fneg as only high bit.
3393 MRI
.getType(Src
) == LLT::fixed_vector(2, 16)) {
3394 Mods
^= (SISrcMods::NEG
| SISrcMods::NEG_HI
);
3395 Src
= MI
->getOperand(1).getReg();
3396 MI
= MRI
.getVRegDef(Src
);
3399 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3401 // Packed instructions do not have abs modifiers.
3402 Mods
|= SISrcMods::OP_SEL_1
;
3404 return std::make_pair(Src
, Mods
);
3407 InstructionSelector::ComplexRendererFns
3408 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand
&Root
) const {
3409 MachineRegisterInfo
&MRI
3410 = Root
.getParent()->getParent()->getParent()->getRegInfo();
3414 std::tie(Src
, Mods
) = selectVOP3PModsImpl(Root
.getReg(), MRI
);
3417 [=](MachineInstrBuilder
&MIB
) { MIB
.addReg(Src
); },
3418 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(Mods
); } // src_mods
3422 InstructionSelector::ComplexRendererFns
3423 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand
&Root
) const {
3426 std::tie(Src
, Mods
) = selectVOP3ModsImpl(Root
);
3427 if (!isKnownNeverNaN(Src
, *MRI
))
3431 [=](MachineInstrBuilder
&MIB
) { MIB
.addReg(Src
); },
3432 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(Mods
); } // src_mods
3436 InstructionSelector::ComplexRendererFns
3437 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand
&Root
) const {
3438 // FIXME: Handle op_sel
3440 [=](MachineInstrBuilder
&MIB
) { MIB
.addReg(Root
.getReg()); },
3441 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(0); } // src_mods
3445 InstructionSelector::ComplexRendererFns
3446 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand
&Root
) const {
3447 SmallVector
<GEPInfo
, 4> AddrInfo
;
3448 getAddrModeInfo(*Root
.getParent(), *MRI
, AddrInfo
);
3450 if (AddrInfo
.empty() || AddrInfo
[0].SgprParts
.size() != 1)
3453 const GEPInfo
&GEPInfo
= AddrInfo
[0];
3454 Optional
<int64_t> EncodedImm
=
3455 AMDGPU::getSMRDEncodedOffset(STI
, GEPInfo
.Imm
, false);
3459 unsigned PtrReg
= GEPInfo
.SgprParts
[0];
3461 [=](MachineInstrBuilder
&MIB
) { MIB
.addReg(PtrReg
); },
3462 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(*EncodedImm
); }
3466 InstructionSelector::ComplexRendererFns
3467 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand
&Root
) const {
3468 SmallVector
<GEPInfo
, 4> AddrInfo
;
3469 getAddrModeInfo(*Root
.getParent(), *MRI
, AddrInfo
);
3471 if (AddrInfo
.empty() || AddrInfo
[0].SgprParts
.size() != 1)
3474 const GEPInfo
&GEPInfo
= AddrInfo
[0];
3475 Register PtrReg
= GEPInfo
.SgprParts
[0];
3476 Optional
<int64_t> EncodedImm
=
3477 AMDGPU::getSMRDEncodedLiteralOffset32(STI
, GEPInfo
.Imm
);
3482 [=](MachineInstrBuilder
&MIB
) { MIB
.addReg(PtrReg
); },
3483 [=](MachineInstrBuilder
&MIB
) { MIB
.addImm(*EncodedImm
); }
3487 InstructionSelector::ComplexRendererFns
3488 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand
&Root
) const {
3489 MachineInstr
*MI
= Root
.getParent();
3490 MachineBasicBlock
*MBB
= MI
->getParent();
3492 SmallVector
<GEPInfo
, 4> AddrInfo
;
3493 getAddrModeInfo(*MI
, *MRI
, AddrInfo
);
3495 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
3496 // then we can select all ptr + 32-bit offsets not just immediate offsets.
3497 if (AddrInfo
.empty() || AddrInfo
[0].SgprParts
.size() != 1)
3500 const GEPInfo
&GEPInfo
= AddrInfo
[0];
3501 // SGPR offset is unsigned.
3502 if (!GEPInfo
.Imm
|| GEPInfo
.Imm
< 0 || !isUInt
<32>(GEPInfo
.Imm
))
3505 // If we make it this far we have a load with an 32-bit immediate offset.
3506 // It is OK to select this using a sgpr offset, because we have already
3507 // failed trying to select this load into one of the _IMM variants since
3508 // the _IMM Patterns are considered before the _SGPR patterns.
3509 Register PtrReg
= GEPInfo
.SgprParts
[0];
3510 Register OffsetReg
= MRI
->createVirtualRegister(&AMDGPU::SReg_32RegClass
);
3511 BuildMI(*MBB
, MI
, MI
->getDebugLoc(), TII
.get(AMDGPU::S_MOV_B32
), OffsetReg
)
3512 .addImm(GEPInfo
.Imm
);
3514 [=](MachineInstrBuilder
&MIB
) { MIB
.addReg(PtrReg
); },
3515 [=](MachineInstrBuilder
&MIB
) { MIB
.addReg(OffsetReg
); }
std::pair<Register, int>
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
                                                uint64_t FlatVariant) const {
  MachineInstr *MI = Root.getParent();

  auto Default = std::make_pair(Root.getReg(), 0);

  if (!STI.hasFlatInstOffsets())
    return Default;

  Register PtrBase;
  int64_t ConstOffset;
  std::tie(PtrBase, ConstOffset) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
  if (ConstOffset == 0)
    return Default;

  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
  if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
    return Default;

  return std::make_pair(PtrBase, ConstOffset);
}
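// Usage note (summary of the helper above, not a separate code path): when the
// root address is (G_PTR_ADD %base, G_CONSTANT N) and N is a legal FLAT offset
// for the requested variant, {%base, N} is returned so N can live in the
// instruction's immediate field; in every other case the unmodified root
// register is returned with a zero offset.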
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
    }};
}
/// Match a zero extend from a 32-bit value to 64-bits.
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
  Register ZExtSrc;
  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
    return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();

  // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
    return Register();

  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
    return Def->getOperand(1).getReg();
  }

  return Register();
}
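// The two shapes accepted above, written out as illustrative gMIR:
//   %ext:_(s64) = G_ZEXT %x:_(s32)                       ; direct zero extend
//   %ext:_(s64) = G_MERGE_VALUES %x:_(s32), %zero:_(s32) ; legalized form with
//                                                        ; a known-zero high half
// In both cases %x is returned; any other definition yields an invalid
// Register().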
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
  Register Addr = Root.getReg();
  Register PtrBase;
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.
  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);

  if (ConstOffset != 0) {
    if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
                              SIInstrFlags::FlatGlobal)) {
      Addr = PtrBase;
      ImmOffset = ConstOffset;
    } else {
      auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
      if (!PtrBaseDef)
        return None;

      if (isSGPR(PtrBaseDef->Reg)) {
        if (ConstOffset > 0) {
          // Offset is too large.
          //
          // saddr + large_offset -> saddr +
          //                         (voffset = large_offset & ~MaxOffset) +
          //                         (large_offset & MaxOffset);
          int64_t SplitImmOffset, RemainderOffset;
          std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
              ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);

          if (isUInt<32>(RemainderOffset)) {
            MachineInstr *MI = Root.getParent();
            MachineBasicBlock *MBB = MI->getParent();
            Register HighBits =
                MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

            BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
                    HighBits)
                .addImm(RemainderOffset);

            return {{
                [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
                [=](MachineInstrBuilder &MIB) {
                  MIB.addReg(HighBits);
                }, // voffset
                [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
            }};
          }
        }

        // We are adding a 64 bit SGPR and a constant. If constant bus limit
        // is 1 we would need to perform 1 or 2 extra moves for each half of
        // the constant and it is better to do a scalar add and then issue a
        // single VALU instruction to materialize zero. Otherwise it is fewer
        // instructions to perform VALU adds with immediates or inline literals.
        unsigned NumLiterals =
            !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
            !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
        if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
          return None;
      }
    }
  }

  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
  if (!AddrDef)
    return None;

  // Match the variable offset.
  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
    // Look through the SGPR->VGPR copy.
    Register SAddr =
        getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);

    if (SAddr && isSGPR(SAddr)) {
      Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();

      // It's possible voffset is an SGPR here, but the copy to VGPR will be
      // inserted later.
      if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
        return {{[=](MachineInstrBuilder &MIB) { // saddr
                   MIB.addReg(SAddr);
                 },
                 [=](MachineInstrBuilder &MIB) { // voffset
                   MIB.addReg(VOffset);
                 },
                 [=](MachineInstrBuilder &MIB) { // offset
                   MIB.addImm(ImmOffset);
                 }}};
      }
    }
  }

  // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
  // drop this.
  if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
      AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
    return None;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
      .addImm(0);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
      [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },      // voffset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }     // offset
  }};
}
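// Worked example for the large-offset split above (hypothetical numbers; the
// real bounds come from TII.splitFlatOffset for the subtarget): with a maximum
// legal immediate of 4095, saddr + 0x11234 would become
//   voffset = V_MOV_B32 (0x11234 & ~4095) = 0x11000
//   offset  = 0x11234 & 4095              = 0x234
// so the SGPR base is still used directly as the saddr operand.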
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
  Register Addr = Root.getReg();
  Register PtrBase;
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.
  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);

  if (ConstOffset != 0 &&
      TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
                            SIInstrFlags::FlatScratch)) {
    Addr = PtrBase;
    ImmOffset = ConstOffset;
  }

  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
  if (!AddrDef)
    return None;

  if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
    int FI = AddrDef->MI->getOperand(1).getIndex();
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }  // offset
    }};
  }

  Register SAddr = AddrDef->Reg;

  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
    Register LHS = AddrDef->MI->getOperand(1).getReg();
    Register RHS = AddrDef->MI->getOperand(2).getReg();
    auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
    auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);

    if (LHSDef && RHSDef &&
        LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
        isSGPR(RHSDef->Reg)) {
      int FI = LHSDef->MI->getOperand(1).getIndex();
      MachineInstr &I = *Root.getParent();
      MachineBasicBlock *BB = I.getParent();
      const DebugLoc &DL = I.getDebugLoc();
      SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

      BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
          .addFrameIndex(FI)
          .addReg(RHSDef->Reg);
    }
  }

  if (!isSGPR(SAddr))
    return None;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); },    // saddr
      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
  }};
}
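// Sketch of the G_PTR_ADD fold above (illustrative):
//   %fi   = G_FRAME_INDEX %stack.N
//   %addr = G_PTR_ADD %fi, %uniform_off
// is rewritten to a scalar add, S_ADD_I32 %saddr, %stack.N, %uniform_off, and
// %saddr becomes the SADDR operand; any legal constant part of the address was
// already peeled into ImmOffset earlier in the function.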
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  int64_t Offset = 0;
  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
      Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // TODO: Should this be inside the render function? The iterator seems to
    // move.
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
      .addImm(Offset & ~4095);

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               // Use constant zero for soffset and rely on eliminateFrameIndex
               // to choose the appropriate frame register if need be.
               MIB.addImm(0);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & 4095);
             }}};
  }

  assert(Offset == 0 || Offset == -1);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  Optional<int> FI;
  Register VAddr = Root.getReg();
  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
    Register PtrBase;
    int64_t ConstOffset;
    std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
    if (ConstOffset != 0) {
      if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
          (!STI.privateMemoryResourceIsRangeChecked() ||
           KnownBits->signBitIsZero(PtrBase))) {
        const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
        if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
          FI = PtrBaseDef->getOperand(1).getIndex();
        else
          VAddr = PtrBase;
        Offset = ConstOffset;
      }
    } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
      FI = RootDef->getOperand(1).getIndex();
    }
  }

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI.hasValue())
               MIB.addFrameIndex(FI.getValue());
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             // Use constant zero for soffset and rely on eliminateFrameIndex
             // to choose the appropriate frame register if need be.
             MIB.addImm(0);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}
bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
                                                int64_t Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return KnownBits->signBitIsZero(Base);
}

bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
                                                 int64_t Offset1,
                                                 unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;

  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return KnownBits->signBitIsZero(Base);
}
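// Example (illustrative arithmetic only): for a read2/write2 with Size == 4,
// byte offsets 40 and 44 are legal because both are multiples of 4 and
// 40/4 == 10 and 44/4 == 11 both fit the 8-bit offset0/offset1 fields.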
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  int64_t Offset = 0;
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
    return None;

  const MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(Info->getScratchRSrcReg());
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
  }};
}
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    if (isDSOffsetLegal(PtrBase, Offset)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, Offset);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::make_pair(Root.getReg(), 0);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
    }};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  return selectDSReadWrite2(Root, 4);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
  return selectDSReadWrite2(Root, 8);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
                                              unsigned Size) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
    }};
}
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
                                                  unsigned Size) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    int64_t OffsetValue0 = Offset;
    int64_t OffsetValue1 = Offset + Size;
    if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, OffsetValue0 / Size);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::make_pair(Root.getReg(), 0);
}
/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset. There may be intervening copies
/// between \p Root and the identified constant. Returns \p Root, 0 if this does
/// not match the pattern.
std::pair<Register, int64_t>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
  Register Root, const MachineRegisterInfo &MRI) const {
  MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return {Root, 0};

  MachineOperand &RHS = RootI->getOperand(2);
  Optional<ValueAndVReg> MaybeOffset
    = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
  if (!MaybeOffset)
    return {Root, 0};
  return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
}
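// Illustrative match for the helper above:
//   %c    = G_CONSTANT i64 16
//   %copy = COPY %c
//   %ptr  = G_PTR_ADD %base, %copy
// returns {%base, 16}; anything that is not a G_PTR_ADD with a constant
// right-hand side returns {Root, 0}.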
static void addZeroImm(MachineInstrBuilder &MIB) {
  MIB.addImm(0);
}
/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
/// BasePtr is not valid, a null base pointer will be used.
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc2)
    .addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc3)
    .addImm(FormatHi);

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrcHi)
    .addReg(RSrc2)
    .addImm(AMDGPU::sub0)
    .addReg(RSrc3)
    .addImm(AMDGPU::sub1);

  Register RSrcLo = BasePtr;
  if (!BasePtr) {
    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
      .addDef(RSrcLo)
      .addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrc)
    .addReg(RSrcLo)
    .addImm(AMDGPU::sub0_sub1)
    .addReg(RSrcHi)
    .addImm(AMDGPU::sub2_sub3);

  return RSrc;
}
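// Note (summary of the code above, not a hardware programming reference): the
// 128-bit descriptor is assembled as
//   REG_SEQUENCE(RSrcLo:sub0_sub1, RSrcHi:sub2_sub3)
// where RSrcLo is the 64-bit base pointer (or an S_MOV_B64 0 when no base is
// supplied) and RSrcHi packs the two 32-bit FormatLo/FormatHi constants.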
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}

static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}
AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Data;
  Data.N0 = Src;

  Register PtrBase;
  int64_t Offset;

  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
  if (isUInt<32>(Offset)) {
    Data.N0 = PtrBase;
    Data.Offset = Offset;
  }

  if (MachineInstr *InputAdd
      = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
    Data.N2 = InputAdd->getOperand(1).getReg();
    Data.N3 = InputAdd->getOperand(2).getReg();

    // FIXME: Need to fix extra SGPR->VGPR copies inserted
    // FIXME: Don't know this was defined by operand 0
    //
    // TODO: Remove this when we have copy folding optimizations after
    // RegBankSelect.
    Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
    Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
  }

  return Data;
}

/// Return whether the addr64 mubuf mode should be used for the given address.
bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
  // (ptr_add N2, N3) -> addr64, or
  // (ptr_add (ptr_add N2, N3), C1) -> addr64
  if (Addr.N2)
    return true;

  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
}
/// Split an immediate offset \p ImmOffset depending on whether it fits in the
/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
/// component.
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
  MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
    return;

  // Illegal offset, store it in soffset.
  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(SOffset)
    .addImm(ImmOffset);
  ImmOffset = 0;
}
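// Worked example (hypothetical value): if ImmOffset were 8192, which does not
// satisfy SIInstrInfo::isLegalMUBUFImmOffset, the constant would be moved into
// a fresh SGPR with S_MOV_B32 and ImmOffset reset to 0, leaving the immediate
// field empty and the variable part in SOffset.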
bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
  Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // addr64 bit was removed for volcanic islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))
    return false;

  Register N0 = AddrData.N0;
  Register N2 = AddrData.N2;
  Register N3 = AddrData.N3;
  Offset = AddrData.Offset;

  // Base pointer for the SRD.
  Register SRDPtr;

  if (N2) {
    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
      assert(N3);
      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the default resource from a 0 address.
        VAddr = N0;
      } else {
        SRDPtr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      SRDPtr = N2;
      VAddr = N3;
    }
  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    // Use the default null pointer in the resource
    VAddr = N0;
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    SRDPtr = N0;
  }

  MachineIRBuilder B(*Root.getParent());
  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}
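// Naming sketch (mirrors parseMUBUFAddress, not an additional code path): N0
// is the address with any 32-bit constant already peeled into Offset; when N0
// is itself a G_PTR_ADD, its operands become N2 and N3. The divergent piece
// ends up as the addr64 VAddr operand and the uniform piece, if any, seeds the
// resource descriptor's base pointer.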
bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
  MachineOperand &Root, Register &RSrcReg, Register &SOffset,
  int64_t &Offset) const {

  // FIXME: Pattern should not reach here.
  if (STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (shouldUseAddr64(AddrData))
    return false;

  // N0 -> offset, or
  // (N0 + C1) -> offset
  Register SRDPtr = AddrData.N0;
  Offset = AddrData.Offset;

  // TODO: Look through extensions for 32-bit soffset.
  MachineIRBuilder B(*Root.getParent());

  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm, //  cpol
      addZeroImm, //  tfe
      addZeroImm  //  swz
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm, //  cpol
      addZeroImm, //  tfe
      addZeroImm, //  swz
    }};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      [=](MachineInstrBuilder &MIB) {
        MIB.addImm(AMDGPU::CPol::GLC); // cpol
      }
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol
    }};
}
/// Get an immediate that must be 32-bits, and treated as zero extended.
static Optional<uint64_t> getConstantZext32Val(Register Reg,
                                               const MachineRegisterInfo &MRI) {
  // getConstantVRegVal sexts any values, so see if that matters.
  Optional<int64_t> OffsetVal = getConstantVRegSExtVal(Reg, MRI);
  if (!OffsetVal || !isInt<32>(*OffsetVal))
    return None;
  return Lo_32(*OffsetVal);
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm
    = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}
void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx == -1);

  const MachineOperand &Op = MI.getOperand(1);
  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
    MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
  else {
    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
    MIB.addImm(Op.getCImm()->getSExtValue());
  }
}

void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
}
/// This only really exists to satisfy DAG type checking machinery, so is a
/// no-op here.
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  MIB.addImm(MI.getOperand(OpIdx).getImm());
}

void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL);
}

void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
}

void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
                                             const MachineInstr &MI,
                                             int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
}

void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  MIB.addFrameIndex((MI.getOperand(1).getIndex()));
}
bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  return TII.isInlineConstant(Imm);
}