1 //===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //==-----------------------------------------------------------------------===//
10 /// Defines an instruction selector for the AMDGPU target.
12 //===----------------------------------------------------------------------===//
15 #include "AMDGPUArgumentUsageInfo.h"
16 #include "AMDGPUISelLowering.h" // For AMDGPUISD
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUPerfHintAnalysis.h"
19 #include "AMDGPURegisterInfo.h"
20 #include "AMDGPUSubtarget.h"
21 #include "AMDGPUTargetMachine.h"
22 #include "SIDefines.h"
23 #include "SIISelLowering.h"
24 #include "SIInstrInfo.h"
25 #include "SIMachineFunctionInfo.h"
26 #include "SIRegisterInfo.h"
27 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
28 #include "llvm/ADT/APInt.h"
29 #include "llvm/ADT/SmallVector.h"
30 #include "llvm/ADT/StringRef.h"
31 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
32 #include "llvm/Analysis/ValueTracking.h"
33 #include "llvm/CodeGen/FunctionLoweringInfo.h"
34 #include "llvm/CodeGen/ISDOpcodes.h"
35 #include "llvm/CodeGen/MachineFunction.h"
36 #include "llvm/CodeGen/MachineRegisterInfo.h"
37 #include "llvm/CodeGen/SelectionDAG.h"
38 #include "llvm/CodeGen/SelectionDAGISel.h"
39 #include "llvm/CodeGen/SelectionDAGNodes.h"
40 #include "llvm/CodeGen/ValueTypes.h"
41 #include "llvm/IR/BasicBlock.h"
42 #ifdef EXPENSIVE_CHECKS
43 #include "llvm/IR/Dominators.h"
45 #include "llvm/IR/Instruction.h"
46 #include "llvm/MC/MCInstrDesc.h"
47 #include "llvm/Support/Casting.h"
48 #include "llvm/Support/CodeGen.h"
49 #include "llvm/Support/ErrorHandling.h"
50 #include "llvm/Support/MachineValueType.h"
51 #include "llvm/Support/MathExtras.h"
57 #define DEBUG_TYPE "isel"
65 } // end namespace llvm
67 //===----------------------------------------------------------------------===//
68 // Instruction Selector Implementation
69 //===----------------------------------------------------------------------===//
73 static bool isNullConstantOrUndef(SDValue V
) {
77 ConstantSDNode
*Const
= dyn_cast
<ConstantSDNode
>(V
);
78 return Const
!= nullptr && Const
->isNullValue();
81 static bool getConstantValue(SDValue N
, uint32_t &Out
) {
82 // This is only used for packed vectors, where ussing 0 for undef should
89 if (const ConstantSDNode
*C
= dyn_cast
<ConstantSDNode
>(N
)) {
90 Out
= C
->getAPIntValue().getSExtValue();
94 if (const ConstantFPSDNode
*C
= dyn_cast
<ConstantFPSDNode
>(N
)) {
95 Out
= C
->getValueAPF().bitcastToAPInt().getSExtValue();
102 // TODO: Handle undef as zero
103 static SDNode
*packConstantV2I16(const SDNode
*N
, SelectionDAG
&DAG
,
104 bool Negate
= false) {
105 assert(N
->getOpcode() == ISD::BUILD_VECTOR
&& N
->getNumOperands() == 2);
106 uint32_t LHSVal
, RHSVal
;
107 if (getConstantValue(N
->getOperand(0), LHSVal
) &&
108 getConstantValue(N
->getOperand(1), RHSVal
)) {
110 uint32_t K
= Negate
?
111 (-LHSVal
& 0xffff) | (-RHSVal
<< 16) :
112 (LHSVal
& 0xffff) | (RHSVal
<< 16);
113 return DAG
.getMachineNode(AMDGPU::S_MOV_B32
, SL
, N
->getValueType(0),
114 DAG
.getTargetConstant(K
, SL
, MVT::i32
));
120 static SDNode
*packNegConstantV2I16(const SDNode
*N
, SelectionDAG
&DAG
) {
121 return packConstantV2I16(N
, DAG
, true);
124 /// AMDGPU specific code to select AMDGPU machine instructions for
125 /// SelectionDAG operations.
126 class AMDGPUDAGToDAGISel
: public SelectionDAGISel
{
127 // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
128 // make the right decision when generating code for different targets.
129 const GCNSubtarget
*Subtarget
;
130 bool EnableLateStructurizeCFG
;
133 explicit AMDGPUDAGToDAGISel(TargetMachine
*TM
= nullptr,
134 CodeGenOpt::Level OptLevel
= CodeGenOpt::Default
)
135 : SelectionDAGISel(*TM
, OptLevel
) {
136 EnableLateStructurizeCFG
= AMDGPUTargetMachine::EnableLateStructurizeCFG
;
138 ~AMDGPUDAGToDAGISel() override
= default;
140 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
141 AU
.addRequired
<AMDGPUArgumentUsageInfo
>();
142 AU
.addRequired
<LegacyDivergenceAnalysis
>();
143 #ifdef EXPENSIVE_CHECKS
144 AU
.addRequired
<DominatorTreeWrapperPass
>();
145 AU
.addRequired
<LoopInfoWrapperPass
>();
147 SelectionDAGISel::getAnalysisUsage(AU
);
150 bool matchLoadD16FromBuildVector(SDNode
*N
) const;
152 bool runOnMachineFunction(MachineFunction
&MF
) override
;
153 void PreprocessISelDAG() override
;
154 void Select(SDNode
*N
) override
;
155 StringRef
getPassName() const override
;
156 void PostprocessISelDAG() override
;
159 void SelectBuildVector(SDNode
*N
, unsigned RegClassID
);
162 std::pair
<SDValue
, SDValue
> foldFrameIndex(SDValue N
) const;
163 bool isNoNanSrc(SDValue N
) const;
164 bool isInlineImmediate(const SDNode
*N
, bool Negated
= false) const;
165 bool isNegInlineImmediate(const SDNode
*N
) const {
166 return isInlineImmediate(N
, true);
169 bool isVGPRImm(const SDNode
*N
) const;
170 bool isUniformLoad(const SDNode
*N
) const;
171 bool isUniformBr(const SDNode
*N
) const;
173 MachineSDNode
*buildSMovImm64(SDLoc
&DL
, uint64_t Val
, EVT VT
) const;
175 SDNode
*glueCopyToM0LDSInit(SDNode
*N
) const;
176 SDNode
*glueCopyToM0(SDNode
*N
, SDValue Val
) const;
178 const TargetRegisterClass
*getOperandRegClass(SDNode
*N
, unsigned OpNo
) const;
179 virtual bool SelectADDRVTX_READ(SDValue Addr
, SDValue
&Base
, SDValue
&Offset
);
180 virtual bool SelectADDRIndirect(SDValue Addr
, SDValue
&Base
, SDValue
&Offset
);
181 bool isDSOffsetLegal(SDValue Base
, unsigned Offset
,
182 unsigned OffsetBits
) const;
183 bool SelectDS1Addr1Offset(SDValue Ptr
, SDValue
&Base
, SDValue
&Offset
) const;
184 bool SelectDS64Bit4ByteAligned(SDValue Ptr
, SDValue
&Base
, SDValue
&Offset0
,
185 SDValue
&Offset1
) const;
186 bool SelectMUBUF(SDValue Addr
, SDValue
&SRsrc
, SDValue
&VAddr
,
187 SDValue
&SOffset
, SDValue
&Offset
, SDValue
&Offen
,
188 SDValue
&Idxen
, SDValue
&Addr64
, SDValue
&GLC
, SDValue
&SLC
,
189 SDValue
&TFE
, SDValue
&DLC
, SDValue
&SWZ
) const;
190 bool SelectMUBUFAddr64(SDValue Addr
, SDValue
&SRsrc
, SDValue
&VAddr
,
191 SDValue
&SOffset
, SDValue
&Offset
, SDValue
&GLC
,
192 SDValue
&SLC
, SDValue
&TFE
, SDValue
&DLC
,
194 bool SelectMUBUFAddr64(SDValue Addr
, SDValue
&SRsrc
,
195 SDValue
&VAddr
, SDValue
&SOffset
, SDValue
&Offset
,
197 bool SelectMUBUFScratchOffen(SDNode
*Parent
,
198 SDValue Addr
, SDValue
&RSrc
, SDValue
&VAddr
,
199 SDValue
&SOffset
, SDValue
&ImmOffset
) const;
200 bool SelectMUBUFScratchOffset(SDNode
*Parent
,
201 SDValue Addr
, SDValue
&SRsrc
, SDValue
&Soffset
,
202 SDValue
&Offset
) const;
204 bool SelectMUBUFOffset(SDValue Addr
, SDValue
&SRsrc
, SDValue
&SOffset
,
205 SDValue
&Offset
, SDValue
&GLC
, SDValue
&SLC
,
206 SDValue
&TFE
, SDValue
&DLC
, SDValue
&SWZ
) const;
207 bool SelectMUBUFOffset(SDValue Addr
, SDValue
&SRsrc
, SDValue
&Soffset
,
208 SDValue
&Offset
, SDValue
&SLC
) const;
209 bool SelectMUBUFOffset(SDValue Addr
, SDValue
&SRsrc
, SDValue
&Soffset
,
210 SDValue
&Offset
) const;
212 template <bool IsSigned
>
213 bool SelectFlatOffset(SDNode
*N
, SDValue Addr
, SDValue
&VAddr
,
214 SDValue
&Offset
, SDValue
&SLC
) const;
215 bool SelectFlatAtomic(SDNode
*N
, SDValue Addr
, SDValue
&VAddr
,
216 SDValue
&Offset
, SDValue
&SLC
) const;
217 bool SelectFlatAtomicSigned(SDNode
*N
, SDValue Addr
, SDValue
&VAddr
,
218 SDValue
&Offset
, SDValue
&SLC
) const;
220 bool SelectSMRDOffset(SDValue ByteOffsetNode
, SDValue
&Offset
,
222 SDValue
Expand32BitAddress(SDValue Addr
) const;
223 bool SelectSMRD(SDValue Addr
, SDValue
&SBase
, SDValue
&Offset
,
225 bool SelectSMRDImm(SDValue Addr
, SDValue
&SBase
, SDValue
&Offset
) const;
226 bool SelectSMRDImm32(SDValue Addr
, SDValue
&SBase
, SDValue
&Offset
) const;
227 bool SelectSMRDSgpr(SDValue Addr
, SDValue
&SBase
, SDValue
&Offset
) const;
228 bool SelectSMRDBufferImm(SDValue Addr
, SDValue
&Offset
) const;
229 bool SelectSMRDBufferImm32(SDValue Addr
, SDValue
&Offset
) const;
230 bool SelectMOVRELOffset(SDValue Index
, SDValue
&Base
, SDValue
&Offset
) const;
232 bool SelectVOP3Mods_NNaN(SDValue In
, SDValue
&Src
, SDValue
&SrcMods
) const;
233 bool SelectVOP3Mods_f32(SDValue In
, SDValue
&Src
, SDValue
&SrcMods
) const;
234 bool SelectVOP3ModsImpl(SDValue In
, SDValue
&Src
, unsigned &SrcMods
) const;
235 bool SelectVOP3Mods(SDValue In
, SDValue
&Src
, SDValue
&SrcMods
) const;
236 bool SelectVOP3NoMods(SDValue In
, SDValue
&Src
) const;
237 bool SelectVOP3Mods0(SDValue In
, SDValue
&Src
, SDValue
&SrcMods
,
238 SDValue
&Clamp
, SDValue
&Omod
) const;
239 bool SelectVOP3NoMods0(SDValue In
, SDValue
&Src
, SDValue
&SrcMods
,
240 SDValue
&Clamp
, SDValue
&Omod
) const;
242 bool SelectVOP3Mods0Clamp0OMod(SDValue In
, SDValue
&Src
, SDValue
&SrcMods
,
244 SDValue
&Omod
) const;
246 bool SelectVOP3OMods(SDValue In
, SDValue
&Src
,
247 SDValue
&Clamp
, SDValue
&Omod
) const;
249 bool SelectVOP3PMods(SDValue In
, SDValue
&Src
, SDValue
&SrcMods
) const;
250 bool SelectVOP3PMods0(SDValue In
, SDValue
&Src
, SDValue
&SrcMods
,
251 SDValue
&Clamp
) const;
253 bool SelectVOP3OpSel(SDValue In
, SDValue
&Src
, SDValue
&SrcMods
) const;
254 bool SelectVOP3OpSel0(SDValue In
, SDValue
&Src
, SDValue
&SrcMods
,
255 SDValue
&Clamp
) const;
257 bool SelectVOP3OpSelMods(SDValue In
, SDValue
&Src
, SDValue
&SrcMods
) const;
258 bool SelectVOP3OpSelMods0(SDValue In
, SDValue
&Src
, SDValue
&SrcMods
,
259 SDValue
&Clamp
) const;
260 bool SelectVOP3PMadMixModsImpl(SDValue In
, SDValue
&Src
, unsigned &Mods
) const;
261 bool SelectVOP3PMadMixMods(SDValue In
, SDValue
&Src
, SDValue
&SrcMods
) const;
263 SDValue
getHi16Elt(SDValue In
) const;
265 void SelectADD_SUB_I64(SDNode
*N
);
266 void SelectAddcSubb(SDNode
*N
);
267 void SelectUADDO_USUBO(SDNode
*N
);
268 void SelectDIV_SCALE(SDNode
*N
);
269 void SelectDIV_FMAS(SDNode
*N
);
270 void SelectMAD_64_32(SDNode
*N
);
271 void SelectFMA_W_CHAIN(SDNode
*N
);
272 void SelectFMUL_W_CHAIN(SDNode
*N
);
274 SDNode
*getS_BFE(unsigned Opcode
, const SDLoc
&DL
, SDValue Val
,
275 uint32_t Offset
, uint32_t Width
);
276 void SelectS_BFEFromShifts(SDNode
*N
);
277 void SelectS_BFE(SDNode
*N
);
278 bool isCBranchSCC(const SDNode
*N
) const;
279 void SelectBRCOND(SDNode
*N
);
280 void SelectFMAD_FMA(SDNode
*N
);
281 void SelectATOMIC_CMP_SWAP(SDNode
*N
);
282 void SelectDSAppendConsume(SDNode
*N
, unsigned IntrID
);
283 void SelectDS_GWS(SDNode
*N
, unsigned IntrID
);
284 void SelectINTRINSIC_W_CHAIN(SDNode
*N
);
285 void SelectINTRINSIC_WO_CHAIN(SDNode
*N
);
286 void SelectINTRINSIC_VOID(SDNode
*N
);
289 // Include the pieces autogenerated from the target description.
290 #include "AMDGPUGenDAGISel.inc"
293 class R600DAGToDAGISel
: public AMDGPUDAGToDAGISel
{
294 const R600Subtarget
*Subtarget
;
296 bool isConstantLoad(const MemSDNode
*N
, int cbID
) const;
297 bool SelectGlobalValueConstantOffset(SDValue Addr
, SDValue
& IntPtr
);
298 bool SelectGlobalValueVariableOffset(SDValue Addr
, SDValue
&BaseReg
,
301 explicit R600DAGToDAGISel(TargetMachine
*TM
, CodeGenOpt::Level OptLevel
) :
302 AMDGPUDAGToDAGISel(TM
, OptLevel
) {}
304 void Select(SDNode
*N
) override
;
306 bool SelectADDRIndirect(SDValue Addr
, SDValue
&Base
,
307 SDValue
&Offset
) override
;
308 bool SelectADDRVTX_READ(SDValue Addr
, SDValue
&Base
,
309 SDValue
&Offset
) override
;
311 bool runOnMachineFunction(MachineFunction
&MF
) override
;
313 void PreprocessISelDAG() override
{}
316 // Include the pieces autogenerated from the target description.
317 #include "R600GenDAGISel.inc"
320 static SDValue
stripBitcast(SDValue Val
) {
321 return Val
.getOpcode() == ISD::BITCAST
? Val
.getOperand(0) : Val
;
324 // Figure out if this is really an extract of the high 16-bits of a dword.
325 static bool isExtractHiElt(SDValue In
, SDValue
&Out
) {
326 In
= stripBitcast(In
);
327 if (In
.getOpcode() != ISD::TRUNCATE
)
330 SDValue Srl
= In
.getOperand(0);
331 if (Srl
.getOpcode() == ISD::SRL
) {
332 if (ConstantSDNode
*ShiftAmt
= dyn_cast
<ConstantSDNode
>(Srl
.getOperand(1))) {
333 if (ShiftAmt
->getZExtValue() == 16) {
334 Out
= stripBitcast(Srl
.getOperand(0));
343 // Look through operations that obscure just looking at the low 16-bits of the
345 static SDValue
stripExtractLoElt(SDValue In
) {
346 if (In
.getOpcode() == ISD::TRUNCATE
) {
347 SDValue Src
= In
.getOperand(0);
348 if (Src
.getValueType().getSizeInBits() == 32)
349 return stripBitcast(Src
);
355 } // end anonymous namespace
357 INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel
, "amdgpu-isel",
358 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
359 INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo
)
360 INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis
)
361 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis
)
362 #ifdef EXPENSIVE_CHECKS
363 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass
)
364 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass
)
366 INITIALIZE_PASS_END(AMDGPUDAGToDAGISel
, "amdgpu-isel",
367 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
369 /// This pass converts a legalized DAG into a AMDGPU-specific
370 // DAG, ready for instruction scheduling.
371 FunctionPass
*llvm::createAMDGPUISelDag(TargetMachine
*TM
,
372 CodeGenOpt::Level OptLevel
) {
373 return new AMDGPUDAGToDAGISel(TM
, OptLevel
);
376 /// This pass converts a legalized DAG into a R600-specific
377 // DAG, ready for instruction scheduling.
378 FunctionPass
*llvm::createR600ISelDag(TargetMachine
*TM
,
379 CodeGenOpt::Level OptLevel
) {
380 return new R600DAGToDAGISel(TM
, OptLevel
);
383 bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction
&MF
) {
384 #ifdef EXPENSIVE_CHECKS
385 DominatorTree
& DT
= getAnalysis
<DominatorTreeWrapperPass
>().getDomTree();
386 LoopInfo
* LI
= &getAnalysis
<LoopInfoWrapperPass
>().getLoopInfo();
387 for (auto &L
: LI
->getLoopsInPreorder()) {
388 assert(L
->isLCSSAForm(DT
));
391 Subtarget
= &MF
.getSubtarget
<GCNSubtarget
>();
392 return SelectionDAGISel::runOnMachineFunction(MF
);
395 bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode
*N
) const {
396 assert(Subtarget
->d16PreservesUnusedBits());
397 MVT VT
= N
->getValueType(0).getSimpleVT();
398 if (VT
!= MVT::v2i16
&& VT
!= MVT::v2f16
)
401 SDValue Lo
= N
->getOperand(0);
402 SDValue Hi
= N
->getOperand(1);
404 LoadSDNode
*LdHi
= dyn_cast
<LoadSDNode
>(stripBitcast(Hi
));
406 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
407 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
408 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
410 // Need to check for possible indirect dependencies on the other half of the
411 // vector to avoid introducing a cycle.
412 if (LdHi
&& Hi
.hasOneUse() && !LdHi
->isPredecessorOf(Lo
.getNode())) {
413 SDVTList VTList
= CurDAG
->getVTList(VT
, MVT::Other
);
415 SDValue TiedIn
= CurDAG
->getNode(ISD::SCALAR_TO_VECTOR
, SDLoc(N
), VT
, Lo
);
417 LdHi
->getChain(), LdHi
->getBasePtr(), TiedIn
420 unsigned LoadOp
= AMDGPUISD::LOAD_D16_HI
;
421 if (LdHi
->getMemoryVT() == MVT::i8
) {
422 LoadOp
= LdHi
->getExtensionType() == ISD::SEXTLOAD
?
423 AMDGPUISD::LOAD_D16_HI_I8
: AMDGPUISD::LOAD_D16_HI_U8
;
425 assert(LdHi
->getMemoryVT() == MVT::i16
);
429 CurDAG
->getMemIntrinsicNode(LoadOp
, SDLoc(LdHi
), VTList
,
430 Ops
, LdHi
->getMemoryVT(),
431 LdHi
->getMemOperand());
433 CurDAG
->ReplaceAllUsesOfValueWith(SDValue(N
, 0), NewLoadHi
);
434 CurDAG
->ReplaceAllUsesOfValueWith(SDValue(LdHi
, 1), NewLoadHi
.getValue(1));
438 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
439 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
440 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
441 LoadSDNode
*LdLo
= dyn_cast
<LoadSDNode
>(stripBitcast(Lo
));
442 if (LdLo
&& Lo
.hasOneUse()) {
443 SDValue TiedIn
= getHi16Elt(Hi
);
444 if (!TiedIn
|| LdLo
->isPredecessorOf(TiedIn
.getNode()))
447 SDVTList VTList
= CurDAG
->getVTList(VT
, MVT::Other
);
448 unsigned LoadOp
= AMDGPUISD::LOAD_D16_LO
;
449 if (LdLo
->getMemoryVT() == MVT::i8
) {
450 LoadOp
= LdLo
->getExtensionType() == ISD::SEXTLOAD
?
451 AMDGPUISD::LOAD_D16_LO_I8
: AMDGPUISD::LOAD_D16_LO_U8
;
453 assert(LdLo
->getMemoryVT() == MVT::i16
);
456 TiedIn
= CurDAG
->getNode(ISD::BITCAST
, SDLoc(N
), VT
, TiedIn
);
459 LdLo
->getChain(), LdLo
->getBasePtr(), TiedIn
463 CurDAG
->getMemIntrinsicNode(LoadOp
, SDLoc(LdLo
), VTList
,
464 Ops
, LdLo
->getMemoryVT(),
465 LdLo
->getMemOperand());
467 CurDAG
->ReplaceAllUsesOfValueWith(SDValue(N
, 0), NewLoadLo
);
468 CurDAG
->ReplaceAllUsesOfValueWith(SDValue(LdLo
, 1), NewLoadLo
.getValue(1));
475 void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
476 if (!Subtarget
->d16PreservesUnusedBits())
479 SelectionDAG::allnodes_iterator Position
= CurDAG
->allnodes_end();
481 bool MadeChange
= false;
482 while (Position
!= CurDAG
->allnodes_begin()) {
483 SDNode
*N
= &*--Position
;
487 switch (N
->getOpcode()) {
488 case ISD::BUILD_VECTOR
:
489 MadeChange
|= matchLoadD16FromBuildVector(N
);
497 CurDAG
->RemoveDeadNodes();
498 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
503 bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N
) const {
504 if (TM
.Options
.NoNaNsFPMath
)
507 // TODO: Move into isKnownNeverNaN
508 if (N
->getFlags().isDefined())
509 return N
->getFlags().hasNoNaNs();
511 return CurDAG
->isKnownNeverNaN(N
);
514 bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode
*N
,
515 bool Negated
) const {
519 const SIInstrInfo
*TII
= Subtarget
->getInstrInfo();
521 if (const ConstantSDNode
*C
= dyn_cast
<ConstantSDNode
>(N
))
522 return TII
->isInlineConstant(-C
->getAPIntValue());
524 if (const ConstantFPSDNode
*C
= dyn_cast
<ConstantFPSDNode
>(N
))
525 return TII
->isInlineConstant(-C
->getValueAPF().bitcastToAPInt());
528 if (const ConstantSDNode
*C
= dyn_cast
<ConstantSDNode
>(N
))
529 return TII
->isInlineConstant(C
->getAPIntValue());
531 if (const ConstantFPSDNode
*C
= dyn_cast
<ConstantFPSDNode
>(N
))
532 return TII
->isInlineConstant(C
->getValueAPF().bitcastToAPInt());
538 /// Determine the register class for \p OpNo
539 /// \returns The register class of the virtual register that will be used for
540 /// the given operand number \OpNo or NULL if the register class cannot be
542 const TargetRegisterClass
*AMDGPUDAGToDAGISel::getOperandRegClass(SDNode
*N
,
543 unsigned OpNo
) const {
544 if (!N
->isMachineOpcode()) {
545 if (N
->getOpcode() == ISD::CopyToReg
) {
546 unsigned Reg
= cast
<RegisterSDNode
>(N
->getOperand(1))->getReg();
547 if (Register::isVirtualRegister(Reg
)) {
548 MachineRegisterInfo
&MRI
= CurDAG
->getMachineFunction().getRegInfo();
549 return MRI
.getRegClass(Reg
);
552 const SIRegisterInfo
*TRI
553 = static_cast<const GCNSubtarget
*>(Subtarget
)->getRegisterInfo();
554 return TRI
->getPhysRegClass(Reg
);
560 switch (N
->getMachineOpcode()) {
562 const MCInstrDesc
&Desc
=
563 Subtarget
->getInstrInfo()->get(N
->getMachineOpcode());
564 unsigned OpIdx
= Desc
.getNumDefs() + OpNo
;
565 if (OpIdx
>= Desc
.getNumOperands())
567 int RegClass
= Desc
.OpInfo
[OpIdx
].RegClass
;
571 return Subtarget
->getRegisterInfo()->getRegClass(RegClass
);
573 case AMDGPU::REG_SEQUENCE
: {
574 unsigned RCID
= cast
<ConstantSDNode
>(N
->getOperand(0))->getZExtValue();
575 const TargetRegisterClass
*SuperRC
=
576 Subtarget
->getRegisterInfo()->getRegClass(RCID
);
578 SDValue SubRegOp
= N
->getOperand(OpNo
+ 1);
579 unsigned SubRegIdx
= cast
<ConstantSDNode
>(SubRegOp
)->getZExtValue();
580 return Subtarget
->getRegisterInfo()->getSubClassWithSubReg(SuperRC
,
586 SDNode
*AMDGPUDAGToDAGISel::glueCopyToM0(SDNode
*N
, SDValue Val
) const {
587 const SITargetLowering
& Lowering
=
588 *static_cast<const SITargetLowering
*>(getTargetLowering());
590 assert(N
->getOperand(0).getValueType() == MVT::Other
&& "Expected chain");
592 SDValue M0
= Lowering
.copyToM0(*CurDAG
, N
->getOperand(0), SDLoc(N
),
595 SDValue Glue
= M0
.getValue(1);
597 SmallVector
<SDValue
, 8> Ops
;
598 Ops
.push_back(M0
); // Replace the chain.
599 for (unsigned i
= 1, e
= N
->getNumOperands(); i
!= e
; ++i
)
600 Ops
.push_back(N
->getOperand(i
));
603 return CurDAG
->MorphNodeTo(N
, N
->getOpcode(), N
->getVTList(), Ops
);
606 SDNode
*AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode
*N
) const {
607 unsigned AS
= cast
<MemSDNode
>(N
)->getAddressSpace();
608 if (AS
== AMDGPUAS::LOCAL_ADDRESS
) {
609 if (Subtarget
->ldsRequiresM0Init())
610 return glueCopyToM0(N
, CurDAG
->getTargetConstant(-1, SDLoc(N
), MVT::i32
));
611 } else if (AS
== AMDGPUAS::REGION_ADDRESS
) {
612 MachineFunction
&MF
= CurDAG
->getMachineFunction();
613 unsigned Value
= MF
.getInfo
<SIMachineFunctionInfo
>()->getGDSSize();
615 glueCopyToM0(N
, CurDAG
->getTargetConstant(Value
, SDLoc(N
), MVT::i32
));
620 MachineSDNode
*AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc
&DL
, uint64_t Imm
,
622 SDNode
*Lo
= CurDAG
->getMachineNode(
623 AMDGPU::S_MOV_B32
, DL
, MVT::i32
,
624 CurDAG
->getTargetConstant(Imm
& 0xFFFFFFFF, DL
, MVT::i32
));
626 CurDAG
->getMachineNode(AMDGPU::S_MOV_B32
, DL
, MVT::i32
,
627 CurDAG
->getTargetConstant(Imm
>> 32, DL
, MVT::i32
));
628 const SDValue Ops
[] = {
629 CurDAG
->getTargetConstant(AMDGPU::SReg_64RegClassID
, DL
, MVT::i32
),
630 SDValue(Lo
, 0), CurDAG
->getTargetConstant(AMDGPU::sub0
, DL
, MVT::i32
),
631 SDValue(Hi
, 0), CurDAG
->getTargetConstant(AMDGPU::sub1
, DL
, MVT::i32
)};
633 return CurDAG
->getMachineNode(TargetOpcode::REG_SEQUENCE
, DL
, VT
, Ops
);
636 static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts
) {
637 switch (NumVectorElts
) {
639 return AMDGPU::SReg_32RegClassID
;
641 return AMDGPU::SReg_64RegClassID
;
643 return AMDGPU::SGPR_96RegClassID
;
645 return AMDGPU::SGPR_128RegClassID
;
647 return AMDGPU::SGPR_160RegClassID
;
649 return AMDGPU::SReg_256RegClassID
;
651 return AMDGPU::SReg_512RegClassID
;
653 return AMDGPU::SReg_1024RegClassID
;
656 llvm_unreachable("invalid vector size");
659 void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode
*N
, unsigned RegClassID
) {
660 EVT VT
= N
->getValueType(0);
661 unsigned NumVectorElts
= VT
.getVectorNumElements();
662 EVT EltVT
= VT
.getVectorElementType();
664 SDValue RegClass
= CurDAG
->getTargetConstant(RegClassID
, DL
, MVT::i32
);
666 if (NumVectorElts
== 1) {
667 CurDAG
->SelectNodeTo(N
, AMDGPU::COPY_TO_REGCLASS
, EltVT
, N
->getOperand(0),
672 assert(NumVectorElts
<= 32 && "Vectors with more than 32 elements not "
674 // 32 = Max Num Vector Elements
675 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
676 // 1 = Vector Register Class
677 SmallVector
<SDValue
, 32 * 2 + 1> RegSeqArgs(NumVectorElts
* 2 + 1);
679 RegSeqArgs
[0] = CurDAG
->getTargetConstant(RegClassID
, DL
, MVT::i32
);
680 bool IsRegSeq
= true;
681 unsigned NOps
= N
->getNumOperands();
682 for (unsigned i
= 0; i
< NOps
; i
++) {
683 // XXX: Why is this here?
684 if (isa
<RegisterSDNode
>(N
->getOperand(i
))) {
688 unsigned Sub
= AMDGPURegisterInfo::getSubRegFromChannel(i
);
689 RegSeqArgs
[1 + (2 * i
)] = N
->getOperand(i
);
690 RegSeqArgs
[1 + (2 * i
) + 1] = CurDAG
->getTargetConstant(Sub
, DL
, MVT::i32
);
692 if (NOps
!= NumVectorElts
) {
693 // Fill in the missing undef elements if this was a scalar_to_vector.
694 assert(N
->getOpcode() == ISD::SCALAR_TO_VECTOR
&& NOps
< NumVectorElts
);
695 MachineSDNode
*ImpDef
= CurDAG
->getMachineNode(TargetOpcode::IMPLICIT_DEF
,
697 for (unsigned i
= NOps
; i
< NumVectorElts
; ++i
) {
698 unsigned Sub
= AMDGPURegisterInfo::getSubRegFromChannel(i
);
699 RegSeqArgs
[1 + (2 * i
)] = SDValue(ImpDef
, 0);
700 RegSeqArgs
[1 + (2 * i
) + 1] =
701 CurDAG
->getTargetConstant(Sub
, DL
, MVT::i32
);
707 CurDAG
->SelectNodeTo(N
, AMDGPU::REG_SEQUENCE
, N
->getVTList(), RegSeqArgs
);
710 void AMDGPUDAGToDAGISel::Select(SDNode
*N
) {
711 unsigned int Opc
= N
->getOpcode();
712 if (N
->isMachineOpcode()) {
714 return; // Already selected.
717 // isa<MemSDNode> almost works but is slightly too permissive for some DS
719 if (Opc
== ISD::LOAD
|| Opc
== ISD::STORE
|| isa
<AtomicSDNode
>(N
) ||
720 (Opc
== AMDGPUISD::ATOMIC_INC
|| Opc
== AMDGPUISD::ATOMIC_DEC
||
721 Opc
== ISD::ATOMIC_LOAD_FADD
||
722 Opc
== AMDGPUISD::ATOMIC_LOAD_FMIN
||
723 Opc
== AMDGPUISD::ATOMIC_LOAD_FMAX
)) {
724 N
= glueCopyToM0LDSInit(N
);
732 // We are selecting i64 ADD here instead of custom lower it during
733 // DAG legalization, so we can fold some i64 ADDs used for address
734 // calculation into the LOAD and STORE instructions.
739 if (N
->getValueType(0) != MVT::i64
)
742 SelectADD_SUB_I64(N
);
747 if (N
->getValueType(0) != MVT::i32
)
754 SelectUADDO_USUBO(N
);
757 case AMDGPUISD::FMUL_W_CHAIN
: {
758 SelectFMUL_W_CHAIN(N
);
761 case AMDGPUISD::FMA_W_CHAIN
: {
762 SelectFMA_W_CHAIN(N
);
766 case ISD::SCALAR_TO_VECTOR
:
767 case ISD::BUILD_VECTOR
: {
768 EVT VT
= N
->getValueType(0);
769 unsigned NumVectorElts
= VT
.getVectorNumElements();
770 if (VT
.getScalarSizeInBits() == 16) {
771 if (Opc
== ISD::BUILD_VECTOR
&& NumVectorElts
== 2) {
772 if (SDNode
*Packed
= packConstantV2I16(N
, *CurDAG
)) {
773 ReplaceNode(N
, Packed
);
781 assert(VT
.getVectorElementType().bitsEq(MVT::i32
));
782 unsigned RegClassID
= selectSGPRVectorRegClassID(NumVectorElts
);
783 SelectBuildVector(N
, RegClassID
);
786 case ISD::BUILD_PAIR
: {
787 SDValue RC
, SubReg0
, SubReg1
;
789 if (N
->getValueType(0) == MVT::i128
) {
790 RC
= CurDAG
->getTargetConstant(AMDGPU::SGPR_128RegClassID
, DL
, MVT::i32
);
791 SubReg0
= CurDAG
->getTargetConstant(AMDGPU::sub0_sub1
, DL
, MVT::i32
);
792 SubReg1
= CurDAG
->getTargetConstant(AMDGPU::sub2_sub3
, DL
, MVT::i32
);
793 } else if (N
->getValueType(0) == MVT::i64
) {
794 RC
= CurDAG
->getTargetConstant(AMDGPU::SReg_64RegClassID
, DL
, MVT::i32
);
795 SubReg0
= CurDAG
->getTargetConstant(AMDGPU::sub0
, DL
, MVT::i32
);
796 SubReg1
= CurDAG
->getTargetConstant(AMDGPU::sub1
, DL
, MVT::i32
);
798 llvm_unreachable("Unhandled value type for BUILD_PAIR");
800 const SDValue Ops
[] = { RC
, N
->getOperand(0), SubReg0
,
801 N
->getOperand(1), SubReg1
};
802 ReplaceNode(N
, CurDAG
->getMachineNode(TargetOpcode::REG_SEQUENCE
, DL
,
803 N
->getValueType(0), Ops
));
808 case ISD::ConstantFP
: {
809 if (N
->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N
))
813 if (ConstantFPSDNode
*FP
= dyn_cast
<ConstantFPSDNode
>(N
))
814 Imm
= FP
->getValueAPF().bitcastToAPInt().getZExtValue();
816 ConstantSDNode
*C
= cast
<ConstantSDNode
>(N
);
817 Imm
= C
->getZExtValue();
821 ReplaceNode(N
, buildSMovImm64(DL
, Imm
, N
->getValueType(0)));
824 case AMDGPUISD::BFE_I32
:
825 case AMDGPUISD::BFE_U32
: {
826 // There is a scalar version available, but unlike the vector version which
827 // has a separate operand for the offset and width, the scalar version packs
828 // the width and offset into a single operand. Try to move to the scalar
829 // version if the offsets are constant, so that we can try to keep extended
830 // loads of kernel arguments in SGPRs.
832 // TODO: Technically we could try to pattern match scalar bitshifts of
833 // dynamic values, but it's probably not useful.
834 ConstantSDNode
*Offset
= dyn_cast
<ConstantSDNode
>(N
->getOperand(1));
838 ConstantSDNode
*Width
= dyn_cast
<ConstantSDNode
>(N
->getOperand(2));
842 bool Signed
= Opc
== AMDGPUISD::BFE_I32
;
844 uint32_t OffsetVal
= Offset
->getZExtValue();
845 uint32_t WidthVal
= Width
->getZExtValue();
847 ReplaceNode(N
, getS_BFE(Signed
? AMDGPU::S_BFE_I32
: AMDGPU::S_BFE_U32
,
848 SDLoc(N
), N
->getOperand(0), OffsetVal
, WidthVal
));
851 case AMDGPUISD::DIV_SCALE
: {
855 case AMDGPUISD::DIV_FMAS
: {
859 case AMDGPUISD::MAD_I64_I32
:
860 case AMDGPUISD::MAD_U64_U32
: {
864 case ISD::CopyToReg
: {
865 const SITargetLowering
& Lowering
=
866 *static_cast<const SITargetLowering
*>(getTargetLowering());
867 N
= Lowering
.legalizeTargetIndependentNode(N
, *CurDAG
);
873 case ISD::SIGN_EXTEND_INREG
:
874 if (N
->getValueType(0) != MVT::i32
)
886 case AMDGPUISD::ATOMIC_CMP_SWAP
:
887 SelectATOMIC_CMP_SWAP(N
);
889 case AMDGPUISD::CVT_PKRTZ_F16_F32
:
890 case AMDGPUISD::CVT_PKNORM_I16_F32
:
891 case AMDGPUISD::CVT_PKNORM_U16_F32
:
892 case AMDGPUISD::CVT_PK_U16_U32
:
893 case AMDGPUISD::CVT_PK_I16_I32
: {
894 // Hack around using a legal type if f16 is illegal.
895 if (N
->getValueType(0) == MVT::i32
) {
896 MVT NewVT
= Opc
== AMDGPUISD::CVT_PKRTZ_F16_F32
? MVT::v2f16
: MVT::v2i16
;
897 N
= CurDAG
->MorphNodeTo(N
, N
->getOpcode(), CurDAG
->getVTList(NewVT
),
898 { N
->getOperand(0), N
->getOperand(1) });
905 case ISD::INTRINSIC_W_CHAIN
: {
906 SelectINTRINSIC_W_CHAIN(N
);
909 case ISD::INTRINSIC_WO_CHAIN
: {
910 SelectINTRINSIC_WO_CHAIN(N
);
913 case ISD::INTRINSIC_VOID
: {
914 SelectINTRINSIC_VOID(N
);
922 bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode
*N
) const {
923 const BasicBlock
*BB
= FuncInfo
->MBB
->getBasicBlock();
924 const Instruction
*Term
= BB
->getTerminator();
925 return Term
->getMetadata("amdgpu.uniform") ||
926 Term
->getMetadata("structurizecfg.uniform");
929 StringRef
AMDGPUDAGToDAGISel::getPassName() const {
930 return "AMDGPU DAG->DAG Pattern Instruction Selection";
933 //===----------------------------------------------------------------------===//
935 //===----------------------------------------------------------------------===//
937 bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr
, SDValue
&Base
,
942 bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr
, SDValue
&Base
,
947 if ((C
= dyn_cast
<ConstantSDNode
>(Addr
))) {
948 Base
= CurDAG
->getRegister(R600::INDIRECT_BASE_ADDR
, MVT::i32
);
949 Offset
= CurDAG
->getTargetConstant(C
->getZExtValue(), DL
, MVT::i32
);
950 } else if ((Addr
.getOpcode() == AMDGPUISD::DWORDADDR
) &&
951 (C
= dyn_cast
<ConstantSDNode
>(Addr
.getOperand(0)))) {
952 Base
= CurDAG
->getRegister(R600::INDIRECT_BASE_ADDR
, MVT::i32
);
953 Offset
= CurDAG
->getTargetConstant(C
->getZExtValue(), DL
, MVT::i32
);
954 } else if ((Addr
.getOpcode() == ISD::ADD
|| Addr
.getOpcode() == ISD::OR
) &&
955 (C
= dyn_cast
<ConstantSDNode
>(Addr
.getOperand(1)))) {
956 Base
= Addr
.getOperand(0);
957 Offset
= CurDAG
->getTargetConstant(C
->getZExtValue(), DL
, MVT::i32
);
960 Offset
= CurDAG
->getTargetConstant(0, DL
, MVT::i32
);
966 // FIXME: Should only handle addcarry/subcarry
967 void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode
*N
) {
969 SDValue LHS
= N
->getOperand(0);
970 SDValue RHS
= N
->getOperand(1);
972 unsigned Opcode
= N
->getOpcode();
973 bool ConsumeCarry
= (Opcode
== ISD::ADDE
|| Opcode
== ISD::SUBE
);
975 ConsumeCarry
|| Opcode
== ISD::ADDC
|| Opcode
== ISD::SUBC
;
976 bool IsAdd
= Opcode
== ISD::ADD
|| Opcode
== ISD::ADDC
|| Opcode
== ISD::ADDE
;
978 SDValue Sub0
= CurDAG
->getTargetConstant(AMDGPU::sub0
, DL
, MVT::i32
);
979 SDValue Sub1
= CurDAG
->getTargetConstant(AMDGPU::sub1
, DL
, MVT::i32
);
981 SDNode
*Lo0
= CurDAG
->getMachineNode(TargetOpcode::EXTRACT_SUBREG
,
982 DL
, MVT::i32
, LHS
, Sub0
);
983 SDNode
*Hi0
= CurDAG
->getMachineNode(TargetOpcode::EXTRACT_SUBREG
,
984 DL
, MVT::i32
, LHS
, Sub1
);
986 SDNode
*Lo1
= CurDAG
->getMachineNode(TargetOpcode::EXTRACT_SUBREG
,
987 DL
, MVT::i32
, RHS
, Sub0
);
988 SDNode
*Hi1
= CurDAG
->getMachineNode(TargetOpcode::EXTRACT_SUBREG
,
989 DL
, MVT::i32
, RHS
, Sub1
);
991 SDVTList VTList
= CurDAG
->getVTList(MVT::i32
, MVT::Glue
);
993 unsigned Opc
= IsAdd
? AMDGPU::S_ADD_U32
: AMDGPU::S_SUB_U32
;
994 unsigned CarryOpc
= IsAdd
? AMDGPU::S_ADDC_U32
: AMDGPU::S_SUBB_U32
;
998 SDValue Args
[] = { SDValue(Lo0
, 0), SDValue(Lo1
, 0) };
999 AddLo
= CurDAG
->getMachineNode(Opc
, DL
, VTList
, Args
);
1001 SDValue Args
[] = { SDValue(Lo0
, 0), SDValue(Lo1
, 0), N
->getOperand(2) };
1002 AddLo
= CurDAG
->getMachineNode(CarryOpc
, DL
, VTList
, Args
);
1004 SDValue AddHiArgs
[] = {
1009 SDNode
*AddHi
= CurDAG
->getMachineNode(CarryOpc
, DL
, VTList
, AddHiArgs
);
1011 SDValue RegSequenceArgs
[] = {
1012 CurDAG
->getTargetConstant(AMDGPU::SReg_64RegClassID
, DL
, MVT::i32
),
1018 SDNode
*RegSequence
= CurDAG
->getMachineNode(AMDGPU::REG_SEQUENCE
, DL
,
1019 MVT::i64
, RegSequenceArgs
);
1022 // Replace the carry-use
1023 ReplaceUses(SDValue(N
, 1), SDValue(AddHi
, 1));
1026 // Replace the remaining uses.
1027 ReplaceNode(N
, RegSequence
);
1030 void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode
*N
) {
1032 SDValue LHS
= N
->getOperand(0);
1033 SDValue RHS
= N
->getOperand(1);
1034 SDValue CI
= N
->getOperand(2);
1036 unsigned Opc
= N
->getOpcode() == ISD::ADDCARRY
? AMDGPU::V_ADDC_U32_e64
1037 : AMDGPU::V_SUBB_U32_e64
;
1038 CurDAG
->SelectNodeTo(
1039 N
, Opc
, N
->getVTList(),
1040 {LHS
, RHS
, CI
, CurDAG
->getTargetConstant(0, {}, MVT::i1
) /*clamp bit*/});
1043 void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode
*N
) {
1044 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1045 // carry out despite the _i32 name. These were renamed in VI to _U32.
1046 // FIXME: We should probably rename the opcodes here.
1047 unsigned Opc
= N
->getOpcode() == ISD::UADDO
?
1048 AMDGPU::V_ADD_I32_e64
: AMDGPU::V_SUB_I32_e64
;
1050 CurDAG
->SelectNodeTo(
1051 N
, Opc
, N
->getVTList(),
1052 {N
->getOperand(0), N
->getOperand(1),
1053 CurDAG
->getTargetConstant(0, {}, MVT::i1
) /*clamp bit*/});
1056 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode
*N
) {
1058 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1061 SelectVOP3Mods0(N
->getOperand(1), Ops
[1], Ops
[0], Ops
[6], Ops
[7]);
1062 SelectVOP3Mods(N
->getOperand(2), Ops
[3], Ops
[2]);
1063 SelectVOP3Mods(N
->getOperand(3), Ops
[5], Ops
[4]);
1064 Ops
[8] = N
->getOperand(0);
1065 Ops
[9] = N
->getOperand(4);
1067 CurDAG
->SelectNodeTo(N
, AMDGPU::V_FMA_F32
, N
->getVTList(), Ops
);
1070 void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode
*N
) {
1072 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
1075 SelectVOP3Mods0(N
->getOperand(1), Ops
[1], Ops
[0], Ops
[4], Ops
[5]);
1076 SelectVOP3Mods(N
->getOperand(2), Ops
[3], Ops
[2]);
1077 Ops
[6] = N
->getOperand(0);
1078 Ops
[7] = N
->getOperand(3);
1080 CurDAG
->SelectNodeTo(N
, AMDGPU::V_MUL_F32_e64
, N
->getVTList(), Ops
);
1083 // We need to handle this here because tablegen doesn't support matching
1084 // instructions with multiple outputs.
1085 void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode
*N
) {
1087 EVT VT
= N
->getValueType(0);
1089 assert(VT
== MVT::f32
|| VT
== MVT::f64
);
1092 = (VT
== MVT::f64
) ? AMDGPU::V_DIV_SCALE_F64
: AMDGPU::V_DIV_SCALE_F32
;
1094 SDValue Ops
[] = { N
->getOperand(0), N
->getOperand(1), N
->getOperand(2) };
1095 CurDAG
->SelectNodeTo(N
, Opc
, N
->getVTList(), Ops
);
1098 void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode
*N
) {
1099 const GCNSubtarget
*ST
= static_cast<const GCNSubtarget
*>(Subtarget
);
1100 const SIRegisterInfo
*TRI
= ST
->getRegisterInfo();
1103 EVT VT
= N
->getValueType(0);
1105 assert(VT
== MVT::f32
|| VT
== MVT::f64
);
1108 = (VT
== MVT::f64
) ? AMDGPU::V_DIV_FMAS_F64
: AMDGPU::V_DIV_FMAS_F32
;
1110 SDValue CarryIn
= N
->getOperand(3);
1111 // V_DIV_FMAS implicitly reads VCC.
1112 SDValue VCC
= CurDAG
->getCopyToReg(CurDAG
->getEntryNode(), SL
,
1113 TRI
->getVCC(), CarryIn
, SDValue());
1117 SelectVOP3Mods0(N
->getOperand(0), Ops
[1], Ops
[0], Ops
[6], Ops
[7]);
1118 SelectVOP3Mods(N
->getOperand(1), Ops
[3], Ops
[2]);
1119 SelectVOP3Mods(N
->getOperand(2), Ops
[5], Ops
[4]);
1122 Ops
[9] = VCC
.getValue(1);
1124 CurDAG
->SelectNodeTo(N
, Opc
, N
->getVTList(), Ops
);
1127 // We need to handle this here because tablegen doesn't support matching
1128 // instructions with multiple outputs.
1129 void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode
*N
) {
1131 bool Signed
= N
->getOpcode() == AMDGPUISD::MAD_I64_I32
;
1132 unsigned Opc
= Signed
? AMDGPU::V_MAD_I64_I32
: AMDGPU::V_MAD_U64_U32
;
1134 SDValue Clamp
= CurDAG
->getTargetConstant(0, SL
, MVT::i1
);
1135 SDValue Ops
[] = { N
->getOperand(0), N
->getOperand(1), N
->getOperand(2),
1137 CurDAG
->SelectNodeTo(N
, Opc
, N
->getVTList(), Ops
);
1140 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base
, unsigned Offset
,
1141 unsigned OffsetBits
) const {
1142 if ((OffsetBits
== 16 && !isUInt
<16>(Offset
)) ||
1143 (OffsetBits
== 8 && !isUInt
<8>(Offset
)))
1146 if (Subtarget
->hasUsableDSOffset() ||
1147 Subtarget
->unsafeDSOffsetFoldingEnabled())
1150 // On Southern Islands instruction with a negative base value and an offset
1151 // don't seem to work.
1152 return CurDAG
->SignBitIsZero(Base
);
1155 bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr
, SDValue
&Base
,
1156 SDValue
&Offset
) const {
1158 if (CurDAG
->isBaseWithConstantOffset(Addr
)) {
1159 SDValue N0
= Addr
.getOperand(0);
1160 SDValue N1
= Addr
.getOperand(1);
1161 ConstantSDNode
*C1
= cast
<ConstantSDNode
>(N1
);
1162 if (isDSOffsetLegal(N0
, C1
->getSExtValue(), 16)) {
1165 Offset
= CurDAG
->getTargetConstant(C1
->getZExtValue(), DL
, MVT::i16
);
1168 } else if (Addr
.getOpcode() == ISD::SUB
) {
1169 // sub C, x -> add (sub 0, x), C
1170 if (const ConstantSDNode
*C
= dyn_cast
<ConstantSDNode
>(Addr
.getOperand(0))) {
1171 int64_t ByteOffset
= C
->getSExtValue();
1172 if (isUInt
<16>(ByteOffset
)) {
1173 SDValue Zero
= CurDAG
->getTargetConstant(0, DL
, MVT::i32
);
1175 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1176 // the known bits in isDSOffsetLegal. We need to emit the selected node
1177 // here, so this is thrown away.
1178 SDValue Sub
= CurDAG
->getNode(ISD::SUB
, DL
, MVT::i32
,
1179 Zero
, Addr
.getOperand(1));
1181 if (isDSOffsetLegal(Sub
, ByteOffset
, 16)) {
1182 SmallVector
<SDValue
, 3> Opnds
;
1183 Opnds
.push_back(Zero
);
1184 Opnds
.push_back(Addr
.getOperand(1));
1186 // FIXME: Select to VOP3 version for with-carry.
1187 unsigned SubOp
= AMDGPU::V_SUB_I32_e32
;
1188 if (Subtarget
->hasAddNoCarry()) {
1189 SubOp
= AMDGPU::V_SUB_U32_e64
;
1191 CurDAG
->getTargetConstant(0, {}, MVT::i1
)); // clamp bit
1194 MachineSDNode
*MachineSub
=
1195 CurDAG
->getMachineNode(SubOp
, DL
, MVT::i32
, Opnds
);
1197 Base
= SDValue(MachineSub
, 0);
1198 Offset
= CurDAG
->getTargetConstant(ByteOffset
, DL
, MVT::i16
);
1203 } else if (const ConstantSDNode
*CAddr
= dyn_cast
<ConstantSDNode
>(Addr
)) {
1204 // If we have a constant address, prefer to put the constant into the
1205 // offset. This can save moves to load the constant address since multiple
1206 // operations can share the zero base address register, and enables merging
1207 // into read2 / write2 instructions.
1211 if (isUInt
<16>(CAddr
->getZExtValue())) {
1212 SDValue Zero
= CurDAG
->getTargetConstant(0, DL
, MVT::i32
);
1213 MachineSDNode
*MovZero
= CurDAG
->getMachineNode(AMDGPU::V_MOV_B32_e32
,
1214 DL
, MVT::i32
, Zero
);
1215 Base
= SDValue(MovZero
, 0);
1216 Offset
= CurDAG
->getTargetConstant(CAddr
->getZExtValue(), DL
, MVT::i16
);
1223 Offset
= CurDAG
->getTargetConstant(0, SDLoc(Addr
), MVT::i16
);
1227 // TODO: If offset is too big, put low 16-bit into offset.
1228 bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr
, SDValue
&Base
,
1230 SDValue
&Offset1
) const {
1233 if (CurDAG
->isBaseWithConstantOffset(Addr
)) {
1234 SDValue N0
= Addr
.getOperand(0);
1235 SDValue N1
= Addr
.getOperand(1);
1236 ConstantSDNode
*C1
= cast
<ConstantSDNode
>(N1
);
1237 unsigned DWordOffset0
= C1
->getZExtValue() / 4;
1238 unsigned DWordOffset1
= DWordOffset0
+ 1;
1240 if (isDSOffsetLegal(N0
, DWordOffset1
, 8)) {
1242 Offset0
= CurDAG
->getTargetConstant(DWordOffset0
, DL
, MVT::i8
);
1243 Offset1
= CurDAG
->getTargetConstant(DWordOffset1
, DL
, MVT::i8
);
1246 } else if (Addr
.getOpcode() == ISD::SUB
) {
1247 // sub C, x -> add (sub 0, x), C
1248 if (const ConstantSDNode
*C
= dyn_cast
<ConstantSDNode
>(Addr
.getOperand(0))) {
1249 unsigned DWordOffset0
= C
->getZExtValue() / 4;
1250 unsigned DWordOffset1
= DWordOffset0
+ 1;
1252 if (isUInt
<8>(DWordOffset0
)) {
1254 SDValue Zero
= CurDAG
->getTargetConstant(0, DL
, MVT::i32
);
1256 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1257 // the known bits in isDSOffsetLegal. We need to emit the selected node
1258 // here, so this is thrown away.
1259 SDValue Sub
= CurDAG
->getNode(ISD::SUB
, DL
, MVT::i32
,
1260 Zero
, Addr
.getOperand(1));
1262 if (isDSOffsetLegal(Sub
, DWordOffset1
, 8)) {
1263 SmallVector
<SDValue
, 3> Opnds
;
1264 Opnds
.push_back(Zero
);
1265 Opnds
.push_back(Addr
.getOperand(1));
1266 unsigned SubOp
= AMDGPU::V_SUB_I32_e32
;
1267 if (Subtarget
->hasAddNoCarry()) {
1268 SubOp
= AMDGPU::V_SUB_U32_e64
;
1270 CurDAG
->getTargetConstant(0, {}, MVT::i1
)); // clamp bit
1273 MachineSDNode
*MachineSub
1274 = CurDAG
->getMachineNode(SubOp
, DL
, MVT::i32
, Opnds
);
1276 Base
= SDValue(MachineSub
, 0);
1277 Offset0
= CurDAG
->getTargetConstant(DWordOffset0
, DL
, MVT::i8
);
1278 Offset1
= CurDAG
->getTargetConstant(DWordOffset1
, DL
, MVT::i8
);
1283 } else if (const ConstantSDNode
*CAddr
= dyn_cast
<ConstantSDNode
>(Addr
)) {
1284 unsigned DWordOffset0
= CAddr
->getZExtValue() / 4;
1285 unsigned DWordOffset1
= DWordOffset0
+ 1;
1286 assert(4 * DWordOffset0
== CAddr
->getZExtValue());
1288 if (isUInt
<8>(DWordOffset0
) && isUInt
<8>(DWordOffset1
)) {
1289 SDValue Zero
= CurDAG
->getTargetConstant(0, DL
, MVT::i32
);
1290 MachineSDNode
*MovZero
1291 = CurDAG
->getMachineNode(AMDGPU::V_MOV_B32_e32
,
1292 DL
, MVT::i32
, Zero
);
1293 Base
= SDValue(MovZero
, 0);
1294 Offset0
= CurDAG
->getTargetConstant(DWordOffset0
, DL
, MVT::i8
);
1295 Offset1
= CurDAG
->getTargetConstant(DWordOffset1
, DL
, MVT::i8
);
1303 Offset0
= CurDAG
->getTargetConstant(0, DL
, MVT::i8
);
1304 Offset1
= CurDAG
->getTargetConstant(1, DL
, MVT::i8
);
1308 bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr
, SDValue
&Ptr
,
1309 SDValue
&VAddr
, SDValue
&SOffset
,
1310 SDValue
&Offset
, SDValue
&Offen
,
1311 SDValue
&Idxen
, SDValue
&Addr64
,
1312 SDValue
&GLC
, SDValue
&SLC
,
1313 SDValue
&TFE
, SDValue
&DLC
,
1314 SDValue
&SWZ
) const {
1315 // Subtarget prefers to use flat instruction
1316 if (Subtarget
->useFlatForGlobal())
1322 GLC
= CurDAG
->getTargetConstant(0, DL
, MVT::i1
);
1324 SLC
= CurDAG
->getTargetConstant(0, DL
, MVT::i1
);
1325 TFE
= CurDAG
->getTargetConstant(0, DL
, MVT::i1
);
1326 DLC
= CurDAG
->getTargetConstant(0, DL
, MVT::i1
);
1327 SWZ
= CurDAG
->getTargetConstant(0, DL
, MVT::i1
);
1329 Idxen
= CurDAG
->getTargetConstant(0, DL
, MVT::i1
);
1330 Offen
= CurDAG
->getTargetConstant(0, DL
, MVT::i1
);
1331 Addr64
= CurDAG
->getTargetConstant(0, DL
, MVT::i1
);
1332 SOffset
= CurDAG
->getTargetConstant(0, DL
, MVT::i32
);
1334 ConstantSDNode
*C1
= nullptr;
1336 if (CurDAG
->isBaseWithConstantOffset(Addr
)) {
1337 C1
= cast
<ConstantSDNode
>(Addr
.getOperand(1));
1338 if (isUInt
<32>(C1
->getZExtValue()))
1339 N0
= Addr
.getOperand(0);
1344 if (N0
.getOpcode() == ISD::ADD
) {
1345 // (add N2, N3) -> addr64, or
1346 // (add (add N2, N3), C1) -> addr64
1347 SDValue N2
= N0
.getOperand(0);
1348 SDValue N3
= N0
.getOperand(1);
1349 Addr64
= CurDAG
->getTargetConstant(1, DL
, MVT::i1
);
1351 if (N2
->isDivergent()) {
1352 if (N3
->isDivergent()) {
1353 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1354 // addr64, and construct the resource from a 0 address.
1355 Ptr
= SDValue(buildSMovImm64(DL
, 0, MVT::v2i32
), 0);
1358 // N2 is divergent, N3 is not.
1363 // N2 is not divergent.
1367 Offset
= CurDAG
->getTargetConstant(0, DL
, MVT::i16
);
1368 } else if (N0
->isDivergent()) {
1369 // N0 is divergent. Use it as the addr64, and construct the resource from a
1371 Ptr
= SDValue(buildSMovImm64(DL
, 0, MVT::v2i32
), 0);
1373 Addr64
= CurDAG
->getTargetConstant(1, DL
, MVT::i1
);
1376 // (N0 + C1) -> offset
1377 VAddr
= CurDAG
->getTargetConstant(0, DL
, MVT::i32
);
1383 Offset
= CurDAG
->getTargetConstant(0, DL
, MVT::i16
);
1387 if (SIInstrInfo::isLegalMUBUFImmOffset(C1
->getZExtValue())) {
1388 // Legal offset for instruction.
1389 Offset
= CurDAG
->getTargetConstant(C1
->getZExtValue(), DL
, MVT::i16
);
1393 // Illegal offset, store it in soffset.
1394 Offset
= CurDAG
->getTargetConstant(0, DL
, MVT::i16
);
1396 SDValue(CurDAG
->getMachineNode(
1397 AMDGPU::S_MOV_B32
, DL
, MVT::i32
,
1398 CurDAG
->getTargetConstant(C1
->getZExtValue(), DL
, MVT::i32
)),
1403 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr
, SDValue
&SRsrc
,
1404 SDValue
&VAddr
, SDValue
&SOffset
,
1405 SDValue
&Offset
, SDValue
&GLC
,
1406 SDValue
&SLC
, SDValue
&TFE
,
1407 SDValue
&DLC
, SDValue
&SWZ
) const {
1408 SDValue Ptr
, Offen
, Idxen
, Addr64
;
1410 // addr64 bit was removed for volcanic islands.
1411 if (!Subtarget
->hasAddr64())
1414 if (!SelectMUBUF(Addr
, Ptr
, VAddr
, SOffset
, Offset
, Offen
, Idxen
, Addr64
,
1415 GLC
, SLC
, TFE
, DLC
, SWZ
))
1418 ConstantSDNode
*C
= cast
<ConstantSDNode
>(Addr64
);
1419 if (C
->getSExtValue()) {
1422 const SITargetLowering
& Lowering
=
1423 *static_cast<const SITargetLowering
*>(getTargetLowering());
1425 SRsrc
= SDValue(Lowering
.wrapAddr64Rsrc(*CurDAG
, DL
, Ptr
), 0);
1432 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr
, SDValue
&SRsrc
,
1433 SDValue
&VAddr
, SDValue
&SOffset
,
1435 SDValue
&SLC
) const {
1436 SLC
= CurDAG
->getTargetConstant(0, SDLoc(Addr
), MVT::i1
);
1437 SDValue GLC
, TFE
, DLC
, SWZ
;
1439 return SelectMUBUFAddr64(Addr
, SRsrc
, VAddr
, SOffset
, Offset
, GLC
, SLC
, TFE
, DLC
, SWZ
);
1442 static bool isStackPtrRelative(const MachinePointerInfo
&PtrInfo
) {
1443 auto PSV
= PtrInfo
.V
.dyn_cast
<const PseudoSourceValue
*>();
1444 return PSV
&& PSV
->isStack();
1447 std::pair
<SDValue
, SDValue
> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N
) const {
1448 const MachineFunction
&MF
= CurDAG
->getMachineFunction();
1449 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
1451 if (auto FI
= dyn_cast
<FrameIndexSDNode
>(N
)) {
1452 SDValue TFI
= CurDAG
->getTargetFrameIndex(FI
->getIndex(),
1453 FI
->getValueType(0));
1455 // If we can resolve this to a frame index access, this will be relative to
1456 // either the stack or frame pointer SGPR.
1457 return std::make_pair(
1458 TFI
, CurDAG
->getRegister(Info
->getStackPtrOffsetReg(), MVT::i32
));
1461 // If we don't know this private access is a local stack object, it needs to
1462 // be relative to the entry point's scratch wave offset register.
1463 return std::make_pair(N
, CurDAG
->getRegister(Info
->getScratchWaveOffsetReg(),
1467 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode
*Parent
,
1468 SDValue Addr
, SDValue
&Rsrc
,
1469 SDValue
&VAddr
, SDValue
&SOffset
,
1470 SDValue
&ImmOffset
) const {
1473 MachineFunction
&MF
= CurDAG
->getMachineFunction();
1474 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
1476 Rsrc
= CurDAG
->getRegister(Info
->getScratchRSrcReg(), MVT::v4i32
);
1478 if (ConstantSDNode
*CAddr
= dyn_cast
<ConstantSDNode
>(Addr
)) {
1479 unsigned Imm
= CAddr
->getZExtValue();
1481 SDValue HighBits
= CurDAG
->getTargetConstant(Imm
& ~4095, DL
, MVT::i32
);
1482 MachineSDNode
*MovHighBits
= CurDAG
->getMachineNode(AMDGPU::V_MOV_B32_e32
,
1483 DL
, MVT::i32
, HighBits
);
1484 VAddr
= SDValue(MovHighBits
, 0);
1486 // In a call sequence, stores to the argument stack area are relative to the
1488 const MachinePointerInfo
&PtrInfo
= cast
<MemSDNode
>(Parent
)->getPointerInfo();
1489 unsigned SOffsetReg
= isStackPtrRelative(PtrInfo
) ?
1490 Info
->getStackPtrOffsetReg() : Info
->getScratchWaveOffsetReg();
1492 SOffset
= CurDAG
->getRegister(SOffsetReg
, MVT::i32
);
1493 ImmOffset
= CurDAG
->getTargetConstant(Imm
& 4095, DL
, MVT::i16
);
1497 if (CurDAG
->isBaseWithConstantOffset(Addr
)) {
1500 SDValue N0
= Addr
.getOperand(0);
1501 SDValue N1
= Addr
.getOperand(1);
1503 // Offsets in vaddr must be positive if range checking is enabled.
1505 // The total computation of vaddr + soffset + offset must not overflow. If
1506 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1509 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1510 // always perform a range check. If a negative vaddr base index was used,
1511 // this would fail the range check. The overall address computation would
1512 // compute a valid address, but this doesn't happen due to the range
1513 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1515 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1516 // MUBUF vaddr, but not on older subtargets which can only do this if the
1517 // sign bit is known 0.
1518 ConstantSDNode
*C1
= cast
<ConstantSDNode
>(N1
);
1519 if (SIInstrInfo::isLegalMUBUFImmOffset(C1
->getZExtValue()) &&
1520 (!Subtarget
->privateMemoryResourceIsRangeChecked() ||
1521 CurDAG
->SignBitIsZero(N0
))) {
1522 std::tie(VAddr
, SOffset
) = foldFrameIndex(N0
);
1523 ImmOffset
= CurDAG
->getTargetConstant(C1
->getZExtValue(), DL
, MVT::i16
);
1529 std::tie(VAddr
, SOffset
) = foldFrameIndex(Addr
);
1530 ImmOffset
= CurDAG
->getTargetConstant(0, DL
, MVT::i16
);
1534 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode
*Parent
,
1538 SDValue
&Offset
) const {
1539 ConstantSDNode
*CAddr
= dyn_cast
<ConstantSDNode
>(Addr
);
1540 if (!CAddr
|| !SIInstrInfo::isLegalMUBUFImmOffset(CAddr
->getZExtValue()))
1544 MachineFunction
&MF
= CurDAG
->getMachineFunction();
1545 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
1547 SRsrc
= CurDAG
->getRegister(Info
->getScratchRSrcReg(), MVT::v4i32
);
1549 const MachinePointerInfo
&PtrInfo
= cast
<MemSDNode
>(Parent
)->getPointerInfo();
1550 unsigned SOffsetReg
= isStackPtrRelative(PtrInfo
) ?
1551 Info
->getStackPtrOffsetReg() : Info
->getScratchWaveOffsetReg();
1553 // FIXME: Get from MachinePointerInfo? We should only be using the frame
1554 // offset if we know this is in a call sequence.
1555 SOffset
= CurDAG
->getRegister(SOffsetReg
, MVT::i32
);
1557 Offset
= CurDAG
->getTargetConstant(CAddr
->getZExtValue(), DL
, MVT::i16
);
1561 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr
, SDValue
&SRsrc
,
1562 SDValue
&SOffset
, SDValue
&Offset
,
1563 SDValue
&GLC
, SDValue
&SLC
,
1564 SDValue
&TFE
, SDValue
&DLC
,
1565 SDValue
&SWZ
) const {
1566 SDValue Ptr
, VAddr
, Offen
, Idxen
, Addr64
;
1567 const SIInstrInfo
*TII
=
1568 static_cast<const SIInstrInfo
*>(Subtarget
->getInstrInfo());
1570 if (!SelectMUBUF(Addr
, Ptr
, VAddr
, SOffset
, Offset
, Offen
, Idxen
, Addr64
,
1571 GLC
, SLC
, TFE
, DLC
, SWZ
))
1574 if (!cast
<ConstantSDNode
>(Offen
)->getSExtValue() &&
1575 !cast
<ConstantSDNode
>(Idxen
)->getSExtValue() &&
1576 !cast
<ConstantSDNode
>(Addr64
)->getSExtValue()) {
1577 uint64_t Rsrc
= TII
->getDefaultRsrcDataFormat() |
1578 APInt::getAllOnesValue(32).getZExtValue(); // Size
1581 const SITargetLowering
& Lowering
=
1582 *static_cast<const SITargetLowering
*>(getTargetLowering());
1584 SRsrc
= SDValue(Lowering
.buildRSRC(*CurDAG
, DL
, Ptr
, 0, Rsrc
), 0);
1590 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr
, SDValue
&SRsrc
,
1591 SDValue
&Soffset
, SDValue
&Offset
1593 SDValue GLC
, SLC
, TFE
, DLC
, SWZ
;
1595 return SelectMUBUFOffset(Addr
, SRsrc
, Soffset
, Offset
, GLC
, SLC
, TFE
, DLC
, SWZ
);
1597 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr
, SDValue
&SRsrc
,
1598 SDValue
&Soffset
, SDValue
&Offset
,
1599 SDValue
&SLC
) const {
1600 SDValue GLC
, TFE
, DLC
, SWZ
;
1602 return SelectMUBUFOffset(Addr
, SRsrc
, Soffset
, Offset
, GLC
, SLC
, TFE
, DLC
, SWZ
);
1605 // Find a load or store from corresponding pattern root.
1606 // Roots may be build_vector, bitconvert or their combinations.
1607 static MemSDNode
* findMemSDNode(SDNode
*N
) {
1608 N
= AMDGPUTargetLowering::stripBitcast(SDValue(N
,0)).getNode();
1609 if (MemSDNode
*MN
= dyn_cast
<MemSDNode
>(N
))
1611 assert(isa
<BuildVectorSDNode
>(N
));
1612 for (SDValue V
: N
->op_values())
1614 dyn_cast
<MemSDNode
>(AMDGPUTargetLowering::stripBitcast(V
)))
1616 llvm_unreachable("cannot find MemSDNode in the pattern!");
1619 template <bool IsSigned
>
1620 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode
*N
,
1624 SDValue
&SLC
) const {
1625 int64_t OffsetVal
= 0;
1627 if (Subtarget
->hasFlatInstOffsets() &&
1628 (!Subtarget
->hasFlatSegmentOffsetBug() ||
1629 findMemSDNode(N
)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS
) &&
1630 CurDAG
->isBaseWithConstantOffset(Addr
)) {
1631 SDValue N0
= Addr
.getOperand(0);
1632 SDValue N1
= Addr
.getOperand(1);
1633 int64_t COffsetVal
= cast
<ConstantSDNode
>(N1
)->getSExtValue();
1635 const SIInstrInfo
*TII
= Subtarget
->getInstrInfo();
1636 if (TII
->isLegalFLATOffset(COffsetVal
, findMemSDNode(N
)->getAddressSpace(),
1639 OffsetVal
= COffsetVal
;
1644 Offset
= CurDAG
->getTargetConstant(OffsetVal
, SDLoc(), MVT::i16
);
1645 SLC
= CurDAG
->getTargetConstant(0, SDLoc(), MVT::i1
);
1649 bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode
*N
,
1653 SDValue
&SLC
) const {
1654 return SelectFlatOffset
<false>(N
, Addr
, VAddr
, Offset
, SLC
);
1657 bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode
*N
,
1661 SDValue
&SLC
) const {
1662 return SelectFlatOffset
<true>(N
, Addr
, VAddr
, Offset
, SLC
);
1665 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode
,
1666 SDValue
&Offset
, bool &Imm
) const {
1668 // FIXME: Handle non-constant offsets.
1669 ConstantSDNode
*C
= dyn_cast
<ConstantSDNode
>(ByteOffsetNode
);
1673 SDLoc
SL(ByteOffsetNode
);
1674 GCNSubtarget::Generation Gen
= Subtarget
->getGeneration();
1675 int64_t ByteOffset
= C
->getSExtValue();
1676 int64_t EncodedOffset
= AMDGPU::getSMRDEncodedOffset(*Subtarget
, ByteOffset
);
1678 if (AMDGPU::isLegalSMRDImmOffset(*Subtarget
, ByteOffset
)) {
1679 Offset
= CurDAG
->getTargetConstant(EncodedOffset
, SL
, MVT::i32
);
1684 if (!isUInt
<32>(EncodedOffset
) || !isUInt
<32>(ByteOffset
))
1687 if (Gen
== AMDGPUSubtarget::SEA_ISLANDS
&& isUInt
<32>(EncodedOffset
)) {
1688 // 32-bit Immediates are supported on Sea Islands.
1689 Offset
= CurDAG
->getTargetConstant(EncodedOffset
, SL
, MVT::i32
);
1691 SDValue C32Bit
= CurDAG
->getTargetConstant(ByteOffset
, SL
, MVT::i32
);
1692 Offset
= SDValue(CurDAG
->getMachineNode(AMDGPU::S_MOV_B32
, SL
, MVT::i32
,
1699 SDValue
AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr
) const {
1700 if (Addr
.getValueType() != MVT::i32
)
1703 // Zero-extend a 32-bit address.
1706 const MachineFunction
&MF
= CurDAG
->getMachineFunction();
1707 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
1708 unsigned AddrHiVal
= Info
->get32BitAddressHighBits();
1709 SDValue AddrHi
= CurDAG
->getTargetConstant(AddrHiVal
, SL
, MVT::i32
);
1711 const SDValue Ops
[] = {
1712 CurDAG
->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID
, SL
, MVT::i32
),
1714 CurDAG
->getTargetConstant(AMDGPU::sub0
, SL
, MVT::i32
),
1715 SDValue(CurDAG
->getMachineNode(AMDGPU::S_MOV_B32
, SL
, MVT::i32
, AddrHi
),
1717 CurDAG
->getTargetConstant(AMDGPU::sub1
, SL
, MVT::i32
),
1720 return SDValue(CurDAG
->getMachineNode(AMDGPU::REG_SEQUENCE
, SL
, MVT::i64
,
1724 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr
, SDValue
&SBase
,
1725 SDValue
&Offset
, bool &Imm
) const {
1728 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
1729 // wraparound, because s_load instructions perform the addition in 64 bits.
1730 if ((Addr
.getValueType() != MVT::i32
||
1731 Addr
->getFlags().hasNoUnsignedWrap()) &&
1732 CurDAG
->isBaseWithConstantOffset(Addr
)) {
1733 SDValue N0
= Addr
.getOperand(0);
1734 SDValue N1
= Addr
.getOperand(1);
1736 if (SelectSMRDOffset(N1
, Offset
, Imm
)) {
1737 SBase
= Expand32BitAddress(N0
);
1741 SBase
= Expand32BitAddress(Addr
);
1742 Offset
= CurDAG
->getTargetConstant(0, SL
, MVT::i32
);
1747 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr
, SDValue
&SBase
,
1748 SDValue
&Offset
) const {
1750 return SelectSMRD(Addr
, SBase
, Offset
, Imm
) && Imm
;
1753 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr
, SDValue
&SBase
,
1754 SDValue
&Offset
) const {
1756 if (Subtarget
->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS
)
1760 if (!SelectSMRD(Addr
, SBase
, Offset
, Imm
))
1763 return !Imm
&& isa
<ConstantSDNode
>(Offset
);
1766 bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr
, SDValue
&SBase
,
1767 SDValue
&Offset
) const {
1769 return SelectSMRD(Addr
, SBase
, Offset
, Imm
) && !Imm
&&
1770 !isa
<ConstantSDNode
>(Offset
);
1773 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr
,
1774 SDValue
&Offset
) const {
1776 return SelectSMRDOffset(Addr
, Offset
, Imm
) && Imm
;
1779 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr
,
1780 SDValue
&Offset
) const {
1781 if (Subtarget
->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS
)
1785 if (!SelectSMRDOffset(Addr
, Offset
, Imm
))
1788 return !Imm
&& isa
<ConstantSDNode
>(Offset
);
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                            SDValue &Base,
                                            SDValue &Offset) const {
  SDLoc DL(Index);

  if (CurDAG->isBaseWithConstantOffset(Index)) {
    SDValue N0 = Index.getOperand(0);
    SDValue N1 = Index.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);

    // Don't peel off the offset (c0) if doing so could possibly lead
    // the base (n0) to be negative.
    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
      return true;
    }
  }

  if (isa<ConstantSDNode>(Index))
    return false;

  Base = Index;
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}
SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
                                     SDValue Val, uint32_t Offset,
                                     uint32_t Width) {
  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
  uint32_t PackedVal = Offset | (Width << 16);
  SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);

  return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
}
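// Worked example: extracting an 8-bit field that starts at bit 16 packs as
// PackedVal = 16 | (8 << 16) = 0x00080010, which becomes the second source
// operand of the S_BFE instruction.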
void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
  // "(a << b) srl c" ---> "BFE_U32 a, (c-b), (32-c)"
  // "(a << b) sra c" ---> "BFE_I32 a, (c-b), (32-c)"
  // Predicate: 0 < b <= c < 32

  const SDValue &Shl = N->getOperand(0);
  ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));

  if (B && C) {
    uint32_t BVal = B->getZExtValue();
    uint32_t CVal = C->getZExtValue();

    if (0 < BVal && BVal <= CVal && CVal < 32) {
      bool Signed = N->getOpcode() == ISD::SRA;
      unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

      ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
                              32 - CVal));
      return;
    }
  }
  SelectCode(N);
}
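// Worked example: for (x << 8) srl 24 we have b = 8, c = 24, so the predicate
// 0 < b <= c < 32 holds and the node becomes S_BFE_U32 x with
// offset = c - b = 16 and width = 32 - c = 8, i.e. it extracts bits [23:16].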
void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = countPopulation(MaskVal);

          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
                                  Srl.getOperand(0), ShiftVal, WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(0).getOpcode() == ISD::AND) {
      // "(a & mask) srl b" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = countPopulation(MaskVal);

          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
                                  And.getOperand(0), ShiftVal, WidthVal));
          return;
        }
      }
    } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  case ISD::SIGN_EXTEND_INREG: {
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (!Amt)
      break;

    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
    ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
                            Amt->getZExtValue(), Width));
    return;
  }
  }

  SelectCode(N);
}
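// Worked example for the ISD::AND case above: (x srl 8) & 0xff satisfies
// isMask_32(0xff), and countPopulation(0xff) == 8, so the node is selected as
// S_BFE_U32 x with offset = 8 and width = 8.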
bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
  assert(N->getOpcode() == ISD::BRCOND);
  if (!N->hasOneUse())
    return false;

  SDValue Cond = N->getOperand(1);
  if (Cond.getOpcode() == ISD::CopyToReg)
    Cond = Cond.getOperand(2);

  if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
    return false;

  MVT VT = Cond.getOperand(0).getSimpleValueType();

  if (VT == MVT::i32)
    return true;

  if (VT == MVT::i64) {
    auto ST = static_cast<const GCNSubtarget *>(Subtarget);

    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
  }

  return false;
}
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  if (Cond.isUndef()) {
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
  unsigned CondReg = UseSCCBr ? (unsigned)AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (!UseSCCBr) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND.
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when it is unnecessary. But it would be better to add a
    // separate pass after SIFixSGPRCopies to do the unnecessary S_AND removal,
    // so it catches both cases.
    Cond = SDValue(
        CurDAG->getMachineNode(
            ST->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
            MVT::i1,
            CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
                                               : AMDGPU::EXEC,
                                MVT::i1),
            Cond),
        0);
  }

  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}
void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
  MVT VT = N->getSimpleValueType(0);
  bool IsFMA = N->getOpcode() == ISD::FMA;
  if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
                         !Subtarget->hasFmaMixInsts()) ||
      ((IsFMA && Subtarget->hasMadMixInsts()) ||
       (!IsFMA && Subtarget->hasFmaMixInsts()))) {
    SelectCode(N);
    return;
  }

  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  SDValue Src2 = N->getOperand(2);
  unsigned Src0Mods, Src1Mods, Src2Mods;

  // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
  // using the conversion from f16.
  bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
  bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
  bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);

  assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
         "fmad selected with denormals enabled");
  // TODO: We can select this with f32 denormals enabled if all the sources are
  // converted from f16 (in which case fmad isn't legal).

  if (Sel0 || Sel1 || Sel2) {
    // For dummy operands.
    SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
    SDValue Ops[] = {
      CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
      CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
      CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
      CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
      Zero, Zero
    };

    CurDAG->SelectNodeTo(N,
                         IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
                         MVT::f32, Ops);
  } else {
    SelectCode(N);
  }
}
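// Illustrative case: fma(fpext(a : f16), b : f32, c : f32) sets Sel0 via
// SelectVOP3PMadMixModsImpl (the fp_extend marks src0 as an f16 input), so
// the node is selected as V_FMA_MIX_F32 instead of falling back to SelectCode.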
// This is here because there isn't a way to use the generated sub0_sub1 as the
// subreg index to EXTRACT_SUBREG in tablegen.
void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
  MemSDNode *Mem = cast<MemSDNode>(N);
  unsigned AS = Mem->getAddressSpace();
  if (AS == AMDGPUAS::FLAT_ADDRESS) {
    SelectCode(N);
    return;
  }

  MVT VT = N->getSimpleValueType(0);
  bool Is32 = (VT == MVT::i32);
  SDLoc SL(N);

  MachineSDNode *CmpSwap = nullptr;
  if (Subtarget->hasAddr64()) {
    SDValue SRsrc, VAddr, SOffset, Offset, SLC;

    if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset,
                          SLC)) {
      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
      SDValue CmpVal = Mem->getOperand(2);

      // XXX - Do we care about glue operands?

      SDValue Ops[] = {
        CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain()
      };

      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
    }
  }

  if (!CmpSwap) {
    SDValue SRsrc, SOffset, Offset, SLC;
    if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;

      SDValue CmpVal = Mem->getOperand(2);
      SDValue Ops[] = {
        CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain()
      };

      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
    }
  }

  if (!CmpSwap) {
    SelectCode(N);
    return;
  }

  MachineMemOperand *MMO = Mem->getMemOperand();
  CurDAG->setNodeMemRefs(CmpSwap, {MMO});

  unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  SDValue Extract
    = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));

  ReplaceUses(SDValue(N, 0), Extract);
  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
  CurDAG->RemoveDeadNode(N);
}
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
  // be copied to an SGPR with readfirstlane.
  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  SDValue Chain = N->getOperand(0);
  SDValue Ptr = N->getOperand(2);
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();
  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  SDValue Offset;
  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
    SDValue PtrBase = Ptr.getOperand(0);
    SDValue PtrOffset = Ptr.getOperand(1);

    const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
      N = glueCopyToM0(N, PtrBase);
      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
    }
  }

  if (!Offset) {
    N = glueCopyToM0(N, Ptr);
    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
  }

  SDValue Ops[] = {
    Offset,
    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
    Chain,
    N->getOperand(N->getNumOperands() - 1) // New glue
  };

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}
void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !Subtarget->hasGWSSemaReleaseAll()) {
    SelectCode(N);
    return;
  }

  // Chain, intrinsic ID, vsrc, offset
  const bool HasVSrc = N->getNumOperands() == 4;
  assert(HasVSrc || N->getNumOperands() == 3);

  SDLoc SL(N);
  SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
  unsigned ImmOffset = 0;
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();

  // Don't worry if the offset ends up in a VGPR. Only one lane will have
  // effect, so SIFixSGPRCopies will validly insert readfirstlane.

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.
    glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
    ImmOffset = ConstOffset->getZExtValue();
  } else {
    if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
      ImmOffset = BaseOffset.getConstantOperandVal(1);
      BaseOffset = BaseOffset.getOperand(0);
    }

    // Prefer to do the shift in an SGPR since it should be possible to use m0
    // as the result directly. If it's already an SGPR, it will be eliminated
    // later.
    SDNode *SGPROffset
      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
                               BaseOffset);
    // Shift to offset in m0
    SDNode *M0Base
      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
                               SDValue(SGPROffset, 0),
                               CurDAG->getTargetConstant(16, SL, MVT::i32));
    glueCopyToM0(N, SDValue(M0Base, 0));
  }

  SDValue Chain = N->getOperand(0);
  SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);

  // TODO: Can this just be removed from the instruction?
  SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);

  const unsigned Opc = gwsIntrinToOpcode(IntrID);
  SmallVector<SDValue, 5> Ops;
  if (HasVSrc)
    Ops.push_back(N->getOperand(2));
  Ops.push_back(OffsetField);
  Ops.push_back(GDS);
  Ops.push_back(Chain);

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    if (N->getValueType(0) != MVT::i32)
      break;
    SelectDSAppendConsume(N, IntrID);
    return;
  }
  }

  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  unsigned Opcode;
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
    Opcode = AMDGPU::WWM;
    break;
  default:
    SelectCode(N);
    return;
  }

  SDValue Src = N->getOperand(1);
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    SelectDS_GWS(N, IntrID);
    return;
  default:
    break;
  }

  SelectCode(N);
}
bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                            unsigned &Mods) const {
  Mods = 0;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::FABS) {
    Mods |= SISrcMods::ABS;
    Src = Src.getOperand(0);
  }

  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
                                        SDValue &SrcMods) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods)) {
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}
bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  SelectVOP3Mods(In, Src, SrcMods);
  return isNoNanSrc(Src);
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods_f32(SDValue In, SDValue &Src,
                                            SDValue &SrcMods) const {
  if (In.getValueType() == MVT::f32)
    return SelectVOP3Mods(In, Src, SrcMods);

  Src = In;
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
  if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
    return false;

  Src = In;
  return true;
}
bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, SDValue &Clamp,
                                         SDValue &Omod) const {
  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
                                                   SDValue &SrcMods,
                                                   SDValue &Clamp,
                                                   SDValue &Omod) const {
  Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
                                         SDValue &Clamp, SDValue &Omod) const {
  Src = In;

  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return true;
}
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  unsigned Mods = 0;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::BUILD_VECTOR) {
    unsigned VecMods = Mods;

    SDValue Lo = stripBitcast(Src.getOperand(0));
    SDValue Hi = stripBitcast(Src.getOperand(1));

    if (Lo.getOpcode() == ISD::FNEG) {
      Lo = stripBitcast(Lo.getOperand(0));
      Mods ^= SISrcMods::NEG;
    }

    if (Hi.getOpcode() == ISD::FNEG) {
      Hi = stripBitcast(Hi.getOperand(0));
      Mods ^= SISrcMods::NEG_HI;
    }

    if (isExtractHiElt(Lo, Lo))
      Mods |= SISrcMods::OP_SEL_0;

    if (isExtractHiElt(Hi, Hi))
      Mods |= SISrcMods::OP_SEL_1;

    Lo = stripExtractLoElt(Lo);
    Hi = stripExtractLoElt(Hi);

    if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
      // Really a scalar input. Just select from the low half of the register
      // to avoid packing.

      Src = Lo;
      SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
      return true;
    }

    Mods = VecMods;
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}
bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods,
                                          SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp and op_sel
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3PMods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  Src = In;
  // FIXME: Handle op_sel
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods,
                                          SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3OpSel(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  // FIXME: Handle op_sel
  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src,
                                              SDValue &SrcMods,
                                              SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3OpSelMods(In, Src, SrcMods);
}
// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods) const {
  Mods = 0;
  SelectVOP3ModsImpl(In, Src, Mods);

  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
    assert(Src.getValueType() == MVT::f16);
    Src = stripBitcast(Src);

    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) == 0) {
      unsigned ModsTmp;
      SelectVOP3ModsImpl(Src, Src, ModsTmp);

      if ((ModsTmp & SISrcMods::NEG) != 0)
        Mods ^= SISrcMods::NEG;

      if ((ModsTmp & SISrcMods::ABS) != 0)
        Mods |= SISrcMods::ABS;
    }

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // fp16. If the source's op_sel is set, it picks the high half of the
    // source register.

    Mods |= SISrcMods::OP_SEL_1;
    if (isExtractHiElt(Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
    }

    return true;
  }

  return false;
}
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
                                               SDValue &SrcMods) const {
  unsigned Mods = 0;
  SelectVOP3PMadMixModsImpl(In, Src, Mods);
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}
SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
  if (In.isUndef())
    return CurDAG->getUNDEF(MVT::i32);

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
  }

  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(
      C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
  }

  SDValue Src;
  if (isExtractHiElt(In, Src))
    return Src;

  return SDValue();
}
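// For example, a ConstantSDNode input of 0x3C00 (the f16 bit pattern of 1.0)
// yields the i32 constant 0x3C00 << 16 = 0x3C000000, i.e. the value placed in
// the high 16 bits of the result.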
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
  assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);

  const SIRegisterInfo *SIRI =
    static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo * SII =
    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());

    // If the register class is unknown, it could be an unknown
    // register class that needs to be an SGPR, e.g. an inline asm
    // constraint.
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass) {
      AllUsesAcceptSReg = false;
      SDNode *User = *U;
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        MCInstrDesc Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
              getOperandRegClass(*U, CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }

      // If "AllUsesAcceptSReg == false" so far, we haven't succeeded in
      // commuting the current user. This means we have at least one use
      // that strictly requires a VGPR, so we will not attempt to commute
      // other user instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  return !AllUsesAcceptSReg && (Limit < 10);
}
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  auto Ld = cast<LoadSDNode>(N);

  return Ld->getAlignment() >= 4 &&
         (((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
            Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
           !N->isDivergent()) ||
          (Subtarget->getScalarizeGlobalBehavior() &&
           Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
           !Ld->isVolatile() &&
           !N->isDivergent() &&
           static_cast<const SITargetLowering *>(
             getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
}
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
  const AMDGPUTargetLowering &Lowering =
    *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
  bool IsModified = false;
  do {
    IsModified = false;

    // Go over all selected nodes and try to fold them a bit more
    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
    while (Position != CurDAG->allnodes_end()) {
      SDNode *Node = &*Position++;
      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
      if (!MachineNode)
        continue;

      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
      if (ResNode != Node) {
        if (ResNode)
          ReplaceUses(Node, ResNode);
        IsModified = true;
      }
    }
    CurDAG->RemoveDeadNodes();
  } while (IsModified);
}
bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<R600Subtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}
bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
  if (CbId == -1)
    return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;

  return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
}
bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
                                                       SDValue &IntPtr) {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
    IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
                                       true);
    return true;
  }
  return false;
}

bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
                                                       SDValue &BaseReg,
                                                       SDValue &Offset) {
  if (!isa<ConstantSDNode>(Addr)) {
    BaseReg = Addr;
    Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
    return true;
  }
  return false;
}
void R600DAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return;   // Already selected.
  }

  switch (Opc) {
  default: break;
  case AMDGPUISD::BUILD_VERTICAL_VECTOR:
  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    unsigned RegClassID;
    // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
    // that adds a 128 bits reg copy when going through TwoAddressInstructions
    // pass. We want to avoid 128 bits copies as much as possible because they
    // can't be bundled by our scheduler.
    switch(NumVectorElts) {
    case 2: RegClassID = R600::R600_Reg64RegClassID; break;
    case 4:
      if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
        RegClassID = R600::R600_Reg128VerticalRegClassID;
      else
        RegClassID = R600::R600_Reg128RegClassID;
      break;
    default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
    }
    SelectBuildVector(N, RegClassID);
    return;
  }
  }

  SelectCode(N);
}
bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}
bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) {
  ConstantSDNode *IMMOffset;

  if (Addr.getOpcode() == ISD::ADD
      && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
      && isInt<16>(IMMOffset->getZExtValue())) {

    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
                                       MVT::i32);
    return true;
  // If the pointer address is constant, we can move it to the offset field.
  } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
             && isInt<16>(IMMOffset->getZExtValue())) {
    Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
                                  SDLoc(CurDAG->getEntryNode()),
                                  R600::ZERO, MVT::i32);
    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
                                       MVT::i32);
    return true;
  }

  // Default case, no offset
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);