//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUInstrInfo.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Dominators.h"
#endif
#include "llvm/IR/Instruction.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"

#define DEBUG_TYPE "isel"

using namespace llvm;

namespace llvm {

class R600InstrInfo;

} // end namespace llvm
//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {
static bool isNullConstantOrUndef(SDValue V) {
  if (V.isUndef())
    return true;

  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
  return Const != nullptr && Const->isNullValue();
}
static bool getConstantValue(SDValue N, uint32_t &Out) {
  // This is only used for packed vectors, where using 0 for undef should
  // always be good.
  if (N.isUndef()) {
    Out = 0;
    return true;
  }

  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
    Out = C->getAPIntValue().getSExtValue();
    return true;
  }

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
    Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
    return true;
  }

  return false;
}
// TODO: Handle undef as zero
static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
                                 bool Negate = false) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
  uint32_t LHSVal, RHSVal;
  if (getConstantValue(N->getOperand(0), LHSVal) &&
      getConstantValue(N->getOperand(1), RHSVal)) {
    SDLoc SL(N);
    uint32_t K = Negate ?
      (-LHSVal & 0xffff) | (-RHSVal << 16) :
      (LHSVal & 0xffff) | (RHSVal << 16);
    return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
                              DAG.getTargetConstant(K, SL, MVT::i32));
  }

  return nullptr;
}
static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
  return packConstantV2I16(N, DAG, true);
}
/// AMDGPU specific code to select AMDGPU machine instructions for
/// SelectionDAG operations.
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
  // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
  // make the right decision when generating code for different targets.
  const GCNSubtarget *Subtarget;
  bool EnableLateStructurizeCFG;

public:
  explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
                              CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
    : SelectionDAGISel(*TM, OptLevel) {
    EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
  }
  ~AMDGPUDAGToDAGISel() override = default;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AMDGPUArgumentUsageInfo>();
    AU.addRequired<LegacyDivergenceAnalysis>();
#ifdef EXPENSIVE_CHECKS
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
#endif
    SelectionDAGISel::getAnalysisUsage(AU);
  }

  bool matchLoadD16FromBuildVector(SDNode *N) const;

  bool runOnMachineFunction(MachineFunction &MF) override;
  void PreprocessISelDAG() override;
  void Select(SDNode *N) override;
  StringRef getPassName() const override;
  void PostprocessISelDAG() override;

protected:
  void SelectBuildVector(SDNode *N, unsigned RegClassID);

private:
  std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
  bool isNoNanSrc(SDValue N) const;
  bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
  bool isNegInlineImmediate(const SDNode *N) const {
    return isInlineImmediate(N, true);
  }

  bool isVGPRImm(const SDNode *N) const;
  bool isUniformLoad(const SDNode *N) const;
  bool isUniformBr(const SDNode *N) const;

  MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;

  SDNode *glueCopyToM0LDSInit(SDNode *N) const;
  SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;

  const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
  virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
  virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
  bool isDSOffsetLegal(SDValue Base, unsigned Offset,
                       unsigned OffsetBits) const;
  bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
  bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                 SDValue &Offset1) const;
  bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                   SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                   SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
                   SDValue &TFE, SDValue &DLC) const;
  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                         SDValue &SOffset, SDValue &Offset, SDValue &GLC,
                         SDValue &SLC, SDValue &TFE, SDValue &DLC) const;
  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                         SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
                         SDValue &SLC) const;
  bool SelectMUBUFScratchOffen(SDNode *Parent,
                               SDValue Addr, SDValue &RSrc, SDValue &VAddr,
                               SDValue &SOffset, SDValue &ImmOffset) const;
  bool SelectMUBUFScratchOffset(SDNode *Parent,
                                SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                                SDValue &Offset) const;

  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
                         SDValue &Offset, SDValue &GLC, SDValue &SLC,
                         SDValue &TFE, SDValue &DLC) const;
  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                         SDValue &Offset, SDValue &SLC) const;
  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                         SDValue &Offset) const;

  bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr,
                        SDValue &Offset, SDValue &SLC) const;
  bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr,
                              SDValue &Offset, SDValue &SLC) const;

  template <bool IsSigned>
  bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
                        SDValue &Offset, SDValue &SLC) const;

  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
                        bool &Imm) const;
  SDValue Expand32BitAddress(SDValue Addr) const;
  bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
                  bool &Imm) const;
  bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
  bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
  bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;

  bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3Mods_f32(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const;
  bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
  bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                       SDValue &Clamp, SDValue &Omod) const;
  bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                         SDValue &Clamp, SDValue &Omod) const;

  bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods,
                                 SDValue &Clamp,
                                 SDValue &Omod) const;

  bool SelectVOP3OMods(SDValue In, SDValue &Src,
                       SDValue &Clamp, SDValue &Omod) const;

  bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                        SDValue &Clamp) const;

  bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods,
                        SDValue &Clamp) const;

  bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                            SDValue &Clamp) const;
  bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
  bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;

  SDValue getHi16Elt(SDValue In) const;

  void SelectADD_SUB_I64(SDNode *N);
  void SelectAddcSubb(SDNode *N);
  void SelectUADDO_USUBO(SDNode *N);
  void SelectDIV_SCALE(SDNode *N);
  void SelectDIV_FMAS(SDNode *N);
  void SelectMAD_64_32(SDNode *N);
  void SelectFMA_W_CHAIN(SDNode *N);
  void SelectFMUL_W_CHAIN(SDNode *N);

  SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
                   uint32_t Offset, uint32_t Width);
  void SelectS_BFEFromShifts(SDNode *N);
  void SelectS_BFE(SDNode *N);
  bool isCBranchSCC(const SDNode *N) const;
  void SelectBRCOND(SDNode *N);
  void SelectFMAD_FMA(SDNode *N);
  void SelectATOMIC_CMP_SWAP(SDNode *N);
  void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
  void SelectDS_GWS(SDNode *N, unsigned IntrID);
  void SelectINTRINSIC_W_CHAIN(SDNode *N);
  void SelectINTRINSIC_WO_CHAIN(SDNode *N);
  void SelectINTRINSIC_VOID(SDNode *N);

protected:
  // Include the pieces autogenerated from the target description.
#include "AMDGPUGenDAGISel.inc"
};
class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
  const R600Subtarget *Subtarget;

  bool isConstantLoad(const MemSDNode *N, int cbID) const;
  bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
  bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
                                       SDValue &Offset);

public:
  explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
      AMDGPUDAGToDAGISel(TM, OptLevel) {}

  void Select(SDNode *N) override;

  bool SelectADDRIndirect(SDValue Addr, SDValue &Base,
                          SDValue &Offset) override;
  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                          SDValue &Offset) override;

  bool runOnMachineFunction(MachineFunction &MF) override;

  void PreprocessISelDAG() override {}

protected:
  // Include the pieces autogenerated from the target description.
#include "R600GenDAGISel.inc"
};
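
// Strip a bitcast so the helpers below can look directly at the value that
// feeds a build_vector operand.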
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);
  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}

// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
                                        CodeGenOpt::Level OptLevel) {
  return new AMDGPUDAGToDAGISel(TM, OptLevel);
}

/// This pass converts a legalized DAG into an R600-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
                                      CodeGenOpt::Level OptLevel) {
  return new R600DAGToDAGISel(TM, OptLevel);
}
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}
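
// Returns true if N is known to never be a NaN, based on the fast-math
// options or SelectionDAG value tracking.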
bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
  if (TM.Options.NoNaNsFPMath)
    return true;

  // TODO: Move into isKnownNeverNaN
  if (N->getFlags().isDefined())
    return N->getFlags().hasNoNaNs();

  return CurDAG->isKnownNeverNaN(N);
}
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
                                           bool Negated) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (Negated) {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(-C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());

  } else {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
  }

  return false;
}
/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Register::isVirtualRegister(Reg)) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI
        = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
      return TRI->getPhysRegClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.OpInfo[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N),
                                 Val);

  SDValue Glue = M0.getValue(1);

  SmallVector<SDValue, 8> Ops;
  Ops.push_back(M0); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}
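
// Materialize a 64-bit immediate into an SGPR pair with two S_MOV_B32s
// combined by a REG_SEQUENCE.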
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
  SDNode *Hi =
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                             CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
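
// Map the number of 32-bit vector elements to the SGPR register class wide
// enough to hold the whole vector.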
static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
  switch (NumVectorElts) {
  case 1:
    return AMDGPU::SReg_32_XM0RegClassID;
  case 2:
    return AMDGPU::SReg_64RegClassID;
  case 3:
    return AMDGPU::SGPR_96RegClassID;
  case 4:
    return AMDGPU::SReg_128RegClassID;
  case 5:
    return AMDGPU::SGPR_160RegClassID;
  case 8:
    return AMDGPU::SReg_256RegClassID;
  case 16:
    return AMDGPU::SReg_512RegClassID;
  case 32:
    return AMDGPU::SReg_1024RegClassID;
  }

  llvm_unreachable("invalid vector size");
}
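
// Lower a build_vector/scalar_to_vector to a REG_SEQUENCE over the given
// register class, padding any missing elements with IMPLICIT_DEF.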
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return;   // Already selected.
  }

  if (isa<AtomicSDNode>(N) ||
      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
       Opc == ISD::ATOMIC_LOAD_FADD ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
    N = glueCopyToM0LDSInit(N);

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lower it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
    else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case ISD::LOAD:
  case ISD::STORE:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE: {
    N = glueCopyToM0LDSInit(N);
    break;
  }

  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
                            SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::DIV_FMAS: {
    SelectDIV_FMAS(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FMAD:
  case ISD::FMA:
    SelectFMAD_FMA(N);
    return;
  case AMDGPUISD::ATOMIC_CMP_SWAP:
    SelectATOMIC_CMP_SWAP(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  }

  SelectCode(N);
}
bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}
StringRef AMDGPUDAGToDAGISel::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}
// FIXME: Should only handle addcarry/subcarry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo, 0),
    Sub0,
    SDValue(AddHi, 0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}
void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
                                                 : AMDGPU::V_SUBB_U32_e64;
  CurDAG->SelectNodeTo(
      N, Opc, N->getVTList(),
      {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
}
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
  // carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  unsigned Opc = N->getOpcode() == ISD::UADDO ?
    AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;

  CurDAG->SelectNodeTo(
      N, Opc, N->getVTList(),
      {N->getOperand(0), N->getOperand(1),
       CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
}
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
}
void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;

  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) {
  const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32;

  SDValue CarryIn = N->getOperand(3);
  // V_DIV_FMAS implicitly reads VCC.
  SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL,
                                     TRI->getVCC(), CarryIn, SDValue());

  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);

  Ops[8] = VCC.getValue(0);
  Ops[9] = VCC.getValue(1);

  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
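
// Return true if Offset fits the unsigned immediate field (OffsetBits wide)
// of a DS instruction when used together with Base.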
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
                                         unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isUInt<16>(ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_I32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
              CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
            CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    if (isUInt<16>(CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}
// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned DWordOffset0 = C1->getZExtValue() / 4;
    unsigned DWordOffset1 = DWordOffset0 + 1;
    // (add n0, c0)
    if (isDSOffsetLegal(N0, DWordOffset1, 8)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned DWordOffset0 = C->getZExtValue() / 4;
      unsigned DWordOffset1 = DWordOffset0 + 1;

      if (isUInt<8>(DWordOffset0)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_I32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
              CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub
            = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
    unsigned DWordOffset1 = DWordOffset0 + 1;
    assert(4 * DWordOffset0 == CAddr->getZExtValue());

    if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero
        = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                 DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}
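
// Decompose Addr into the MUBUF operands (resource pointer, vaddr, soffset,
// immediate offset and addressing-mode flags), using the addr64 form when the
// base address is divergent.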
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
                                     SDValue &VAddr, SDValue &SOffset,
                                     SDValue &Offset, SDValue &Offen,
                                     SDValue &Idxen, SDValue &Addr64,
                                     SDValue &GLC, SDValue &SLC,
                                     SDValue &TFE, SDValue &DLC) const {
  // Subtarget prefers to use flat instruction
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  if (!GLC.getNode())
    GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
  if (!SLC.getNode())
    SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
  TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
  DLC = CurDAG->getTargetConstant(0, DL, MVT::i1);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
    return true;
  }

  if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  SOffset =
    SDValue(CurDAG->getMachineNode(
              AMDGPU::S_MOV_B32, DL, MVT::i32,
              CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
            0);
  return true;
}
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset, SDValue &GLC,
                                           SDValue &SLC, SDValue &TFE,
                                           SDValue &DLC) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
                   GLC, SLC, TFE, DLC))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset,
                                           SDValue &SLC) const {
  SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
  SDValue GLC, TFE, DLC;

  return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC);
}
static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
  return PSV && PSV->isStack();
}
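
// Split a private address into (vaddr, soffset): frame indices become a
// TargetFrameIndex relative to the stack pointer, anything else is offset
// from the scratch wave offset register.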
std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (auto FI = dyn_cast<FrameIndexSDNode>(N)) {
    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
                                              FI->getValueType(0));

    // If we can resolve this to a frame index access, this will be relative to
    // either the stack or frame pointer SGPR.
    return std::make_pair(
        TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32));
  }

  // If we don't know this private access is a local stack object, it needs to
  // be relative to the entry point's scratch wave offset register.
  return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
                                               MVT::i32));
}
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {
  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned Imm = CAddr->getZExtValue();

    SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
    MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                        DL, MVT::i32, HighBits);
    VAddr = SDValue(MovHighBits, 0);

    // In a call sequence, stores to the argument stack area are relative to the
    // stack pointer.
    const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
    unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
      Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();

    SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
    ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
    return true;
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  return true;
}
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
  if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
    return false;

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
  unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
    Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();

  // FIXME: Get from MachinePointerInfo? We should only be using the frame
  // offset if we know this is in a call sequence.
  SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
  return true;
}
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset,
                                           SDValue &GLC, SDValue &SLC,
                                           SDValue &TFE, SDValue &DLC) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
                   GLC, SLC, TFE, DLC))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    APInt::getAllOnesValue(32).getZExtValue(); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &Soffset, SDValue &Offset
                                           ) const {
  SDValue GLC, SLC, TFE, DLC;

  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &Soffset, SDValue &Offset,
                                           SDValue &SLC) const {
  SDValue GLC, TFE, DLC;

  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC);
}
template <bool IsSigned>
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
                                          SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset,
                                          SDValue &SLC) const {
  return static_cast<const SITargetLowering*>(getTargetLowering())->
    SelectFlatOffset(IsSigned, *CurDAG, N, Addr, VAddr, Offset, SLC);
}

bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N,
                                          SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset,
                                          SDValue &SLC) const {
  return SelectFlatOffset<false>(N, Addr, VAddr, Offset, SLC);
}

bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N,
                                                SDValue Addr,
                                                SDValue &VAddr,
                                                SDValue &Offset,
                                                SDValue &SLC) const {
  return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC);
}
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                          SDValue &Offset, bool &Imm) const {
  // FIXME: Handle non-constant offsets.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
  if (!C)
    return false;

  SDLoc SL(ByteOffsetNode);
  GCNSubtarget::Generation Gen = Subtarget->getGeneration();
  int64_t ByteOffset = C->getSExtValue();
  int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset);

  if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) {
    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
    Imm = true;
    return true;
  }

  if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset))
    return false;

  if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) {
    // 32-bit Immediates are supported on Sea Islands.
    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
  } else {
    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
    Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32,
                                            C32Bit), 0);
  }
  Imm = false;
  return true;
}
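
// s_load takes a 64-bit base. Widen a 32-bit address to 64 bits by pairing it
// with the function's known high address bits.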
SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
  if (Addr.getValueType() != MVT::i32)
    return Addr;

  // Zero-extend a 32-bit address.
  SDLoc SL(Addr);

  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned AddrHiVal = Info->get32BitAddressHighBits();
  SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);

  const SDValue Ops[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
    Addr,
    CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
    SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
            0),
    CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
  };

  return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
                                        Ops), 0);
}
bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                    SDValue &Offset, bool &Imm) const {
  SDLoc SL(Addr);

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if ((Addr.getValueType() != MVT::i32 ||
       Addr->getFlags().hasNoUnsignedWrap()) &&
      CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    if (SelectSMRDOffset(N1, Offset, Imm)) {
      SBase = Expand32BitAddress(N0);
      return true;
    }
  }
  SBase = Expand32BitAddress(Addr);
  Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
  Imm = true;
  return true;
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
                                       SDValue &Offset) const {
  bool Imm;
  return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
}

bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                         SDValue &Offset) const {
  if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
    return false;

  bool Imm;
  if (!SelectSMRD(Addr, SBase, Offset, Imm))
    return false;

  return !Imm && isa<ConstantSDNode>(Offset);
}

bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
                                        SDValue &Offset) const {
  bool Imm;
  return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
         !isa<ConstantSDNode>(Offset);
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
                                             SDValue &Offset) const {
  bool Imm;
  return SelectSMRDOffset(Addr, Offset, Imm) && Imm;
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
                                               SDValue &Offset) const {
  if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
    return false;

  bool Imm;
  if (!SelectSMRDOffset(Addr, Offset, Imm))
    return false;

  return !Imm && isa<ConstantSDNode>(Offset);
}
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                            SDValue &Base,
                                            SDValue &Offset) const {
  SDLoc DL(Index);

  if (CurDAG->isBaseWithConstantOffset(Index)) {
    SDValue N0 = Index.getOperand(0);
    SDValue N1 = Index.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);

    // (add n0, c0)
    // Don't peel off the offset (c0) if doing so could possibly lead
    // the base (n0) to be negative.
    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
      return true;
    }
  }

  if (isa<ConstantSDNode>(Index))
    return false;

  Base = Index;
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}
SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
                                     SDValue Val, uint32_t Offset,
                                     uint32_t Width) {
  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
  uint32_t PackedVal = Offset | (Width << 16);
  SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);

  return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
}
void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
  // "(a << b) srl c" ---> "BFE_U32 a, (c - b), (32 - c)"
  // "(a << b) sra c" ---> "BFE_I32 a, (c - b), (32 - c)"
  // Predicate: 0 < b <= c < 32

  const SDValue &Shl = N->getOperand(0);
  ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));

  if (B && C) {
    uint32_t BVal = B->getZExtValue();
    uint32_t CVal = C->getZExtValue();

    if (0 < BVal && BVal <= CVal && CVal < 32) {
      bool Signed = N->getOpcode() == ISD::SRA;
      unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

      ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
                              32 - CVal));
      return;
    }
  }

  SelectCode(N);
}
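
// Worked example for SelectS_BFEFromShifts above: for (a << 8) srl 16 we have
// b = 8 and c = 16, so the node becomes S_BFE_U32 a with offset c - b = 8 and
// width 32 - c = 16.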
void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = countPopulation(MaskVal);

          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
                                  Srl.getOperand(0), ShiftVal, WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(0).getOpcode() == ISD::AND) {
      // "(a & mask) srl b" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = countPopulation(MaskVal);

          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
                                  And.getOperand(0), ShiftVal, WidthVal));
          return;
        }
      }
    } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  case ISD::SIGN_EXTEND_INREG: {
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (!Amt)
      break;

    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
    ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
                            Amt->getZExtValue(), Width));
    return;
  }
  }

  SelectCode(N);
}
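
// Worked example for the AND case above: (a srl 4) & 0xff becomes
// S_BFE_U32 a with offset 4 and width popcount(0xff) = 8.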
bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
  assert(N->getOpcode() == ISD::BRCOND);
  if (!N->hasOneUse())
    return false;

  SDValue Cond = N->getOperand(1);
  if (Cond.getOpcode() == ISD::CopyToReg)
    Cond = Cond.getOperand(2);

  if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
    return false;

  MVT VT = Cond.getOperand(0).getSimpleValueType();

  if (VT == MVT::i32)
    return true;

  if (VT == MVT::i64) {
    auto ST = static_cast<const GCNSubtarget *>(Subtarget);

    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
           ST->hasScalarCompareEq64();
  }

  return false;
}
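
// Note (summary): 64-bit setcc conditions stay on the scalar unit only for
// EQ/NE comparisons and only when the subtarget reports
// hasScalarCompareEq64(); anything else falls back to the VALU branch path.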
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  if (Cond.isUndef()) {
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
  unsigned CondReg = UseSCCBr ? (unsigned)AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (!UseSCCBr) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND.
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when it is unnecessary. But it would be better to add a
    // separate pass after SIFixSGPRCopies to do the unnecessary S_AND removal,
    // so it catches both cases.
    Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
                                                         : AMDGPU::S_AND_B64,
                                          SL, MVT::i1,
                                          CurDAG->getRegister(
                                              ST->isWave32() ? AMDGPU::EXEC_LO
                                                             : AMDGPU::EXEC,
                                              MVT::i1),
                                          Cond),
                   0);
  }

  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}
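
// Informal sketch of the non-SCC path above on wave64: the selected sequence
// amounts to "S_AND_B64 vcc, exec, cond" followed by "S_CBRANCH_VCCNZ bb",
// so condition bits belonging to disabled lanes can never make the branch
// taken.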
void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
  MVT VT = N->getSimpleValueType(0);
  bool IsFMA = N->getOpcode() == ISD::FMA;
  if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
                         !Subtarget->hasFmaMixInsts()) ||
      ((IsFMA && Subtarget->hasMadMixInsts()) ||
       (!IsFMA && Subtarget->hasFmaMixInsts()))) {
    SelectCode(N);
    return;
  }

  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  SDValue Src2 = N->getOperand(2);
  unsigned Src0Mods, Src1Mods, Src2Mods;

  // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an
  // operand using the conversion from f16.
  bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
  bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
  bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);

  assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
         "fmad selected with denormals enabled");
  // TODO: We can select this with f32 denormals enabled if all the sources are
  // converted from f16 (in which case fmad isn't legal).

  if (Sel0 || Sel1 || Sel2) {
    // For dummy operands.
    SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
    SDValue Ops[] = {
      CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
      CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
      CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
      CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
      Zero, Zero
    };

    CurDAG->SelectNodeTo(N,
                         IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
                         MVT::f32, Ops);
  } else {
    SelectCode(N);
  }
}
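
// Note (summary): the mix opcode is only used when at least one of
// Sel0/Sel1/Sel2 folded an f16-to-f32 conversion into the operand; otherwise
// the node goes back through normal selection as a plain fmad/fma.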
// This is here because there isn't a way to use the generated sub0_sub1 as the
// subreg index to EXTRACT_SUBREG in tablegen.
void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
  MemSDNode *Mem = cast<MemSDNode>(N);
  unsigned AS = Mem->getAddressSpace();
  if (AS == AMDGPUAS::FLAT_ADDRESS) {
    SelectCode(N);
    return;
  }

  MVT VT = N->getSimpleValueType(0);
  bool Is32 = (VT == MVT::i32);
  SDLoc SL(N);

  MachineSDNode *CmpSwap = nullptr;
  if (Subtarget->hasAddr64()) {
    SDValue SRsrc, VAddr, SOffset, Offset, SLC;

    if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset,
                          SLC)) {
      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
      SDValue CmpVal = Mem->getOperand(2);

      // XXX - Do we care about glue operands?

      SDValue Ops[] = {
        CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain()
      };

      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
    }
  }

  if (!CmpSwap) {
    SDValue SRsrc, SOffset, Offset, SLC;
    if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;

      SDValue CmpVal = Mem->getOperand(2);
      SDValue Ops[] = {
        CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain()
      };

      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
    }
  }

  if (!CmpSwap) {
    SelectCode(N);
    return;
  }

  MachineMemOperand *MMO = Mem->getMemOperand();
  CurDAG->setNodeMemRefs(CmpSwap, {MMO});

  unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  SDValue Extract
    = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));

  ReplaceUses(SDValue(N, 0), Extract);
  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
  CurDAG->RemoveDeadNode(N);
}
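
// Note (summary): the buffer cmpswap instructions return the loaded value in
// the low half of the data pair, so the result is pulled out with sub0 (i32)
// or sub0_sub1 (i64) before the original node is replaced.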
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
  // be copied to an SGPR with readfirstlane.
  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  SDValue Chain = N->getOperand(0);
  SDValue Ptr = N->getOperand(2);
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();
  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  SDValue Offset;
  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
    SDValue PtrBase = Ptr.getOperand(0);
    SDValue PtrOffset = Ptr.getOperand(1);

    const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
      N = glueCopyToM0(N, PtrBase);
      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
    }
  }

  if (!Offset) {
    N = glueCopyToM0(N, Ptr);
    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
  }

  SDValue Ops[] = {
    Offset,
    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
    Chain,
    N->getOperand(N->getNumOperands() - 1) // New glue
  };

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}
void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !Subtarget->hasGWSSemaReleaseAll()) {
    // Let this error.
    SelectCode(N);
    return;
  }

  // Chain, intrinsic ID, vsrc, offset
  const bool HasVSrc = N->getNumOperands() == 4;
  assert(HasVSrc || N->getNumOperands() == 3);

  SDLoc SL(N);
  SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
  int ImmOffset = 0;
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();

  // Don't worry if the offset ends up in a VGPR. Only one lane will have
  // effect, so SIFixSGPRCopies will validly insert readfirstlane.

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1
    // to the immediate offset.
    glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
    ImmOffset = ConstOffset->getZExtValue();
  } else {
    if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
      ImmOffset = BaseOffset.getConstantOperandVal(1);
      BaseOffset = BaseOffset.getOperand(0);
    }

    // Prefer to do the shift in an SGPR since it should be possible to use m0
    // as the result directly. If it's already an SGPR, it will be eliminated
    // later.
    SDNode *SGPROffset
      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
                               BaseOffset);
    // Shift to offset in m0
    SDNode *M0Base
      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
                               SDValue(SGPROffset, 0),
                               CurDAG->getTargetConstant(16, SL, MVT::i32));
    glueCopyToM0(N, SDValue(M0Base, 0));
  }

  SDValue Chain = N->getOperand(0);
  SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);

  // TODO: Can this just be removed from the instruction?
  SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);

  const unsigned Opc = gwsIntrinToOpcode(IntrID);
  SmallVector<SDValue, 5> Ops;
  if (HasVSrc)
    Ops.push_back(N->getOperand(2));
  Ops.push_back(OffsetField);
  Ops.push_back(GDS);
  Ops.push_back(Chain);

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
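
// Informal example for the non-constant offset path above: the base offset is
// made uniform with V_READFIRSTLANE_B32 and shifted left by 16 with
// S_LSHL_B32 so it lands in M0[21:16], matching the
// (base + M0[21:16] + offset) % 64 formula described in the comment.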
void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    if (N->getValueType(0) != MVT::i32)
      break;
    SelectDSAppendConsume(N, IntrID);
    return;
  }
  }

  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  unsigned Opcode;
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
    Opcode = AMDGPU::WWM;
    break;
  default:
    SelectCode(N);
    return;
  }

  SDValue Src = N->getOperand(1);
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    SelectDS_GWS(N, IntrID);
    return;
  default:
    break;
  }

  SelectCode(N);
}
bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                            unsigned &Mods) const {
  Mods = 0;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::FABS) {
    Mods |= SISrcMods::ABS;
    Src = Src.getOperand(0);
  }

  return true;
}
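
// Worked example for SelectVOP3ModsImpl above: fneg(fabs(x)) folds to Src = x
// with Mods = NEG | ABS; fabs(fneg(x)) only folds the outer fabs
// (Mods = ABS, Src = fneg(x)) because the FNEG check runs before the FABS
// check.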
bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
                                        SDValue &SrcMods) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods)) {
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  SelectVOP3Mods(In, Src, SrcMods);
  return isNoNanSrc(Src);
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods_f32(SDValue In, SDValue &Src,
                                            SDValue &SrcMods) const {
  if (In.getValueType() == MVT::f32)
    return SelectVOP3Mods(In, Src, SrcMods);
  Src = In;
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
  if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
    return false;

  Src = In;
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, SDValue &Clamp,
                                         SDValue &Omod) const {
  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
                                                   SDValue &SrcMods,
                                                   SDValue &Clamp,
                                                   SDValue &Omod) const {
  Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
                                         SDValue &Clamp, SDValue &Omod) const {
  Src = In;

  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return true;
}
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  unsigned Mods = 0;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::BUILD_VECTOR) {
    unsigned VecMods = Mods;

    SDValue Lo = stripBitcast(Src.getOperand(0));
    SDValue Hi = stripBitcast(Src.getOperand(1));

    if (Lo.getOpcode() == ISD::FNEG) {
      Lo = stripBitcast(Lo.getOperand(0));
      Mods ^= SISrcMods::NEG;
    }

    if (Hi.getOpcode() == ISD::FNEG) {
      Hi = stripBitcast(Hi.getOperand(0));
      Mods ^= SISrcMods::NEG_HI;
    }

    if (isExtractHiElt(Lo, Lo))
      Mods |= SISrcMods::OP_SEL_0;

    if (isExtractHiElt(Hi, Hi))
      Mods |= SISrcMods::OP_SEL_1;

    Lo = stripExtractLoElt(Lo);
    Hi = stripExtractLoElt(Hi);

    if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
      // Really a scalar input. Just select from the low half of the register
      // to avoid packing.

      Src = Lo;
      SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
      return true;
    }

    Mods = VecMods;
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}
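
// Note (summary): for packed v2f16 sources NEG and NEG_HI negate the low and
// high lanes independently, and OP_SEL_0/OP_SEL_1 pick the high half of the
// source register for the corresponding lane; a splat of one value takes the
// early exit above that simply reads the low half.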
bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods,
                                          SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp and op_sel
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3PMods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  Src = In;
  // FIXME: Handle op_sel
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods,
                                          SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3OpSel(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  // FIXME: Handle op_sel
  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src,
                                              SDValue &SrcMods,
                                              SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3OpSelMods(In, Src, SrcMods);
}
// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods) const {
  Mods = 0;
  SelectVOP3ModsImpl(In, Src, Mods);

  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
    assert(Src.getValueType() == MVT::f16);
    Src = stripBitcast(Src);

    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) == 0) {
      unsigned ModsTmp;
      SelectVOP3ModsImpl(Src, Src, ModsTmp);

      if ((ModsTmp & SISrcMods::NEG) != 0)
        Mods ^= SISrcMods::NEG;

      if ((ModsTmp & SISrcMods::ABS) != 0)
        Mods |= SISrcMods::ABS;
    }

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // fp16. If the source's op_sel is set, it picks the high half of the
    // source register.

    Mods |= SISrcMods::OP_SEL_1;
    if (isExtractHiElt(Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
    }

    return true;
  }

  return false;
}
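
// Informal example: an operand of the form fpext(extract_hi(v2f16 reg)) folds
// to the register itself with OP_SEL_1 (convert from f16) plus OP_SEL_0 (read
// the high half), which is what the mad/fma mix encodings expect.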
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
                                               SDValue &SrcMods) const {
  unsigned Mods = 0;
  SelectVOP3PMadMixModsImpl(In, Src, Mods);
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}
SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
  if (In.isUndef())
    return CurDAG->getUNDEF(MVT::i32);

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
  }

  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(
      C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
  }

  SDValue Src;
  if (isExtractHiElt(In, Src))
    return Src;

  return SDValue();
}
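
// Worked example for getHi16Elt above: the f16 constant 1.0 (bit pattern
// 0x3C00) comes back as the i32 constant 0x3C000000, i.e. already shifted
// into the high 16 bits.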
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
  assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);

  const SIRegisterInfo *SIRI =
    static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *SII =
    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());

    // If the register class is unknown, it could be an unknown
    // register class that needs to be an SGPR, e.g. an inline asm
    // constraint.
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass) {
      AllUsesAcceptSReg = false;
      SDNode *User = *U;
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        MCInstrDesc Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
                getOperandRegClass(*U, CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }

      // If "AllUsesAcceptSReg == false" we haven't succeeded in commuting the
      // current user. This means we have at least one use that strictly
      // requires a VGPR, so we will not attempt to commute other user
      // instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  return !AllUsesAcceptSReg && (Limit < 10);
}
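
// Note (summary): the use walk above is capped at 10 uses (Limit), so a node
// with more uses than that conservatively reports false even if every use
// could have accepted, or been commuted to accept, an SGPR operand.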
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  auto Ld = cast<LoadSDNode>(N);

  return Ld->getAlignment() >= 4 &&
         (((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
            Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
           !N->isDivergent()) ||
          (Subtarget->getScalarizeGlobalBehavior() &&
           Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
           !Ld->isVolatile() &&
           !N->isDivergent() &&
           static_cast<const SITargetLowering *>(
               getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
}
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
  const AMDGPUTargetLowering &Lowering =
      *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
  bool IsModified = false;
  do {
    IsModified = false;

    // Go over all selected nodes and try to fold them a bit more
    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
    while (Position != CurDAG->allnodes_end()) {
      SDNode *Node = &*Position++;
      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
      if (!MachineNode)
        continue;

      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
      if (ResNode != Node) {
        if (ResNode)
          ReplaceUses(Node, ResNode);
        IsModified = true;
      }
    }
    CurDAG->RemoveDeadNodes();
  } while (IsModified);
}
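
// Note (summary): PostISelFolding is reapplied until a complete sweep over the
// selected DAG makes no further changes, i.e. a simple fixed-point loop.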
bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<R600Subtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
  if (!N->readMem())
    return false;
  if (CbId == -1)
    return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;

  return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
}

bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
                                                       SDValue &IntPtr) {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
    IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
                                       true);
    return true;
  }
  return false;
}

bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
                                                       SDValue &BaseReg,
                                                       SDValue &Offset) {
  if (!isa<ConstantSDNode>(Addr)) {
    BaseReg = Addr;
    Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
    return true;
  }
  return false;
}
void R600DAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (Opc) {
  default: break;
  case AMDGPUISD::BUILD_VERTICAL_VECTOR:
  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    unsigned RegClassID;
    // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
    // that adds a 128 bits reg copy when going through TwoAddressInstructions
    // pass. We want to avoid 128 bits copies as much as possible because they
    // can't be bundled by our scheduler.
    switch (NumVectorElts) {
    case 2: RegClassID = R600::R600_Reg64RegClassID; break;
    case 4:
      if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
        RegClassID = R600::R600_Reg128VerticalRegClassID;
      else
        RegClassID = R600::R600_Reg128RegClassID;
      break;
    default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
    }
    SelectBuildVector(N, RegClassID);
    return;
  }
  }

  SelectCode(N);
}
bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}
bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) {
  ConstantSDNode *IMMOffset;

  if (Addr.getOpcode() == ISD::ADD
      && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
      && isInt<16>(IMMOffset->getZExtValue())) {

    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
                                       MVT::i32);
    return true;
  // If the pointer address is constant, we can move it to the offset field.
  } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
             && isInt<16>(IMMOffset->getZExtValue())) {
    Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
                                  SDLoc(CurDAG->getEntryNode()),
                                  R600::ZERO, MVT::i32);
    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
                                       MVT::i32);
    return true;
  }

  // Default case, no offset
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32