//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUInstrInfo.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Dominators.h"
#endif
#include "llvm/IR/Instruction.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"

#define DEBUG_TYPE "isel"

using namespace llvm;

namespace llvm {

class R600InstrInfo;

} // end namespace llvm

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {
static bool isNullConstantOrUndef(SDValue V) {
  if (V.isUndef())
    return true;

  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
  return Const != nullptr && Const->isNullValue();
}

static bool getConstantValue(SDValue N, uint32_t &Out) {
  // This is only used for packed vectors, where using 0 for undef should
  // always be good.
  if (N.isUndef()) {
    Out = 0;
    return true;
  }

  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
    Out = C->getAPIntValue().getSExtValue();
    return true;
  }

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
    Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
    return true;
  }

  return false;
}

// TODO: Handle undef as zero
static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
                                 bool Negate = false) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
  uint32_t LHSVal, RHSVal;
  if (getConstantValue(N->getOperand(0), LHSVal) &&
      getConstantValue(N->getOperand(1), RHSVal)) {
    SDLoc SL(N);
    uint32_t K = Negate ?
      (-LHSVal & 0xffff) | (-RHSVal << 16) :
      (LHSVal & 0xffff) | (RHSVal << 16);
    return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
                              DAG.getTargetConstant(K, SL, MVT::i32));
  }

  return nullptr;
}

static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
  return packConstantV2I16(N, DAG, true);
}
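// Worked example of the packing above: a v2i16 BUILD_VECTOR of the constants
// (1, 2) packs to K = (1 & 0xffff) | (2 << 16) = 0x00020001 and is emitted as
// a single S_MOV_B32; with Negate set the same operands pack to 0xfffeffff,
// i.e. the pair (-1, -2).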
/// AMDGPU specific code to select AMDGPU machine instructions for
/// SelectionDAG operations.
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
  // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
  // make the right decision when generating code for different targets.
  const GCNSubtarget *Subtarget;
  bool EnableLateStructurizeCFG;

public:
  explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
                              CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
    : SelectionDAGISel(*TM, OptLevel) {
    EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
  }
  ~AMDGPUDAGToDAGISel() override = default;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AMDGPUArgumentUsageInfo>();
    AU.addRequired<LegacyDivergenceAnalysis>();
#ifdef EXPENSIVE_CHECKS
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
#endif
    SelectionDAGISel::getAnalysisUsage(AU);
  }

  bool matchLoadD16FromBuildVector(SDNode *N) const;

  bool runOnMachineFunction(MachineFunction &MF) override;
  void PreprocessISelDAG() override;
  void Select(SDNode *N) override;
  StringRef getPassName() const override;
  void PostprocessISelDAG() override;

protected:
  void SelectBuildVector(SDNode *N, unsigned RegClassID);

private:
  std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
  bool isNoNanSrc(SDValue N) const;
  bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
  bool isNegInlineImmediate(const SDNode *N) const {
    return isInlineImmediate(N, true);
  }

  bool isVGPRImm(const SDNode *N) const;
  bool isUniformLoad(const SDNode *N) const;
  bool isUniformBr(const SDNode *N) const;

  MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;

  SDNode *glueCopyToM0LDSInit(SDNode *N) const;
  SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;

  const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
  virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
  virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
  bool isDSOffsetLegal(SDValue Base, unsigned Offset,
                       unsigned OffsetBits) const;
  bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
  bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                 SDValue &Offset1) const;
  bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                   SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                   SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
                   SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                         SDValue &SOffset, SDValue &Offset, SDValue &GLC,
                         SDValue &SLC, SDValue &TFE, SDValue &DLC,
                         SDValue &SWZ) const;
  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                         SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
                         SDValue &SLC) const;
  bool SelectMUBUFScratchOffen(SDNode *Parent,
                               SDValue Addr, SDValue &RSrc, SDValue &VAddr,
                               SDValue &SOffset, SDValue &ImmOffset) const;
  bool SelectMUBUFScratchOffset(SDNode *Parent,
                                SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                                SDValue &Offset) const;

  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
                         SDValue &Offset, SDValue &GLC, SDValue &SLC,
                         SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                         SDValue &Offset, SDValue &SLC) const;
  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                         SDValue &Offset) const;

  bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr,
                        SDValue &Offset, SDValue &SLC) const;
  bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr,
                              SDValue &Offset, SDValue &SLC) const;

  template <bool IsSigned>
  bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
                        SDValue &Offset, SDValue &SLC) const;

  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
                        bool &Imm) const;
  SDValue Expand32BitAddress(SDValue Addr) const;
  bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
                  bool &Imm) const;
  bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
  bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
  bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;

  bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3Mods_f32(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const;
  bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
  bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                       SDValue &Clamp, SDValue &Omod) const;
  bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                         SDValue &Clamp, SDValue &Omod) const;

  bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods,
                                 SDValue &Clamp,
                                 SDValue &Omod) const;

  bool SelectVOP3OMods(SDValue In, SDValue &Src,
                       SDValue &Clamp, SDValue &Omod) const;

  bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                        SDValue &Clamp) const;

  bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods,
                        SDValue &Clamp) const;

  bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                            SDValue &Clamp) const;
  bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
  bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;

  SDValue getHi16Elt(SDValue In) const;

  void SelectADD_SUB_I64(SDNode *N);
  void SelectAddcSubb(SDNode *N);
  void SelectUADDO_USUBO(SDNode *N);
  void SelectDIV_SCALE(SDNode *N);
  void SelectDIV_FMAS(SDNode *N);
  void SelectMAD_64_32(SDNode *N);
  void SelectFMA_W_CHAIN(SDNode *N);
  void SelectFMUL_W_CHAIN(SDNode *N);

  SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
                   uint32_t Offset, uint32_t Width);
  void SelectS_BFEFromShifts(SDNode *N);
  void SelectS_BFE(SDNode *N);
  bool isCBranchSCC(const SDNode *N) const;
  void SelectBRCOND(SDNode *N);
  void SelectFMAD_FMA(SDNode *N);
  void SelectATOMIC_CMP_SWAP(SDNode *N);
  void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
  void SelectDS_GWS(SDNode *N, unsigned IntrID);
  void SelectINTRINSIC_W_CHAIN(SDNode *N);
  void SelectINTRINSIC_WO_CHAIN(SDNode *N);
  void SelectINTRINSIC_VOID(SDNode *N);

protected:
  // Include the pieces autogenerated from the target description.
#include "AMDGPUGenDAGISel.inc"
};

class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
  const R600Subtarget *Subtarget;

  bool isConstantLoad(const MemSDNode *N, int cbID) const;
  bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
  bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
                                       SDValue &Offset);

public:
  explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
      AMDGPUDAGToDAGISel(TM, OptLevel) {}

  void Select(SDNode *N) override;

  bool SelectADDRIndirect(SDValue Addr, SDValue &Base,
                          SDValue &Offset) override;
  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                          SDValue &Offset) override;

  bool runOnMachineFunction(MachineFunction &MF) override;

  void PreprocessISelDAG() override {}

protected:
  // Include the pieces autogenerated from the target description.
#include "R600GenDAGISel.inc"
};
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);
  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}

// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)

/// This pass converts a legalized DAG into an AMDGPU-specific
// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
                                        CodeGenOpt::Level OptLevel) {
  return new AMDGPUDAGToDAGISel(TM, OptLevel);
}

/// This pass converts a legalized DAG into an R600-specific
// DAG, ready for instruction scheduling.
FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
                                      CodeGenOpt::Level OptLevel) {
  return new R600DAGToDAGISel(TM, OptLevel);
}
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree & DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo * LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
  if (TM.Options.NoNaNsFPMath)
    return true;

  // TODO: Move into isKnownNeverNaN
  if (N->getFlags().isDefined())
    return N->getFlags().hasNoNaNs();

  return CurDAG->isKnownNeverNaN(N);
}
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
                                           bool Negated) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (Negated) {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(-C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
  } else {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
  }

  return false;
}
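// For reference, "inline" immediates are the operand values the hardware can
// encode directly in the instruction word rather than as a separate literal:
// roughly the integers -16..64 plus a small set of floats such as +-0.5,
// +-1.0, +-2.0 and +-4.0. SIInstrInfo::isInlineConstant is the authoritative
// check; this note is only a reminder of what that set looks like.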
/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Register::isVirtualRegister(Reg)) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI
        = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
      return TRI->getPhysRegClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.OpInfo[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N),
                                 Val);

  SDValue Glue = M0.getValue(1);

  SmallVector<SDValue, 8> Ops;
  Ops.push_back(M0); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
  SDNode *Hi =
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                             CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
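// For example, Imm = 0x1122334455667788 becomes S_MOV_B32 0x55667788 (sub0)
// and S_MOV_B32 0x11223344 (sub1), combined into one 64-bit SGPR pair by the
// REG_SEQUENCE.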
static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
  switch (NumVectorElts) {
  case 1:
    return AMDGPU::SReg_32_XM0RegClassID;
  case 2:
    return AMDGPU::SReg_64RegClassID;
  case 3:
    return AMDGPU::SGPR_96RegClassID;
  case 4:
    return AMDGPU::SGPR_128RegClassID;
  case 5:
    return AMDGPU::SGPR_160RegClassID;
  case 8:
    return AMDGPU::SReg_256RegClassID;
  case 16:
    return AMDGPU::SReg_512RegClassID;
  case 32:
    return AMDGPU::SReg_1024RegClassID;
  }

  llvm_unreachable("invalid vector size");
}
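// Each vector element is 32 bits wide, so the element count maps directly to
// a register-class width: e.g. 2 elements -> a 64-bit pair (SReg_64) and
// 4 elements -> a 128-bit tuple (SGPR_128).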
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  if (isa<AtomicSDNode>(N) ||
      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
       Opc == ISD::ATOMIC_LOAD_FADD ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
    N = glueCopyToM0LDSInit(N);

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lower it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
    else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case ISD::LOAD:
  case ISD::STORE:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE: {
    N = glueCopyToM0LDSInit(N);
    break;
  }

  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
                            SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::DIV_FMAS: {
    SelectDIV_FMAS(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FMAD:
  case ISD::FMA:
    SelectFMAD_FMA(N);
    return;
  case AMDGPUISD::ATOMIC_CMP_SWAP:
    SelectATOMIC_CMP_SWAP(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  }

  SelectCode(N);
}
bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

StringRef AMDGPUDAGToDAGISel::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}
//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}
// FIXME: Should only handle addcarry/subcarry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo, 0),
    Sub0,
    SDValue(AddHi, 0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}
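// Put differently, a 64-bit scalar add/sub is split into two 32-bit halves:
//   lo = S_ADD_U32  LHS.sub0, RHS.sub0   // produces the carry
//   hi = S_ADDC_U32 LHS.sub1, RHS.sub1   // consumes it
// and the halves are recombined into an SReg_64 with a REG_SEQUENCE.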
void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
                                                 : AMDGPU::V_SUBB_U32_e64;
  CurDAG->SelectNodeTo(
      N, Opc, N->getVTList(),
      {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
  // carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  unsigned Opc = N->getOpcode() == ISD::UADDO ?
    AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;

  CurDAG->SelectNodeTo(
      N, Opc, N->getVTList(),
      {N->getOperand(0), N->getOperand(1),
       CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
}
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;

  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) {
  const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32;

  SDValue CarryIn = N->getOperand(3);
  // V_DIV_FMAS implicitly reads VCC.
  SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL,
                                     TRI->getVCC(), CarryIn, SDValue());

  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);

  Ops[8] = VCC;
  Ops[9] = VCC.getValue(1);

  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
                                         unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}
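// Concretely: OffsetBits == 16 (plain DS instructions) accepts offsets in the
// range 0..65535, while OffsetBits == 8 (the two per-component offsets of
// read2/write2) accepts only 0..255.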
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isUInt<16>(ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_I32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    if (isUInt<16>(CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}
// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned DWordOffset0 = C1->getZExtValue() / 4;
    unsigned DWordOffset1 = DWordOffset0 + 1;
    // (add n0, c0)
    if (isDSOffsetLegal(N0, DWordOffset1, 8)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned DWordOffset0 = C->getZExtValue() / 4;
      unsigned DWordOffset1 = DWordOffset0 + 1;

      if (isUInt<8>(DWordOffset0)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_I32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub
            = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
    unsigned DWordOffset1 = DWordOffset0 + 1;
    assert(4 * DWordOffset0 == CAddr->getZExtValue());

    if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero
        = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                 DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}
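// Example: a constant byte offset of 40 gives DWordOffset0 = 10 and
// DWordOffset1 = 11, the two dword-scaled offset fields of a ds_read2/ds_write2
// pair covering bytes 40..43 and 44..47.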
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
                                     SDValue &VAddr, SDValue &SOffset,
                                     SDValue &Offset, SDValue &Offen,
                                     SDValue &Idxen, SDValue &Addr64,
                                     SDValue &GLC, SDValue &SLC,
                                     SDValue &TFE, SDValue &DLC,
                                     SDValue &SWZ) const {
  // Subtarget prefers to use flat instruction
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  if (!GLC.getNode())
    GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
  if (!SLC.getNode())
    SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
  TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
  DLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
    return true;
  }

  if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset, SDValue &GLC,
                                           SDValue &SLC, SDValue &TFE,
                                           SDValue &DLC, SDValue &SWZ) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
                   GLC, SLC, TFE, DLC, SWZ))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset,
                                           SDValue &SLC) const {
  SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
  SDValue GLC, TFE, DLC, SWZ;

  return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ);
}
static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
  return PSV && PSV->isStack();
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (auto FI = dyn_cast<FrameIndexSDNode>(N)) {
    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
                                              FI->getValueType(0));

    // If we can resolve this to a frame index access, this will be relative to
    // either the stack or frame pointer SGPR.
    return std::make_pair(
        TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32));
  }

  // If we don't know this private access is a local stack object, it needs to
  // be relative to the entry point's scratch wave offset register.
  return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
                                               MVT::i32));
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {
  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned Imm = CAddr->getZExtValue();

    SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
    MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                        DL, MVT::i32, HighBits);
    VAddr = SDValue(MovHighBits, 0);

    // In a call sequence, stores to the argument stack area are relative to the
    // stack pointer.
    const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
    unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
      Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();

    SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
    ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
    return true;
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  return true;
}
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
  if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
    return false;

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
  unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
    Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();

  // FIXME: Get from MachinePointerInfo? We should only be using the frame
  // offset if we know this is in a call sequence.
  SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
  return true;
}
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset,
                                           SDValue &GLC, SDValue &SLC,
                                           SDValue &TFE, SDValue &DLC,
                                           SDValue &SWZ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
                   GLC, SLC, TFE, DLC, SWZ))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    APInt::getAllOnesValue(32).getZExtValue(); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &Soffset, SDValue &Offset) const {
  SDValue GLC, SLC, TFE, DLC, SWZ;

  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &Soffset, SDValue &Offset,
                                           SDValue &SLC) const {
  SDValue GLC, TFE, DLC, SWZ;

  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
}
template <bool IsSigned>
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
                                          SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset,
                                          SDValue &SLC) const {
  return static_cast<const SITargetLowering*>(getTargetLowering())->
    SelectFlatOffset(IsSigned, *CurDAG, N, Addr, VAddr, Offset, SLC);
}

bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N,
                                          SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset,
                                          SDValue &SLC) const {
  return SelectFlatOffset<false>(N, Addr, VAddr, Offset, SLC);
}

bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N,
                                                SDValue Addr,
                                                SDValue &VAddr,
                                                SDValue &Offset,
                                                SDValue &SLC) const {
  return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC);
}
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                          SDValue &Offset, bool &Imm) const {
  // FIXME: Handle non-constant offsets.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
  if (!C)
    return false;

  SDLoc SL(ByteOffsetNode);
  GCNSubtarget::Generation Gen = Subtarget->getGeneration();
  int64_t ByteOffset = C->getSExtValue();
  int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset);

  if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) {
    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
    Imm = true;
    return true;
  }

  if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset))
    return false;

  if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) {
    // 32-bit Immediates are supported on Sea Islands.
    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
  } else {
    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
    Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32,
                                            C32Bit), 0);
  }
  Imm = false;
  return true;
}
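// Note that getSMRDEncodedOffset() converts the byte offset into whatever unit
// the subtarget's s_load immediate field expects (dword-scaled on SI/CI,
// byte-scaled on VI and later), so EncodedOffset and ByteOffset may differ.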
SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
  if (Addr.getValueType() != MVT::i32)
    return Addr;

  // Zero-extend a 32-bit address.
  SDLoc SL(Addr);

  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned AddrHiVal = Info->get32BitAddressHighBits();
  SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);

  const SDValue Ops[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
    Addr,
    CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
    SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
            0),
    CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
  };

  return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
                                        Ops), 0);
}
bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                    SDValue &Offset, bool &Imm) const {
  SDLoc SL(Addr);

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if ((Addr.getValueType() != MVT::i32 ||
       Addr->getFlags().hasNoUnsignedWrap()) &&
      CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    if (SelectSMRDOffset(N1, Offset, Imm)) {
      SBase = Expand32BitAddress(N0);
      return true;
    }
  }
  SBase = Expand32BitAddress(Addr);
  Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
  Imm = false;
  return true;
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
                                       SDValue &Offset) const {
  bool Imm;
  return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
}

bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                         SDValue &Offset) const {
  if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
    return false;

  bool Imm;
  if (!SelectSMRD(Addr, SBase, Offset, Imm))
    return false;

  return !Imm && isa<ConstantSDNode>(Offset);
}

bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
                                        SDValue &Offset) const {
  bool Imm;
  return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
         !isa<ConstantSDNode>(Offset);
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
                                             SDValue &Offset) const {
  bool Imm;
  return SelectSMRDOffset(Addr, Offset, Imm) && Imm;
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
                                               SDValue &Offset) const {
  if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
    return false;

  bool Imm;
  if (!SelectSMRDOffset(Addr, Offset, Imm))
    return false;

  return !Imm && isa<ConstantSDNode>(Offset);
}
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                            SDValue &Base,
                                            SDValue &Offset) const {
  SDLoc DL(Index);

  if (CurDAG->isBaseWithConstantOffset(Index)) {
    SDValue N0 = Index.getOperand(0);
    SDValue N1 = Index.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);

    // (add n0, c0)
    // Don't peel off the offset (c0) if doing so could possibly lead
    // the base (n0) to be negative.
    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
      return true;
    }
  }

  if (isa<ConstantSDNode>(Index))
    return false;

  Base = Index;
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}
SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
                                     SDValue Val, uint32_t Offset,
                                     uint32_t Width) {
  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
  uint32_t PackedVal = Offset | (Width << 16);
  SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);

  return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
}
1801 void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode
*N
) {
1802 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
1803 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
1804 // Predicate: 0 < b <= c < 32
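  // For example, with b = 8 and c = 24, "(a << 8) srl 24" extracts bits
  // [23:16] of a and becomes "BFE_U32 a, 16, 8" (offset c-b, width 32-c).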

  const SDValue &Shl = N->getOperand(0);
  ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));

  if (B && C) {
    uint32_t BVal = B->getZExtValue();
    uint32_t CVal = C->getZExtValue();

    if (0 < BVal && BVal <= CVal && CVal < 32) {
      bool Signed = N->getOpcode() == ISD::SRA;
      unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

      ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
                              32 - CVal));
      return;
    }
  }
  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
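      // For example, "(x srl 4) & 0xff" becomes "BFE_U32 x, 4, 8", since
      // popcount(0xff) == 8.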
      const SDValue &Srl = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = countPopulation(MaskVal);

          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
                                  Srl.getOperand(0), ShiftVal, WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(0).getOpcode() == ISD::AND) {
      // "(a & mask) srl b" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
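      // For example, "(x & 0xff0) srl 4" becomes "BFE_U32 x, 4, 8", since
      // 0xff0 >> 4 == 0xff is a mask of width 8.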
      const SDValue &And = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = countPopulation(MaskVal);

          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
                                  And.getOperand(0), ShiftVal, WidthVal));
          return;
        }
      }
    } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  case ISD::SIGN_EXTEND_INREG: {
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (!Amt)
      break;

    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
    ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
                            Amt->getZExtValue(), Width));
    return;
  }
  }

  SelectCode(N);
}
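
// Returns true if the BRCOND condition can be selected as an SCC branch,
// i.e. the compare can run on the scalar unit: 32-bit operands, or 64-bit
// eq/ne when the subtarget has scalar 64-bit compares.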
bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
  assert(N->getOpcode() == ISD::BRCOND);
  if (!N->hasOneUse())
    return false;

  SDValue Cond = N->getOperand(1);
  if (Cond.getOpcode() == ISD::CopyToReg)
    Cond = Cond.getOperand(2);

  if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
    return false;

  MVT VT = Cond.getOperand(0).getSimpleValueType();

  if (VT == MVT::i32)
    return true;

  if (VT == MVT::i64) {
    auto ST = static_cast<const GCNSubtarget *>(Subtarget);

    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
  }

  return false;
}

void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  if (Cond.isUndef()) {
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
  unsigned CondReg = UseSCCBr ? (unsigned)AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (!UseSCCBr) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND).
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when it is unnecessary. But it would be better to add a
    // separate pass after SIFixSGPRCopies to do the unnecessary S_AND removal,
    // so it catches both cases.
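    //
    // In effect, the divergent case emits something like:
    //   s_and_b64 vcc, exec, <cond>   ; s_and_b32 with exec_lo on wave32
    //   s_cbranch_vccnz <target>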
    Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
                                                         : AMDGPU::S_AND_B64,
                                          SL, MVT::i1,
                                          CurDAG->getRegister(
                                              ST->isWave32() ? AMDGPU::EXEC_LO
                                                             : AMDGPU::EXEC,
                                              MVT::i1),
                                          Cond),
                   0);
  }

  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}

void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
  MVT VT = N->getSimpleValueType(0);
  bool IsFMA = N->getOpcode() == ISD::FMA;
  if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
                         !Subtarget->hasFmaMixInsts()) ||
      ((IsFMA && Subtarget->hasMadMixInsts()) ||
       (!IsFMA && Subtarget->hasFmaMixInsts()))) {
    SelectCode(N);
    return;
  }

  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  SDValue Src2 = N->getOperand(2);
  unsigned Src0Mods, Src1Mods, Src2Mods;

  // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an
  // operand using the conversion from f16.
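  // For example, fma (fpext x:f16), y:f32, z:f32 can fold the f16 source
  // directly into v_fma_mix_f32 instead of emitting a separate conversion.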
  bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
  bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
  bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);

  assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
         "fmad selected with denormals enabled");
  // TODO: We can select this with f32 denormals enabled if all the sources are
  // converted from f16 (in which case fmad isn't legal).

  if (Sel0 || Sel1 || Sel2) {
    // For dummy operands.
    SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
    SDValue Ops[] = {
      CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
      CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
      CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
      CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
      Zero, Zero
    };

    CurDAG->SelectNodeTo(N,
                         IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
                         MVT::f32, Ops);
  } else {
    SelectCode(N);
  }
}

// This is here because there isn't a way to use the generated sub0_sub1 as the
// subreg index to EXTRACT_SUBREG in tablegen.
void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
  MemSDNode *Mem = cast<MemSDNode>(N);
  unsigned AS = Mem->getAddressSpace();
  if (AS == AMDGPUAS::FLAT_ADDRESS) {
    SelectCode(N);
    return;
  }

  MVT VT = N->getSimpleValueType(0);
  bool Is32 = (VT == MVT::i32);
  SDLoc SL(N);

  MachineSDNode *CmpSwap = nullptr;
  if (Subtarget->hasAddr64()) {
    SDValue SRsrc, VAddr, SOffset, Offset, SLC;

    if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset,
                          SLC)) {
      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
      SDValue CmpVal = Mem->getOperand(2);

      // XXX - Do we care about glue operands?

      SDValue Ops[] = {
        CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain()
      };

      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
    }
  }

  if (!CmpSwap) {
    SDValue SRsrc, SOffset, Offset, SLC;
    if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;

      SDValue CmpVal = Mem->getOperand(2);
      SDValue Ops[] = {
        CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain()
      };

      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
    }
  }

  if (!CmpSwap) {
    SelectCode(N);
    return;
  }

  MachineMemOperand *MMO = Mem->getMemOperand();
  CurDAG->setNodeMemRefs(CmpSwap, {MMO});

  unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  SDValue Extract
    = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));

  ReplaceUses(SDValue(N, 0), Extract);
  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
  CurDAG->RemoveDeadNode(N);
}

void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
  // be copied to an SGPR with readfirstlane.
  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  SDValue Chain = N->getOperand(0);
  SDValue Ptr = N->getOperand(2);
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();
  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  SDValue Offset;
  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
    SDValue PtrBase = Ptr.getOperand(0);
    SDValue PtrOffset = Ptr.getOperand(1);

    const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
      N = glueCopyToM0(N, PtrBase);
      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
    }
  }

  if (!Offset) {
    N = glueCopyToM0(N, Ptr);
    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
  }

  SDValue Ops[] = {
    Offset,
    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
    Chain,
    N->getOperand(N->getNumOperands() - 1) // New glue
  };

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !Subtarget->hasGWSSemaReleaseAll()) {
    SelectCode(N);
    return;
  }

  // Chain, intrinsic ID, vsrc, offset
  const bool HasVSrc = N->getNumOperands() == 4;
  assert(HasVSrc || N->getNumOperands() == 3);

  SDLoc SL(N);
  SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
  int ImmOffset = 0;
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();

  // Don't worry if the offset ends up in a VGPR. Only one lane will have
  // effect, so SIFixSGPRCopies will validly insert readfirstlane.

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
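  // The non-constant path below therefore shifts the offset left by 16 so
  // that it lands in bits [21:16] of m0, matching that formula.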
  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.
    glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
    ImmOffset = ConstOffset->getZExtValue();
  } else {
    if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
      ImmOffset = BaseOffset.getConstantOperandVal(1);
      BaseOffset = BaseOffset.getOperand(0);
    }

    // Prefer to do the shift in an SGPR since it should be possible to use m0
    // as the result directly. If it's already an SGPR, it will be eliminated
    // later.
    SDNode *SGPROffset
      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
                               BaseOffset);
    // Shift to offset in m0
    SDNode *M0Base
      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
                               SDValue(SGPROffset, 0),
                               CurDAG->getTargetConstant(16, SL, MVT::i32));
    glueCopyToM0(N, SDValue(M0Base, 0));
  }

  SDValue Chain = N->getOperand(0);
  SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);

  // TODO: Can this just be removed from the instruction?
  SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);

  const unsigned Opc = gwsIntrinToOpcode(IntrID);
  SmallVector<SDValue, 5> Ops;
  if (HasVSrc)
    Ops.push_back(N->getOperand(2));
  Ops.push_back(OffsetField);
  Ops.push_back(GDS);
  Ops.push_back(Chain);

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    if (N->getValueType(0) != MVT::i32)
      break;
    SelectDSAppendConsume(N, IntrID);
    return;
  }
  }

  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  unsigned Opcode;
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
    Opcode = AMDGPU::WWM;
    break;
  default:
    SelectCode(N);
    return;
  }

  SDValue Src = N->getOperand(1);
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    SelectDS_GWS(N, IntrID);
    return;
  default:
    break;
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                            unsigned &Mods) const {
  Mods = 0;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::FABS) {
    Mods |= SISrcMods::ABS;
    Src = Src.getOperand(0);
  }

  return Mods != 0;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
                                        SDValue &SrcMods) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods)) {
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  SelectVOP3Mods(In, Src, SrcMods);
  return isNoNanSrc(Src);
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods_f32(SDValue In, SDValue &Src,
                                            SDValue &SrcMods) const {
  if (In.getValueType() == MVT::f32)
    return SelectVOP3Mods(In, Src, SrcMods);
  Src = In;
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
  if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
    return false;

  Src = In;
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, SDValue &Clamp,
                                         SDValue &Omod) const {
  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
                                                   SDValue &SrcMods,
                                                   SDValue &Clamp,
                                                   SDValue &Omod) const {
  Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
                                         SDValue &Clamp, SDValue &Omod) const {
  Src = In;

  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  unsigned Mods = 0;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::BUILD_VECTOR) {
    unsigned VecMods = Mods;
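
    // An fneg of the low element folds into NEG and an fneg of the high
    // element into NEG_HI; extracting the high half of a source sets the
    // corresponding op_sel bit instead.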
    SDValue Lo = stripBitcast(Src.getOperand(0));
    SDValue Hi = stripBitcast(Src.getOperand(1));

    if (Lo.getOpcode() == ISD::FNEG) {
      Lo = stripBitcast(Lo.getOperand(0));
      Mods ^= SISrcMods::NEG;
    }

    if (Hi.getOpcode() == ISD::FNEG) {
      Hi = stripBitcast(Hi.getOperand(0));
      Mods ^= SISrcMods::NEG_HI;
    }

    if (isExtractHiElt(Lo, Lo))
      Mods |= SISrcMods::OP_SEL_0;

    if (isExtractHiElt(Hi, Hi))
      Mods |= SISrcMods::OP_SEL_1;

    Lo = stripExtractLoElt(Lo);
    Hi = stripExtractLoElt(Hi);

    if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
      // Really a scalar input. Just select from the low half of the register
      // to avoid packing.

      Src = Lo;
      SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
      return true;
    }

    Mods = VecMods;
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods,
                                          SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp and op_sel
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3PMods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  Src = In;
  // FIXME: Handle op_sel
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods,
                                          SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3OpSel(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  // FIXME: Handle op_sel
  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src,
                                              SDValue &SrcMods,
                                              SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3OpSelMods(In, Src, SrcMods);
}

// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods) const {
  Mods = 0;
  SelectVOP3ModsImpl(In, Src, Mods);

  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
    assert(Src.getValueType() == MVT::f16);
    Src = stripBitcast(Src);

    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) == 0) {
      unsigned ModsTmp;
      SelectVOP3ModsImpl(Src, Src, ModsTmp);

      if ((ModsTmp & SISrcMods::NEG) != 0)
        Mods ^= SISrcMods::NEG;

      if ((ModsTmp & SISrcMods::ABS) != 0)
        Mods |= SISrcMods::ABS;
    }

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // fp16. If the source's op_sel is set, it picks the high half of the
    // source register.
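    // For example, fpext (extract_hi x:v2f16) selects OP_SEL_1 | OP_SEL_0:
    // convert from f16, taking the high half of the source register.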
    Mods |= SISrcMods::OP_SEL_1;
    if (isExtractHiElt(Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
    }

    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
                                               SDValue &SrcMods) const {
  unsigned Mods = 0;
  SelectVOP3PMadMixModsImpl(In, Src, Mods);
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
  if (In.isUndef())
    return CurDAG->getUNDEF(MVT::i32);

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
  }

  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(
      C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
  }

  SDValue Src;
  if (isExtractHiElt(In, Src))
    return Src;

  return SDValue();
}
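
// Returns true if the immediate must be materialized in a VGPR, i.e. at least
// one of its uses cannot accept an SGPR operand even after trying to commute
// the using instruction.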
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
  assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);

  const SIRegisterInfo *SIRI =
    static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo * SII =
    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
    Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());

    // If the register class is unknown, it could be an unknown
    // register class that needs to be an SGPR, e.g. an inline asm
    // constraint
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass) {
      AllUsesAcceptSReg = false;
      SDNode * User = *U;
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        MCInstrDesc Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
              getOperandRegClass(*U, CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }

      // If "AllUsesAcceptSReg == false" so far, we haven't succeeded in
      // commuting the current user. This means we have at least one use
      // that strictly requires a VGPR. Thus, we will not attempt to commute
      // other user instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  return !AllUsesAcceptSReg && (Limit < 10);
}

bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  auto Ld = cast<LoadSDNode>(N);

  return Ld->getAlignment() >= 4 &&
         (((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
            Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
           !N->isDivergent()) ||
          (Subtarget->getScalarizeGlobalBehavior() &&
           Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
           !Ld->isVolatile() && !N->isDivergent() &&
           static_cast<const SITargetLowering *>(
               getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
}

void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
  const AMDGPUTargetLowering& Lowering =
    *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
  bool IsModified = false;
  do {
    IsModified = false;

    // Go over all selected nodes and try to fold them a bit more
    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
    while (Position != CurDAG->allnodes_end()) {
      SDNode *Node = &*Position++;
      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
      if (!MachineNode)
        continue;

      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
      if (ResNode != Node) {
        if (ResNode)
          ReplaceUses(Node, ResNode);
        IsModified = true;
      }
    }
    CurDAG->RemoveDeadNodes();
  } while (IsModified);
}

bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<R600Subtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
  if (CbId == -1)
    return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;

  return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
}

bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
                                                       SDValue &IntPtr) {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
    IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
                                       true);
    return true;
  }
  return false;
}

bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
                                                       SDValue &BaseReg,
                                                       SDValue &Offset) {
  if (!isa<ConstantSDNode>(Addr)) {
    BaseReg = Addr;
    Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
    return true;
  }
  return false;
}

void R600DAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return;   // Already selected.
  }

  switch (Opc) {
  default: break;
  case AMDGPUISD::BUILD_VERTICAL_VECTOR:
  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    unsigned RegClassID;
    // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
    // that adds a 128 bits reg copy when going through TwoAddressInstructions
    // pass. We want to avoid 128 bits copies as much as possible because they
    // can't be bundled by our scheduler.
    switch (NumVectorElts) {
    case 2: RegClassID = R600::R600_Reg64RegClassID; break;
    case 4:
      if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
        RegClassID = R600::R600_Reg128VerticalRegClassID;
      else
        RegClassID = R600::R600_Reg128RegClassID;
      break;
    default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
    }
    SelectBuildVector(N, RegClassID);
    return;
  }
  }

  SelectCode(N);
}

bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) {
  ConstantSDNode *IMMOffset;

  if (Addr.getOpcode() == ISD::ADD
      && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
      && isInt<16>(IMMOffset->getZExtValue())) {

    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
                                       MVT::i32);
    return true;
  // If the pointer address is constant, we can move it to the offset field.
  } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
             && isInt<16>(IMMOffset->getZExtValue())) {
    Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
                                  SDLoc(CurDAG->getEntryNode()),
                                  R600::ZERO, MVT::i32);
    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
                                       MVT::i32);
    return true;
  }

  // Default case, no offset
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);