1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// Custom DAG lowering for SI
13 //===----------------------------------------------------------------------===//
17 #define _USE_MATH_DEFINES
20 #include "SIISelLowering.h"
22 #include "AMDGPUIntrinsicInfo.h"
23 #include "AMDGPUSubtarget.h"
24 #include "AMDGPUTargetMachine.h"
25 #include "SIDefines.h"
26 #include "SIInstrInfo.h"
27 #include "SIMachineFunctionInfo.h"
28 #include "SIRegisterInfo.h"
29 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
30 #include "Utils/AMDGPUBaseInfo.h"
31 #include "llvm/ADT/APFloat.h"
32 #include "llvm/ADT/APInt.h"
33 #include "llvm/ADT/ArrayRef.h"
34 #include "llvm/ADT/BitVector.h"
35 #include "llvm/ADT/SmallVector.h"
36 #include "llvm/ADT/Statistic.h"
37 #include "llvm/ADT/StringRef.h"
38 #include "llvm/ADT/StringSwitch.h"
39 #include "llvm/ADT/Twine.h"
40 #include "llvm/CodeGen/Analysis.h"
41 #include "llvm/CodeGen/CallingConvLower.h"
42 #include "llvm/CodeGen/DAGCombine.h"
43 #include "llvm/CodeGen/ISDOpcodes.h"
44 #include "llvm/CodeGen/MachineBasicBlock.h"
45 #include "llvm/CodeGen/MachineFrameInfo.h"
46 #include "llvm/CodeGen/MachineFunction.h"
47 #include "llvm/CodeGen/MachineInstr.h"
48 #include "llvm/CodeGen/MachineInstrBuilder.h"
49 #include "llvm/CodeGen/MachineMemOperand.h"
50 #include "llvm/CodeGen/MachineModuleInfo.h"
51 #include "llvm/CodeGen/MachineOperand.h"
52 #include "llvm/CodeGen/MachineRegisterInfo.h"
53 #include "llvm/CodeGen/SelectionDAG.h"
54 #include "llvm/CodeGen/SelectionDAGNodes.h"
55 #include "llvm/CodeGen/TargetCallingConv.h"
56 #include "llvm/CodeGen/TargetRegisterInfo.h"
57 #include "llvm/CodeGen/ValueTypes.h"
58 #include "llvm/IR/Constants.h"
59 #include "llvm/IR/DataLayout.h"
60 #include "llvm/IR/DebugLoc.h"
61 #include "llvm/IR/DerivedTypes.h"
62 #include "llvm/IR/DiagnosticInfo.h"
63 #include "llvm/IR/Function.h"
64 #include "llvm/IR/GlobalValue.h"
65 #include "llvm/IR/InstrTypes.h"
66 #include "llvm/IR/Instruction.h"
67 #include "llvm/IR/Instructions.h"
68 #include "llvm/IR/IntrinsicInst.h"
69 #include "llvm/IR/Type.h"
70 #include "llvm/Support/Casting.h"
71 #include "llvm/Support/CodeGen.h"
72 #include "llvm/Support/CommandLine.h"
73 #include "llvm/Support/Compiler.h"
74 #include "llvm/Support/ErrorHandling.h"
75 #include "llvm/Support/KnownBits.h"
76 #include "llvm/Support/MachineValueType.h"
77 #include "llvm/Support/MathExtras.h"
78 #include "llvm/Target/TargetOptions.h"
89 #define DEBUG_TYPE "si-lower"
91 STATISTIC(NumTailCalls
, "Number of tail calls");
93 static cl::opt
<bool> EnableVGPRIndexMode(
94 "amdgpu-vgpr-index-mode",
95 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
98 static cl::opt
<unsigned> AssumeFrameIndexHighZeroBits(
99 "amdgpu-frame-index-zero-bits",
100 cl::desc("High bits of frame index assumed to be zero"),
104 static unsigned findFirstFreeSGPR(CCState
&CCInfo
) {
105 unsigned NumSGPRs
= AMDGPU::SGPR_32RegClass
.getNumRegs();
106 for (unsigned Reg
= 0; Reg
< NumSGPRs
; ++Reg
) {
107 if (!CCInfo
.isAllocated(AMDGPU::SGPR0
+ Reg
)) {
108 return AMDGPU::SGPR0
+ Reg
;
111 llvm_unreachable("Cannot allocate sgpr");
114 SITargetLowering::SITargetLowering(const TargetMachine
&TM
,
115 const GCNSubtarget
&STI
)
116 : AMDGPUTargetLowering(TM
, STI
),
118 addRegisterClass(MVT::i1
, &AMDGPU::VReg_1RegClass
);
119 addRegisterClass(MVT::i64
, &AMDGPU::SReg_64RegClass
);
121 addRegisterClass(MVT::i32
, &AMDGPU::SReg_32_XM0RegClass
);
122 addRegisterClass(MVT::f32
, &AMDGPU::VGPR_32RegClass
);
124 addRegisterClass(MVT::f64
, &AMDGPU::VReg_64RegClass
);
125 addRegisterClass(MVT::v2i32
, &AMDGPU::SReg_64RegClass
);
126 addRegisterClass(MVT::v2f32
, &AMDGPU::VReg_64RegClass
);
128 addRegisterClass(MVT::v2i64
, &AMDGPU::SReg_128RegClass
);
129 addRegisterClass(MVT::v2f64
, &AMDGPU::SReg_128RegClass
);
131 addRegisterClass(MVT::v4i32
, &AMDGPU::SReg_128RegClass
);
132 addRegisterClass(MVT::v4f32
, &AMDGPU::VReg_128RegClass
);
134 addRegisterClass(MVT::v8i32
, &AMDGPU::SReg_256RegClass
);
135 addRegisterClass(MVT::v8f32
, &AMDGPU::VReg_256RegClass
);
137 addRegisterClass(MVT::v16i32
, &AMDGPU::SReg_512RegClass
);
138 addRegisterClass(MVT::v16f32
, &AMDGPU::VReg_512RegClass
);
140 if (Subtarget
->has16BitInsts()) {
141 addRegisterClass(MVT::i16
, &AMDGPU::SReg_32_XM0RegClass
);
142 addRegisterClass(MVT::f16
, &AMDGPU::SReg_32_XM0RegClass
);
144 // Unless there are also VOP3P operations, not operations are really legal.
145 addRegisterClass(MVT::v2i16
, &AMDGPU::SReg_32_XM0RegClass
);
146 addRegisterClass(MVT::v2f16
, &AMDGPU::SReg_32_XM0RegClass
);
147 addRegisterClass(MVT::v4i16
, &AMDGPU::SReg_64RegClass
);
148 addRegisterClass(MVT::v4f16
, &AMDGPU::SReg_64RegClass
);
151 computeRegisterProperties(Subtarget
->getRegisterInfo());
153 // We need to custom lower vector stores from local memory
154 setOperationAction(ISD::LOAD
, MVT::v2i32
, Custom
);
155 setOperationAction(ISD::LOAD
, MVT::v4i32
, Custom
);
156 setOperationAction(ISD::LOAD
, MVT::v8i32
, Custom
);
157 setOperationAction(ISD::LOAD
, MVT::v16i32
, Custom
);
158 setOperationAction(ISD::LOAD
, MVT::i1
, Custom
);
159 setOperationAction(ISD::LOAD
, MVT::v32i32
, Custom
);
161 setOperationAction(ISD::STORE
, MVT::v2i32
, Custom
);
162 setOperationAction(ISD::STORE
, MVT::v4i32
, Custom
);
163 setOperationAction(ISD::STORE
, MVT::v8i32
, Custom
);
164 setOperationAction(ISD::STORE
, MVT::v16i32
, Custom
);
165 setOperationAction(ISD::STORE
, MVT::i1
, Custom
);
166 setOperationAction(ISD::STORE
, MVT::v32i32
, Custom
);
168 setTruncStoreAction(MVT::v2i32
, MVT::v2i16
, Expand
);
169 setTruncStoreAction(MVT::v4i32
, MVT::v4i16
, Expand
);
170 setTruncStoreAction(MVT::v8i32
, MVT::v8i16
, Expand
);
171 setTruncStoreAction(MVT::v16i32
, MVT::v16i16
, Expand
);
172 setTruncStoreAction(MVT::v32i32
, MVT::v32i16
, Expand
);
173 setTruncStoreAction(MVT::v2i32
, MVT::v2i8
, Expand
);
174 setTruncStoreAction(MVT::v4i32
, MVT::v4i8
, Expand
);
175 setTruncStoreAction(MVT::v8i32
, MVT::v8i8
, Expand
);
176 setTruncStoreAction(MVT::v16i32
, MVT::v16i8
, Expand
);
177 setTruncStoreAction(MVT::v32i32
, MVT::v32i8
, Expand
);
179 setOperationAction(ISD::GlobalAddress
, MVT::i32
, Custom
);
180 setOperationAction(ISD::GlobalAddress
, MVT::i64
, Custom
);
182 setOperationAction(ISD::SELECT
, MVT::i1
, Promote
);
183 setOperationAction(ISD::SELECT
, MVT::i64
, Custom
);
184 setOperationAction(ISD::SELECT
, MVT::f64
, Promote
);
185 AddPromotedToType(ISD::SELECT
, MVT::f64
, MVT::i64
);
187 setOperationAction(ISD::SELECT_CC
, MVT::f32
, Expand
);
188 setOperationAction(ISD::SELECT_CC
, MVT::i32
, Expand
);
189 setOperationAction(ISD::SELECT_CC
, MVT::i64
, Expand
);
190 setOperationAction(ISD::SELECT_CC
, MVT::f64
, Expand
);
191 setOperationAction(ISD::SELECT_CC
, MVT::i1
, Expand
);
193 setOperationAction(ISD::SETCC
, MVT::i1
, Promote
);
194 setOperationAction(ISD::SETCC
, MVT::v2i1
, Expand
);
195 setOperationAction(ISD::SETCC
, MVT::v4i1
, Expand
);
196 AddPromotedToType(ISD::SETCC
, MVT::i1
, MVT::i32
);
198 setOperationAction(ISD::TRUNCATE
, MVT::v2i32
, Expand
);
199 setOperationAction(ISD::FP_ROUND
, MVT::v2f32
, Expand
);
201 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v2i1
, Custom
);
202 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v4i1
, Custom
);
203 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v2i8
, Custom
);
204 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v4i8
, Custom
);
205 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v2i16
, Custom
);
206 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v4i16
, Custom
);
207 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::Other
, Custom
);
209 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::Other
, Custom
);
210 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::f32
, Custom
);
211 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::v4f32
, Custom
);
212 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::i16
, Custom
);
213 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::f16
, Custom
);
214 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::v2i16
, Custom
);
215 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::v2f16
, Custom
);
217 setOperationAction(ISD::INTRINSIC_W_CHAIN
, MVT::v2f16
, Custom
);
218 setOperationAction(ISD::INTRINSIC_W_CHAIN
, MVT::v4f16
, Custom
);
219 setOperationAction(ISD::INTRINSIC_W_CHAIN
, MVT::Other
, Custom
);
221 setOperationAction(ISD::INTRINSIC_VOID
, MVT::Other
, Custom
);
222 setOperationAction(ISD::INTRINSIC_VOID
, MVT::v2i16
, Custom
);
223 setOperationAction(ISD::INTRINSIC_VOID
, MVT::v2f16
, Custom
);
224 setOperationAction(ISD::INTRINSIC_VOID
, MVT::v4f16
, Custom
);
226 setOperationAction(ISD::BRCOND
, MVT::Other
, Custom
);
227 setOperationAction(ISD::BR_CC
, MVT::i1
, Expand
);
228 setOperationAction(ISD::BR_CC
, MVT::i32
, Expand
);
229 setOperationAction(ISD::BR_CC
, MVT::i64
, Expand
);
230 setOperationAction(ISD::BR_CC
, MVT::f32
, Expand
);
231 setOperationAction(ISD::BR_CC
, MVT::f64
, Expand
);
233 setOperationAction(ISD::UADDO
, MVT::i32
, Legal
);
234 setOperationAction(ISD::USUBO
, MVT::i32
, Legal
);
236 setOperationAction(ISD::ADDCARRY
, MVT::i32
, Legal
);
237 setOperationAction(ISD::SUBCARRY
, MVT::i32
, Legal
);
239 setOperationAction(ISD::SHL_PARTS
, MVT::i64
, Expand
);
240 setOperationAction(ISD::SRA_PARTS
, MVT::i64
, Expand
);
241 setOperationAction(ISD::SRL_PARTS
, MVT::i64
, Expand
);
244 setOperationAction(ISD::ADDCARRY
, MVT::i64
, Legal
);
245 setOperationAction(ISD::SUBCARRY
, MVT::i64
, Legal
);
248 // We only support LOAD/STORE and vector manipulation ops for vectors
249 // with > 4 elements.
250 for (MVT VT
: {MVT::v8i32
, MVT::v8f32
, MVT::v16i32
, MVT::v16f32
,
251 MVT::v2i64
, MVT::v2f64
, MVT::v4i16
, MVT::v4f16
, MVT::v32i32
}) {
252 for (unsigned Op
= 0; Op
< ISD::BUILTIN_OP_END
; ++Op
) {
256 case ISD::BUILD_VECTOR
:
258 case ISD::EXTRACT_VECTOR_ELT
:
259 case ISD::INSERT_VECTOR_ELT
:
260 case ISD::INSERT_SUBVECTOR
:
261 case ISD::EXTRACT_SUBVECTOR
:
262 case ISD::SCALAR_TO_VECTOR
:
264 case ISD::CONCAT_VECTORS
:
265 setOperationAction(Op
, VT
, Custom
);
268 setOperationAction(Op
, VT
, Expand
);
274 setOperationAction(ISD::FP_EXTEND
, MVT::v4f32
, Expand
);
276 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
277 // is expanded to avoid having two separate loops in case the index is a VGPR.
279 // Most operations are naturally 32-bit vector operations. We only support
280 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
281 for (MVT Vec64
: { MVT::v2i64
, MVT::v2f64
}) {
282 setOperationAction(ISD::BUILD_VECTOR
, Vec64
, Promote
);
283 AddPromotedToType(ISD::BUILD_VECTOR
, Vec64
, MVT::v4i32
);
285 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, Vec64
, Promote
);
286 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT
, Vec64
, MVT::v4i32
);
288 setOperationAction(ISD::INSERT_VECTOR_ELT
, Vec64
, Promote
);
289 AddPromotedToType(ISD::INSERT_VECTOR_ELT
, Vec64
, MVT::v4i32
);
291 setOperationAction(ISD::SCALAR_TO_VECTOR
, Vec64
, Promote
);
292 AddPromotedToType(ISD::SCALAR_TO_VECTOR
, Vec64
, MVT::v4i32
);
295 setOperationAction(ISD::VECTOR_SHUFFLE
, MVT::v8i32
, Expand
);
296 setOperationAction(ISD::VECTOR_SHUFFLE
, MVT::v8f32
, Expand
);
297 setOperationAction(ISD::VECTOR_SHUFFLE
, MVT::v16i32
, Expand
);
298 setOperationAction(ISD::VECTOR_SHUFFLE
, MVT::v16f32
, Expand
);
300 setOperationAction(ISD::BUILD_VECTOR
, MVT::v4f16
, Custom
);
301 setOperationAction(ISD::BUILD_VECTOR
, MVT::v4i16
, Custom
);
303 // Avoid stack access for these.
304 // TODO: Generalize to more vector types.
305 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v2i16
, Custom
);
306 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v2f16
, Custom
);
307 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v4i16
, Custom
);
308 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v4f16
, Custom
);
310 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v2i16
, Custom
);
311 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v2f16
, Custom
);
312 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v2i8
, Custom
);
313 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v4i8
, Custom
);
314 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v8i8
, Custom
);
316 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v2i8
, Custom
);
317 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v4i8
, Custom
);
318 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v8i8
, Custom
);
320 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v4i16
, Custom
);
321 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v4f16
, Custom
);
322 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v4i16
, Custom
);
323 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v4f16
, Custom
);
325 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
326 // and output demarshalling
327 setOperationAction(ISD::ATOMIC_CMP_SWAP
, MVT::i32
, Custom
);
328 setOperationAction(ISD::ATOMIC_CMP_SWAP
, MVT::i64
, Custom
);
330 // We can't return success/failure, only the old value,
331 // let LLVM add the comparison
332 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS
, MVT::i32
, Expand
);
333 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS
, MVT::i64
, Expand
);
335 if (Subtarget
->hasFlatAddressSpace()) {
336 setOperationAction(ISD::ADDRSPACECAST
, MVT::i32
, Custom
);
337 setOperationAction(ISD::ADDRSPACECAST
, MVT::i64
, Custom
);
340 setOperationAction(ISD::BSWAP
, MVT::i32
, Legal
);
341 setOperationAction(ISD::BITREVERSE
, MVT::i32
, Legal
);
343 // On SI this is s_memtime and s_memrealtime on VI.
344 setOperationAction(ISD::READCYCLECOUNTER
, MVT::i64
, Legal
);
345 setOperationAction(ISD::TRAP
, MVT::Other
, Custom
);
346 setOperationAction(ISD::DEBUGTRAP
, MVT::Other
, Custom
);
348 if (Subtarget
->has16BitInsts()) {
349 setOperationAction(ISD::FLOG
, MVT::f16
, Custom
);
350 setOperationAction(ISD::FEXP
, MVT::f16
, Custom
);
351 setOperationAction(ISD::FLOG10
, MVT::f16
, Custom
);
354 // v_mad_f32 does not support denormals according to some sources.
355 if (!Subtarget
->hasFP32Denormals())
356 setOperationAction(ISD::FMAD
, MVT::f32
, Legal
);
358 if (!Subtarget
->hasBFI()) {
359 // fcopysign can be done in a single instruction with BFI.
360 setOperationAction(ISD::FCOPYSIGN
, MVT::f32
, Expand
);
361 setOperationAction(ISD::FCOPYSIGN
, MVT::f64
, Expand
);
364 if (!Subtarget
->hasBCNT(32))
365 setOperationAction(ISD::CTPOP
, MVT::i32
, Expand
);
367 if (!Subtarget
->hasBCNT(64))
368 setOperationAction(ISD::CTPOP
, MVT::i64
, Expand
);
370 if (Subtarget
->hasFFBH())
371 setOperationAction(ISD::CTLZ_ZERO_UNDEF
, MVT::i32
, Custom
);
373 if (Subtarget
->hasFFBL())
374 setOperationAction(ISD::CTTZ_ZERO_UNDEF
, MVT::i32
, Custom
);
376 // We only really have 32-bit BFE instructions (and 16-bit on VI).
378 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
379 // effort to match them now. We want this to be false for i64 cases when the
380 // extraction isn't restricted to the upper or lower half. Ideally we would
381 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
382 // span the midpoint are probably relatively rare, so don't worry about them
384 if (Subtarget
->hasBFE())
385 setHasExtractBitsInsn(true);
387 setOperationAction(ISD::FMINNUM
, MVT::f64
, Legal
);
388 setOperationAction(ISD::FMAXNUM
, MVT::f64
, Legal
);
390 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS
) {
391 setOperationAction(ISD::FTRUNC
, MVT::f64
, Legal
);
392 setOperationAction(ISD::FCEIL
, MVT::f64
, Legal
);
393 setOperationAction(ISD::FRINT
, MVT::f64
, Legal
);
395 setOperationAction(ISD::FCEIL
, MVT::f64
, Custom
);
396 setOperationAction(ISD::FTRUNC
, MVT::f64
, Custom
);
397 setOperationAction(ISD::FRINT
, MVT::f64
, Custom
);
398 setOperationAction(ISD::FFLOOR
, MVT::f64
, Custom
);
401 setOperationAction(ISD::FFLOOR
, MVT::f64
, Legal
);
403 setOperationAction(ISD::FSIN
, MVT::f32
, Custom
);
404 setOperationAction(ISD::FCOS
, MVT::f32
, Custom
);
405 setOperationAction(ISD::FDIV
, MVT::f32
, Custom
);
406 setOperationAction(ISD::FDIV
, MVT::f64
, Custom
);
408 if (Subtarget
->has16BitInsts()) {
409 setOperationAction(ISD::Constant
, MVT::i16
, Legal
);
411 setOperationAction(ISD::SMIN
, MVT::i16
, Legal
);
412 setOperationAction(ISD::SMAX
, MVT::i16
, Legal
);
414 setOperationAction(ISD::UMIN
, MVT::i16
, Legal
);
415 setOperationAction(ISD::UMAX
, MVT::i16
, Legal
);
417 setOperationAction(ISD::SIGN_EXTEND
, MVT::i16
, Promote
);
418 AddPromotedToType(ISD::SIGN_EXTEND
, MVT::i16
, MVT::i32
);
420 setOperationAction(ISD::ROTR
, MVT::i16
, Promote
);
421 setOperationAction(ISD::ROTL
, MVT::i16
, Promote
);
423 setOperationAction(ISD::SDIV
, MVT::i16
, Promote
);
424 setOperationAction(ISD::UDIV
, MVT::i16
, Promote
);
425 setOperationAction(ISD::SREM
, MVT::i16
, Promote
);
426 setOperationAction(ISD::UREM
, MVT::i16
, Promote
);
428 setOperationAction(ISD::BSWAP
, MVT::i16
, Promote
);
429 setOperationAction(ISD::BITREVERSE
, MVT::i16
, Promote
);
431 setOperationAction(ISD::CTTZ
, MVT::i16
, Promote
);
432 setOperationAction(ISD::CTTZ_ZERO_UNDEF
, MVT::i16
, Promote
);
433 setOperationAction(ISD::CTLZ
, MVT::i16
, Promote
);
434 setOperationAction(ISD::CTLZ_ZERO_UNDEF
, MVT::i16
, Promote
);
435 setOperationAction(ISD::CTPOP
, MVT::i16
, Promote
);
437 setOperationAction(ISD::SELECT_CC
, MVT::i16
, Expand
);
439 setOperationAction(ISD::BR_CC
, MVT::i16
, Expand
);
441 setOperationAction(ISD::LOAD
, MVT::i16
, Custom
);
443 setTruncStoreAction(MVT::i64
, MVT::i16
, Expand
);
445 setOperationAction(ISD::FP16_TO_FP
, MVT::i16
, Promote
);
446 AddPromotedToType(ISD::FP16_TO_FP
, MVT::i16
, MVT::i32
);
447 setOperationAction(ISD::FP_TO_FP16
, MVT::i16
, Promote
);
448 AddPromotedToType(ISD::FP_TO_FP16
, MVT::i16
, MVT::i32
);
450 setOperationAction(ISD::FP_TO_SINT
, MVT::i16
, Promote
);
451 setOperationAction(ISD::FP_TO_UINT
, MVT::i16
, Promote
);
452 setOperationAction(ISD::SINT_TO_FP
, MVT::i16
, Promote
);
453 setOperationAction(ISD::UINT_TO_FP
, MVT::i16
, Promote
);
455 // F16 - Constant Actions.
456 setOperationAction(ISD::ConstantFP
, MVT::f16
, Legal
);
458 // F16 - Load/Store Actions.
459 setOperationAction(ISD::LOAD
, MVT::f16
, Promote
);
460 AddPromotedToType(ISD::LOAD
, MVT::f16
, MVT::i16
);
461 setOperationAction(ISD::STORE
, MVT::f16
, Promote
);
462 AddPromotedToType(ISD::STORE
, MVT::f16
, MVT::i16
);
464 // F16 - VOP1 Actions.
465 setOperationAction(ISD::FP_ROUND
, MVT::f16
, Custom
);
466 setOperationAction(ISD::FCOS
, MVT::f16
, Promote
);
467 setOperationAction(ISD::FSIN
, MVT::f16
, Promote
);
468 setOperationAction(ISD::FP_TO_SINT
, MVT::f16
, Promote
);
469 setOperationAction(ISD::FP_TO_UINT
, MVT::f16
, Promote
);
470 setOperationAction(ISD::SINT_TO_FP
, MVT::f16
, Promote
);
471 setOperationAction(ISD::UINT_TO_FP
, MVT::f16
, Promote
);
472 setOperationAction(ISD::FROUND
, MVT::f16
, Custom
);
474 // F16 - VOP2 Actions.
475 setOperationAction(ISD::BR_CC
, MVT::f16
, Expand
);
476 setOperationAction(ISD::SELECT_CC
, MVT::f16
, Expand
);
477 setOperationAction(ISD::FMAXNUM
, MVT::f16
, Legal
);
478 setOperationAction(ISD::FMINNUM
, MVT::f16
, Legal
);
479 setOperationAction(ISD::FDIV
, MVT::f16
, Custom
);
481 // F16 - VOP3 Actions.
482 setOperationAction(ISD::FMA
, MVT::f16
, Legal
);
483 if (!Subtarget
->hasFP16Denormals())
484 setOperationAction(ISD::FMAD
, MVT::f16
, Legal
);
486 for (MVT VT
: {MVT::v2i16
, MVT::v2f16
, MVT::v4i16
, MVT::v4f16
}) {
487 for (unsigned Op
= 0; Op
< ISD::BUILTIN_OP_END
; ++Op
) {
491 case ISD::BUILD_VECTOR
:
493 case ISD::EXTRACT_VECTOR_ELT
:
494 case ISD::INSERT_VECTOR_ELT
:
495 case ISD::INSERT_SUBVECTOR
:
496 case ISD::EXTRACT_SUBVECTOR
:
497 case ISD::SCALAR_TO_VECTOR
:
499 case ISD::CONCAT_VECTORS
:
500 setOperationAction(Op
, VT
, Custom
);
503 setOperationAction(Op
, VT
, Expand
);
509 // XXX - Do these do anything? Vector constants turn into build_vector.
510 setOperationAction(ISD::Constant
, MVT::v2i16
, Legal
);
511 setOperationAction(ISD::ConstantFP
, MVT::v2f16
, Legal
);
513 setOperationAction(ISD::UNDEF
, MVT::v2i16
, Legal
);
514 setOperationAction(ISD::UNDEF
, MVT::v2f16
, Legal
);
516 setOperationAction(ISD::STORE
, MVT::v2i16
, Promote
);
517 AddPromotedToType(ISD::STORE
, MVT::v2i16
, MVT::i32
);
518 setOperationAction(ISD::STORE
, MVT::v2f16
, Promote
);
519 AddPromotedToType(ISD::STORE
, MVT::v2f16
, MVT::i32
);
521 setOperationAction(ISD::LOAD
, MVT::v2i16
, Promote
);
522 AddPromotedToType(ISD::LOAD
, MVT::v2i16
, MVT::i32
);
523 setOperationAction(ISD::LOAD
, MVT::v2f16
, Promote
);
524 AddPromotedToType(ISD::LOAD
, MVT::v2f16
, MVT::i32
);
526 setOperationAction(ISD::AND
, MVT::v2i16
, Promote
);
527 AddPromotedToType(ISD::AND
, MVT::v2i16
, MVT::i32
);
528 setOperationAction(ISD::OR
, MVT::v2i16
, Promote
);
529 AddPromotedToType(ISD::OR
, MVT::v2i16
, MVT::i32
);
530 setOperationAction(ISD::XOR
, MVT::v2i16
, Promote
);
531 AddPromotedToType(ISD::XOR
, MVT::v2i16
, MVT::i32
);
533 setOperationAction(ISD::LOAD
, MVT::v4i16
, Promote
);
534 AddPromotedToType(ISD::LOAD
, MVT::v4i16
, MVT::v2i32
);
535 setOperationAction(ISD::LOAD
, MVT::v4f16
, Promote
);
536 AddPromotedToType(ISD::LOAD
, MVT::v4f16
, MVT::v2i32
);
538 setOperationAction(ISD::STORE
, MVT::v4i16
, Promote
);
539 AddPromotedToType(ISD::STORE
, MVT::v4i16
, MVT::v2i32
);
540 setOperationAction(ISD::STORE
, MVT::v4f16
, Promote
);
541 AddPromotedToType(ISD::STORE
, MVT::v4f16
, MVT::v2i32
);
543 setOperationAction(ISD::ANY_EXTEND
, MVT::v2i32
, Expand
);
544 setOperationAction(ISD::ZERO_EXTEND
, MVT::v2i32
, Expand
);
545 setOperationAction(ISD::SIGN_EXTEND
, MVT::v2i32
, Expand
);
546 setOperationAction(ISD::FP_EXTEND
, MVT::v2f32
, Expand
);
548 setOperationAction(ISD::ANY_EXTEND
, MVT::v4i32
, Expand
);
549 setOperationAction(ISD::ZERO_EXTEND
, MVT::v4i32
, Expand
);
550 setOperationAction(ISD::SIGN_EXTEND
, MVT::v4i32
, Expand
);
552 if (!Subtarget
->hasVOP3PInsts()) {
553 setOperationAction(ISD::BUILD_VECTOR
, MVT::v2i16
, Custom
);
554 setOperationAction(ISD::BUILD_VECTOR
, MVT::v2f16
, Custom
);
557 setOperationAction(ISD::FNEG
, MVT::v2f16
, Legal
);
558 // This isn't really legal, but this avoids the legalizer unrolling it (and
559 // allows matching fneg (fabs x) patterns)
560 setOperationAction(ISD::FABS
, MVT::v2f16
, Legal
);
563 if (Subtarget
->hasVOP3PInsts()) {
564 setOperationAction(ISD::ADD
, MVT::v2i16
, Legal
);
565 setOperationAction(ISD::SUB
, MVT::v2i16
, Legal
);
566 setOperationAction(ISD::MUL
, MVT::v2i16
, Legal
);
567 setOperationAction(ISD::SHL
, MVT::v2i16
, Legal
);
568 setOperationAction(ISD::SRL
, MVT::v2i16
, Legal
);
569 setOperationAction(ISD::SRA
, MVT::v2i16
, Legal
);
570 setOperationAction(ISD::SMIN
, MVT::v2i16
, Legal
);
571 setOperationAction(ISD::UMIN
, MVT::v2i16
, Legal
);
572 setOperationAction(ISD::SMAX
, MVT::v2i16
, Legal
);
573 setOperationAction(ISD::UMAX
, MVT::v2i16
, Legal
);
575 setOperationAction(ISD::FADD
, MVT::v2f16
, Legal
);
576 setOperationAction(ISD::FMUL
, MVT::v2f16
, Legal
);
577 setOperationAction(ISD::FMA
, MVT::v2f16
, Legal
);
578 setOperationAction(ISD::FMINNUM
, MVT::v2f16
, Legal
);
579 setOperationAction(ISD::FMAXNUM
, MVT::v2f16
, Legal
);
580 setOperationAction(ISD::FCANONICALIZE
, MVT::v2f16
, Legal
);
582 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v2i16
, Custom
);
583 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v2f16
, Custom
);
585 setOperationAction(ISD::SHL
, MVT::v4i16
, Custom
);
586 setOperationAction(ISD::SRA
, MVT::v4i16
, Custom
);
587 setOperationAction(ISD::SRL
, MVT::v4i16
, Custom
);
588 setOperationAction(ISD::ADD
, MVT::v4i16
, Custom
);
589 setOperationAction(ISD::SUB
, MVT::v4i16
, Custom
);
590 setOperationAction(ISD::MUL
, MVT::v4i16
, Custom
);
592 setOperationAction(ISD::SMIN
, MVT::v4i16
, Custom
);
593 setOperationAction(ISD::SMAX
, MVT::v4i16
, Custom
);
594 setOperationAction(ISD::UMIN
, MVT::v4i16
, Custom
);
595 setOperationAction(ISD::UMAX
, MVT::v4i16
, Custom
);
597 setOperationAction(ISD::FADD
, MVT::v4f16
, Custom
);
598 setOperationAction(ISD::FMUL
, MVT::v4f16
, Custom
);
599 setOperationAction(ISD::FMINNUM
, MVT::v4f16
, Custom
);
600 setOperationAction(ISD::FMAXNUM
, MVT::v4f16
, Custom
);
601 setOperationAction(ISD::FCANONICALIZE
, MVT::v4f16
, Custom
);
603 setOperationAction(ISD::FEXP
, MVT::v2f16
, Custom
);
604 setOperationAction(ISD::SELECT
, MVT::v4i16
, Custom
);
605 setOperationAction(ISD::SELECT
, MVT::v4f16
, Custom
);
608 setOperationAction(ISD::FNEG
, MVT::v4f16
, Custom
);
609 setOperationAction(ISD::FABS
, MVT::v4f16
, Custom
);
611 if (Subtarget
->has16BitInsts()) {
612 setOperationAction(ISD::SELECT
, MVT::v2i16
, Promote
);
613 AddPromotedToType(ISD::SELECT
, MVT::v2i16
, MVT::i32
);
614 setOperationAction(ISD::SELECT
, MVT::v2f16
, Promote
);
615 AddPromotedToType(ISD::SELECT
, MVT::v2f16
, MVT::i32
);
617 // Legalization hack.
618 setOperationAction(ISD::SELECT
, MVT::v2i16
, Custom
);
619 setOperationAction(ISD::SELECT
, MVT::v2f16
, Custom
);
621 setOperationAction(ISD::FNEG
, MVT::v2f16
, Custom
);
622 setOperationAction(ISD::FABS
, MVT::v2f16
, Custom
);
625 for (MVT VT
: { MVT::v4i16
, MVT::v4f16
, MVT::v2i8
, MVT::v4i8
, MVT::v8i8
}) {
626 setOperationAction(ISD::SELECT
, VT
, Custom
);
629 setTargetDAGCombine(ISD::ADD
);
630 setTargetDAGCombine(ISD::ADDCARRY
);
631 setTargetDAGCombine(ISD::SUB
);
632 setTargetDAGCombine(ISD::SUBCARRY
);
633 setTargetDAGCombine(ISD::FADD
);
634 setTargetDAGCombine(ISD::FSUB
);
635 setTargetDAGCombine(ISD::FMINNUM
);
636 setTargetDAGCombine(ISD::FMAXNUM
);
637 setTargetDAGCombine(ISD::FMA
);
638 setTargetDAGCombine(ISD::SMIN
);
639 setTargetDAGCombine(ISD::SMAX
);
640 setTargetDAGCombine(ISD::UMIN
);
641 setTargetDAGCombine(ISD::UMAX
);
642 setTargetDAGCombine(ISD::SETCC
);
643 setTargetDAGCombine(ISD::AND
);
644 setTargetDAGCombine(ISD::OR
);
645 setTargetDAGCombine(ISD::XOR
);
646 setTargetDAGCombine(ISD::SINT_TO_FP
);
647 setTargetDAGCombine(ISD::UINT_TO_FP
);
648 setTargetDAGCombine(ISD::FCANONICALIZE
);
649 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR
);
650 setTargetDAGCombine(ISD::ZERO_EXTEND
);
651 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT
);
652 setTargetDAGCombine(ISD::BUILD_VECTOR
);
654 // All memory operations. Some folding on the pointer operand is done to help
655 // matching the constant offsets in the addressing modes.
656 setTargetDAGCombine(ISD::LOAD
);
657 setTargetDAGCombine(ISD::STORE
);
658 setTargetDAGCombine(ISD::ATOMIC_LOAD
);
659 setTargetDAGCombine(ISD::ATOMIC_STORE
);
660 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP
);
661 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS
);
662 setTargetDAGCombine(ISD::ATOMIC_SWAP
);
663 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD
);
664 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB
);
665 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND
);
666 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR
);
667 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR
);
668 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND
);
669 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN
);
670 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX
);
671 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN
);
672 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX
);
674 setSchedulingPreference(Sched::RegPressure
);
676 // SI at least has hardware support for floating point exceptions, but no way
677 // of using or handling them is implemented. They are also optional in OpenCL
679 setHasFloatingPointExceptions(Subtarget
->hasFPExceptions());
682 const GCNSubtarget
*SITargetLowering::getSubtarget() const {
686 //===----------------------------------------------------------------------===//
687 // TargetLowering queries
688 //===----------------------------------------------------------------------===//
690 // v_mad_mix* support a conversion from f16 to f32.
692 // There is only one special case when denormals are enabled we don't currently,
693 // where this is OK to use.
694 bool SITargetLowering::isFPExtFoldable(unsigned Opcode
,
695 EVT DestVT
, EVT SrcVT
) const {
696 return ((Opcode
== ISD::FMAD
&& Subtarget
->hasMadMixInsts()) ||
697 (Opcode
== ISD::FMA
&& Subtarget
->hasFmaMixInsts())) &&
698 DestVT
.getScalarType() == MVT::f32
&& !Subtarget
->hasFP32Denormals() &&
699 SrcVT
.getScalarType() == MVT::f16
;
702 bool SITargetLowering::isShuffleMaskLegal(ArrayRef
<int>, EVT
) const {
703 // SI has some legal vector types, but no legal vector operations. Say no
704 // shuffles are legal in order to prefer scalarizing some vector operations.
708 MVT
SITargetLowering::getRegisterTypeForCallingConv(LLVMContext
&Context
,
711 // TODO: Consider splitting all arguments into 32-bit pieces.
712 if (CC
!= CallingConv::AMDGPU_KERNEL
&& VT
.isVector()) {
713 EVT ScalarVT
= VT
.getScalarType();
714 unsigned Size
= ScalarVT
.getSizeInBits();
716 return ScalarVT
.getSimpleVT();
721 if (Size
== 16 && Subtarget
->has16BitInsts())
722 return VT
.isInteger() ? MVT::v2i16
: MVT::v2f16
;
725 return TargetLowering::getRegisterTypeForCallingConv(Context
, CC
, VT
);
728 unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext
&Context
,
731 if (CC
!= CallingConv::AMDGPU_KERNEL
&& VT
.isVector()) {
732 unsigned NumElts
= VT
.getVectorNumElements();
733 EVT ScalarVT
= VT
.getScalarType();
734 unsigned Size
= ScalarVT
.getSizeInBits();
742 if (Size
== 16 && Subtarget
->has16BitInsts())
743 return (VT
.getVectorNumElements() + 1) / 2;
746 return TargetLowering::getNumRegistersForCallingConv(Context
, CC
, VT
);
749 unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
750 LLVMContext
&Context
, CallingConv::ID CC
,
751 EVT VT
, EVT
&IntermediateVT
,
752 unsigned &NumIntermediates
, MVT
&RegisterVT
) const {
753 if (CC
!= CallingConv::AMDGPU_KERNEL
&& VT
.isVector()) {
754 unsigned NumElts
= VT
.getVectorNumElements();
755 EVT ScalarVT
= VT
.getScalarType();
756 unsigned Size
= ScalarVT
.getSizeInBits();
758 RegisterVT
= ScalarVT
.getSimpleVT();
759 IntermediateVT
= RegisterVT
;
760 NumIntermediates
= NumElts
;
761 return NumIntermediates
;
765 RegisterVT
= MVT::i32
;
766 IntermediateVT
= RegisterVT
;
767 NumIntermediates
= 2 * NumElts
;
768 return NumIntermediates
;
771 // FIXME: We should fix the ABI to be the same on targets without 16-bit
772 // support, but unless we can properly handle 3-vectors, it will be still be
774 if (Size
== 16 && Subtarget
->has16BitInsts()) {
775 RegisterVT
= VT
.isInteger() ? MVT::v2i16
: MVT::v2f16
;
776 IntermediateVT
= RegisterVT
;
777 NumIntermediates
= (NumElts
+ 1) / 2;
778 return NumIntermediates
;
782 return TargetLowering::getVectorTypeBreakdownForCallingConv(
783 Context
, CC
, VT
, IntermediateVT
, NumIntermediates
, RegisterVT
);
786 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo
&Info
,
789 unsigned IntrID
) const {
790 if (const AMDGPU::RsrcIntrinsic
*RsrcIntr
=
791 AMDGPU::lookupRsrcIntrinsic(IntrID
)) {
792 AttributeList Attr
= Intrinsic::getAttributes(CI
.getContext(),
793 (Intrinsic::ID
)IntrID
);
794 if (Attr
.hasFnAttribute(Attribute::ReadNone
))
797 SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
799 if (RsrcIntr
->IsImage
) {
800 Info
.ptrVal
= MFI
->getImagePSV(
801 *MF
.getSubtarget
<GCNSubtarget
>().getInstrInfo(),
802 CI
.getArgOperand(RsrcIntr
->RsrcArg
));
805 Info
.ptrVal
= MFI
->getBufferPSV(
806 *MF
.getSubtarget
<GCNSubtarget
>().getInstrInfo(),
807 CI
.getArgOperand(RsrcIntr
->RsrcArg
));
810 Info
.flags
= MachineMemOperand::MODereferenceable
;
811 if (Attr
.hasFnAttribute(Attribute::ReadOnly
)) {
812 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
813 Info
.memVT
= MVT::getVT(CI
.getType());
814 Info
.flags
|= MachineMemOperand::MOLoad
;
815 } else if (Attr
.hasFnAttribute(Attribute::WriteOnly
)) {
816 Info
.opc
= ISD::INTRINSIC_VOID
;
817 Info
.memVT
= MVT::getVT(CI
.getArgOperand(0)->getType());
818 Info
.flags
|= MachineMemOperand::MOStore
;
821 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
822 Info
.memVT
= MVT::getVT(CI
.getType());
823 Info
.flags
= MachineMemOperand::MOLoad
|
824 MachineMemOperand::MOStore
|
825 MachineMemOperand::MODereferenceable
;
827 // XXX - Should this be volatile without known ordering?
828 Info
.flags
|= MachineMemOperand::MOVolatile
;
834 case Intrinsic::amdgcn_atomic_inc
:
835 case Intrinsic::amdgcn_atomic_dec
:
836 case Intrinsic::amdgcn_ds_fadd
:
837 case Intrinsic::amdgcn_ds_fmin
:
838 case Intrinsic::amdgcn_ds_fmax
: {
839 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
840 Info
.memVT
= MVT::getVT(CI
.getType());
841 Info
.ptrVal
= CI
.getOperand(0);
843 Info
.flags
= MachineMemOperand::MOLoad
| MachineMemOperand::MOStore
;
845 const ConstantInt
*Vol
= dyn_cast
<ConstantInt
>(CI
.getOperand(4));
846 if (!Vol
|| !Vol
->isZero())
847 Info
.flags
|= MachineMemOperand::MOVolatile
;
857 bool SITargetLowering::getAddrModeArguments(IntrinsicInst
*II
,
858 SmallVectorImpl
<Value
*> &Ops
,
859 Type
*&AccessTy
) const {
860 switch (II
->getIntrinsicID()) {
861 case Intrinsic::amdgcn_atomic_inc
:
862 case Intrinsic::amdgcn_atomic_dec
:
863 case Intrinsic::amdgcn_ds_fadd
:
864 case Intrinsic::amdgcn_ds_fmin
:
865 case Intrinsic::amdgcn_ds_fmax
: {
866 Value
*Ptr
= II
->getArgOperand(0);
867 AccessTy
= II
->getType();
876 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode
&AM
) const {
877 if (!Subtarget
->hasFlatInstOffsets()) {
878 // Flat instructions do not have offsets, and only have the register
880 return AM
.BaseOffs
== 0 && AM
.Scale
== 0;
883 // GFX9 added a 13-bit signed offset. When using regular flat instructions,
884 // the sign bit is ignored and is treated as a 12-bit unsigned offset.
887 return isUInt
<12>(AM
.BaseOffs
) && AM
.Scale
== 0;
890 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode
&AM
) const {
891 if (Subtarget
->hasFlatGlobalInsts())
892 return isInt
<13>(AM
.BaseOffs
) && AM
.Scale
== 0;
894 if (!Subtarget
->hasAddr64() || Subtarget
->useFlatForGlobal()) {
895 // Assume the we will use FLAT for all global memory accesses
897 // FIXME: This assumption is currently wrong. On VI we still use
898 // MUBUF instructions for the r + i addressing mode. As currently
899 // implemented, the MUBUF instructions only work on buffer < 4GB.
900 // It may be possible to support > 4GB buffers with MUBUF instructions,
901 // by setting the stride value in the resource descriptor which would
902 // increase the size limit to (stride * 4GB). However, this is risky,
903 // because it has never been validated.
904 return isLegalFlatAddressingMode(AM
);
907 return isLegalMUBUFAddressingMode(AM
);
910 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode
&AM
) const {
911 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
912 // additionally can do r + r + i with addr64. 32-bit has more addressing
913 // mode options. Depending on the resource constant, it can also do
914 // (i64 r0) + (i32 r1) * (i14 i).
916 // Private arrays end up using a scratch buffer most of the time, so also
917 // assume those use MUBUF instructions. Scratch loads / stores are currently
918 // implemented as mubuf instructions with offen bit set, so slightly
919 // different than the normal addr64.
920 if (!isUInt
<12>(AM
.BaseOffs
))
923 // FIXME: Since we can split immediate into soffset and immediate offset,
924 // would it make sense to allow any immediate?
927 case 0: // r + i or just i, depending on HasBaseReg.
930 return true; // We have r + r or r + i.
937 // Allow 2 * r as r + r
938 // Or 2 * r + i is allowed as r + r + i.
940 default: // Don't allow n * r
945 bool SITargetLowering::isLegalAddressingMode(const DataLayout
&DL
,
946 const AddrMode
&AM
, Type
*Ty
,
947 unsigned AS
, Instruction
*I
) const {
948 // No global is ever allowed as a base.
952 if (AS
== AMDGPUAS::GLOBAL_ADDRESS
)
953 return isLegalGlobalAddressingMode(AM
);
955 if (AS
== AMDGPUAS::CONSTANT_ADDRESS
||
956 AS
== AMDGPUAS::CONSTANT_ADDRESS_32BIT
) {
957 // If the offset isn't a multiple of 4, it probably isn't going to be
958 // correctly aligned.
959 // FIXME: Can we get the real alignment here?
960 if (AM
.BaseOffs
% 4 != 0)
961 return isLegalMUBUFAddressingMode(AM
);
963 // There are no SMRD extloads, so if we have to do a small type access we
964 // will use a MUBUF load.
965 // FIXME?: We also need to do this if unaligned, but we don't know the
967 if (Ty
->isSized() && DL
.getTypeStoreSize(Ty
) < 4)
968 return isLegalGlobalAddressingMode(AM
);
970 if (Subtarget
->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS
) {
971 // SMRD instructions have an 8-bit, dword offset on SI.
972 if (!isUInt
<8>(AM
.BaseOffs
/ 4))
974 } else if (Subtarget
->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS
) {
975 // On CI+, this can also be a 32-bit literal constant offset. If it fits
976 // in 8-bits, it can use a smaller encoding.
977 if (!isUInt
<32>(AM
.BaseOffs
/ 4))
979 } else if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
) {
980 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
981 if (!isUInt
<20>(AM
.BaseOffs
))
984 llvm_unreachable("unhandled generation");
986 if (AM
.Scale
== 0) // r + i or just i, depending on HasBaseReg.
989 if (AM
.Scale
== 1 && AM
.HasBaseReg
)
994 } else if (AS
== AMDGPUAS::PRIVATE_ADDRESS
) {
995 return isLegalMUBUFAddressingMode(AM
);
996 } else if (AS
== AMDGPUAS::LOCAL_ADDRESS
||
997 AS
== AMDGPUAS::REGION_ADDRESS
) {
998 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1000 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1001 // an 8-bit dword offset but we don't know the alignment here.
1002 if (!isUInt
<16>(AM
.BaseOffs
))
1005 if (AM
.Scale
== 0) // r + i or just i, depending on HasBaseReg.
1008 if (AM
.Scale
== 1 && AM
.HasBaseReg
)
1012 } else if (AS
== AMDGPUAS::FLAT_ADDRESS
||
1013 AS
== AMDGPUAS::UNKNOWN_ADDRESS_SPACE
) {
1014 // For an unknown address space, this usually means that this is for some
1015 // reason being used for pure arithmetic, and not based on some addressing
1016 // computation. We don't have instructions that compute pointers with any
1017 // addressing modes, so treat them as having no offset like flat
1019 return isLegalFlatAddressingMode(AM
);
1021 llvm_unreachable("unhandled address space");
1025 bool SITargetLowering::canMergeStoresTo(unsigned AS
, EVT MemVT
,
1026 const SelectionDAG
&DAG
) const {
1027 if (AS
== AMDGPUAS::GLOBAL_ADDRESS
|| AS
== AMDGPUAS::FLAT_ADDRESS
) {
1028 return (MemVT
.getSizeInBits() <= 4 * 32);
1029 } else if (AS
== AMDGPUAS::PRIVATE_ADDRESS
) {
1030 unsigned MaxPrivateBits
= 8 * getSubtarget()->getMaxPrivateElementSize();
1031 return (MemVT
.getSizeInBits() <= MaxPrivateBits
);
1032 } else if (AS
== AMDGPUAS::LOCAL_ADDRESS
) {
1033 return (MemVT
.getSizeInBits() <= 2 * 32);
1038 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT
,
1041 bool *IsFast
) const {
1045 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1046 // which isn't a simple VT.
1047 // Until MVT is extended to handle this, simply check for the size and
1048 // rely on the condition below: allow accesses if the size is a multiple of 4.
1049 if (VT
== MVT::Other
|| (VT
!= MVT::Other
&& VT
.getSizeInBits() > 1024 &&
1050 VT
.getStoreSize() > 16)) {
1054 if (AddrSpace
== AMDGPUAS::LOCAL_ADDRESS
||
1055 AddrSpace
== AMDGPUAS::REGION_ADDRESS
) {
1056 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1057 // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1058 // with adjacent offsets.
1059 bool AlignedBy4
= (Align
% 4 == 0);
1061 *IsFast
= AlignedBy4
;
1066 // FIXME: We have to be conservative here and assume that flat operations
1067 // will access scratch. If we had access to the IR function, then we
1068 // could determine if any private memory was used in the function.
1069 if (!Subtarget
->hasUnalignedScratchAccess() &&
1070 (AddrSpace
== AMDGPUAS::PRIVATE_ADDRESS
||
1071 AddrSpace
== AMDGPUAS::FLAT_ADDRESS
)) {
1072 bool AlignedBy4
= Align
>= 4;
1074 *IsFast
= AlignedBy4
;
1079 if (Subtarget
->hasUnalignedBufferAccess()) {
1080 // If we have an uniform constant load, it still requires using a slow
1081 // buffer instruction if unaligned.
1083 *IsFast
= (AddrSpace
== AMDGPUAS::CONSTANT_ADDRESS
||
1084 AddrSpace
== AMDGPUAS::CONSTANT_ADDRESS_32BIT
) ?
1085 (Align
% 4 == 0) : true;
1091 // Smaller than dword value must be aligned.
1092 if (VT
.bitsLT(MVT::i32
))
1095 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1096 // byte-address are ignored, thus forcing Dword alignment.
1097 // This applies to private, global, and constant memory.
1101 return VT
.bitsGT(MVT::i32
) && Align
% 4 == 0;
1104 EVT
SITargetLowering::getOptimalMemOpType(uint64_t Size
, unsigned DstAlign
,
1105 unsigned SrcAlign
, bool IsMemset
,
1108 MachineFunction
&MF
) const {
1109 // FIXME: Should account for address space here.
1111 // The default fallback uses the private pointer size as a guess for a type to
1112 // use. Make sure we switch these to 64-bit accesses.
1114 if (Size
>= 16 && DstAlign
>= 4) // XXX: Should only do for global
1117 if (Size
>= 8 && DstAlign
>= 4)
1124 static bool isFlatGlobalAddrSpace(unsigned AS
) {
1125 return AS
== AMDGPUAS::GLOBAL_ADDRESS
||
1126 AS
== AMDGPUAS::FLAT_ADDRESS
||
1127 AS
== AMDGPUAS::CONSTANT_ADDRESS
;
1130 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS
,
1131 unsigned DestAS
) const {
1132 return isFlatGlobalAddrSpace(SrcAS
) && isFlatGlobalAddrSpace(DestAS
);
1135 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode
*N
) const {
1136 const MemSDNode
*MemNode
= cast
<MemSDNode
>(N
);
1137 const Value
*Ptr
= MemNode
->getMemOperand()->getValue();
1138 const Instruction
*I
= dyn_cast_or_null
<Instruction
>(Ptr
);
1139 return I
&& I
->getMetadata("amdgpu.noclobber");
1142 bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS
,
1143 unsigned DestAS
) const {
1144 // Flat -> private/local is a simple truncate.
1145 // Flat -> global is no-op
1146 if (SrcAS
== AMDGPUAS::FLAT_ADDRESS
)
1149 return isNoopAddrSpaceCast(SrcAS
, DestAS
);
1152 bool SITargetLowering::isMemOpUniform(const SDNode
*N
) const {
1153 const MemSDNode
*MemNode
= cast
<MemSDNode
>(N
);
1155 return AMDGPUInstrInfo::isUniformMMO(MemNode
->getMemOperand());
1158 TargetLoweringBase::LegalizeTypeAction
1159 SITargetLowering::getPreferredVectorAction(EVT VT
) const {
1160 if (VT
.getVectorNumElements() != 1 && VT
.getScalarType().bitsLE(MVT::i16
))
1161 return TypeSplitVector
;
1163 return TargetLoweringBase::getPreferredVectorAction(VT
);
1166 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt
&Imm
,
1168 // FIXME: Could be smarter if called for vector constants.
1172 bool SITargetLowering::isTypeDesirableForOp(unsigned Op
, EVT VT
) const {
1173 if (Subtarget
->has16BitInsts() && VT
== MVT::i16
) {
1178 // These operations are done with 32-bit instructions anyway.
1183 // TODO: Extensions?
1190 // SimplifySetCC uses this function to determine whether or not it should
1191 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1192 if (VT
== MVT::i1
&& Op
== ISD::SETCC
)
1195 return TargetLowering::isTypeDesirableForOp(Op
, VT
);
1198 SDValue
SITargetLowering::lowerKernArgParameterPtr(SelectionDAG
&DAG
,
1201 uint64_t Offset
) const {
1202 const DataLayout
&DL
= DAG
.getDataLayout();
1203 MachineFunction
&MF
= DAG
.getMachineFunction();
1204 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
1206 const ArgDescriptor
*InputPtrReg
;
1207 const TargetRegisterClass
*RC
;
1209 std::tie(InputPtrReg
, RC
)
1210 = Info
->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
);
1212 MachineRegisterInfo
&MRI
= DAG
.getMachineFunction().getRegInfo();
1213 MVT PtrVT
= getPointerTy(DL
, AMDGPUAS::CONSTANT_ADDRESS
);
1214 SDValue BasePtr
= DAG
.getCopyFromReg(Chain
, SL
,
1215 MRI
.getLiveInVirtReg(InputPtrReg
->getRegister()), PtrVT
);
1217 return DAG
.getObjectPtrOffset(SL
, BasePtr
, Offset
);
1220 SDValue
SITargetLowering::getImplicitArgPtr(SelectionDAG
&DAG
,
1221 const SDLoc
&SL
) const {
1222 uint64_t Offset
= getImplicitParameterOffset(DAG
.getMachineFunction(),
1224 return lowerKernArgParameterPtr(DAG
, SL
, DAG
.getEntryNode(), Offset
);
1227 SDValue
SITargetLowering::convertArgType(SelectionDAG
&DAG
, EVT VT
, EVT MemVT
,
1228 const SDLoc
&SL
, SDValue Val
,
1230 const ISD::InputArg
*Arg
) const {
1231 if (Arg
&& (Arg
->Flags
.isSExt() || Arg
->Flags
.isZExt()) &&
1233 unsigned Opc
= Arg
->Flags
.isZExt() ? ISD::AssertZext
: ISD::AssertSext
;
1234 Val
= DAG
.getNode(Opc
, SL
, MemVT
, Val
, DAG
.getValueType(VT
));
1237 if (MemVT
.isFloatingPoint())
1238 Val
= getFPExtOrFPTrunc(DAG
, Val
, SL
, VT
);
1240 Val
= DAG
.getSExtOrTrunc(Val
, SL
, VT
);
1242 Val
= DAG
.getZExtOrTrunc(Val
, SL
, VT
);
1247 SDValue
SITargetLowering::lowerKernargMemParameter(
1248 SelectionDAG
&DAG
, EVT VT
, EVT MemVT
,
1249 const SDLoc
&SL
, SDValue Chain
,
1250 uint64_t Offset
, unsigned Align
, bool Signed
,
1251 const ISD::InputArg
*Arg
) const {
1252 Type
*Ty
= MemVT
.getTypeForEVT(*DAG
.getContext());
1253 PointerType
*PtrTy
= PointerType::get(Ty
, AMDGPUAS::CONSTANT_ADDRESS
);
1254 MachinePointerInfo
PtrInfo(UndefValue::get(PtrTy
));
1256 // Try to avoid using an extload by loading earlier than the argument address,
1257 // and extracting the relevant bits. The load should hopefully be merged with
1258 // the previous argument.
1259 if (MemVT
.getStoreSize() < 4 && Align
< 4) {
1260 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1261 int64_t AlignDownOffset
= alignDown(Offset
, 4);
1262 int64_t OffsetDiff
= Offset
- AlignDownOffset
;
1264 EVT IntVT
= MemVT
.changeTypeToInteger();
1266 // TODO: If we passed in the base kernel offset we could have a better
1267 // alignment than 4, but we don't really need it.
1268 SDValue Ptr
= lowerKernArgParameterPtr(DAG
, SL
, Chain
, AlignDownOffset
);
1269 SDValue Load
= DAG
.getLoad(MVT::i32
, SL
, Chain
, Ptr
, PtrInfo
, 4,
1270 MachineMemOperand::MODereferenceable
|
1271 MachineMemOperand::MOInvariant
);
1273 SDValue ShiftAmt
= DAG
.getConstant(OffsetDiff
* 8, SL
, MVT::i32
);
1274 SDValue Extract
= DAG
.getNode(ISD::SRL
, SL
, MVT::i32
, Load
, ShiftAmt
);
1276 SDValue ArgVal
= DAG
.getNode(ISD::TRUNCATE
, SL
, IntVT
, Extract
);
1277 ArgVal
= DAG
.getNode(ISD::BITCAST
, SL
, MemVT
, ArgVal
);
1278 ArgVal
= convertArgType(DAG
, VT
, MemVT
, SL
, ArgVal
, Signed
, Arg
);
1281 return DAG
.getMergeValues({ ArgVal
, Load
.getValue(1) }, SL
);
1284 SDValue Ptr
= lowerKernArgParameterPtr(DAG
, SL
, Chain
, Offset
);
1285 SDValue Load
= DAG
.getLoad(MemVT
, SL
, Chain
, Ptr
, PtrInfo
, Align
,
1286 MachineMemOperand::MODereferenceable
|
1287 MachineMemOperand::MOInvariant
);
1289 SDValue Val
= convertArgType(DAG
, VT
, MemVT
, SL
, Load
, Signed
, Arg
);
1290 return DAG
.getMergeValues({ Val
, Load
.getValue(1) }, SL
);
1293 SDValue
SITargetLowering::lowerStackParameter(SelectionDAG
&DAG
, CCValAssign
&VA
,
1294 const SDLoc
&SL
, SDValue Chain
,
1295 const ISD::InputArg
&Arg
) const {
1296 MachineFunction
&MF
= DAG
.getMachineFunction();
1297 MachineFrameInfo
&MFI
= MF
.getFrameInfo();
1299 if (Arg
.Flags
.isByVal()) {
1300 unsigned Size
= Arg
.Flags
.getByValSize();
1301 int FrameIdx
= MFI
.CreateFixedObject(Size
, VA
.getLocMemOffset(), false);
1302 return DAG
.getFrameIndex(FrameIdx
, MVT::i32
);
1305 unsigned ArgOffset
= VA
.getLocMemOffset();
1306 unsigned ArgSize
= VA
.getValVT().getStoreSize();
1308 int FI
= MFI
.CreateFixedObject(ArgSize
, ArgOffset
, true);
1310 // Create load nodes to retrieve arguments from the stack.
1311 SDValue FIN
= DAG
.getFrameIndex(FI
, MVT::i32
);
1314 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
1315 ISD::LoadExtType ExtType
= ISD::NON_EXTLOAD
;
1316 MVT MemVT
= VA
.getValVT();
1318 switch (VA
.getLocInfo()) {
1321 case CCValAssign::BCvt
:
1322 MemVT
= VA
.getLocVT();
1324 case CCValAssign::SExt
:
1325 ExtType
= ISD::SEXTLOAD
;
1327 case CCValAssign::ZExt
:
1328 ExtType
= ISD::ZEXTLOAD
;
1330 case CCValAssign::AExt
:
1331 ExtType
= ISD::EXTLOAD
;
1335 ArgValue
= DAG
.getExtLoad(
1336 ExtType
, SL
, VA
.getLocVT(), Chain
, FIN
,
1337 MachinePointerInfo::getFixedStack(DAG
.getMachineFunction(), FI
),
1342 SDValue
SITargetLowering::getPreloadedValue(SelectionDAG
&DAG
,
1343 const SIMachineFunctionInfo
&MFI
,
1345 AMDGPUFunctionArgInfo::PreloadedValue PVID
) const {
1346 const ArgDescriptor
*Reg
;
1347 const TargetRegisterClass
*RC
;
1349 std::tie(Reg
, RC
) = MFI
.getPreloadedValue(PVID
);
1350 return CreateLiveInRegister(DAG
, RC
, Reg
->getRegister(), VT
);
1353 static void processShaderInputArgs(SmallVectorImpl
<ISD::InputArg
> &Splits
,
1354 CallingConv::ID CallConv
,
1355 ArrayRef
<ISD::InputArg
> Ins
,
1357 FunctionType
*FType
,
1358 SIMachineFunctionInfo
*Info
) {
1359 for (unsigned I
= 0, E
= Ins
.size(), PSInputNum
= 0; I
!= E
; ++I
) {
1360 const ISD::InputArg
*Arg
= &Ins
[I
];
1362 assert((!Arg
->VT
.isVector() || Arg
->VT
.getScalarSizeInBits() == 16) &&
1363 "vector type argument should have been split");
1365 // First check if it's a PS input addr.
1366 if (CallConv
== CallingConv::AMDGPU_PS
&&
1367 !Arg
->Flags
.isInReg() && !Arg
->Flags
.isByVal() && PSInputNum
<= 15) {
1369 bool SkipArg
= !Arg
->Used
&& !Info
->isPSInputAllocated(PSInputNum
);
1371 // Inconveniently only the first part of the split is marked as isSplit,
1372 // so skip to the end. We only want to increment PSInputNum once for the
1373 // entire split argument.
1374 if (Arg
->Flags
.isSplit()) {
1375 while (!Arg
->Flags
.isSplitEnd()) {
1376 assert(!Arg
->VT
.isVector() &&
1377 "unexpected vector split in ps argument type");
1379 Splits
.push_back(*Arg
);
1385 // We can safely skip PS inputs.
1386 Skipped
.set(Arg
->getOrigArgIndex());
1391 Info
->markPSInputAllocated(PSInputNum
);
1393 Info
->markPSInputEnabled(PSInputNum
);
1398 Splits
.push_back(*Arg
);
1402 // Allocate special inputs passed in VGPRs.
1403 static void allocateSpecialEntryInputVGPRs(CCState
&CCInfo
,
1404 MachineFunction
&MF
,
1405 const SIRegisterInfo
&TRI
,
1406 SIMachineFunctionInfo
&Info
) {
1407 if (Info
.hasWorkItemIDX()) {
1408 unsigned Reg
= AMDGPU::VGPR0
;
1409 MF
.addLiveIn(Reg
, &AMDGPU::VGPR_32RegClass
);
1411 CCInfo
.AllocateReg(Reg
);
1412 Info
.setWorkItemIDX(ArgDescriptor::createRegister(Reg
));
1415 if (Info
.hasWorkItemIDY()) {
1416 unsigned Reg
= AMDGPU::VGPR1
;
1417 MF
.addLiveIn(Reg
, &AMDGPU::VGPR_32RegClass
);
1419 CCInfo
.AllocateReg(Reg
);
1420 Info
.setWorkItemIDY(ArgDescriptor::createRegister(Reg
));
1423 if (Info
.hasWorkItemIDZ()) {
1424 unsigned Reg
= AMDGPU::VGPR2
;
1425 MF
.addLiveIn(Reg
, &AMDGPU::VGPR_32RegClass
);
1427 CCInfo
.AllocateReg(Reg
);
1428 Info
.setWorkItemIDZ(ArgDescriptor::createRegister(Reg
));
1432 // Try to allocate a VGPR at the end of the argument list, or if no argument
1433 // VGPRs are left allocating a stack slot.
1434 static ArgDescriptor
allocateVGPR32Input(CCState
&CCInfo
) {
1435 ArrayRef
<MCPhysReg
> ArgVGPRs
1436 = makeArrayRef(AMDGPU::VGPR_32RegClass
.begin(), 32);
1437 unsigned RegIdx
= CCInfo
.getFirstUnallocated(ArgVGPRs
);
1438 if (RegIdx
== ArgVGPRs
.size()) {
1439 // Spill to stack required.
1440 int64_t Offset
= CCInfo
.AllocateStack(4, 4);
1442 return ArgDescriptor::createStack(Offset
);
1445 unsigned Reg
= ArgVGPRs
[RegIdx
];
1446 Reg
= CCInfo
.AllocateReg(Reg
);
1447 assert(Reg
!= AMDGPU::NoRegister
);
1449 MachineFunction
&MF
= CCInfo
.getMachineFunction();
1450 MF
.addLiveIn(Reg
, &AMDGPU::VGPR_32RegClass
);
1451 return ArgDescriptor::createRegister(Reg
);
1454 static ArgDescriptor
allocateSGPR32InputImpl(CCState
&CCInfo
,
1455 const TargetRegisterClass
*RC
,
1456 unsigned NumArgRegs
) {
1457 ArrayRef
<MCPhysReg
> ArgSGPRs
= makeArrayRef(RC
->begin(), 32);
1458 unsigned RegIdx
= CCInfo
.getFirstUnallocated(ArgSGPRs
);
1459 if (RegIdx
== ArgSGPRs
.size())
1460 report_fatal_error("ran out of SGPRs for arguments");
1462 unsigned Reg
= ArgSGPRs
[RegIdx
];
1463 Reg
= CCInfo
.AllocateReg(Reg
);
1464 assert(Reg
!= AMDGPU::NoRegister
);
1466 MachineFunction
&MF
= CCInfo
.getMachineFunction();
1467 MF
.addLiveIn(Reg
, RC
);
1468 return ArgDescriptor::createRegister(Reg
);
1471 static ArgDescriptor
allocateSGPR32Input(CCState
&CCInfo
) {
1472 return allocateSGPR32InputImpl(CCInfo
, &AMDGPU::SGPR_32RegClass
, 32);
1475 static ArgDescriptor
allocateSGPR64Input(CCState
&CCInfo
) {
1476 return allocateSGPR32InputImpl(CCInfo
, &AMDGPU::SGPR_64RegClass
, 16);
1479 static void allocateSpecialInputVGPRs(CCState
&CCInfo
,
1480 MachineFunction
&MF
,
1481 const SIRegisterInfo
&TRI
,
1482 SIMachineFunctionInfo
&Info
) {
1483 if (Info
.hasWorkItemIDX())
1484 Info
.setWorkItemIDX(allocateVGPR32Input(CCInfo
));
1486 if (Info
.hasWorkItemIDY())
1487 Info
.setWorkItemIDY(allocateVGPR32Input(CCInfo
));
1489 if (Info
.hasWorkItemIDZ())
1490 Info
.setWorkItemIDZ(allocateVGPR32Input(CCInfo
));
1493 static void allocateSpecialInputSGPRs(CCState
&CCInfo
,
1494 MachineFunction
&MF
,
1495 const SIRegisterInfo
&TRI
,
1496 SIMachineFunctionInfo
&Info
) {
1497 auto &ArgInfo
= Info
.getArgInfo();
1499 // TODO: Unify handling with private memory pointers.
1501 if (Info
.hasDispatchPtr())
1502 ArgInfo
.DispatchPtr
= allocateSGPR64Input(CCInfo
);
1504 if (Info
.hasQueuePtr())
1505 ArgInfo
.QueuePtr
= allocateSGPR64Input(CCInfo
);
1507 if (Info
.hasKernargSegmentPtr())
1508 ArgInfo
.KernargSegmentPtr
= allocateSGPR64Input(CCInfo
);
1510 if (Info
.hasDispatchID())
1511 ArgInfo
.DispatchID
= allocateSGPR64Input(CCInfo
);
1513 // flat_scratch_init is not applicable for non-kernel functions.
1515 if (Info
.hasWorkGroupIDX())
1516 ArgInfo
.WorkGroupIDX
= allocateSGPR32Input(CCInfo
);
1518 if (Info
.hasWorkGroupIDY())
1519 ArgInfo
.WorkGroupIDY
= allocateSGPR32Input(CCInfo
);
1521 if (Info
.hasWorkGroupIDZ())
1522 ArgInfo
.WorkGroupIDZ
= allocateSGPR32Input(CCInfo
);
1524 if (Info
.hasImplicitArgPtr())
1525 ArgInfo
.ImplicitArgPtr
= allocateSGPR64Input(CCInfo
);
1528 // Allocate special inputs passed in user SGPRs.
1529 static void allocateHSAUserSGPRs(CCState
&CCInfo
,
1530 MachineFunction
&MF
,
1531 const SIRegisterInfo
&TRI
,
1532 SIMachineFunctionInfo
&Info
) {
1533 if (Info
.hasImplicitBufferPtr()) {
1534 unsigned ImplicitBufferPtrReg
= Info
.addImplicitBufferPtr(TRI
);
1535 MF
.addLiveIn(ImplicitBufferPtrReg
, &AMDGPU::SGPR_64RegClass
);
1536 CCInfo
.AllocateReg(ImplicitBufferPtrReg
);
1539 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1540 if (Info
.hasPrivateSegmentBuffer()) {
1541 unsigned PrivateSegmentBufferReg
= Info
.addPrivateSegmentBuffer(TRI
);
1542 MF
.addLiveIn(PrivateSegmentBufferReg
, &AMDGPU::SGPR_128RegClass
);
1543 CCInfo
.AllocateReg(PrivateSegmentBufferReg
);
1546 if (Info
.hasDispatchPtr()) {
1547 unsigned DispatchPtrReg
= Info
.addDispatchPtr(TRI
);
1548 MF
.addLiveIn(DispatchPtrReg
, &AMDGPU::SGPR_64RegClass
);
1549 CCInfo
.AllocateReg(DispatchPtrReg
);
1552 if (Info
.hasQueuePtr()) {
1553 unsigned QueuePtrReg
= Info
.addQueuePtr(TRI
);
1554 MF
.addLiveIn(QueuePtrReg
, &AMDGPU::SGPR_64RegClass
);
1555 CCInfo
.AllocateReg(QueuePtrReg
);
1558 if (Info
.hasKernargSegmentPtr()) {
1559 unsigned InputPtrReg
= Info
.addKernargSegmentPtr(TRI
);
1560 MF
.addLiveIn(InputPtrReg
, &AMDGPU::SGPR_64RegClass
);
1561 CCInfo
.AllocateReg(InputPtrReg
);
1564 if (Info
.hasDispatchID()) {
1565 unsigned DispatchIDReg
= Info
.addDispatchID(TRI
);
1566 MF
.addLiveIn(DispatchIDReg
, &AMDGPU::SGPR_64RegClass
);
1567 CCInfo
.AllocateReg(DispatchIDReg
);
1570 if (Info
.hasFlatScratchInit()) {
1571 unsigned FlatScratchInitReg
= Info
.addFlatScratchInit(TRI
);
1572 MF
.addLiveIn(FlatScratchInitReg
, &AMDGPU::SGPR_64RegClass
);
1573 CCInfo
.AllocateReg(FlatScratchInitReg
);
1576 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1577 // these from the dispatch pointer.
// Allocate special input registers that are initialized per-wave.
static void allocateSystemSGPRs(CCState &CCInfo,
                                MachineFunction &MF,
                                SIMachineFunctionInfo &Info,
                                CallingConv::ID CallConv,
                                bool IsShader) {
  if (Info.hasWorkGroupIDX()) {
    unsigned Reg = Info.addWorkGroupIDX();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDY()) {
    unsigned Reg = Info.addWorkGroupIDY();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDZ()) {
    unsigned Reg = Info.addWorkGroupIDZ();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupInfo()) {
    unsigned Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (IsShader) {
      PrivateSegmentWaveByteOffsetReg =
        Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      // This is true if the scratch wave byte offset doesn't have a fixed
      // location.
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      }
    } else
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }
}

static void reservePrivateMemoryRegs(const TargetMachine &TM,
                                     MachineFunction &MF,
                                     const SIRegisterInfo &TRI,
                                     SIMachineFunctionInfo &Info) {
  // Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool HasStackObjects = MFI.hasStackObjects();

  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

  // Everything live out of a block is spilled with fast regalloc, so it's
  // almost certain that spilling will be required.
  if (TM.getOptLevel() == CodeGenOpt::None)
    HasStackObjects = true;

  // For now assume stack access is needed in any callee functions, so we need
  // the scratch registers to pass in.
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.isAmdHsaOrMesa(MF.getFunction())) {
    if (RequiresStackAccess) {
      // If we have stack objects, we unquestionably need the private buffer
      // resource. For the Code Object V2 ABI, this will be the first 4 user
      // SGPR inputs. We can reserve those and use them directly.

      unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      if (MFI.hasCalls()) {
        // If we have calls, we need to keep the frame register in a register
        // that won't be clobbered by a call, so ensure it is copied somewhere.

        // This is not a problem for the scratch wave offset, because the same
        // registers are reserved in all functions.

        // FIXME: Nothing is really ensuring this is a call preserved register,
        // it's just selected from the end so it happens to be.
        unsigned ReservedOffsetReg
          = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
        Info.setScratchWaveOffsetReg(ReservedOffsetReg);
      } else {
        unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
          AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
        Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
      }
    } else {
      unsigned ReservedBufferReg
        = TRI.reservedPrivateSegmentBufferReg(MF);
      unsigned ReservedOffsetReg
        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);

      // We tentatively reserve the last registers (skipping the last two
      // which may contain VCC). After register allocation, we'll replace
      // these with the ones immediately after those which were really
      // allocated. In the prologue copies will be inserted from the argument
      // to these reserved registers.
      Info.setScratchRSrcReg(ReservedBufferReg);
      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  } else {
    unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

    // Without HSA, relocations are used for the scratch pointer and the
    // buffer resource setup is always inserted in the prologue. Scratch wave
    // offset is still in an input SGPR.
    Info.setScratchRSrcReg(ReservedBufferReg);

    if (HasStackObjects && !MFI.hasCalls()) {
      unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
    } else {
      unsigned ReservedOffsetReg
        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  }
}

bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  return !Info->isEntryFunction();
}

void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
}

void SITargetLowering::insertCopiesSplitCSR(
  MachineBasicBlock *Entry,
  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
      .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
        .addReg(NewVR);
  }
}

SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &Fn = MF.getFunction();
  FunctionType *FType = MF.getFunction().getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
    DiagnosticInfoUnsupported NoGraphicsHSA(
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    DAG.getContext()->diagnose(NoGraphicsHSA);
    return DAG.getEntryNode();
  }

  // Create stack objects that are used for emitting debugger prologue if
  // "amdgpu-debugger-emit-prologue" attribute was specified.
  if (ST.debuggerEmitPrologue())
    createDebuggerPrologueStackObjects(MF);

  SmallVector<ISD::InputArg, 16> Splits;
  SmallVector<CCValAssign, 16> ArgLocs;
  BitVector Skipped(Ins.size());
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  bool IsShader = AMDGPU::isShader(CallConv);
  bool IsKernel = AMDGPU::isKernel(CallConv);
  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);

  if (!IsEntryFunc) {
    // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
    // this when allocating argument fixed offsets.
    CCInfo.AllocateStack(4, 4);
  }

  if (IsShader) {
    processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);

    // At least one interpolation mode must be enabled or else the GPU will
    // hang.
    //
    // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
    // set PSInputAddr, the user wants to enable some bits after the compilation
    // based on run-time states. Since we can't know what the final PSInputEna
    // will look like, we shouldn't do anything here and the user should take
    // responsibility for the correct programming.
    //
    // Otherwise, the following restrictions apply:
    // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
    // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
    //   enabled too.
    if (CallConv == CallingConv::AMDGPU_PS) {
      if ((Info->getPSInputAddr() & 0x7F) == 0 ||
          ((Info->getPSInputAddr() & 0xF) == 0 &&
           Info->isPSInputAllocated(11))) {
        CCInfo.AllocateReg(AMDGPU::VGPR0);
        CCInfo.AllocateReg(AMDGPU::VGPR1);
        Info->markPSInputAllocated(0);
        Info->markPSInputEnabled(0);
      }
      if (Subtarget->isAmdPalOS()) {
        // For isAmdPalOS, the user does not enable some bits after compilation
        // based on run-time states; the register values being generated here are
        // the final ones set in hardware. Therefore we need to apply the
        // workaround to PSInputAddr and PSInputEnable together. (The case where
        // a bit is set in PSInputAddr but not PSInputEnable is where the
        // frontend set up an input arg for a particular interpolation mode, but
        // nothing uses that input arg. Really we should have an earlier pass
        // that removes such an arg.)
        unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
        if ((PsInputBits & 0x7F) == 0 ||
            ((PsInputBits & 0xF) == 0 &&
             (PsInputBits >> 11 & 1)))
          Info->markPSInputEnabled(
              countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
      }
    }

    assert(!Info->hasDispatchPtr() &&
           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
           !Info->hasWorkItemIDZ());
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
  } else {
    Splits.append(Ins.begin(), Ins.end());
  }

  if (IsEntryFunc) {
    allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
    allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
  }

  if (IsKernel) {
    analyzeFormalArgumentsCompute(CCInfo, Ins);
  } else {
    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
    CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
  }

  SmallVector<SDValue, 16> Chains;

  // FIXME: This is the minimum kernel argument alignment. We should improve
  // this to the maximum alignment of the arguments.
  //
  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
  // kern arg offset.
  const unsigned KernelArgBaseAlign = 16;

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];
    if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    MVT VT = VA.getLocVT();

    if (IsEntryFunc && VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = VA.getLocVT();

      const uint64_t Offset = VA.getLocMemOffset();
      unsigned Align = MinAlign(KernelArgBaseAlign, Offset);

      SDValue Arg = lowerKernargMemParameter(
        DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
      Chains.push_back(Arg.getValue(1));

      auto *ParamTy =
        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
      if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
        // On SI local pointers are just offsets into LDS, so they are always
        // less than 16-bits. On CI and newer they could potentially be
        // real pointers, so we can't guarantee their size.
        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
                          DAG.getValueType(MVT::i16));
      }

      InVals.push_back(Arg);
      continue;
    } else if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
      InVals.push_back(Val);
      if (!Arg.Flags.isByVal())
        Chains.push_back(Val.getValue(1));
      continue;
    }

    assert(VA.isRegLoc() && "Parameter must be in a register!");

    unsigned Reg = VA.getLocReg();
    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
    EVT ValVT = VA.getValVT();

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
      // The return object should be reasonably addressable.

      // FIXME: This helps when the return is a real sret. If it is a
      // automatically inserted sret (i.e. CanLowerReturn returns false), an
      // extra copy is inserted in SelectionDAGBuilder which obscures this.
      unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
        DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
    }

    // If this is an 8 or 16-bit value, it is really passed promoted
    // to 32 bits. Insert an assert[sz]ext to capture this, then
    // truncate to the right size.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
                        DAG.getValueType(ValVT));
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
                        DAG.getValueType(ValVT));
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Val);
  }

  // Special inputs come after user arguments.
  if (!IsEntryFunc)
    allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
    CCInfo.AllocateReg(Info->getFrameOffsetReg());
    allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  auto &ArgUsageInfo =
    DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());

  unsigned StackArgSize = CCInfo.getNextStackOffset();
  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain :
    DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

// TODO: If return values can't fit in registers, we should return as many as
// possible in registers before passing on stack.
bool SITargetLowering::CanLowerReturn(
  CallingConv::ID CallConv,
  MachineFunction &MF, bool IsVarArg,
  const SmallVectorImpl<ISD::OutputArg> &Outs,
  LLVMContext &Context) const {
  // Replacing returns with sret/stack usage doesn't make sense for shaders.
  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
  // for shaders. Vector types should be explicitly handled by CC.
  if (AMDGPU::isEntryFunctionCC(CallConv))
    return true;

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
}

SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (AMDGPU::isKernel(CallConv)) {
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);
  }

  bool IsShader = AMDGPU::isShader(CallConv);

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;
  SmallVector<ISD::OutputArg, 48> Splits;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Flag;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

  // Add return address for callable functions.
  if (!Info->isEntryFunction()) {
    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    SDValue ReturnAddrReg = CreateLiveInRegister(
      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);

    // FIXME: Should be able to use a vreg here, but need a way to prevent it
    // from being allocated to a CSR.

    SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
                                                MVT::i64);

    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
    Flag = Chain.getValue(1);

    RetOps.push_back(PhysReturnAddrReg);
  }

  // Copy the result values into the output registers.
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");
    // TODO: Partially return in registers if return values don't fit.
    SDValue Arg = OutVals[RealRVLocIdx];

    // Copied from other backends.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // FIXME: Does sret work properly?
  if (!Info->isEntryFunction()) {
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
    if (I) {
      for (; *I; ++I) {
        if (AMDGPU::SReg_64RegClass.contains(*I))
          RetOps.push_back(DAG.getRegister(*I, MVT::i64));
        else if (AMDGPU::SReg_32RegClass.contains(*I))
          RetOps.push_back(DAG.getRegister(*I, MVT::i32));
        else
          llvm_unreachable("Unexpected register class in CSRsViaCopy!");
      }
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  unsigned Opc = AMDGPUISD::ENDPGM;
  if (!IsWaveEnd)
    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}

SDValue SITargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    SDValue ThisVal) const {
  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];
    SDValue Val;

    if (VA.isRegLoc()) {
      Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    } else if (VA.isMemLoc()) {
      report_fatal_error("TODO: return values in memory");
    } else
      llvm_unreachable("unknown argument location type");

    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Val);
  }

  return Chain;
}

// Add code to pass special inputs required depending on used features separate
// from the explicit user arguments present in the IR.
void SITargetLowering::passSpecialInputs(
    CallLoweringInfo &CLI,
    CCState &CCInfo,
    const SIMachineFunctionInfo &Info,
    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
    SmallVectorImpl<SDValue> &MemOpChains,
    SDValue Chain) const {
  // If we don't have a call site, this was a call inserted by
  // legalization. These can never use special inputs.
  if (!CLI.CS)
    return;

  const Function *CalleeFunc = CLI.CS.getCalledFunction();
  assert(CalleeFunc);

  SelectionDAG &DAG = CLI.DAG;
  const SDLoc &DL = CLI.DL;

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  auto &ArgUsageInfo =
    DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
  const AMDGPUFunctionArgInfo &CalleeArgInfo
    = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
    AMDGPUFunctionArgInfo::WORKITEM_ID_X,
    AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
    AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
  };

  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;

    std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC)
      = CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    // All special arguments are ints for now.
    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
    SDValue InputReg;

    if (IncomingArg) {
      InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
    } else {
      // The implicit arg ptr is special because it doesn't have a corresponding
      // input for kernels, and is computed from the kernarg segment pointer.
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
      InputReg = getImplicitArgPtr(DAG, DL);
    }

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
    } else {
      unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
      SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
                                              SpecialArgOffset);
      MemOpChains.push_back(ArgStore);
    }
  }
}

static bool canGuaranteeTCO(CallingConv::ID CC) {
  return CC == CallingConv::Fast;
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::C:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}

bool SITargetLowering::isEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  // Kernels aren't callable, and don't have a live in return address so it
  // doesn't make sense to do a tail call with entry functions.
  if (!CallerPreserved)
    return false;

  bool CCMatch = CallerCC == CalleeCC;

  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // TODO: Can we handle var args?
  if (IsVarArg)
    return false;

  for (const Argument &Arg : CallerF.args()) {
    if (Arg.hasByValAttr())
      return false;
  }

  LLVMContext &Ctx = *DAG.getContext();

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
                                  CCAssignFnForCall(CalleeCC, IsVarArg),
                                  CCAssignFnForCall(CallerCC, IsVarArg)))
    return false;

  // The callee has to preserve all registers the caller needs to preserve.
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // Nothing more to check if the callee is taking no arguments.
  if (Outs.empty())
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made tail.
  // TODO: Is this really necessary?
  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
    return false;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
}

bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!CI->isTailCall())
    return false;

  const Function *ParentFn = CI->getParent()->getParent();
  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
    return false;

  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
  return (Attr.getValueAsString() != "true");
}

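// Explanatory note (added by the editor, not part of the original source):
// a `tail call` in a non-entry function is only a candidate here when its
// parent function does not carry the "disable-tail-calls"="true" attribute;
// entry functions (kernels) are rejected unconditionally above because they
// are not callable and have no live-in return address.
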
// The wave scratch offset register is used as the global base pointer.
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                                    SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  const SDLoc &DL = CLI.DL;
  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &IsTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool IsVarArg = CLI.IsVarArg;
  bool IsSibCall = false;
  bool IsThisReturn = false;
  MachineFunction &MF = DAG.getMachineFunction();

  if (IsVarArg) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported call to variadic function ");
  }

  if (!CLI.CS.getInstruction())
    report_fatal_error("unsupported libcall legalization");

  if (!CLI.CS.getCalledFunction()) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported indirect call to function ");
  }

  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported required tail call to function ");
  }

  if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
    // Note the issue is with the CC of the calling function, not of the call
    // itself.
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported call from graphics shader of function ");
  }

  // The first 4 bytes are reserved for the callee's emergency stack slot.
  if (IsTailCall) {
    IsTailCall = isEligibleForTailCallOptimization(
      Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
    if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
      report_fatal_error("failed to perform tail call elimination on a call "
                         "site marked musttail");
    }

    bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;

    // A sibling call is one where we're under the usual C ABI and not planning
    // to change that but can still do a tail call:
    if (!TailCallOpt && IsTailCall)
      IsSibCall = true;

    if (IsTailCall)
      ++NumTailCalls;
  }

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);

  // The first 4 bytes are reserved for the callee's emergency stack slot.
  CCInfo.AllocateStack(4, 4);

  CCInfo.AnalyzeCallOperands(Outs, AssignFn);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  if (IsSibCall) {
    // Since we're not changing the ABI to make this a tail call, the memory
    // operands are already available in the caller's incoming argument space.
    NumBytes = 0;
  }

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0. Completely unused for non-tail calls.
  int32_t FPDiff = 0;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

  SDValue CallerSavedFP;

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  if (!IsSibCall) {
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);

    unsigned OffsetReg = Info->getScratchWaveOffsetReg();

    // In the HSA case, this should be an identity copy.
    SDValue ScratchRSrcReg
      = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
    RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);

    // TODO: Don't hardcode these registers and get from the callee function.
    SDValue ScratchWaveOffsetReg
      = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
    RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);

    if (!Info->isEntryFunction()) {
      // Avoid clobbering this function's FP value. In the current convention
      // callee will overwrite this, so do save/restore around the call site.
      CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
                                         Info->getFrameOffsetReg(), MVT::i32);
    }
  }

  SmallVector<SDValue, 8> MemOpChains;
  MVT PtrVT = MVT::i32;

  // Walk the register/memloc assignments, inserting copies/loads.
  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
       ++i, ++realArgIdx) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[realArgIdx];

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::FPExt:
      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      assert(VA.isMemLoc());

      SDValue DstAddr;
      MachinePointerInfo DstInfo;

      unsigned LocMemOffset = VA.getLocMemOffset();
      int32_t Offset = LocMemOffset;

      SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
      unsigned Align = 0;

      if (IsTailCall) {
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
        unsigned OpSize = Flags.isByVal() ?
          Flags.getByValSize() : VA.getValVT().getStoreSize();

        // FIXME: We can have better than the minimum byval required alignment.
        Align = Flags.isByVal() ? Flags.getByValAlign() :
          MinAlign(Subtarget->getStackAlignment(), Offset);

        Offset = Offset + FPDiff;
        int FI = MFI.CreateFixedObject(OpSize, Offset, true);

        DstAddr = DAG.getFrameIndex(FI, PtrVT);
        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);

        // Make sure any stack arguments overlapping with where we're storing
        // are loaded before this eventual operation. Otherwise they'll be
        // clobbered.

        // FIXME: Why is this really necessary? This seems to just result in a
        // lot of code to copy the stack and write them back to the same
        // locations, which are supposed to be immutable?
        Chain = addTokenForArgument(Chain, DAG, MFI, FI);
      } else {
        DstAddr = PtrOff;
        DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
        Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
      }

      if (Outs[i].Flags.isByVal()) {
        SDValue SizeNode =
          DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
        SDValue Cpy = DAG.getMemcpy(
          Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
          /*isVol = */ false, /*AlwaysInline = */ true,
          /*isTailCall = */ false, DstInfo,
          MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
            *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));

        MemOpChains.push_back(Cpy);
      } else {
        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
        MemOpChains.push_back(Store);
      }
    }
  }

  // Copy special input registers after user input arguments.
  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (auto &RegToPass : RegsToPass) {
    Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
                             RegToPass.second, InFlag);
    InFlag = Chain.getValue(1);
  }

  SDValue PhysReturnAddrReg;
  if (IsTailCall) {
    // Since the return is being combined with the call, we need to pass on the
    // return address.

    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    SDValue ReturnAddrReg = CreateLiveInRegister(
      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);

    PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
                                        MVT::i64);
    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
    InFlag = Chain.getValue(1);
  }

  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call, however in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll be
  // in the correct location.
  if (IsTailCall && !IsSibCall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getTargetConstant(NumBytes, DL, MVT::i32),
                               DAG.getTargetConstant(0, DL, MVT::i32),
                               InFlag, DL);
    InFlag = Chain.getValue(1);
  }

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsTailCall) {
    // Each tail call may have to adjust the stack by a different amount, so
    // this information must travel along with the operation for eventual
    // consumption by emitEpilogue.
    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));

    Ops.push_back(PhysReturnAddrReg);
  }

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (auto &RegToPass : RegsToPass) {
    Ops.push_back(DAG.getRegister(RegToPass.first,
                                  RegToPass.second.getValueType()));
  }

  // Add a register mask operand representing the call-preserved registers.

  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
  if (IsTailCall) {
    MFI.setHasTailCall();
    return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
  }

  // Returns a chain and a flag for retval copy to use.
  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
  Chain = Call.getValue(0);
  InFlag = Call.getValue(1);

  if (CallerSavedFP) {
    SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
    Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
    InFlag = Chain.getValue(1);
  }

  uint64_t CalleePopBytes = NumBytes;
  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
                             DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
                             InFlag, DL);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
                         InVals, IsThisReturn,
                         IsThisReturn ? OutVals[0] : SDValue());
}

unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                             SelectionDAG &DAG) const {
  unsigned Reg = StringSwitch<unsigned>(RegName)
    .Case("m0", AMDGPU::M0)
    .Case("exec", AMDGPU::EXEC)
    .Case("exec_lo", AMDGPU::EXEC_LO)
    .Case("exec_hi", AMDGPU::EXEC_HI)
    .Case("flat_scratch", AMDGPU::FLAT_SCR)
    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    .Default(AMDGPU::NoRegister);

  if (Reg == AMDGPU::NoRegister) {
    report_fatal_error(Twine("invalid register name \""
                             + StringRef(RegName)  + "\"."));
  }

  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    report_fatal_error(Twine("invalid register \""
                             + StringRef(RegName)  + "\" for subtarget."));
  }

  switch (Reg) {
  case AMDGPU::M0:
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
    if (VT.getSizeInBits() == 32)
      return Reg;
    break;
  case AMDGPU::EXEC:
  case AMDGPU::FLAT_SCR:
    if (VT.getSizeInBits() == 64)
      return Reg;
    break;
  default:
    llvm_unreachable("missing register type checking");
  }

  report_fatal_error(Twine("invalid type for register \""
                           + StringRef(RegName) + "\"."));
}

// If kill is not the last instruction, split the block so kill is always a
// proper terminator.
MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
                                                    MachineBasicBlock *BB) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineBasicBlock::iterator SplitPoint(&MI);
  ++SplitPoint;

  if (SplitPoint == BB->end()) {
    // Don't bother with a new block.
    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
    return BB;
  }

  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *SplitBB
    = MF->CreateMachineBasicBlock(BB->getBasicBlock());

  MF->insert(++MachineFunction::iterator(BB), SplitBB);
  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());

  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(SplitBB);

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
  return SplitBB;
}

// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
// wavefront. If the value is uniform and just happens to be in a VGPR, this
// will only do one iteration. In the worst case, this will loop 64 times.
//
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
  const SIInstrInfo *TII,
  MachineRegisterInfo &MRI,
  MachineBasicBlock &OrigBB,
  MachineBasicBlock &LoopBB,
  const DebugLoc &DL,
  const MachineOperand &IdxReg,
  unsigned InitReg,
  unsigned ResultReg,
  unsigned PhiReg,
  unsigned InitSaveExecReg,
  int Offset,
  bool UseGPRIdxMode,
  bool IsIndirectSrc) {
  MachineBasicBlock::iterator I = LoopBB.begin();

  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
    .addReg(InitReg)
    .addMBB(&OrigBB)
    .addReg(ResultReg)
    .addMBB(&LoopBB);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&OrigBB)
    .addReg(NewExec)
    .addMBB(&LoopBB);

  // Read the next variant <- also loop target.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Compare the just read M0 value to all possible Idx values.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
    .addReg(CurrentIdxReg)
    .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());

  // Update EXEC, save the original EXEC value to VCC.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
    unsigned IdxReg;
    if (Offset == 0) {
      IdxReg = CurrentIdxReg;
    } else {
      IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
        .addReg(CurrentIdxReg, RegState::Kill)
        .addImm(Offset);
    }
    unsigned IdxMode = IsIndirectSrc ?
      VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
    MachineInstr *SetOn =
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
      .addReg(IdxReg, RegState::Kill)
      .addImm(IdxMode);
    SetOn->getOperand(3).setIsUndef();
  } else {
    // Move index from VCC into M0
    if (Offset == 0) {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(CurrentIdxReg, RegState::Kill);
    } else {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(CurrentIdxReg, RegState::Kill)
        .addImm(Offset);
    }
  }

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  MachineInstr *InsertPt =
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);

  return InsertPt->getIterator();
}

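// Explanatory sketch (added by the editor, not part of the original source)
// of the waterfall loop built above, in approximate assembly for the movrel
// path with a zero offset:
//
//   loop:
//     v_readfirstlane_b32 s_idx, v_idx
//     v_cmp_eq_u32_e64    s[cc:cc+1], s_idx, v_idx
//     s_and_saveexec_b64  s[live:live+1], s[cc:cc+1]
//     s_mov_b32           m0, s_idx
//     ; the caller inserts the indexed move at the returned iterator here
//     s_xor_b64           exec, exec, s[live:live+1]
//     s_cbranch_execnz    loop
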
// This has slightly sub-optimal regalloc when the source vector is killed by
// the read. The register allocator does not understand that the kill is
// per-workitem, so is kept alive for the whole loop so we end up not re-using a
// subregister from it, using 1 more VGPR than necessary. This was saved when
// this was expanded after register allocation.
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
                                                  MachineBasicBlock &MBB,
                                                  MachineInstr &MI,
                                                  unsigned InitResultReg,
                                                  unsigned PhiReg,
                                                  int Offset,
                                                  bool UseGPRIdxMode,
                                                  bool IsIndirectSrc) {
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  unsigned DstReg = MI.getOperand(0).getReg();
  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
    .addReg(AMDGPU::EXEC);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, IsIndirectSrc);

  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    .addReg(SaveExec);

  return InsPt;
}

// Returns subreg index, offset
static std::pair<unsigned, int>
computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
                            const TargetRegisterClass *SuperRC,
                            unsigned VecReg,
                            int Offset) {
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (Offset >= NumElts || Offset < 0)
    return std::make_pair(AMDGPU::sub0, Offset);

  return std::make_pair(AMDGPU::sub0 + Offset, 0);
}

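// Worked example (added by the editor, not part of the original source): for
// a 128-bit register class NumElts is 4, so a constant offset of 2 is folded
// into the subregister and (AMDGPU::sub2, 0) is returned, while an
// out-of-range offset of 5 is left to the dynamic index and (AMDGPU::sub0, 5)
// is returned instead.
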
// Return true if the index is an SGPR and was set.
static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
                                 MachineRegisterInfo &MRI,
                                 MachineInstr &MI,
                                 int Offset,
                                 bool UseGPRIdxMode,
                                 bool IsIndirectSrc) {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());

  assert(Idx->getReg() != AMDGPU::NoRegister);

  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
    return false;

  if (UseGPRIdxMode) {
    unsigned IdxMode = IsIndirectSrc ?
      VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
    if (Offset == 0) {
      MachineInstr *SetOn =
        BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
        .add(*Idx)
        .addImm(IdxMode);

      SetOn->getOperand(3).setIsUndef();
    } else {
      unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
        .add(*Idx)
        .addImm(Offset);
      MachineInstr *SetOn =
        BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
        .addReg(Tmp, RegState::Kill)
        .addImm(IdxMode);

      SetOn->getOperand(3).setIsUndef();
    }

    return true;
  }

  if (Offset == 0) {
    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .add(*Idx);
  } else {
    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
      .add(*Idx)
      .addImm(Offset);
  }

  return true;
}

// Control flow needs to be inserted if indexing with a VGPR.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const GCNSubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);

  unsigned SubReg;
  std::tie(SubReg, Offset)
    = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);

  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);

  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      // TODO: Look at the uses to avoid the copy. This may require rescheduling
      // to avoid interfering with other uses, so probably requires a new
      // optimization pass.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
        .addReg(SrcReg, RegState::Undef, SubReg)
        .addReg(SrcReg, RegState::Implicit)
        .addReg(AMDGPU::M0, RegState::Implicit);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
        .addReg(SrcReg, RegState::Undef, SubReg)
        .addReg(SrcReg, RegState::Implicit);
    }

    MI.eraseFromParent();

    return &MBB;
  }

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);

  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
                              Offset, UseGPRIdxMode, true);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  if (UseGPRIdxMode) {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
      .addReg(SrcReg, RegState::Undef, SubReg)
      .addReg(SrcReg, RegState::Implicit)
      .addReg(AMDGPU::M0, RegState::Implicit);
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
  } else {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
      .addReg(SrcReg, RegState::Undef, SubReg)
      .addReg(SrcReg, RegState::Implicit);
  }

  MI.eraseFromParent();

  return LoopBB;
}

static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
                                 const TargetRegisterClass *VecRC) {
  switch (TRI.getRegSizeInBits(*VecRC)) {
  case 32: // 4 bytes
    return AMDGPU::V_MOVRELD_B32_V1;
  case 64: // 8 bytes
    return AMDGPU::V_MOVRELD_B32_V2;
  case 128: // 16 bytes
    return AMDGPU::V_MOVRELD_B32_V4;
  case 256: // 32 bytes
    return AMDGPU::V_MOVRELD_B32_V8;
  case 512: // 64 bytes
    return AMDGPU::V_MOVRELD_B32_V16;
  default:
    llvm_unreachable("unsupported size for MOVRELD pseudos");
  }
}

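// Explanatory note (added by the editor, not part of the original source):
// the pseudo is selected purely by vector width, e.g. a 128-bit vector
// (four 32-bit lanes) maps to AMDGPU::V_MOVRELD_B32_V4.
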
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const GCNSubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());

  // This can be an immediate, but will be folded later.
  assert(Val->getReg());

  unsigned SubReg;
  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
                                                         SrcVec->getReg(),
                                                         Offset);
  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);

  if (Idx->getReg() == AMDGPU::NoRegister) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    assert(Offset == 0);

    BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
      .add(*SrcVec)
      .add(*Val)
      .addImm(SubReg);

    MI.eraseFromParent();
    return &MBB;
  }

  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
        .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
        .add(*Val)
        .addReg(Dst, RegState::ImplicitDefine)
        .addReg(SrcVec->getReg(), RegState::Implicit)
        .addReg(AMDGPU::M0, RegState::Implicit);

      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    } else {
      const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));

      BuildMI(MBB, I, DL, MovRelDesc)
        .addReg(Dst, RegState::Define)
        .addReg(SrcVec->getReg())
        .add(*Val)
        .addImm(SubReg - AMDGPU::sub0);
    }

    MI.eraseFromParent();
    return &MBB;
  }

  if (Val->isReg())
    MRI.clearKillFlags(Val->getReg());

  const DebugLoc &DL = MI.getDebugLoc();

  unsigned PhiReg = MRI.createVirtualRegister(VecRC);

  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
                              Offset, UseGPRIdxMode, false);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  if (UseGPRIdxMode) {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
      .addReg(PhiReg, RegState::Undef, SubReg) // vdst
      .add(*Val)
      .addReg(Dst, RegState::ImplicitDefine)
      .addReg(PhiReg, RegState::Implicit)
      .addReg(AMDGPU::M0, RegState::Implicit);
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
  } else {
    const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));

    BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
      .addReg(Dst, RegState::Define)
      .addReg(PhiReg)
      .add(*Val)
      .addImm(SubReg - AMDGPU::sub0);
  }

  MI.eraseFromParent();

  return LoopBB;
}

3179 MachineBasicBlock
*SITargetLowering::EmitInstrWithCustomInserter(
3180 MachineInstr
&MI
, MachineBasicBlock
*BB
) const {
3182 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3183 MachineFunction
*MF
= BB
->getParent();
3184 SIMachineFunctionInfo
*MFI
= MF
->getInfo
<SIMachineFunctionInfo
>();
3186 if (TII
->isMIMG(MI
)) {
3187 if (MI
.memoperands_empty() && MI
.mayLoadOrStore()) {
3188 report_fatal_error("missing mem operand from MIMG instruction");
3190 // Add a memoperand for mimg instructions so that they aren't assumed to
3191 // be ordered memory instuctions.
3196 switch (MI
.getOpcode()) {
3197 case AMDGPU::S_ADD_U64_PSEUDO
:
3198 case AMDGPU::S_SUB_U64_PSEUDO
: {
3199 MachineRegisterInfo
&MRI
= BB
->getParent()->getRegInfo();
3200 const DebugLoc
&DL
= MI
.getDebugLoc();
3202 MachineOperand
&Dest
= MI
.getOperand(0);
3203 MachineOperand
&Src0
= MI
.getOperand(1);
3204 MachineOperand
&Src1
= MI
.getOperand(2);
3206 unsigned DestSub0
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
3207 unsigned DestSub1
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
3209 MachineOperand Src0Sub0
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3210 Src0
, &AMDGPU::SReg_64RegClass
, AMDGPU::sub0
,
3211 &AMDGPU::SReg_32_XM0RegClass
);
3212 MachineOperand Src0Sub1
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3213 Src0
, &AMDGPU::SReg_64RegClass
, AMDGPU::sub1
,
3214 &AMDGPU::SReg_32_XM0RegClass
);
3216 MachineOperand Src1Sub0
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3217 Src1
, &AMDGPU::SReg_64RegClass
, AMDGPU::sub0
,
3218 &AMDGPU::SReg_32_XM0RegClass
);
3219 MachineOperand Src1Sub1
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3220 Src1
, &AMDGPU::SReg_64RegClass
, AMDGPU::sub1
,
3221 &AMDGPU::SReg_32_XM0RegClass
);
3223 bool IsAdd
= (MI
.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO
);
3225 unsigned LoOpc
= IsAdd
? AMDGPU::S_ADD_U32
: AMDGPU::S_SUB_U32
;
3226 unsigned HiOpc
= IsAdd
? AMDGPU::S_ADDC_U32
: AMDGPU::S_SUBB_U32
;
3227 BuildMI(*BB
, MI
, DL
, TII
->get(LoOpc
), DestSub0
)
3230 BuildMI(*BB
, MI
, DL
, TII
->get(HiOpc
), DestSub1
)
3233 BuildMI(*BB
, MI
, DL
, TII
->get(TargetOpcode::REG_SEQUENCE
), Dest
.getReg())
3235 .addImm(AMDGPU::sub0
)
3237 .addImm(AMDGPU::sub1
);
3238 MI
.eraseFromParent();
3241 case AMDGPU::SI_INIT_M0
: {
3242 BuildMI(*BB
, MI
.getIterator(), MI
.getDebugLoc(),
3243 TII
->get(AMDGPU::S_MOV_B32
), AMDGPU::M0
)
3244 .add(MI
.getOperand(0));
3245 MI
.eraseFromParent();
3248 case AMDGPU::SI_INIT_EXEC
:
3249 // This should be before all vector instructions.
3250 BuildMI(*BB
, &*BB
->begin(), MI
.getDebugLoc(), TII
->get(AMDGPU::S_MOV_B64
),
3252 .addImm(MI
.getOperand(0).getImm());
3253 MI
.eraseFromParent();
  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
    // Extract the thread count from an SGPR input and set EXEC accordingly.
    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
    //
    // S_BFE_U32 count, input, {shift, 7}
    // S_BFM_B64 exec, count, 0
    // S_CMP_EQ_U32 count, 64
    // S_CMOV_B64 exec, -1
    MachineInstr *FirstMI = &*BB->begin();
    MachineRegisterInfo &MRI = MF->getRegInfo();
    unsigned InputReg = MI.getOperand(0).getReg();
    unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);

    // Move the COPY of the input reg to the beginning, so that we can use it.
    for (auto I = BB->begin(); I != &MI; I++) {
      if (I->getOpcode() != TargetOpcode::COPY ||
          I->getOperand(0).getReg() != InputReg)
        continue;

      if (I == FirstMI) {
        FirstMI = &*++BB->begin();
      } else {
        I->removeFromParent();
        BB->insert(FirstMI, &*I);
      }
      break;
    }

    // This should be before all vector instructions.
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
        .addReg(InputReg)
        .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
            AMDGPU::EXEC)
        .addReg(CountReg)
        .addImm(0);
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
        .addReg(CountReg, RegState::Kill)
        .addImm(64);
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
            AMDGPU::EXEC)
        .addImm(-1);
    MI.eraseFromParent();
    return BB;
  }
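  // GET_GROUPSTATICSIZE materializes the statically known LDS size recorded in
  // SIMachineFunctionInfo as an s_mov_b32 immediate.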
  case AMDGPU::GET_GROUPSTATICSIZE: {
    DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
        .add(MI.getOperand(0))
        .addImm(MFI->getLDSSize());
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V16:
    return emitIndirectSrc(MI, *BB, *getSubtarget());
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V16:
    return emitIndirectDst(MI, *BB, *getSubtarget());
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
    return splitKillBlock(MI, BB);
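  // A 64-bit conditional move has no single hardware instruction: the pseudo
  // is split into two v_cndmask_b32_e64 on the sub0/sub1 halves and the
  // results are recombined with a REG_SEQUENCE.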
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

    unsigned Dst = MI.getOperand(0).getReg();
    unsigned Src0 = MI.getOperand(1).getReg();
    unsigned Src1 = MI.getOperand(2).getReg();
    const DebugLoc &DL = MI.getDebugLoc();
    unsigned SrcCond = MI.getOperand(3).getReg();

    unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

    BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
      .addReg(SrcCond);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
      .addReg(Src0, 0, AMDGPU::sub0)
      .addReg(Src1, 0, AMDGPU::sub0)
      .addReg(SrcCondCopy);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
      .addReg(Src0, 0, AMDGPU::sub1)
      .addReg(Src1, 0, AMDGPU::sub1)
      .addReg(SrcCondCopy);

    BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
      .addReg(DstLo)
      .addImm(AMDGPU::sub0)
      .addReg(DstHi)
      .addImm(AMDGPU::sub1);
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_BR_UNDEF: {
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    const DebugLoc &DL = MI.getDebugLoc();
    MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
                           .add(MI.getOperand(0));
    Br->getOperand(1).setIsUndef(true); // read undef SCC
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
    const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, &MI);

    // Add an implicit use of the frame offset reg to prevent the restore copy
    // inserted after the call from being reordered after stack operations in
    // the caller's frame.
    MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
        .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
        .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
    return BB;
  }
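  // The *_ISEL call pseudos still carry the callee only as a virtual register
  // produced by SI_PC_ADD_REL_OFFSET; they are rewritten here into SI_CALL /
  // SI_TCRETURN with an explicit global address operand.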
  case AMDGPU::SI_CALL_ISEL:
  case AMDGPU::SI_TCRETURN_ISEL: {
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    const DebugLoc &DL = MI.getDebugLoc();
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);

    MachineRegisterInfo &MRI = MF->getRegInfo();
    unsigned GlobalAddrReg = MI.getOperand(0).getReg();
    MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
    assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);

    const GlobalValue *G = PCRel->getOperand(1).getGlobal();

    MachineInstrBuilder MIB;
    if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
                .add(MI.getOperand(0))
                .addGlobalAddress(G);
    } else {
      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
                .add(MI.getOperand(0))
                .addGlobalAddress(G);
      // There is an additional imm operand for tcreturn, but it should be in
      // the right place already.
    }

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
      MIB.add(MI.getOperand(I));

    MIB.cloneMemRefs(MI);
    MI.eraseFromParent();
    return BB;
  }
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  }
}
bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
  return isTypeLegal(VT.getScalarType());
}

bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  // This currently forces unfolding various combinations of fsub into fma with
  // free fneg'd operands. As long as we have fast FMA (controlled by
  // isFMAFasterThanFMulAndFAdd), we should perform these.

  // When fma is quarter rate, for f64 where add / sub are at best half rate,
  // most of these combines appear to be cycle neutral but save on instruction
  // count / code size.
  return true;
}
EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                                         EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
  // TODO: Should i16 be used always if legal? For now it would force VALU
  // shifts.
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
}
// Answering this is somewhat tricky and depends on the specific device, since
// different devices have different rates for fma or all f64 operations.
//
// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
// regardless of which device (although the number of cycles differs between
// devices), so it is always profitable for f64.
//
// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
// only on full rate devices. Normally, we should prefer selecting v_mad_f32
// which we can always do even without fused FP ops since it returns the same
// result as the separate operations and since it is always full
// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
// however does not support denormals, so we do report fma as faster if we have
// a fast fma device and require denormals.
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32: {
    // This is as fast on some subtargets. However, we always have full rate f32
    // mad available which returns the same result as the separate operations
    // which we should prefer over fma. We can't use this if we want to support
    // denormals, so only report this in these cases.
    if (Subtarget->hasFP32Denormals())
      return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();

    // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
    return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
  }
  case MVT::f64:
    return true;
  case MVT::f16:
    return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
  default:
    break;
  }

  return false;
}
//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
                                             SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4f16);

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);

  SDLoc SL(Op);
  SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
                             Op->getFlags());
  SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
                             Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}
// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
                                              SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16);

  SDValue Lo0, Hi0;
  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
  SDValue Lo1, Hi1;
  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);

  SDLoc SL(Op);

  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
                             Op->getFlags());
  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
                             Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }
  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::FDIV: return LowerFDIV(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: {
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    return LowerGlobalAddress(MFI, Op, DAG);
  }
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return lowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::BUILD_VECTOR:
    return lowerBUILD_VECTOR(Op, DAG);
  case ISD::FP_ROUND:
    return lowerFP_ROUND(Op, DAG);
  case ISD::TRAP:
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
  case ISD::FABS:
  case ISD::FNEG:
  case ISD::FCANONICALIZE:
    return splitUnaryVectorOp(Op, DAG);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
  case ISD::ADD:
  case ISD::SUB:
  case ISD::MUL:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
  case ISD::FADD:
  case ISD::FMUL:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
    return splitBinaryVectorOp(Op, DAG);
  }
  return SDValue();
}
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
                                       const SDLoc &DL,
                                       SelectionDAG &DAG, bool Unpacked) {
  if (!LoadVT.isVector())
    return Result;

  if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
    // Truncate to v2i16/v4i16.
    EVT IntLoadVT = LoadVT.changeTypeToInteger();

    // Workaround legalizer not scalarizing truncate after vector op
    // legalization but not creating intermediate vector trunc.
    SmallVector<SDValue, 4> Elts;
    DAG.ExtractVectorElements(Result, Elts);
    for (SDValue &Elt : Elts)
      Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);

    Result = DAG.getBuildVector(IntLoadVT, DL, Elts);

    // Bitcast to original type (v2f16/v4f16).
    return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
  }

  // Cast back to the original packed type.
  return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
}
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
                                              MemSDNode *M,
                                              SelectionDAG &DAG,
                                              ArrayRef<SDValue> Ops,
                                              bool IsIntrinsic) const {
  SDLoc DL(M);

  bool Unpacked = Subtarget->hasUnpackedD16VMem();
  EVT LoadVT = M->getValueType(0);

  EVT EquivLoadVT = LoadVT;
  if (Unpacked && LoadVT.isVector()) {
    EquivLoadVT = LoadVT.isVector() ?
      EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                       LoadVT.getVectorNumElements()) : LoadVT;
  }

  // Change from v4f16/v2f16 to EquivLoadVT.
  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);

  SDValue Load
    = DAG.getMemIntrinsicNode(
      IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
      VTList, Ops, M->getMemoryVT(),
      M->getMemOperand());
  if (!Unpacked) // Just adjusted the opcode.
    return Load;

  SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);

  return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
}
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
                                  SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
  if (!CD)
    return DAG.getUNDEF(VT);

  int CondCode = CD->getSExtValue();
  if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
      CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
    return DAG.getUNDEF(VT);

  ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);

  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);

  SDLoc DL(N);

  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
      ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
    RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
  }

  ISD::CondCode CCOpcode = getICmpCondCode(IcInput);

  return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
                     DAG.getCondCode(CCOpcode));
}
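
// The floating-point variant mirrors lowerICMPIntrinsic: the IR predicate is
// translated to an ISD condition code, and f16 operands are extended to f32
// on targets where f16 is not legal.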
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
                                  SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
  if (!CD)
    return DAG.getUNDEF(VT);

  int CondCode = CD->getSExtValue();
  if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
      CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
    return DAG.getUNDEF(VT);
  }

  SDValue Src0 = N->getOperand(1);
  SDValue Src1 = N->getOperand(2);
  EVT CmpVT = Src0.getValueType();
  SDLoc SL(N);

  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
    Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
    Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
  }

  FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
  ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
  return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
                     Src1, DAG.getCondCode(CCOpcode));
}
void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                          SmallVectorImpl<SDValue> &Results,
                                          SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::INSERT_VECTOR_ELT: {
    if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
      Results.push_back(Res);
    return;
  }
  case ISD::EXTRACT_VECTOR_ELT: {
    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
      Results.push_back(Res);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    switch (IID) {
    case Intrinsic::amdgcn_cvt_pkrtz: {
      SDValue Src0 = N->getOperand(1);
      SDValue Src1 = N->getOperand(2);
      SDLoc SL(N);
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
                                Src0, Src1);
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
      return;
    }
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      SDValue Src0 = N->getOperand(1);
      SDValue Src1 = N->getOperand(2);
      SDLoc SL(N);
      unsigned Opcode;

      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
        Opcode = AMDGPUISD::CVT_PK_I16_I32;
      else
        Opcode = AMDGPUISD::CVT_PK_U16_U32;

      EVT VT = N->getValueType(0);
      if (isTypeLegal(VT))
        Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
      else {
        SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
        Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
      }
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
      Results.push_back(Res);
      Results.push_back(Res.getValue(1));
      return;
    }

    break;
  }
  case ISD::SELECT: {
    SDLoc SL(N);
    EVT VT = N->getValueType(0);
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
    SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));

    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
      RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
      SelectVT = MVT::i32;
    }

    SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
                                    N->getOperand(0), LHS, RHS);

    if (NewVT != SelectVT)
      NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
    return;
  }
  case ISD::FNEG: {
    if (N->getValueType(0) != MVT::v2f16)
      break;

    SDLoc SL(N);
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

    SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
                             BC,
                             DAG.getConstant(0x80008000, SL, MVT::i32));
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    return;
  }
  case ISD::FABS: {
    if (N->getValueType(0) != MVT::v2f16)
      break;

    SDLoc SL(N);
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

    SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
                             BC,
                             DAG.getConstant(0x7fff7fff, SL, MVT::i32));
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    return;
  }
  default:
    break;
  }
}
/// Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {

  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return nullptr;
}

unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
    case Intrinsic::amdgcn_if:
      return AMDGPUISD::IF;
    case Intrinsic::amdgcn_else:
      return AMDGPUISD::ELSE;
    case Intrinsic::amdgcn_loop:
      return AMDGPUISD::LOOP;
    case Intrinsic::amdgcn_end_cf:
      llvm_unreachable("should not occur");
    default:
      return 0;
    }
  }

  // break, if_break, else_break are all only used as inputs to loop, not
  // directly as branch conditions.
  return 0;
}
void SITargetLowering::createDebuggerPrologueStackObjects(
    MachineFunction &MF) const {
  // Create stack objects that are used for emitting debugger prologue.
  //
  // Debugger prologue writes work group IDs and work item IDs to scratch
  // memory at a fixed location in the following format:
  //   offset 0:  work group ID x
  //   offset 4:  work group ID y
  //   offset 8:  work group ID z
  //   offset 16: work item ID x
  //   offset 20: work item ID y
  //   offset 24: work item ID z
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  int ObjectIdx = 0;

  // For each dimension:
  for (unsigned i = 0; i < 3; ++i) {
    // Create fixed stack object for work group ID.
    ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
    Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
    // Create fixed stack object for work item ID.
    ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
    Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
  }
}
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
  const Triple &TT = getTargetMachine().getTargetTriple();
  return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         AMDGPU::shouldEmitConstantsToTextSection(TT);
}

bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
  return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         !shouldEmitFixup(GV) &&
         !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}

bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
}
/// This transforms the control flow intrinsics to get the branch destination
/// as the last parameter, and also switches the branch target with BR if the
/// need arises.
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                      SelectionDAG &DAG) const {
  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = nullptr;
  SDNode *SetCC = nullptr;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    SetCC = Intr;
    Intr = SetCC->getOperand(0).getNode();
  } else {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    Target = BR->getOperand(1);
  }

  // FIXME: This changes the types of the intrinsics instead of introducing new
  // nodes with the correct types.
  // e.g. llvm.amdgcn.loop

  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
  // =>     t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>

  unsigned CFNode = isCFIntrinsic(Intr);
  if (CFNode == 0) {
    // This is a uniform branch so we don't need to legalize.
    return BRCOND;
  }

  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;

  assert(!SetCC ||
        (SetCC->getConstantOperandVal(1) == 1 &&
         cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
                                                             ISD::SETNE));

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  if (HaveChain)
    Ops.push_back(BRCOND.getOperand(0));

  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
  Ops.push_back(Target);

  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();

  if (!HaveChain) {
    SDValue Ops[] = {
      SDValue(Result, 0),
      BRCOND.getOperand(0)
    };

    Result = DAG.getMergeValues(Ops, DL).getNode();
  }

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {
      BR->getOperand(0),
      BRCOND.getOperand(2)
    };
    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
    BR = NewBR.getNode();
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(
      Chain, DL,
      CopyToReg->getOperand(1),
      SDValue(Result, i - 1),
      SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(
    SDValue(Intr, Intr->getNumValues() - 1),
    Intr->getOperand(0));

  return Chain;
}
SITargetLowering::getFPExtOrFPTrunc(SelectionDAG
&DAG
,
4030 return Op
.getValueType().bitsLE(VT
) ?
4031 DAG
.getNode(ISD::FP_EXTEND
, DL
, VT
, Op
) :
4032 DAG
.getNode(ISD::FTRUNC
, DL
, VT
, Op
);
4035 SDValue
SITargetLowering::lowerFP_ROUND(SDValue Op
, SelectionDAG
&DAG
) const {
4036 assert(Op
.getValueType() == MVT::f16
&&
4037 "Do not know how to custom lower FP_ROUND for non-f16 type");
4039 SDValue Src
= Op
.getOperand(0);
4040 EVT SrcVT
= Src
.getValueType();
4041 if (SrcVT
!= MVT::f64
)
4046 SDValue FpToFp16
= DAG
.getNode(ISD::FP_TO_FP16
, DL
, MVT::i32
, Src
);
4047 SDValue Trunc
= DAG
.getNode(ISD::TRUNCATE
, DL
, MVT::i16
, FpToFp16
);
4048 return DAG
.getNode(ISD::BITCAST
, DL
, MVT::f16
, Trunc
);
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);

  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !Subtarget->isTrapHandlerEnabled())
    return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
  assert(UserSGPR != AMDGPU::NoRegister);
  SDValue QueuePtr = CreateLiveInRegister(
    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
                                   QueuePtr, SDValue());
  SDValue Ops[] = {
    ToReg,
    DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
    SGPR01,
    ToReg.getValue(1)
  };
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);
  MachineFunction &MF = DAG.getMachineFunction();

  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !Subtarget->isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
                                     "debugtrap handler not supported",
                                     Op.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = MF.getFunction().getContext();
    Ctx.diagnose(NoTrap);
    return Chain;
  }

  SDValue Ops[] = {
    Chain,
    DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
  };
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
SITargetLowering::getSegmentAperture(unsigned AS
, const SDLoc
&DL
,
4101 SelectionDAG
&DAG
) const {
4102 // FIXME: Use inline constants (src_{shared, private}_base) instead.
4103 if (Subtarget
->hasApertureRegs()) {
4104 unsigned Offset
= AS
== AMDGPUAS::LOCAL_ADDRESS
?
4105 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE
:
4106 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE
;
4107 unsigned WidthM1
= AS
== AMDGPUAS::LOCAL_ADDRESS
?
4108 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE
:
4109 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE
;
4111 AMDGPU::Hwreg::ID_MEM_BASES
<< AMDGPU::Hwreg::ID_SHIFT_
|
4112 Offset
<< AMDGPU::Hwreg::OFFSET_SHIFT_
|
4113 WidthM1
<< AMDGPU::Hwreg::WIDTH_M1_SHIFT_
;
4115 SDValue EncodingImm
= DAG
.getTargetConstant(Encoding
, DL
, MVT::i16
);
4116 SDValue ApertureReg
= SDValue(
4117 DAG
.getMachineNode(AMDGPU::S_GETREG_B32
, DL
, MVT::i32
, EncodingImm
), 0);
4118 SDValue ShiftAmount
= DAG
.getTargetConstant(WidthM1
+ 1, DL
, MVT::i32
);
4119 return DAG
.getNode(ISD::SHL
, DL
, MVT::i32
, ApertureReg
, ShiftAmount
);
4122 MachineFunction
&MF
= DAG
.getMachineFunction();
4123 SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
4124 unsigned UserSGPR
= Info
->getQueuePtrUserSGPR();
4125 assert(UserSGPR
!= AMDGPU::NoRegister
);
4127 SDValue QueuePtr
= CreateLiveInRegister(
4128 DAG
, &AMDGPU::SReg_64RegClass
, UserSGPR
, MVT::i64
);
4130 // Offset into amd_queue_t for group_segment_aperture_base_hi /
4131 // private_segment_aperture_base_hi.
4132 uint32_t StructOffset
= (AS
== AMDGPUAS::LOCAL_ADDRESS
) ? 0x40 : 0x44;
4134 SDValue Ptr
= DAG
.getObjectPtrOffset(DL
, QueuePtr
, StructOffset
);
4136 // TODO: Use custom target PseudoSourceValue.
4137 // TODO: We should use the value from the IR intrinsic call, but it might not
4138 // be available and how do we get it?
4139 Value
*V
= UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG
.getContext()),
4140 AMDGPUAS::CONSTANT_ADDRESS
));
4142 MachinePointerInfo
PtrInfo(V
, StructOffset
);
4143 return DAG
.getLoad(MVT::i32
, DL
, QueuePtr
.getValue(1), Ptr
, PtrInfo
,
4144 MinAlign(64, StructOffset
),
4145 MachineMemOperand::MODereferenceable
|
4146 MachineMemOperand::MOInvariant
);
4149 SDValue
SITargetLowering::lowerADDRSPACECAST(SDValue Op
,
4150 SelectionDAG
&DAG
) const {
4152 const AddrSpaceCastSDNode
*ASC
= cast
<AddrSpaceCastSDNode
>(Op
);
4154 SDValue Src
= ASC
->getOperand(0);
4155 SDValue FlatNullPtr
= DAG
.getConstant(0, SL
, MVT::i64
);
4157 const AMDGPUTargetMachine
&TM
=
4158 static_cast<const AMDGPUTargetMachine
&>(getTargetMachine());
4160 // flat -> local/private
4161 if (ASC
->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS
) {
4162 unsigned DestAS
= ASC
->getDestAddressSpace();
4164 if (DestAS
== AMDGPUAS::LOCAL_ADDRESS
||
4165 DestAS
== AMDGPUAS::PRIVATE_ADDRESS
) {
4166 unsigned NullVal
= TM
.getNullPointerValue(DestAS
);
4167 SDValue SegmentNullPtr
= DAG
.getConstant(NullVal
, SL
, MVT::i32
);
4168 SDValue NonNull
= DAG
.getSetCC(SL
, MVT::i1
, Src
, FlatNullPtr
, ISD::SETNE
);
4169 SDValue Ptr
= DAG
.getNode(ISD::TRUNCATE
, SL
, MVT::i32
, Src
);
4171 return DAG
.getNode(ISD::SELECT
, SL
, MVT::i32
,
4172 NonNull
, Ptr
, SegmentNullPtr
);
4176 // local/private -> flat
4177 if (ASC
->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS
) {
4178 unsigned SrcAS
= ASC
->getSrcAddressSpace();
4180 if (SrcAS
== AMDGPUAS::LOCAL_ADDRESS
||
4181 SrcAS
== AMDGPUAS::PRIVATE_ADDRESS
) {
4182 unsigned NullVal
= TM
.getNullPointerValue(SrcAS
);
4183 SDValue SegmentNullPtr
= DAG
.getConstant(NullVal
, SL
, MVT::i32
);
4186 = DAG
.getSetCC(SL
, MVT::i1
, Src
, SegmentNullPtr
, ISD::SETNE
);
4188 SDValue Aperture
= getSegmentAperture(ASC
->getSrcAddressSpace(), SL
, DAG
);
4190 = DAG
.getNode(ISD::BUILD_VECTOR
, SL
, MVT::v2i32
, Src
, Aperture
);
4192 return DAG
.getNode(ISD::SELECT
, SL
, MVT::i64
, NonNull
,
4193 DAG
.getNode(ISD::BITCAST
, SL
, MVT::i64
, CvtPtr
),
4198 // global <-> flat are no-ops and never emitted.
4200 const MachineFunction
&MF
= DAG
.getMachineFunction();
4201 DiagnosticInfoUnsupported
InvalidAddrSpaceCast(
4202 MF
.getFunction(), "invalid addrspacecast", SL
.getDebugLoc());
4203 DAG
.getContext()->diagnose(InvalidAddrSpaceCast
);
4205 return DAG
.getUNDEF(ASC
->getValueType(0));
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDValue InsVal = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();

  assert(VecSize <= 64);

  unsigned NumElts = VecVT.getVectorNumElements();
  SDLoc SL(Op);
  auto KIdx = dyn_cast<ConstantSDNode>(Idx);

  if (NumElts == 4 && EltSize == 16 && KIdx) {
    SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);

    SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(0, SL, MVT::i32));
    SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(1, SL, MVT::i32));

    SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
    SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
    SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
      InsertLo ? LoVec : HiVec,
      DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
      DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));

    InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);

    SDValue Concat = InsertLo ?
      DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
      DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });

    return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
  }

  if (isa<ConstantSDNode>(Idx))
    return SDValue();

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // Avoid stack access for dynamic indexing.
  SDValue Val = InsVal;
  if (InsVal.getValueType() == MVT::f16)
      Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);

  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
  SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);

  assert(isPowerOf2_32(EltSize));
  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);

  // Convert vector index to bit-index.
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);

  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
                            DAG.getConstant(0xffff, SL, IntVT),
                            ScaledIdx);

  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
  SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
                            DAG.getNOT(SL, BFM, IntVT), BCVec);

  SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op
,
4285 SelectionDAG
&DAG
) const {
4288 EVT ResultVT
= Op
.getValueType();
4289 SDValue Vec
= Op
.getOperand(0);
4290 SDValue Idx
= Op
.getOperand(1);
4291 EVT VecVT
= Vec
.getValueType();
4292 unsigned VecSize
= VecVT
.getSizeInBits();
4293 EVT EltVT
= VecVT
.getVectorElementType();
4294 assert(VecSize
<= 64);
4296 DAGCombinerInfo
DCI(DAG
, AfterLegalizeVectorOps
, true, nullptr);
4298 // Make sure we do any optimizations that will make it easier to fold
4299 // source modifiers before obscuring it with bit operations.
4301 // XXX - Why doesn't this get called when vector_shuffle is expanded?
4302 if (SDValue Combined
= performExtractVectorEltCombine(Op
.getNode(), DCI
))
4305 unsigned EltSize
= EltVT
.getSizeInBits();
4306 assert(isPowerOf2_32(EltSize
));
4308 MVT IntVT
= MVT::getIntegerVT(VecSize
);
4309 SDValue ScaleFactor
= DAG
.getConstant(Log2_32(EltSize
), SL
, MVT::i32
);
4311 // Convert vector index to bit-index (* EltSize)
4312 SDValue ScaledIdx
= DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, Idx
, ScaleFactor
);
4314 SDValue BC
= DAG
.getNode(ISD::BITCAST
, SL
, IntVT
, Vec
);
4315 SDValue Elt
= DAG
.getNode(ISD::SRL
, SL
, IntVT
, BC
, ScaledIdx
);
4317 if (ResultVT
== MVT::f16
) {
4318 SDValue Result
= DAG
.getNode(ISD::TRUNCATE
, SL
, MVT::i16
, Elt
);
4319 return DAG
.getNode(ISD::BITCAST
, SL
, ResultVT
, Result
);
4322 return DAG
.getAnyExtOrTrunc(Elt
, SL
, ResultVT
);
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
    EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);

    // Turn into pair of packed build_vectors.
    // TODO: Special case for constants that can be materialized with s_mov_b64.
    SDValue Lo = DAG.getBuildVector(HalfVT, SL,
                                    { Op.getOperand(0), Op.getOperand(1) });
    SDValue Hi = DAG.getBuildVector(HalfVT, SL,
                                    { Op.getOperand(2), Op.getOperand(3) });

    SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
    SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);

    SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
    return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
  }

  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
  assert(!Subtarget->hasVOP3PInsts() && "this should be legal");

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);

  // Avoid adding defined bits with the zero_extend.
  if (Hi.isUndef()) {
    Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
    SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
    return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
  }

  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);

  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
                              DAG.getConstant(16, SL, MVT::i32));

  if (Lo.isUndef())
    return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);

  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);

  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
  return DAG.getNode(ISD::BITCAST, SL, VT, Or);
}
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // We can fold offsets for anything that doesn't require a GOT relocation.
  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         !shouldEmitGOTReloc(GA->getGlobal());
}
static SDValue
buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
                        const SDLoc &DL, unsigned Offset, EVT PtrVT,
                        unsigned GAFlags = SIInstrInfo::MO_NONE) {
  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.
  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
                                             GAFlags);
  SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
                                             GAFlags == SIInstrInfo::MO_NONE ?
                                             GAFlags : GAFlags + 1);
  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GSD->getGlobal();
  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

  SDLoc DL(GSD);
  EVT PtrVT = Op.getValueType();

  // FIXME: Should not make address space based decisions here.
  if (shouldEmitFixup(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
  else if (shouldEmitPCReloc(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
                                   SIInstrInfo::MO_REL32);

  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
                                            SIInstrInfo::MO_GOTPCREL32);

  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  const DataLayout &DataLayout = DAG.getDataLayout();
  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getGOT(DAG.getMachineFunction());

  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
                                   const SDLoc &DL, SDValue V) const {
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
  // the destination register.
  //
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
  // so we will end up with redundant moves to m0.
  //
  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.

  // A Null SDValue creates a glue result.
  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
                                  V, Chain);
  return SDValue(M0, 0);
}
SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
                                                 SDValue Op,
                                                 MVT VT,
                                                 unsigned Offset) const {
  SDLoc SL(Op);
  SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
                                           DAG.getEntryNode(), Offset, 4, false);
  // The local size values will have the hi 16-bits as zero.
  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                     DAG.getValueType(VT));
}
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                        EVT VT) {
  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
                                      "non-hsa intrinsic with hsa target",
                                      DL.getDebugLoc());
  DAG.getContext()->diagnose(BadIntrin);
  return DAG.getUNDEF(VT);
}

static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                         EVT VT) {
  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
                                      "intrinsic not supported on subtarget",
                                      DL.getDebugLoc());
  DAG.getContext()->diagnose(BadIntrin);
  return DAG.getUNDEF(VT);
}
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
                                    ArrayRef<SDValue> Elts) {
  assert(!Elts.empty());
  MVT Type;
  unsigned NumElts;

  if (Elts.size() == 1) {
    Type = MVT::f32;
    NumElts = 1;
  } else if (Elts.size() == 2) {
    Type = MVT::v2f32;
    NumElts = 2;
  } else if (Elts.size() <= 4) {
    Type = MVT::v4f32;
    NumElts = 4;
  } else if (Elts.size() <= 8) {
    Type = MVT::v8f32;
    NumElts = 8;
  } else {
    assert(Elts.size() <= 16);
    Type = MVT::v16f32;
    NumElts = 16;
  }

  SmallVector<SDValue, 16> VecElts(NumElts);
  for (unsigned i = 0; i < Elts.size(); ++i) {
    SDValue Elt = Elts[i];
    if (Elt.getValueType() != MVT::f32)
      Elt = DAG.getBitcast(MVT::f32, Elt);
    VecElts[i] = Elt;
  }
  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getUNDEF(MVT::f32);

  return DAG.getBuildVector(Type, DL, VecElts);
}
static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
                             SDValue *GLC, SDValue *SLC) {
  auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode());
  if (!CachePolicyConst)
    return false;

  uint64_t Value = CachePolicyConst->getZExtValue();
  SDLoc DL(CachePolicy);
  if (GLC) {
    *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
    Value &= ~(uint64_t)0x1;
  }
  if (SLC) {
    *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
    Value &= ~(uint64_t)0x2;
  }

  return Value == 0;
}
SITargetLowering::lowerImage(SDValue Op
,
4567 const AMDGPU::ImageDimIntrinsicInfo
*Intr
,
4568 SelectionDAG
&DAG
) const {
4570 MachineFunction
&MF
= DAG
.getMachineFunction();
4571 const GCNSubtarget
* ST
= &MF
.getSubtarget
<GCNSubtarget
>();
4572 const AMDGPU::MIMGBaseOpcodeInfo
*BaseOpcode
=
4573 AMDGPU::getMIMGBaseOpcodeInfo(Intr
->BaseOpcode
);
4574 const AMDGPU::MIMGDimInfo
*DimInfo
= AMDGPU::getMIMGDimInfo(Intr
->Dim
);
4575 const AMDGPU::MIMGLZMappingInfo
*LZMappingInfo
=
4576 AMDGPU::getMIMGLZMappingInfo(Intr
->BaseOpcode
);
4577 unsigned IntrOpcode
= Intr
->BaseOpcode
;
4579 SmallVector
<EVT
, 2> ResultTypes(Op
->value_begin(), Op
->value_end());
4584 unsigned AddrIdx
; // Index of first address argument
4587 if (BaseOpcode
->Atomic
) {
4588 VData
= Op
.getOperand(2);
4590 bool Is64Bit
= VData
.getValueType() == MVT::i64
;
4591 if (BaseOpcode
->AtomicX2
) {
4592 SDValue VData2
= Op
.getOperand(3);
4593 VData
= DAG
.getBuildVector(Is64Bit
? MVT::v2i64
: MVT::v2i32
, DL
,
4596 VData
= DAG
.getBitcast(MVT::v4i32
, VData
);
4598 ResultTypes
[0] = Is64Bit
? MVT::v2i64
: MVT::v2i32
;
4599 DMask
= Is64Bit
? 0xf : 0x3;
4600 NumVDataDwords
= Is64Bit
? 4 : 2;
4603 DMask
= Is64Bit
? 0x3 : 0x1;
4604 NumVDataDwords
= Is64Bit
? 2 : 1;
4610 if (BaseOpcode
->Store
) {
4611 VData
= Op
.getOperand(2);
4613 MVT StoreVT
= VData
.getSimpleValueType();
4614 if (StoreVT
.getScalarType() == MVT::f16
) {
4615 if (Subtarget
->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS
||
4616 !BaseOpcode
->HasD16
)
4617 return Op
; // D16 is unsupported for this instruction
4620 VData
= handleD16VData(VData
, DAG
);
4623 NumVDataDwords
= (VData
.getValueType().getSizeInBits() + 31) / 32;
4626 MVT LoadVT
= Op
.getSimpleValueType();
4627 if (LoadVT
.getScalarType() == MVT::f16
) {
4628 if (Subtarget
->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS
||
4629 !BaseOpcode
->HasD16
)
4630 return Op
; // D16 is unsupported for this instruction
4633 if (LoadVT
.isVector() && Subtarget
->hasUnpackedD16VMem())
4634 ResultTypes
[0] = (LoadVT
== MVT::v2f16
) ? MVT::v2i32
: MVT::v4i32
;
4637 NumVDataDwords
= (ResultTypes
[0].getSizeInBits() + 31) / 32;
4638 DMaskIdx
= isa
<MemSDNode
>(Op
) ? 2 : 1;
4641 auto DMaskConst
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(DMaskIdx
));
4645 AddrIdx
= DMaskIdx
+ 1;
4646 DMask
= DMaskConst
->getZExtValue();
4647 if (!DMask
&& !BaseOpcode
->Store
) {
4648 // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
4649 // store the channels' default values.
4650 SDValue Undef
= DAG
.getUNDEF(Op
.getValueType());
4651 if (isa
<MemSDNode
>(Op
))
4652 return DAG
.getMergeValues({Undef
, Op
.getOperand(0)}, DL
);
4657 unsigned NumGradients
= BaseOpcode
->Gradients
? DimInfo
->NumGradients
: 0;
4658 unsigned NumCoords
= BaseOpcode
->Coordinates
? DimInfo
->NumCoords
: 0;
4659 unsigned NumLCM
= BaseOpcode
->LodOrClampOrMip
? 1 : 0;
4660 unsigned NumVAddrs
= BaseOpcode
->NumExtraArgs
+ NumGradients
+
4662 unsigned NumMIVAddrs
= NumVAddrs
;
4664 SmallVector
<SDValue
, 4> VAddrs
;
4666 // Optimize _L to _LZ when _L is zero
4667 if (LZMappingInfo
) {
4668 if (auto ConstantLod
=
4669 dyn_cast
<ConstantFPSDNode
>(Op
.getOperand(AddrIdx
+NumVAddrs
-1))) {
4670 if (ConstantLod
->isZero() || ConstantLod
->isNegative()) {
4671 IntrOpcode
= LZMappingInfo
->LZ
; // set new opcode to _lz variant of _l
4672 NumMIVAddrs
--; // remove 'lod'
4677 // Check for 16 bit addresses and pack if true.
4678 unsigned DimIdx
= AddrIdx
+ BaseOpcode
->NumExtraArgs
;
4679 MVT VAddrVT
= Op
.getOperand(DimIdx
).getSimpleValueType();
4680 if (VAddrVT
.getScalarType() == MVT::f16
&&
4681 ST
->hasFeature(AMDGPU::FeatureR128A16
)) {
4683 for (unsigned i
= AddrIdx
; i
< (AddrIdx
+ NumMIVAddrs
); ++i
) {
4684 SDValue AddrLo
, AddrHi
;
4685 // Push back extra arguments.
4687 AddrLo
= Op
.getOperand(i
);
4689 AddrLo
= Op
.getOperand(i
);
4690 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
4691 // in 1D, derivatives dx/dh and dx/dv are packed with undef.
4692 if (((i
+ 1) >= (AddrIdx
+ NumMIVAddrs
)) ||
4693 ((NumGradients
/ 2) % 2 == 1 &&
4694 (i
== DimIdx
+ (NumGradients
/ 2) - 1 ||
4695 i
== DimIdx
+ NumGradients
- 1))) {
4696 AddrHi
= DAG
.getUNDEF(MVT::f16
);
4698 AddrHi
= Op
.getOperand(i
+ 1);
4701 AddrLo
= DAG
.getNode(ISD::SCALAR_TO_VECTOR
, DL
, MVT::v2f16
,
4703 AddrLo
= DAG
.getBitcast(MVT::i32
, AddrLo
);
4705 VAddrs
.push_back(AddrLo
);
4708 for (unsigned i
= 0; i
< NumMIVAddrs
; ++i
)
4709 VAddrs
.push_back(Op
.getOperand(AddrIdx
+ i
));
4712 SDValue VAddr
= getBuildDwordsVector(DAG
, DL
, VAddrs
);
4714 SDValue True
= DAG
.getTargetConstant(1, DL
, MVT::i1
);
4715 SDValue False
= DAG
.getTargetConstant(0, DL
, MVT::i1
);
4716 unsigned CtrlIdx
; // Index of texfailctrl argument
4718 if (!BaseOpcode
->Sampler
) {
4720 CtrlIdx
= AddrIdx
+ NumVAddrs
+ 1;
4723 dyn_cast
<ConstantSDNode
>(Op
.getOperand(AddrIdx
+ NumVAddrs
+ 2));
4727 Unorm
= UnormConst
->getZExtValue() ? True
: False
;
4728 CtrlIdx
= AddrIdx
+ NumVAddrs
+ 3;
4731 SDValue TexFail
= Op
.getOperand(CtrlIdx
);
4732 auto TexFailConst
= dyn_cast
<ConstantSDNode
>(TexFail
.getNode());
4733 if (!TexFailConst
|| TexFailConst
->getZExtValue() != 0)
4738 if (BaseOpcode
->Atomic
) {
4739 GLC
= True
; // TODO no-return optimization
4740 if (!parseCachePolicy(Op
.getOperand(CtrlIdx
+ 1), DAG
, nullptr, &SLC
))
4743 if (!parseCachePolicy(Op
.getOperand(CtrlIdx
+ 1), DAG
, &GLC
, &SLC
))
4747 SmallVector
<SDValue
, 14> Ops
;
4748 if (BaseOpcode
->Store
|| BaseOpcode
->Atomic
)
4749 Ops
.push_back(VData
); // vdata
4750 Ops
.push_back(VAddr
);
4751 Ops
.push_back(Op
.getOperand(AddrIdx
+ NumVAddrs
)); // rsrc
4752 if (BaseOpcode
->Sampler
)
4753 Ops
.push_back(Op
.getOperand(AddrIdx
+ NumVAddrs
+ 1)); // sampler
4754 Ops
.push_back(DAG
.getTargetConstant(DMask
, DL
, MVT::i32
));
4755 Ops
.push_back(Unorm
);
4758 Ops
.push_back(IsA16
&& // a16 or r128
4759 ST
->hasFeature(AMDGPU::FeatureR128A16
) ? True
: False
);
4760 Ops
.push_back(False
); // tfe
4761 Ops
.push_back(False
); // lwe
4762 Ops
.push_back(DimInfo
->DA
? True
: False
);
4763 if (BaseOpcode
->HasD16
)
4764 Ops
.push_back(IsD16
? True
: False
);
4765 if (isa
<MemSDNode
>(Op
))
4766 Ops
.push_back(Op
.getOperand(0)); // chain
4768 int NumVAddrDwords
= VAddr
.getValueType().getSizeInBits() / 32;
4771 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
)
4772 Opcode
= AMDGPU::getMIMGOpcode(IntrOpcode
, AMDGPU::MIMGEncGfx8
,
4773 NumVDataDwords
, NumVAddrDwords
);
4775 Opcode
= AMDGPU::getMIMGOpcode(IntrOpcode
, AMDGPU::MIMGEncGfx6
,
4776 NumVDataDwords
, NumVAddrDwords
);
4777 assert(Opcode
!= -1);
4779 MachineSDNode
*NewNode
= DAG
.getMachineNode(Opcode
, DL
, ResultTypes
, Ops
);
4780 if (auto MemOp
= dyn_cast
<MemSDNode
>(Op
)) {
4781 MachineMemOperand
*MemRef
= MemOp
->getMemOperand();
4782 DAG
.setNodeMemRefs(NewNode
, {MemRef
});
4785 if (BaseOpcode
->AtomicX2
) {
4786 SmallVector
<SDValue
, 1> Elt
;
4787 DAG
.ExtractVectorElements(SDValue(NewNode
, 0), Elt
, 0, 1);
4788 return DAG
.getMergeValues({Elt
[0], SDValue(NewNode
, 1)}, DL
);
4789 } else if (IsD16
&& !BaseOpcode
->Store
) {
4790 MVT LoadVT
= Op
.getSimpleValueType();
4791 SDValue Adjusted
= adjustLoadValueTypeImpl(
4792 SDValue(NewNode
, 0), LoadVT
, DL
, DAG
, Subtarget
->hasUnpackedD16VMem());
4793 return DAG
.getMergeValues({Adjusted
, SDValue(NewNode
, 1)}, DL
);
4796 return SDValue(NewNode
, 0);
4799 SDValue
SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op
,
4800 SelectionDAG
&DAG
) const {
4801 MachineFunction
&MF
= DAG
.getMachineFunction();
4802 auto MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
4804 EVT VT
= Op
.getValueType();
4806 unsigned IntrinsicID
= cast
<ConstantSDNode
>(Op
.getOperand(0))->getZExtValue();
4808 // TODO: Should this propagate fast-math-flags?
4810 switch (IntrinsicID
) {
4811 case Intrinsic::amdgcn_implicit_buffer_ptr
: {
4812 if (getSubtarget()->isAmdHsaOrMesa(MF
.getFunction()))
4813 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
4814 return getPreloadedValue(DAG
, *MFI
, VT
,
4815 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR
);
4817 case Intrinsic::amdgcn_dispatch_ptr
:
4818 case Intrinsic::amdgcn_queue_ptr
: {
4819 if (!Subtarget
->isAmdHsaOrMesa(MF
.getFunction())) {
4820 DiagnosticInfoUnsupported
BadIntrin(
4821 MF
.getFunction(), "unsupported hsa intrinsic without hsa target",
4823 DAG
.getContext()->diagnose(BadIntrin
);
4824 return DAG
.getUNDEF(VT
);
4827 auto RegID
= IntrinsicID
== Intrinsic::amdgcn_dispatch_ptr
?
4828 AMDGPUFunctionArgInfo::DISPATCH_PTR
: AMDGPUFunctionArgInfo::QUEUE_PTR
;
4829 return getPreloadedValue(DAG
, *MFI
, VT
, RegID
);
4831 case Intrinsic::amdgcn_implicitarg_ptr
: {
4832 if (MFI
->isEntryFunction())
4833 return getImplicitArgPtr(DAG
, DL
);
4834 return getPreloadedValue(DAG
, *MFI
, VT
,
4835 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
);
4837 case Intrinsic::amdgcn_kernarg_segment_ptr
: {
4838 return getPreloadedValue(DAG
, *MFI
, VT
,
4839 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
);
4841 case Intrinsic::amdgcn_dispatch_id
: {
4842 return getPreloadedValue(DAG
, *MFI
, VT
, AMDGPUFunctionArgInfo::DISPATCH_ID
);
4844 case Intrinsic::amdgcn_rcp
:
4845 return DAG
.getNode(AMDGPUISD::RCP
, DL
, VT
, Op
.getOperand(1));
4846 case Intrinsic::amdgcn_rsq
:
4847 return DAG
.getNode(AMDGPUISD::RSQ
, DL
, VT
, Op
.getOperand(1));
4848 case Intrinsic::amdgcn_rsq_legacy
:
4849 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
)
4850 return emitRemovedIntrinsicError(DAG
, DL
, VT
);
4852 return DAG
.getNode(AMDGPUISD::RSQ_LEGACY
, DL
, VT
, Op
.getOperand(1));
4853 case Intrinsic::amdgcn_rcp_legacy
:
4854 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
)
4855 return emitRemovedIntrinsicError(DAG
, DL
, VT
);
4856 return DAG
.getNode(AMDGPUISD::RCP_LEGACY
, DL
, VT
, Op
.getOperand(1));
4857 case Intrinsic::amdgcn_rsq_clamp
: {
4858 if (Subtarget
->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS
)
4859 return DAG
.getNode(AMDGPUISD::RSQ_CLAMP
, DL
, VT
, Op
.getOperand(1));
    Type *Type = VT.getTypeForEVT(*DAG.getContext());
    APFloat Max = APFloat::getLargest(Type->getFltSemantics());
    APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);

    SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
    SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
                              DAG.getConstantFP(Max, DL, VT));
    return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
                       DAG.getConstantFP(Min, DL, VT));
  }
  case Intrinsic::r600_read_ngroups_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_X, 4, false);
  case Intrinsic::r600_read_ngroups_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_Y, 4, false);
  case Intrinsic::r600_read_ngroups_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_Z, 4, false);
  case Intrinsic::r600_read_global_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
  case Intrinsic::r600_read_global_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
  case Intrinsic::r600_read_global_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
  case Intrinsic::r600_read_local_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
  case Intrinsic::r600_read_tgid_x:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::r600_read_tidig_x:
    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                          SDLoc(DAG.getEntryNode()),
                          MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                          SDLoc(DAG.getEntryNode()),
                          MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                          SDLoc(DAG.getEntryNode()),
                          MFI->getArgInfo().WorkItemIDZ);
  case AMDGPUIntrinsic::SI_load_const: {
    SDValue Ops[] = {
      Op.getOperand(1), // Ptr
      Op.getOperand(2), // Offset
      DAG.getTargetConstant(0, DL, MVT::i1) // glc
    };

    MachineMemOperand *MMO = MF.getMachineMemOperand(
        MachinePointerInfo(),
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        VT.getStoreSize(), 4);
    SDVTList VTList = DAG.getVTList(MVT::i32);
    SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
                                           VTList, Ops, MVT::i32, MMO);

    return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
  }
  case Intrinsic::amdgcn_s_buffer_load: {
    unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
    SDValue Ops[] = {
      Op.getOperand(1), // Ptr
      Op.getOperand(2), // Offset
      DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc
    };

    MachineMemOperand *MMO = MF.getMachineMemOperand(
        MachinePointerInfo(),
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        VT.getStoreSize(), VT.getStoreSize());
    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
                                   Op->getVTList(), Ops, VT, MMO);
  }
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_interp_mov: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    SDValue Glue = M0.getValue(1);
    return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Glue);
  }
  case Intrinsic::amdgcn_interp_p1: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    SDValue Glue = M0.getValue(1);
    return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Glue);
  }
  case Intrinsic::amdgcn_interp_p2: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
    SDValue Glue = SDValue(M0.getNode(), 1);
    return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
                       Glue);
  }
  case Intrinsic::amdgcn_sin:
    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_cos:
    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_log_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return SDValue();

    DiagnosticInfoUnsupported BadIntrin(
      MF.getFunction(), "intrinsic not supported on subtarget",
      DL.getDebugLoc());
    DAG.getContext()->diagnose(BadIntrin);
    return DAG.getUNDEF(VT);
  }
  case Intrinsic::amdgcn_ldexp:
    return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));

  case Intrinsic::amdgcn_fract:
    return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_class:
    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_div_fmas:
    return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(4));
  case Intrinsic::amdgcn_div_fixup:
    return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_trig_preop:
    return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_div_scale: {
    // 3rd parameter required to be a constant.
    const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    if (!Param)
      return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);

    // Translate to the operands expected by the machine instruction. The
    // first parameter must be the same as the first instruction.
    SDValue Numerator = Op.getOperand(1);
    SDValue Denominator = Op.getOperand(2);

    // Note this order is opposite of the machine instruction's operations,
    // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    // intrinsic has the numerator as the first operand to match a normal
    // division operation.
    SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;

    return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
                       Denominator, Numerator);
  }
  case Intrinsic::amdgcn_icmp: {
    return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_fcmp: {
    return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_fmed3:
    return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_fdot2:
    return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(4));
  case Intrinsic::amdgcn_fmul_legacy:
    return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_sffbh:
    return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_sbfe:
    return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_ubfe:
    return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    // FIXME: Stop adding cast if v2f16/v2i16 are legal.
    EVT VT = Op.getValueType();
    unsigned Opcode;

    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
    else
      Opcode = AMDGPUISD::CVT_PK_U16_U32;

    if (isTypeLegal(VT))
      return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));

    SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
                               Op.getOperand(1), Op.getOperand(2));
    return DAG.getNode(ISD::BITCAST, DL, VT, Node);
  }
  case Intrinsic::amdgcn_wqm: {
    SDValue Src = Op.getOperand(1);
    return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
                   0);
  }
  case Intrinsic::amdgcn_wwm: {
    SDValue Src = Op.getOperand(1);
    return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
                   0);
  }
  case Intrinsic::amdgcn_fmad_ftz:
    return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG);

    return Op;
  }
}

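// Lower intrinsics with a chain that read memory (DS atomics, buffer and
// tbuffer loads, buffer atomics) to the corresponding AMDGPUISD memory
// intrinsic nodes.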
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                 SelectionDAG &DAG) const {
  unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  SDLoc DL(Op);

  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    MemSDNode *M = cast<MemSDNode>(Op);
    unsigned Opc;
    switch (IntrID) {
    case Intrinsic::amdgcn_atomic_inc:
      Opc = AMDGPUISD::ATOMIC_INC;
      break;
    case Intrinsic::amdgcn_atomic_dec:
      Opc = AMDGPUISD::ATOMIC_DEC;
      break;
    case Intrinsic::amdgcn_ds_fadd:
      Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
      break;
    case Intrinsic::amdgcn_ds_fmin:
      Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
      break;
    case Intrinsic::amdgcn_ds_fmax:
      Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
      break;
    default:
      llvm_unreachable("Unknown intrinsic!");
    }
    SDValue Ops[] = {
      M->getOperand(0), // Chain
      M->getOperand(2), // Ptr
      M->getOperand(3)  // Value
    };

    return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format: {
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // rsrc
      Op.getOperand(3), // vindex
      SDValue(),        // voffset -- will be set by setBufferOffsets
      SDValue(),        // soffset -- will be set by setBufferOffsets
      SDValue(),        // offset -- will be set by setBufferOffsets
      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };

    setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
    unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;

    EVT VT = Op.getValueType();
    EVT IntVT = VT.changeTypeToInteger();
    auto *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                 M, DAG, Ops);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format: {
    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(4), // soffset
      Offsets.second,   // offset
      Op.getOperand(5), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };

    unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;

    EVT VT = Op.getValueType();
    EVT IntVT = VT.changeTypeToInteger();
    auto *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                 M, DAG, Ops);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format: {
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // rsrc
      Op.getOperand(3), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };

    unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;

    EVT VT = Op.getValueType();
    EVT IntVT = VT.changeTypeToInteger();
    auto *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                 M, DAG, Ops);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();

    unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
    unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // rsrc
      Op.getOperand(3), // vindex
      Op.getOperand(4), // voffset
      Op.getOperand(5), // soffset
      Op.getOperand(6), // offset
      DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
                                 M, DAG, Ops);
    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                                   Op->getVTList(), Ops, LoadVT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();
    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);

    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(4), // soffset
      Offsets.second,   // offset
      Op.getOperand(5), // format
      Op.getOperand(6), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
                                 M, DAG, Ops);
    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                                   Op->getVTList(), Ops, LoadVT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);

    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // rsrc
      Op.getOperand(3), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // format
      Op.getOperand(7), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
                                 M, DAG, Ops);
    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                                   Op->getVTList(), Ops, LoadVT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_atomic_swap:
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_buffer_atomic_xor: {
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      SDValue(),        // voffset -- will be set by setBufferOffsets
      SDValue(),        // soffset -- will be set by setBufferOffsets
      SDValue(),        // offset -- will be set by setBufferOffsets
      DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };
    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    EVT VT = Op.getValueType();

    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = 0;

    switch (IntrID) {
    case Intrinsic::amdgcn_buffer_atomic_swap:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
      break;
    case Intrinsic::amdgcn_buffer_atomic_add:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
      break;
    case Intrinsic::amdgcn_buffer_atomic_sub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
      break;
    case Intrinsic::amdgcn_buffer_atomic_smin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
      break;
    case Intrinsic::amdgcn_buffer_atomic_umin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
      break;
    case Intrinsic::amdgcn_buffer_atomic_smax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
      break;
    case Intrinsic::amdgcn_buffer_atomic_umax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
      break;
    case Intrinsic::amdgcn_buffer_atomic_and:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
      break;
    case Intrinsic::amdgcn_buffer_atomic_or:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
      break;
    case Intrinsic::amdgcn_buffer_atomic_xor:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
      break;
    default:
      llvm_unreachable("unhandled atomic opcode");
    }

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();

    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = 0;

    switch (IntrID) {
    case Intrinsic::amdgcn_raw_buffer_atomic_swap:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_add:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_sub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_smin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_umin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_smax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_umax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_and:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_or:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_xor:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
      break;
    default:
      llvm_unreachable("unhandled atomic opcode");
    }

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();

    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = 0;

    switch (IntrID) {
    case Intrinsic::amdgcn_struct_buffer_atomic_swap:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_add:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_sub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_smin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_umin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_smax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_umax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_and:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_or:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_xor:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
      break;
    default:
      llvm_unreachable("unhandled atomic opcode");
    }

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      Op.getOperand(5), // vindex
      SDValue(),        // voffset -- will be set by setBufferOffsets
      SDValue(),        // soffset -- will be set by setBufferOffsets
      SDValue(),        // offset -- will be set by setBufferOffsets
      DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };
    setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
    auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      Op.getOperand(5), // vindex
      Offsets.first,    // voffset
      Op.getOperand(7), // soffset
      Offsets.second,   // offset
      Op.getOperand(8), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return lowerImage(Op, ImageDimIntr, DAG);

    return SDValue();
  }
}

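// Prepare a 16-bit (D16) vector value for a buffer/tbuffer store. On
// subtargets with unpacked D16 memory instructions, each 16-bit element is
// widened into its own 32-bit lane; otherwise the packed value is used as-is.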
SDValue SITargetLowering::handleD16VData(SDValue VData,
                                         SelectionDAG &DAG) const {
  EVT StoreVT = VData.getValueType();

  // No change for f16 and legal vector D16 types.
  if (!StoreVT.isVector())
    return VData;

  SDLoc DL(VData);
  assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");

  if (Subtarget->hasUnpackedD16VMem()) {
    // We need to unpack the packed data to store.
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                        StoreVT.getVectorNumElements());
    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
    return DAG.UnrollVectorOp(ZExt.getNode());
  }

  assert(isTypeLegal(StoreVT));
  return VData;
}

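// Lower side-effecting intrinsics (exports, messages, kills, barriers and the
// various buffer/tbuffer stores) to target-specific DAG nodes.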
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  MachineFunction &MF = DAG.getMachineFunction();

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp: {
    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));

    const SDValue Ops[] = {
      Chain,
      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
      Op.getOperand(4), // src0
      Op.getOperand(5), // src1
      Op.getOperand(6), // src2
      Op.getOperand(7), // src3
      DAG.getTargetConstant(0, DL, MVT::i1), // compr
      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    };

    unsigned Opc = Done->isNullValue() ?
      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
  }
  case Intrinsic::amdgcn_exp_compr: {
    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    SDValue Src0 = Op.getOperand(4);
    SDValue Src1 = Op.getOperand(5);
    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));

    SDValue Undef = DAG.getUNDEF(MVT::f32);
    const SDValue Ops[] = {
      Chain,
      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
      Undef, // src2
      Undef, // src3
      DAG.getTargetConstant(1, DL, MVT::i1), // compr
      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    };

    unsigned Opc = Done->isNullValue() ?
      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
      AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
    Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
    SDValue Glue = Chain.getValue(1);
    return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
                       Op.getOperand(2), Glue);
  }
  case Intrinsic::amdgcn_init_exec: {
    return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
                       Op.getOperand(2));
  }
  case Intrinsic::amdgcn_init_exec_from_input: {
    return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
                       Op.getOperand(2), Op.getOperand(3));
  }
  case AMDGPUIntrinsic::AMDGPU_kill: {
    SDValue Src = Op.getOperand(2);
    if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
      if (!K->isNegative())
        return Chain;

      SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
      return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
    }

    SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
    return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
  }
  case Intrinsic::amdgcn_s_barrier: {
    if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
      const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
      unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
      if (WGSize <= ST.getWavefrontSize())
        return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
                                          Op.getOperand(0)), 0);
    }
    return SDValue();
  }
  case AMDGPUIntrinsic::SI_tbuffer_store: {

    // Extract vindex and voffset from vaddr as appropriate
    const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
    const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
    SDValue VAddr = Op.getOperand(5);

    SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);

    assert(!(OffEn->isOne() && IdxEn->isOne()) &&
           "Legacy intrinsic doesn't support both offset and index - use new version");

    SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
    SDValue VOffset = OffEn->isOne() ? VAddr : Zero;

    // Deal with the vec-3 case
    const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
    auto Opcode = NumChannels->getZExtValue() == 3 ?
      AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;

    unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
    unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(12))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(13))->getZExtValue();
    SDValue Ops[] = {
      Chain,
      Op.getOperand(3), // vdata
      Op.getOperand(2), // rsrc
      VIndex,
      VOffset,
      Op.getOperand(6), // soffset
      Op.getOperand(7), // inst_offset
      DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn->isOne(), DL, MVT::i1), // idxen
    };

    assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
           "Value of tfe other than zero is unsupported");

    EVT VT = Op.getOperand(3).getValueType();
    MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOStore,
      VT.getStoreSize(), 4);
    return DAG.getMemIntrinsicNode(Opcode, DL,
                                   Op->getVTList(), Ops, VT, MMO);
  }
  case Intrinsic::amdgcn_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
    unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Chain,
      VData,            // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      Op.getOperand(5), // voffset
      Op.getOperand(6), // soffset
      Op.getOperand(7), // offset
      DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Chain,
      VData,            // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // format
      Op.getOperand(8), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Chain,
      VData,            // vdata
      Op.getOperand(3), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // format
      Op.getOperand(7), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_store:
  case Intrinsic::amdgcn_buffer_store_format: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Chain,
      VData,            // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      SDValue(), // voffset -- will be set by setBufferOffsets
      SDValue(), // soffset -- will be set by setBufferOffsets
      SDValue(), // offset -- will be set by setBufferOffsets
      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };
    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Chain,
      VData,            // vdata
      Op.getOperand(3), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Chain,
      VData,            // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG);

    return Op;
  }
}

// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
    SDValue Offset, SelectionDAG &DAG) const {
  SDLoc DL(Offset);
  const unsigned MaxImm = 4095;
  SDValue N0 = Offset;
  ConstantSDNode *C1 = nullptr;
  if (N0.getOpcode() == ISD::ADD) {
    if ((C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))))
      N0 = N0.getOperand(0);
  } else if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    N0 = SDValue();

  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, put the value
    // and -4096 into the immoffset field so that the value that is copied/added
    // for the voffset field is a multiple of 4096, and it stands more chance
    // of being CSEd with the copy/add for another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
    if (Overflow) {
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      if (!N0)
        N0 = OverflowVal;
      else {
        SDValue Ops[] = { N0, OverflowVal };
        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
      }
    }
  }
  if (!N0)
    N0 = DAG.getConstant(0, DL, MVT::i32);
  if (!C1)
    C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
  return {N0, SDValue(C1, 0)};
}

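// Illustrative example for splitBufferOffsets: a constant combined offset of
// 4100 exceeds the 4095 immoffset limit, so it is returned as voffset 4096
// plus immoffset 4, keeping the voffset a multiple of 4096 so it can be CSEd
// across similar accesses.
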
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        SelectionDAG &DAG,
                                        SDValue *Offsets) const {
  SDLoc DL(CombinedOffset);
  if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget)) {
      Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }
  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
    SDValue N0 = CombinedOffset.getOperand(0);
    SDValue N1 = CombinedOffset.getOperand(1);
    uint32_t SOffset, ImmOffset;
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
    if (Offset >= 0
        && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, Subtarget)) {
      Offsets[0] = N0;
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }
  Offsets[0] = CombinedOffset;
  Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
  Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
}

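// Extend or truncate a 32-bit load result to the requested type, using the
// extension kind of the original load.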
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
                                 ISD::LoadExtType ExtType, SDValue Op,
                                 const SDLoc &SL, EVT VT) {
  if (VT.bitsLT(Op.getValueType()))
    return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);

  switch (ExtType) {
  case ISD::SEXTLOAD:
    return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
  case ISD::ZEXTLOAD:
    return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
  case ISD::EXTLOAD:
    return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
  case ISD::NON_EXTLOAD:
    return Op;
  }

  llvm_unreachable("invalid ext type");
}

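// Widen a sub-dword (i8/i16) non-divergent load from constant or invariant
// global memory into a full 32-bit load, then truncate/extend the result back
// to the original type.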
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if (Ld->getAlignment() < 4 || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // later.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
  SDValue Ptr = Ld->getBasePtr();
  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
                                MVT::i32, SL, Ld->getChain(), Ptr,
                                Ld->getOffset(),
                                Ld->getPointerInfo(), MVT::i32,
                                Ld->getAlignment(),
                                Ld->getMemOperand()->getFlags(),
                                Ld->getAAInfo(),
                                nullptr); // Drop ranges

  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
           "unexpected fp extload");
    TruncVT = MemVT.changeTypeToInteger();
  }

  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
                      DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
  } else {
    assert(Ld->getExtensionType() == ISD::EXTLOAD);
  }

  EVT VT = Ld->getValueType(0);
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());

  DCI.AddToWorklist(Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
  DCI.AddToWorklist(Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);

  return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
}

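// Custom lowering for vector loads: decide per address space whether the load
// can be selected as-is, must be split into smaller loads, or must be
// scalarized or expanded.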
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();

  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();
    MachineMemOperand *MMO = Load->getMemOperand();

    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                   BasePtr, RealMemVT, MMO);

    SDValue Ops[] = {
      DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
      NewLD.getValue(1)
    };

    return DAG.getMergeValues(Ops, DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  unsigned Alignment = Load->getAlignment();
  unsigned AS = Load->getAddressSpace();
  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
                          AS, Alignment)) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
    return DAG.getMergeValues(Ops, DL);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    AS = MFI->hasFlatScratchInit() ?
         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = MemVT.getVectorNumElements();

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32)
      return SDValue();
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
  }

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS) {
    if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
        !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
        Alignment >= 4 && NumElements < 32)
      return SDValue();
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
  }

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS ||
      AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v4 loads are supported for private and global memory.
    return SDValue();
  }

  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorLoad(Load, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    // Use ds_read_b128 if possible.
    if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
        MemVT.getStoreSize() == 16)
      return SDValue();

    if (NumElements > 2)
      return SplitVectorLoad(Op, DAG);
  }
  return SDValue();
}

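// Lower a 64-bit select by bitcasting the operands to v2i32 and selecting the
// low and high halves separately.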
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.getSizeInBits() == 64);

  SDLoc DL(Op);
  SDValue Cond = Op.getOperand(0);

  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
  SDValue One = DAG.getConstant(1, DL, MVT::i32);

  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));

  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);

  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);

  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);

  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);

  SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}

// Catch division cases where we can use shortcuts with rcp and rsq
// instructions.
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();

  if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
    return SDValue();

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
      if (CLHS->isExactlyValue(1.0)) {
        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
        // use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // 1.0 / sqrt(x) -> rsq(x)

        // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
        // error seems really high at 2^29 ULP.
        if (RHS.getOpcode() == ISD::FSQRT)
          return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));

        // 1.0 / x -> rcp(x)
        return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
      }

      // Same as for 1.0, but expand the sign out of the constant.
      if (CLHS->isExactlyValue(-1.0)) {
        // -1.0 / x -> rcp (fneg x)
        SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
        return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
      }
    }
  }

  if (Unsafe) {
    // Turn into multiply by the reciprocal.
    // x / y -> x * (1.0 / y)
    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
  }

  return SDValue();
}

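// Helpers that emit FMUL/FMA either as plain nodes or, when a glue chain is
// present (mode-register changes around the division sequence), as the
// chained FMUL_W_CHAIN/FMA_W_CHAIN variants.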
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, A, B);
  }

  assert(GlueChain->getNumValues() == 3);

  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
  switch (Opcode) {
  default: llvm_unreachable("no chain equivalent for opcode");
  case ISD::FMUL:
    Opcode = AMDGPUISD::FMUL_W_CHAIN;
    break;
  }

  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
                     GlueChain.getValue(2));
}

static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                           EVT VT, SDValue A, SDValue B, SDValue C,
                           SDValue GlueChain) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, A, B, C);
  }

  assert(GlueChain->getNumValues() == 3);

  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
  switch (Opcode) {
  default: llvm_unreachable("no chain equivalent for opcode");
  case ISD::FMA:
    Opcode = AMDGPUISD::FMA_W_CHAIN;
    break;
  }

  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
                     GlueChain.getValue(2));
}

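// f16 division: compute the quotient in f32 with rcp + multiply, round back
// to f16, and clean up with DIV_FIXUP.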
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue Src0 = Op.getOperand(0);
  SDValue Src1 = Op.getOperand(1);

  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);

  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);

  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
  SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
}

// Faster 2.5 ULP division that does not support denormals.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);

  const APFloat K0Val(BitsToFloat(0x6f800000));
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

  const APFloat K1Val(BitsToFloat(0x2f800000));
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  EVT SetCCVT =
    getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);

  // TODO: Should this propagate fast-math-flags?
  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);

  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
}

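// Full f32 division lowering: scale the operands with DIV_SCALE, refine an
// initial RCP estimate with an FMA sequence (toggling the denormal mode
// around it when f32 denormals are disabled), then combine the result with
// DIV_FMAS and DIV_FIXUP.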
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
                                          RHS, RHS, LHS);
  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
                                        LHS, RHS, LHS);

  // Denominator is scaled to not be denormal, so using rcp is ok.
  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
                                  DenominatorScaled);
  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
                                     DenominatorScaled);

  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
                               (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                               (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);

  if (!Subtarget->hasFP32Denormals()) {
    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
                                                      SL, MVT::i32);
    SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
                                       DAG.getEntryNode(),
                                       EnableDenormValue, BitField);
    SDValue Ops[3] = {
      NegDivScale0,
      EnableDenorm.getValue(0),
      EnableDenorm.getValue(1)
    };

    NegDivScale0 = DAG.getMergeValues(Ops, SL);
  }

  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
                             ApproxRcp, One, NegDivScale0);

  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
                             ApproxRcp, Fma0);

  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
                           Fma1, Fma1);

  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                             NumeratorScaled, Mul);

  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);

  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                             NumeratorScaled, Fma3);

  if (!Subtarget->hasFP32Denormals()) {
    const SDValue DisableDenormValue =
        DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
    SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
                                        Fma4.getValue(1),
                                        DisableDenormValue,
                                        BitField,
                                        Fma4.getValue(2));

    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                      DisableDenorm, DAG.getRoot());
    DAG.setRoot(OutputChain);
  }

  SDValue Scale = NumeratorScaled.getValue(1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
                             Fma4, Fma1, Fma3, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
}

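// f64 division lowering, following the same DIV_SCALE / FMA refinement /
// DIV_FMAS / DIV_FIXUP pattern as the f32 path, with a workaround for the SI
// div_scale condition-output bug.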
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getTarget().Options.UnsafeFPMath)
    return lowerFastUnsafeFDIV(Op, DAG);

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);

  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);

  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
                             NegDivScale0, Mul, DivScale1);

  SDValue Scale;

  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

    // Figure out if the scale to use for div_fmas.
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getValue(1);
  }

  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
                             Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}

6584 SDValue
SITargetLowering::LowerFDIV(SDValue Op
, SelectionDAG
&DAG
) const {
6585 EVT VT
= Op
.getValueType();
6588 return LowerFDIV32(Op
, DAG
);
6591 return LowerFDIV64(Op
, DAG
);
6594 return LowerFDIV16(Op
, DAG
);
6596 llvm_unreachable("Unexpected type for fdiv");
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    return DAG.getTruncStore(Store->getChain(), DL,
                             DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
                             Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                          AS, Store->getAlignment())) {
    return expandUnalignedStore(Store, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instructions access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    AS = MFI->hasFlatScratchInit() ?
         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = VT.getVectorNumElements();
  if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
      AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorStore(Op, DAG);
    return SDValue();
  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    // Use ds_write_b128 if possible.
    if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
        VT.getStoreSize() == 16)
      return SDValue();

    if (NumElements > 2)
      return SplitVectorStore(Op, DAG);
    return SDValue();
  } else {
    llvm_unreachable("unhandled address space");
  }
}
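// Illustrative note (not from the original source): for private (scratch)
// stores, the max-private-element-size setting above means 4 scalarizes every
// vector store, 8 splits anything wider than two elements, and 16 splits
// anything wider than four elements.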
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue TrigVal;

  // TODO: Should this propagate fast-math-flags?

  SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);

  if (Subtarget->hasTrigReducedRange()) {
    SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
    TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
  } else {
    TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
  }

  switch (Op.getOpcode()) {
  case ISD::FCOS:
    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
  case ISD::FSIN:
    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
  default:
    llvm_unreachable("Wrong trig opcode");
  }
}
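// Illustrative example (not from the original source): on subtargets with a
// reduced trig range, (fsin x) becomes sin_hw(fract(x * 0.5/PI)); otherwise it
// is sin_hw(x * 0.5/PI), since the hardware sin/cos take their argument in
// units of full rotations rather than radians.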
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                               SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
  if (!isFlatGlobalAddrSpace(AS))
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  SDValue Old = Op.getOperand(2);
  SDValue New = Op.getOperand(3);
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  MVT VecType = MVT::getVectorVT(SimpleVT, 2);

  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});

  SDValue Ops[] = { ChainIn, Addr, NewOld };

  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
                                 Ops, VT, AtomicNode->getMemOperand());
}
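// Illustrative note (not from the original source): for a 32-bit cmpxchg the
// data operand built here is a v2i32 of {New, Old}: the value to store in
// element 0 and the value to compare against in element 1, matching the
// src/cmp register pair expected by the hardware cmpswap instructions.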
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  EVT ScalarVT = VT.getScalarType();
  if (ScalarVT != MVT::f32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
      DCI.AddToWorklist(Cvt.getNode());
      return Cvt;
    }
  }

  return SDValue();
}
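// Illustrative example (not from the original source): a conversion such as
// (f32 uint_to_fp (and i32 x, 255)) has the top 24 bits of its source known to
// be zero, so it is rewritten to cvt_f32_ubyte0 x and selects v_cvt_f32_ubyte0.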
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
//
// This is a variant of
// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
//
// The normal DAG combiner will do this, but only if the add has one use since
// that would increase the number of instructions.
//
// This prevents us from seeing a constant offset that can be folded into a
// memory instruction's addressing mode. If we know the resulting add offset of
// a pointer can be folded into an addressing offset, we can replace the pointer
// operand with the add of new constant offset. This eliminates one of the uses,
// and may allow the remaining use to also be simplified.
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
                                               unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We only do this to handle cases where it's profitable when there are
  // multiple uses of the add, so defer to the standard combine.
  if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
      N0->hasOneUse())
    return SDValue();

  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
  if (!CN1)
    return SDValue();

  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!CAdd)
    return SDValue();

  // If the resulting offset is too large, we can't fold it into the addressing
  // mode offset.
  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
  Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());

  AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();
  if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
  SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);

  SDNodeFlags Flags;
  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
                          (N0.getOpcode() == ISD::OR ||
                           N0->getFlags().hasNoUnsignedWrap()));

  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}

SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue Ptr = N->getBasePtr();
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  // TODO: We could also do this for multiplies.
  if (Ptr.getOpcode() == ISD::SHL) {
    SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                          N->getMemoryVT(), DCI);
    if (NewPtr) {
      SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());

      NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
      return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
    }
  }

  return SDValue();
}
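// Illustrative example (not from the original source): for a load whose
// address is (shl (add %base, 16), 2) where the add has multiple uses, the
// combine above rewrites the pointer to (add (shl %base, 2), 64), so the
// constant 64 can later be folded into the instruction's immediate offset
// when the addressing mode allows it.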
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
         (Opc == ISD::XOR && Val == 0);
}

// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
// integer combine opportunities since most 64-bit operations are decomposed
// this way. TODO: We won't want this for SALU especially if it is an inline
// immediate.
SDValue SITargetLowering::splitBinaryBitConstantOp(
  DAGCombinerInfo &DCI,
  const SDLoc &SL,
  unsigned Opc, SDValue LHS,
  const ConstantSDNode *CRHS) const {
  uint64_t Val = CRHS->getZExtValue();
  uint32_t ValLo = Lo_32(Val);
  uint32_t ValHi = Hi_32(Val);
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
       bitOpWithConstantIsReducible(Opc, ValHi)) ||
      (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
    // If we need to materialize a 64-bit immediate, it will be split up later
    // anyway. Avoid creating the harder to understand 64-bit immediate
    // materialization.
    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
  }

  return SDValue();
}
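// Illustrative example (not from the original source): an i64 AND with
// 0x00000000ffffffff has ValLo = 0xffffffff and ValHi = 0, both reducible, so
// it is split into a no-op on the low half and a zeroing of the high half
// instead of materializing the 64-bit mask.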
// Returns true if argument is a boolean value which is not serialized into
// memory or argument and does not require v_cndmask_b32 to be deserialized.
static bool isBoolSGPR(SDValue V) {
  if (V.getValueType() != MVT::i1)
    return false;
  switch (V.getOpcode()) {
  default: break;
  case ISD::SETCC:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case AMDGPUISD::FP_CLASS:
    return true;
  }
  return false;
}
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  // 0xff for any zero byte in the mask
  uint32_t ZeroByteMask = 0;
  if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
  if ((NonZeroByteMask & C) != NonZeroByteMask)
    return 0; // Partial bytes selected.
  return C;
}

// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns the select mask as used by
// v_perm_b32, or ~0u if it did not succeed.
// Note byte select encoding:
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)
    return ~0;

  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!N1)
    return ~0;

  uint32_t C = N1->getZExtValue();

  switch (V.getOpcode()) {
  default:
    break;
  case ISD::AND:
    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    }
    break;

  case ISD::OR:
    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
      return (0x03020100 & ~ConstMask) | ConstMask;
    }
    break;

  case ISD::SHL:
    if (C % 8)
      return ~0;

    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

  case ISD::SRL:
    if (C % 8)
      return ~0;

    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }

  return ~0;
}
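// Illustrative example (not from the original source): for (and x, 0x0000ffff),
// getConstantPermuteMask(0x0000ffff) returns 0x0000ffff, so the select mask is
// (0x03020100 & 0x0000ffff) | (0x0c0c0c0c & 0xffff0000) = 0x0c0c0100, i.e.
// bytes 0 and 1 come from x and the two upper bytes are zero.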
6945 SDValue
SITargetLowering::performAndCombine(SDNode
*N
,
6946 DAGCombinerInfo
&DCI
) const {
6947 if (DCI
.isBeforeLegalize())
6950 SelectionDAG
&DAG
= DCI
.DAG
;
6951 EVT VT
= N
->getValueType(0);
6952 SDValue LHS
= N
->getOperand(0);
6953 SDValue RHS
= N
->getOperand(1);
6956 const ConstantSDNode
*CRHS
= dyn_cast
<ConstantSDNode
>(RHS
);
6957 if (VT
== MVT::i64
&& CRHS
) {
6959 = splitBinaryBitConstantOp(DCI
, SDLoc(N
), ISD::AND
, LHS
, CRHS
))
6963 if (CRHS
&& VT
== MVT::i32
) {
6964 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
6965 // nb = number of trailing zeroes in mask
6966 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
6967 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
6968 uint64_t Mask
= CRHS
->getZExtValue();
6969 unsigned Bits
= countPopulation(Mask
);
6970 if (getSubtarget()->hasSDWA() && LHS
->getOpcode() == ISD::SRL
&&
6971 (Bits
== 8 || Bits
== 16) && isShiftedMask_64(Mask
) && !(Mask
& 1)) {
6972 if (auto *CShift
= dyn_cast
<ConstantSDNode
>(LHS
->getOperand(1))) {
6973 unsigned Shift
= CShift
->getZExtValue();
6974 unsigned NB
= CRHS
->getAPIntValue().countTrailingZeros();
6975 unsigned Offset
= NB
+ Shift
;
6976 if ((Offset
& (Bits
- 1)) == 0) { // Starts at a byte or word boundary.
6978 SDValue BFE
= DAG
.getNode(AMDGPUISD::BFE_U32
, SL
, MVT::i32
,
6980 DAG
.getConstant(Offset
, SL
, MVT::i32
),
6981 DAG
.getConstant(Bits
, SL
, MVT::i32
));
6982 EVT NarrowVT
= EVT::getIntegerVT(*DAG
.getContext(), Bits
);
6983 SDValue Ext
= DAG
.getNode(ISD::AssertZext
, SL
, VT
, BFE
,
6984 DAG
.getValueType(NarrowVT
));
6985 SDValue Shl
= DAG
.getNode(ISD::SHL
, SDLoc(LHS
), VT
, Ext
,
6986 DAG
.getConstant(NB
, SDLoc(CRHS
), MVT::i32
));
6992 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
6993 if (LHS
.hasOneUse() && LHS
.getOpcode() == AMDGPUISD::PERM
&&
6994 isa
<ConstantSDNode
>(LHS
.getOperand(2))) {
6995 uint32_t Sel
= getConstantPermuteMask(Mask
);
6999 // Select 0xc for all zero bytes
7000 Sel
= (LHS
.getConstantOperandVal(2) & Sel
) | (~Sel
& 0x0c0c0c0c);
7002 return DAG
.getNode(AMDGPUISD::PERM
, DL
, MVT::i32
, LHS
.getOperand(0),
7003 LHS
.getOperand(1), DAG
.getConstant(Sel
, DL
, MVT::i32
));
7007 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
7008 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
7009 if (LHS
.getOpcode() == ISD::SETCC
&& RHS
.getOpcode() == ISD::SETCC
) {
7010 ISD::CondCode LCC
= cast
<CondCodeSDNode
>(LHS
.getOperand(2))->get();
7011 ISD::CondCode RCC
= cast
<CondCodeSDNode
>(RHS
.getOperand(2))->get();
7013 SDValue X
= LHS
.getOperand(0);
7014 SDValue Y
= RHS
.getOperand(0);
7015 if (Y
.getOpcode() != ISD::FABS
|| Y
.getOperand(0) != X
)
7018 if (LCC
== ISD::SETO
) {
7019 if (X
!= LHS
.getOperand(1))
7022 if (RCC
== ISD::SETUNE
) {
7023 const ConstantFPSDNode
*C1
= dyn_cast
<ConstantFPSDNode
>(RHS
.getOperand(1));
7024 if (!C1
|| !C1
->isInfinity() || C1
->isNegative())
7027 const uint32_t Mask
= SIInstrFlags::N_NORMAL
|
7028 SIInstrFlags::N_SUBNORMAL
|
7029 SIInstrFlags::N_ZERO
|
7030 SIInstrFlags::P_ZERO
|
7031 SIInstrFlags::P_SUBNORMAL
|
7032 SIInstrFlags::P_NORMAL
;
7034 static_assert(((~(SIInstrFlags::S_NAN
|
7035 SIInstrFlags::Q_NAN
|
7036 SIInstrFlags::N_INFINITY
|
7037 SIInstrFlags::P_INFINITY
)) & 0x3ff) == Mask
,
7041 return DAG
.getNode(AMDGPUISD::FP_CLASS
, DL
, MVT::i1
,
7042 X
, DAG
.getConstant(Mask
, DL
, MVT::i32
));
7047 if (RHS
.getOpcode() == ISD::SETCC
&& LHS
.getOpcode() == AMDGPUISD::FP_CLASS
)
7048 std::swap(LHS
, RHS
);
7050 if (LHS
.getOpcode() == ISD::SETCC
&& RHS
.getOpcode() == AMDGPUISD::FP_CLASS
&&
7052 ISD::CondCode LCC
= cast
<CondCodeSDNode
>(LHS
.getOperand(2))->get();
7053 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
7054 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
7055 const ConstantSDNode
*Mask
= dyn_cast
<ConstantSDNode
>(RHS
.getOperand(1));
7056 if ((LCC
== ISD::SETO
|| LCC
== ISD::SETUO
) && Mask
&&
7057 (RHS
.getOperand(0) == LHS
.getOperand(0) &&
7058 LHS
.getOperand(0) == LHS
.getOperand(1))) {
7059 const unsigned OrdMask
= SIInstrFlags::S_NAN
| SIInstrFlags::Q_NAN
;
7060 unsigned NewMask
= LCC
== ISD::SETO
?
7061 Mask
->getZExtValue() & ~OrdMask
:
7062 Mask
->getZExtValue() & OrdMask
;
7065 return DAG
.getNode(AMDGPUISD::FP_CLASS
, DL
, MVT::i1
, RHS
.getOperand(0),
7066 DAG
.getConstant(NewMask
, DL
, MVT::i32
));
7070 if (VT
== MVT::i32
&&
7071 (RHS
.getOpcode() == ISD::SIGN_EXTEND
|| LHS
.getOpcode() == ISD::SIGN_EXTEND
)) {
7072 // and x, (sext cc from i1) => select cc, x, 0
7073 if (RHS
.getOpcode() != ISD::SIGN_EXTEND
)
7074 std::swap(LHS
, RHS
);
7075 if (isBoolSGPR(RHS
.getOperand(0)))
7076 return DAG
.getSelect(SDLoc(N
), MVT::i32
, RHS
.getOperand(0),
7077 LHS
, DAG
.getConstant(0, SDLoc(N
), MVT::i32
));
7080 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7081 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
7082 if (VT
== MVT::i32
&& LHS
.hasOneUse() && RHS
.hasOneUse() &&
7083 N
->isDivergent() && TII
->pseudoToMCOpcode(AMDGPU::V_PERM_B32
) != -1) {
7084 uint32_t LHSMask
= getPermuteMask(DAG
, LHS
);
7085 uint32_t RHSMask
= getPermuteMask(DAG
, RHS
);
7086 if (LHSMask
!= ~0u && RHSMask
!= ~0u) {
7087 // Canonicalize the expression in an attempt to have fewer unique masks
7088 // and therefore fewer registers used to hold the masks.
7089 if (LHSMask
> RHSMask
) {
7090 std::swap(LHSMask
, RHSMask
);
7091 std::swap(LHS
, RHS
);
7094 // Select 0xc for each lane used from source operand. Zero has 0xc mask
7095 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
7096 uint32_t LHSUsedLanes
= ~(LHSMask
& 0x0c0c0c0c) & 0x0c0c0c0c;
7097 uint32_t RHSUsedLanes
= ~(RHSMask
& 0x0c0c0c0c) & 0x0c0c0c0c;
7099 // Check of we need to combine values from two sources within a byte.
7100 if (!(LHSUsedLanes
& RHSUsedLanes
) &&
7101 // If we select high and lower word keep it for SDWA.
7102 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7103 !(LHSUsedLanes
== 0x0c0c0000 && RHSUsedLanes
== 0x00000c0c)) {
7104 // Each byte in each mask is either selector mask 0-3, or has higher
7105 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
7106 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
7107 // mask which is not 0xff wins. By anding both masks we have a correct
7108 // result except that 0x0c shall be corrected to give 0x0c only.
7109 uint32_t Mask
= LHSMask
& RHSMask
;
7110 for (unsigned I
= 0; I
< 32; I
+= 8) {
7111 uint32_t ByteSel
= 0xff << I
;
7112 if ((LHSMask
& ByteSel
) == 0x0c || (RHSMask
& ByteSel
) == 0x0c)
7113 Mask
&= (0x0c << I
) & 0xffffffff;
7116 // Add 4 to each active LHS lane. It will not affect any existing 0xff
7118 uint32_t Sel
= Mask
| (LHSUsedLanes
& 0x04040404);
7121 return DAG
.getNode(AMDGPUISD::PERM
, DL
, MVT::i32
,
7122 LHS
.getOperand(0), RHS
.getOperand(0),
7123 DAG
.getConstant(Sel
, DL
, MVT::i32
));
7131 SDValue
SITargetLowering::performOrCombine(SDNode
*N
,
7132 DAGCombinerInfo
&DCI
) const {
7133 SelectionDAG
&DAG
= DCI
.DAG
;
7134 SDValue LHS
= N
->getOperand(0);
7135 SDValue RHS
= N
->getOperand(1);
7137 EVT VT
= N
->getValueType(0);
7138 if (VT
== MVT::i1
) {
7139 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
7140 if (LHS
.getOpcode() == AMDGPUISD::FP_CLASS
&&
7141 RHS
.getOpcode() == AMDGPUISD::FP_CLASS
) {
7142 SDValue Src
= LHS
.getOperand(0);
7143 if (Src
!= RHS
.getOperand(0))
7146 const ConstantSDNode
*CLHS
= dyn_cast
<ConstantSDNode
>(LHS
.getOperand(1));
7147 const ConstantSDNode
*CRHS
= dyn_cast
<ConstantSDNode
>(RHS
.getOperand(1));
7151 // Only 10 bits are used.
7152 static const uint32_t MaxMask
= 0x3ff;
7154 uint32_t NewMask
= (CLHS
->getZExtValue() | CRHS
->getZExtValue()) & MaxMask
;
7156 return DAG
.getNode(AMDGPUISD::FP_CLASS
, DL
, MVT::i1
,
7157 Src
, DAG
.getConstant(NewMask
, DL
, MVT::i32
));
7163 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7164 if (isa
<ConstantSDNode
>(RHS
) && LHS
.hasOneUse() &&
7165 LHS
.getOpcode() == AMDGPUISD::PERM
&&
7166 isa
<ConstantSDNode
>(LHS
.getOperand(2))) {
7167 uint32_t Sel
= getConstantPermuteMask(N
->getConstantOperandVal(1));
7171 Sel
|= LHS
.getConstantOperandVal(2);
7173 return DAG
.getNode(AMDGPUISD::PERM
, DL
, MVT::i32
, LHS
.getOperand(0),
7174 LHS
.getOperand(1), DAG
.getConstant(Sel
, DL
, MVT::i32
));
7177 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7178 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
7179 if (VT
== MVT::i32
&& LHS
.hasOneUse() && RHS
.hasOneUse() &&
7180 N
->isDivergent() && TII
->pseudoToMCOpcode(AMDGPU::V_PERM_B32
) != -1) {
7181 uint32_t LHSMask
= getPermuteMask(DAG
, LHS
);
7182 uint32_t RHSMask
= getPermuteMask(DAG
, RHS
);
7183 if (LHSMask
!= ~0u && RHSMask
!= ~0u) {
7184 // Canonicalize the expression in an attempt to have fewer unique masks
7185 // and therefore fewer registers used to hold the masks.
7186 if (LHSMask
> RHSMask
) {
7187 std::swap(LHSMask
, RHSMask
);
7188 std::swap(LHS
, RHS
);
7191 // Select 0xc for each lane used from source operand. Zero has 0xc mask
7192 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
7193 uint32_t LHSUsedLanes
= ~(LHSMask
& 0x0c0c0c0c) & 0x0c0c0c0c;
7194 uint32_t RHSUsedLanes
= ~(RHSMask
& 0x0c0c0c0c) & 0x0c0c0c0c;
7196 // Check of we need to combine values from two sources within a byte.
7197 if (!(LHSUsedLanes
& RHSUsedLanes
) &&
7198 // If we select high and lower word keep it for SDWA.
7199 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7200 !(LHSUsedLanes
== 0x0c0c0000 && RHSUsedLanes
== 0x00000c0c)) {
7201 // Kill zero bytes selected by other mask. Zero value is 0xc.
7202 LHSMask
&= ~RHSUsedLanes
;
7203 RHSMask
&= ~LHSUsedLanes
;
7204 // Add 4 to each active LHS lane
7205 LHSMask
|= LHSUsedLanes
& 0x04040404;
7207 uint32_t Sel
= LHSMask
| RHSMask
;
7210 return DAG
.getNode(AMDGPUISD::PERM
, DL
, MVT::i32
,
7211 LHS
.getOperand(0), RHS
.getOperand(0),
7212 DAG
.getConstant(Sel
, DL
, MVT::i32
));
7220 // TODO: This could be a generic combine with a predicate for extracting the
7221 // high half of an integer being free.
7223 // (or i64:x, (zero_extend i32:y)) ->
7224 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
7225 if (LHS
.getOpcode() == ISD::ZERO_EXTEND
&&
7226 RHS
.getOpcode() != ISD::ZERO_EXTEND
)
7227 std::swap(LHS
, RHS
);
7229 if (RHS
.getOpcode() == ISD::ZERO_EXTEND
) {
7230 SDValue ExtSrc
= RHS
.getOperand(0);
7231 EVT SrcVT
= ExtSrc
.getValueType();
7232 if (SrcVT
== MVT::i32
) {
7234 SDValue LowLHS
, HiBits
;
7235 std::tie(LowLHS
, HiBits
) = split64BitValue(LHS
, DAG
);
7236 SDValue LowOr
= DAG
.getNode(ISD::OR
, SL
, MVT::i32
, LowLHS
, ExtSrc
);
7238 DCI
.AddToWorklist(LowOr
.getNode());
7239 DCI
.AddToWorklist(HiBits
.getNode());
7241 SDValue Vec
= DAG
.getNode(ISD::BUILD_VECTOR
, SL
, MVT::v2i32
,
7243 return DAG
.getNode(ISD::BITCAST
, SL
, MVT::i64
, Vec
);
7247 const ConstantSDNode
*CRHS
= dyn_cast
<ConstantSDNode
>(N
->getOperand(1));
7250 = splitBinaryBitConstantOp(DCI
, SDLoc(N
), ISD::OR
, LHS
, CRHS
))
7257 SDValue
SITargetLowering::performXorCombine(SDNode
*N
,
7258 DAGCombinerInfo
&DCI
) const {
7259 EVT VT
= N
->getValueType(0);
7263 SDValue LHS
= N
->getOperand(0);
7264 SDValue RHS
= N
->getOperand(1);
7266 const ConstantSDNode
*CRHS
= dyn_cast
<ConstantSDNode
>(RHS
);
7269 = splitBinaryBitConstantOp(DCI
, SDLoc(N
), ISD::XOR
, LHS
, CRHS
))
7276 // Instructions that will be lowered with a final instruction that zeros the
7277 // high result bits.
7278 // XXX - probably only need to list legal operations.
7279 static bool fp16SrcZerosHighBits(unsigned Opc
) {
7288 case ISD::FCANONICALIZE
:
7290 case ISD::UINT_TO_FP
:
7291 case ISD::SINT_TO_FP
:
7293 // Fabs is lowered to a bit operation, but it's an and which will clear the
7294 // high bits anyway.
7308 case ISD::FNEARBYINT
:
7313 case AMDGPUISD::FRACT
:
7314 case AMDGPUISD::CLAMP
:
7315 case AMDGPUISD::COS_HW
:
7316 case AMDGPUISD::SIN_HW
:
7317 case AMDGPUISD::FMIN3
:
7318 case AMDGPUISD::FMAX3
:
7319 case AMDGPUISD::FMED3
:
7320 case AMDGPUISD::FMAD_FTZ
:
7321 case AMDGPUISD::RCP
:
7322 case AMDGPUISD::RSQ
:
7323 case AMDGPUISD::RCP_IFLAG
:
7324 case AMDGPUISD::LDEXP
:
7327 // fcopysign, select and others may be lowered to 32-bit bit operations
7328 // which don't zero the high bits.
7333 SDValue
SITargetLowering::performZeroExtendCombine(SDNode
*N
,
7334 DAGCombinerInfo
&DCI
) const {
7335 if (!Subtarget
->has16BitInsts() ||
7336 DCI
.getDAGCombineLevel() < AfterLegalizeDAG
)
7339 EVT VT
= N
->getValueType(0);
7343 SDValue Src
= N
->getOperand(0);
7344 if (Src
.getValueType() != MVT::i16
)
7347 // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
7348 // FIXME: It is not universally true that the high bits are zeroed on gfx9.
7349 if (Src
.getOpcode() == ISD::BITCAST
) {
7350 SDValue BCSrc
= Src
.getOperand(0);
7351 if (BCSrc
.getValueType() == MVT::f16
&&
7352 fp16SrcZerosHighBits(BCSrc
.getOpcode()))
7353 return DCI
.DAG
.getNode(AMDGPUISD::FP16_ZEXT
, SDLoc(N
), VT
, BCSrc
);
7359 SDValue
SITargetLowering::performClassCombine(SDNode
*N
,
7360 DAGCombinerInfo
&DCI
) const {
7361 SelectionDAG
&DAG
= DCI
.DAG
;
7362 SDValue Mask
= N
->getOperand(1);
7364 // fp_class x, 0 -> false
7365 if (const ConstantSDNode
*CMask
= dyn_cast
<ConstantSDNode
>(Mask
)) {
7366 if (CMask
->isNullValue())
7367 return DAG
.getConstant(0, SDLoc(N
), MVT::i1
);
7370 if (N
->getOperand(0).isUndef())
7371 return DAG
.getUNDEF(MVT::i1
);
7376 SDValue
SITargetLowering::performRcpCombine(SDNode
*N
,
7377 DAGCombinerInfo
&DCI
) const {
7378 EVT VT
= N
->getValueType(0);
7379 SDValue N0
= N
->getOperand(0);
7384 if (VT
== MVT::f32
&& (N0
.getOpcode() == ISD::UINT_TO_FP
||
7385 N0
.getOpcode() == ISD::SINT_TO_FP
)) {
7386 return DCI
.DAG
.getNode(AMDGPUISD::RCP_IFLAG
, SDLoc(N
), VT
, N0
,
7390 return AMDGPUTargetLowering::performRcpCombine(N
, DCI
);
7393 bool SITargetLowering::isCanonicalized(SelectionDAG
&DAG
, SDValue Op
,
7394 unsigned MaxDepth
) const {
7395 unsigned Opcode
= Op
.getOpcode();
7396 if (Opcode
== ISD::FCANONICALIZE
)
7399 if (auto *CFP
= dyn_cast
<ConstantFPSDNode
>(Op
)) {
7400 auto F
= CFP
->getValueAPF();
7401 if (F
.isNaN() && F
.isSignaling())
7403 return !F
.isDenormal() || denormalsEnabledForType(Op
.getValueType());
7406 // If source is a result of another standard FP operation it is already in
7412 // These will flush denorms if required.
7424 case ISD::FP_EXTEND
:
7425 case AMDGPUISD::FMUL_LEGACY
:
7426 case AMDGPUISD::FMAD_FTZ
:
7427 case AMDGPUISD::RCP
:
7428 case AMDGPUISD::RSQ
:
7429 case AMDGPUISD::RSQ_CLAMP
:
7430 case AMDGPUISD::RCP_LEGACY
:
7431 case AMDGPUISD::RSQ_LEGACY
:
7432 case AMDGPUISD::RCP_IFLAG
:
7433 case AMDGPUISD::TRIG_PREOP
:
7434 case AMDGPUISD::DIV_SCALE
:
7435 case AMDGPUISD::DIV_FMAS
:
7436 case AMDGPUISD::DIV_FIXUP
:
7437 case AMDGPUISD::FRACT
:
7438 case AMDGPUISD::LDEXP
:
7439 case AMDGPUISD::CVT_PKRTZ_F16_F32
:
7440 case AMDGPUISD::CVT_F32_UBYTE0
:
7441 case AMDGPUISD::CVT_F32_UBYTE1
:
7442 case AMDGPUISD::CVT_F32_UBYTE2
:
7443 case AMDGPUISD::CVT_F32_UBYTE3
:
7446 // It can/will be lowered or combined as a bit operation.
7447 // Need to check their input recursively to handle.
7450 case ISD::FCOPYSIGN
:
7451 return isCanonicalized(DAG
, Op
.getOperand(0), MaxDepth
- 1);
7456 return Op
.getValueType().getScalarType() != MVT::f16
;
7460 case AMDGPUISD::CLAMP
:
7461 case AMDGPUISD::FMED3
:
7462 case AMDGPUISD::FMAX3
:
7463 case AMDGPUISD::FMIN3
: {
7464 // FIXME: Shouldn't treat the generic operations different based these.
7465 bool IsIEEEMode
= Subtarget
->enableIEEEBit(DAG
.getMachineFunction());
7467 // snans will be quieted, so we only need to worry about denormals.
7468 if (Subtarget
->supportsMinMaxDenormModes() ||
7469 denormalsEnabledForType(Op
.getValueType()))
7472 // Flushing may be required.
7473 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
7474 // targets need to check their input recursively.
7475 return isCanonicalized(DAG
, Op
.getOperand(0), MaxDepth
- 1) &&
7476 isCanonicalized(DAG
, Op
.getOperand(1), MaxDepth
- 1);
7479 if (Subtarget
->supportsMinMaxDenormModes() ||
7480 denormalsEnabledForType(Op
.getValueType())) {
7481 // Only quieting may be necessary.
7482 return DAG
.isKnownNeverSNaN(Op
.getOperand(0)) &&
7483 DAG
.isKnownNeverSNaN(Op
.getOperand(1));
7486 // Flushing and quieting may be necessary
7487 // With ieee_mode off, the nan is returned as-is, so if it is an sNaN it
7488 // needs to be quieted.
7489 return isCanonicalized(DAG
, Op
.getOperand(0), MaxDepth
- 1) &&
7490 isCanonicalized(DAG
, Op
.getOperand(1), MaxDepth
- 1);
7493 return isCanonicalized(DAG
, Op
.getOperand(1), MaxDepth
- 1) &&
7494 isCanonicalized(DAG
, Op
.getOperand(2), MaxDepth
- 1);
7496 case ISD::BUILD_VECTOR
: {
7497 for (unsigned i
= 0, e
= Op
.getNumOperands(); i
!= e
; ++i
) {
7498 SDValue SrcOp
= Op
.getOperand(i
);
7499 if (!isCanonicalized(DAG
, SrcOp
, MaxDepth
- 1))
7505 case ISD::EXTRACT_VECTOR_ELT
:
7506 case ISD::EXTRACT_SUBVECTOR
: {
7507 return isCanonicalized(DAG
, Op
.getOperand(0), MaxDepth
- 1);
7509 case ISD::INSERT_VECTOR_ELT
: {
7510 return isCanonicalized(DAG
, Op
.getOperand(0), MaxDepth
- 1) &&
7511 isCanonicalized(DAG
, Op
.getOperand(1), MaxDepth
- 1);
7514 // Could be anything.
7517 case ISD::INTRINSIC_WO_CHAIN
: {
7518 unsigned IntrinsicID
7519 = cast
<ConstantSDNode
>(Op
.getOperand(0))->getZExtValue();
7520 // TODO: Handle more intrinsics
7521 switch (IntrinsicID
) {
7522 case Intrinsic::amdgcn_cvt_pkrtz
:
7523 case Intrinsic::amdgcn_cubeid
:
7524 case Intrinsic::amdgcn_frexp_mant
:
7525 case Intrinsic::amdgcn_fdot2
:
7534 return denormalsEnabledForType(Op
.getValueType()) &&
7535 DAG
.isKnownNeverSNaN(Op
);
7538 llvm_unreachable("invalid operation");
// Constant fold canonicalize.
SDValue SITargetLowering::getCanonicalConstantFP(
  SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
  // Flush denormals to 0 if not enabled.
  if (C.isDenormal() && !denormalsEnabledForType(VT))
    return DAG.getConstantFP(0.0, SL, VT);

  if (C.isNaN()) {
    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
    if (C.isSignaling()) {
      // Quiet a signaling NaN.
      // FIXME: Is this supposed to preserve payload bits?
      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
    }

    // Make sure it is the canonical NaN bitpattern.
    //
    // TODO: Can we use -1 as the canonical NaN value since it's an inline
    // constant?
    if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
  }

  // Already canonical.
  return DAG.getConstantFP(C, SL, VT);
}

static bool vectorEltWillFoldAway(SDValue Op) {
  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
}
7573 SDValue
SITargetLowering::performFCanonicalizeCombine(
7575 DAGCombinerInfo
&DCI
) const {
7576 SelectionDAG
&DAG
= DCI
.DAG
;
7577 SDValue N0
= N
->getOperand(0);
7578 EVT VT
= N
->getValueType(0);
7580 // fcanonicalize undef -> qnan
7582 APFloat QNaN
= APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT
));
7583 return DAG
.getConstantFP(QNaN
, SDLoc(N
), VT
);
7586 if (ConstantFPSDNode
*CFP
= isConstOrConstSplatFP(N0
)) {
7587 EVT VT
= N
->getValueType(0);
7588 return getCanonicalConstantFP(DAG
, SDLoc(N
), VT
, CFP
->getValueAPF());
7591 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
7592 // (fcanonicalize k)
7594 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
7596 // TODO: This could be better with wider vectors that will be split to v2f16,
7597 // and to consider uses since there aren't that many packed operations.
7598 if (N0
.getOpcode() == ISD::BUILD_VECTOR
&& VT
== MVT::v2f16
&&
7599 isTypeLegal(MVT::v2f16
)) {
7602 SDValue Lo
= N0
.getOperand(0);
7603 SDValue Hi
= N0
.getOperand(1);
7604 EVT EltVT
= Lo
.getValueType();
7606 if (vectorEltWillFoldAway(Lo
) || vectorEltWillFoldAway(Hi
)) {
7607 for (unsigned I
= 0; I
!= 2; ++I
) {
7608 SDValue Op
= N0
.getOperand(I
);
7609 if (ConstantFPSDNode
*CFP
= dyn_cast
<ConstantFPSDNode
>(Op
)) {
7610 NewElts
[I
] = getCanonicalConstantFP(DAG
, SL
, EltVT
,
7611 CFP
->getValueAPF());
7612 } else if (Op
.isUndef()) {
7613 // Handled below based on what the other operand is.
7616 NewElts
[I
] = DAG
.getNode(ISD::FCANONICALIZE
, SL
, EltVT
, Op
);
7620 // If one half is undef, and one is constant, perfer a splat vector rather
7621 // than the normal qNaN. If it's a register, prefer 0.0 since that's
7622 // cheaper to use and may be free with a packed operation.
7623 if (NewElts
[0].isUndef()) {
7624 if (isa
<ConstantFPSDNode
>(NewElts
[1]))
7625 NewElts
[0] = isa
<ConstantFPSDNode
>(NewElts
[1]) ?
7626 NewElts
[1]: DAG
.getConstantFP(0.0f
, SL
, EltVT
);
7629 if (NewElts
[1].isUndef()) {
7630 NewElts
[1] = isa
<ConstantFPSDNode
>(NewElts
[0]) ?
7631 NewElts
[0] : DAG
.getConstantFP(0.0f
, SL
, EltVT
);
7634 return DAG
.getBuildVector(VT
, SL
, NewElts
);
7638 return isCanonicalized(DAG
, N0
) ? N0
: SDValue();
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
  switch (Opc) {
  case ISD::FMAXNUM:
    return AMDGPUISD::FMAX3;
  case ISD::SMAX:
    return AMDGPUISD::SMAX3;
  case ISD::UMAX:
    return AMDGPUISD::UMAX3;
  case ISD::FMINNUM:
    return AMDGPUISD::FMIN3;
  case ISD::SMIN:
    return AMDGPUISD::SMIN3;
  case ISD::UMIN:
    return AMDGPUISD::UMIN3;
  default:
    llvm_unreachable("Not a min/max opcode");
  }
}

SDValue SITargetLowering::performIntMed3ImmCombine(
  SelectionDAG &DAG, const SDLoc &SL,
  SDValue Op0, SDValue Op1, bool Signed) const {
  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
  if (!K1)
    return SDValue();

  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  if (Signed) {
    if (K0->getAPIntValue().sge(K1->getAPIntValue()))
      return SDValue();
  } else {
    if (K0->getAPIntValue().uge(K1->getAPIntValue()))
      return SDValue();
  }

  EVT VT = K0->getValueType(0);
  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
    return DAG.getNode(Med3Opc, SL, VT,
                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
  }

  // If there isn't a 16-bit med3 operation, convert to 32-bit.
  MVT NVT = MVT::i32;
  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

  SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
  SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
  SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);

  SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
}
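// Illustrative example (not from the original source): (smin (smax x, 0), 255)
// has K0 = 0 < K1 = 255, so for i32 it becomes (smed3 x, 0, 255), which maps
// to a single v_med3_i32.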
7698 static ConstantFPSDNode
*getSplatConstantFP(SDValue Op
) {
7699 if (ConstantFPSDNode
*C
= dyn_cast
<ConstantFPSDNode
>(Op
))
7702 if (BuildVectorSDNode
*BV
= dyn_cast
<BuildVectorSDNode
>(Op
)) {
7703 if (ConstantFPSDNode
*C
= BV
->getConstantFPSplatNode())
7710 SDValue
SITargetLowering::performFPMed3ImmCombine(SelectionDAG
&DAG
,
7713 SDValue Op1
) const {
7714 ConstantFPSDNode
*K1
= getSplatConstantFP(Op1
);
7718 ConstantFPSDNode
*K0
= getSplatConstantFP(Op0
.getOperand(1));
7722 // Ordered >= (although NaN inputs should have folded away by now).
7723 APFloat::cmpResult Cmp
= K0
->getValueAPF().compare(K1
->getValueAPF());
7724 if (Cmp
== APFloat::cmpGreaterThan
)
7727 // TODO: Check IEEE bit enabled?
7728 EVT VT
= Op0
.getValueType();
7729 if (Subtarget
->enableDX10Clamp()) {
7730 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
7731 // hardware fmed3 behavior converting to a min.
7732 // FIXME: Should this be allowing -0.0?
7733 if (K1
->isExactlyValue(1.0) && K0
->isExactlyValue(0.0))
7734 return DAG
.getNode(AMDGPUISD::CLAMP
, SL
, VT
, Op0
.getOperand(0));
7737 // med3 for f16 is only available on gfx9+, and not available for v2f16.
7738 if (VT
== MVT::f32
|| (VT
== MVT::f16
&& Subtarget
->hasMed3_16())) {
7739 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
7740 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
7741 // then give the other result, which is different from med3 with a NaN
7743 SDValue Var
= Op0
.getOperand(0);
7744 if (!DAG
.isKnownNeverSNaN(Var
))
7747 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
7749 if ((!K0
->hasOneUse() ||
7750 TII
->isInlineConstant(K0
->getValueAPF().bitcastToAPInt())) &&
7751 (!K1
->hasOneUse() ||
7752 TII
->isInlineConstant(K1
->getValueAPF().bitcastToAPInt()))) {
7753 return DAG
.getNode(AMDGPUISD::FMED3
, SL
, K0
->getValueType(0),
7754 Var
, SDValue(K0
, 0), SDValue(K1
, 0));
7761 SDValue
SITargetLowering::performMinMaxCombine(SDNode
*N
,
7762 DAGCombinerInfo
&DCI
) const {
7763 SelectionDAG
&DAG
= DCI
.DAG
;
7765 EVT VT
= N
->getValueType(0);
7766 unsigned Opc
= N
->getOpcode();
7767 SDValue Op0
= N
->getOperand(0);
7768 SDValue Op1
= N
->getOperand(1);
7770 // Only do this if the inner op has one use since this will just increases
7771 // register pressure for no benefit.
7774 if (Opc
!= AMDGPUISD::FMIN_LEGACY
&& Opc
!= AMDGPUISD::FMAX_LEGACY
&&
7775 !VT
.isVector() && VT
!= MVT::f64
&&
7776 ((VT
!= MVT::f16
&& VT
!= MVT::i16
) || Subtarget
->hasMin3Max3_16())) {
7777 // max(max(a, b), c) -> max3(a, b, c)
7778 // min(min(a, b), c) -> min3(a, b, c)
7779 if (Op0
.getOpcode() == Opc
&& Op0
.hasOneUse()) {
7781 return DAG
.getNode(minMaxOpcToMin3Max3Opc(Opc
),
7790 // max(a, max(b, c)) -> max3(a, b, c)
7791 // min(a, min(b, c)) -> min3(a, b, c)
7792 if (Op1
.getOpcode() == Opc
&& Op1
.hasOneUse()) {
7794 return DAG
.getNode(minMaxOpcToMin3Max3Opc(Opc
),
7803 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
7804 if (Opc
== ISD::SMIN
&& Op0
.getOpcode() == ISD::SMAX
&& Op0
.hasOneUse()) {
7805 if (SDValue Med3
= performIntMed3ImmCombine(DAG
, SDLoc(N
), Op0
, Op1
, true))
7809 if (Opc
== ISD::UMIN
&& Op0
.getOpcode() == ISD::UMAX
&& Op0
.hasOneUse()) {
7810 if (SDValue Med3
= performIntMed3ImmCombine(DAG
, SDLoc(N
), Op0
, Op1
, false))
7814 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
7815 if (((Opc
== ISD::FMINNUM
&& Op0
.getOpcode() == ISD::FMAXNUM
) ||
7816 (Opc
== AMDGPUISD::FMIN_LEGACY
&&
7817 Op0
.getOpcode() == AMDGPUISD::FMAX_LEGACY
)) &&
7818 (VT
== MVT::f32
|| VT
== MVT::f64
||
7819 (VT
== MVT::f16
&& Subtarget
->has16BitInsts()) ||
7820 (VT
== MVT::v2f16
&& Subtarget
->hasVOP3PInsts())) &&
7822 if (SDValue Res
= performFPMed3ImmCombine(DAG
, SDLoc(N
), Op0
, Op1
))
7829 static bool isClampZeroToOne(SDValue A
, SDValue B
) {
7830 if (ConstantFPSDNode
*CA
= dyn_cast
<ConstantFPSDNode
>(A
)) {
7831 if (ConstantFPSDNode
*CB
= dyn_cast
<ConstantFPSDNode
>(B
)) {
7832 // FIXME: Should this be allowing -0.0?
7833 return (CA
->isExactlyValue(0.0) && CB
->isExactlyValue(1.0)) ||
7834 (CA
->isExactlyValue(1.0) && CB
->isExactlyValue(0.0));
7841 // FIXME: Should only worry about snans for version with chain.
7842 SDValue
SITargetLowering::performFMed3Combine(SDNode
*N
,
7843 DAGCombinerInfo
&DCI
) const {
7844 EVT VT
= N
->getValueType(0);
7845 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
7846 // NaNs. With a NaN input, the order of the operands may change the result.
7848 SelectionDAG
&DAG
= DCI
.DAG
;
7851 SDValue Src0
= N
->getOperand(0);
7852 SDValue Src1
= N
->getOperand(1);
7853 SDValue Src2
= N
->getOperand(2);
7855 if (isClampZeroToOne(Src0
, Src1
)) {
7856 // const_a, const_b, x -> clamp is safe in all cases including signaling
7858 // FIXME: Should this be allowing -0.0?
7859 return DAG
.getNode(AMDGPUISD::CLAMP
, SL
, VT
, Src2
);
7862 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
7863 // handling no dx10-clamp?
7864 if (Subtarget
->enableDX10Clamp()) {
7865 // If NaNs is clamped to 0, we are free to reorder the inputs.
7867 if (isa
<ConstantFPSDNode
>(Src0
) && !isa
<ConstantFPSDNode
>(Src1
))
7868 std::swap(Src0
, Src1
);
7870 if (isa
<ConstantFPSDNode
>(Src1
) && !isa
<ConstantFPSDNode
>(Src2
))
7871 std::swap(Src1
, Src2
);
7873 if (isa
<ConstantFPSDNode
>(Src0
) && !isa
<ConstantFPSDNode
>(Src1
))
7874 std::swap(Src0
, Src1
);
7876 if (isClampZeroToOne(Src1
, Src2
))
7877 return DAG
.getNode(AMDGPUISD::CLAMP
, SL
, VT
, Src0
);
7883 SDValue
SITargetLowering::performCvtPkRTZCombine(SDNode
*N
,
7884 DAGCombinerInfo
&DCI
) const {
7885 SDValue Src0
= N
->getOperand(0);
7886 SDValue Src1
= N
->getOperand(1);
7887 if (Src0
.isUndef() && Src1
.isUndef())
7888 return DCI
.DAG
.getUNDEF(N
->getValueType(0));
7892 SDValue
SITargetLowering::performExtractVectorEltCombine(
7893 SDNode
*N
, DAGCombinerInfo
&DCI
) const {
7894 SDValue Vec
= N
->getOperand(0);
7895 SelectionDAG
&DAG
= DCI
.DAG
;
7897 EVT VecVT
= Vec
.getValueType();
7898 EVT EltVT
= VecVT
.getVectorElementType();
7900 if ((Vec
.getOpcode() == ISD::FNEG
||
7901 Vec
.getOpcode() == ISD::FABS
) && allUsesHaveSourceMods(N
)) {
7903 EVT EltVT
= N
->getValueType(0);
7904 SDValue Idx
= N
->getOperand(1);
7905 SDValue Elt
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
,
7906 Vec
.getOperand(0), Idx
);
7907 return DAG
.getNode(Vec
.getOpcode(), SL
, EltVT
, Elt
);
7910 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
7912 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
7913 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
7914 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
7915 if (Vec
.hasOneUse() && DCI
.isBeforeLegalize()) {
7917 EVT EltVT
= N
->getValueType(0);
7918 SDValue Idx
= N
->getOperand(1);
7919 unsigned Opc
= Vec
.getOpcode();
7924 // TODO: Support other binary operations.
7934 case ISD::FMINNUM
: {
7935 SDValue Elt0
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
,
7936 Vec
.getOperand(0), Idx
);
7937 SDValue Elt1
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
,
7938 Vec
.getOperand(1), Idx
);
7940 DCI
.AddToWorklist(Elt0
.getNode());
7941 DCI
.AddToWorklist(Elt1
.getNode());
7942 return DAG
.getNode(Opc
, SL
, EltVT
, Elt0
, Elt1
, Vec
->getFlags());
7947 if (!DCI
.isBeforeLegalize())
7950 unsigned VecSize
= VecVT
.getSizeInBits();
7951 unsigned EltSize
= EltVT
.getSizeInBits();
7953 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
7954 // elements. This exposes more load reduction opportunities by replacing
7955 // multiple small extract_vector_elements with a single 32-bit extract.
7956 auto *Idx
= dyn_cast
<ConstantSDNode
>(N
->getOperand(1));
7957 if (isa
<MemSDNode
>(Vec
) &&
7959 EltVT
.isByteSized() &&
7961 VecSize
% 32 == 0 &&
7963 EVT NewVT
= getEquivalentMemType(*DAG
.getContext(), VecVT
);
7965 unsigned BitIndex
= Idx
->getZExtValue() * EltSize
;
7966 unsigned EltIdx
= BitIndex
/ 32;
7967 unsigned LeftoverBitIdx
= BitIndex
% 32;
7970 SDValue Cast
= DAG
.getNode(ISD::BITCAST
, SL
, NewVT
, Vec
);
7971 DCI
.AddToWorklist(Cast
.getNode());
7973 SDValue Elt
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, Cast
,
7974 DAG
.getConstant(EltIdx
, SL
, MVT::i32
));
7975 DCI
.AddToWorklist(Elt
.getNode());
7976 SDValue Srl
= DAG
.getNode(ISD::SRL
, SL
, MVT::i32
, Elt
,
7977 DAG
.getConstant(LeftoverBitIdx
, SL
, MVT::i32
));
7978 DCI
.AddToWorklist(Srl
.getNode());
7980 SDValue Trunc
= DAG
.getNode(ISD::TRUNCATE
, SL
, EltVT
.changeTypeToInteger(), Srl
);
7981 DCI
.AddToWorklist(Trunc
.getNode());
7982 return DAG
.getNode(ISD::BITCAST
, SL
, EltVT
, Trunc
);
7988 static bool convertBuildVectorCastElt(SelectionDAG
&DAG
,
7989 SDValue
&Lo
, SDValue
&Hi
) {
7990 if (Hi
.getOpcode() == ISD::BITCAST
&&
7991 Hi
.getOperand(0).getValueType() == MVT::f16
&&
7992 (isa
<ConstantSDNode
>(Lo
) || Lo
.isUndef())) {
7993 Lo
= DAG
.getNode(ISD::BITCAST
, SDLoc(Lo
), MVT::f16
, Lo
);
7994 Hi
= Hi
.getOperand(0);
8001 SDValue
SITargetLowering::performBuildVectorCombine(
8002 SDNode
*N
, DAGCombinerInfo
&DCI
) const {
8005 if (!isTypeLegal(MVT::v2i16
))
8007 SelectionDAG
&DAG
= DCI
.DAG
;
8008 EVT VT
= N
->getValueType(0);
8010 if (VT
== MVT::v2i16
) {
8011 SDValue Lo
= N
->getOperand(0);
8012 SDValue Hi
= N
->getOperand(1);
8014 // v2i16 build_vector (const|undef), (bitcast f16:$x)
8015 // -> bitcast (v2f16 build_vector const|undef, $x
8016 if (convertBuildVectorCastElt(DAG
, Lo
, Hi
)) {
8017 SDValue NewVec
= DAG
.getBuildVector(MVT::v2f16
, SL
, { Lo
, Hi
});
8018 return DAG
.getNode(ISD::BITCAST
, SL
, VT
, NewVec
);
8021 if (convertBuildVectorCastElt(DAG
, Hi
, Lo
)) {
8022 SDValue NewVec
= DAG
.getBuildVector(MVT::v2f16
, SL
, { Hi
, Lo
});
8023 return DAG
.getNode(ISD::BITCAST
, SL
, VT
, NewVec
);
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N0,
                                          const SDNode *N1) const {
  EVT VT = N0->getValueType(0);

  // Only do this if we are not trying to support denormals. v_mad_f32 does not
  // support denormals ever.
  if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
      (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
    return ISD::FMAD;

  const TargetOptions &Options = DAG.getTarget().Options;
  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
       (N0->getFlags().hasAllowContract() &&
        N1->getFlags().hasAllowContract())) &&
      isFMAFasterThanFMulAndFAdd(VT)) {
    return ISD::FMA;
  }

  return 0;
}

static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
                           EVT VT,
                           SDValue N0, SDValue N1, SDValue N2,
                           bool Signed) {
  unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
  SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}
8062 SDValue
SITargetLowering::performAddCombine(SDNode
*N
,
8063 DAGCombinerInfo
&DCI
) const {
8064 SelectionDAG
&DAG
= DCI
.DAG
;
8065 EVT VT
= N
->getValueType(0);
8067 SDValue LHS
= N
->getOperand(0);
8068 SDValue RHS
= N
->getOperand(1);
8070 if ((LHS
.getOpcode() == ISD::MUL
|| RHS
.getOpcode() == ISD::MUL
)
8071 && Subtarget
->hasMad64_32() &&
8072 !VT
.isVector() && VT
.getScalarSizeInBits() > 32 &&
8073 VT
.getScalarSizeInBits() <= 64) {
8074 if (LHS
.getOpcode() != ISD::MUL
)
8075 std::swap(LHS
, RHS
);
8077 SDValue MulLHS
= LHS
.getOperand(0);
8078 SDValue MulRHS
= LHS
.getOperand(1);
8079 SDValue AddRHS
= RHS
;
8081 // TODO: Maybe restrict if SGPR inputs.
8082 if (numBitsUnsigned(MulLHS
, DAG
) <= 32 &&
8083 numBitsUnsigned(MulRHS
, DAG
) <= 32) {
8084 MulLHS
= DAG
.getZExtOrTrunc(MulLHS
, SL
, MVT::i32
);
8085 MulRHS
= DAG
.getZExtOrTrunc(MulRHS
, SL
, MVT::i32
);
8086 AddRHS
= DAG
.getZExtOrTrunc(AddRHS
, SL
, MVT::i64
);
8087 return getMad64_32(DAG
, SL
, VT
, MulLHS
, MulRHS
, AddRHS
, false);
8090 if (numBitsSigned(MulLHS
, DAG
) < 32 && numBitsSigned(MulRHS
, DAG
) < 32) {
8091 MulLHS
= DAG
.getSExtOrTrunc(MulLHS
, SL
, MVT::i32
);
8092 MulRHS
= DAG
.getSExtOrTrunc(MulRHS
, SL
, MVT::i32
);
8093 AddRHS
= DAG
.getSExtOrTrunc(AddRHS
, SL
, MVT::i64
);
8094 return getMad64_32(DAG
, SL
, VT
, MulLHS
, MulRHS
, AddRHS
, true);
8100 if (VT
!= MVT::i32
|| !DCI
.isAfterLegalizeDAG())
8103 // add x, zext (setcc) => addcarry x, 0, setcc
8104 // add x, sext (setcc) => subcarry x, 0, setcc
8105 unsigned Opc
= LHS
.getOpcode();
8106 if (Opc
== ISD::ZERO_EXTEND
|| Opc
== ISD::SIGN_EXTEND
||
8107 Opc
== ISD::ANY_EXTEND
|| Opc
== ISD::ADDCARRY
)
8108 std::swap(RHS
, LHS
);
8110 Opc
= RHS
.getOpcode();
8113 case ISD::ZERO_EXTEND
:
8114 case ISD::SIGN_EXTEND
:
8115 case ISD::ANY_EXTEND
: {
8116 auto Cond
= RHS
.getOperand(0);
8117 if (!isBoolSGPR(Cond
))
8119 SDVTList VTList
= DAG
.getVTList(MVT::i32
, MVT::i1
);
8120 SDValue Args
[] = { LHS
, DAG
.getConstant(0, SL
, MVT::i32
), Cond
};
8121 Opc
= (Opc
== ISD::SIGN_EXTEND
) ? ISD::SUBCARRY
: ISD::ADDCARRY
;
8122 return DAG
.getNode(Opc
, SL
, VTList
, Args
);
8124 case ISD::ADDCARRY
: {
8125 // add x, (addcarry y, 0, cc) => addcarry x, y, cc
8126 auto C
= dyn_cast
<ConstantSDNode
>(RHS
.getOperand(1));
8127 if (!C
|| C
->getZExtValue() != 0) break;
8128 SDValue Args
[] = { LHS
, RHS
.getOperand(0), RHS
.getOperand(2) };
8129 return DAG
.getNode(ISD::ADDCARRY
, SDLoc(N
), RHS
->getVTList(), Args
);
8135 SDValue
SITargetLowering::performSubCombine(SDNode
*N
,
8136 DAGCombinerInfo
&DCI
) const {
8137 SelectionDAG
&DAG
= DCI
.DAG
;
8138 EVT VT
= N
->getValueType(0);
8144 SDValue LHS
= N
->getOperand(0);
8145 SDValue RHS
= N
->getOperand(1);
8147 unsigned Opc
= LHS
.getOpcode();
8148 if (Opc
!= ISD::SUBCARRY
)
8149 std::swap(RHS
, LHS
);
8151 if (LHS
.getOpcode() == ISD::SUBCARRY
) {
8152 // sub (subcarry x, 0, cc), y => subcarry x, y, cc
8153 auto C
= dyn_cast
<ConstantSDNode
>(LHS
.getOperand(1));
8154 if (!C
|| C
->getZExtValue() != 0)
8156 SDValue Args
[] = { LHS
.getOperand(0), RHS
, LHS
.getOperand(2) };
8157 return DAG
.getNode(ISD::SUBCARRY
, SDLoc(N
), LHS
->getVTList(), Args
);
8162 SDValue
SITargetLowering::performAddCarrySubCarryCombine(SDNode
*N
,
8163 DAGCombinerInfo
&DCI
) const {
8165 if (N
->getValueType(0) != MVT::i32
)
8168 auto C
= dyn_cast
<ConstantSDNode
>(N
->getOperand(1));
8169 if (!C
|| C
->getZExtValue() != 0)
8172 SelectionDAG
&DAG
= DCI
.DAG
;
8173 SDValue LHS
= N
->getOperand(0);
8175 // addcarry (add x, y), 0, cc => addcarry x, y, cc
8176 // subcarry (sub x, y), 0, cc => subcarry x, y, cc
8177 unsigned LHSOpc
= LHS
.getOpcode();
8178 unsigned Opc
= N
->getOpcode();
8179 if ((LHSOpc
== ISD::ADD
&& Opc
== ISD::ADDCARRY
) ||
8180 (LHSOpc
== ISD::SUB
&& Opc
== ISD::SUBCARRY
)) {
8181 SDValue Args
[] = { LHS
.getOperand(0), LHS
.getOperand(1), N
->getOperand(2) };
8182 return DAG
.getNode(Opc
, SDLoc(N
), N
->getVTList(), Args
);
SDValue SITargetLowering::performFAddCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // These should really be instruction patterns, but writing patterns with
  // source modifiers is a pain.

  // fadd (fadd (a, a), b) -> mad 2.0, a, b
  if (LHS.getOpcode() == ISD::FADD) {
    SDValue A = LHS.getOperand(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
      }
    }
  }

  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
  if (RHS.getOpcode() == ISD::FADD) {
    SDValue A = RHS.getOperand(0);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
      }
    }
  }

  return SDValue();
}
8229 SDValue
SITargetLowering::performFSubCombine(SDNode
*N
,
8230 DAGCombinerInfo
&DCI
) const {
8231 if (DCI
.getDAGCombineLevel() < AfterLegalizeDAG
)
8234 SelectionDAG
&DAG
= DCI
.DAG
;
8236 EVT VT
= N
->getValueType(0);
8237 assert(!VT
.isVector());
8239 // Try to get the fneg to fold into the source modifier. This undoes generic
8240 // DAG combines and folds them into the mad.
8242 // Only do this if we are not trying to support denormals. v_mad_f32 does
8243 // not support denormals ever.
8244 SDValue LHS
= N
->getOperand(0);
8245 SDValue RHS
= N
->getOperand(1);
8246 if (LHS
.getOpcode() == ISD::FADD
) {
8247 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
8248 SDValue A
= LHS
.getOperand(0);
8249 if (A
== LHS
.getOperand(1)) {
8250 unsigned FusedOp
= getFusedOpcode(DAG
, N
, LHS
.getNode());
8252 const SDValue Two
= DAG
.getConstantFP(2.0, SL
, VT
);
8253 SDValue NegRHS
= DAG
.getNode(ISD::FNEG
, SL
, VT
, RHS
);
8255 return DAG
.getNode(FusedOp
, SL
, VT
, A
, Two
, NegRHS
);
8260 if (RHS
.getOpcode() == ISD::FADD
) {
8261 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
8263 SDValue A
= RHS
.getOperand(0);
8264 if (A
== RHS
.getOperand(1)) {
8265 unsigned FusedOp
= getFusedOpcode(DAG
, N
, RHS
.getNode());
8267 const SDValue NegTwo
= DAG
.getConstantFP(-2.0, SL
, VT
);
8268 return DAG
.getNode(FusedOp
, SL
, VT
, A
, NegTwo
, LHS
);
8276 SDValue
SITargetLowering::performFMACombine(SDNode
*N
,
8277 DAGCombinerInfo
&DCI
) const {
8278 SelectionDAG
&DAG
= DCI
.DAG
;
8279 EVT VT
= N
->getValueType(0);
8282 if (!Subtarget
->hasDLInsts() || VT
!= MVT::f32
)
8285 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
8286 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
8287 SDValue Op1
= N
->getOperand(0);
8288 SDValue Op2
= N
->getOperand(1);
8289 SDValue FMA
= N
->getOperand(2);
8291 if (FMA
.getOpcode() != ISD::FMA
||
8292 Op1
.getOpcode() != ISD::FP_EXTEND
||
8293 Op2
.getOpcode() != ISD::FP_EXTEND
)
8296 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
8297 // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
8298 // is sufficient to allow generaing fdot2.
8299 const TargetOptions
&Options
= DAG
.getTarget().Options
;
8300 if (Options
.AllowFPOpFusion
== FPOpFusion::Fast
|| Options
.UnsafeFPMath
||
8301 (N
->getFlags().hasAllowContract() &&
8302 FMA
->getFlags().hasAllowContract())) {
8303 Op1
= Op1
.getOperand(0);
8304 Op2
= Op2
.getOperand(0);
8305 if (Op1
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
||
8306 Op2
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
)
8309 SDValue Vec1
= Op1
.getOperand(0);
8310 SDValue Idx1
= Op1
.getOperand(1);
8311 SDValue Vec2
= Op2
.getOperand(0);
8313 SDValue FMAOp1
= FMA
.getOperand(0);
8314 SDValue FMAOp2
= FMA
.getOperand(1);
8315 SDValue FMAAcc
= FMA
.getOperand(2);
8317 if (FMAOp1
.getOpcode() != ISD::FP_EXTEND
||
8318 FMAOp2
.getOpcode() != ISD::FP_EXTEND
)
8321 FMAOp1
= FMAOp1
.getOperand(0);
8322 FMAOp2
= FMAOp2
.getOperand(0);
8323 if (FMAOp1
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
||
8324 FMAOp2
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
)
8327 SDValue Vec3
= FMAOp1
.getOperand(0);
8328 SDValue Vec4
= FMAOp2
.getOperand(0);
8329 SDValue Idx2
= FMAOp1
.getOperand(1);
8331 if (Idx1
!= Op2
.getOperand(1) || Idx2
!= FMAOp2
.getOperand(1) ||
8332 // Idx1 and Idx2 cannot be the same.
8336 if (Vec1
== Vec2
|| Vec3
== Vec4
)
8339 if (Vec1
.getValueType() != MVT::v2f16
|| Vec2
.getValueType() != MVT::v2f16
)
8342 if ((Vec1
== Vec3
&& Vec2
== Vec4
) ||
8343 (Vec1
== Vec4
&& Vec2
== Vec3
)) {
8344 return DAG
.getNode(AMDGPUISD::FDOT2
, SL
, MVT::f32
, Vec1
, Vec2
, FMAAcc
,
8345 DAG
.getTargetConstant(0, SL
, MVT::i1
));
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = LHS.getValueType();
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();

  auto CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (!CRHS) {
    CRHS = dyn_cast<ConstantSDNode>(LHS);
    if (CRHS) {
      std::swap(LHS, RHS);
      CC = getSetCCSwappedOperands(CC);
    }
  }

  if (CRHS) {
    if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
        isBoolSGPR(LHS.getOperand(0))) {
      // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
      // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
      // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
      // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
      if ((CRHS->isAllOnesValue() &&
           (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
          (CRHS->isNullValue() &&
           (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                           DAG.getConstant(-1, SL, MVT::i1));
      if ((CRHS->isAllOnesValue() &&
           (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
          (CRHS->isNullValue() &&
           (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
        return LHS.getOperand(0);
    }

    uint64_t CRHSVal = CRHS->getZExtValue();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        LHS.getOpcode() == ISD::SELECT &&
        isa<ConstantSDNode>(LHS.getOperand(1)) &&
        isa<ConstantSDNode>(LHS.getOperand(2)) &&
        LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
        isBoolSGPR(LHS.getOperand(0))) {
      // setcc (select cc, CT, CF), CF, eq => xor cc, -1
      // setcc (select cc, CT, CF), CF, ne => cc
      // setcc (select cc, CT, CF), CT, ne => xor cc, -1
      // setcc (select cc, CT, CF), CT, eq => cc
      uint64_t CT = LHS.getConstantOperandVal(1);
      uint64_t CF = LHS.getConstantOperandVal(2);

      if ((CF == CRHSVal && CC == ISD::SETEQ) ||
          (CT == CRHSVal && CC == ISD::SETNE))
        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                           DAG.getConstant(-1, SL, MVT::i1));
      if ((CF == CRHSVal && CC == ISD::SETNE) ||
          (CT == CRHSVal && CC == ISD::SETEQ))
        return LHS.getOperand(0);
    }
  }

  if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
                                           VT != MVT::f16))
    return SDValue();

  // Match isinf/isfinite pattern
  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
  // (fcmp one (fabs x), inf) -> (fp_class x,
  // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
  if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
      LHS.getOpcode() == ISD::FABS) {
    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    if (!CRHS)
      return SDValue();

    const APFloat &APF = CRHS->getValueAPF();
    if (APF.isInfinity() && !APF.isNegative()) {
      const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
                                 SIInstrFlags::N_INFINITY;
      const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
                                    SIInstrFlags::P_ZERO |
                                    SIInstrFlags::N_NORMAL |
                                    SIInstrFlags::P_NORMAL |
                                    SIInstrFlags::N_SUBNORMAL |
                                    SIInstrFlags::P_SUBNORMAL;
      unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
                         DAG.getConstant(Mask, SL, MVT::i32));
    }
  }

  return SDValue();
}

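// CVT_F32_UBYTEn converts byte n of a 32-bit source to float. The combine
// below folds a constant shift of the source into the byte index, e.g.
// (cvt_f32_ubyte0 (srl x, 8)) selects byte 1 of x and becomes
// (cvt_f32_ubyte1 x); otherwise it narrows the demanded bits of the source to
// the single byte that is actually read.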
SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

  SDValue Src = N->getOperand(0);
  SDValue Srl = N->getOperand(0);
  if (Srl.getOpcode() == ISD::ZERO_EXTEND)
    Srl = Srl.getOperand(0);

  // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
  if (Srl.getOpcode() == ISD::SRL) {
    // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x

    if (const ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
                               EVT(MVT::i32));

      unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
      if (SrcOffset < 32 && SrcOffset % 8 == 0) {
        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
                           MVT::f32, Srl);
      }
    }
  }

  APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);

  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
      TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
    DCI.CommitTargetLoweringOpt(TLO);
  }

  return SDValue();
}

SDValue SITargetLowering::performClampCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
  if (!CSrc)
    return SDValue();

  const APFloat &F = CSrc->getValueAPF();
  APFloat Zero = APFloat::getZero(F.getSemantics());
  APFloat::cmpResult Cmp0 = F.compare(Zero);
  if (Cmp0 == APFloat::cmpLessThan ||
      (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
  }

  APFloat One(F.getSemantics(), "1.0");
  APFloat::cmpResult Cmp1 = F.compare(One);
  if (Cmp1 == APFloat::cmpGreaterThan)
    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));

  return SDValue(CSrc, 0);
}

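// Top-level DAG combine hook: dispatch on the node opcode to the target
// combines above and fall back to the generic AMDGPU combines for anything
// not handled here.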
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  default:
    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  case ISD::ADD:
    return performAddCombine(N, DCI);
  case ISD::SUB:
    return performSubCombine(N, DCI);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:
    return performAddCarrySubCarryCombine(N, DCI);
  case ISD::FADD:
    return performFAddCombine(N, DCI);
  case ISD::FSUB:
    return performFSubCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
        getTargetMachine().getOptLevel() > CodeGenOpt::None)
      return performMinMaxCombine(N, DCI);
    break;
  }
  case ISD::FMA:
    return performFMACombine(N, DCI);
  case ISD::LOAD: {
    if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
      return Widended;
    LLVM_FALLTHROUGH;
  }
  case ISD::STORE:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
  case ISD::ATOMIC_CMP_SWAP:
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case AMDGPUISD::ATOMIC_INC:
  case AMDGPUISD::ATOMIC_DEC:
  case AMDGPUISD::ATOMIC_LOAD_FADD:
  case AMDGPUISD::ATOMIC_LOAD_FMIN:
  case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
    if (DCI.isBeforeLegalize())
      break;
    return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case ISD::XOR:
    return performXorCombine(N, DCI);
  case ISD::ZERO_EXTEND:
    return performZeroExtendCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  case AMDGPUISD::RCP:
    return performRcpCombine(N, DCI);
  case AMDGPUISD::FRACT:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::LDEXP: {
    SDValue Src = N->getOperand(0);
    if (Src.isUndef())
      return Src;
    break;
  }
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performUCharToFloatCombine(N, DCI);
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return performCvtF32UByteNCombine(N, DCI);
  case AMDGPUISD::FMED3:
    return performFMed3Combine(N, DCI);
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
    return performCvtPkRTZCombine(N, DCI);
  case AMDGPUISD::CLAMP:
    return performClampCombine(N, DCI);
  case ISD::SCALAR_TO_VECTOR: {
    SelectionDAG &DAG = DCI.DAG;
    EVT VT = N->getValueType(0);

    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    if (VT == MVT::v2i16 || VT == MVT::v2f16) {
      SDLoc SL(N);
      SDValue Src = N->getOperand(0);
      EVT EltVT = Src.getValueType();
      if (EltVT == MVT::f16)
        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);

      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
      return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
    }

    break;
  }
  case ISD::EXTRACT_VECTOR_ELT:
    return performExtractVectorEltCombine(N, DCI);
  case ISD::BUILD_VECTOR:
    return performBuildVectorCombine(N, DCI);
  }
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

/// Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  }
}

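// A MIMG instruction's dmask operand selects which of the x/y/z/w components
// are written to the destination VGPRs. As an illustrative example, if an
// image load was selected with dmask = 0xf but only sub0 and sub2 of its
// result are ever extracted, the routine below shrinks the dmask to 0x5 and
// the result type from a 4-element to a 2-element vector.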
/// Adjust the writemask of MIMG instructions
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                          SelectionDAG &DAG) const {
  unsigned Opcode = Node->getMachineOpcode();

  // Subtract 1 because the vdata output is not a MachineSDNode operand.
  int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
    return Node; // not implemented for D16

  SDNode *Users[4] = { nullptr };
  unsigned Lane = 0;
  unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;
  bool HasChain = Node->getNumValues() > 1;

  if (OldDmask == 0) {
    // These are folded out, but on the chance it happens don't assert.
    return Node;
  }

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Don't look at users of the chain.
    if (I.getUse().getResNo() != 0)
      continue;

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return Node;

    // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Set which texture component corresponds to the lane.
    unsigned Comp;
    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
      Comp = countTrailingZeros(Dmask);
      Dmask &= ~(1 << Comp);
    }

    // Abort if we have more than one user per component
    if (Users[Lane])
      return Node;

    Users[Lane] = *I;
    NewDmask |= 1 << Comp;
  }

  // Abort if there's no change
  if (NewDmask == OldDmask)
    return Node;

  unsigned BitsSet = countPopulation(NewDmask);

  int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
  assert(NewOpcode != -1 &&
         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
         "failed to find equivalent MIMG op");

  // Adjust the writemask in the node
  SmallVector<SDValue, 12> Ops;
  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());

  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();

  MVT ResultVT = BitsSet == 1 ?
    SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
  SDVTList NewVTList = HasChain ?
    DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);

  MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
                                              NewVTList, Ops);

  if (HasChain) {
    // Update chain.
    DAG.setNodeMemRefs(NewNode, Node->memoperands());
    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
  }

  if (BitsSet == 1) {
    assert(Node->hasNUsesOfValue(1, 0));
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
                                      SDLoc(Node), Users[Lane]->getValueType(0),
                                      SDValue(NewNode, 0));
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return nullptr;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
    SDNode *User = Users[i];
    if (!User)
      continue;

    SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
    DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    }
  }

  DAG.RemoveDeadNode(Node);
  return nullptr;
}

static bool isFrameIndexOp(SDValue Op) {
  if (Op.getOpcode() == ISD::AssertZext)
    Op = Op.getOperand(0);

  return isa<FrameIndexSDNode>(Op);
}

/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                        SelectionDAG &DAG) const {
  if (Node->getOpcode() == ISD::CopyToReg) {
    RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
    SDValue SrcVal = Node->getOperand(2);

    // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    // to try understanding copies to physical registers.
    if (SrcVal.getValueType() == MVT::i1 &&
        TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
      SDLoc SL(Node);
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      SDValue VReg = DAG.getRegister(
        MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);

      SDNode *Glued = Node->getGluedNode();
      SDValue ToVReg
        = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
                           SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
      SDValue ToResultReg
        = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
                           VReg, ToVReg.getValue(1));
      DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
      DAG.RemoveDeadNode(Node);
      return ToResultReg.getNode();
    }
  }

  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                             Node->getOperand(i).getValueType(),
                                             Node->getOperand(i)), 0));
  }

  return DAG.UpdateNodeOperands(Node, Ops);
}

/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  unsigned Opcode = Node->getMachineOpcode();

  if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
      !TII->isGather4(Opcode)) {
    return adjustWritemask(Node, DAG);
  }

  if (Opcode == AMDGPU::INSERT_SUBREG ||
      Opcode == AMDGPU::REG_SEQUENCE) {
    legalizeTargetIndependentNode(Node, DAG);
    return Node;
  }

  switch (Opcode) {
  case AMDGPU::V_DIV_SCALE_F32:
  case AMDGPU::V_DIV_SCALE_F64: {
    // Satisfy the operand register constraint when one of the inputs is
    // undefined. Ordinarily each undef value will have its own implicit_def of
    // a vreg, so force these to use a single register.
    SDValue Src0 = Node->getOperand(0);
    SDValue Src1 = Node->getOperand(1);
    SDValue Src2 = Node->getOperand(2);

    if ((Src0.isMachineOpcode() &&
         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
        (Src0 == Src1 || Src0 == Src2))
      break;

    MVT VT = Src0.getValueType().getSimpleVT();
    const TargetRegisterClass *RC = getRegClassFor(VT);

    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);

    SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
                                      UndefReg, Src0, SDValue());

    // src0 must be the same register as src1 or src2, even if the value is
    // undefined, so make sure we don't violate this constraint.
    if (Src0.isMachineOpcode() &&
        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
      if (Src1.isMachineOpcode() &&
          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src1;
      else if (Src2.isMachineOpcode() &&
               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src2;
      else {
        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
        Src0 = UndefReg;
        Src1 = UndefReg;
      }
    } else
      break;

    SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
    for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
      Ops.push_back(Node->getOperand(I));

    Ops.push_back(ImpDef.getValue(1));
    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
  }
  default:
    break;
  }

  return Node;
}

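// As an illustrative example, a buffer atomic whose returned value is never
// used is rewritten below to its no-return variant by dropping the def
// operand.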
/// Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  if (TII->isVOP3(MI.getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);
    return;
  }

  // Replace unused atomics with the no return version.
  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
  if (NoRetAtomicOp != -1) {
    if (!Node->hasAnyUseOfValue(0)) {
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);
      return;
    }

    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
    // instruction, because the return type of these instructions is a vec2 of
    // the memory type, so it can be tied to the input operand.
    // This means these instructions always have a use, so we need to add a
    // special case to check if the atomic has only one extract_subreg use,
    // which itself has no uses.
    if ((Node->hasNUsesOfValue(1, 0) &&
         Node->use_begin()->isMachineOpcode() &&
         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
         !Node->use_begin()->hasAnyUseOfValue(0))) {
      unsigned Def = MI.getOperand(0).getReg();

      // Change this into a noret atomic.
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);

      // If we only remove the def operand from the atomic instruction, the
      // extract_subreg will be left with a use of a vreg without a def.
      // So we need to insert an implicit_def to avoid machine verifier
      // errors.
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(AMDGPU::IMPLICIT_DEF), Def);
    }
    return;
  }
}

static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
                              uint64_t Val) {
  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}

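// The two helpers below assemble 128-bit buffer resource descriptors as
// REG_SEQUENCE nodes. Roughly: wrapAddr64Rsrc places the 64-bit pointer in
// dwords 0-1, clears dword 2, and puts the high half of the default resource
// data format in dword 3; buildRSRC additionally ORs RsrcDword1 into the
// pointer's high word and fills dwords 2-3 from RsrcDword2And3.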
MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  const SDValue Ops0[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
    buildSMovImm32(DAG, DL, 0),
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
  };

  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::v2i32, Ops0), 0);

  // Combine the constants and the pointer.
  const SDValue Ops1[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    Ptr,
    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
    SubRegHi,
    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}

/// Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
                                           SDValue Ptr, uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
  if (RsrcDword1) {
    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
                                    DAG.getConstant(RsrcDword1, DL, MVT::i32)),
                    0);
  }

  SDValue DataLo = buildSMovImm32(DAG, DL,
                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

  const SDValue Ops[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    PtrLo,
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    PtrHi,
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
    DataLo,
    DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
    DataHi,
    DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}

//===----------------------------------------------------------------------===//
// SI Inline Assembly Support
//===----------------------------------------------------------------------===//

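// Register constraints handled here: 's' selects a scalar (SGPR) register
// class and 'v' a vector (VGPR) class, sized by the operand type. A plausible
// IR-level use (illustrative only, not taken from a test):
//   %v = call i32 asm "v_mov_b32 $0, $1", "=v,s"(i32 %x)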
std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                               StringRef Constraint,
                                               MVT VT) const {
  const TargetRegisterClass *RC = nullptr;
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::SReg_32_XM0RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      case 128:
        RC = &AMDGPU::SReg_128RegClass;
        break;
      case 256:
        RC = &AMDGPU::SReg_256RegClass;
        break;
      case 512:
        RC = &AMDGPU::SReg_512RegClass;
        break;
      }
      break;
    case 'v':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::VGPR_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::VReg_64RegClass;
        break;
      case 96:
        RC = &AMDGPU::VReg_96RegClass;
        break;
      case 128:
        RC = &AMDGPU::VReg_128RegClass;
        break;
      case 256:
        RC = &AMDGPU::VReg_256RegClass;
        break;
      case 512:
        RC = &AMDGPU::VReg_512RegClass;
        break;
      }
      break;
    }
    // We actually support i128, i16 and f16 as inline parameters
    // even if they are not reported as legal
    if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
               VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
      return std::make_pair(0U, RC);
  }

  if (Constraint.size() > 1) {
    if (Constraint[1] == 'v') {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (Constraint[1] == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
      if (!Failed && Idx < RC->getNumRegs())
        return std::make_pair(RC->getRegister(Idx), RC);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 's':
    case 'v':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
  }

  // We have to assume the SP is needed in case there are calls in the function
  // during lowering. Calls are only detected after the function is
  // lowered. We're about to reserve registers, so don't bother using it if we
  // aren't really going to use it.
  bool NeedSP = !Info->isEntryFunction() ||
    MFI.hasVarSizedObjects() ||
    MFI.hasCalls();

  if (NeedSP) {
    unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
    Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);

    assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
    assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                               Info->getStackPtrOffsetReg()));
    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
  }

  MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
  MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
  MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
                     Info->getScratchWaveOffsetReg());

  Info->limitOccupancy(MF);

  TargetLoweringBase::finalizeLowering(MF);
}

void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
                                                     KnownBits &Known,
                                                     const APInt &DemandedElts,
                                                     const SelectionDAG &DAG,
                                                     unsigned Depth) const {
  TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
                                                DAG, Depth);

  if (getSubtarget()->enableHugePrivateBuffer())
    return;

  // Technically it may be possible to have a dispatch with a single workitem
  // that uses the full private memory size, but that's not really useful. We
  // can't use vaddr in MUBUF instructions if we don't know the address
  // calculation won't overflow, so assume the sign bit is never set.
  Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
}

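// Roughly, a node is treated as a source of divergence below if it copies
// from a VGPR or from a divergent virtual register, if it is a load that may
// touch flat or private memory, or if it is an intrinsic (or one of the
// AMDGPUISD interpolation nodes) that is reported as divergent.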
bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
                                                  FunctionLoweringInfo *FLI,
                                                  LegacyDivergenceAnalysis *KDA) const {
  switch (N->getOpcode()) {
  case ISD::Register:
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = nullptr;
    if (N->getOpcode() == ISD::Register) {
      R = dyn_cast<RegisterSDNode>(N);
    } else {
      R = dyn_cast<RegisterSDNode>(N->getOperand(1));
    }
    if (R) {
      const MachineFunction *MF = FLI->MF;
      const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
      const MachineRegisterInfo &MRI = MF->getRegInfo();
      const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
      unsigned Reg = R->getReg();
      if (TRI.isPhysicalRegister(Reg))
        return TRI.isVGPR(MRI, Reg);

      if (MRI.isLiveIn(Reg)) {
        // workitem.id.x workitem.id.y workitem.id.z
        // Any VGPR formal argument is also considered divergent
        if (TRI.isVGPR(MRI, Reg))
          return true;
        // Formal arguments of non-entry functions
        // are conservatively considered divergent
        else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
          return true;
      }
      return !KDA || KDA->isDivergent(FLI->getValueFromVirtualReg(Reg));
    }
    break;
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  }
  case ISD::CALLSEQ_END:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(
        cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
  case ISD::INTRINSIC_W_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(
        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
  // In some cases intrinsics that are a source of divergence have been
  // lowered to AMDGPUISD so we also need to check those too.
  case AMDGPUISD::INTERP_MOV:
  case AMDGPUISD::INTERP_P1:
  case AMDGPUISD::INTERP_P2:
    return true;
  }
  return false;
}

bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
  case MVT::f32:
    return Subtarget->hasFP32Denormals();
  case MVT::f64:
    return Subtarget->hasFP64Denormals();
  case MVT::f16:
    return Subtarget->hasFP16Denormals();