//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//
#if defined(_MSC_VER) || defined(__MINGW32__)
#define _USE_MATH_DEFINES
#endif
#include "SIISelLowering.h"

#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> DisableLoopAlignment(
  "amdgpu-disable-loop-alignment",
  cl::desc("Do not align and prefetch loops"),
  cl::init(false));
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}
SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI),
      Subtarget(&STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
  addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);

  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
  addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  if (Subtarget->has16BitInsts()) {
    addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);

    // Unless there are also VOP3P operations, not every operation on these
    // vector types is really legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
  }

  if (Subtarget->hasMAIInsts()) {
    addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
    addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());
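
  // Informal note: the register-class choices above follow the usual SI split,
  // where values that can stay uniform get scalar (SReg/SGPR) classes and
  // floating-point or wide data defaults to vector (VReg/VGPR) classes; later
  // passes are expected to move divergent values into VGPRs as needed.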

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::LOAD, MVT::v32i32, Custom);

  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v3i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setOperationAction(ISD::STORE, MVT::v5i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::v32i32, Custom);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);

  setOperationAction(ISD::UADDO, MVT::i32, Legal);
  setOperationAction(ISD::USUBO, MVT::i32, Legal);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);

  setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
                  MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
                  MVT::v32i32, MVT::v32f32 }) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::SCALAR_TO_VECTOR:
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
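
  // Informal note on the loop above: for these wide vector types every generic
  // DAG opcode is forced to Expand by default; the opcodes listed before the
  // first break are deliberately left alone (they keep the actions configured
  // elsewhere), and only CONCAT_VECTORS gets a Custom lowering here.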

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }
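
  // Informal note: Promote plus AddPromotedToType(..., MVT::v4i32) makes the
  // legalizer bitcast v2i64/v2f64 operands to v4i32, perform the operation
  // there, and bitcast the result back, so no real 64-bit vector support is
  // required for these ops.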

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  // Deal with vec3 vector operations when widened to vec4.
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);

  // Deal with vec5 vector operations when widened to vec8.
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

  if (Subtarget->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // This is s_memtime on SI and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  setOperationAction(ISD::TRAP, MVT::Other, Custom);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::FLOG, MVT::f16, Custom);
    setOperationAction(ISD::FEXP, MVT::f16, Custom);
    setOperationAction(ISD::FLOG10, MVT::f16, Custom);
  }

  // v_mad_f32 does not support denormals according to some sources.
  if (!Subtarget->hasFP32Denormals())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // for now.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);

  setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);

  // These are really only legal for ieee_mode functions. We should be avoiding
  // them for functions that don't have ieee_mode enabled, so just say they are
  // legal.
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);

  if (Subtarget->haveRoundOpsF64()) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  } else {
    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
    setOperationAction(ISD::FRINT, MVT::f64, Custom);
    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::Constant, MVT::i16, Legal);

    setOperationAction(ISD::SMIN, MVT::i16, Legal);
    setOperationAction(ISD::SMAX, MVT::i16, Legal);

    setOperationAction(ISD::UMIN, MVT::i16, Legal);
    setOperationAction(ISD::UMAX, MVT::i16, Legal);

    setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction(ISD::ROTR, MVT::i16, Promote);
    setOperationAction(ISD::ROTL, MVT::i16, Promote);

    setOperationAction(ISD::SDIV, MVT::i16, Promote);
    setOperationAction(ISD::UDIV, MVT::i16, Promote);
    setOperationAction(ISD::SREM, MVT::i16, Promote);
    setOperationAction(ISD::UREM, MVT::i16, Promote);

    setOperationAction(ISD::BSWAP, MVT::i16, Promote);
    setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);

    setOperationAction(ISD::CTTZ, MVT::i16, Promote);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTPOP, MVT::i16, Promote);

    setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);

    setOperationAction(ISD::BR_CC, MVT::i16, Expand);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FCOS, MVT::f16, Promote);
    setOperationAction(ISD::FSIN, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::FROUND, MVT::f16, Custom);

    // F16 - VOP2 Actions.
    setOperationAction(ISD::BR_CC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);

    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);

    for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
          break;
        case ISD::CONCAT_VECTORS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }

    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, MVT::v2i16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);

    setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
    setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);

    if (!Subtarget->hasVOP3PInsts()) {
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
    }

    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns)
    setOperationAction(ISD::FABS, MVT::v2f16, Legal);

    setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
    setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
  }

  if (Subtarget->hasVOP3PInsts()) {
    setOperationAction(ISD::ADD, MVT::v2i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i16, Legal);
    setOperationAction(ISD::MUL, MVT::v2i16, Legal);
    setOperationAction(ISD::SHL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRA, MVT::v2i16, Legal);
    setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v2i16, Legal);

    setOperationAction(ISD::FADD, MVT::v2f16, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
    setOperationAction(ISD::FMA, MVT::v2f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);

    setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);

    setOperationAction(ISD::SHL, MVT::v4i16, Custom);
    setOperationAction(ISD::SRA, MVT::v4i16, Custom);
    setOperationAction(ISD::SRL, MVT::v4i16, Custom);
    setOperationAction(ISD::ADD, MVT::v4i16, Custom);
    setOperationAction(ISD::SUB, MVT::v4i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i16, Custom);

    setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
    setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::UMAX, MVT::v4i16, Custom);

    setOperationAction(ISD::FADD, MVT::v4f16, Custom);
    setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
    setOperationAction(ISD::FMA, MVT::v4f16, Custom);

    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);

    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
  }

  setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
  setOperationAction(ISD::FABS, MVT::v4f16, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
  } else {
    // Legalization hack.
    setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v2f16, Custom);

    setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
    setOperationAction(ISD::FABS, MVT::v2f16, Custom);
  }

  for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
  }

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);

  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ADDCARRY);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::SUBCARRY);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::FMINNUM_IEEE);
  setTargetDAGCombine(ISD::FMAXNUM_IEEE);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::SMIN);
  setTargetDAGCombine(ISD::SMAX);
  setTargetDAGCombine(ISD::UMIN);
  setTargetDAGCombine(ISD::UMAX);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::FCANONICALIZE);
  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
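
  // Informal note: each opcode passed to setTargetDAGCombine above is routed
  // by the DAG combiner into this target's PerformDAGCombine hook, which is
  // where the SI-specific folds for these nodes are implemented.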

  setSchedulingPreference(Sched::RegPressure);
}

const GCNSubtarget *SITargetLowering::getSubtarget() const {
  return Subtarget;
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

// v_mad_mix* support a conversion from f16 to f32.
//
// The one case we do not currently handle is when FP32 denormals are enabled,
// so this fold is only OK to use when they are disabled.
bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
         DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
         SrcVT.getScalarType() == MVT::f16;
}

bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32)
      return ScalarVT.getSimpleVT();

    if (Size > 32)
      return MVT::i32;

    if (Size == 16 && Subtarget->has16BitInsts())
      return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
  } else if (VT.getSizeInBits() > 32)
    return MVT::i32;

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    if (Size == 32)
      return NumElts;

    if (Size > 32)
      return NumElts * ((Size + 31) / 32);

    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;
  } else if (VT.getSizeInBits() > 32)
    return (VT.getSizeInBits() + 31) / 32;

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
  LLVMContext &Context, CallingConv::ID CC,
  EVT VT, EVT &IntermediateVT,
  unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size > 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
    }

    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // incorrect.
    if (Size == 16 && Subtarget->has16BitInsts()) {
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
    Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
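
// Worked example (informal): with 16-bit instructions available, a v5f16
// argument becomes ceil(5 / 2) = 3 intermediates of type v2f16, i.e. three
// 32-bit registers, matching the (NumElts + 1) / 2 computation above.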

static MVT memVTFromAggregate(Type *Ty) {
  // Only limited forms of aggregate type currently expected.
  assert(Ty->isStructTy() && "Expected struct type");

  Type *ElementType = nullptr;
  unsigned NumElts;
  if (Ty->getContainedType(0)->isVectorTy()) {
    VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
    ElementType = VecComponent->getElementType();
    NumElts = VecComponent->getNumElements();
  } else {
    ElementType = Ty->getContainedType(0);
    NumElts = 1;
  }

  assert((Ty->getContainedType(1) &&
          Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");

  // Calculate the size of the memVT type from the aggregate
  unsigned Pow2Elts = 0;
  unsigned ElementSize;
  switch (ElementType->getTypeID()) {
  default:
    llvm_unreachable("Unknown type!");
  case Type::IntegerTyID:
    ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
    break;
  case Type::HalfTyID:
    ElementSize = 16;
    break;
  case Type::FloatTyID:
    ElementSize = 32;
    break;
  }
  unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
  Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);

  return MVT::getVectorVT(MVT::getVT(ElementType, false),
                          Pow2Elts);
}
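
// Worked example (informal): for a { <4 x float>, i32 } return type, NumElts
// is 4 and ElementSize is 32, so AdditionalElts is 1 and
// Pow2Elts = 1 << Log2_32_Ceil(5) = 8, giving a memVT of v8f32.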

bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {
  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
          AMDGPU::lookupRsrcIntrinsic(IntrID)) {
    AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
                                                  (Intrinsic::ID)IntrID);
    if (Attr.hasFnAttribute(Attribute::ReadNone))
      return false;

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (RsrcIntr->IsImage) {
      Info.ptrVal = MFI->getImagePSV(
        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
        CI.getArgOperand(RsrcIntr->RsrcArg));
    } else {
      Info.ptrVal = MFI->getBufferPSV(
        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
        CI.getArgOperand(RsrcIntr->RsrcArg));
    }

    Info.flags = MachineMemOperand::MODereferenceable;
    if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getType(), true);
      if (Info.memVT == MVT::Other) {
        // Some intrinsics return an aggregate type - special case to work out
        // the correct memVT.
        Info.memVT = memVTFromAggregate(CI.getType());
      }
      Info.flags |= MachineMemOperand::MOLoad;
    } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
      Info.opc = ISD::INTRINSIC_VOID;
      Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
      Info.flags |= MachineMemOperand::MOStore;
    } else {
      // Atomic-like intrinsic: conservatively model it as both a load and a
      // store.
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getType());
      Info.flags = MachineMemOperand::MOLoad |
                   MachineMemOperand::MOStore |
                   MachineMemOperand::MODereferenceable;

      // XXX - Should this be volatile without known ordering?
      Info.flags |= MachineMemOperand::MOVolatile;
    }
    return true;
  }

  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_buffer_atomic_fadd: {
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
    Info.ptrVal = MFI->getBufferPSV(
      *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
      CI.getArgOperand(1));
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
    if (!Vol || !Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_global_atomic_fadd: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
                            ->getPointerElementType());
    Info.ptrVal = CI.getOperand(0);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    return true;
  }
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.opc = ISD::INTRINSIC_VOID;

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Info.ptrVal =
        MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());

    // This is an abstract access, but we need to specify a type and size.
    Info.memVT = MVT::i32;
    Info.align = Align(4);

    Info.flags = MachineMemOperand::MOStore;
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
      Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  default:
    return false;
  }
}

bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                            SmallVectorImpl<Value *> &Ops,
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    Value *Ptr = II->getArgOperand(0);
    AccessTy = II->getType();
    Ops.push_back(Ptr);
    return true;
  }
  default:
    return false;
  }
}

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
  if (!Subtarget->hasFlatInstOffsets()) {
    // Flat instructions do not have offsets, and only have the register
    // address.
    return AM.BaseOffs == 0 && AM.Scale == 0;
  }

  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
  // the sign bit is ignored and is treated as a 12-bit unsigned offset.
  //
  // GFX10 shrank the signed offset to 12 bits. When using regular flat
  // instructions, the sign bit is also ignored and is treated as an 11-bit
  // unsigned offset.

  if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
    return isUInt<11>(AM.BaseOffs) && AM.Scale == 0;

  return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}

bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
  if (Subtarget->hasFlatGlobalInsts())
    return isInt<13>(AM.BaseOffs) && AM.Scale == 0;

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
    // Assume we will use FLAT for all global memory accesses.
    // FIXME: This assumption is currently wrong. On VI we still use
    // MUBUF instructions for the r + i addressing mode. As currently
    // implemented, the MUBUF instructions only work on buffer < 4GB.
    // It may be possible to support > 4GB buffers with MUBUF instructions,
    // by setting the stride value in the resource descriptor which would
    // increase the size limit to (stride * 4GB). However, this is risky,
    // because it has never been validated.
    return isLegalFlatAddressingMode(AM);
  }

  return isLegalMUBUFAddressingMode(AM);
}

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
  // additionally can do r + r + i with addr64. 32-bit has more addressing
  // mode options. Depending on the resource constant, it can also do
  // (i64 r0) + (i32 r1) * (i14 i).
  //
  // Private arrays end up using a scratch buffer most of the time, so also
  // assume those use MUBUF instructions. Scratch loads / stores are currently
  // implemented as mubuf instructions with offen bit set, so slightly
  // different than the normal addr64.
  if (!isUInt<12>(AM.BaseOffs))
    return false;

  // FIXME: Since we can split immediate into soffset and immediate offset,
  // would it make sense to allow any immediate?

  switch (AM.Scale) {
  case 0: // r + i or just i, depending on HasBaseReg.
    return true;
  case 1:
    return true; // We have r + r or r + i.
  case 2:
    if (AM.HasBaseReg) {
      // Reject 2 * r + r.
      return false;
    }

    // Allow 2 * r as r + r
    // Or 2 * r + i is allowed as r + r + i.
    return true;
  default: // Don't allow n * r
    return false;
  }
}

bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS, Instruction *I) const {
  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return isLegalGlobalAddressingMode(AM);

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::BUFFER_FAT_POINTER) {
    // If the offset isn't a multiple of 4, it probably isn't going to be
    // correctly aligned.
    // FIXME: Can we get the real alignment here?
    if (AM.BaseOffs % 4 != 0)
      return isLegalMUBUFAddressingMode(AM);

    // There are no SMRD extloads, so if we have to do a small type access we
    // will use a MUBUF load.
    // FIXME?: We also need to do this if unaligned, but we don't know the
    // alignment here.
    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
      return isLegalGlobalAddressingMode(AM);

    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
      if (!isUInt<8>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
      // in 8-bits, it can use a smaller encoding.
      if (!isUInt<32>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
      if (!isUInt<20>(AM.BaseOffs))
        return false;
    } else
      llvm_unreachable("unhandled generation");

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;

  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    return isLegalMUBUFAddressingMode(AM);
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
             AS == AMDGPUAS::REGION_ADDRESS) {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
    // offset.
    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
    // an 8-bit dword offset but we don't know the alignment here.
    if (!isUInt<16>(AM.BaseOffs))
      return false;

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
             AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
    // For an unknown address space, this usually means that this is for some
    // reason being used for pure arithmetic, and not based on some addressing
    // computation. We don't have instructions that compute pointers with any
    // addressing modes, so treat them as having no offset like flat
    // instructions.
    return isLegalFlatAddressingMode(AM);
  }

  llvm_unreachable("unhandled address space");
}

bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
                                        const SelectionDAG &DAG) const {
  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
    return (MemVT.getSizeInBits() <= 4 * 32);
  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
    return (MemVT.getSizeInBits() <= MaxPrivateBits);
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    return (MemVT.getSizeInBits() <= 2 * 32);
  }
  return true;
}
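
// Informal note: the limits above roughly track the widest single memory
// instructions, up to 128 bits (dwordx4) for global/flat accesses, the
// subtarget's maximum private element size for scratch, and 64 bits for
// LDS/GDS.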

bool SITargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
    bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
  // which isn't a simple VT.
  // Until MVT is extended to handle this, simply check for the size and
  // rely on the condition below: allow accesses if the size is a multiple of 4.
  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
                           VT.getStoreSize() > 16)) {
    return false;
  }

  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
    // with adjacent offsets.
    bool AlignedBy4 = (Align % 4 == 0);
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4;
  }

  // FIXME: We have to be conservative here and assume that flat operations
  // will access scratch. If we had access to the IR function, then we
  // could determine if any private memory was used in the function.
  if (!Subtarget->hasUnalignedScratchAccess() &&
      (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
       AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
    bool AlignedBy4 = Align >= 4;
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4;
  }

  if (Subtarget->hasUnalignedBufferAccess()) {
    // If we have a uniform constant load, it still requires using a slow
    // buffer instruction if unaligned.
    if (IsFast) {
      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
                 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
        (Align % 4 == 0) : true;
    }

    return true;
  }

  // Smaller than dword value must be aligned.
  if (VT.bitsLT(MVT::i32))
    return false;

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.
  if (IsFast)
    *IsFast = true;

  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}

EVT SITargetLowering::getOptimalMemOpType(
    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
    bool ZeroMemset, bool MemcpyStrSrc,
    const AttributeList &FuncAttributes) const {
  // FIXME: Should account for address space here.

  // The default fallback uses the private pointer size as a guess for a type to
  // use. Make sure we switch these to 64-bit accesses.

  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
    return MVT::v4i32;

  if (Size >= 8 && DstAlign >= 4)
    return MVT::v2i32;

  // Use the default.
  return MVT::Other;
}
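
// Informal note: this hook steers memcpy/memset expansion; with a sufficiently
// aligned destination it requests wider chunks (v4i32 for sizes of at least 16
// bytes, v2i32 for at least 8 bytes) instead of the default pointer-sized
// type.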

static bool isFlatGlobalAddrSpace(unsigned AS) {
  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
         AS == AMDGPUAS::FLAT_ADDRESS ||
         AS == AMDGPUAS::CONSTANT_ADDRESS ||
         AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
}

bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
}

bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);
  const Value *Ptr = MemNode->getMemOperand()->getValue();
  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.noclobber");
}

bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
  // Flat -> private/local is a simple truncate.
  // Flat -> global is no-op
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
    return true;

  return isNoopAddrSpaceCast(SrcAS, DestAS);
}

bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);

  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
}

TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(MVT VT) const {
  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
    return TypeSplitVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
  // FIXME: Could be smarter if called for vector constants.
  return true;
}

bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
    switch (Op) {
    case ISD::LOAD:
    case ISD::STORE:

    // These operations are done with 32-bit instructions anyway.
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SELECT:
      // TODO: Extensions?
      return true;
    default:
      return false;
    }
  }

  // SimplifySetCC uses this function to determine whether or not it should
  // create setcc with i1 operands. We don't have instructions for i1 setcc.
  if (VT == MVT::i1 && Op == ISD::SETCC)
    return false;

  return TargetLowering::isTypeDesirableForOp(Op, VT);
}

SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
                                                   const SDLoc &SL,
                                                   SDValue Chain,
                                                   uint64_t Offset) const {
  const DataLayout &DL = DAG.getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *InputPtrReg;
  const TargetRegisterClass *RC;

  std::tie(InputPtrReg, RC)
    = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);

  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
    MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

  return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
}

SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
                                            const SDLoc &SL) const {
  uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
                                               FIRST_IMPLICIT);
  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
}

SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Val,
                                         bool Signed,
                                         const ISD::InputArg *Arg) const {
  // First, if it is a widened vector, narrow it.
  if (VT.isVector() &&
      VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
    EVT NarrowedVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
                       VT.getVectorNumElements());
    Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
                      DAG.getConstant(0, SL, MVT::i32));
  }

  // Then convert the vector elements or scalar value.
  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
      VT.bitsLT(MemVT)) {
    unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
    Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
  }

  if (MemVT.isFloatingPoint())
    Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
  else if (Signed)
    Val = DAG.getSExtOrTrunc(Val, SL, VT);
  else
    Val = DAG.getZExtOrTrunc(Val, SL, VT);

  return Val;
}

SDValue SITargetLowering::lowerKernargMemParameter(
  SelectionDAG &DAG, EVT VT, EVT MemVT,
  const SDLoc &SL, SDValue Chain,
  uint64_t Offset, unsigned Align, bool Signed,
  const ISD::InputArg *Arg) const {
  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

  // Try to avoid using an extload by loading earlier than the argument address,
  // and extracting the relevant bits. The load should hopefully be merged with
  // the previous argument.
  if (MemVT.getStoreSize() < 4 && Align < 4) {
    // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
    int64_t AlignDownOffset = alignDown(Offset, 4);
    int64_t OffsetDiff = Offset - AlignDownOffset;

    EVT IntVT = MemVT.changeTypeToInteger();

    // TODO: If we passed in the base kernel offset we could have a better
    // alignment than 4, but we don't really need it.
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
                               MachineMemOperand::MODereferenceable |
                               MachineMemOperand::MOInvariant);

    SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
    SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);

    SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

    return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
  }

  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
                             MachineMemOperand::MODereferenceable |
                             MachineMemOperand::MOInvariant);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
}
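
// Worked example (informal): an i16 kernel argument at byte offset 6 with
// alignment 2 takes the sub-dword path above; the dword at offset 4 is loaded,
// shifted right by (6 - 4) * 8 = 16 bits and truncated to i16, so the access
// stays a naturally aligned 32-bit load.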
1498 SDValue
SITargetLowering::lowerStackParameter(SelectionDAG
&DAG
, CCValAssign
&VA
,
1499 const SDLoc
&SL
, SDValue Chain
,
1500 const ISD::InputArg
&Arg
) const {
1501 MachineFunction
&MF
= DAG
.getMachineFunction();
1502 MachineFrameInfo
&MFI
= MF
.getFrameInfo();
1504 if (Arg
.Flags
.isByVal()) {
1505 unsigned Size
= Arg
.Flags
.getByValSize();
1506 int FrameIdx
= MFI
.CreateFixedObject(Size
, VA
.getLocMemOffset(), false);
1507 return DAG
.getFrameIndex(FrameIdx
, MVT::i32
);
1510 unsigned ArgOffset
= VA
.getLocMemOffset();
1511 unsigned ArgSize
= VA
.getValVT().getStoreSize();
1513 int FI
= MFI
.CreateFixedObject(ArgSize
, ArgOffset
, true);
1515 // Create load nodes to retrieve arguments from the stack.
1516 SDValue FIN
= DAG
.getFrameIndex(FI
, MVT::i32
);
1519 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
1520 ISD::LoadExtType ExtType
= ISD::NON_EXTLOAD
;
1521 MVT MemVT
= VA
.getValVT();
1523 switch (VA
.getLocInfo()) {
1526 case CCValAssign::BCvt
:
1527 MemVT
= VA
.getLocVT();
1529 case CCValAssign::SExt
:
1530 ExtType
= ISD::SEXTLOAD
;
1532 case CCValAssign::ZExt
:
1533 ExtType
= ISD::ZEXTLOAD
;
1535 case CCValAssign::AExt
:
1536 ExtType
= ISD::EXTLOAD
;
1540 ArgValue
= DAG
.getExtLoad(
1541 ExtType
, SL
, VA
.getLocVT(), Chain
, FIN
,
1542 MachinePointerInfo::getFixedStack(DAG
.getMachineFunction(), FI
),
SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
  const SIMachineFunctionInfo &MFI,
  EVT VT,
  AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
  const ArgDescriptor *Reg;
  const TargetRegisterClass *RC;

  std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
}
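
// Split the incoming pixel-shader arguments: PS inputs that are neither used
// nor already allocated are skipped (recorded in \p Skipped), while all other
// arguments are appended to \p Splits for normal calling-convention analysis.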
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
                                   CallingConv::ID CallConv,
                                   ArrayRef<ISD::InputArg> Ins,
                                   BitVector &Skipped,
                                   FunctionType *FType,
                                   SIMachineFunctionInfo *Info) {
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
    const ISD::InputArg *Arg = &Ins[I];

    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
           "vector type argument should have been split");

    // First check if it's a PS input addr.
    if (CallConv == CallingConv::AMDGPU_PS &&
        !Arg->Flags.isInReg() && PSInputNum <= 15) {
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

      // Inconveniently only the first part of the split is marked as isSplit,
      // so skip to the end. We only want to increment PSInputNum once for the
      // entire split argument.
      if (Arg->Flags.isSplit()) {
        while (!Arg->Flags.isSplitEnd()) {
          assert((!Arg->VT.isVector() ||
                  Arg->VT.getScalarSizeInBits() == 16) &&
                 "unexpected vector split in ps argument type");
          if (!SkipArg)
            Splits.push_back(*Arg);
          Arg = &Ins[++I];
        }
      }

      if (SkipArg) {
        // We can safely skip PS inputs.
        Skipped.set(Arg->getOrigArgIndex());
        ++PSInputNum;
        continue;
      }

      Info->markPSInputAllocated(PSInputNum);
      if (Arg->Used)
        Info->markPSInputEnabled(PSInputNum);

      ++PSInputNum;
    }

    Splits.push_back(*Arg);
  }
}
// Allocate special inputs passed in VGPRs.
void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
                                                      MachineFunction &MF,
                                                      const SIRegisterInfo &TRI,
                                                      SIMachineFunctionInfo &Info) const {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (Info.hasWorkItemIDX()) {
    Register Reg = AMDGPU::VGPR0;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
  }

  if (Info.hasWorkItemIDY()) {
    Register Reg = AMDGPU::VGPR1;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
  }

  if (Info.hasWorkItemIDZ()) {
    Register Reg = AMDGPU::VGPR2;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
  }
}
// Try to allocate a VGPR at the end of the argument list, or if no argument
// VGPRs are left, allocate a stack slot.
// If \p Mask is given it indicates bitfield position in the register.
// If \p Arg is given use it with the new \p Mask instead of allocating new.
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
                                         ArgDescriptor Arg = ArgDescriptor()) {
  if (Arg.isSet())
    return ArgDescriptor::createArg(Arg, Mask);

  ArrayRef<MCPhysReg> ArgVGPRs
    = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
  if (RegIdx == ArgVGPRs.size()) {
    // Spill to stack required.
    int64_t Offset = CCInfo.AllocateStack(4, 4);

    return ArgDescriptor::createStack(Offset, Mask);
  }

  unsigned Reg = ArgVGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  MachineFunction &MF = CCInfo.getMachineFunction();
  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
  return ArgDescriptor::createRegister(Reg, Mask);
}
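
// Allocate the next unallocated SGPR from \p RC for an incoming special
// argument, marking it live-in to the function. There is no stack fallback;
// running out of argument SGPRs is a fatal error.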
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
                                             const TargetRegisterClass *RC,
                                             unsigned NumArgRegs) {
  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
  if (RegIdx == ArgSGPRs.size())
    report_fatal_error("ran out of SGPRs for arguments");

  unsigned Reg = ArgSGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  MachineFunction &MF = CCInfo.getMachineFunction();
  MF.addLiveIn(Reg, RC);
  return ArgDescriptor::createRegister(Reg);
}

static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
}

static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
}
void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo,
                                                 MachineFunction &MF,
                                                 const SIRegisterInfo &TRI,
                                                 SIMachineFunctionInfo &Info) const {
  const unsigned Mask = 0x3ff;
  ArgDescriptor Arg;

  if (Info.hasWorkItemIDX()) {
    Arg = allocateVGPR32Input(CCInfo, Mask);
    Info.setWorkItemIDX(Arg);
  }

  if (Info.hasWorkItemIDY()) {
    Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
    Info.setWorkItemIDY(Arg);
  }

  if (Info.hasWorkItemIDZ())
    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
}
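
// Allocate the implicit special SGPR inputs (dispatch/queue/kernarg pointers,
// dispatch id, workgroup IDs, implicit argument pointer) that this callable
// function was determined to need.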
void SITargetLowering::allocateSpecialInputSGPRs(
  CCState &CCInfo,
  MachineFunction &MF,
  const SIRegisterInfo &TRI,
  SIMachineFunctionInfo &Info) const {
  auto &ArgInfo = Info.getArgInfo();

  // TODO: Unify handling with private memory pointers.

  if (Info.hasDispatchPtr())
    ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);

  if (Info.hasQueuePtr())
    ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);

  if (Info.hasKernargSegmentPtr())
    ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);

  if (Info.hasDispatchID())
    ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);

  // flat_scratch_init is not applicable for non-kernel functions.

  if (Info.hasWorkGroupIDX())
    ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);

  if (Info.hasWorkGroupIDY())
    ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);

  if (Info.hasWorkGroupIDZ())
    ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);

  if (Info.hasImplicitArgPtr())
    ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
}
// Allocate special inputs passed in user SGPRs.
void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
                                            MachineFunction &MF,
                                            const SIRegisterInfo &TRI,
                                            SIMachineFunctionInfo &Info) const {
  if (Info.hasImplicitBufferPtr()) {
    unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    CCInfo.AllocateReg(InputPtrReg);

    Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
    MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  }

  if (Info.hasDispatchID()) {
    unsigned DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}
// Allocate special input registers that are initialized per-wave.
void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
                                           MachineFunction &MF,
                                           SIMachineFunctionInfo &Info,
                                           CallingConv::ID CallConv,
                                           bool IsShader) const {
  if (Info.hasWorkGroupIDX()) {
    unsigned Reg = Info.addWorkGroupIDX();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDY()) {
    unsigned Reg = Info.addWorkGroupIDY();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDZ()) {
    unsigned Reg = Info.addWorkGroupIDZ();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupInfo()) {
    unsigned Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (IsShader) {
      PrivateSegmentWaveByteOffsetReg =
        Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      // This is true if the scratch wave byte offset doesn't have a fixed
      // location.
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      }
    } else
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }
}
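
// Decide which physical registers to use for the scratch resource descriptor,
// the stack pointer and the frame/scratch wave offset, depending on whether
// the function actually needs stack access.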
static void reservePrivateMemoryRegs(const TargetMachine &TM,
                                     MachineFunction &MF,
                                     const SIRegisterInfo &TRI,
                                     SIMachineFunctionInfo &Info) {
  // Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool HasStackObjects = MFI.hasStackObjects();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

  // Everything live out of a block is spilled with fast regalloc, so it's
  // almost certain that spilling will be required.
  if (TM.getOptLevel() == CodeGenOpt::None)
    HasStackObjects = true;

  // For now assume stack access is needed in any callee functions, so we need
  // the scratch registers to pass in.
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
    // If we have stack objects, we unquestionably need the private buffer
    // resource. For the Code Object V2 ABI, this will be the first 4 user
    // SGPR inputs. We can reserve those and use them directly.

    Register PrivateSegmentBufferReg =
      Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    Info.setScratchRSrcReg(PrivateSegmentBufferReg);
  } else {
    unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
    // We tentatively reserve the last registers (skipping the last registers
    // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
    // we'll replace these with the ones immediately after those which were
    // really allocated. In the prologue copies will be inserted from the
    // argument to these reserved registers.

    // Without HSA, relocations are used for the scratch pointer and the
    // buffer resource setup is always inserted in the prologue. Scratch wave
    // offset is still in an input SGPR.
    Info.setScratchRSrcReg(ReservedBufferReg);
  }

  // hasFP should be accurate for kernels even before the frame is finalized.
  if (ST.getFrameLowering()->hasFP(MF)) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    // Try to use s32 as the SP, but move it if it would interfere with input
    // arguments. This won't work with calls though.
    //
    // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
    // registers.
    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
    } else {
      assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));

      if (MFI.hasCalls())
        report_fatal_error("call in graphics shader with too many input SGPRs");

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);
          break;
        }
      }

      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
        report_fatal_error("failed to find register for SP");
    }

    if (MFI.hasCalls()) {
      Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
      Info.setFrameOffsetReg(AMDGPU::SGPR33);
    } else {
      unsigned ReservedOffsetReg =
        TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
      Info.setFrameOffsetReg(ReservedOffsetReg);
    }
  } else if (RequiresStackAccess) {
    assert(!MFI.hasCalls());
    // We know there are accesses and they will be done relative to SP, so just
    // pin it to the input.
    //
    // FIXME: Should not do this if inline asm is reading/writing these
    // registers.
    Register PreloadedSP = Info.getPreloadedReg(
        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

    Info.setStackPtrOffsetReg(PreloadedSP);
    Info.setScratchWaveOffsetReg(PreloadedSP);
    Info.setFrameOffsetReg(PreloadedSP);
  } else {
    assert(!MFI.hasCalls());

    // There may not be stack access at all. There may still be spills, or
    // access of a constant pointer (in which cases an extra copy will be
    // emitted in the prolog).
    unsigned ReservedOffsetReg
      = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    Info.setStackPtrOffsetReg(ReservedOffsetReg);
    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    Info.setFrameOffsetReg(ReservedOffsetReg);
  }
}
bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  return !Info->isEntryFunction();
}
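
// Split-CSR handling saves and restores callee-saved registers via explicit
// copies in the entry and exit blocks rather than the usual prologue/epilogue
// spills; it is only enabled for non-entry (callable) functions here.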
void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {

}
void SITargetLowering::insertCopiesSplitCSR(
  MachineBasicBlock *Entry,
  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
      .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
        .addReg(NewVR);
  }
}
SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &Fn = MF.getFunction();
  FunctionType *FType = MF.getFunction().getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
    DiagnosticInfoUnsupported NoGraphicsHSA(
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    DAG.getContext()->diagnose(NoGraphicsHSA);
    return DAG.getEntryNode();
  }

  SmallVector<ISD::InputArg, 16> Splits;
  SmallVector<CCValAssign, 16> ArgLocs;
  BitVector Skipped(Ins.size());
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  bool IsShader = AMDGPU::isShader(CallConv);
  bool IsKernel = AMDGPU::isKernel(CallConv);
  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);

  if (IsShader) {
    processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);

    // At least one interpolation mode must be enabled or else the GPU will
    // hang.
    //
    // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
    // set PSInputAddr, the user wants to enable some bits after the compilation
    // based on run-time states. Since we can't know what the final PSInputEna
    // will look like, we shouldn't do anything here and the user should take
    // responsibility for the correct programming.
    //
    // Otherwise, the following restrictions apply:
    // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
    // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
    //   enabled too.
    if (CallConv == CallingConv::AMDGPU_PS) {
      if ((Info->getPSInputAddr() & 0x7F) == 0 ||
          ((Info->getPSInputAddr() & 0xF) == 0 &&
           Info->isPSInputAllocated(11))) {
        CCInfo.AllocateReg(AMDGPU::VGPR0);
        CCInfo.AllocateReg(AMDGPU::VGPR1);
        Info->markPSInputAllocated(0);
        Info->markPSInputEnabled(0);
      }
      if (Subtarget->isAmdPalOS()) {
        // For isAmdPalOS, the user does not enable some bits after compilation
        // based on run-time states; the register values being generated here are
        // the final ones set in hardware. Therefore we need to apply the
        // workaround to PSInputAddr and PSInputEnable together. (The case where
        // a bit is set in PSInputAddr but not PSInputEnable is where the
        // frontend set up an input arg for a particular interpolation mode, but
        // nothing uses that input arg. Really we should have an earlier pass
        // that removes such an arg.)
        unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
        if ((PsInputBits & 0x7F) == 0 ||
            ((PsInputBits & 0xF) == 0 &&
             (PsInputBits >> 11 & 1)))
          Info->markPSInputEnabled(
              countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
      }
    }

    assert(!Info->hasDispatchPtr() &&
           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
           !Info->hasWorkItemIDZ());
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
  } else {
    Splits.append(Ins.begin(), Ins.end());
  }

  if (IsEntryFunc) {
    allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
    allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
  }

  if (IsKernel) {
    analyzeFormalArgumentsCompute(CCInfo, Ins);
  } else {
    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
    CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
  }

  SmallVector<SDValue, 16> Chains;

  // FIXME: This is the minimum kernel argument alignment. We should improve
  // this to the maximum alignment of the arguments.
  //
  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
  // kern arg offset.
  const unsigned KernelArgBaseAlign = 16;

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];
    if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    MVT VT = VA.getLocVT();

    if (IsEntryFunc && VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = VA.getLocVT();

      const uint64_t Offset = VA.getLocMemOffset();
      unsigned Align = MinAlign(KernelArgBaseAlign, Offset);

      SDValue Arg = lowerKernargMemParameter(
        DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
      Chains.push_back(Arg.getValue(1));

      auto *ParamTy =
        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
      if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
          ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
                      ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
        // On SI local pointers are just offsets into LDS, so they are always
        // less than 16-bits. On CI and newer they could potentially be
        // real pointers, so we can't guarantee their size.
        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
                          DAG.getValueType(MVT::i16));
      }

      InVals.push_back(Arg);
      continue;
    } else if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
      InVals.push_back(Val);
      if (!Arg.Flags.isByVal())
        Chains.push_back(Val.getValue(1));
      continue;
    }

    assert(VA.isRegLoc() && "Parameter must be in a register!");

    Register Reg = VA.getLocReg();
    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
    EVT ValVT = VA.getValVT();

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.Flags.isSRet()) {
      // The return object should be reasonably addressable.

      // FIXME: This helps when the return is a real sret. If it is a
      // automatically inserted sret (i.e. CanLowerReturn returns false), an
      // extra copy is inserted in SelectionDAGBuilder which obscures this.
      unsigned NumBits
        = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
        DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
    }

    // If this is an 8 or 16-bit value, it is really passed promoted
    // to 32 bits. Insert an assert[sz]ext to capture this, then
    // truncate to the right size.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
                        DAG.getValueType(ValVT));
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
                        DAG.getValueType(ValVT));
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Val);
  }

  if (!IsEntryFunc) {
    // Special inputs come after user arguments.
    allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
    CCInfo.AllocateReg(Info->getFrameOffsetReg());
    allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  auto &ArgUsageInfo =
    DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());

  unsigned StackArgSize = CCInfo.getNextStackOffset();
  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain :
    DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
// TODO: If return values can't fit in registers, we should return as many as
// possible in registers before passing on stack.
bool SITargetLowering::CanLowerReturn(
  CallingConv::ID CallConv,
  MachineFunction &MF, bool IsVarArg,
  const SmallVectorImpl<ISD::OutputArg> &Outs,
  LLVMContext &Context) const {
  // Replacing returns with sret/stack usage doesn't make sense for shaders.
  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
  // for shaders. Vector types should be explicitly handled by CC.
  if (AMDGPU::isEntryFunctionCC(CallConv))
    return true;

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
}
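
// Lower outgoing return values. Kernels use the generic AMDGPU path; shaders
// and callable functions copy the results into the locations chosen by the
// return calling convention and emit the appropriate return node
// (ENDPGM, RETURN_TO_EPILOG or RET_FLAG).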
SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (AMDGPU::isKernel(CallConv)) {
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);
  }

  bool IsShader = AMDGPU::isShader(CallConv);

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;
  SmallVector<ISD::OutputArg, 48> Splits;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Flag;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

  // Add return address for callable functions.
  if (!Info->isEntryFunction()) {
    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    SDValue ReturnAddrReg = CreateLiveInRegister(
      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);

    SDValue ReturnAddrVirtualReg = DAG.getRegister(
        MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
        MVT::i64);
    Chain =
        DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(ReturnAddrVirtualReg);
  }

  // Copy the result values into the output registers.
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");
    // TODO: Partially return in registers if return values don't fit.
    SDValue Arg = OutVals[RealRVLocIdx];

    // Copied from other backends.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // FIXME: Does sret work properly?
  if (!Info->isEntryFunction()) {
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
    if (I) {
      for (; *I; ++I) {
        if (AMDGPU::SReg_64RegClass.contains(*I))
          RetOps.push_back(DAG.getRegister(*I, MVT::i64));
        else if (AMDGPU::SReg_32RegClass.contains(*I))
          RetOps.push_back(DAG.getRegister(*I, MVT::i32));
        else
          llvm_unreachable("Unexpected register class in CSRsViaCopy!");
      }
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  unsigned Opc = AMDGPUISD::ENDPGM;
  if (!IsWaveEnd)
    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
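
// Copy the values returned by a call out of their assigned physical registers
// into the values the caller expects, applying the same assert/truncate fixups
// used for promoted values.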
SDValue SITargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    SDValue ThisVal) const {
  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];
    SDValue Val;

    if (VA.isRegLoc()) {
      Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    } else if (VA.isMemLoc()) {
      report_fatal_error("TODO: return values in memory");
    } else
      llvm_unreachable("unknown argument location type");

    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Val);
  }

  return Chain;
}
// Add code to pass special inputs required depending on used features separate
// from the explicit user arguments present in the IR.
void SITargetLowering::passSpecialInputs(
    CallLoweringInfo &CLI,
    CCState &CCInfo,
    const SIMachineFunctionInfo &Info,
    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
    SmallVectorImpl<SDValue> &MemOpChains,
    SDValue Chain) const {
  // If we don't have a call site, this was a call inserted by
  // legalization. These can never use special inputs.
  if (!CLI.CS)
    return;

  const Function *CalleeFunc = CLI.CS.getCalledFunction();
  assert(CalleeFunc);

  SelectionDAG &DAG = CLI.DAG;
  const SDLoc &DL = CLI.DL;

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  auto &ArgUsageInfo =
    DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
  const AMDGPUFunctionArgInfo &CalleeArgInfo
    = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
  };

  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;

    std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC)
      = CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    // All special arguments are ints for now.
    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
    SDValue InputReg;

    if (IncomingArg) {
      InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
    } else {
      // The implicit arg ptr is special because it doesn't have a corresponding
      // input for kernels, and is computed from the kernarg segment pointer.
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
      InputReg = getImplicitArgPtr(DAG, DL);
    }

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
    } else {
      unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
      SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
                                              SpecialArgOffset);
      MemOpChains.push_back(ArgStore);
    }
  }

  // Pack workitem IDs into a single register or pass it as is if already
  // packed.
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;

  std::tie(OutgoingArg, ArgRC) =
    CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC) =
      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC) =
      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return;

  const ArgDescriptor *IncomingArgX
    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
  const ArgDescriptor *IncomingArgY
    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
  const ArgDescriptor *IncomingArgZ
    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;

  SDValue InputReg;
  SDLoc SL;

  // If incoming ids are not packed we need to pack them.
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
    SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
    Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
                    DAG.getShiftAmountConstant(10, MVT::i32, SL));
    InputReg = InputReg.getNode() ?
                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
    SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
    Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
                    DAG.getShiftAmountConstant(20, MVT::i32, SL));
    InputReg = InputReg.getNode() ?
                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
  }

  if (!InputReg.getNode()) {
    // Workitem ids are already packed, any of present incoming arguments
    // will carry all required fields.
    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
      IncomingArgX ? *IncomingArgX :
      IncomingArgY ? *IncomingArgY :
      *IncomingArgZ, ~0u);
    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
  }

  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
  } else {
    unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
    SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
                                            SpecialArgOffset);
    MemOpChains.push_back(ArgStore);
  }
}
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return CC == CallingConv::Fast;
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::C:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}
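
// Decide whether this call can be lowered as a tail call: the calling
// conventions and preserved-register masks must be compatible, the caller must
// not take byval arguments, and the callee's stack arguments must fit inside
// the caller's own incoming argument area.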
bool SITargetLowering::isEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  // Kernels aren't callable, and don't have a live in return address so it
  // doesn't make sense to do a tail call with entry functions.
  if (!CallerPreserved)
    return false;

  bool CCMatch = CallerCC == CalleeCC;

  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // TODO: Can we handle var args?
  if (IsVarArg)
    return false;

  for (const Argument &Arg : CallerF.args()) {
    if (Arg.hasByValAttr())
      return false;
  }

  LLVMContext &Ctx = *DAG.getContext();

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
                                  CCAssignFnForCall(CalleeCC, IsVarArg),
                                  CCAssignFnForCall(CallerCC, IsVarArg)))
    return false;

  // The callee has to preserve all registers the caller needs to preserve.
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // Nothing more to check if the callee is taking no arguments.
  if (Outs.empty())
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made tail.
  // TODO: Is this really necessary?
  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
    return false;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
}
bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!CI->isTailCall())
    return false;

  const Function *ParentFn = CI->getParent()->getParent();
  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
    return false;

  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
  return (Attr.getValueAsString() != "true");
}
// The wave scratch offset register is used as the global base pointer.
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                                    SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  const SDLoc &DL = CLI.DL;
  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &IsTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool IsVarArg = CLI.IsVarArg;
  bool IsSibCall = false;
  bool IsThisReturn = false;
  MachineFunction &MF = DAG.getMachineFunction();

  if (IsVarArg) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported call to variadic function ");
  }

  if (!CLI.CS.getInstruction())
    report_fatal_error("unsupported libcall legalization");

  if (!CLI.CS.getCalledFunction()) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported indirect call to function ");
  }

  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported required tail call to function ");
  }

  if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
    // Note the issue is with the CC of the calling function, not of the call
    // itself.
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported call from graphics shader of function ");
  }

  if (IsTailCall) {
    IsTailCall = isEligibleForTailCallOptimization(
      Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
    if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
      report_fatal_error("failed to perform tail call elimination on a call "
                         "site marked musttail");
    }

    bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;

    // A sibling call is one where we're under the usual C ABI and not planning
    // to change that but can still do a tail call:
    if (!TailCallOpt && IsTailCall)
      IsSibCall = true;

    if (IsTailCall)
      ++NumTailCalls;
  }

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);

  CCInfo.AnalyzeCallOperands(Outs, AssignFn);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  if (IsSibCall) {
    // Since we're not changing the ABI to make this a tail call, the memory
    // operands are already available in the caller's incoming argument space.
    NumBytes = 0;
  }

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0. Completely unused for non-tail calls.
  int32_t FPDiff = 0;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  if (!IsSibCall) {
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);

    SmallVector<SDValue, 4> CopyFromChains;

    // In the HSA case, this should be an identity copy.
    SDValue ScratchRSrcReg
      = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
    RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
    CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
    Chain = DAG.getTokenFactor(DL, CopyFromChains);
  }

  SmallVector<SDValue, 8> MemOpChains;
  MVT PtrVT = MVT::i32;

  // Walk the register/memloc assignments, inserting copies/loads.
  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
       ++i, ++realArgIdx) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[realArgIdx];

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::FPExt:
      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      assert(VA.isMemLoc());

      SDValue DstAddr;
      MachinePointerInfo DstInfo;

      unsigned LocMemOffset = VA.getLocMemOffset();
      int32_t Offset = LocMemOffset;

      SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
      unsigned Align = 0;

      if (IsTailCall) {
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
        unsigned OpSize = Flags.isByVal() ?
          Flags.getByValSize() : VA.getValVT().getStoreSize();

        // FIXME: We can have better than the minimum byval required alignment.
        Align = Flags.isByVal() ? Flags.getByValAlign() :
          MinAlign(Subtarget->getStackAlignment(), Offset);

        Offset = Offset + FPDiff;
        int FI = MFI.CreateFixedObject(OpSize, Offset, true);

        DstAddr = DAG.getFrameIndex(FI, PtrVT);
        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);

        // Make sure any stack arguments overlapping with where we're storing
        // are loaded before this eventual operation. Otherwise they'll be
        // clobbered.

        // FIXME: Why is this really necessary? This seems to just result in a
        // lot of code to copy the stack and write them back to the same
        // locations, which are supposed to be immutable?
        Chain = addTokenForArgument(Chain, DAG, MFI, FI);
      } else {
        DstAddr = PtrOff;
        DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
        Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
      }

      if (Outs[i].Flags.isByVal()) {
        SDValue SizeNode =
          DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
        SDValue Cpy = DAG.getMemcpy(
          Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
          /*isVol = */ false, /*AlwaysInline = */ true,
          /*isTailCall = */ false, DstInfo,
          MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
              *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));

        MemOpChains.push_back(Cpy);
      } else {
        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
        MemOpChains.push_back(Store);
      }
    }
  }

  // Copy special input registers after user input arguments.
  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (auto &RegToPass : RegsToPass) {
    Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
                             RegToPass.second, InFlag);
    InFlag = Chain.getValue(1);
  }


  SDValue PhysReturnAddrReg;
  if (IsTailCall) {
    // Since the return is being combined with the call, we need to pass on the
    // return address.

    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    SDValue ReturnAddrReg = CreateLiveInRegister(
      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);

    PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
                                        MVT::i64);
    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
    InFlag = Chain.getValue(1);
  }

  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call, however in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll be
  // in the correct location.
  if (IsTailCall && !IsSibCall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getTargetConstant(NumBytes, DL, MVT::i32),
                               DAG.getTargetConstant(0, DL, MVT::i32),
                               InFlag, DL);
    InFlag = Chain.getValue(1);
  }

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);
  // Add a redundant copy of the callee global which will not be legalized, as
  // we need direct access to the callee later.
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
  const GlobalValue *GV = GSD->getGlobal();
  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));

  if (IsTailCall) {
    // Each tail call may have to adjust the stack by a different amount, so
    // this information must travel along with the operation for eventual
    // consumption by emitEpilogue.
    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));

    Ops.push_back(PhysReturnAddrReg);
  }

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (auto &RegToPass : RegsToPass) {
    Ops.push_back(DAG.getRegister(RegToPass.first,
                                  RegToPass.second.getValueType()));
  }

  // Add a register mask operand representing the call-preserved registers.

  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
  if (IsTailCall) {
    MFI.setHasTailCall();
    return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
  }

  // Returns a chain and a flag for retval copy to use.
  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
  Chain = Call.getValue(0);
  InFlag = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
                             DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
                             InFlag, DL);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
                         InVals, IsThisReturn,
                         IsThisReturn ? OutVals[0] : SDValue());
}
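
// Map a register name used by the named-register intrinsics (e.g. "exec",
// "m0", "flat_scratch") to the corresponding physical register, checking that
// the requested value type matches the register's width.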
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                             SelectionDAG &DAG) const {
  unsigned Reg = StringSwitch<unsigned>(RegName)
    .Case("m0", AMDGPU::M0)
    .Case("exec", AMDGPU::EXEC)
    .Case("exec_lo", AMDGPU::EXEC_LO)
    .Case("exec_hi", AMDGPU::EXEC_HI)
    .Case("flat_scratch", AMDGPU::FLAT_SCR)
    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    .Default(AMDGPU::NoRegister);

  if (Reg == AMDGPU::NoRegister) {
    report_fatal_error(Twine("invalid register name \""
                             + StringRef(RegName) + "\"."));
  }

  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    report_fatal_error(Twine("invalid register \""
                             + StringRef(RegName) + "\" for subtarget."));
  }

  switch (Reg) {
  case AMDGPU::M0:
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
    if (VT.getSizeInBits() == 32)
      return Reg;
    break;
  case AMDGPU::EXEC:
  case AMDGPU::FLAT_SCR:
    if (VT.getSizeInBits() == 64)
      return Reg;
    break;
  default:
    llvm_unreachable("missing register type checking");
  }

  report_fatal_error(Twine("invalid type for register \""
                           + StringRef(RegName) + "\"."));
}
// If kill is not the last instruction, split the block so kill is always a
// proper terminator.
MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
                                                    MachineBasicBlock *BB) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineBasicBlock::iterator SplitPoint(&MI);
  ++SplitPoint;

  if (SplitPoint == BB->end()) {
    // Don't bother with a new block.
    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
    return BB;
  }

  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *SplitBB
    = MF->CreateMachineBasicBlock(BB->getBasicBlock());

  MF->insert(++MachineFunction::iterator(BB), SplitBB);
  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());

  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(SplitBB);

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
  return SplitBB;
}
// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is
// true, \p MI will be the only instruction in the loop body block. Otherwise,
// it will be the first instruction in the remainder block.
//
/// \returns { LoopBody, Remainder }
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
  MachineFunction *MF = MBB.getParent();
  MachineBasicBlock::iterator I(&MI);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);

  if (InstInLoop) {
    auto Next = std::next(I);

    // Move instruction to loop body.
    LoopBB->splice(LoopBB->begin(), &MBB, I, Next);

    // Move the rest of the block.
    RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
  } else {
    RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
  }

  MBB.addSuccessor(LoopBB);

  return std::make_pair(LoopBB, RemainderBB);
}
/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
  MachineBasicBlock *MBB = MI.getParent();
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  auto I = MI.getIterator();
  auto E = std::next(I);

  BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
    .addImm(0);

  MIBundleBuilder Bundler(*MBB, I, E);
  finalizeBundle(*MBB, Bundler.begin());
}
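
// Expand a GWS operation into a loop: clear TRAP_STS.MEM_VIOL, execute the
// operation bundled with an s_waitcnt, then re-read the MEM_VIOL bit and
// repeat while a memory violation was reported.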
MachineBasicBlock *
SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
  const DebugLoc &DL = MI.getDebugLoc();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

  MachineBasicBlock *LoopBB;
  MachineBasicBlock *RemainderBB;
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Apparently kill flags are only valid if the def is in the same block?
  if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
    Src->setIsKill(false);

  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);

  MachineBasicBlock::iterator I = LoopBB->end();

  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
    AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);

  // Clear TRAP_STS.MEM_VIOL
  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
    .addImm(0)
    .addImm(EncodedReg);

  bundleInstWithWaitcnt(MI);

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  // Load and check TRAP_STS.MEM_VIOL
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
    .addImm(EncodedReg);

  // FIXME: Do we need to use an isel pseudo that may clobber scc?
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
    .addReg(Reg, RegState::Kill)
    .addImm(0);

  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
    .addMBB(LoopBB);

  return RemainderBB;
}
// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
// wavefront. If the value is uniform and just happens to be in a VGPR, this
// will only do one iteration. In the worst case, this will loop 64 times.
//
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
  const SIInstrInfo *TII,
  MachineRegisterInfo &MRI,
  MachineBasicBlock &OrigBB,
  MachineBasicBlock &LoopBB,
  const DebugLoc &DL,
  const MachineOperand &IdxReg,
  unsigned InitReg,
  unsigned ResultReg,
  unsigned PhiReg,
  unsigned InitSaveExecReg,
  int Offset,
  bool UseGPRIdxMode,
  bool IsIndirectSrc) {
  MachineFunction *MF = OrigBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineBasicBlock::iterator I = LoopBB.begin();

  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
    .addReg(InitReg)
    .addMBB(&OrigBB)
    .addReg(ResultReg)
    .addMBB(&LoopBB);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&OrigBB)
    .addReg(NewExec)
    .addMBB(&LoopBB);

  // Read the next variant <- also loop target.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Compare the just read M0 value to all possible Idx values.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
    .addReg(CurrentIdxReg)
    .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());

  // Update EXEC, save the original EXEC value to VCC.
  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                                : AMDGPU::S_AND_SAVEEXEC_B64),
          NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
    unsigned IdxReg;
    if (Offset == 0) {
      IdxReg = CurrentIdxReg;
    } else {
      IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
        .addReg(CurrentIdxReg, RegState::Kill)
        .addImm(Offset);
    }
    unsigned IdxMode = IsIndirectSrc ?
      AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
    MachineInstr *SetOn =
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
        .addReg(IdxReg, RegState::Kill)
        .addImm(IdxMode);
    SetOn->getOperand(3).setIsUndef();
  } else {
    // Move index from VCC into M0
    if (Offset == 0) {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(CurrentIdxReg, RegState::Kill);
    } else {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(CurrentIdxReg, RegState::Kill)
        .addImm(Offset);
    }
  }

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  MachineInstr *InsertPt =
    BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                                  : AMDGPU::S_XOR_B64_term),
            Exec)
      .addReg(Exec)
      .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);

  return InsertPt->getIterator();
}
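// A rough sketch of the waterfall loop built above, for the wave64 case
// (register names are illustrative, not the exact allocation):
//
//   loop:
//     v_readfirstlane_b32 s_idx, v_idx
//     v_cmp_eq_u32_e64 s[c:c+1], s_idx, v_idx
//     s_and_saveexec_b64 s[n:n+1], s[c:c+1]
//     s_mov_b32 m0, s_idx              ; or s_add_i32 m0, s_idx, offset
//     <indirect move inserted by the caller at the returned iterator>
//     s_xor_b64 exec, exec, s[n:n+1]
//     s_cbranch_execnz loop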
// This has slightly sub-optimal regalloc when the source vector is killed by
// the read. The register allocator does not understand that the kill is
// per-workitem, so the vector is kept alive for the whole loop and we end up
// not re-using a subregister from it, using 1 more VGPR than necessary. This
// was saved when this was expanded after register allocation.
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
                                                  MachineBasicBlock &MBB,
                                                  MachineInstr &MI,
                                                  unsigned InitResultReg,
                                                  unsigned PhiReg,
                                                  int Offset,
                                                  bool UseGPRIdxMode,
                                                  bool IsIndirectSrc) {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register DstReg = MI.getOperand(0).getReg();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
    .addReg(Exec);

  MachineBasicBlock *LoopBB;
  MachineBasicBlock *RemainderBB;
  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, IsIndirectSrc);

  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII->get(MovExecOpc), Exec)
    .addReg(SaveExec);

  return InsPt;
}
// Returns subreg index, offset
static std::pair<unsigned, int>
computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
                            const TargetRegisterClass *SuperRC,
                            unsigned VecReg,
                            int Offset) {
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (Offset >= NumElts || Offset < 0)
    return std::make_pair(AMDGPU::sub0, Offset);

  return std::make_pair(AMDGPU::sub0 + Offset, 0);
}
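// Example: for a 128-bit (4 x 32-bit) register class, an offset of 2 yields
// (sub2, 0), while an out-of-range offset such as 7 is left as (sub0, 7) so
// the dynamic-index paths below can handle it.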
// Return true if the index is an SGPR and was set.
static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
                                 MachineRegisterInfo &MRI,
                                 MachineInstr &MI,
                                 int Offset,
                                 bool UseGPRIdxMode,
                                 bool IsIndirectSrc) {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());

  assert(Idx->getReg() != AMDGPU::NoRegister);

  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
    return false;

  if (UseGPRIdxMode) {
    unsigned IdxMode = IsIndirectSrc ?
      AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
    if (Offset == 0) {
      MachineInstr *SetOn =
        BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
          .add(*Idx)
          .addImm(IdxMode);

      SetOn->getOperand(3).setIsUndef();
    } else {
      Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
        .add(*Idx)
        .addImm(Offset);
      MachineInstr *SetOn =
        BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
          .addReg(Tmp, RegState::Kill)
          .addImm(IdxMode);

      SetOn->getOperand(3).setIsUndef();
    }

    return true;
  }

  if (Offset == 0) {
    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .add(*Idx);
  } else {
    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
      .add(*Idx)
      .addImm(Offset);
  }

  return true;
}
// Control flow needs to be inserted if indexing with a VGPR.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const GCNSubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  Register Dst = MI.getOperand(0).getReg();
  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);

  unsigned SubReg;
  std::tie(SubReg, Offset)
    = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);

  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);

  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      // TODO: Look at the uses to avoid the copy. This may require rescheduling
      // to avoid interfering with other uses, so probably requires a new
      // optimization pass.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
        .addReg(SrcReg, RegState::Undef, SubReg)
        .addReg(SrcReg, RegState::Implicit)
        .addReg(AMDGPU::M0, RegState::Implicit);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
        .addReg(SrcReg, RegState::Undef, SubReg)
        .addReg(SrcReg, RegState::Implicit);
    }

    MI.eraseFromParent();

    return &MBB;
  }

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);

  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
                              Offset, UseGPRIdxMode, true);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  if (UseGPRIdxMode) {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
      .addReg(SrcReg, RegState::Undef, SubReg)
      .addReg(SrcReg, RegState::Implicit)
      .addReg(AMDGPU::M0, RegState::Implicit);
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
  } else {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
      .addReg(SrcReg, RegState::Undef, SubReg)
      .addReg(SrcReg, RegState::Implicit);
  }

  MI.eraseFromParent();

  return LoopBB;
}
static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
                                 const TargetRegisterClass *VecRC) {
  switch (TRI.getRegSizeInBits(*VecRC)) {
  case 32: // 4 bytes
    return AMDGPU::V_MOVRELD_B32_V1;
  case 64: // 8 bytes
    return AMDGPU::V_MOVRELD_B32_V2;
  case 128: // 16 bytes
    return AMDGPU::V_MOVRELD_B32_V4;
  case 256: // 32 bytes
    return AMDGPU::V_MOVRELD_B32_V8;
  case 512: // 64 bytes
    return AMDGPU::V_MOVRELD_B32_V16;
  default:
    llvm_unreachable("unsupported size for MOVRELD pseudos");
  }
}
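// The mapping above is purely size-based: a 256-bit vector register class
// (8 dwords) selects V_MOVRELD_B32_V8, and so on; only the listed sizes are
// expected to reach this point.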
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const GCNSubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  Register Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());

  // This can be an immediate, but will be folded later.
  assert(Val->getReg());

  unsigned SubReg;
  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
                                                         SrcVec->getReg(),
                                                         Offset);
  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);

  if (Idx->getReg() == AMDGPU::NoRegister) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    assert(Offset == 0);

    BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
      .add(*SrcVec)
      .add(*Val)
      .addImm(SubReg);

    MI.eraseFromParent();
    return &MBB;
  }

  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
        .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
        .add(*Val)
        .addReg(Dst, RegState::ImplicitDefine)
        .addReg(SrcVec->getReg(), RegState::Implicit)
        .addReg(AMDGPU::M0, RegState::Implicit);

      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    } else {
      const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));

      BuildMI(MBB, I, DL, MovRelDesc)
        .addReg(Dst, RegState::Define)
        .addReg(SrcVec->getReg())
        .add(*Val)
        .addImm(SubReg - AMDGPU::sub0);
    }

    MI.eraseFromParent();
    return &MBB;
  }

  if (Val->isReg())
    MRI.clearKillFlags(Val->getReg());

  const DebugLoc &DL = MI.getDebugLoc();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
                              Offset, UseGPRIdxMode, false);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  if (UseGPRIdxMode) {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
      .addReg(PhiReg, RegState::Undef, SubReg) // vdst
      .add(*Val)
      .addReg(Dst, RegState::ImplicitDefine)
      .addReg(PhiReg, RegState::Implicit)
      .addReg(AMDGPU::M0, RegState::Implicit);
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
  } else {
    const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));

    BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
      .addReg(Dst, RegState::Define)
      .addReg(PhiReg)
      .add(*Val)
      .addImm(SubReg - AMDGPU::sub0);
  }

  MI.eraseFromParent();
  return LoopBB;
}
MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
  MachineInstr &MI, MachineBasicBlock *BB) const {

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  MachineFunction *MF = BB->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  if (TII->isMIMG(MI)) {
    if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
      report_fatal_error("missing mem operand from MIMG instruction");
    }
    // Add a memoperand for mimg instructions so that they aren't assumed to
    // be ordered memory instructions.

    return BB;
  }

  switch (MI.getOpcode()) {
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const TargetRegisterClass *BoolRC = TRI->getBoolRC();
    const DebugLoc &DL = MI.getDebugLoc();

    MachineOperand &Dest = MI.getOperand(0);
    MachineOperand &Src0 = MI.getOperand(1);
    MachineOperand &Src1 = MI.getOperand(2);

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
                                                           Src0, BoolRC, AMDGPU::sub0,
                                                           &AMDGPU::SReg_32_XM0RegClass);
    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
                                                           Src0, BoolRC, AMDGPU::sub1,
                                                           &AMDGPU::SReg_32_XM0RegClass);

    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
                                                           Src1, BoolRC, AMDGPU::sub0,
                                                           &AMDGPU::SReg_32_XM0RegClass);
    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
                                                           Src1, BoolRC, AMDGPU::sub1,
                                                           &AMDGPU::SReg_32_XM0RegClass);

    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
      .add(Src0Sub0)
      .add(Src1Sub0);
    BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
      .add(Src0Sub1)
      .add(Src1Sub1);
    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_INIT_M0: {
    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .add(MI.getOperand(0));
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_INIT_EXEC:
    // This should be before all vector instructions.
    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
            AMDGPU::EXEC)
      .addImm(MI.getOperand(0).getImm());
    MI.eraseFromParent();
    return BB;

  case AMDGPU::SI_INIT_EXEC_LO:
    // This should be before all vector instructions.
    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
            AMDGPU::EXEC_LO)
      .addImm(MI.getOperand(0).getImm());
    MI.eraseFromParent();
    return BB;

  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
    // Extract the thread count from an SGPR input and set EXEC accordingly.
    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
    //
    // S_BFE_U32 count, input, {shift, 7}
    // S_BFM_B64 exec, count, 0
    // S_CMP_EQ_U32 count, 64
    // S_CMOV_B64 exec, -1
    MachineInstr *FirstMI = &*BB->begin();
    MachineRegisterInfo &MRI = MF->getRegInfo();
    Register InputReg = MI.getOperand(0).getReg();
    Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    bool Found = false;

    // Move the COPY of the input reg to the beginning, so that we can use it.
    for (auto I = BB->begin(); I != &MI; I++) {
      if (I->getOpcode() != TargetOpcode::COPY ||
          I->getOperand(0).getReg() != InputReg)
        continue;

      if (I == FirstMI) {
        FirstMI = &*++BB->begin();
      } else {
        I->removeFromParent();
        BB->insert(FirstMI, &*I);
      }
      Found = true;
      break;
    }
    assert(Found);
    (void)Found;

    // This should be before all vector instructions.
    unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
    bool isWave32 = getSubtarget()->isWave32();
    unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
      .addReg(InputReg)
      .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
    BuildMI(*BB, FirstMI, DebugLoc(),
            TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
            Exec)
      .addReg(CountReg)
      .addImm(0);
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
      .addReg(CountReg, RegState::Kill)
      .addImm(getSubtarget()->getWavefrontSize());
    BuildMI(*BB, FirstMI, DebugLoc(),
            TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
            Exec)
      .addImm(-1);
    MI.eraseFromParent();
    return BB;
  }

  case AMDGPU::GET_GROUPSTATICSIZE: {
    assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
           getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
    DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
      .add(MI.getOperand(0))
      .addImm(MFI->getLDSSize());
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V16:
    return emitIndirectSrc(MI, *BB, *getSubtarget());
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V16:
    return emitIndirectDst(MI, *BB, *getSubtarget());
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
    return splitKillBlock(MI, BB);
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const SIRegisterInfo *TRI = ST.getRegisterInfo();

    Register Dst = MI.getOperand(0).getReg();
    Register Src0 = MI.getOperand(1).getReg();
    Register Src1 = MI.getOperand(2).getReg();
    const DebugLoc &DL = MI.getDebugLoc();
    Register SrcCond = MI.getOperand(3).getReg();

    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);

    BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
      .addReg(SrcCond);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
      .addImm(0)
      .addReg(Src0, 0, AMDGPU::sub0)
      .addImm(0)
      .addReg(Src1, 0, AMDGPU::sub0)
      .addReg(SrcCondCopy);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
      .addImm(0)
      .addReg(Src0, 0, AMDGPU::sub1)
      .addImm(0)
      .addReg(Src1, 0, AMDGPU::sub1)
      .addReg(SrcCondCopy);

    BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
      .addReg(DstLo)
      .addImm(AMDGPU::sub0)
      .addReg(DstHi)
      .addImm(AMDGPU::sub1);
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_BR_UNDEF: {
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    const DebugLoc &DL = MI.getDebugLoc();
    MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
                         .add(MI.getOperand(0));
    Br->getOperand(1).setIsUndef(true); // read undef SCC
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
    const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, &MI);

    // Add an implicit use of the frame offset reg to prevent the restore copy
    // inserted after the call from being reordered after stack operations in
    // the caller's frame.
    MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
       .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
       .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
    return BB;
  }
  case AMDGPU::SI_CALL_ISEL: {
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    const DebugLoc &DL = MI.getDebugLoc();

    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);

    MachineInstrBuilder MIB;
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
      MIB.add(MI.getOperand(I));

    MIB.cloneMemRefs(MI);
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::V_ADD_I32_e32:
  case AMDGPU::V_SUB_I32_e32:
  case AMDGPU::V_SUBREV_I32_e32: {
    // TODO: Define distinct V_*_I32_Pseudo instructions instead.
    const DebugLoc &DL = MI.getDebugLoc();
    unsigned Opc = MI.getOpcode();

    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      Opc = AMDGPU::getVOPe64(Opc);
      NeedClampOperand = true;
    }

    auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
    if (TII->isVOP3(*I)) {
      const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      I.addReg(TRI->getVCC(), RegState::Define);
    }
    I.add(MI.getOperand(1))
     .add(MI.getOperand(2));
    if (NeedClampOperand)
      I.addImm(0); // clamp bit for e64 encoding

    TII->legalizeOperands(*I);

    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
  case AMDGPU::DS_GWS_BARRIER:
    // An s_waitcnt 0 is required to be the instruction immediately following.
    if (getSubtarget()->hasGWSAutoReplay()) {
      bundleInstWithWaitcnt(MI);
      return BB;
    }

    return emitGWSMemViolTestLoop(MI, BB);
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  }
}
bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
  return isTypeLegal(VT.getScalarType());
}

bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  // This currently forces unfolding various combinations of fsub into fma with
  // free fneg'd operands. As long as we have fast FMA (controlled by
  // isFMAFasterThanFMulAndFAdd), we should perform these.

  // When fma is quarter rate, for f64 where add / sub are at best half rate,
  // most of these combines appear to be cycle neutral but save on instruction
  // count / code size.
  return true;
}
EVT
SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                                     EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
  // TODO: Should i16 be used always if legal? For now it would force VALU
  // shifts.
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
}
// Answering this is somewhat tricky and depends on the specific device, which
// can have different rates for fma or for all f64 operations.
//
// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
// regardless of which device (although the number of cycles differs between
// devices), so it is always profitable for f64.
//
// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
// only on full rate devices. Normally, we should prefer selecting v_mad_f32
// which we can always do even without fused FP ops since it returns the same
// result as the separate operations and since it is always full
// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
// however does not support denormals, so we do report fma as faster if we have
// a fast fma device and require denormals.
//
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32: {
    // This is as fast on some subtargets. However, we always have full rate f32
    // mad available which returns the same result as the separate operations
    // which we should prefer over fma. We can't use this if we want to support
    // denormals, so only report this in these cases.
    if (Subtarget->hasFP32Denormals())
      return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();

    // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
    return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
  }
  case MVT::f64:
    return true;
  case MVT::f16:
    return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
  default:
    break;
  }

  return false;
}
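// In practice the switch above means: f64 always reports fma as faster; f16
// does so when the subtarget has 16-bit instructions and f16 denormals are
// requested; f32 does so when f32 denormals are enabled and the subtarget has
// fast FMA or DL instructions (without denormals, both are required).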
//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
                                             SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4f16);

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);

  SDLoc SL(Op);
  SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
                             Op->getFlags());
  SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
                             Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
                                              SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16);

  SDValue Lo0, Hi0;
  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
  SDValue Lo1, Hi1;
  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);

  SDLoc SL(Op);

  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
                             Op->getFlags());
  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
                             Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}

SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
                                               SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16);

  SDValue Lo0, Hi0;
  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
  SDValue Lo1, Hi1;
  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
  SDValue Lo2, Hi2;
  std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);

  SDLoc SL(Op);

  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2,
                             Op->getFlags());
  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2,
                             Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}
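// Example: an fma on v4f16 handled by splitTernaryVectorOp becomes two fmas on
// the v2f16 halves whose results are glued back with CONCAT_VECTORS, instead
// of letting LegalizeDAG scalarize all four lanes.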
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::FDIV: return LowerFDIV(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: {
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    return LowerGlobalAddress(MFI, Op, DAG);
  }
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
  case ISD::INSERT_SUBVECTOR:
    return lowerINSERT_SUBVECTOR(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return lowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
    return lowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::BUILD_VECTOR:
    return lowerBUILD_VECTOR(Op, DAG);
  case ISD::FP_ROUND:
    return lowerFP_ROUND(Op, DAG);
  case ISD::TRAP:
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
  case ISD::FABS:
  case ISD::FNEG:
  case ISD::FCANONICALIZE:
    return splitUnaryVectorOp(Op, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
    return lowerFMINNUM_FMAXNUM(Op, DAG);
  case ISD::FMA:
    return splitTernaryVectorOp(Op, DAG);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
  case ISD::ADD:
  case ISD::SUB:
  case ISD::MUL:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
  case ISD::FADD:
  case ISD::FMUL:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
    return splitBinaryVectorOp(Op, DAG);
  }
  return SDValue();
}
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
                                       const SDLoc &DL,
                                       SelectionDAG &DAG, bool Unpacked) {
  if (!LoadVT.isVector())
    return Result;

  if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
    // Truncate to v2i16/v4i16.
    EVT IntLoadVT = LoadVT.changeTypeToInteger();

    // Workaround legalizer not scalarizing truncate after vector op
    // legalization but not creating intermediate vector trunc.
    SmallVector<SDValue, 4> Elts;
    DAG.ExtractVectorElements(Result, Elts);
    for (SDValue &Elt : Elts)
      Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);

    Result = DAG.getBuildVector(IntLoadVT, DL, Elts);

    // Bitcast to original type (v2f16/v4f16).
    return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
  }

  // Cast back to the original packed type.
  return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
}

SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
                                              MemSDNode *M,
                                              SelectionDAG &DAG,
                                              ArrayRef<SDValue> Ops,
                                              bool IsIntrinsic) const {
  SDLoc DL(M);

  bool Unpacked = Subtarget->hasUnpackedD16VMem();
  EVT LoadVT = M->getValueType(0);

  EVT EquivLoadVT = LoadVT;
  if (Unpacked && LoadVT.isVector()) {
    EquivLoadVT = LoadVT.isVector() ?
      EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                       LoadVT.getVectorNumElements()) : LoadVT;
  }

  // Change from v4f16/v2f16 to EquivLoadVT.
  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);

  SDValue Load
    = DAG.getMemIntrinsicNode(
      IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
      VTList, Ops, M->getMemoryVT(),
      M->getMemOperand());
  if (!Unpacked) // Just adjusted the opcode.
    return Load;

  SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);

  return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
}
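// Example (unpacked D16 subtarget): a d16 buffer load of v4f16 is emitted as a
// v4i32 memory node here; adjustLoadValueTypeImpl then truncates each lane to
// i16 and bitcasts the rebuilt integer vector back to v4f16, merging the chain
// value back in with getMergeValues.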
SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
                                             SelectionDAG &DAG,
                                             ArrayRef<SDValue> Ops) const {
  SDLoc DL(M);
  EVT LoadVT = M->getValueType(0);
  EVT EltType = LoadVT.getScalarType();
  EVT IntVT = LoadVT.changeTypeToInteger();

  bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);

  unsigned Opc =
    IsFormat ? AMDGPUISD::BUFFER_LOAD_FORMAT : AMDGPUISD::BUFFER_LOAD;

  if (IsD16) {
    return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
  }

  // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
  if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);

  if (isTypeLegal(LoadVT)) {
    return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);
  }

  EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
  SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
                                        M->getMemOperand(), DAG);
  return DAG.getMergeValues(
      {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
      DL);
}
lowerICMPIntrinsic(const SITargetLowering
&TLI
,
4163 SDNode
*N
, SelectionDAG
&DAG
) {
4164 EVT VT
= N
->getValueType(0);
4165 const auto *CD
= cast
<ConstantSDNode
>(N
->getOperand(3));
4166 int CondCode
= CD
->getSExtValue();
4167 if (CondCode
< ICmpInst::Predicate::FIRST_ICMP_PREDICATE
||
4168 CondCode
> ICmpInst::Predicate::LAST_ICMP_PREDICATE
)
4169 return DAG
.getUNDEF(VT
);
4171 ICmpInst::Predicate IcInput
= static_cast<ICmpInst::Predicate
>(CondCode
);
4173 SDValue LHS
= N
->getOperand(1);
4174 SDValue RHS
= N
->getOperand(2);
4178 EVT CmpVT
= LHS
.getValueType();
4179 if (CmpVT
== MVT::i16
&& !TLI
.isTypeLegal(MVT::i16
)) {
4180 unsigned PromoteOp
= ICmpInst::isSigned(IcInput
) ?
4181 ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND
;
4182 LHS
= DAG
.getNode(PromoteOp
, DL
, MVT::i32
, LHS
);
4183 RHS
= DAG
.getNode(PromoteOp
, DL
, MVT::i32
, RHS
);
4186 ISD::CondCode CCOpcode
= getICmpCondCode(IcInput
);
4188 unsigned WavefrontSize
= TLI
.getSubtarget()->getWavefrontSize();
4189 EVT CCVT
= EVT::getIntegerVT(*DAG
.getContext(), WavefrontSize
);
4191 SDValue SetCC
= DAG
.getNode(AMDGPUISD::SETCC
, DL
, CCVT
, LHS
, RHS
,
4192 DAG
.getCondCode(CCOpcode
));
4193 if (VT
.bitsEq(CCVT
))
4195 return DAG
.getZExtOrTrunc(SetCC
, DL
, VT
);
4198 static SDValue
lowerFCMPIntrinsic(const SITargetLowering
&TLI
,
4199 SDNode
*N
, SelectionDAG
&DAG
) {
4200 EVT VT
= N
->getValueType(0);
4201 const auto *CD
= cast
<ConstantSDNode
>(N
->getOperand(3));
4203 int CondCode
= CD
->getSExtValue();
4204 if (CondCode
< FCmpInst::Predicate::FIRST_FCMP_PREDICATE
||
4205 CondCode
> FCmpInst::Predicate::LAST_FCMP_PREDICATE
) {
4206 return DAG
.getUNDEF(VT
);
4209 SDValue Src0
= N
->getOperand(1);
4210 SDValue Src1
= N
->getOperand(2);
4211 EVT CmpVT
= Src0
.getValueType();
4214 if (CmpVT
== MVT::f16
&& !TLI
.isTypeLegal(CmpVT
)) {
4215 Src0
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, Src0
);
4216 Src1
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, Src1
);
4219 FCmpInst::Predicate IcInput
= static_cast<FCmpInst::Predicate
>(CondCode
);
4220 ISD::CondCode CCOpcode
= getFCmpCondCode(IcInput
);
4221 unsigned WavefrontSize
= TLI
.getSubtarget()->getWavefrontSize();
4222 EVT CCVT
= EVT::getIntegerVT(*DAG
.getContext(), WavefrontSize
);
4223 SDValue SetCC
= DAG
.getNode(AMDGPUISD::SETCC
, SL
, CCVT
, Src0
,
4224 Src1
, DAG
.getCondCode(CCOpcode
));
4225 if (VT
.bitsEq(CCVT
))
4227 return DAG
.getZExtOrTrunc(SetCC
, SL
, VT
);
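// Example: llvm.amdgcn.icmp with i16 operands on a target without legal i16 is
// widened to a 32-bit compare (sign- or zero-extending based on the
// predicate); the wave-sized AMDGPUISD::SETCC result is then zero-extended or
// truncated whenever the requested return type differs from the wave width.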
void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                          SmallVectorImpl<SDValue> &Results,
                                          SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::INSERT_VECTOR_ELT: {
    if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
      Results.push_back(Res);
    return;
  }
  case ISD::EXTRACT_VECTOR_ELT: {
    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
      Results.push_back(Res);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    switch (IID) {
    case Intrinsic::amdgcn_cvt_pkrtz: {
      SDValue Src0 = N->getOperand(1);
      SDValue Src1 = N->getOperand(2);
      SDLoc SL(N);
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
                                Src0, Src1);
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
      return;
    }
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      SDValue Src0 = N->getOperand(1);
      SDValue Src1 = N->getOperand(2);
      SDLoc SL(N);
      unsigned Opcode;

      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
        Opcode = AMDGPUISD::CVT_PK_I16_I32;
      else
        Opcode = AMDGPUISD::CVT_PK_U16_U32;

      EVT VT = N->getValueType(0);
      if (isTypeLegal(VT))
        Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
      else {
        SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
        Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
      }
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
      if (Res.getOpcode() == ISD::MERGE_VALUES) {
        Results.push_back(Res.getOperand(0));
        Results.push_back(Res.getOperand(1));
      } else {
        Results.push_back(Res);
        Results.push_back(Res.getValue(1));
      }
      return;
    }

    break;
  }
  case ISD::SELECT: {
    SDLoc SL(N);
    EVT VT = N->getValueType(0);
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
    SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));

    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
      RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
      SelectVT = MVT::i32;
    }

    SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
                                    N->getOperand(0), LHS, RHS);

    if (NewVT != SelectVT)
      NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
    return;
  }
  case ISD::FNEG: {
    if (N->getValueType(0) != MVT::v2f16)
      break;

    SDLoc SL(N);
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

    SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
                             BC,
                             DAG.getConstant(0x80008000, SL, MVT::i32));
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    return;
  }
  case ISD::FABS: {
    if (N->getValueType(0) != MVT::v2f16)
      break;

    SDLoc SL(N);
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

    SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
                             BC,
                             DAG.getConstant(0x7fff7fff, SL, MVT::i32));
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    return;
  }
  default:
    break;
  }
}
/// Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {

  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return nullptr;
}

unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
    case Intrinsic::amdgcn_if:
      return AMDGPUISD::IF;
    case Intrinsic::amdgcn_else:
      return AMDGPUISD::ELSE;
    case Intrinsic::amdgcn_loop:
      return AMDGPUISD::LOOP;
    case Intrinsic::amdgcn_end_cf:
      llvm_unreachable("should not occur");
    default:
      return 0;
    }
  }

  // break, if_break, else_break are all only used as inputs to loop, not
  // directly as branch conditions.
  return 0;
}
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
  const Triple &TT = getTargetMachine().getTargetTriple();
  return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         AMDGPU::shouldEmitConstantsToTextSection(TT);
}

bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
  // FIXME: Either avoid relying on address space here or change the default
  // address space for functions to avoid the explicit check.
  return (GV->getValueType()->isFunctionTy() ||
          GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         !shouldEmitFixup(GV) &&
         !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}

bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
}
/// This transforms the control flow intrinsics to get the branch destination
/// as the last parameter, and also switches the branch target with BR if the
/// need arises.
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                      SelectionDAG &DAG) const {
  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = nullptr;
  SDNode *SetCC = nullptr;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    SetCC = Intr;
    Intr = SetCC->getOperand(0).getNode();

  } else {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    Target = BR->getOperand(1);
  }

  // FIXME: This changes the types of the intrinsics instead of introducing new
  // nodes with the correct types.
  // e.g. llvm.amdgcn.loop

  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
  // =>     t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>

  unsigned CFNode = isCFIntrinsic(Intr);
  if (CFNode == 0) {
    // This is a uniform branch so we don't need to legalize.
    return BRCOND;
  }

  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;

  assert(!SetCC ||
        (SetCC->getConstantOperandVal(1) == 1 &&
         cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
                                                             ISD::SETNE));

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  if (HaveChain)
    Ops.push_back(BRCOND.getOperand(0));

  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
  Ops.push_back(Target);

  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();

  if (!HaveChain) {
    SDValue Ops[] = {
      SDValue(Result, 0),
      BRCOND.getOperand(0)
    };

    Result = DAG.getMergeValues(Ops, DL).getNode();
  }

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {
      BR->getOperand(0),
      BRCOND.getOperand(2)
    };
    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
    BR = NewBR.getNode();
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(
      Chain, DL,
      CopyToReg->getOperand(1),
      SDValue(Result, i - 1),
      SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(
    SDValue(Intr, Intr->getNumValues() - 1),
    Intr->getOperand(0));

  return Chain;
}
SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
                                          SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  // Checking the depth
  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0)
    return DAG.getConstant(0, DL, VT);

  MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // Check for kernel and shader functions
  if (Info->isEntryFunction())
    return DAG.getConstant(0, DL, VT);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  // Get the return address reg and mark it as an implicit live-in
  unsigned Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                              getRegClassFor(VT, Op.getNode()->isDivergent()));

  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}
SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
                                            SDValue Op,
                                            const SDLoc &DL,
                                            EVT VT) const {
  return Op.getValueType().bitsLE(VT) ?
      DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
      DAG.getNode(ISD::FTRUNC, DL, VT, Op);
}

SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");

  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)
    return Op;

  SDLoc DL(Op);

  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
}
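// Example: (fp_round f64 %x to f16) becomes FP_TO_FP16 (an i32 holding the
// half bits), which is then truncated to i16 and bitcast to f16; non-f64
// sources are returned unmodified and handled by the normal legalization.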
SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
                                               SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;

  // FIXME: Assert during selection that this is only selected for
  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
  // mode functions, but this happens to be OK since it's only done in cases
  // where there is known no sNaN.
  if (IsIEEEMode)
    return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);

  if (VT == MVT::v4f16)
    return splitBinaryVectorOp(Op, DAG);
  return Op;
}
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);

  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !Subtarget->isTrapHandlerEnabled())
    return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
  assert(UserSGPR != AMDGPU::NoRegister);
  SDValue QueuePtr = CreateLiveInRegister(
    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
                                   QueuePtr, SDValue());
  SDValue Ops[] = {
    ToReg,
    DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
    SGPR01,
    ToReg.getValue(1)
  };
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}

SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);
  MachineFunction &MF = DAG.getMachineFunction();

  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !Subtarget->isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
                                     "debugtrap handler not supported",
                                     Op.getDebugLoc(),
                                     DS_Warning);
    LLVMContext &Ctx = MF.getFunction().getContext();
    Ctx.diagnose(NoTrap);
    return Chain;
  }

  SDValue Ops[] = {
    Chain,
    DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
  };
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {
  // FIXME: Use inline constants (src_{shared, private}_base) instead.
  if (Subtarget->hasApertureRegs()) {
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
    SDValue ApertureReg = SDValue(
      DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
    SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
    return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
  assert(UserSGPR != AMDGPU::NoRegister);

  SDValue QueuePtr = CreateLiveInRegister(
    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
                                              AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
                     MinAlign(64, StructOffset),
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);
  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);

  SDValue Src = ASC->getOperand(0);
  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

  const AMDGPUTargetMachine &TM =
    static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

  // flat -> local/private
  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
    unsigned DestAS = ASC->getDestAddressSpace();

    if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
      unsigned NullVal = TM.getNullPointerValue(DestAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

      return DAG.getNode(ISD::SELECT, SL, MVT::i32,
                         NonNull, Ptr, SegmentNullPtr);
    }
  }

  // local/private -> flat
  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
    unsigned SrcAS = ASC->getSrcAddressSpace();

    if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
      unsigned NullVal = TM.getNullPointerValue(SrcAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);

      SDValue NonNull
        = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
      SDValue CvtPtr
        = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);

      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
                         DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
                         FlatNullPtr);
    }
  }

  // global <-> flat are no-ops and never emitted.

  const MachineFunction &MF = DAG.getMachineFunction();
  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
    MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
  DAG.getContext()->diagnose(InvalidAddrSpaceCast);

  return DAG.getUNDEF(ASC->getValueType(0));
}
// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
// the small vector and inserting them into the big vector. That is better than
// the default expansion of doing it via a stack slot. Even though the use of
// the stack slot would be optimized away afterwards, the stack slot itself
// remains.
SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDValue Ins = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT InsVT = Ins.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned InsNumElts = InsVT.getVectorNumElements();
  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  SDLoc SL(Op);

  for (unsigned I = 0; I != InsNumElts; ++I) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
                              DAG.getConstant(I, SL, MVT::i32));
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
                      DAG.getConstant(IdxVal + I, SL, MVT::i32));
  }
  return Vec;
}
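// Example: inserting a v2i16 subvector into a v4i16 at index 2 becomes two
// EXTRACT_VECTOR_ELT / INSERT_VECTOR_ELT pairs (targeting elements 2 and 3),
// avoiding the stack-slot based default expansion described above.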
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDValue InsVal = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();

  assert(VecSize <= 64);

  unsigned NumElts = VecVT.getVectorNumElements();
  SDLoc SL(Op);
  auto KIdx = dyn_cast<ConstantSDNode>(Idx);

  if (NumElts == 4 && EltSize == 16 && KIdx) {
    SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);

    SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(0, SL, MVT::i32));
    SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(1, SL, MVT::i32));

    SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
    SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
    SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
      InsertLo ? LoVec : HiVec,
      DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
      DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));

    InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);

    SDValue Concat = InsertLo ?
      DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
      DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });

    return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
  }

  if (isa<ConstantSDNode>(Idx))
    return SDValue();

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // Avoid stack access for dynamic indexing.
  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec

  // Create a congruent vector with the target value in each element so that
  // the required element can be masked and ORed into the target vector.
  SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
                               DAG.getSplatBuildVector(VecVT, SL, InsVal));

  assert(isPowerOf2_32(EltSize));
  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);

  // Convert vector index to bit-index.
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);

  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
                            DAG.getConstant(0xffff, SL, IntVT),
                            ScaledIdx);

  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
  SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
                            DAG.getNOT(SL, BFM, IntVT), BCVec);

  SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
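// Example (dynamic index into v4i16): the vector is treated as an i64, the
// index is scaled to a bit offset (idx * 16), a 16-bit-wide mask is shifted
// into place, and the splatted value is merged in with AND/OR -- the
// v_bfm/v_bfi pattern noted above -- before bitcasting back to v4i16.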
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);

  EVT ResultVT = Op.getValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  EVT VecVT = Vec.getValueType();
  unsigned VecSize = VecVT.getSizeInBits();
  EVT EltVT = VecVT.getVectorElementType();
  assert(VecSize <= 64);

  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.

  // XXX - Why doesn't this get called when vector_shuffle is expanded?
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    return Combined;

  unsigned EltSize = EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltSize));

  MVT IntVT = MVT::getIntegerVT(VecSize);
  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);

  // Convert vector index to bit-index (* EltSize)
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);

  if (ResultVT == MVT::f16) {
    SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
  }

  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}
static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
  assert(Elt % 2 == 0);
  return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
}

SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT ResultVT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);

  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
  EVT EltVT = PackVT.getVectorElementType();
  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

  // vector_shuffle <0,1,6,7> lhs, rhs
  // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
  //
  // vector_shuffle <6,7,2,3> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
  //
  // vector_shuffle <6,7,0,1> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)

  // Avoid scalarizing when both halves are reading from consecutive elements.
  SmallVector<SDValue, 4> Pieces;
  for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
    if (elementPairIsContiguous(SVN->getMask(), I)) {
      const int Idx = SVN->getMaskElt(I);
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
                                   PackVT, SVN->getOperand(VecIdx),
                                   DAG.getConstant(EltIdx, SL, MVT::i32));
      Pieces.push_back(SubVec);
    } else {
      const int Idx0 = SVN->getMaskElt(I);
      const int Idx1 = SVN->getMaskElt(I + 1);
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

      SDValue Vec0 = SVN->getOperand(VecIdx0);
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));

      SDValue Vec1 = SVN->getOperand(VecIdx1);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
      Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
    }
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
}
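
// lowerBUILD_VECTOR: build v4i16/v4f16 as two packed 32-bit halves, and
// v2i16/v2f16 with 32-bit shift/or sequences when packed (VOP3P) instructions
// are not available.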
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
    EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);

    // Turn into pair of packed build_vectors.
    // TODO: Special case for constants that can be materialized with s_mov_b64.
    SDValue Lo = DAG.getBuildVector(HalfVT, SL,
                                    { Op.getOperand(0), Op.getOperand(1) });
    SDValue Hi = DAG.getBuildVector(HalfVT, SL,
                                    { Op.getOperand(2), Op.getOperand(3) });

    SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
    SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);

    SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
    return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
  }

  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
  assert(!Subtarget->hasVOP3PInsts() && "this should be legal");

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);

  // Avoid adding defined bits with the zero_extend.
  if (Hi.isUndef()) {
    Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
    SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
    return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
  }

  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);

  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
                              DAG.getConstant(16, SL, MVT::i32));
  if (Lo.isUndef())
    return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);

  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);

  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
  return DAG.getNode(ISD::BITCAST, SL, VT, Or);
}
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // We can fold offsets for anything that doesn't require a GOT relocation.
  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         !shouldEmitGOTReloc(GA->getGlobal());
}

static SDValue
buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
                        const SDLoc &DL, unsigned Offset, EVT PtrVT,
                        unsigned GAFlags = SIInstrInfo::MO_NONE) {
  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.
  unsigned LoFlags = GAFlags;
  if (LoFlags == SIInstrInfo::MO_NONE)
    LoFlags = SIInstrInfo::MO_REL32;
  SDValue PtrLo =
      DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, LoFlags);
  SDValue PtrHi;
  if (GAFlags == SIInstrInfo::MO_NONE) {
    PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
  } else {
    PtrHi =
        DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags + 1);
  }
  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
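
// LowerGlobalAddress: pick the addressing strategy for a global: absolute
// 32-bit LDS addresses, a direct pc-relative fixup, a REL32 relocation, or an
// indirect load through the GOT.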
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GSD->getGlobal();
  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
       (!GV->hasExternalLinkage() ||
        getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
        getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
      GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

  SDLoc DL(GSD);
  EVT PtrVT = Op.getValueType();

  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
                                            SIInstrInfo::MO_ABS32_LO);
    return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
  }

  if (shouldEmitFixup(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
  else if (shouldEmitPCReloc(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
                                   SIInstrInfo::MO_REL32);

  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
                                            SIInstrInfo::MO_GOTPCREL32);

  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  const DataLayout &DataLayout = DAG.getDataLayout();
  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getGOT(DAG.getMachineFunction());

  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
                                   const SDLoc &DL, SDValue V) const {
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
  // the destination register.
  //
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
  // so we will end up with redundant moves to m0.
  //
  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
  //
  // A Null SDValue creates a glue result.
  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
                                  V, Chain);
  return SDValue(M0, 0);
}
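
// lowerImplicitZextParam: load a 16-bit implicit kernel argument as i32 and
// attach an AssertZext so later combines know the high bits are zero.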
SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
                                                 SDValue Op,
                                                 MVT VT,
                                                 unsigned Offset) const {
  SDLoc SL(Op);
  SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
                                           DAG.getEntryNode(), Offset, 4, false);
  // The local size values will have the hi 16-bits as zero.
  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                     DAG.getValueType(VT));
}
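
// Diagnostic helpers: report an intrinsic that is unsupported on the current
// target and fold its result to undef so lowering can continue.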
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                        EVT VT) {
  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
                                      "non-hsa intrinsic with hsa target",
                                      DL.getDebugLoc());
  DAG.getContext()->diagnose(BadIntrin);
  return DAG.getUNDEF(VT);
}

static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                         EVT VT) {
  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
                                      "intrinsic not supported on subtarget",
                                      DL.getDebugLoc());
  DAG.getContext()->diagnose(BadIntrin);
  return DAG.getUNDEF(VT);
}
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
                                    ArrayRef<SDValue> Elts) {
  assert(!Elts.empty());
  MVT Type;
  unsigned NumElts;

  if (Elts.size() == 1) {
    Type = MVT::f32;
    NumElts = 1;
  } else if (Elts.size() == 2) {
    Type = MVT::v2f32;
    NumElts = 2;
  } else if (Elts.size() <= 4) {
    Type = MVT::v4f32;
    NumElts = 4;
  } else if (Elts.size() <= 8) {
    Type = MVT::v8f32;
    NumElts = 8;
  } else {
    assert(Elts.size() <= 16);
    Type = MVT::v16f32;
    NumElts = 16;
  }

  SmallVector<SDValue, 16> VecElts(NumElts);
  for (unsigned i = 0; i < Elts.size(); ++i) {
    SDValue Elt = Elts[i];
    if (Elt.getValueType() != MVT::f32)
      Elt = DAG.getBitcast(MVT::f32, Elt);
    VecElts[i] = Elt;
  }
  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getUNDEF(MVT::f32);

  if (NumElts == 1)
    return VecElts[0];
  return DAG.getBuildVector(Type, DL, VecElts);
}
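
// The cachepolicy immediate packs one bit per cache control: bit 0 = glc,
// bit 1 = slc, and bit 2 = dlc (only consumed on gfx10). parseCachePolicy
// returns false if any unknown bits remain set.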
static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
                             SDValue *GLC, SDValue *SLC, SDValue *DLC) {
  auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());

  uint64_t Value = CachePolicyConst->getZExtValue();
  SDLoc DL(CachePolicy);
  if (GLC) {
    *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
    Value &= ~(uint64_t)0x1;
  }
  if (SLC) {
    *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
    Value &= ~(uint64_t)0x2;
  }
  if (DLC) {
    *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
    Value &= ~(uint64_t)0x4;
  }

  return Value == 0;
}
// Re-construct the required return value for a image load intrinsic.
// This is more complicated due to the optional use TexFailCtrl which means the required
// return type is an aggregate
static SDValue constructRetValue(SelectionDAG &DAG,
                                 MachineSDNode *Result,
                                 ArrayRef<EVT> ResultTypes,
                                 bool IsTexFail, bool Unpacked, bool IsD16,
                                 int DMaskPop, int NumVDataDwords,
                                 const SDLoc &DL, LLVMContext &Context) {
  // Determine the required return type. This is the same regardless of IsTexFail flag
  EVT ReqRetVT = ResultTypes[0];
  EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
  EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
  EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
                                           : AdjEltVT
                       : ReqRetVT;

  // Extract data part of the result
  // Bitcast the result to the same type as the required return type
  int NumElts;
  if (IsD16 && !Unpacked)
    NumElts = NumVDataDwords << 1;
  else
    NumElts = NumVDataDwords;

  EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
                           : AdjEltVT;

  // Special case for v6f16. Rather than add support for this, use v3i32 to
  // extract the data elements
  bool V6F16Special = false;
  if (NumElts == 6) {
    CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
    DMaskPop >>= 1;
    ReqRetNumElts >>= 1;
    V6F16Special = true;
    AdjVT = MVT::v2i32;
  }

  SDValue N = SDValue(Result, 0);
  SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);

  // Iterate over the result
  SmallVector<SDValue, 4> BVElts;

  if (CastVT.isVector()) {
    DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
  } else {
    BVElts.push_back(CastRes);
  }
  int ExtraElts = ReqRetNumElts - DMaskPop;
  while (ExtraElts--)
    BVElts.push_back(DAG.getUNDEF(AdjEltVT));

  SDValue PreTFCRes;
  if (ReqRetNumElts > 1) {
    SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
    if (IsD16 && Unpacked)
      PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
    else
      PreTFCRes = NewVec;
  } else {
    PreTFCRes = BVElts[0];
  }

  if (V6F16Special)
    PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);

  if (!IsTexFail) {
    if (Result->getNumValues() > 1)
      return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
    else
      return PreTFCRes;
  }

  // Extract the TexFail result and insert into aggregate return
  SmallVector<SDValue, 1> TFCElt;
  DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
  SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
  return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
}
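
// texfailctrl packs bit 0 = tfe and bit 1 = lwe; IsTexFail is set whenever
// either control is requested. parseTexFail returns false if unknown bits
// remain set.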
static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
                         SDValue *LWE, bool &IsTexFail) {
  auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());

  uint64_t Value = TexFailCtrlConst->getZExtValue();
  if (Value) {
    IsTexFail = true;
  }

  SDLoc DL(TexFailCtrlConst);
  *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
  Value &= ~(uint64_t)0x1;
  *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
  Value &= ~(uint64_t)0x2;

  return Value == 0;
}
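
// lowerImage: lower an image (MIMG) dim intrinsic by assembling the vdata and
// vaddr operands, resolving dmask, unorm, cachepolicy and texfailctrl, and
// selecting the MIMG machine opcode for the subtarget's encoding.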
5299 SDValue
SITargetLowering::lowerImage(SDValue Op
,
5300 const AMDGPU::ImageDimIntrinsicInfo
*Intr
,
5301 SelectionDAG
&DAG
) const {
5303 MachineFunction
&MF
= DAG
.getMachineFunction();
5304 const GCNSubtarget
* ST
= &MF
.getSubtarget
<GCNSubtarget
>();
5305 const AMDGPU::MIMGBaseOpcodeInfo
*BaseOpcode
=
5306 AMDGPU::getMIMGBaseOpcodeInfo(Intr
->BaseOpcode
);
5307 const AMDGPU::MIMGDimInfo
*DimInfo
= AMDGPU::getMIMGDimInfo(Intr
->Dim
);
5308 const AMDGPU::MIMGLZMappingInfo
*LZMappingInfo
=
5309 AMDGPU::getMIMGLZMappingInfo(Intr
->BaseOpcode
);
5310 const AMDGPU::MIMGMIPMappingInfo
*MIPMappingInfo
=
5311 AMDGPU::getMIMGMIPMappingInfo(Intr
->BaseOpcode
);
5312 unsigned IntrOpcode
= Intr
->BaseOpcode
;
5313 bool IsGFX10
= Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
;
5315 SmallVector
<EVT
, 3> ResultTypes(Op
->value_begin(), Op
->value_end());
5316 SmallVector
<EVT
, 3> OrigResultTypes(Op
->value_begin(), Op
->value_end());
5321 bool AdjustRetType
= false;
5323 unsigned AddrIdx
; // Index of first address argument
5325 unsigned DMaskLanes
= 0;
5327 if (BaseOpcode
->Atomic
) {
5328 VData
= Op
.getOperand(2);
5330 bool Is64Bit
= VData
.getValueType() == MVT::i64
;
5331 if (BaseOpcode
->AtomicX2
) {
5332 SDValue VData2
= Op
.getOperand(3);
5333 VData
= DAG
.getBuildVector(Is64Bit
? MVT::v2i64
: MVT::v2i32
, DL
,
5336 VData
= DAG
.getBitcast(MVT::v4i32
, VData
);
5338 ResultTypes
[0] = Is64Bit
? MVT::v2i64
: MVT::v2i32
;
5339 DMask
= Is64Bit
? 0xf : 0x3;
5340 NumVDataDwords
= Is64Bit
? 4 : 2;
5343 DMask
= Is64Bit
? 0x3 : 0x1;
5344 NumVDataDwords
= Is64Bit
? 2 : 1;
5348 unsigned DMaskIdx
= BaseOpcode
->Store
? 3 : isa
<MemSDNode
>(Op
) ? 2 : 1;
5349 auto DMaskConst
= cast
<ConstantSDNode
>(Op
.getOperand(DMaskIdx
));
5350 DMask
= DMaskConst
->getZExtValue();
5351 DMaskLanes
= BaseOpcode
->Gather4
? 4 : countPopulation(DMask
);
5353 if (BaseOpcode
->Store
) {
5354 VData
= Op
.getOperand(2);
5356 MVT StoreVT
= VData
.getSimpleValueType();
5357 if (StoreVT
.getScalarType() == MVT::f16
) {
5358 if (!Subtarget
->hasD16Images() || !BaseOpcode
->HasD16
)
5359 return Op
; // D16 is unsupported for this instruction
5362 VData
= handleD16VData(VData
, DAG
);
5365 NumVDataDwords
= (VData
.getValueType().getSizeInBits() + 31) / 32;
5367 // Work out the num dwords based on the dmask popcount and underlying type
5368 // and whether packing is supported.
5369 MVT LoadVT
= ResultTypes
[0].getSimpleVT();
5370 if (LoadVT
.getScalarType() == MVT::f16
) {
5371 if (!Subtarget
->hasD16Images() || !BaseOpcode
->HasD16
)
5372 return Op
; // D16 is unsupported for this instruction
5377 // Confirm that the return type is large enough for the dmask specified
5378 if ((LoadVT
.isVector() && LoadVT
.getVectorNumElements() < DMaskLanes
) ||
5379 (!LoadVT
.isVector() && DMaskLanes
> 1))
5382 if (IsD16
&& !Subtarget
->hasUnpackedD16VMem())
5383 NumVDataDwords
= (DMaskLanes
+ 1) / 2;
5385 NumVDataDwords
= DMaskLanes
;
5387 AdjustRetType
= true;
5390 AddrIdx
= DMaskIdx
+ 1;
5393 unsigned NumGradients
= BaseOpcode
->Gradients
? DimInfo
->NumGradients
: 0;
5394 unsigned NumCoords
= BaseOpcode
->Coordinates
? DimInfo
->NumCoords
: 0;
5395 unsigned NumLCM
= BaseOpcode
->LodOrClampOrMip
? 1 : 0;
5396 unsigned NumVAddrs
= BaseOpcode
->NumExtraArgs
+ NumGradients
+
5398 unsigned NumMIVAddrs
= NumVAddrs
;
5400 SmallVector
<SDValue
, 4> VAddrs
;
5402 // Optimize _L to _LZ when _L is zero
5403 if (LZMappingInfo
) {
5404 if (auto ConstantLod
=
5405 dyn_cast
<ConstantFPSDNode
>(Op
.getOperand(AddrIdx
+NumVAddrs
-1))) {
5406 if (ConstantLod
->isZero() || ConstantLod
->isNegative()) {
5407 IntrOpcode
= LZMappingInfo
->LZ
; // set new opcode to _lz variant of _l
5408 NumMIVAddrs
--; // remove 'lod'
5413 // Optimize _mip away, when 'lod' is zero
5414 if (MIPMappingInfo
) {
5415 if (auto ConstantLod
=
5416 dyn_cast
<ConstantSDNode
>(Op
.getOperand(AddrIdx
+NumVAddrs
-1))) {
5417 if (ConstantLod
->isNullValue()) {
5418 IntrOpcode
= MIPMappingInfo
->NONMIP
; // set new opcode to variant without _mip
5419 NumMIVAddrs
--; // remove 'lod'
5424 // Check for 16 bit addresses and pack if true.
5425 unsigned DimIdx
= AddrIdx
+ BaseOpcode
->NumExtraArgs
;
5426 MVT VAddrVT
= Op
.getOperand(DimIdx
).getSimpleValueType();
5427 const MVT VAddrScalarVT
= VAddrVT
.getScalarType();
5428 if (((VAddrScalarVT
== MVT::f16
) || (VAddrScalarVT
== MVT::i16
)) &&
5429 ST
->hasFeature(AMDGPU::FeatureR128A16
)) {
5431 const MVT VectorVT
= VAddrScalarVT
== MVT::f16
? MVT::v2f16
: MVT::v2i16
;
5432 for (unsigned i
= AddrIdx
; i
< (AddrIdx
+ NumMIVAddrs
); ++i
) {
5433 SDValue AddrLo
, AddrHi
;
5434 // Push back extra arguments.
5436 AddrLo
= Op
.getOperand(i
);
5438 AddrLo
= Op
.getOperand(i
);
5439 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
5440 // in 1D, derivatives dx/dh and dx/dv are packed with undef.
5441 if (((i
+ 1) >= (AddrIdx
+ NumMIVAddrs
)) ||
5442 ((NumGradients
/ 2) % 2 == 1 &&
5443 (i
== DimIdx
+ (NumGradients
/ 2) - 1 ||
5444 i
== DimIdx
+ NumGradients
- 1))) {
5445 AddrHi
= DAG
.getUNDEF(MVT::f16
);
5447 AddrHi
= Op
.getOperand(i
+ 1);
5450 AddrLo
= DAG
.getNode(ISD::SCALAR_TO_VECTOR
, DL
, VectorVT
,
5452 AddrLo
= DAG
.getBitcast(MVT::i32
, AddrLo
);
5454 VAddrs
.push_back(AddrLo
);
5457 for (unsigned i
= 0; i
< NumMIVAddrs
; ++i
)
5458 VAddrs
.push_back(Op
.getOperand(AddrIdx
+ i
));
5461 // If the register allocator cannot place the address registers contiguously
5462 // without introducing moves, then using the non-sequential address encoding
5463 // is always preferable, since it saves VALU instructions and is usually a
5464 // wash in terms of code size or even better.
5466 // However, we currently have no way of hinting to the register allocator that
5467 // MIMG addresses should be placed contiguously when it is possible to do so,
5468 // so force non-NSA for the common 2-address case as a heuristic.
5470 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5471 // allocation when possible.
5473 ST
->hasFeature(AMDGPU::FeatureNSAEncoding
) && VAddrs
.size() >= 3;
5476 VAddr
= getBuildDwordsVector(DAG
, DL
, VAddrs
);
5478 SDValue True
= DAG
.getTargetConstant(1, DL
, MVT::i1
);
5479 SDValue False
= DAG
.getTargetConstant(0, DL
, MVT::i1
);
5480 unsigned CtrlIdx
; // Index of texfailctrl argument
5482 if (!BaseOpcode
->Sampler
) {
5484 CtrlIdx
= AddrIdx
+ NumVAddrs
+ 1;
5487 cast
<ConstantSDNode
>(Op
.getOperand(AddrIdx
+ NumVAddrs
+ 2));
5489 Unorm
= UnormConst
->getZExtValue() ? True
: False
;
5490 CtrlIdx
= AddrIdx
+ NumVAddrs
+ 3;
5495 SDValue TexFail
= Op
.getOperand(CtrlIdx
);
5496 bool IsTexFail
= false;
5497 if (!parseTexFail(TexFail
, DAG
, &TFE
, &LWE
, IsTexFail
))
5502 // Expecting to get an error flag since TFC is on - and dmask is 0
5503 // Force dmask to be at least 1 otherwise the instruction will fail
5508 NumVDataDwords
+= 1;
5509 AdjustRetType
= true;
5512 // Has something earlier tagged that the return type needs adjusting
5513 // This happens if the instruction is a load or has set TexFailCtrl flags
5514 if (AdjustRetType
) {
5515 // NumVDataDwords reflects the true number of dwords required in the return type
5516 if (DMaskLanes
== 0 && !BaseOpcode
->Store
) {
5517 // This is a no-op load. This can be eliminated
5518 SDValue Undef
= DAG
.getUNDEF(Op
.getValueType());
5519 if (isa
<MemSDNode
>(Op
))
5520 return DAG
.getMergeValues({Undef
, Op
.getOperand(0)}, DL
);
5524 EVT NewVT
= NumVDataDwords
> 1 ?
5525 EVT::getVectorVT(*DAG
.getContext(), MVT::f32
, NumVDataDwords
)
5528 ResultTypes
[0] = NewVT
;
5529 if (ResultTypes
.size() == 3) {
5530 // Original result was aggregate type used for TexFailCtrl results
5531 // The actual instruction returns as a vector type which has now been
5532 // created. Remove the aggregate result.
5533 ResultTypes
.erase(&ResultTypes
[1]);
5540 if (BaseOpcode
->Atomic
) {
5541 GLC
= True
; // TODO no-return optimization
5542 if (!parseCachePolicy(Op
.getOperand(CtrlIdx
+ 1), DAG
, nullptr, &SLC
,
5543 IsGFX10
? &DLC
: nullptr))
5546 if (!parseCachePolicy(Op
.getOperand(CtrlIdx
+ 1), DAG
, &GLC
, &SLC
,
5547 IsGFX10
? &DLC
: nullptr))
5551 SmallVector
<SDValue
, 26> Ops
;
5552 if (BaseOpcode
->Store
|| BaseOpcode
->Atomic
)
5553 Ops
.push_back(VData
); // vdata
5555 for (const SDValue
&Addr
: VAddrs
)
5556 Ops
.push_back(Addr
);
5558 Ops
.push_back(VAddr
);
5560 Ops
.push_back(Op
.getOperand(AddrIdx
+ NumVAddrs
)); // rsrc
5561 if (BaseOpcode
->Sampler
)
5562 Ops
.push_back(Op
.getOperand(AddrIdx
+ NumVAddrs
+ 1)); // sampler
5563 Ops
.push_back(DAG
.getTargetConstant(DMask
, DL
, MVT::i32
));
5565 Ops
.push_back(DAG
.getTargetConstant(DimInfo
->Encoding
, DL
, MVT::i32
));
5566 Ops
.push_back(Unorm
);
5571 Ops
.push_back(IsA16
&& // a16 or r128
5572 ST
->hasFeature(AMDGPU::FeatureR128A16
) ? True
: False
);
5573 Ops
.push_back(TFE
); // tfe
5574 Ops
.push_back(LWE
); // lwe
5576 Ops
.push_back(DimInfo
->DA
? True
: False
);
5577 if (BaseOpcode
->HasD16
)
5578 Ops
.push_back(IsD16
? True
: False
);
5579 if (isa
<MemSDNode
>(Op
))
5580 Ops
.push_back(Op
.getOperand(0)); // chain
5582 int NumVAddrDwords
=
5583 UseNSA
? VAddrs
.size() : VAddr
.getValueType().getSizeInBits() / 32;
5587 Opcode
= AMDGPU::getMIMGOpcode(IntrOpcode
,
5588 UseNSA
? AMDGPU::MIMGEncGfx10NSA
5589 : AMDGPU::MIMGEncGfx10Default
,
5590 NumVDataDwords
, NumVAddrDwords
);
5592 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5593 Opcode
= AMDGPU::getMIMGOpcode(IntrOpcode
, AMDGPU::MIMGEncGfx8
,
5594 NumVDataDwords
, NumVAddrDwords
);
5596 Opcode
= AMDGPU::getMIMGOpcode(IntrOpcode
, AMDGPU::MIMGEncGfx6
,
5597 NumVDataDwords
, NumVAddrDwords
);
5599 assert(Opcode
!= -1);
5601 MachineSDNode
*NewNode
= DAG
.getMachineNode(Opcode
, DL
, ResultTypes
, Ops
);
5602 if (auto MemOp
= dyn_cast
<MemSDNode
>(Op
)) {
5603 MachineMemOperand
*MemRef
= MemOp
->getMemOperand();
5604 DAG
.setNodeMemRefs(NewNode
, {MemRef
});
5607 if (BaseOpcode
->AtomicX2
) {
5608 SmallVector
<SDValue
, 1> Elt
;
5609 DAG
.ExtractVectorElements(SDValue(NewNode
, 0), Elt
, 0, 1);
5610 return DAG
.getMergeValues({Elt
[0], SDValue(NewNode
, 1)}, DL
);
5611 } else if (!BaseOpcode
->Store
) {
5612 return constructRetValue(DAG
, NewNode
,
5613 OrigResultTypes
, IsTexFail
,
5614 Subtarget
->hasUnpackedD16VMem(), IsD16
,
5615 DMaskLanes
, NumVDataDwords
, DL
,
5619 return SDValue(NewNode
, 0);
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
                                       SDValue Offset, SDValue GLC, SDValue DLC,
                                       SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      VT.getStoreSize(), VT.getStoreSize());

  if (!Offset->isDivergent()) {
    SDValue Ops[] = {
        Rsrc,
        Offset, // Offset
        GLC,
        DLC,
    };
    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
                                   DAG.getVTList(VT), Ops, VT, MMO);
  }

  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.
  SmallVector<SDValue, 4> Loads;
  unsigned NumLoads = 1;
  MVT LoadVT = VT.getSimpleVT();
  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
  assert((LoadVT.getScalarType() == MVT::i32 ||
          LoadVT.getScalarType() == MVT::f32) &&
         isPowerOf2_32(NumElts));

  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts == 16 ? 4 : 2;
    LoadVT = MVT::v4i32;
  }

  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
  unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
  SDValue Ops[] = {
      DAG.getEntryNode(),                         // Chain
      Rsrc,                                       // rsrc
      DAG.getConstant(0, DL, MVT::i32),           // vindex
      SDValue(),                                  // voffset -- will be set by setBufferOffsets
      SDValue(),                                  // soffset -- will be set by setBufferOffsets
      SDValue(),                                  // offset -- will be set by setBufferOffsets
      DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1),            // idxen
  };

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);

  uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
  for (unsigned i = 0; i < NumLoads; ++i) {
    Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
    Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
                                            Ops, LoadVT, MMO));
  }

  if (VT == MVT::v8i32 || VT == MVT::v16i32)
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);

  return Loads[0];
}
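
// Lowering for intrinsics without a chain: each case either expands to the
// corresponding AMDGPU ISD node, reads a preloaded argument register, or
// emits a diagnostic for configurations the subtarget does not support.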
5688 SDValue
SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op
,
5689 SelectionDAG
&DAG
) const {
5690 MachineFunction
&MF
= DAG
.getMachineFunction();
5691 auto MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
5693 EVT VT
= Op
.getValueType();
5695 unsigned IntrinsicID
= cast
<ConstantSDNode
>(Op
.getOperand(0))->getZExtValue();
5697 // TODO: Should this propagate fast-math-flags?
5699 switch (IntrinsicID
) {
5700 case Intrinsic::amdgcn_implicit_buffer_ptr
: {
5701 if (getSubtarget()->isAmdHsaOrMesa(MF
.getFunction()))
5702 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5703 return getPreloadedValue(DAG
, *MFI
, VT
,
5704 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR
);
5706 case Intrinsic::amdgcn_dispatch_ptr
:
5707 case Intrinsic::amdgcn_queue_ptr
: {
5708 if (!Subtarget
->isAmdHsaOrMesa(MF
.getFunction())) {
5709 DiagnosticInfoUnsupported
BadIntrin(
5710 MF
.getFunction(), "unsupported hsa intrinsic without hsa target",
5712 DAG
.getContext()->diagnose(BadIntrin
);
5713 return DAG
.getUNDEF(VT
);
5716 auto RegID
= IntrinsicID
== Intrinsic::amdgcn_dispatch_ptr
?
5717 AMDGPUFunctionArgInfo::DISPATCH_PTR
: AMDGPUFunctionArgInfo::QUEUE_PTR
;
5718 return getPreloadedValue(DAG
, *MFI
, VT
, RegID
);
5720 case Intrinsic::amdgcn_implicitarg_ptr
: {
5721 if (MFI
->isEntryFunction())
5722 return getImplicitArgPtr(DAG
, DL
);
5723 return getPreloadedValue(DAG
, *MFI
, VT
,
5724 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
);
5726 case Intrinsic::amdgcn_kernarg_segment_ptr
: {
5727 return getPreloadedValue(DAG
, *MFI
, VT
,
5728 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
);
5730 case Intrinsic::amdgcn_dispatch_id
: {
5731 return getPreloadedValue(DAG
, *MFI
, VT
, AMDGPUFunctionArgInfo::DISPATCH_ID
);
5733 case Intrinsic::amdgcn_rcp
:
5734 return DAG
.getNode(AMDGPUISD::RCP
, DL
, VT
, Op
.getOperand(1));
5735 case Intrinsic::amdgcn_rsq
:
5736 return DAG
.getNode(AMDGPUISD::RSQ
, DL
, VT
, Op
.getOperand(1));
5737 case Intrinsic::amdgcn_rsq_legacy
:
5738 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5739 return emitRemovedIntrinsicError(DAG
, DL
, VT
);
5741 return DAG
.getNode(AMDGPUISD::RSQ_LEGACY
, DL
, VT
, Op
.getOperand(1));
5742 case Intrinsic::amdgcn_rcp_legacy
:
5743 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5744 return emitRemovedIntrinsicError(DAG
, DL
, VT
);
5745 return DAG
.getNode(AMDGPUISD::RCP_LEGACY
, DL
, VT
, Op
.getOperand(1));
5746 case Intrinsic::amdgcn_rsq_clamp
: {
5747 if (Subtarget
->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5748 return DAG
.getNode(AMDGPUISD::RSQ_CLAMP
, DL
, VT
, Op
.getOperand(1));
5750 Type
*Type
= VT
.getTypeForEVT(*DAG
.getContext());
5751 APFloat Max
= APFloat::getLargest(Type
->getFltSemantics());
5752 APFloat Min
= APFloat::getLargest(Type
->getFltSemantics(), true);
5754 SDValue Rsq
= DAG
.getNode(AMDGPUISD::RSQ
, DL
, VT
, Op
.getOperand(1));
5755 SDValue Tmp
= DAG
.getNode(ISD::FMINNUM
, DL
, VT
, Rsq
,
5756 DAG
.getConstantFP(Max
, DL
, VT
));
5757 return DAG
.getNode(ISD::FMAXNUM
, DL
, VT
, Tmp
,
5758 DAG
.getConstantFP(Min
, DL
, VT
));
5760 case Intrinsic::r600_read_ngroups_x
:
5761 if (Subtarget
->isAmdHsaOS())
5762 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5764 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5765 SI::KernelInputOffsets::NGROUPS_X
, 4, false);
5766 case Intrinsic::r600_read_ngroups_y
:
5767 if (Subtarget
->isAmdHsaOS())
5768 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5770 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5771 SI::KernelInputOffsets::NGROUPS_Y
, 4, false);
5772 case Intrinsic::r600_read_ngroups_z
:
5773 if (Subtarget
->isAmdHsaOS())
5774 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5776 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5777 SI::KernelInputOffsets::NGROUPS_Z
, 4, false);
5778 case Intrinsic::r600_read_global_size_x
:
5779 if (Subtarget
->isAmdHsaOS())
5780 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5782 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5783 SI::KernelInputOffsets::GLOBAL_SIZE_X
, 4, false);
5784 case Intrinsic::r600_read_global_size_y
:
5785 if (Subtarget
->isAmdHsaOS())
5786 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5788 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5789 SI::KernelInputOffsets::GLOBAL_SIZE_Y
, 4, false);
5790 case Intrinsic::r600_read_global_size_z
:
5791 if (Subtarget
->isAmdHsaOS())
5792 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5794 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5795 SI::KernelInputOffsets::GLOBAL_SIZE_Z
, 4, false);
5796 case Intrinsic::r600_read_local_size_x
:
5797 if (Subtarget
->isAmdHsaOS())
5798 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5800 return lowerImplicitZextParam(DAG
, Op
, MVT::i16
,
5801 SI::KernelInputOffsets::LOCAL_SIZE_X
);
5802 case Intrinsic::r600_read_local_size_y
:
5803 if (Subtarget
->isAmdHsaOS())
5804 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5806 return lowerImplicitZextParam(DAG
, Op
, MVT::i16
,
5807 SI::KernelInputOffsets::LOCAL_SIZE_Y
);
5808 case Intrinsic::r600_read_local_size_z
:
5809 if (Subtarget
->isAmdHsaOS())
5810 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5812 return lowerImplicitZextParam(DAG
, Op
, MVT::i16
,
5813 SI::KernelInputOffsets::LOCAL_SIZE_Z
);
5814 case Intrinsic::amdgcn_workgroup_id_x
:
5815 case Intrinsic::r600_read_tgid_x
:
5816 return getPreloadedValue(DAG
, *MFI
, VT
,
5817 AMDGPUFunctionArgInfo::WORKGROUP_ID_X
);
5818 case Intrinsic::amdgcn_workgroup_id_y
:
5819 case Intrinsic::r600_read_tgid_y
:
5820 return getPreloadedValue(DAG
, *MFI
, VT
,
5821 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y
);
5822 case Intrinsic::amdgcn_workgroup_id_z
:
5823 case Intrinsic::r600_read_tgid_z
:
5824 return getPreloadedValue(DAG
, *MFI
, VT
,
5825 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
);
5826 case Intrinsic::amdgcn_workitem_id_x
:
5827 case Intrinsic::r600_read_tidig_x
:
5828 return loadInputValue(DAG
, &AMDGPU::VGPR_32RegClass
, MVT::i32
,
5829 SDLoc(DAG
.getEntryNode()),
5830 MFI
->getArgInfo().WorkItemIDX
);
5831 case Intrinsic::amdgcn_workitem_id_y
:
5832 case Intrinsic::r600_read_tidig_y
:
5833 return loadInputValue(DAG
, &AMDGPU::VGPR_32RegClass
, MVT::i32
,
5834 SDLoc(DAG
.getEntryNode()),
5835 MFI
->getArgInfo().WorkItemIDY
);
5836 case Intrinsic::amdgcn_workitem_id_z
:
5837 case Intrinsic::r600_read_tidig_z
:
5838 return loadInputValue(DAG
, &AMDGPU::VGPR_32RegClass
, MVT::i32
,
5839 SDLoc(DAG
.getEntryNode()),
5840 MFI
->getArgInfo().WorkItemIDZ
);
5841 case Intrinsic::amdgcn_wavefrontsize
:
5842 return DAG
.getConstant(MF
.getSubtarget
<GCNSubtarget
>().getWavefrontSize(),
5843 SDLoc(Op
), MVT::i32
);
5844 case Intrinsic::amdgcn_s_buffer_load
: {
5845 bool IsGFX10
= Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
;
5847 SDValue DLC
= DAG
.getTargetConstant(0, DL
, MVT::i1
);
5848 if (!parseCachePolicy(Op
.getOperand(3), DAG
, &GLC
, nullptr,
5849 IsGFX10
? &DLC
: nullptr))
5851 return lowerSBuffer(VT
, DL
, Op
.getOperand(1), Op
.getOperand(2), GLC
, DLC
,
5854 case Intrinsic::amdgcn_fdiv_fast
:
5855 return lowerFDIV_FAST(Op
, DAG
);
5856 case Intrinsic::amdgcn_interp_mov
: {
5857 SDValue M0
= copyToM0(DAG
, DAG
.getEntryNode(), DL
, Op
.getOperand(4));
5858 SDValue Glue
= M0
.getValue(1);
5859 return DAG
.getNode(AMDGPUISD::INTERP_MOV
, DL
, MVT::f32
, Op
.getOperand(1),
5860 Op
.getOperand(2), Op
.getOperand(3), Glue
);
5862 case Intrinsic::amdgcn_interp_p1
: {
5863 SDValue M0
= copyToM0(DAG
, DAG
.getEntryNode(), DL
, Op
.getOperand(4));
5864 SDValue Glue
= M0
.getValue(1);
5865 return DAG
.getNode(AMDGPUISD::INTERP_P1
, DL
, MVT::f32
, Op
.getOperand(1),
5866 Op
.getOperand(2), Op
.getOperand(3), Glue
);
5868 case Intrinsic::amdgcn_interp_p2
: {
5869 SDValue M0
= copyToM0(DAG
, DAG
.getEntryNode(), DL
, Op
.getOperand(5));
5870 SDValue Glue
= SDValue(M0
.getNode(), 1);
5871 return DAG
.getNode(AMDGPUISD::INTERP_P2
, DL
, MVT::f32
, Op
.getOperand(1),
5872 Op
.getOperand(2), Op
.getOperand(3), Op
.getOperand(4),
5875 case Intrinsic::amdgcn_interp_p1_f16
: {
5876 SDValue M0
= copyToM0(DAG
, DAG
.getEntryNode(), DL
, Op
.getOperand(5));
5877 SDValue Glue
= M0
.getValue(1);
5878 if (getSubtarget()->getLDSBankCount() == 16) {
5880 SDValue S
= DAG
.getNode(AMDGPUISD::INTERP_MOV
, DL
, MVT::f32
,
5881 DAG
.getConstant(2, DL
, MVT::i32
), // P0
5882 Op
.getOperand(2), // Attrchan
5883 Op
.getOperand(3), // Attr
5886 Op
.getOperand(1), // Src0
5887 Op
.getOperand(2), // Attrchan
5888 Op
.getOperand(3), // Attr
5889 DAG
.getConstant(0, DL
, MVT::i32
), // $src0_modifiers
5890 S
, // Src2 - holds two f16 values selected by high
5891 DAG
.getConstant(0, DL
, MVT::i32
), // $src2_modifiers
5892 Op
.getOperand(4), // high
5893 DAG
.getConstant(0, DL
, MVT::i1
), // $clamp
5894 DAG
.getConstant(0, DL
, MVT::i32
) // $omod
5896 return DAG
.getNode(AMDGPUISD::INTERP_P1LV_F16
, DL
, MVT::f32
, Ops
);
5900 Op
.getOperand(1), // Src0
5901 Op
.getOperand(2), // Attrchan
5902 Op
.getOperand(3), // Attr
5903 DAG
.getConstant(0, DL
, MVT::i32
), // $src0_modifiers
5904 Op
.getOperand(4), // high
5905 DAG
.getConstant(0, DL
, MVT::i1
), // $clamp
5906 DAG
.getConstant(0, DL
, MVT::i32
), // $omod
5909 return DAG
.getNode(AMDGPUISD::INTERP_P1LL_F16
, DL
, MVT::f32
, Ops
);
5912 case Intrinsic::amdgcn_interp_p2_f16
: {
5913 SDValue M0
= copyToM0(DAG
, DAG
.getEntryNode(), DL
, Op
.getOperand(6));
5914 SDValue Glue
= SDValue(M0
.getNode(), 1);
5916 Op
.getOperand(2), // Src0
5917 Op
.getOperand(3), // Attrchan
5918 Op
.getOperand(4), // Attr
5919 DAG
.getConstant(0, DL
, MVT::i32
), // $src0_modifiers
5920 Op
.getOperand(1), // Src2
5921 DAG
.getConstant(0, DL
, MVT::i32
), // $src2_modifiers
5922 Op
.getOperand(5), // high
5923 DAG
.getConstant(0, DL
, MVT::i1
), // $clamp
5926 return DAG
.getNode(AMDGPUISD::INTERP_P2_F16
, DL
, MVT::f16
, Ops
);
5928 case Intrinsic::amdgcn_sin
:
5929 return DAG
.getNode(AMDGPUISD::SIN_HW
, DL
, VT
, Op
.getOperand(1));
5931 case Intrinsic::amdgcn_cos
:
5932 return DAG
.getNode(AMDGPUISD::COS_HW
, DL
, VT
, Op
.getOperand(1));
5934 case Intrinsic::amdgcn_mul_u24
:
5935 return DAG
.getNode(AMDGPUISD::MUL_U24
, DL
, VT
, Op
.getOperand(1), Op
.getOperand(2));
5936 case Intrinsic::amdgcn_mul_i24
:
5937 return DAG
.getNode(AMDGPUISD::MUL_I24
, DL
, VT
, Op
.getOperand(1), Op
.getOperand(2));
5939 case Intrinsic::amdgcn_log_clamp
: {
5940 if (Subtarget
->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5943 DiagnosticInfoUnsupported
BadIntrin(
5944 MF
.getFunction(), "intrinsic not supported on subtarget",
5946 DAG
.getContext()->diagnose(BadIntrin
);
5947 return DAG
.getUNDEF(VT
);
5949 case Intrinsic::amdgcn_ldexp
:
5950 return DAG
.getNode(AMDGPUISD::LDEXP
, DL
, VT
,
5951 Op
.getOperand(1), Op
.getOperand(2));
5953 case Intrinsic::amdgcn_fract
:
5954 return DAG
.getNode(AMDGPUISD::FRACT
, DL
, VT
, Op
.getOperand(1));
5956 case Intrinsic::amdgcn_class
:
5957 return DAG
.getNode(AMDGPUISD::FP_CLASS
, DL
, VT
,
5958 Op
.getOperand(1), Op
.getOperand(2));
5959 case Intrinsic::amdgcn_div_fmas
:
5960 return DAG
.getNode(AMDGPUISD::DIV_FMAS
, DL
, VT
,
5961 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3),
5964 case Intrinsic::amdgcn_div_fixup
:
5965 return DAG
.getNode(AMDGPUISD::DIV_FIXUP
, DL
, VT
,
5966 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
5968 case Intrinsic::amdgcn_trig_preop
:
5969 return DAG
.getNode(AMDGPUISD::TRIG_PREOP
, DL
, VT
,
5970 Op
.getOperand(1), Op
.getOperand(2));
5971 case Intrinsic::amdgcn_div_scale
: {
5972 const ConstantSDNode
*Param
= cast
<ConstantSDNode
>(Op
.getOperand(3));
5974 // Translate to the operands expected by the machine instruction. The
5975 // first parameter must be the same as the first instruction.
5976 SDValue Numerator
= Op
.getOperand(1);
5977 SDValue Denominator
= Op
.getOperand(2);
5979 // Note this order is opposite of the machine instruction's operations,
5980 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
5981 // intrinsic has the numerator as the first operand to match a normal
5982 // division operation.
5984 SDValue Src0
= Param
->isAllOnesValue() ? Numerator
: Denominator
;
5986 return DAG
.getNode(AMDGPUISD::DIV_SCALE
, DL
, Op
->getVTList(), Src0
,
5987 Denominator
, Numerator
);
5989 case Intrinsic::amdgcn_icmp
: {
5990 // There is a Pat that handles this variant, so return it as-is.
5991 if (Op
.getOperand(1).getValueType() == MVT::i1
&&
5992 Op
.getConstantOperandVal(2) == 0 &&
5993 Op
.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE
)
5995 return lowerICMPIntrinsic(*this, Op
.getNode(), DAG
);
5997 case Intrinsic::amdgcn_fcmp
: {
5998 return lowerFCMPIntrinsic(*this, Op
.getNode(), DAG
);
6000 case Intrinsic::amdgcn_fmed3
:
6001 return DAG
.getNode(AMDGPUISD::FMED3
, DL
, VT
,
6002 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
6003 case Intrinsic::amdgcn_fdot2
:
6004 return DAG
.getNode(AMDGPUISD::FDOT2
, DL
, VT
,
6005 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3),
6007 case Intrinsic::amdgcn_fmul_legacy
:
6008 return DAG
.getNode(AMDGPUISD::FMUL_LEGACY
, DL
, VT
,
6009 Op
.getOperand(1), Op
.getOperand(2));
6010 case Intrinsic::amdgcn_sffbh
:
6011 return DAG
.getNode(AMDGPUISD::FFBH_I32
, DL
, VT
, Op
.getOperand(1));
6012 case Intrinsic::amdgcn_sbfe
:
6013 return DAG
.getNode(AMDGPUISD::BFE_I32
, DL
, VT
,
6014 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
6015 case Intrinsic::amdgcn_ubfe
:
6016 return DAG
.getNode(AMDGPUISD::BFE_U32
, DL
, VT
,
6017 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
6018 case Intrinsic::amdgcn_cvt_pkrtz
:
6019 case Intrinsic::amdgcn_cvt_pknorm_i16
:
6020 case Intrinsic::amdgcn_cvt_pknorm_u16
:
6021 case Intrinsic::amdgcn_cvt_pk_i16
:
6022 case Intrinsic::amdgcn_cvt_pk_u16
: {
6023 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
6024 EVT VT
= Op
.getValueType();
6027 if (IntrinsicID
== Intrinsic::amdgcn_cvt_pkrtz
)
6028 Opcode
= AMDGPUISD::CVT_PKRTZ_F16_F32
;
6029 else if (IntrinsicID
== Intrinsic::amdgcn_cvt_pknorm_i16
)
6030 Opcode
= AMDGPUISD::CVT_PKNORM_I16_F32
;
6031 else if (IntrinsicID
== Intrinsic::amdgcn_cvt_pknorm_u16
)
6032 Opcode
= AMDGPUISD::CVT_PKNORM_U16_F32
;
6033 else if (IntrinsicID
== Intrinsic::amdgcn_cvt_pk_i16
)
6034 Opcode
= AMDGPUISD::CVT_PK_I16_I32
;
6036 Opcode
= AMDGPUISD::CVT_PK_U16_U32
;
6038 if (isTypeLegal(VT
))
6039 return DAG
.getNode(Opcode
, DL
, VT
, Op
.getOperand(1), Op
.getOperand(2));
6041 SDValue Node
= DAG
.getNode(Opcode
, DL
, MVT::i32
,
6042 Op
.getOperand(1), Op
.getOperand(2));
6043 return DAG
.getNode(ISD::BITCAST
, DL
, VT
, Node
);
6045 case Intrinsic::amdgcn_fmad_ftz
:
6046 return DAG
.getNode(AMDGPUISD::FMAD_FTZ
, DL
, VT
, Op
.getOperand(1),
6047 Op
.getOperand(2), Op
.getOperand(3));
6049 case Intrinsic::amdgcn_if_break
:
6050 return SDValue(DAG
.getMachineNode(AMDGPU::SI_IF_BREAK
, DL
, VT
,
6051 Op
->getOperand(1), Op
->getOperand(2)), 0);
6053 case Intrinsic::amdgcn_groupstaticsize
: {
6054 Triple::OSType OS
= getTargetMachine().getTargetTriple().getOS();
6055 if (OS
== Triple::AMDHSA
|| OS
== Triple::AMDPAL
)
6058 const Module
*M
= MF
.getFunction().getParent();
6059 const GlobalValue
*GV
=
6060 M
->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize
));
6061 SDValue GA
= DAG
.getTargetGlobalAddress(GV
, DL
, MVT::i32
, 0,
6062 SIInstrInfo::MO_ABS32_LO
);
6063 return {DAG
.getMachineNode(AMDGPU::S_MOV_B32
, DL
, MVT::i32
, GA
), 0};
6066 if (const AMDGPU::ImageDimIntrinsicInfo
*ImageDimIntr
=
6067 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID
))
6068 return lowerImage(Op
, ImageDimIntr
, DAG
);
6074 SDValue
SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op
,
6075 SelectionDAG
&DAG
) const {
6076 unsigned IntrID
= cast
<ConstantSDNode
>(Op
.getOperand(1))->getZExtValue();
6080 case Intrinsic::amdgcn_ds_ordered_add
:
6081 case Intrinsic::amdgcn_ds_ordered_swap
: {
6082 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6083 SDValue Chain
= M
->getOperand(0);
6084 SDValue M0
= M
->getOperand(2);
6085 SDValue Value
= M
->getOperand(3);
6086 unsigned IndexOperand
= M
->getConstantOperandVal(7);
6087 unsigned WaveRelease
= M
->getConstantOperandVal(8);
6088 unsigned WaveDone
= M
->getConstantOperandVal(9);
6089 unsigned ShaderType
;
6090 unsigned Instruction
;
6092 unsigned OrderedCountIndex
= IndexOperand
& 0x3f;
6093 IndexOperand
&= ~0x3f;
6094 unsigned CountDw
= 0;
6096 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
) {
6097 CountDw
= (IndexOperand
>> 24) & 0xf;
6098 IndexOperand
&= ~(0xf << 24);
6100 if (CountDw
< 1 || CountDw
> 4) {
6102 "ds_ordered_count: dword count must be between 1 and 4");
6107 report_fatal_error("ds_ordered_count: bad index operand");
6110 case Intrinsic::amdgcn_ds_ordered_add
:
6113 case Intrinsic::amdgcn_ds_ordered_swap
:
6118 if (WaveDone
&& !WaveRelease
)
6119 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
6121 switch (DAG
.getMachineFunction().getFunction().getCallingConv()) {
6122 case CallingConv::AMDGPU_CS
:
6123 case CallingConv::AMDGPU_KERNEL
:
6126 case CallingConv::AMDGPU_PS
:
6129 case CallingConv::AMDGPU_VS
:
6132 case CallingConv::AMDGPU_GS
:
6136 report_fatal_error("ds_ordered_count unsupported for this calling conv");
6139 unsigned Offset0
= OrderedCountIndex
<< 2;
6140 unsigned Offset1
= WaveRelease
| (WaveDone
<< 1) | (ShaderType
<< 2) |
6143 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
)
6144 Offset1
|= (CountDw
- 1) << 6;
6146 unsigned Offset
= Offset0
| (Offset1
<< 8);
6151 DAG
.getTargetConstant(Offset
, DL
, MVT::i16
),
6152 copyToM0(DAG
, Chain
, DL
, M0
).getValue(1), // Glue
6154 return DAG
.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT
, DL
,
6155 M
->getVTList(), Ops
, M
->getMemoryVT(),
6156 M
->getMemOperand());
6158 case Intrinsic::amdgcn_ds_fadd
: {
6159 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6162 case Intrinsic::amdgcn_ds_fadd
:
6163 Opc
= ISD::ATOMIC_LOAD_FADD
;
6167 return DAG
.getAtomic(Opc
, SDLoc(Op
), M
->getMemoryVT(),
6168 M
->getOperand(0), M
->getOperand(2), M
->getOperand(3),
6169 M
->getMemOperand());
6171 case Intrinsic::amdgcn_atomic_inc
:
6172 case Intrinsic::amdgcn_atomic_dec
:
6173 case Intrinsic::amdgcn_ds_fmin
:
6174 case Intrinsic::amdgcn_ds_fmax
: {
6175 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6178 case Intrinsic::amdgcn_atomic_inc
:
6179 Opc
= AMDGPUISD::ATOMIC_INC
;
6181 case Intrinsic::amdgcn_atomic_dec
:
6182 Opc
= AMDGPUISD::ATOMIC_DEC
;
6184 case Intrinsic::amdgcn_ds_fmin
:
6185 Opc
= AMDGPUISD::ATOMIC_LOAD_FMIN
;
6187 case Intrinsic::amdgcn_ds_fmax
:
6188 Opc
= AMDGPUISD::ATOMIC_LOAD_FMAX
;
6191 llvm_unreachable("Unknown intrinsic!");
6194 M
->getOperand(0), // Chain
6195 M
->getOperand(2), // Ptr
6196 M
->getOperand(3) // Value
6199 return DAG
.getMemIntrinsicNode(Opc
, SDLoc(Op
), M
->getVTList(), Ops
,
6200 M
->getMemoryVT(), M
->getMemOperand());
6202 case Intrinsic::amdgcn_buffer_load
:
6203 case Intrinsic::amdgcn_buffer_load_format
: {
6204 unsigned Glc
= cast
<ConstantSDNode
>(Op
.getOperand(5))->getZExtValue();
6205 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(6))->getZExtValue();
6207 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(3)))
6208 IdxEn
= Idx
->getZExtValue() != 0;
6210 Op
.getOperand(0), // Chain
6211 Op
.getOperand(2), // rsrc
6212 Op
.getOperand(3), // vindex
6213 SDValue(), // voffset -- will be set by setBufferOffsets
6214 SDValue(), // soffset -- will be set by setBufferOffsets
6215 SDValue(), // offset -- will be set by setBufferOffsets
6216 DAG
.getConstant(Glc
| (Slc
<< 1), DL
, MVT::i32
), // cachepolicy
6217 DAG
.getConstant(IdxEn
, DL
, MVT::i1
), // idxen
6220 setBufferOffsets(Op
.getOperand(4), DAG
, &Ops
[3]);
6221 unsigned Opc
= (IntrID
== Intrinsic::amdgcn_buffer_load
) ?
6222 AMDGPUISD::BUFFER_LOAD
: AMDGPUISD::BUFFER_LOAD_FORMAT
;
6224 EVT VT
= Op
.getValueType();
6225 EVT IntVT
= VT
.changeTypeToInteger();
6226 auto *M
= cast
<MemSDNode
>(Op
);
6227 EVT LoadVT
= Op
.getValueType();
6229 if (LoadVT
.getScalarType() == MVT::f16
)
6230 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16
,
6233 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6234 if (LoadVT
.getScalarType() == MVT::i8
||
6235 LoadVT
.getScalarType() == MVT::i16
)
6236 return handleByteShortBufferLoads(DAG
, LoadVT
, DL
, Ops
, M
);
6238 return getMemIntrinsicNode(Opc
, DL
, Op
->getVTList(), Ops
, IntVT
,
6239 M
->getMemOperand(), DAG
);
6241 case Intrinsic::amdgcn_raw_buffer_load
:
6242 case Intrinsic::amdgcn_raw_buffer_load_format
: {
6243 const bool IsFormat
= IntrID
== Intrinsic::amdgcn_raw_buffer_load_format
;
6245 auto Offsets
= splitBufferOffsets(Op
.getOperand(3), DAG
);
6247 Op
.getOperand(0), // Chain
6248 Op
.getOperand(2), // rsrc
6249 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
6250 Offsets
.first
, // voffset
6251 Op
.getOperand(4), // soffset
6252 Offsets
.second
, // offset
6253 Op
.getOperand(5), // cachepolicy
6254 DAG
.getConstant(0, DL
, MVT::i1
), // idxen
6257 return lowerIntrinsicLoad(cast
<MemSDNode
>(Op
), IsFormat
, DAG
, Ops
);
6259 case Intrinsic::amdgcn_struct_buffer_load
:
6260 case Intrinsic::amdgcn_struct_buffer_load_format
: {
6261 const bool IsFormat
= IntrID
== Intrinsic::amdgcn_struct_buffer_load_format
;
6263 auto Offsets
= splitBufferOffsets(Op
.getOperand(4), DAG
);
6265 Op
.getOperand(0), // Chain
6266 Op
.getOperand(2), // rsrc
6267 Op
.getOperand(3), // vindex
6268 Offsets
.first
, // voffset
6269 Op
.getOperand(5), // soffset
6270 Offsets
.second
, // offset
6271 Op
.getOperand(6), // cachepolicy
6272 DAG
.getConstant(1, DL
, MVT::i1
), // idxen
6275 return lowerIntrinsicLoad(cast
<MemSDNode
>(Op
), IsFormat
, DAG
, Ops
);
6277 case Intrinsic::amdgcn_tbuffer_load
: {
6278 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6279 EVT LoadVT
= Op
.getValueType();
6281 unsigned Dfmt
= cast
<ConstantSDNode
>(Op
.getOperand(7))->getZExtValue();
6282 unsigned Nfmt
= cast
<ConstantSDNode
>(Op
.getOperand(8))->getZExtValue();
6283 unsigned Glc
= cast
<ConstantSDNode
>(Op
.getOperand(9))->getZExtValue();
6284 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(10))->getZExtValue();
6286 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(3)))
6287 IdxEn
= Idx
->getZExtValue() != 0;
6289 Op
.getOperand(0), // Chain
6290 Op
.getOperand(2), // rsrc
6291 Op
.getOperand(3), // vindex
6292 Op
.getOperand(4), // voffset
6293 Op
.getOperand(5), // soffset
6294 Op
.getOperand(6), // offset
6295 DAG
.getConstant(Dfmt
| (Nfmt
<< 4), DL
, MVT::i32
), // format
6296 DAG
.getConstant(Glc
| (Slc
<< 1), DL
, MVT::i32
), // cachepolicy
6297 DAG
.getConstant(IdxEn
, DL
, MVT::i1
), // idxen
6300 if (LoadVT
.getScalarType() == MVT::f16
)
6301 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16
,
6303 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT
, DL
,
6304 Op
->getVTList(), Ops
, LoadVT
, M
->getMemOperand(),
6307 case Intrinsic::amdgcn_raw_tbuffer_load
: {
6308 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6309 EVT LoadVT
= Op
.getValueType();
6310 auto Offsets
= splitBufferOffsets(Op
.getOperand(3), DAG
);
6313 Op
.getOperand(0), // Chain
6314 Op
.getOperand(2), // rsrc
6315 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
6316 Offsets
.first
, // voffset
6317 Op
.getOperand(4), // soffset
6318 Offsets
.second
, // offset
6319 Op
.getOperand(5), // format
6320 Op
.getOperand(6), // cachepolicy
6321 DAG
.getConstant(0, DL
, MVT::i1
), // idxen
6324 if (LoadVT
.getScalarType() == MVT::f16
)
6325 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16
,
6327 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT
, DL
,
6328 Op
->getVTList(), Ops
, LoadVT
, M
->getMemOperand(),
6331 case Intrinsic::amdgcn_struct_tbuffer_load
: {
6332 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6333 EVT LoadVT
= Op
.getValueType();
6334 auto Offsets
= splitBufferOffsets(Op
.getOperand(4), DAG
);
6337 Op
.getOperand(0), // Chain
6338 Op
.getOperand(2), // rsrc
6339 Op
.getOperand(3), // vindex
6340 Offsets
.first
, // voffset
6341 Op
.getOperand(5), // soffset
6342 Offsets
.second
, // offset
6343 Op
.getOperand(6), // format
6344 Op
.getOperand(7), // cachepolicy
6345 DAG
.getConstant(1, DL
, MVT::i1
), // idxen
6348 if (LoadVT
.getScalarType() == MVT::f16
)
6349 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16
,
6351 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT
, DL
,
6352 Op
->getVTList(), Ops
, LoadVT
, M
->getMemOperand(),
6355 case Intrinsic::amdgcn_buffer_atomic_swap
:
6356 case Intrinsic::amdgcn_buffer_atomic_add
:
6357 case Intrinsic::amdgcn_buffer_atomic_sub
:
6358 case Intrinsic::amdgcn_buffer_atomic_smin
:
6359 case Intrinsic::amdgcn_buffer_atomic_umin
:
6360 case Intrinsic::amdgcn_buffer_atomic_smax
:
6361 case Intrinsic::amdgcn_buffer_atomic_umax
:
6362 case Intrinsic::amdgcn_buffer_atomic_and
:
6363 case Intrinsic::amdgcn_buffer_atomic_or
:
6364 case Intrinsic::amdgcn_buffer_atomic_xor
: {
6365 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(6))->getZExtValue();
6367 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(4)))
6368 IdxEn
= Idx
->getZExtValue() != 0;
6370 Op
.getOperand(0), // Chain
6371 Op
.getOperand(2), // vdata
6372 Op
.getOperand(3), // rsrc
6373 Op
.getOperand(4), // vindex
6374 SDValue(), // voffset -- will be set by setBufferOffsets
6375 SDValue(), // soffset -- will be set by setBufferOffsets
6376 SDValue(), // offset -- will be set by setBufferOffsets
6377 DAG
.getConstant(Slc
<< 1, DL
, MVT::i32
), // cachepolicy
6378 DAG
.getConstant(IdxEn
, DL
, MVT::i1
), // idxen
6380 setBufferOffsets(Op
.getOperand(5), DAG
, &Ops
[4]);
6381 EVT VT
= Op
.getValueType();
6383 auto *M
= cast
<MemSDNode
>(Op
);
6384 unsigned Opcode
= 0;
6387 case Intrinsic::amdgcn_buffer_atomic_swap
:
6388 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SWAP
;
6390 case Intrinsic::amdgcn_buffer_atomic_add
:
6391 Opcode
= AMDGPUISD::BUFFER_ATOMIC_ADD
;
6393 case Intrinsic::amdgcn_buffer_atomic_sub
:
6394 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SUB
;
6396 case Intrinsic::amdgcn_buffer_atomic_smin
:
6397 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SMIN
;
6399 case Intrinsic::amdgcn_buffer_atomic_umin
:
6400 Opcode
= AMDGPUISD::BUFFER_ATOMIC_UMIN
;
6402 case Intrinsic::amdgcn_buffer_atomic_smax
:
6403 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SMAX
;
6405 case Intrinsic::amdgcn_buffer_atomic_umax
:
6406 Opcode
= AMDGPUISD::BUFFER_ATOMIC_UMAX
;
6408 case Intrinsic::amdgcn_buffer_atomic_and
:
6409 Opcode
= AMDGPUISD::BUFFER_ATOMIC_AND
;
6411 case Intrinsic::amdgcn_buffer_atomic_or
:
6412 Opcode
= AMDGPUISD::BUFFER_ATOMIC_OR
;
6414 case Intrinsic::amdgcn_buffer_atomic_xor
:
6415 Opcode
= AMDGPUISD::BUFFER_ATOMIC_XOR
;
6418 llvm_unreachable("unhandled atomic opcode");
6421 return DAG
.getMemIntrinsicNode(Opcode
, DL
, Op
->getVTList(), Ops
, VT
,
6422 M
->getMemOperand());
6424 case Intrinsic::amdgcn_raw_buffer_atomic_swap
:
6425 case Intrinsic::amdgcn_raw_buffer_atomic_add
:
6426 case Intrinsic::amdgcn_raw_buffer_atomic_sub
:
6427 case Intrinsic::amdgcn_raw_buffer_atomic_smin
:
6428 case Intrinsic::amdgcn_raw_buffer_atomic_umin
:
6429 case Intrinsic::amdgcn_raw_buffer_atomic_smax
:
6430 case Intrinsic::amdgcn_raw_buffer_atomic_umax
:
6431 case Intrinsic::amdgcn_raw_buffer_atomic_and
:
6432 case Intrinsic::amdgcn_raw_buffer_atomic_or
:
6433 case Intrinsic::amdgcn_raw_buffer_atomic_xor
:
6434 case Intrinsic::amdgcn_raw_buffer_atomic_inc
:
6435 case Intrinsic::amdgcn_raw_buffer_atomic_dec
: {
6436 auto Offsets
= splitBufferOffsets(Op
.getOperand(4), DAG
);
6438 Op
.getOperand(0), // Chain
6439 Op
.getOperand(2), // vdata
6440 Op
.getOperand(3), // rsrc
6441 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
6442 Offsets
.first
, // voffset
6443 Op
.getOperand(5), // soffset
6444 Offsets
.second
, // offset
6445 Op
.getOperand(6), // cachepolicy
6446 DAG
.getConstant(0, DL
, MVT::i1
), // idxen
6448 EVT VT
= Op
.getValueType();
6450 auto *M
= cast
<MemSDNode
>(Op
);
6451 unsigned Opcode
= 0;
6454 case Intrinsic::amdgcn_raw_buffer_atomic_swap
:
6455 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SWAP
;
6457 case Intrinsic::amdgcn_raw_buffer_atomic_add
:
6458 Opcode
= AMDGPUISD::BUFFER_ATOMIC_ADD
;
6460 case Intrinsic::amdgcn_raw_buffer_atomic_sub
:
6461 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SUB
;
6463 case Intrinsic::amdgcn_raw_buffer_atomic_smin
:
6464 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SMIN
;
6466 case Intrinsic::amdgcn_raw_buffer_atomic_umin
:
6467 Opcode
= AMDGPUISD::BUFFER_ATOMIC_UMIN
;
6469 case Intrinsic::amdgcn_raw_buffer_atomic_smax
:
6470 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SMAX
;
6472 case Intrinsic::amdgcn_raw_buffer_atomic_umax
:
6473 Opcode
= AMDGPUISD::BUFFER_ATOMIC_UMAX
;
6475 case Intrinsic::amdgcn_raw_buffer_atomic_and
:
6476 Opcode
= AMDGPUISD::BUFFER_ATOMIC_AND
;
6478 case Intrinsic::amdgcn_raw_buffer_atomic_or
:
6479 Opcode
= AMDGPUISD::BUFFER_ATOMIC_OR
;
6481 case Intrinsic::amdgcn_raw_buffer_atomic_xor
:
6482 Opcode
= AMDGPUISD::BUFFER_ATOMIC_XOR
;
6484 case Intrinsic::amdgcn_raw_buffer_atomic_inc
:
6485 Opcode
= AMDGPUISD::BUFFER_ATOMIC_INC
;
6487 case Intrinsic::amdgcn_raw_buffer_atomic_dec
:
6488 Opcode
= AMDGPUISD::BUFFER_ATOMIC_DEC
;
6491 llvm_unreachable("unhandled atomic opcode");
6494 return DAG
.getMemIntrinsicNode(Opcode
, DL
, Op
->getVTList(), Ops
, VT
,
6495 M
->getMemOperand());
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec: {
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();

    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = 0;

    switch (IntrID) {
    case Intrinsic::amdgcn_struct_buffer_atomic_swap:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP; break;
    case Intrinsic::amdgcn_struct_buffer_atomic_add:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD; break;
    case Intrinsic::amdgcn_struct_buffer_atomic_sub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB; break;
    case Intrinsic::amdgcn_struct_buffer_atomic_smin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN; break;
    case Intrinsic::amdgcn_struct_buffer_atomic_umin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN; break;
    case Intrinsic::amdgcn_struct_buffer_atomic_smax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX; break;
    case Intrinsic::amdgcn_struct_buffer_atomic_umax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX; break;
    case Intrinsic::amdgcn_struct_buffer_atomic_and:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND; break;
    case Intrinsic::amdgcn_struct_buffer_atomic_or:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR; break;
    case Intrinsic::amdgcn_struct_buffer_atomic_xor:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break;
    case Intrinsic::amdgcn_struct_buffer_atomic_inc:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_INC; break;
    case Intrinsic::amdgcn_struct_buffer_atomic_dec:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC; break;
    default:
      llvm_unreachable("unhandled atomic opcode");
    }

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      Op.getOperand(5), // vindex
      SDValue(),        // voffset -- will be set by setBufferOffsets
      SDValue(),        // soffset -- will be set by setBufferOffsets
      SDValue(),        // offset -- will be set by setBufferOffsets
      DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };
    setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
    auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      Op.getOperand(5), // vindex
      Offsets.first,    // voffset
      Op.getOperand(7), // soffset
      Offsets.second,   // offset
      Op.getOperand(8), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }

  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return lowerImage(Op, ImageDimIntr, DAG);

    return SDValue();
  }
}
// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI.
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {
  EVT VT = VTList.VTs[0];
  EVT WidenedVT = VT;
  EVT WidenedMemVT = MemVT;
  if (!Subtarget->hasDwordx3LoadStores() &&
      (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) {
    WidenedVT = EVT::getVectorVT(*DAG.getContext(),
                                 WidenedVT.getVectorElementType(), 4);
    WidenedMemVT = EVT::getVectorVT(*DAG.getContext(),
                                    WidenedMemVT.getVectorElementType(), 4);
    MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16);
  }

  assert(VTList.NumVTs == 2);
  SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);

  auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
                                       WidenedMemVT, MMO);
  if (WidenedVT != VT) {
    auto Extract = DAG.getNode(
        ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
        DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
    NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
  }

  return NewOp;
}
SDValue SITargetLowering::handleD16VData(SDValue VData,
                                         SelectionDAG &DAG) const {
  EVT StoreVT = VData.getValueType();

  // No change for f16 and legal vector D16 types.
  if (!StoreVT.isVector())
    return VData;

  SDLoc DL(VData);
  assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");

  if (Subtarget->hasUnpackedD16VMem()) {
    // We need to unpack the packed data to store.
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                        StoreVT.getVectorNumElements());
    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
    return DAG.UnrollVectorOp(ZExt.getNode());
  }

  assert(isTypeLegal(StoreVT));
  return VData;
}
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  MachineFunction &MF = DAG.getMachineFunction();

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp: {
    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));

    const SDValue Ops[] = {
      Chain,
      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
      Op.getOperand(4), // src0
      Op.getOperand(5), // src1
      Op.getOperand(6), // src2
      Op.getOperand(7), // src3
      DAG.getTargetConstant(0, DL, MVT::i1), // compr
      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    };

    unsigned Opc = Done->isNullValue() ?
      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
  }
  case Intrinsic::amdgcn_exp_compr: {
    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    SDValue Src0 = Op.getOperand(4);
    SDValue Src1 = Op.getOperand(5);
    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));

    SDValue Undef = DAG.getUNDEF(MVT::f32);
    const SDValue Ops[] = {
      Chain,
      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
      Undef, // src2
      Undef, // src3
      DAG.getTargetConstant(1, DL, MVT::i1), // compr
      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    };

    unsigned Opc = Done->isNullValue() ?
      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
  }
  case Intrinsic::amdgcn_init_exec: {
    return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
                       Op.getOperand(2));
  }
  case Intrinsic::amdgcn_init_exec_from_input: {
    return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
                       Op.getOperand(2), Op.getOperand(3));
  }
  case Intrinsic::amdgcn_s_barrier: {
    if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
      const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
      unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
      if (WGSize <= ST.getWavefrontSize())
        return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
                                          Op.getOperand(0)), 0);
    }
    return SDValue();
  }
  case Intrinsic::amdgcn_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
    unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Chain,
      VData,            // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      Op.getOperand(5), // voffset
      Op.getOperand(6), // soffset
      Op.getOperand(7), // offset
      DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Chain,
      VData,            // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // format
      Op.getOperand(8), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Chain,
      VData,            // vdata
      Op.getOperand(3), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // format
      Op.getOperand(7), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_store:
  case Intrinsic::amdgcn_buffer_store_format: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Chain,
      VData,
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      SDValue(), // voffset -- will be set by setBufferOffsets
      SDValue(), // soffset -- will be set by setBufferOffsets
      SDValue(), // offset -- will be set by setBufferOffsets
      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };
    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    EVT VDataType = VData.getValueType().getScalarType();
    if (VDataType == MVT::i8 || VDataType == MVT::i16)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format;

    SDValue VData = Op.getOperand(2);
    EVT VDataVT = VData.getValueType();
    EVT EltType = VDataVT.getScalarType();
    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);

    if (!isTypeLegal(VDataVT)) {
      VData =
          DAG.getNode(ISD::BITCAST, DL,
                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
    }

    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Chain,
      VData,
      Op.getOperand(3), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc =
        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format;

    SDValue VData = Op.getOperand(2);
    EVT VDataVT = VData.getValueType();
    EVT EltType = VDataVT.getScalarType();
    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);

    if (IsD16)
      VData = handleD16VData(VData, DAG);

    if (!isTypeLegal(VDataVT)) {
      VData =
          DAG.getNode(ISD::BITCAST, DL,
                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
    }

    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Chain,
      VData,
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    EVT VDataType = VData.getValueType().getScalarType();
    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_atomic_fadd: {
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Chain,
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      SDValue(), // voffset -- will be set by setBufferOffsets
      SDValue(), // soffset -- will be set by setBufferOffsets
      SDValue(), // offset -- will be set by setBufferOffsets
      DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };
    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    EVT VT = Op.getOperand(2).getValueType();

    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD
                                    : AMDGPUISD::BUFFER_ATOMIC_FADD;

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_global_atomic_fadd: {
    SDValue Ops[] = {
      Chain,
      Op.getOperand(2), // ptr
      Op.getOperand(3)  // vdata
    };
    EVT VT = Op.getOperand(3).getValueType();

    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = VT.isVector() ? AMDGPUISD::ATOMIC_PK_FADD
                                    : AMDGPUISD::ATOMIC_FADD;

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_end_cf:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                      Op->getOperand(2), Chain), 0);

  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG);

    return Op;
  }
  }
}
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
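// For example, a combined constant offset of 8192 + 20 typically comes back
// as {voffset = 8192, immoffset = 20}: the low bits stay in the 0..4095
// immoffset field while the 4096-aligned remainder moves to voffset, so the
// copy/add that feeds voffset can be CSEd across neighbouring accesses.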
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
    SDValue Offset, SelectionDAG &DAG) const {
  SDLoc DL(Offset);
  const unsigned MaxImm = 4095;
  SDValue N0 = Offset;
  ConstantSDNode *C1 = nullptr;

  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    N0 = SDValue();
  else if (DAG.isBaseWithConstantOffset(N0)) {
    C1 = cast<ConstantSDNode>(N0.getOperand(1));
    N0 = N0.getOperand(0);
  }

  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, put the value
    // and -4096 into the immoffset field so that the value that is copied/added
    // for the voffset field is a multiple of 4096, and it stands more chance
    // of being CSEd with the copy/add for another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
    if (Overflow) {
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      if (!N0)
        N0 = OverflowVal;
      else {
        SDValue Ops[] = { N0, OverflowVal };
        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
      }
    }
  }
  if (!N0)
    N0 = DAG.getConstant(0, DL, MVT::i32);
  if (!C1)
    C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
  return {N0, SDValue(C1, 0)};
}
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        SelectionDAG &DAG, SDValue *Offsets,
                                        unsigned Align) const {
  SDLoc DL(CombinedOffset);
  if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
      Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }
  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
    SDValue N0 = CombinedOffset.getOperand(0);
    SDValue N1 = CombinedOffset.getOperand(1);
    uint32_t SOffset, ImmOffset;
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
    if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
                                                Subtarget, Align)) {
      Offsets[0] = N0;
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }
  Offsets[0] = CombinedOffset;
  Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
  Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
}
// Handle 8 bit and 16 bit buffer loads
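// Sub-dword (i8/i16) buffer accesses have no dedicated D16 path here: the
// load side selects BUFFER_LOAD_UBYTE/USHORT and truncates/bitcasts the i32
// result back to the requested type, while the store side (further below)
// any-extends the value to i32 first.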
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
                                                     ArrayRef<SDValue> Ops,
                                                     MemSDNode *M) const {
  EVT IntVT = LoadVT.changeTypeToInteger();
  unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
         AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;

  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
                                               Ops, IntVT,
                                               M->getMemOperand());
  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);

  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
}
// Handle 8 bit and 16 bit buffer stores
SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
                                                      EVT VDataType, SDLoc DL,
                                                      SDValue Ops[],
                                                      MemSDNode *M) const {
  if (VDataType == MVT::f16)
    Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);

  SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
  Ops[1] = BufferStoreExt;
  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
                                          AMDGPUISD::BUFFER_STORE_SHORT;
  ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
  return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
                                 M->getMemOperand());
}
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
                                 ISD::LoadExtType ExtType, SDValue Op,
                                 const SDLoc &SL, EVT VT) {
  if (VT.bitsLT(Op.getValueType()))
    return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);

  switch (ExtType) {
  case ISD::SEXTLOAD:
    return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
  case ISD::ZEXTLOAD:
    return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
  case ISD::EXTLOAD:
    return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
  case ISD::NON_EXTLOAD:
    return Op;
  }

  llvm_unreachable("invalid ext type");
}
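
// The combine below widens uniform, sufficiently aligned sub-dword loads from
// constant-like address spaces to a full 32-bit load and then truncates or
// extends back to the original type; the dword-sized load is what the scalar
// (SMEM) load path is able to select.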
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if (Ld->getAlignment() < 4 || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // later.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
  SDValue Ptr = Ld->getBasePtr();
  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
                                MVT::i32, SL, Ld->getChain(), Ptr,
                                Ld->getOffset(),
                                Ld->getPointerInfo(), MVT::i32,
                                Ld->getAlignment(),
                                Ld->getMemOperand()->getFlags(),
                                Ld->getAAInfo(),
                                nullptr); // Drop ranges

  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
           "unexpected fp extload");
    TruncVT = MemVT.changeTypeToInteger();
  }

  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
                      DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
  } else {
    assert(Ld->getExtensionType() == ISD::EXTLOAD);
  }

  EVT VT = Ld->getValueType(0);
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());

  DCI.AddToWorklist(Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
  DCI.AddToWorklist(Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);

  return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
}
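
// Custom load lowering: i1/i8/i16 non-extending loads are emitted as 32-bit
// extloads and truncated back, and wide vector loads are split or widened
// according to the limits of the address space they target (see below).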
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();

  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();
    MachineMemOperand *MMO = Load->getMemOperand();

    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                   BasePtr, RealMemVT, MMO);

    if (!MemVT.isVector()) {
      SDValue Ops[] = {
        DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
        NewLD.getValue(1)
      };

      return DAG.getMergeValues(Ops, DL);
    }

    SmallVector<SDValue, 3> Elts;
    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
      SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
                                DAG.getConstant(I, DL, MVT::i32));

      Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
    }

    SDValue Ops[] = {
      DAG.getBuildVector(MemVT, DL, Elts),
      NewLD.getValue(1)
    };

    return DAG.getMergeValues(Ops, DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
                          *Load->getMemOperand())) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
    return DAG.getMergeValues(Ops, DL);
  }

  unsigned Alignment = Load->getAlignment();
  unsigned AS = Load->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBug() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
    return SplitVectorLoad(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    AS = MFI->hasFlatScratchInit() ?
         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = MemVT.getVectorNumElements();

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
      if (MemVT.isPow2VectorType())
        return SDValue();
      if (NumElements == 3)
        return WidenVectorLoad(Op, DAG);
      return SplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
  }

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS) {
    if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
        !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
        Alignment >= 4 && NumElements < 32) {
      if (MemVT.isPow2VectorType())
        return SDValue();
      if (NumElements == 3)
        return WidenVectorLoad(Op, DAG);
      return SplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
  }

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS ||
      AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v3 loads not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return WidenVectorLoad(Op, DAG);
    // v3 and v4 loads are supported for private and global memory.
    return SDValue();
  }

  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorLoad(Load, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      // v3 loads not supported on SI.
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        return WidenVectorLoad(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // Use ds_read_b128 if possible.
    if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
        MemVT.getStoreSize() == 16)
      return SDValue();

    if (NumElements > 2)
      return SplitVectorLoad(Op, DAG);

    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
    // address is negative, then the instruction is incorrectly treated as
    // out-of-bounds even if base + offsets is in bounds. Split vectorized
    // loads here to avoid emitting ds_read2_b32. We may re-combine the
    // load later in the SILoadStoreOptimizer.
    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
        NumElements == 2 && MemVT.getStoreSize() == 8 &&
        Load->getAlignment() < 8) {
      return SplitVectorLoad(Op, DAG);
    }
  }
  return SDValue();
}
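
// 64-bit selects are decomposed into two 32-bit selects on the low and high
// halves (via v2i32 bitcasts), since the hardware conditional move
// (v_cndmask_b32) only operates on 32-bit values.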
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.getSizeInBits() == 64);

  SDLoc DL(Op);
  SDValue Cond = Op.getOperand(0);

  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
  SDValue One = DAG.getConstant(1, DL, MVT::i32);

  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));

  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);

  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);

  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);

  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);

  SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}
// Catch division cases where we can use shortcuts with rcp and rsq
// instructions.
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();

  if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
    return SDValue();

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
      if (CLHS->isExactlyValue(1.0)) {
        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
        // use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // 1.0 / sqrt(x) -> rsq(x)

        // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
        // error seems really high at 2^29 ULP.
        if (RHS.getOpcode() == ISD::FSQRT)
          return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));

        // 1.0 / x -> rcp(x)
        return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
      }

      // Same as for 1.0, but expand the sign out of the constant.
      if (CLHS->isExactlyValue(-1.0)) {
        // -1.0 / x -> rcp (fneg x)
        SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
        return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
      }
    }
  }

  if (Unsafe) {
    // Turn into multiply by the reciprocal.
    // x / y -> x * (1.0 / y)
    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
  }

  return SDValue();
}
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, A, B);
  }

  assert(GlueChain->getNumValues() == 3);

  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
  switch (Opcode) {
  default: llvm_unreachable("no chain equivalent for opcode");
  case ISD::FMUL:
    Opcode = AMDGPUISD::FMUL_W_CHAIN;
    break;
  }

  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
                     GlueChain.getValue(2));
}

static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                           EVT VT, SDValue A, SDValue B, SDValue C,
                           SDValue GlueChain) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, A, B, C);
  }

  assert(GlueChain->getNumValues() == 3);

  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
  switch (Opcode) {
  default: llvm_unreachable("no chain equivalent for opcode");
  case ISD::FMA:
    Opcode = AMDGPUISD::FMA_W_CHAIN;
    break;
  }

  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
                     GlueChain.getValue(2));
}
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue Src0 = Op.getOperand(0);
  SDValue Src1 = Op.getOperand(1);

  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);

  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);

  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
  SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
}
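
// The scaling constants used below are 0x6f800000 (2^+96) and 0x2f800000
// (2^-32): when |RHS| is large enough that the reciprocal could underflow or
// the final product overflow, the denominator is pre-scaled by 2^-32 and the
// quotient rescaled by the same factor afterwards.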
// Faster 2.5 ULP division that does not support denormals.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);

  const APFloat K0Val(BitsToFloat(0x6f800000));
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

  const APFloat K1Val(BitsToFloat(0x2f800000));
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  EVT SetCCVT =
    getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);

  // TODO: Should this propagate fast-math-flags?
  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);

  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
}
// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
                                          const SDLoc &SL, const GCNSubtarget *ST) {
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  int DPDenormModeDefault = ST->hasFP64Denormals()
                                ? FP_DENORM_FLUSH_NONE
                                : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  int Mode = SPDenormMode | (DPDenormModeDefault << 2);
  return DAG.getTargetConstant(Mode, SL, MVT::i32);
}
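
// f32 division below is expanded as: scale both operands with div_scale,
// approximate 1/denominator with rcp, refine the quotient through a chain of
// FMAs (with f32 denormals temporarily enabled when the subtarget flushes
// them), then combine with div_fmas and correct the result with div_fixup.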
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
                                          RHS, RHS, LHS);
  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
                                        LHS, RHS, LHS);

  // Denominator is scaled to not be denormal, so using rcp is ok.
  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
                                  DenominatorScaled);
  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
                                     DenominatorScaled);

  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
                               (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                               (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);

  if (!Subtarget->hasFP32Denormals()) {
    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);

    SDValue EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);

      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
                                 DAG.getEntryNode(), EnableDenormValue);
    } else {
      const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
                                                        SL, MVT::i32);
      EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
                                 DAG.getEntryNode(), EnableDenormValue,
                                 BitField);
    }

    SDValue Ops[3] = {
      NegDivScale0,
      EnableDenorm.getValue(0),
      EnableDenorm.getValue(1)
    };

    NegDivScale0 = DAG.getMergeValues(Ops, SL);
  }

  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
                             ApproxRcp, One, NegDivScale0);

  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
                             ApproxRcp, Fma0);

  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
                           Fma1, Fma1);

  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                             NumeratorScaled, Mul);

  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);

  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                             NumeratorScaled, Fma3);

  if (!Subtarget->hasFP32Denormals()) {

    SDValue DisableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue DisableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);

      DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
                                  Fma4.getValue(1), DisableDenormValue,
                                  Fma4.getValue(2));
    } else {
      const SDValue DisableDenormValue =
          DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);

      DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
                                  Fma4.getValue(1), DisableDenormValue,
                                  BitField, Fma4.getValue(2));
    }

    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                      DisableDenorm, DAG.getRoot());
    DAG.setRoot(OutputChain);
  }

  SDValue Scale = NumeratorScaled.getValue(1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
                             Fma4, Fma1, Fma3, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
}

SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getTarget().Options.UnsafeFPMath)
    return lowerFastUnsafeFDIV(Op, DAG);

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);

  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);

  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
                             NegDivScale0, Mul, DivScale1);

  SDValue Scale;

  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

    // Figure out which scale to use for div_fmas.
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getValue(1);
  }

  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
                             Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}

SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  if (VT == MVT::f16)
    return LowerFDIV16(Op, DAG);

  llvm_unreachable("Unexpected type for fdiv");
}
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    return DAG.getTruncStore(Store->getChain(), DL,
       DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
       Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                          *Store->getMemOperand())) {
    return expandUnalignedStore(Store, DAG);
  }

  unsigned AS = Store->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBug() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
    return SplitVectorStore(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    AS = MFI->hasFlatScratchInit() ?
         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = VT.getVectorNumElements();
  if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
      AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorStore(Op, DAG);
    // v3 stores not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return SplitVectorStore(Op, DAG);
    return SDValue();
  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4 || NumElements == 3)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // Use ds_write_b128 if possible.
    if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
        VT.getStoreSize() == 16 && NumElements != 3)
      return SDValue();

    if (NumElements > 2)
      return SplitVectorStore(Op, DAG);

    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
    // address is negative, then the instruction is incorrectly treated as
    // out-of-bounds even if base + offsets is in bounds. Split vectorized
    // stores here to avoid emitting ds_write2_b32. We may re-combine the
    // store later in the SILoadStoreOptimizer.
    if (!Subtarget->hasUsableDSOffset() &&
        NumElements == 2 && VT.getStoreSize() == 8 &&
        Store->getAlignment() < 8) {
      return SplitVectorStore(Op, DAG);
    }

    return SDValue();
  } else {
    llvm_unreachable("unhandled address space");
  }
}
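
// sin/cos lowering: the argument is pre-multiplied by 1/(2*pi), since the
// hardware SIN/COS operate on a normalized period; on subtargets where the
// valid trig input range is reduced, the scaled value is first wrapped into
// [0, 1) with FRACT.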
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue TrigVal;

  // TODO: Should this propagate fast-math-flags?

  SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);

  if (Subtarget->hasTrigReducedRange()) {
    SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
    TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
  } else {
    TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
  }

  switch (Op.getOpcode()) {
  case ISD::FCOS:
    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
  case ISD::FSIN:
    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
  default:
    llvm_unreachable("Wrong trig opcode");
  }
}
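
// Flat/global cmpxchg is lowered to the target ATOMIC_CMP_SWAP node, which
// takes the new value and the comparison value packed into a single vector
// operand (v2i32, or v2i64 for the _X2 form).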
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
  if (!isFlatGlobalAddrSpace(AS))
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  SDValue Old = Op.getOperand(2);
  SDValue New = Op.getOperand(3);
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  MVT VecType = MVT::getVectorVT(SimpleVT, 2);

  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
  SDValue Ops[] = { ChainIn, Addr, NewOld };

  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
                                 Ops, VT, AtomicNode->getMemOperand());
}
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  EVT ScalarVT = VT.getScalarType();
  if (ScalarVT != MVT::f32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
      DCI.AddToWorklist(Cvt.getNode());
      return Cvt;
    }
  }

  return SDValue();
}
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
//
// This is a variant of
// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
//
// The normal DAG combiner will do this, but only if the add has one use since
// that would increase the number of instructions.
//
// This prevents us from seeing a constant offset that can be folded into a
// memory instruction's addressing mode. If we know the resulting add offset of
// a pointer can be folded into an addressing offset, we can replace the pointer
// operand with the add of new constant offset. This eliminates one of the uses,
// and may allow the remaining use to also be simplified.
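//
// For example, (shl (add ptr, 16), 2) becomes (add (shl ptr, 2), 64) when a
// 64-byte immediate is legal for the access's addressing mode, so the shifted
// pointer keeps its other users while this use folds the constant offset.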
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
                                               unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We only do this to handle cases where it's profitable when there are
  // multiple uses of the add, so defer to the standard combine.
  if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
      N0->hasOneUse())
    return SDValue();

  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
  if (!CN1)
    return SDValue();

  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!CAdd)
    return SDValue();

  // If the resulting offset is too large, we can't fold it into the addressing
  // mode offset.
  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
  Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());

  AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();
  if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
  SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);

  SDNodeFlags Flags;
  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
                          (N0.getOpcode() == ISD::OR ||
                           N0->getFlags().hasNoUnsignedWrap()));

  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue Ptr = N->getBasePtr();
  SelectionDAG &DAG = DCI.DAG;

  // TODO: We could also do this for multiplies.
  if (Ptr.getOpcode() == ISD::SHL) {
    SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                          N->getMemoryVT(), DCI);
    if (NewPtr) {
      SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());

      NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
      return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
    }
  }

  return SDValue();
}
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
         (Opc == ISD::XOR && Val == 0);
}

// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
// integer combine opportunities since most 64-bit operations are decomposed
// this way. TODO: We won't want this for SALU especially if it is an inline
// immediate.
SDValue SITargetLowering::splitBinaryBitConstantOp(
  DAGCombinerInfo &DCI,
  const SDLoc &SL,
  unsigned Opc, SDValue LHS,
  const ConstantSDNode *CRHS) const {
  uint64_t Val = CRHS->getZExtValue();
  uint32_t ValLo = Lo_32(Val);
  uint32_t ValHi = Hi_32(Val);
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
       bitOpWithConstantIsReducible(Opc, ValHi)) ||
      (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
    // If we need to materialize a 64-bit immediate, it will be split up later
    // anyway. Avoid creating the harder to understand 64-bit immediate
    // materialization.
    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
  }

  return SDValue();
}
// Returns true if argument is a boolean value which is not serialized into
// memory or argument and does not require v_cndmask_b32 to be deserialized.
static bool isBoolSGPR(SDValue V) {
  if (V.getValueType() != MVT::i1)
    return false;
  switch (V.getOpcode()) {
  default:
    break;
  case ISD::SETCC:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case AMDGPUISD::FP_CLASS:
    return true;
  }
  return false;
}
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  // 0xff for any zero byte in the mask
  uint32_t ZeroByteMask = 0;
  if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
  if ((NonZeroByteMask & C) != NonZeroByteMask)
    return 0; // Partial bytes selected.
  return C;
}
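
// For example, getConstantPermuteMask(0x0000ffff) is 0x0000ffff (every byte
// is all-zeros or all-ones), while a value such as 0x00ff1200 returns 0
// because byte 1 is only partially set.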
// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns select mask as in the v_perm_b32
// encoding, or -1 if it does not.
// Note byte select encoding:
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)
    return ~0;

  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!N1)
    return ~0;

  uint32_t C = N1->getZExtValue();

  switch (V.getOpcode()) {
  default:
    break;
  case ISD::AND:
    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    }
    break;

  case ISD::OR:
    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
      return (0x03020100 & ~ConstMask) | ConstMask;
    }
    break;

  case ISD::SHL:
    if (C % 8)
      return ~0;

    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

  case ISD::SRL:
    if (C % 8)
      return ~0;

    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }

  return ~0;
}
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (VT == MVT::i64 && CRHS) {
    if (SDValue Split
        = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
      return Split;
  }

  if (CRHS && VT == MVT::i32) {
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    // nb = number of trailing zeroes in mask
    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
    uint64_t Mask = CRHS->getZExtValue();
    unsigned Bits = countPopulation(Mask);
    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
        unsigned Shift = CShift->getZExtValue();
        unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
        unsigned Offset = NB + Shift;
        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
          SDLoc SL(N);
          SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
                                    LHS->getOperand(0),
                                    DAG.getConstant(Offset, SL, MVT::i32),
                                    DAG.getConstant(Bits, SL, MVT::i32));
          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
                                    DAG.getValueType(NarrowVT));
          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
          return Shl;
        }
      }
    }

    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
        isa<ConstantSDNode>(LHS.getOperand(2))) {
      uint32_t Sel = getConstantPermuteMask(Mask);
      if (!Sel)
        return SDValue();

      // Select 0xc for all zero bytes
      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                         LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();

    SDValue X = LHS.getOperand(0);
    SDValue Y = RHS.getOperand(0);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
      return SDValue();

    if (LCC == ISD::SETO) {
      if (X != LHS.getOperand(1))
        return SDValue();

      if (RCC == ISD::SETUNE) {
        const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        const uint32_t Mask = SIInstrFlags::N_NORMAL |
                              SIInstrFlags::N_SUBNORMAL |
                              SIInstrFlags::N_ZERO |
                              SIInstrFlags::P_ZERO |
                              SIInstrFlags::P_SUBNORMAL |
                              SIInstrFlags::P_NORMAL;

        static_assert(((~(SIInstrFlags::S_NAN |
                          SIInstrFlags::Q_NAN |
                          SIInstrFlags::N_INFINITY |
                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
                      "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                           X, DAG.getConstant(Mask, DL, MVT::i32));
      }
    }
  }

  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
    std::swap(LHS, RHS);

  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      RHS.hasOneUse()) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
    // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
    const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
        (RHS.getOperand(0) == LHS.getOperand(0) &&
         LHS.getOperand(0) == LHS.getOperand(1))) {
      const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
      unsigned NewMask = LCC == ISD::SETO ?
        Mask->getZExtValue() & ~OrdMask :
        Mask->getZExtValue() & OrdMask;

      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
                         DAG.getConstant(NewMask, DL, MVT::i32));
    }
  }

  if (VT == MVT::i32 &&
      (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    // and x, (sext cc from i1) => select cc, x, 0
    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
      std::swap(LHS, RHS);
    if (isBoolSGPR(RHS.getOperand(0)))
      return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
                           LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
  }

  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
    uint32_t LHSMask = getPermuteMask(DAG, LHS);
    uint32_t RHSMask = getPermuteMask(DAG, RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte in each mask is either selector mask 0-3, or has higher
        // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
        // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
        // mask which is not 0xff wins. By anding both masks we have a correct
        // result except that 0x0c shall be corrected to give 0x0c only.
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }

        // Add 4 to each active LHS lane. It will not affect any existing 0xff
        // or 0x0c.
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
                           LHS.getOperand(0), RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  return SDValue();
}
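// Example of the BFE combine in performAndCombine above:
// (and (srl x, 8), 0xff00) selects bits 16..23 of x and places them at bit 8,
// so it becomes (shl (bfe x, 16, 8), 8), which the SDWA peephole can then
// absorb on GFX8+.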
SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {
    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
      SDValue Src = LHS.getOperand(0);
      if (Src != RHS.getOperand(0))
        return SDValue();

      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
      if (!CLHS || !CRHS)
        return SDValue();

      // Only 10 bits are used.
      static const uint32_t MaxMask = 0x3ff;

      uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                         Src, DAG.getConstant(NewMask, DL, MVT::i32));
    }

    return SDValue();
  }

  // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
  if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
      LHS.getOpcode() == AMDGPUISD::PERM &&
      isa<ConstantSDNode>(LHS.getOperand(2))) {
    uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
    if (!Sel)
      return SDValue();

    Sel |= LHS.getConstantOperandVal(2);
    SDLoc DL(N);
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                       LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
  }

  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
    uint32_t LHSMask = getPermuteMask(DAG, LHS);
    uint32_t RHSMask = getPermuteMask(DAG, RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Kill zero bytes selected by other mask. Zero value is 0xc.
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        // Add 4 to each active LHS lane
        LHSMask |= LHSUsedLanes & 0x04040404;

        uint32_t Sel = LHSMask | RHSMask;
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
                           LHS.getOperand(0), RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  if (VT != MVT::i64)
    return SDValue();

  // TODO: This could be a generic combine with a predicate for extracting the
  // high half of an integer being free.

  // (or i64:x, (zero_extend i32:y)) ->
  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
      RHS.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(LHS, RHS);

  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue ExtSrc = RHS.getOperand(0);
    EVT SrcVT = ExtSrc.getValueType();
    if (SrcVT == MVT::i32) {
      SDLoc SL(N);
      SDValue LowLHS, HiBits;
      std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);

      DCI.AddToWorklist(LowOr.getNode());
      DCI.AddToWorklist(HiBits.getNode());

      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                LowOr, HiBits);
      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    }
  }

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (CRHS) {
    if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
      return Split;
  }

  return SDValue();
}
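// Example of the zero_extend combine in performOrCombine above:
// (or i64:x, (zero_extend i32:y)) only needs a 32-bit OR on the low half, so
// it is rebuilt as (bitcast (build_vector (or (lo_32 x), y), (hi_32 x))).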
SDValue SITargetLowering::performXorCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (CRHS) {
    if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
      return Split;
  }

  return SDValue();
}
// Instructions that will be lowered with a final instruction that zeros the
// high result bits.
// XXX - probably only need to list legal operations.
static bool fp16SrcZerosHighBits(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FCANONICALIZE:
  case ISD::FP_ROUND:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LDEXP:
    return true;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}
SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  if (!Subtarget->has16BitInsts() ||
      DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i32)
    return SDValue();

  SDValue Src = N->getOperand(0);
  if (Src.getValueType() != MVT::i16)
    return SDValue();

  // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
  // FIXME: It is not universally true that the high bits are zeroed on gfx9.
  if (Src.getOpcode() == ISD::BITCAST) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::f16 &&
        fp16SrcZerosHighBits(BCSrc.getOpcode()))
      return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
  }

  return SDValue();
}
SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                        DAGCombinerInfo &DCI)
                                                        const {
  SDValue Src = N->getOperand(0);
  auto *VTSign = cast<VTSDNode>(N->getOperand(1));

  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
      Src.hasOneUse()) {
    auto *M = cast<MemSDNode>(Src);
    SDValue Ops[] = {
      Src.getOperand(0), // Chain
      Src.getOperand(1), // rsrc
      Src.getOperand(2), // vindex
      Src.getOperand(3), // voffset
      Src.getOperand(4), // soffset
      Src.getOperand(5), // offset
      Src.getOperand(6), // cachepolicy
      Src.getOperand(7)  // idxen
    };
    // replace with BUFFER_LOAD_BYTE/SHORT
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
                                         Src.getOperand(0).getValueType());
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
                   AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
                                                            ResList,
                                                            Ops, M->getMemoryVT(),
                                                            M->getMemOperand());
    return DCI.DAG.getMergeValues({BufferLoadSignExt,
                                   BufferLoadSignExt.getValue(1)}, SDLoc(N));
  }
  return SDValue();
}
SDValue SITargetLowering::performClassCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Mask = N->getOperand(1);

  // fp_class x, 0 -> false
  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
    if (CMask->isNullValue())
      return DAG.getConstant(0, SDLoc(N), MVT::i1);
  }

  if (N->getOperand(0).isUndef())
    return DAG.getUNDEF(MVT::i1);

  return SDValue();
}
SDValue SITargetLowering::performRcpCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);

  if (N0.isUndef())
    return N0;

  if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
                         N0.getOpcode() == ISD::SINT_TO_FP)) {
    return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
                           N->getFlags());
  }

  return AMDGPUTargetLowering::performRcpCombine(N, DCI);
}
bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
                                       unsigned MaxDepth) const {
  unsigned Opcode = Op.getOpcode();
  if (Opcode == ISD::FCANONICALIZE)
    return true;

  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    auto F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
      return false;
    return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
  }

  // If source is a result of another standard FP operation it is already in
  // canonical form.
  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // These will flush denorms if required.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FCEIL:
  case ISD::FFLOOR:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FSQRT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FP_ROUND:
  case ISD::FP_EXTEND:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::TRIG_PREOP:
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::LDEXP:
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return true;

  // It can/will be lowered or combined as a bit operation.
  // Need to check their input recursively to handle.
  case ISD::FNEG:
  case ISD::FABS:
  case ISD::FCOPYSIGN:
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);

  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FSINCOS:
    return Op.getValueType().getScalarType() != MVT::f16;

  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMIN3: {
    // FIXME: Shouldn't treat the generic operations differently based on these.
    // However, we aren't really required to flush the result from
    // minnum/maxnum..

    // snans will be quieted, so we only need to worry about denormals.
    if (Subtarget->supportsMinMaxDenormModes() ||
        denormalsEnabledForType(Op.getValueType()))
      return true;

    // Flushing may be required.
    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
    // targets need to check their input recursively.

    // FIXME: Does this apply with clamp? It's implemented with max.
    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::SELECT: {
    return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
  }
  case ISD::BUILD_VECTOR: {
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      SDValue SrcOp = Op.getOperand(i);
      if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::EXTRACT_VECTOR_ELT:
  case ISD::EXTRACT_SUBVECTOR: {
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
  }
  case ISD::INSERT_VECTOR_ELT: {
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
  }
  case ISD::UNDEF:
    // Could be anything.
    return false;

  case ISD::BITCAST: {
    // Hack around the mess we make when legalizing extract_vector_elt
    SDValue Src = Op.getOperand(0);
    if (Src.getValueType() == MVT::i16 &&
        Src.getOpcode() == ISD::TRUNCATE) {
      SDValue TruncSrc = Src.getOperand(0);
      if (TruncSrc.getValueType() == MVT::i32 &&
          TruncSrc.getOpcode() == ISD::BITCAST &&
          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
      }
    }

    return false;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID
      = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
      return true;
    default:
      break;
    }

    LLVM_FALLTHROUGH;
  }
  default:
    return denormalsEnabledForType(Op.getValueType()) &&
           DAG.isKnownNeverSNaN(Op);
  }

  llvm_unreachable("invalid operation");
}
// Constant fold canonicalize.
SDValue SITargetLowering::getCanonicalConstantFP(
  SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
  // Flush denormals to 0 if not enabled.
  if (C.isDenormal() && !denormalsEnabledForType(VT))
    return DAG.getConstantFP(0.0, SL, VT);

  if (C.isNaN()) {
    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
    if (C.isSignaling()) {
      // Quiet a signaling NaN.
      // FIXME: Is this supposed to preserve payload bits?
      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
    }

    // Make sure it is the canonical NaN bitpattern.
    //
    // TODO: Can we use -1 as the canonical NaN value since it's an inline
    // immediate?
    if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
  }

  // Already canonical.
  return DAG.getConstantFP(C, SL, VT);
}
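// For example, getCanonicalConstantFP folds a denormal constant to +0.0 when
// denormals are not enabled for the type, and replaces a signaling NaN
// constant with the canonical quiet NaN bit pattern.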
static bool vectorEltWillFoldAway(SDValue Op) {
  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
}
SDValue SITargetLowering::performFCanonicalizeCombine(
  SDNode *N,
  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fcanonicalize undef -> qnan
  if (N0.isUndef()) {
    APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
    return DAG.getConstantFP(QNaN, SDLoc(N), VT);
  }

  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
  }

  // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
  //                                                   (fcanonicalize k)
  //
  // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0

  // TODO: This could be better with wider vectors that will be split to v2f16,
  // and to consider uses since there aren't that many packed operations.
  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
      isTypeLegal(MVT::v2f16)) {
    SDLoc SL(N);
    SDValue NewElts[2];
    SDValue Lo = N0.getOperand(0);
    SDValue Hi = N0.getOperand(1);
    EVT EltVT = Lo.getValueType();

    if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
      for (unsigned I = 0; I != 2; ++I) {
        SDValue Op = N0.getOperand(I);
        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
          NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
                                              CFP->getValueAPF());
        } else if (Op.isUndef()) {
          // Handled below based on what the other operand is.
          NewElts[I] = Op;
        } else {
          NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
        }
      }

      // If one half is undef, and one is constant, prefer a splat vector rather
      // than the normal qNaN. If it's a register, prefer 0.0 since that's
      // cheaper to use and may be free with a packed operation.
      if (NewElts[0].isUndef()) {
        if (isa<ConstantFPSDNode>(NewElts[1]))
          NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
            NewElts[1] : DAG.getConstantFP(0.0f, SL, EltVT);
      }

      if (NewElts[1].isUndef()) {
        NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
          NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
      }

      return DAG.getBuildVector(VT, SL, NewElts);
    }
  }

  unsigned SrcOpc = N0.getOpcode();

  // If it's free to do so, push canonicalizes further up the source, which may
  // find a canonical source.
  //
  // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
  // sNaNs.
  if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
    auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
    if (CRHS && N0.hasOneUse()) {
      SDLoc SL(N);
      SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
                                   N0.getOperand(0));
      SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
      DCI.AddToWorklist(Canon0.getNode());

      return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
    }
  }

  return isCanonicalized(DAG, N0) ? N0 : SDValue();
}
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
  switch (Opc) {
  case ISD::FMAXNUM:
  case ISD::FMAXNUM_IEEE:
    return AMDGPUISD::FMAX3;
  case ISD::SMAX:
    return AMDGPUISD::SMAX3;
  case ISD::UMAX:
    return AMDGPUISD::UMAX3;
  case ISD::FMINNUM:
  case ISD::FMINNUM_IEEE:
    return AMDGPUISD::FMIN3;
  case ISD::SMIN:
    return AMDGPUISD::SMIN3;
  case ISD::UMIN:
    return AMDGPUISD::UMIN3;
  default:
    llvm_unreachable("Not a min/max opcode");
  }
}
SDValue SITargetLowering::performIntMed3ImmCombine(
  SelectionDAG &DAG, const SDLoc &SL,
  SDValue Op0, SDValue Op1, bool Signed) const {
  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
  if (!K1)
    return SDValue();

  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  if (Signed) {
    if (K0->getAPIntValue().sge(K1->getAPIntValue()))
      return SDValue();
  } else {
    if (K0->getAPIntValue().uge(K1->getAPIntValue()))
      return SDValue();
  }

  EVT VT = K0->getValueType(0);
  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
    return DAG.getNode(Med3Opc, SL, VT,
                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
  }

  // If there isn't a 16-bit med3 operation, convert to 32-bit.
  MVT NVT = MVT::i32;
  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

  SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
  SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
  SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);

  SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
}
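// For example, performIntMed3ImmCombine turns (smin (smax x, 0), 255) into
// (smed3 x, 0, 255) for i32, matching the v_med3_i32 clamping idiom.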
static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
    return C;

  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
    if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
      return C;
  }

  return nullptr;
}
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
                                                  const SDLoc &SL,
                                                  SDValue Op0,
                                                  SDValue Op1) const {
  ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
  if (!K1)
    return SDValue();

  ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  // Ordered >= (although NaN inputs should have folded away by now).
  APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
  if (Cmp == APFloat::cmpGreaterThan)
    return SDValue();

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Check IEEE bit enabled?
  EVT VT = Op0.getValueType();
  if (Info->getMode().DX10Clamp) {
    // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
    // hardware fmed3 behavior converting to a min.
    // FIXME: Should this be allowing -0.0?
    if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
  }

  // med3 for f16 is only available on gfx9+, and not available for v2f16.
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
    // then give the other result, which is different from med3 with a NaN
    // input.
    SDValue Var = Op0.getOperand(0);
    if (!DAG.isKnownNeverSNaN(Var))
      return SDValue();

    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    if ((!K0->hasOneUse() ||
         TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
        (!K1->hasOneUse() ||
         TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
      return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
                         Var, SDValue(K0, 0), SDValue(K1, 0));
    }
  }

  return SDValue();
}
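// For example, with dx10_clamp enabled performFPMed3ImmCombine turns
// (fminnum (fmaxnum x, 0.0), 1.0) into (clamp x); other constant ranges such
// as 2.0/4.0 become (fmed3 x, 2.0, 4.0) when x is known not to be a
// signaling NaN.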
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Only do this if the inner op has one use since this will just increase
  // register pressure for no benefit.

  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
      !VT.isVector() &&
      (VT == MVT::i32 || VT == MVT::f32 ||
       ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
    // max(max(a, b), c) -> max3(a, b, c)
    // min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
                         DL,
                         N->getValueType(0),
                         Op0.getOperand(0),
                         Op0.getOperand(1),
                         Op1);
    }

    // Try commuted.
    // max(a, max(b, c)) -> max3(a, b, c)
    // min(a, min(b, c)) -> min3(a, b, c)
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
                         DL,
                         N->getValueType(0),
                         Op0,
                         Op1.getOperand(0),
                         Op1.getOperand(1));
    }
  }

  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
      return Med3;
  }
  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
      return Med3;
  }

  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
       (Opc == AMDGPUISD::FMIN_LEGACY &&
        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
      (VT == MVT::f32 || VT == MVT::f64 ||
       (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
       (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
      Op0.hasOneUse()) {
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
      return Res;
  }

  return SDValue();
}
static bool isClampZeroToOne(SDValue A, SDValue B) {
  if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
    if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
      // FIXME: Should this be allowing -0.0?
      return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
             (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
    }
  }

  return false;
}
// FIXME: Should only worry about snans for version with chain.
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
  // NaNs. With a NaN input, the order of the operands may change the result.

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  SDValue Src2 = N->getOperand(2);

  if (isClampZeroToOne(Src0, Src1)) {
    // const_a, const_b, x -> clamp is safe in all cases including signaling
    // nans.
    // FIXME: Should this be allowing -0.0?
    return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
  }

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
  // handling no dx10-clamp?
  if (Info->getMode().DX10Clamp) {
    // If NaNs are clamped to 0, we are free to reorder the inputs.

    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);

    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
      std::swap(Src1, Src2);

    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);

    if (isClampZeroToOne(Src1, Src2))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
  }

  return SDValue();
}
SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  if (Src0.isUndef() && Src1.isUndef())
    return DCI.DAG.getUNDEF(N->getValueType(0));

  return SDValue();
}
SDValue SITargetLowering::performExtractVectorEltCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;

  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();

  if ((Vec.getOpcode() == ISD::FNEG ||
       Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
    SDLoc SL(N);
    EVT EltVT = N->getValueType(0);
    SDValue Idx = N->getOperand(1);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                              Vec.getOperand(0), Idx);
    return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
  }

  // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
  //    =>
  // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
  // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
  // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
  if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
    SDLoc SL(N);
    EVT EltVT = N->getValueType(0);
    SDValue Idx = N->getOperand(1);
    unsigned Opc = Vec.getOpcode();

    switch (Opc) {
    default:
      break;
      // TODO: Support other binary operations.
    case ISD::FADD:
    case ISD::FSUB:
    case ISD::FMUL:
    case ISD::ADD:
    case ISD::UMIN:
    case ISD::UMAX:
    case ISD::SMIN:
    case ISD::SMAX:
    case ISD::FMAXNUM:
    case ISD::FMINNUM:
    case ISD::FMAXNUM_IEEE:
    case ISD::FMINNUM_IEEE: {
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec.getOperand(0), Idx);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec.getOperand(1), Idx);

      DCI.AddToWorklist(Elt0.getNode());
      DCI.AddToWorklist(Elt1.getNode());
      return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
    }
    }
  }

  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();

  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
  // This eliminates non-constant index and subsequent movrel or scratch access.
  // Sub-dword vectors of size 2 dword or less have better implementation.
  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
  // instructions.
  if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
      !isa<ConstantSDNode>(N->getOperand(1))) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    EVT IdxVT = Idx.getValueType();
    SDValue V;
    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
      SDValue IC = DAG.getConstant(I, SL, IdxVT);
      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
      if (I == 0)
        V = Elt;
      else
        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
    }
    return V;
  }

  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
  // elements. This exposes more load reduction opportunities by replacing
  // multiple small extract_vector_elements with a single 32-bit extract.
  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (isa<MemSDNode>(Vec) &&
      EltSize <= 16 &&
      EltVT.isByteSized() &&
      VecSize > 32 &&
      VecSize % 32 == 0 &&
      Idx) {
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);

    unsigned BitIndex = Idx->getZExtValue() * EltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    SDLoc SL(N);

    SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
    DCI.AddToWorklist(Cast.getNode());

    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
                              DAG.getConstant(EltIdx, SL, MVT::i32));
    DCI.AddToWorklist(Elt.getNode());
    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
    DCI.AddToWorklist(Srl.getNode());

    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
    DCI.AddToWorklist(Trunc.getNode());
    return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
  }

  return SDValue();
}
SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SDValue Idx = N->getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();

  // INSERT_VECTOR_ELT (<n x e>, var-idx)
  // => BUILD_VECTOR n x select (e, const-idx)
  // This eliminates non-constant index and subsequent movrel or scratch access.
  // Sub-dword vectors of size 2 dword or less have better implementation.
  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
  // instructions.
  if (isa<ConstantSDNode>(Idx) ||
      VecSize > 256 || (VecSize <= 64 && EltSize < 32))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  SDValue Ins = N->getOperand(1);
  EVT IdxVT = Idx.getValueType();

  SmallVector<SDValue, 16> Ops;
  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
    SDValue IC = DAG.getConstant(I, SL, IdxVT);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
    SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
    Ops.push_back(V);
  }

  return DAG.getBuildVector(VecVT, SL, Ops);
}
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N0,
                                          const SDNode *N1) const {
  EVT VT = N0->getValueType(0);

  // Only do this if we are not trying to support denormals. v_mad_f32 does not
  // support denormals ever.
  if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
       (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
        getSubtarget()->hasMadF16())) &&
      isOperationLegal(ISD::FMAD, VT))
    return ISD::FMAD;

  const TargetOptions &Options = DAG.getTarget().Options;
  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
       (N0->getFlags().hasAllowContract() &&
        N1->getFlags().hasAllowContract())) &&
      isFMAFasterThanFMulAndFAdd(VT)) {
    return ISD::FMA;
  }

  return 0;
}
// For a reassociatable opcode perform:
// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
                                               SelectionDAG &DAG) const {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  if (!(Op0->isDivergent() ^ Op1->isDivergent()))
    return SDValue();

  if (Op0->isDivergent())
    std::swap(Op0, Op1);

  if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
    return SDValue();

  SDValue Op2 = Op1.getOperand(1);
  Op1 = Op1.getOperand(0);
  if (!(Op1->isDivergent() ^ Op2->isDivergent()))
    return SDValue();

  if (Op1->isDivergent())
    std::swap(Op1, Op2);

  // If either operand is constant this will conflict with
  // DAGCombiner::ReassociateOps().
  if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
      DAG.isConstantIntBuildVectorOrConstantInt(Op1))
    return SDValue();

  SDLoc SL(N);
  SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
  return DAG.getNode(Opc, SL, VT, Add1, Op2);
}
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
                           EVT VT,
                           SDValue N0, SDValue N1, SDValue N2,
                           bool Signed) {
  unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
  SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}
SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
      && Subtarget->hasMad64_32() &&
      !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
      VT.getScalarSizeInBits() <= 64) {
    if (LHS.getOpcode() != ISD::MUL)
      std::swap(LHS, RHS);

    SDValue MulLHS = LHS.getOperand(0);
    SDValue MulRHS = LHS.getOperand(1);
    SDValue AddRHS = RHS;

    // TODO: Maybe restrict if SGPR inputs.
    if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
        numBitsUnsigned(MulRHS, DAG) <= 32) {
      MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
      MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
      AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
      return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
    }

    if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
      MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
      MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
      AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
      return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
    }

    return SDValue();
  }

  if (SDValue V = reassociateScalarOps(N, DAG)) {
    return V;
  }

  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
    return SDValue();

  // add x, zext (setcc) => addcarry x, 0, setcc
  // add x, sext (setcc) => subcarry x, 0, setcc
  unsigned Opc = LHS.getOpcode();
  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
      Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
    std::swap(RHS, LHS);

  Opc = RHS.getOpcode();
  switch (Opc) {
  default: break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(0);
    if (!isBoolSGPR(Cond))
      break;
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
    return DAG.getNode(Opc, SL, VTList, Args);
  }
  case ISD::ADDCARRY: {
    // add x, (addcarry y, 0, cc) => addcarry x, y, cc
    auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if (!C || C->getZExtValue() != 0) break;
    SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
    return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
  }
  }

  return SDValue();
}
SDValue SITargetLowering::performSubCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (VT != MVT::i32)
    return SDValue();

  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (LHS.getOpcode() == ISD::SUBCARRY) {
    // sub (subcarry x, 0, cc), y => subcarry x, y, cc
    auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    if (!C || !C->isNullValue())
      return SDValue();
    SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
    return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
  }
  return SDValue();
}
SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
  DAGCombinerInfo &DCI) const {

  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C || C->getZExtValue() != 0)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);

  // addcarry (add x, y), 0, cc => addcarry x, y, cc
  // subcarry (sub x, y), 0, cc => subcarry x, y, cc
  unsigned LHSOpc = LHS.getOpcode();
  unsigned Opc = N->getOpcode();
  if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
      (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
    SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
  }
  return SDValue();
}
SDValue SITargetLowering::performFAddCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // These should really be instruction patterns, but writing patterns with
  // source modifiers is a pain.

  // fadd (fadd (a, a), b) -> mad 2.0, a, b
  if (LHS.getOpcode() == ISD::FADD) {
    SDValue A = LHS.getOperand(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
      }
    }
  }

  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
  if (RHS.getOpcode() == ISD::FADD) {
    SDValue A = RHS.getOperand(0);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
      }
    }
  }

  return SDValue();
}
SDValue SITargetLowering::performFSubCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);
  assert(!VT.isVector());

  // Try to get the fneg to fold into the source modifier. This undoes generic
  // DAG combines and folds them into the mad.
  //
  // Only do this if we are not trying to support denormals. v_mad_f32 does
  // not support denormals ever.
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  if (LHS.getOpcode() == ISD::FADD) {
    // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
    SDValue A = LHS.getOperand(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

        return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
      }
    }
  }

  if (RHS.getOpcode() == ISD::FADD) {
    // (fsub c, (fadd a, a)) -> mad -2.0, a, c

    SDValue A = RHS.getOperand(0);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
      }
    }
  }

  return SDValue();
}
SDValue SITargetLowering::performFMACombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);

  if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
    return SDValue();

  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
  //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
  SDValue Op1 = N->getOperand(0);
  SDValue Op2 = N->getOperand(1);
  SDValue FMA = N->getOperand(2);

  if (FMA.getOpcode() != ISD::FMA ||
      Op1.getOpcode() != ISD::FP_EXTEND ||
      Op2.getOpcode() != ISD::FP_EXTEND)
    return SDValue();

  // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
  // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
  // is sufficient to allow generating fdot2.
  const TargetOptions &Options = DAG.getTarget().Options;
  if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
      (N->getFlags().hasAllowContract() &&
       FMA->getFlags().hasAllowContract())) {
    Op1 = Op1.getOperand(0);
    Op2 = Op2.getOperand(0);
    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec1 = Op1.getOperand(0);
    SDValue Idx1 = Op1.getOperand(1);
    SDValue Vec2 = Op2.getOperand(0);

    SDValue FMAOp1 = FMA.getOperand(0);
    SDValue FMAOp2 = FMA.getOperand(1);
    SDValue FMAAcc = FMA.getOperand(2);

    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
        FMAOp2.getOpcode() != ISD::FP_EXTEND)
      return SDValue();

    FMAOp1 = FMAOp1.getOperand(0);
    FMAOp2 = FMAOp2.getOperand(0);
    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec3 = FMAOp1.getOperand(0);
    SDValue Vec4 = FMAOp2.getOperand(0);
    SDValue Idx2 = FMAOp1.getOperand(1);

    if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
        // Idx1 and Idx2 cannot be the same.
        Idx1 == Idx2)
      return SDValue();

    if (Vec1 == Vec2 || Vec3 == Vec4)
      return SDValue();

    if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
      return SDValue();

    if ((Vec1 == Vec3 && Vec2 == Vec4) ||
        (Vec1 == Vec4 && Vec2 == Vec3)) {
      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
                         DAG.getTargetConstant(0, SL, MVT::i1));
    }
  }
  return SDValue();
}
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = LHS.getValueType();
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();

  auto CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (!CRHS) {
    CRHS = dyn_cast<ConstantSDNode>(LHS);
    if (CRHS) {
      std::swap(LHS, RHS);
      CC = getSetCCSwappedOperands(CC);
    }
  }

  if (CRHS) {
    if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
        isBoolSGPR(LHS.getOperand(0))) {
      // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
      // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
      // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
      // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
      if ((CRHS->isAllOnesValue() &&
           (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
          (CRHS->isNullValue() &&
           (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                           DAG.getConstant(-1, SL, MVT::i1));
      if ((CRHS->isAllOnesValue() &&
           (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
          (CRHS->isNullValue() &&
           (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
        return LHS.getOperand(0);
    }

    uint64_t CRHSVal = CRHS->getZExtValue();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        LHS.getOpcode() == ISD::SELECT &&
        isa<ConstantSDNode>(LHS.getOperand(1)) &&
        isa<ConstantSDNode>(LHS.getOperand(2)) &&
        LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
        isBoolSGPR(LHS.getOperand(0))) {
      // Given CT != CF:
      // setcc (select cc, CT, CF), CF, eq => xor cc, -1
      // setcc (select cc, CT, CF), CF, ne => cc
      // setcc (select cc, CT, CF), CT, ne => xor cc, -1
      // setcc (select cc, CT, CF), CT, eq => cc
      uint64_t CT = LHS.getConstantOperandVal(1);
      uint64_t CF = LHS.getConstantOperandVal(2);

      if ((CF == CRHSVal && CC == ISD::SETEQ) ||
          (CT == CRHSVal && CC == ISD::SETNE))
        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                           DAG.getConstant(-1, SL, MVT::i1));
      if ((CF == CRHSVal && CC == ISD::SETNE) ||
          (CT == CRHSVal && CC == ISD::SETEQ))
        return LHS.getOperand(0);
    }
  }

  if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
                                           VT != MVT::f16))
    return SDValue();

  // Match isinf/isfinite pattern
  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
  // (fcmp one (fabs x), inf) -> (fp_class x,
  // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
  if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    if (!CRHS)
      return SDValue();

    const APFloat &APF = CRHS->getValueAPF();
    if (APF.isInfinity() && !APF.isNegative()) {
      const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
                                 SIInstrFlags::N_INFINITY;
      const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
                                    SIInstrFlags::P_ZERO |
                                    SIInstrFlags::N_NORMAL |
                                    SIInstrFlags::P_NORMAL |
                                    SIInstrFlags::N_SUBNORMAL |
                                    SIInstrFlags::P_SUBNORMAL;
      unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
                         DAG.getConstant(Mask, SL, MVT::i32));
    }
  }

  return SDValue();
}
SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

  SDValue Src = N->getOperand(0);
  SDValue Srl = N->getOperand(0);
  if (Srl.getOpcode() == ISD::ZERO_EXTEND)
    Srl = Srl.getOperand(0);

  // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
  if (Srl.getOpcode() == ISD::SRL) {
    // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x

    if (const ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
                               EVT(MVT::i32));

      unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
      if (SrcOffset < 32 && SrcOffset % 8 == 0) {
        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
                           MVT::f32, Srl);
      }
    }
  }

  APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);

  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
    DCI.CommitTargetLoweringOpt(TLO);
  }

  return SDValue();
}
SDValue SITargetLowering::performClampCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
  if (!CSrc)
    return SDValue();

  const MachineFunction &MF = DCI.DAG.getMachineFunction();
  const APFloat &F = CSrc->getValueAPF();
  APFloat Zero = APFloat::getZero(F.getSemantics());
  APFloat::cmpResult Cmp0 = F.compare(Zero);
  if (Cmp0 == APFloat::cmpLessThan ||
      (Cmp0 == APFloat::cmpUnordered &&
       MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
  }

  APFloat One(F.getSemantics(), "1.0");
  APFloat::cmpResult Cmp1 = F.compare(One);
  if (Cmp1 == APFloat::cmpGreaterThan)
    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));

  return SDValue(CSrc, 0);
}
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
    return SDValue();
  switch (N->getOpcode()) {
  default:
    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  case ISD::ADD:
    return performAddCombine(N, DCI);
  case ISD::SUB:
    return performSubCombine(N, DCI);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:
    return performAddCarrySubCarryCombine(N, DCI);
  case ISD::FADD:
    return performFAddCombine(N, DCI);
  case ISD::FSUB:
    return performFSubCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
    return performMinMaxCombine(N, DCI);
  case ISD::FMA:
    return performFMACombine(N, DCI);
  case ISD::LOAD: {
    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
      return Widened;
    LLVM_FALLTHROUGH;
  }
  case ISD::STORE:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
  case ISD::ATOMIC_CMP_SWAP:
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_LOAD_FADD:
  case AMDGPUISD::ATOMIC_INC:
  case AMDGPUISD::ATOMIC_DEC:
  case AMDGPUISD::ATOMIC_LOAD_FMIN:
  case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
    if (DCI.isBeforeLegalize())
      break;
    return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case ISD::XOR:
    return performXorCombine(N, DCI);
  case ISD::ZERO_EXTEND:
    return performZeroExtendCombine(N, DCI);
  case ISD::SIGN_EXTEND_INREG:
    return performSignExtendInRegCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  case AMDGPUISD::RCP:
    return performRcpCombine(N, DCI);
  case AMDGPUISD::FRACT:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::LDEXP: {
    SDValue Src = N->getOperand(0);
    if (Src.isUndef())
      return Src;
    break;
  }
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performUCharToFloatCombine(N, DCI);
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return performCvtF32UByteNCombine(N, DCI);
  case AMDGPUISD::FMED3:
    return performFMed3Combine(N, DCI);
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
    return performCvtPkRTZCombine(N, DCI);
  case AMDGPUISD::CLAMP:
    return performClampCombine(N, DCI);
  case ISD::SCALAR_TO_VECTOR: {
    SelectionDAG &DAG = DCI.DAG;
    EVT VT = N->getValueType(0);

    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    if (VT == MVT::v2i16 || VT == MVT::v2f16) {
      SDLoc SL(N);
      SDValue Src = N->getOperand(0);
      EVT EltVT = Src.getValueType();
      if (EltVT == MVT::f16)
        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);

      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
      return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
    }

    break;
  }
  case ISD::EXTRACT_VECTOR_ELT:
    return performExtractVectorEltCombine(N, DCI);
  case ISD::INSERT_VECTOR_ELT:
    return performInsertVectorEltCombine(N, DCI);
  }
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
/// Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  case AMDGPU::sub4: return 4; // Possible with TFE/LWE
  }
}
/// Adjust the writemask of MIMG instructions
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                          SelectionDAG &DAG) const {
  unsigned Opcode = Node->getMachineOpcode();

  // Subtract 1 because the vdata output is not a MachineSDNode operand.
  int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
    return Node; // not implemented for D16

  SDNode *Users[5] = { nullptr };
  unsigned Lane = 0;
  unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;
  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
  bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
                  Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
  unsigned TFCLane = 0;
  bool HasChain = Node->getNumValues() > 1;

  if (OldDmask == 0) {
    // These are folded out, but on the chance it happens don't assert.
    return Node;
  }

  unsigned OldBitsSet = countPopulation(OldDmask);
  // Work out which is the TFE/LWE lane if that is enabled.
  if (UsesTFC) {
    TFCLane = OldBitsSet;
  }

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Don't look at users of the chain.
    if (I.getUse().getResNo() != 0)
      continue;

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return Node;

    // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
    if (UsesTFC && Lane == TFCLane) {
      Users[Lane] = *I;
    } else {
      // Set which texture component corresponds to the lane.
      unsigned Comp;
      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
        Comp = countTrailingZeros(Dmask);
        Dmask &= ~(1 << Comp);
      }

      // Abort if we have more than one user per component.
      if (Users[Lane])
        return Node;

      Users[Lane] = *I;
      NewDmask |= 1 << Comp;
    }
  }

  // Don't allow 0 dmask, as hardware assumes one channel enabled.
  bool NoChannels = !NewDmask;
  if (NoChannels) {
    if (!UsesTFC) {
      // No uses of the result and not using TFC. Then do nothing.
      return Node;
    }
    // If the original dmask has one channel - then nothing to do.
    if (OldBitsSet == 1)
      return Node;
    // Use an arbitrary dmask - required for the instruction to work.
    NewDmask = 1;
  }
  // Abort if there's no change.
  if (NewDmask == OldDmask)
    return Node;

  unsigned BitsSet = countPopulation(NewDmask);

  // Check for TFE or LWE - increase the number of channels by one to account
  // for the extra return value.
  // This will need adjustment for D16 if this is also included in
  // adjustWritemask (this function), but at present D16 is excluded.
  unsigned NewChannels = BitsSet + UsesTFC;

  int NewOpcode =
      AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
  assert(NewOpcode != -1 &&
         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
         "failed to find equivalent MIMG op");

  // Adjust the writemask in the node.
  SmallVector<SDValue, 12> Ops;
  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());

  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();

  MVT ResultVT = NewChannels == 1 ?
    SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
                           NewChannels == 5 ? 8 : NewChannels);
  SDVTList NewVTList = HasChain ?
    DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);

  MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
                                              NewVTList, Ops);

  if (HasChain) {
    // Update chain.
    DAG.setNodeMemRefs(NewNode, Node->memoperands());
    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
  }

  if (NewChannels == 1) {
    assert(Node->hasNUsesOfValue(1, 0));
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
                                      SDLoc(Node), Users[Lane]->getValueType(0),
                                      SDValue(NewNode, 0));
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return nullptr;
  }

  // Update the users of the node with the new indices.
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
    SDNode *User = Users[i];
    if (!User) {
      // Handle the special case of NoChannels. We set NewDmask to 1 above, but
      // Users[0] is still nullptr because channel 0 doesn't really have a use.
      if (i || !NoChannels)
        continue;
    } else {
      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
      DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
    }

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
    }
  }

  DAG.RemoveDeadNode(Node);
  return nullptr;
}
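
// Frame-index operands may reach this point wrapped in an AssertZext produced
// by earlier lowering, so look through it before testing for a
// FrameIndexSDNode.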
static bool isFrameIndexOp(SDValue Op) {
  if (Op.getOpcode() == ISD::AssertZext)
    Op = Op.getOperand(0);

  return isa<FrameIndexSDNode>(Op);
}
/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                        SelectionDAG &DAG) const {
  if (Node->getOpcode() == ISD::CopyToReg) {
    RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
    SDValue SrcVal = Node->getOperand(2);

    // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    // to try understanding copies to physical registers.
    if (SrcVal.getValueType() == MVT::i1 &&
        Register::isPhysicalRegister(DestReg->getReg())) {
      SDLoc SL(Node);
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      SDValue VReg = DAG.getRegister(
        MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);

      SDNode *Glued = Node->getGluedNode();
      SDValue ToVReg
        = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
                           SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
      SDValue ToResultReg
        = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
                           VReg, ToVReg.getValue(1));
      DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
      DAG.RemoveDeadNode(Node);
      return ToResultReg.getNode();
    }
  }

  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                             Node->getOperand(i).getValueType(),
                                             Node->getOperand(i)), 0));
  }

  return DAG.UpdateNodeOperands(Node, Ops);
}
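
// PostISelFolding is the catch-all post-selection fixup for machine SDNodes:
// it shrinks the dmask of non-store, non-gather4 MIMG nodes (adjustWritemask),
// legalizes frame-index operands of INSERT_SUBREG/REG_SEQUENCE, forces the
// undef sources of V_DIV_SCALE_F32/F64 onto a single register to satisfy the
// operand constraint, and substitutes an IMPLICIT_DEF for the tied vdst_in
// input of V_PERMLANE16_B32/V_PERMLANEX16_B32 when either of its immediate
// mode bits is set.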
/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  unsigned Opcode = Node->getMachineOpcode();

  if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
      !TII->isGather4(Opcode)) {
    return adjustWritemask(Node, DAG);
  }

  if (Opcode == AMDGPU::INSERT_SUBREG ||
      Opcode == AMDGPU::REG_SEQUENCE) {
    legalizeTargetIndependentNode(Node, DAG);
    return Node;
  }

  switch (Opcode) {
  case AMDGPU::V_DIV_SCALE_F32:
  case AMDGPU::V_DIV_SCALE_F64: {
    // Satisfy the operand register constraint when one of the inputs is
    // undefined. Ordinarily each undef value will have its own implicit_def of
    // a vreg, so force these to use a single register.
    SDValue Src0 = Node->getOperand(0);
    SDValue Src1 = Node->getOperand(1);
    SDValue Src2 = Node->getOperand(2);

    if ((Src0.isMachineOpcode() &&
         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
        (Src0 == Src1 || Src0 == Src2))
      break;

    MVT VT = Src0.getValueType().getSimpleVT();
    const TargetRegisterClass *RC =
        getRegClassFor(VT, Src0.getNode()->isDivergent());

    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);

    SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
                                      UndefReg, Src0, SDValue());

    // src0 must be the same register as src1 or src2, even if the value is
    // undefined, so make sure we don't violate this constraint.
    if (Src0.isMachineOpcode() &&
        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
      if (Src1.isMachineOpcode() &&
          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src1;
      else if (Src2.isMachineOpcode() &&
               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src2;
      else {
        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
        Src0 = UndefReg;
        Src1 = UndefReg;
      }
    } else
      break;

    SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
    for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
      Ops.push_back(Node->getOperand(I));

    Ops.push_back(ImpDef.getValue(1));
    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
  }
  case AMDGPU::V_PERMLANE16_B32:
  case AMDGPU::V_PERMLANEX16_B32: {
    ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0));
    ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2));
    if (!FI->getZExtValue() && !BC->getZExtValue())
      break;
    SDValue VDstIn = Node->getOperand(6);
    if (VDstIn.isMachineOpcode()
        && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF)
      break;
    MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                               SDLoc(Node), MVT::i32);
    SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1),
                                    SDValue(BC, 0), Node->getOperand(3),
                                    Node->getOperand(4), Node->getOperand(5),
                                    SDValue(ImpDef, 0), Node->getOperand(7) };
    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
  }
  default:
    break;
  }

  return Node;
}
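
// AdjustInstrPostInstrSelection runs on the emitted MachineInstr (for opcodes
// marked hasPostISelHook). At this point operands are registers, so this is
// where constant-bus restrictions for VOP3, the AGPR-vs-VGPR choice for MAI
// source operands, and the switch to no-return atomic opcodes are handled.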
/// Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  if (TII->isVOP3(MI.getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);

    // Prefer VGPRs over AGPRs in mAI instructions where possible.
    // This saves a chain-copy of registers and better balances register
    // use between vgpr and agpr as agpr tuples tend to be big.
    if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) {
      unsigned Opc = MI.getOpcode();
      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
                      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
        if (I == -1)
          break;
        MachineOperand &Op = MI.getOperand(I);
        if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
             OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
            !Register::isVirtualRegister(Op.getReg()) ||
            !TRI->isAGPR(MRI, Op.getReg()))
          continue;
        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
        if (!Src || !Src->isCopy() ||
            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
          continue;
        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
        // All uses of agpr64 and agpr32 can also accept vgpr except for
        // v_accvgpr_read, but we do not produce agpr reads during selection,
        // so no use checks are needed.
        MRI.setRegClass(Op.getReg(), NewRC);
      }
    }

    return;
  }

  // Replace unused atomics with the no return version.
  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
  if (NoRetAtomicOp != -1) {
    if (!Node->hasAnyUseOfValue(0)) {
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);
      return;
    }

    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
    // instruction, because the return type of these instructions is a vec2 of
    // the memory type, so it can be tied to the input operand.
    // This means these instructions always have a use, so we need to add a
    // special case to check if the atomic has only one extract_subreg use,
    // which itself has no uses.
    if ((Node->hasNUsesOfValue(1, 0) &&
         Node->use_begin()->isMachineOpcode() &&
         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
         !Node->use_begin()->hasAnyUseOfValue(0))) {
      Register Def = MI.getOperand(0).getReg();

      // Change this into a noret atomic.
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);

      // If we only remove the def operand from the atomic instruction, the
      // extract_subreg will be left with a use of a vreg without a def.
      // So we need to insert an implicit_def to avoid machine verifier
      // errors.
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(AMDGPU::IMPLICIT_DEF), Def);
    }
    return;
  }
}
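
// The helpers below construct 128-bit buffer resource descriptors as
// REG_SEQUENCE nodes: S_MOV_B32 materializes each constant dword, and the
// 64-bit base pointer occupies the low two dwords.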
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
                              uint64_t Val) {
  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}
MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  const SDValue Ops0[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
    buildSMovImm32(DAG, DL, 0),
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
  };

  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::v2i32, Ops0), 0);

  // Combine the constants and the pointer.
  const SDValue Ops1[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    Ptr,
    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
    SubRegHi,
    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
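
// In the descriptor built below, dword0 is the low half of the pointer,
// dword1 is the high half OR'd with RsrcDword1, and dword2/dword3 are the low
// and high halves of RsrcDword2And3.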
/// Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
                                           SDValue Ptr, uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
  if (RsrcDword1) {
    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
                                       DAG.getConstant(RsrcDword1, DL, MVT::i32)),
                    0);
  }

  SDValue DataLo = buildSMovImm32(DAG, DL,
                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

  const SDValue Ops[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    PtrLo,
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    PtrHi,
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
    DataLo,
    DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
    DataHi,
    DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
//===----------------------------------------------------------------------===//
// SI Inline Assembly Support
//===----------------------------------------------------------------------===//
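
// Constraint letters: 's' (or 'r') selects an SGPR class, 'v' a VGPR class,
// and 'a' an AGPR class (only when the subtarget has MAI instructions); the
// class is chosen from the value's bit width. Longer constraints whose second
// character is 'v', 's' or 'a' followed by a decimal index name a specific
// register of the corresponding 32-bit class.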
std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                               StringRef Constraint,
                                               MVT VT) const {
  const TargetRegisterClass *RC = nullptr;
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's':
    case 'r':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::SReg_32_XM0RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      case 96:
        RC = &AMDGPU::SReg_96RegClass;
        break;
      case 128:
        RC = &AMDGPU::SReg_128RegClass;
        break;
      case 160:
        RC = &AMDGPU::SReg_160RegClass;
        break;
      case 256:
        RC = &AMDGPU::SReg_256RegClass;
        break;
      case 512:
        RC = &AMDGPU::SReg_512RegClass;
        break;
      }
      break;
    case 'v':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::VGPR_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::VReg_64RegClass;
        break;
      case 96:
        RC = &AMDGPU::VReg_96RegClass;
        break;
      case 128:
        RC = &AMDGPU::VReg_128RegClass;
        break;
      case 160:
        RC = &AMDGPU::VReg_160RegClass;
        break;
      case 256:
        RC = &AMDGPU::VReg_256RegClass;
        break;
      case 512:
        RC = &AMDGPU::VReg_512RegClass;
        break;
      }
      break;
    case 'a':
      if (!Subtarget->hasMAIInsts())
        break;
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::AGPR_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::AReg_64RegClass;
        break;
      case 128:
        RC = &AMDGPU::AReg_128RegClass;
        break;
      case 512:
        RC = &AMDGPU::AReg_512RegClass;
        break;
      case 1024:
        RC = &AMDGPU::AReg_1024RegClass;
        // v32 types are not legal but we support them here.
        return std::make_pair(0U, RC);
      }
      break;
    }
    // We actually support i128, i16 and f16 as inline parameters
    // even if they are not reported as legal.
    if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
               VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
      return std::make_pair(0U, RC);
  }

  if (Constraint.size() > 1) {
    if (Constraint[1] == 'v') {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (Constraint[1] == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    } else if (Constraint[1] == 'a') {
      RC = &AMDGPU::AGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
      if (!Failed && Idx < RC->getNumRegs())
        return std::make_pair(RC->getRegister(Idx), RC);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 's':
    case 'v':
    case 'a':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
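
// Note: during selection the stack pointer, frame pointer, scratch rsrc and
// scratch wave offset are referenced through the placeholder registers
// SP_REG, FP_REG, PRIVATE_RSRC_REG and SCRATCH_WAVE_OFFSET_REG; they are
// rewritten below to the registers actually chosen in SIMachineFunctionInfo.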
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
  }

  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                             Info->getStackPtrOffsetReg()));
  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());

  // We need to worry about replacing the default register with itself in case
  // of MIR testcases missing the MFI.
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());

  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());

  if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
    MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
                       Info->getScratchWaveOffsetReg());
  }

  Info->limitOccupancy(MF);

  if (ST.isWave32() && !MF.empty()) {
    // Add a VCC_HI def because many instructions are marked as imp-use of VCC,
    // where we may only define VCC_LO. If nothing defines VCC_HI we may end up
    // having a use of undef.

    const SIInstrInfo *TII = ST.getInstrInfo();
    DebugLoc DL;

    MachineBasicBlock &MBB = MF.front();
    MachineBasicBlock::iterator I = MBB.getFirstNonDebugInstr();
    BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), AMDGPU::VCC_HI);

    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);
      }
    }
  }

  TargetLoweringBase::finalizeLowering(MF);
}
void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
                                                     KnownBits &Known,
                                                     const APInt &DemandedElts,
                                                     const SelectionDAG &DAG,
                                                     unsigned Depth) const {
  TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
                                                DAG, Depth);

  // Set the high bits to zero based on the maximum allowed scratch size per
  // wave. We can't use vaddr in MUBUF instructions if we don't know the address
  // calculation won't overflow, so assume the sign bit is never set.
  Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
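
// The alignments below are expressed as log2 values: CacheLineAlign == 6
// requests 64-byte alignment of the loop header on GFX10, matching the
// instruction cache line size.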
unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
  const unsigned CacheLineAlign = 6; // log2(64)

  // Pre-GFX10 targets did not benefit from loop alignment.
  if (!ML || DisableLoopAlignment ||
      (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 I$ is 4 x 64 bytes cache lines.
  // By default prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
  // behind and one ahead.
  // Therefore we can benefit from aligning loop headers if the loop fits in
  // 192 bytes.
  // If the loop fits in 64 bytes it always spans no more than two cache lines
  // and does not need an alignment.
  // Else if the loop is less than or equal to 128 bytes we do not need to
  // modify the prefetch.
  // Else if the loop is less than or equal to 192 bytes we need two lines
  // behind.

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If an inner loop block is aligned assume on average half of the
    // alignment size to be added as nops.
    if (MBB != Header)
      LoopSize += (1 << MBB->getAlignment()) / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of the parent loops is surrounded by prefetch instructions do not
  // insert new ones for the inner loop, which would reset the parent's
  // settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  if (Pre && Exit) {
    BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
            TII->get(AMDGPU::S_INST_PREFETCH))
      .addImm(1); // prefetch 2 lines behind PC

    BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
            TII->get(AMDGPU::S_INST_PREFETCH))
      .addImm(2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}
LLVM_ATTRIBUTE_UNUSED
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
  assert(N->getOpcode() == ISD::CopyFromReg);
  do {
    // Follow the chain until we find an INLINEASM node.
    N = N->getOperand(0).getNode();
    if (N->getOpcode() == ISD::INLINEASM ||
        N->getOpcode() == ISD::INLINEASM_BR)
      return true;
  } while (N->getOpcode() == ISD::CopyFromReg);
  return false;
}
bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
                                                  FunctionLoweringInfo *FLI,
                                                  LegacyDivergenceAnalysis *KDA) const {
  switch (N->getOpcode()) {
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
    const MachineFunction *MF = FLI->MF;
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const MachineRegisterInfo &MRI = MF->getRegInfo();
    const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
    unsigned Reg = R->getReg();
    if (Register::isPhysicalRegister(Reg))
      return !TRI.isSGPRReg(MRI, Reg);

    if (MRI.isLiveIn(Reg)) {
      // workitem.id.x workitem.id.y workitem.id.z
      // Any VGPR formal argument is also considered divergent.
      if (!TRI.isSGPRReg(MRI, Reg))
        return true;
      // Formal arguments of non-entry functions
      // are conservatively considered divergent.
      else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
        return true;
      return false;
    }
    const Value *V = FLI->getValueFromVirtualReg(Reg);
    if (V)
      return KDA->isDivergent(V);
    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
    return !TRI.isSGPRReg(MRI, Reg);
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  }
  case ISD::CALLSEQ_END:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(
        cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
  case ISD::INTRINSIC_W_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(
        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
  // In some cases intrinsics that are a source of divergence have been
  // lowered to AMDGPUISD, so we also need to check those too.
  case AMDGPUISD::INTERP_MOV:
  case AMDGPUISD::INTERP_P1:
  case AMDGPUISD::INTERP_P2:
    return true;
  }
  return false;
}
bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
  case MVT::f32:
    return Subtarget->hasFP32Denormals();
  case MVT::f64:
    return Subtarget->hasFP64Denormals();
  case MVT::f16:
    return Subtarget->hasFP16Denormals();
  default:
    return false;
  }
}
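
// With DX10Clamp enabled the clamp operation maps NaN inputs to 0.0, so a
// CLAMP node can never produce a NaN; otherwise NaN-ness follows its operand.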
bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                    const SelectionDAG &DAG,
                                                    bool SNaN,
                                                    unsigned Depth) const {
  if (Op.getOpcode() == AMDGPUISD::CLAMP) {
    const MachineFunction &MF = DAG.getMachineFunction();
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

    if (Info->getMode().DX10Clamp)
      return true; // Clamped to 0.
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }

  return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
                                                            SNaN, Depth);
}
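
// Only floating-point add is special-cased here: f16 is left alone, non-f32
// types are expanded to a compare-and-swap loop, and f32 is kept as a native
// atomic only for LDS when the subtarget supports LDS FP atomics.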
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::FAdd: {
    Type *Ty = RMW->getType();

    // We don't have a way to support 16-bit atomics now, so just leave them
    // as-is.
    if (Ty->isHalfTy())
      return AtomicExpansionKind::None;

    if (!Ty->isFloatTy())
      return AtomicExpansionKind::CmpXChg;

    // TODO: Do have these for flat. Older targets also had them for buffers.
    unsigned AS = RMW->getPointerAddressSpace();
    return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
      AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
  }
  default:
    break;
  }

  return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);