1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// Custom DAG lowering for SI
12 //===----------------------------------------------------------------------===//
14 #if defined(_MSC_VER) || defined(__MINGW32__)
16 #define _USE_MATH_DEFINES
19 #include "SIISelLowering.h"
21 #include "AMDGPUSubtarget.h"
22 #include "AMDGPUTargetMachine.h"
23 #include "SIDefines.h"
24 #include "SIInstrInfo.h"
25 #include "SIMachineFunctionInfo.h"
26 #include "SIRegisterInfo.h"
27 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
28 #include "Utils/AMDGPUBaseInfo.h"
29 #include "llvm/ADT/APFloat.h"
30 #include "llvm/ADT/APInt.h"
31 #include "llvm/ADT/ArrayRef.h"
32 #include "llvm/ADT/BitVector.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
35 #include "llvm/ADT/StringRef.h"
36 #include "llvm/ADT/StringSwitch.h"
37 #include "llvm/ADT/Twine.h"
38 #include "llvm/CodeGen/Analysis.h"
39 #include "llvm/CodeGen/CallingConvLower.h"
40 #include "llvm/CodeGen/DAGCombine.h"
41 #include "llvm/CodeGen/ISDOpcodes.h"
42 #include "llvm/CodeGen/MachineBasicBlock.h"
43 #include "llvm/CodeGen/MachineFrameInfo.h"
44 #include "llvm/CodeGen/MachineFunction.h"
45 #include "llvm/CodeGen/MachineInstr.h"
46 #include "llvm/CodeGen/MachineInstrBuilder.h"
47 #include "llvm/CodeGen/MachineMemOperand.h"
48 #include "llvm/CodeGen/MachineModuleInfo.h"
49 #include "llvm/CodeGen/MachineOperand.h"
50 #include "llvm/CodeGen/MachineRegisterInfo.h"
51 #include "llvm/CodeGen/SelectionDAG.h"
52 #include "llvm/CodeGen/SelectionDAGNodes.h"
53 #include "llvm/CodeGen/TargetCallingConv.h"
54 #include "llvm/CodeGen/TargetRegisterInfo.h"
55 #include "llvm/CodeGen/ValueTypes.h"
56 #include "llvm/IR/Constants.h"
57 #include "llvm/IR/DataLayout.h"
58 #include "llvm/IR/DebugLoc.h"
59 #include "llvm/IR/DerivedTypes.h"
60 #include "llvm/IR/DiagnosticInfo.h"
61 #include "llvm/IR/Function.h"
62 #include "llvm/IR/GlobalValue.h"
63 #include "llvm/IR/InstrTypes.h"
64 #include "llvm/IR/Instruction.h"
65 #include "llvm/IR/Instructions.h"
66 #include "llvm/IR/IntrinsicInst.h"
67 #include "llvm/IR/Type.h"
68 #include "llvm/Support/Casting.h"
69 #include "llvm/Support/CodeGen.h"
70 #include "llvm/Support/CommandLine.h"
71 #include "llvm/Support/Compiler.h"
72 #include "llvm/Support/ErrorHandling.h"
73 #include "llvm/Support/KnownBits.h"
74 #include "llvm/Support/MachineValueType.h"
75 #include "llvm/Support/MathExtras.h"
76 #include "llvm/Target/TargetOptions.h"
87 #define DEBUG_TYPE "si-lower"
// Statistic counter NumTailCalls, reported under DEBUG_TYPE with the
// description "Number of tail calls".
89 STATISTIC(NumTailCalls
, "Number of tail calls");
// Boolean command-line option "amdgpu-vgpr-index-mode": selects GPR indexing
// mode instead of movrel for vector indexing.
// NOTE(review): the option's trailing argument(s) (embedded line 94) are not
// visible in this view, so its default value cannot be stated from here.
91 static cl::opt
<bool> EnableVGPRIndexMode(
92 "amdgpu-vgpr-index-mode",
93 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
// Boolean command-line option "amdgpu-disable-loop-alignment": turns off loop
// alignment and prefetching.
// NOTE(review): the trailing argument(s) (embedded line 99) are not visible
// here either — confirm the default against the full file.
96 static cl::opt
<bool> DisableLoopAlignment(
97 "amdgpu-disable-loop-alignment",
98 cl::desc("Do not align and prefetch loops"),
101 static unsigned findFirstFreeSGPR(CCState
&CCInfo
) {
102 unsigned NumSGPRs
= AMDGPU::SGPR_32RegClass
.getNumRegs();
103 for (unsigned Reg
= 0; Reg
< NumSGPRs
; ++Reg
) {
104 if (!CCInfo
.isAllocated(AMDGPU::SGPR0
+ Reg
)) {
105 return AMDGPU::SGPR0
+ Reg
;
108 llvm_unreachable("Cannot allocate sgpr");
// Constructs the SI/GCN target lowering object. The body registers the
// register class used for each value type, then records per-(opcode, type)
// legalization actions (Legal / Custom / Promote / Expand), truncating-store
// actions, the set of opcodes passed to setTargetDAGCombine, and finally the
// scheduling preference. Several groups are gated on Subtarget feature
// queries (has16BitInsts, hasVOP3PInsts, hasMAIInsts, hasBFI, ...).
//
// Later calls for the same (opcode, type) pair appear after earlier ones, so
// statement order matters when entries are repeated.
//
// NOTE(review): the embedded line numbers skip in places (e.g. 114, 240, 243,
// 251-253, 416, 658), so some lines of this definition (initializer tail,
// closing braces, switch/else lines, possibly preprocessor guards) are not
// visible in this view — confirm against the full file before changing code.
111 SITargetLowering::SITargetLowering(const TargetMachine
&TM
,
112 const GCNSubtarget
&STI
)
113 : AMDGPUTargetLowering(TM
, STI
),
// Register classes backing each value type.
115 addRegisterClass(MVT::i1
, &AMDGPU::VReg_1RegClass
);
116 addRegisterClass(MVT::i64
, &AMDGPU::SReg_64RegClass
);
118 addRegisterClass(MVT::i32
, &AMDGPU::SReg_32_XM0RegClass
);
119 addRegisterClass(MVT::f32
, &AMDGPU::VGPR_32RegClass
);
121 addRegisterClass(MVT::f64
, &AMDGPU::VReg_64RegClass
);
122 addRegisterClass(MVT::v2i32
, &AMDGPU::SReg_64RegClass
);
123 addRegisterClass(MVT::v2f32
, &AMDGPU::VReg_64RegClass
);
125 addRegisterClass(MVT::v3i32
, &AMDGPU::SGPR_96RegClass
);
126 addRegisterClass(MVT::v3f32
, &AMDGPU::VReg_96RegClass
);
128 addRegisterClass(MVT::v2i64
, &AMDGPU::SReg_128RegClass
);
129 addRegisterClass(MVT::v2f64
, &AMDGPU::SReg_128RegClass
);
131 addRegisterClass(MVT::v4i32
, &AMDGPU::SReg_128RegClass
);
132 addRegisterClass(MVT::v4f32
, &AMDGPU::VReg_128RegClass
);
134 addRegisterClass(MVT::v5i32
, &AMDGPU::SGPR_160RegClass
);
135 addRegisterClass(MVT::v5f32
, &AMDGPU::VReg_160RegClass
);
137 addRegisterClass(MVT::v8i32
, &AMDGPU::SReg_256RegClass
);
138 addRegisterClass(MVT::v8f32
, &AMDGPU::VReg_256RegClass
);
140 addRegisterClass(MVT::v16i32
, &AMDGPU::SReg_512RegClass
);
141 addRegisterClass(MVT::v16f32
, &AMDGPU::VReg_512RegClass
);
// 16-bit scalar and packed types are only registered when the subtarget has
// 16-bit instructions.
143 if (Subtarget
->has16BitInsts()) {
144 addRegisterClass(MVT::i16
, &AMDGPU::SReg_32_XM0RegClass
);
145 addRegisterClass(MVT::f16
, &AMDGPU::SReg_32_XM0RegClass
);
147 // Unless there are also VOP3P operations, no operations are really legal.
148 addRegisterClass(MVT::v2i16
, &AMDGPU::SReg_32_XM0RegClass
);
149 addRegisterClass(MVT::v2f16
, &AMDGPU::SReg_32_XM0RegClass
);
150 addRegisterClass(MVT::v4i16
, &AMDGPU::SReg_64RegClass
);
151 addRegisterClass(MVT::v4f16
, &AMDGPU::SReg_64RegClass
);
// 32-element vectors are only registered when the subtarget has MAI
// instructions.
154 if (Subtarget
->hasMAIInsts()) {
155 addRegisterClass(MVT::v32i32
, &AMDGPU::VReg_1024RegClass
);
156 addRegisterClass(MVT::v32f32
, &AMDGPU::VReg_1024RegClass
);
159 computeRegisterProperties(Subtarget
->getRegisterInfo());
161 // Custom lower loads and stores of these vector types (and of i1). The
// original note gave vector stores from local memory as the reason.
162 setOperationAction(ISD::LOAD
, MVT::v2i32
, Custom
);
163 setOperationAction(ISD::LOAD
, MVT::v3i32
, Custom
);
164 setOperationAction(ISD::LOAD
, MVT::v4i32
, Custom
);
165 setOperationAction(ISD::LOAD
, MVT::v5i32
, Custom
);
166 setOperationAction(ISD::LOAD
, MVT::v8i32
, Custom
);
167 setOperationAction(ISD::LOAD
, MVT::v16i32
, Custom
);
168 setOperationAction(ISD::LOAD
, MVT::i1
, Custom
);
169 setOperationAction(ISD::LOAD
, MVT::v32i32
, Custom
);
171 setOperationAction(ISD::STORE
, MVT::v2i32
, Custom
);
172 setOperationAction(ISD::STORE
, MVT::v3i32
, Custom
);
173 setOperationAction(ISD::STORE
, MVT::v4i32
, Custom
);
174 setOperationAction(ISD::STORE
, MVT::v5i32
, Custom
);
175 setOperationAction(ISD::STORE
, MVT::v8i32
, Custom
);
176 setOperationAction(ISD::STORE
, MVT::v16i32
, Custom
);
177 setOperationAction(ISD::STORE
, MVT::i1
, Custom
);
178 setOperationAction(ISD::STORE
, MVT::v32i32
, Custom
);
// Truncating stores from i32-element vectors to i16/i8-element vectors are
// expanded.
180 setTruncStoreAction(MVT::v2i32
, MVT::v2i16
, Expand
);
181 setTruncStoreAction(MVT::v3i32
, MVT::v3i16
, Expand
);
182 setTruncStoreAction(MVT::v4i32
, MVT::v4i16
, Expand
);
183 setTruncStoreAction(MVT::v8i32
, MVT::v8i16
, Expand
);
184 setTruncStoreAction(MVT::v16i32
, MVT::v16i16
, Expand
);
185 setTruncStoreAction(MVT::v32i32
, MVT::v32i16
, Expand
);
186 setTruncStoreAction(MVT::v2i32
, MVT::v2i8
, Expand
);
187 setTruncStoreAction(MVT::v4i32
, MVT::v4i8
, Expand
);
188 setTruncStoreAction(MVT::v8i32
, MVT::v8i8
, Expand
);
189 setTruncStoreAction(MVT::v16i32
, MVT::v16i8
, Expand
);
190 setTruncStoreAction(MVT::v32i32
, MVT::v32i8
, Expand
);
192 setOperationAction(ISD::GlobalAddress
, MVT::i32
, Custom
);
193 setOperationAction(ISD::GlobalAddress
, MVT::i64
, Custom
);
// Select / compare / branch actions for scalar types. f64 SELECT is promoted
// to i64 and i1 SETCC is promoted to i32.
195 setOperationAction(ISD::SELECT
, MVT::i1
, Promote
);
196 setOperationAction(ISD::SELECT
, MVT::i64
, Custom
);
197 setOperationAction(ISD::SELECT
, MVT::f64
, Promote
);
198 AddPromotedToType(ISD::SELECT
, MVT::f64
, MVT::i64
);
200 setOperationAction(ISD::SELECT_CC
, MVT::f32
, Expand
);
201 setOperationAction(ISD::SELECT_CC
, MVT::i32
, Expand
);
202 setOperationAction(ISD::SELECT_CC
, MVT::i64
, Expand
);
203 setOperationAction(ISD::SELECT_CC
, MVT::f64
, Expand
);
204 setOperationAction(ISD::SELECT_CC
, MVT::i1
, Expand
);
206 setOperationAction(ISD::SETCC
, MVT::i1
, Promote
);
207 setOperationAction(ISD::SETCC
, MVT::v2i1
, Expand
);
208 setOperationAction(ISD::SETCC
, MVT::v4i1
, Expand
);
209 AddPromotedToType(ISD::SETCC
, MVT::i1
, MVT::i32
);
211 setOperationAction(ISD::TRUNCATE
, MVT::v2i32
, Expand
);
212 setOperationAction(ISD::FP_ROUND
, MVT::v2f32
, Expand
);
214 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v2i1
, Custom
);
215 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v4i1
, Custom
);
216 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v2i8
, Custom
);
217 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v4i8
, Custom
);
218 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v2i16
, Custom
);
219 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v3i16
, Custom
);
220 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v4i16
, Custom
);
221 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::Other
, Custom
);
223 setOperationAction(ISD::BRCOND
, MVT::Other
, Custom
);
224 setOperationAction(ISD::BR_CC
, MVT::i1
, Expand
);
225 setOperationAction(ISD::BR_CC
, MVT::i32
, Expand
);
226 setOperationAction(ISD::BR_CC
, MVT::i64
, Expand
);
227 setOperationAction(ISD::BR_CC
, MVT::f32
, Expand
);
228 setOperationAction(ISD::BR_CC
, MVT::f64
, Expand
);
// 32-bit overflow and carry arithmetic is legal.
230 setOperationAction(ISD::UADDO
, MVT::i32
, Legal
);
231 setOperationAction(ISD::USUBO
, MVT::i32
, Legal
);
233 setOperationAction(ISD::ADDCARRY
, MVT::i32
, Legal
);
234 setOperationAction(ISD::SUBCARRY
, MVT::i32
, Legal
);
236 setOperationAction(ISD::SHL_PARTS
, MVT::i64
, Expand
);
237 setOperationAction(ISD::SRA_PARTS
, MVT::i64
, Expand
);
238 setOperationAction(ISD::SRL_PARTS
, MVT::i64
, Expand
);
// NOTE(review): the embedded numbering skips 240 and 243 immediately around
// the next two calls; a guard (e.g. a preprocessor conditional) may be
// missing from this view — confirm whether these are actually compiled in.
241 setOperationAction(ISD::ADDCARRY
, MVT::i64
, Legal
);
242 setOperationAction(ISD::SUBCARRY
, MVT::i64
, Legal
);
245 // We only support LOAD/STORE and vector manipulation ops for vectors
246 // with > 4 elements.
247 for (MVT VT
: { MVT::v8i32
, MVT::v8f32
, MVT::v16i32
, MVT::v16f32
,
248 MVT::v2i64
, MVT::v2f64
, MVT::v4i16
, MVT::v4f16
,
249 MVT::v32i32
, MVT::v32f32
}) {
250 for (unsigned Op
= 0; Op
< ISD::BUILTIN_OP_END
; ++Op
) {
// NOTE(review): the switch header, some case labels and the break/default
// lines (embedded 251-253, 255, 261, 264-265, 267+) are not visible here, so
// which labels share the Custom action cannot be determined from this view.
254 case ISD::BUILD_VECTOR
:
256 case ISD::EXTRACT_VECTOR_ELT
:
257 case ISD::INSERT_VECTOR_ELT
:
258 case ISD::INSERT_SUBVECTOR
:
259 case ISD::EXTRACT_SUBVECTOR
:
260 case ISD::SCALAR_TO_VECTOR
:
262 case ISD::CONCAT_VECTORS
:
263 setOperationAction(Op
, VT
, Custom
);
266 setOperationAction(Op
, VT
, Expand
);
272 setOperationAction(ISD::FP_EXTEND
, MVT::v4f32
, Expand
);
274 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
275 // is expanded to avoid having two separate loops in case the index is a VGPR.
277 // Most operations are naturally 32-bit vector operations. We only support
278 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
279 for (MVT Vec64
: { MVT::v2i64
, MVT::v2f64
}) {
280 setOperationAction(ISD::BUILD_VECTOR
, Vec64
, Promote
);
281 AddPromotedToType(ISD::BUILD_VECTOR
, Vec64
, MVT::v4i32
);
283 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, Vec64
, Promote
);
284 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT
, Vec64
, MVT::v4i32
);
286 setOperationAction(ISD::INSERT_VECTOR_ELT
, Vec64
, Promote
);
287 AddPromotedToType(ISD::INSERT_VECTOR_ELT
, Vec64
, MVT::v4i32
);
289 setOperationAction(ISD::SCALAR_TO_VECTOR
, Vec64
, Promote
);
290 AddPromotedToType(ISD::SCALAR_TO_VECTOR
, Vec64
, MVT::v4i32
);
293 setOperationAction(ISD::VECTOR_SHUFFLE
, MVT::v8i32
, Expand
);
294 setOperationAction(ISD::VECTOR_SHUFFLE
, MVT::v8f32
, Expand
);
295 setOperationAction(ISD::VECTOR_SHUFFLE
, MVT::v16i32
, Expand
);
296 setOperationAction(ISD::VECTOR_SHUFFLE
, MVT::v16f32
, Expand
);
298 setOperationAction(ISD::BUILD_VECTOR
, MVT::v4f16
, Custom
);
299 setOperationAction(ISD::BUILD_VECTOR
, MVT::v4i16
, Custom
);
301 // Avoid stack access for these.
302 // TODO: Generalize to more vector types.
303 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v2i16
, Custom
);
304 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v2f16
, Custom
);
305 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v4i16
, Custom
);
306 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v4f16
, Custom
);
308 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v2i16
, Custom
);
309 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v2f16
, Custom
);
310 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v2i8
, Custom
);
311 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v4i8
, Custom
);
312 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v8i8
, Custom
);
314 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v2i8
, Custom
);
315 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v4i8
, Custom
);
316 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v8i8
, Custom
);
// The v4i16/v4f16 INSERT_VECTOR_ELT entries below repeat the ones set a few
// statements earlier with the same action.
318 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v4i16
, Custom
);
319 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v4f16
, Custom
);
320 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v4i16
, Custom
);
321 setOperationAction(ISD::INSERT_VECTOR_ELT
, MVT::v4f16
, Custom
);
323 // Deal with vec3 vector operations when widened to vec4.
324 setOperationAction(ISD::INSERT_SUBVECTOR
, MVT::v3i32
, Custom
);
325 setOperationAction(ISD::INSERT_SUBVECTOR
, MVT::v3f32
, Custom
);
326 setOperationAction(ISD::INSERT_SUBVECTOR
, MVT::v4i32
, Custom
);
327 setOperationAction(ISD::INSERT_SUBVECTOR
, MVT::v4f32
, Custom
);
329 // Deal with vec5 vector operations when widened to vec8.
330 setOperationAction(ISD::INSERT_SUBVECTOR
, MVT::v5i32
, Custom
);
331 setOperationAction(ISD::INSERT_SUBVECTOR
, MVT::v5f32
, Custom
);
332 setOperationAction(ISD::INSERT_SUBVECTOR
, MVT::v8i32
, Custom
);
333 setOperationAction(ISD::INSERT_SUBVECTOR
, MVT::v8f32
, Custom
);
335 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
336 // and output demarshalling
337 setOperationAction(ISD::ATOMIC_CMP_SWAP
, MVT::i32
, Custom
);
338 setOperationAction(ISD::ATOMIC_CMP_SWAP
, MVT::i64
, Custom
);
340 // We can't return success/failure, only the old value,
341 // let LLVM add the comparison
342 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS
, MVT::i32
, Expand
);
343 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS
, MVT::i64
, Expand
);
345 if (Subtarget
->hasFlatAddressSpace()) {
346 setOperationAction(ISD::ADDRSPACECAST
, MVT::i32
, Custom
);
347 setOperationAction(ISD::ADDRSPACECAST
, MVT::i64
, Custom
);
350 setOperationAction(ISD::BSWAP
, MVT::i32
, Legal
);
351 setOperationAction(ISD::BITREVERSE
, MVT::i32
, Legal
);
353 // On SI this is s_memtime and s_memrealtime on VI.
354 setOperationAction(ISD::READCYCLECOUNTER
, MVT::i64
, Legal
);
355 setOperationAction(ISD::TRAP
, MVT::Other
, Custom
);
356 setOperationAction(ISD::DEBUGTRAP
, MVT::Other
, Custom
);
358 if (Subtarget
->has16BitInsts()) {
359 setOperationAction(ISD::FLOG
, MVT::f16
, Custom
);
360 setOperationAction(ISD::FEXP
, MVT::f16
, Custom
);
361 setOperationAction(ISD::FLOG10
, MVT::f16
, Custom
);
364 // v_mad_f32 does not support denormals according to some sources.
365 if (!Subtarget
->hasFP32Denormals())
366 setOperationAction(ISD::FMAD
, MVT::f32
, Legal
);
368 if (!Subtarget
->hasBFI()) {
369 // fcopysign can be done in a single instruction with BFI.
370 setOperationAction(ISD::FCOPYSIGN
, MVT::f32
, Expand
);
371 setOperationAction(ISD::FCOPYSIGN
, MVT::f64
, Expand
);
374 if (!Subtarget
->hasBCNT(32))
375 setOperationAction(ISD::CTPOP
, MVT::i32
, Expand
);
377 if (!Subtarget
->hasBCNT(64))
378 setOperationAction(ISD::CTPOP
, MVT::i64
, Expand
);
380 if (Subtarget
->hasFFBH())
381 setOperationAction(ISD::CTLZ_ZERO_UNDEF
, MVT::i32
, Custom
);
383 if (Subtarget
->hasFFBL())
384 setOperationAction(ISD::CTTZ_ZERO_UNDEF
, MVT::i32
, Custom
);
386 // We only really have 32-bit BFE instructions (and 16-bit on VI).
388 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
389 // effort to match them now. We want this to be false for i64 cases when the
390 // extraction isn't restricted to the upper or lower half. Ideally we would
391 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
392 // span the midpoint are probably relatively rare, so don't worry about them
// for now.
394 if (Subtarget
->hasBFE())
395 setHasExtractBitsInsn(true);
397 setOperationAction(ISD::FMINNUM
, MVT::f32
, Custom
);
398 setOperationAction(ISD::FMAXNUM
, MVT::f32
, Custom
);
399 setOperationAction(ISD::FMINNUM
, MVT::f64
, Custom
);
400 setOperationAction(ISD::FMAXNUM
, MVT::f64
, Custom
);
403 // These are really only legal for ieee_mode functions. We should be avoiding
404 // them for functions that don't have ieee_mode enabled, so just say they are
// legal.
406 setOperationAction(ISD::FMINNUM_IEEE
, MVT::f32
, Legal
);
407 setOperationAction(ISD::FMAXNUM_IEEE
, MVT::f32
, Legal
);
408 setOperationAction(ISD::FMINNUM_IEEE
, MVT::f64
, Legal
);
409 setOperationAction(ISD::FMAXNUM_IEEE
, MVT::f64
, Legal
);
// f64 rounding operations: Legal when the subtarget has f64 round ops.
412 if (Subtarget
->haveRoundOpsF64()) {
413 setOperationAction(ISD::FTRUNC
, MVT::f64
, Legal
);
414 setOperationAction(ISD::FCEIL
, MVT::f64
, Legal
);
415 setOperationAction(ISD::FRINT
, MVT::f64
, Legal
);
// NOTE(review): embedded line 416 is missing between the Legal group above
// and the Custom group below — presumably the else branch; confirm.
417 setOperationAction(ISD::FCEIL
, MVT::f64
, Custom
);
418 setOperationAction(ISD::FTRUNC
, MVT::f64
, Custom
);
419 setOperationAction(ISD::FRINT
, MVT::f64
, Custom
);
420 setOperationAction(ISD::FFLOOR
, MVT::f64
, Custom
);
// NOTE(review): FFLOOR/f64 is set to Legal here after being set to Custom
// just above; if both statements execute, this later entry is the one that
// remains — confirm that is intended.
423 setOperationAction(ISD::FFLOOR
, MVT::f64
, Legal
);
425 setOperationAction(ISD::FSIN
, MVT::f32
, Custom
);
426 setOperationAction(ISD::FCOS
, MVT::f32
, Custom
);
427 setOperationAction(ISD::FDIV
, MVT::f32
, Custom
);
428 setOperationAction(ISD::FDIV
, MVT::f64
, Custom
);
// Actions for i16 / f16 and the packed 16-bit vector types, only set up when
// the subtarget has 16-bit instructions.
430 if (Subtarget
->has16BitInsts()) {
431 setOperationAction(ISD::Constant
, MVT::i16
, Legal
);
433 setOperationAction(ISD::SMIN
, MVT::i16
, Legal
);
434 setOperationAction(ISD::SMAX
, MVT::i16
, Legal
);
436 setOperationAction(ISD::UMIN
, MVT::i16
, Legal
);
437 setOperationAction(ISD::UMAX
, MVT::i16
, Legal
);
439 setOperationAction(ISD::SIGN_EXTEND
, MVT::i16
, Promote
);
440 AddPromotedToType(ISD::SIGN_EXTEND
, MVT::i16
, MVT::i32
);
442 setOperationAction(ISD::ROTR
, MVT::i16
, Promote
);
443 setOperationAction(ISD::ROTL
, MVT::i16
, Promote
);
445 setOperationAction(ISD::SDIV
, MVT::i16
, Promote
);
446 setOperationAction(ISD::UDIV
, MVT::i16
, Promote
);
447 setOperationAction(ISD::SREM
, MVT::i16
, Promote
);
448 setOperationAction(ISD::UREM
, MVT::i16
, Promote
);
450 setOperationAction(ISD::BSWAP
, MVT::i16
, Promote
);
451 setOperationAction(ISD::BITREVERSE
, MVT::i16
, Promote
);
453 setOperationAction(ISD::CTTZ
, MVT::i16
, Promote
);
454 setOperationAction(ISD::CTTZ_ZERO_UNDEF
, MVT::i16
, Promote
);
455 setOperationAction(ISD::CTLZ
, MVT::i16
, Promote
);
456 setOperationAction(ISD::CTLZ_ZERO_UNDEF
, MVT::i16
, Promote
);
457 setOperationAction(ISD::CTPOP
, MVT::i16
, Promote
);
459 setOperationAction(ISD::SELECT_CC
, MVT::i16
, Expand
);
461 setOperationAction(ISD::BR_CC
, MVT::i16
, Expand
);
463 setOperationAction(ISD::LOAD
, MVT::i16
, Custom
);
465 setTruncStoreAction(MVT::i64
, MVT::i16
, Expand
);
467 setOperationAction(ISD::FP16_TO_FP
, MVT::i16
, Promote
);
468 AddPromotedToType(ISD::FP16_TO_FP
, MVT::i16
, MVT::i32
);
469 setOperationAction(ISD::FP_TO_FP16
, MVT::i16
, Promote
);
470 AddPromotedToType(ISD::FP_TO_FP16
, MVT::i16
, MVT::i32
);
472 setOperationAction(ISD::FP_TO_SINT
, MVT::i16
, Promote
);
473 setOperationAction(ISD::FP_TO_UINT
, MVT::i16
, Promote
);
474 setOperationAction(ISD::SINT_TO_FP
, MVT::i16
, Promote
);
475 setOperationAction(ISD::UINT_TO_FP
, MVT::i16
, Promote
);
477 // F16 - Constant Actions.
478 setOperationAction(ISD::ConstantFP
, MVT::f16
, Legal
);
480 // F16 - Load/Store Actions.
481 setOperationAction(ISD::LOAD
, MVT::f16
, Promote
);
482 AddPromotedToType(ISD::LOAD
, MVT::f16
, MVT::i16
);
483 setOperationAction(ISD::STORE
, MVT::f16
, Promote
);
484 AddPromotedToType(ISD::STORE
, MVT::f16
, MVT::i16
);
486 // F16 - VOP1 Actions.
487 setOperationAction(ISD::FP_ROUND
, MVT::f16
, Custom
);
488 setOperationAction(ISD::FCOS
, MVT::f16
, Promote
);
489 setOperationAction(ISD::FSIN
, MVT::f16
, Promote
);
490 setOperationAction(ISD::FP_TO_SINT
, MVT::f16
, Promote
);
491 setOperationAction(ISD::FP_TO_UINT
, MVT::f16
, Promote
);
492 setOperationAction(ISD::SINT_TO_FP
, MVT::f16
, Promote
);
493 setOperationAction(ISD::UINT_TO_FP
, MVT::f16
, Promote
);
494 setOperationAction(ISD::FROUND
, MVT::f16
, Custom
);
496 // F16 - VOP2 Actions.
497 setOperationAction(ISD::BR_CC
, MVT::f16
, Expand
);
498 setOperationAction(ISD::SELECT_CC
, MVT::f16
, Expand
);
500 setOperationAction(ISD::FDIV
, MVT::f16
, Custom
);
502 // F16 - VOP3 Actions.
503 setOperationAction(ISD::FMA
, MVT::f16
, Legal
);
// This check reads the feature through the STI constructor argument while
// the neighbouring checks go through the Subtarget member.
504 if (!Subtarget
->hasFP16Denormals() && STI
.hasMadF16())
505 setOperationAction(ISD::FMAD
, MVT::f16
, Legal
);
// Same per-opcode sweep as the earlier wide-vector loop, here over the
// packed 16-bit vector types.
// NOTE(review): the switch header and several case/break/default lines
// (embedded 509-511, 513, 519, 522-523, 525+) are not visible here.
507 for (MVT VT
: {MVT::v2i16
, MVT::v2f16
, MVT::v4i16
, MVT::v4f16
}) {
508 for (unsigned Op
= 0; Op
< ISD::BUILTIN_OP_END
; ++Op
) {
512 case ISD::BUILD_VECTOR
:
514 case ISD::EXTRACT_VECTOR_ELT
:
515 case ISD::INSERT_VECTOR_ELT
:
516 case ISD::INSERT_SUBVECTOR
:
517 case ISD::EXTRACT_SUBVECTOR
:
518 case ISD::SCALAR_TO_VECTOR
:
520 case ISD::CONCAT_VECTORS
:
521 setOperationAction(Op
, VT
, Custom
);
524 setOperationAction(Op
, VT
, Expand
);
530 // XXX - Do these do anything? Vector constants turn into build_vector.
531 setOperationAction(ISD::Constant
, MVT::v2i16
, Legal
);
532 setOperationAction(ISD::ConstantFP
, MVT::v2f16
, Legal
);
534 setOperationAction(ISD::UNDEF
, MVT::v2i16
, Legal
);
535 setOperationAction(ISD::UNDEF
, MVT::v2f16
, Legal
);
// Loads, stores and bitwise logic on the 2 x 16-bit types are promoted to
// i32; loads and stores of the 4 x 16-bit types are promoted to v2i32.
537 setOperationAction(ISD::STORE
, MVT::v2i16
, Promote
);
538 AddPromotedToType(ISD::STORE
, MVT::v2i16
, MVT::i32
);
539 setOperationAction(ISD::STORE
, MVT::v2f16
, Promote
);
540 AddPromotedToType(ISD::STORE
, MVT::v2f16
, MVT::i32
);
542 setOperationAction(ISD::LOAD
, MVT::v2i16
, Promote
);
543 AddPromotedToType(ISD::LOAD
, MVT::v2i16
, MVT::i32
);
544 setOperationAction(ISD::LOAD
, MVT::v2f16
, Promote
);
545 AddPromotedToType(ISD::LOAD
, MVT::v2f16
, MVT::i32
);
547 setOperationAction(ISD::AND
, MVT::v2i16
, Promote
);
548 AddPromotedToType(ISD::AND
, MVT::v2i16
, MVT::i32
);
549 setOperationAction(ISD::OR
, MVT::v2i16
, Promote
);
550 AddPromotedToType(ISD::OR
, MVT::v2i16
, MVT::i32
);
551 setOperationAction(ISD::XOR
, MVT::v2i16
, Promote
);
552 AddPromotedToType(ISD::XOR
, MVT::v2i16
, MVT::i32
);
554 setOperationAction(ISD::LOAD
, MVT::v4i16
, Promote
);
555 AddPromotedToType(ISD::LOAD
, MVT::v4i16
, MVT::v2i32
);
556 setOperationAction(ISD::LOAD
, MVT::v4f16
, Promote
);
557 AddPromotedToType(ISD::LOAD
, MVT::v4f16
, MVT::v2i32
);
559 setOperationAction(ISD::STORE
, MVT::v4i16
, Promote
);
560 AddPromotedToType(ISD::STORE
, MVT::v4i16
, MVT::v2i32
);
561 setOperationAction(ISD::STORE
, MVT::v4f16
, Promote
);
562 AddPromotedToType(ISD::STORE
, MVT::v4f16
, MVT::v2i32
);
564 setOperationAction(ISD::ANY_EXTEND
, MVT::v2i32
, Expand
);
565 setOperationAction(ISD::ZERO_EXTEND
, MVT::v2i32
, Expand
);
566 setOperationAction(ISD::SIGN_EXTEND
, MVT::v2i32
, Expand
);
567 setOperationAction(ISD::FP_EXTEND
, MVT::v2f32
, Expand
);
569 setOperationAction(ISD::ANY_EXTEND
, MVT::v4i32
, Expand
);
570 setOperationAction(ISD::ZERO_EXTEND
, MVT::v4i32
, Expand
);
571 setOperationAction(ISD::SIGN_EXTEND
, MVT::v4i32
, Expand
);
573 if (!Subtarget
->hasVOP3PInsts()) {
574 setOperationAction(ISD::BUILD_VECTOR
, MVT::v2i16
, Custom
);
575 setOperationAction(ISD::BUILD_VECTOR
, MVT::v2f16
, Custom
);
578 setOperationAction(ISD::FNEG
, MVT::v2f16
, Legal
);
579 // This isn't really legal, but this avoids the legalizer unrolling it (and
580 // allows matching fneg (fabs x) patterns)
581 setOperationAction(ISD::FABS
, MVT::v2f16
, Legal
);
583 setOperationAction(ISD::FMAXNUM
, MVT::f16
, Custom
);
584 setOperationAction(ISD::FMINNUM
, MVT::f16
, Custom
);
585 setOperationAction(ISD::FMAXNUM_IEEE
, MVT::f16
, Legal
);
586 setOperationAction(ISD::FMINNUM_IEEE
, MVT::f16
, Legal
);
588 setOperationAction(ISD::FMINNUM_IEEE
, MVT::v4f16
, Custom
);
589 setOperationAction(ISD::FMAXNUM_IEEE
, MVT::v4f16
, Custom
);
591 setOperationAction(ISD::FMINNUM
, MVT::v4f16
, Expand
);
592 setOperationAction(ISD::FMAXNUM
, MVT::v4f16
, Expand
);
// Packed 16-bit operations: Legal for the 2-element types and Custom for
// the 4-element types when the subtarget has VOP3P instructions.
595 if (Subtarget
->hasVOP3PInsts()) {
596 setOperationAction(ISD::ADD
, MVT::v2i16
, Legal
);
597 setOperationAction(ISD::SUB
, MVT::v2i16
, Legal
);
598 setOperationAction(ISD::MUL
, MVT::v2i16
, Legal
);
599 setOperationAction(ISD::SHL
, MVT::v2i16
, Legal
);
600 setOperationAction(ISD::SRL
, MVT::v2i16
, Legal
);
601 setOperationAction(ISD::SRA
, MVT::v2i16
, Legal
);
602 setOperationAction(ISD::SMIN
, MVT::v2i16
, Legal
);
603 setOperationAction(ISD::UMIN
, MVT::v2i16
, Legal
);
604 setOperationAction(ISD::SMAX
, MVT::v2i16
, Legal
);
605 setOperationAction(ISD::UMAX
, MVT::v2i16
, Legal
);
607 setOperationAction(ISD::FADD
, MVT::v2f16
, Legal
);
608 setOperationAction(ISD::FMUL
, MVT::v2f16
, Legal
);
609 setOperationAction(ISD::FMA
, MVT::v2f16
, Legal
);
611 setOperationAction(ISD::FMINNUM_IEEE
, MVT::v2f16
, Legal
);
612 setOperationAction(ISD::FMAXNUM_IEEE
, MVT::v2f16
, Legal
);
614 setOperationAction(ISD::FCANONICALIZE
, MVT::v2f16
, Legal
);
616 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v2i16
, Custom
);
617 setOperationAction(ISD::EXTRACT_VECTOR_ELT
, MVT::v2f16
, Custom
);
619 setOperationAction(ISD::VECTOR_SHUFFLE
, MVT::v4f16
, Custom
);
620 setOperationAction(ISD::VECTOR_SHUFFLE
, MVT::v4i16
, Custom
);
622 setOperationAction(ISD::SHL
, MVT::v4i16
, Custom
);
623 setOperationAction(ISD::SRA
, MVT::v4i16
, Custom
);
624 setOperationAction(ISD::SRL
, MVT::v4i16
, Custom
);
625 setOperationAction(ISD::ADD
, MVT::v4i16
, Custom
);
626 setOperationAction(ISD::SUB
, MVT::v4i16
, Custom
);
627 setOperationAction(ISD::MUL
, MVT::v4i16
, Custom
);
629 setOperationAction(ISD::SMIN
, MVT::v4i16
, Custom
);
630 setOperationAction(ISD::SMAX
, MVT::v4i16
, Custom
);
631 setOperationAction(ISD::UMIN
, MVT::v4i16
, Custom
);
632 setOperationAction(ISD::UMAX
, MVT::v4i16
, Custom
);
634 setOperationAction(ISD::FADD
, MVT::v4f16
, Custom
);
635 setOperationAction(ISD::FMUL
, MVT::v4f16
, Custom
);
636 setOperationAction(ISD::FMA
, MVT::v4f16
, Custom
);
638 setOperationAction(ISD::FMAXNUM
, MVT::v2f16
, Custom
);
639 setOperationAction(ISD::FMINNUM
, MVT::v2f16
, Custom
);
641 setOperationAction(ISD::FMINNUM
, MVT::v4f16
, Custom
);
642 setOperationAction(ISD::FMAXNUM
, MVT::v4f16
, Custom
);
643 setOperationAction(ISD::FCANONICALIZE
, MVT::v4f16
, Custom
);
645 setOperationAction(ISD::FEXP
, MVT::v2f16
, Custom
);
646 setOperationAction(ISD::SELECT
, MVT::v4i16
, Custom
);
647 setOperationAction(ISD::SELECT
, MVT::v4f16
, Custom
);
650 setOperationAction(ISD::FNEG
, MVT::v4f16
, Custom
);
651 setOperationAction(ISD::FABS
, MVT::v4f16
, Custom
);
// SELECT on the 2 x 16-bit types is promoted to i32 when 16-bit instructions
// exist.
// NOTE(review): embedded line 658 is missing before the "Legalization hack"
// group — presumably the else branch; confirm.
653 if (Subtarget
->has16BitInsts()) {
654 setOperationAction(ISD::SELECT
, MVT::v2i16
, Promote
);
655 AddPromotedToType(ISD::SELECT
, MVT::v2i16
, MVT::i32
);
656 setOperationAction(ISD::SELECT
, MVT::v2f16
, Promote
);
657 AddPromotedToType(ISD::SELECT
, MVT::v2f16
, MVT::i32
);
659 // Legalization hack.
660 setOperationAction(ISD::SELECT
, MVT::v2i16
, Custom
);
661 setOperationAction(ISD::SELECT
, MVT::v2f16
, Custom
);
663 setOperationAction(ISD::FNEG
, MVT::v2f16
, Custom
);
664 setOperationAction(ISD::FABS
, MVT::v2f16
, Custom
);
667 for (MVT VT
: { MVT::v4i16
, MVT::v4f16
, MVT::v2i8
, MVT::v4i8
, MVT::v8i8
}) {
668 setOperationAction(ISD::SELECT
, VT
, Custom
);
// Intrinsic nodes are custom lowered for the listed result types.
671 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::Other
, Custom
);
672 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::f32
, Custom
);
673 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::v4f32
, Custom
);
674 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::i16
, Custom
);
675 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::f16
, Custom
);
676 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::v2i16
, Custom
);
677 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::v2f16
, Custom
);
679 setOperationAction(ISD::INTRINSIC_W_CHAIN
, MVT::v2f16
, Custom
);
680 setOperationAction(ISD::INTRINSIC_W_CHAIN
, MVT::v2i16
, Custom
);
681 setOperationAction(ISD::INTRINSIC_W_CHAIN
, MVT::v4f16
, Custom
);
682 setOperationAction(ISD::INTRINSIC_W_CHAIN
, MVT::v4i16
, Custom
);
683 setOperationAction(ISD::INTRINSIC_W_CHAIN
, MVT::v8f16
, Custom
);
684 setOperationAction(ISD::INTRINSIC_W_CHAIN
, MVT::Other
, Custom
);
685 setOperationAction(ISD::INTRINSIC_W_CHAIN
, MVT::f16
, Custom
);
686 setOperationAction(ISD::INTRINSIC_W_CHAIN
, MVT::i16
, Custom
);
687 setOperationAction(ISD::INTRINSIC_W_CHAIN
, MVT::i8
, Custom
);
689 setOperationAction(ISD::INTRINSIC_VOID
, MVT::Other
, Custom
);
690 setOperationAction(ISD::INTRINSIC_VOID
, MVT::v2i16
, Custom
);
691 setOperationAction(ISD::INTRINSIC_VOID
, MVT::v2f16
, Custom
);
692 setOperationAction(ISD::INTRINSIC_VOID
, MVT::v4f16
, Custom
);
693 setOperationAction(ISD::INTRINSIC_VOID
, MVT::v4i16
, Custom
);
694 setOperationAction(ISD::INTRINSIC_VOID
, MVT::f16
, Custom
);
695 setOperationAction(ISD::INTRINSIC_VOID
, MVT::i16
, Custom
);
696 setOperationAction(ISD::INTRINSIC_VOID
, MVT::i8
, Custom
);
// Opcodes registered with setTargetDAGCombine, i.e. the node kinds for which
// this target asks for its DAG-combine hook to be invoked.
698 setTargetDAGCombine(ISD::ADD
);
699 setTargetDAGCombine(ISD::ADDCARRY
);
700 setTargetDAGCombine(ISD::SUB
);
701 setTargetDAGCombine(ISD::SUBCARRY
);
702 setTargetDAGCombine(ISD::FADD
);
703 setTargetDAGCombine(ISD::FSUB
);
704 setTargetDAGCombine(ISD::FMINNUM
);
705 setTargetDAGCombine(ISD::FMAXNUM
);
706 setTargetDAGCombine(ISD::FMINNUM_IEEE
);
707 setTargetDAGCombine(ISD::FMAXNUM_IEEE
);
708 setTargetDAGCombine(ISD::FMA
);
709 setTargetDAGCombine(ISD::SMIN
);
710 setTargetDAGCombine(ISD::SMAX
);
711 setTargetDAGCombine(ISD::UMIN
);
712 setTargetDAGCombine(ISD::UMAX
);
713 setTargetDAGCombine(ISD::SETCC
);
714 setTargetDAGCombine(ISD::AND
);
715 setTargetDAGCombine(ISD::OR
);
716 setTargetDAGCombine(ISD::XOR
);
717 setTargetDAGCombine(ISD::SINT_TO_FP
);
718 setTargetDAGCombine(ISD::UINT_TO_FP
);
719 setTargetDAGCombine(ISD::FCANONICALIZE
);
720 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR
);
721 setTargetDAGCombine(ISD::ZERO_EXTEND
);
722 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG
);
723 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT
);
724 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT
);
726 // All memory operations. Some folding on the pointer operand is done to help
727 // matching the constant offsets in the addressing modes.
728 setTargetDAGCombine(ISD::LOAD
);
729 setTargetDAGCombine(ISD::STORE
);
730 setTargetDAGCombine(ISD::ATOMIC_LOAD
);
731 setTargetDAGCombine(ISD::ATOMIC_STORE
);
732 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP
);
733 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS
);
734 setTargetDAGCombine(ISD::ATOMIC_SWAP
);
735 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD
);
736 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB
);
737 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND
);
738 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR
);
739 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR
);
740 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND
);
741 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN
);
742 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX
);
743 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN
);
744 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX
);
745 setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD
);
// Selects the register-pressure scheduling preference.
747 setSchedulingPreference(Sched::RegPressure
);
// Accessor returning a pointer to a const GCNSubtarget.
// NOTE(review): the body (embedded lines 751-752) is not visible in this
// view; presumably it returns the Subtarget member used elsewhere in this
// file — confirm against the full source.
750 const GCNSubtarget
*SITargetLowering::getSubtarget() const {
754 //===----------------------------------------------------------------------===//
755 // TargetLowering queries
756 //===----------------------------------------------------------------------===//
758 // v_mad_mix* support a conversion from f16 to f32.
760 // There is only one special case when denormals are enabled we don't currently,
761 // where this is OK to use.
762 bool SITargetLowering::isFPExtFoldable(unsigned Opcode
,
763 EVT DestVT
, EVT SrcVT
) const {
764 return ((Opcode
== ISD::FMAD
&& Subtarget
->hasMadMixInsts()) ||
765 (Opcode
== ISD::FMA
&& Subtarget
->hasFmaMixInsts())) &&
766 DestVT
.getScalarType() == MVT::f32
&& !Subtarget
->hasFP32Denormals() &&
767 SrcVT
.getScalarType() == MVT::f16
;
// Shuffle-mask legality query. Both parameters (the mask and the vector type)
// are unnamed, so the answer does not depend on them.
// NOTE(review): the return statement (embedded line 773) is not visible in
// this view; the comment below indicates that no shuffle is reported as legal
// — confirm against the full source.
770 bool SITargetLowering::isShuffleMaskLegal(ArrayRef
<int>, EVT
) const {
771 // SI has some legal vector types, but no legal vector operations. Say no
772 // shuffles are legal in order to prefer scalarizing some vector operations.
776 MVT
SITargetLowering::getRegisterTypeForCallingConv(LLVMContext
&Context
,
779 if (CC
== CallingConv::AMDGPU_KERNEL
)
780 return TargetLowering::getRegisterTypeForCallingConv(Context
, CC
, VT
);
783 EVT ScalarVT
= VT
.getScalarType();
784 unsigned Size
= ScalarVT
.getSizeInBits();
786 return ScalarVT
.getSimpleVT();
791 if (Size
== 16 && Subtarget
->has16BitInsts())
792 return VT
.isInteger() ? MVT::v2i16
: MVT::v2f16
;
793 } else if (VT
.getSizeInBits() > 32)
796 return TargetLowering::getRegisterTypeForCallingConv(Context
, CC
, VT
);
799 unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext
&Context
,
802 if (CC
== CallingConv::AMDGPU_KERNEL
)
803 return TargetLowering::getNumRegistersForCallingConv(Context
, CC
, VT
);
806 unsigned NumElts
= VT
.getVectorNumElements();
807 EVT ScalarVT
= VT
.getScalarType();
808 unsigned Size
= ScalarVT
.getSizeInBits();
814 return NumElts
* ((Size
+ 31) / 32);
816 if (Size
== 16 && Subtarget
->has16BitInsts())
817 return (NumElts
+ 1) / 2;
818 } else if (VT
.getSizeInBits() > 32)
819 return (VT
.getSizeInBits() + 31) / 32;
821 return TargetLowering::getNumRegistersForCallingConv(Context
, CC
, VT
);
824 unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
825 LLVMContext
&Context
, CallingConv::ID CC
,
826 EVT VT
, EVT
&IntermediateVT
,
827 unsigned &NumIntermediates
, MVT
&RegisterVT
) const {
828 if (CC
!= CallingConv::AMDGPU_KERNEL
&& VT
.isVector()) {
829 unsigned NumElts
= VT
.getVectorNumElements();
830 EVT ScalarVT
= VT
.getScalarType();
831 unsigned Size
= ScalarVT
.getSizeInBits();
833 RegisterVT
= ScalarVT
.getSimpleVT();
834 IntermediateVT
= RegisterVT
;
835 NumIntermediates
= NumElts
;
836 return NumIntermediates
;
840 RegisterVT
= MVT::i32
;
841 IntermediateVT
= RegisterVT
;
842 NumIntermediates
= NumElts
* ((Size
+ 31) / 32);
843 return NumIntermediates
;
846 // FIXME: We should fix the ABI to be the same on targets without 16-bit
847 // support, but unless we can properly handle 3-vectors, it will still be
849 if (Size
== 16 && Subtarget
->has16BitInsts()) {
850 RegisterVT
= VT
.isInteger() ? MVT::v2i16
: MVT::v2f16
;
851 IntermediateVT
= RegisterVT
;
852 NumIntermediates
= (NumElts
+ 1) / 2;
853 return NumIntermediates
;
857 return TargetLowering::getVectorTypeBreakdownForCallingConv(
858 Context
, CC
, VT
, IntermediateVT
, NumIntermediates
, RegisterVT
);
861 static MVT
memVTFromAggregate(Type
*Ty
) {
862 // Only limited forms of aggregate type currently expected.
863 assert(Ty
->isStructTy() && "Expected struct type");
866 Type
*ElementType
= nullptr;
868 if (Ty
->getContainedType(0)->isVectorTy()) {
869 VectorType
*VecComponent
= cast
<VectorType
>(Ty
->getContainedType(0));
870 ElementType
= VecComponent
->getElementType();
871 NumElts
= VecComponent
->getNumElements();
873 ElementType
= Ty
->getContainedType(0);
877 assert((Ty
->getContainedType(1) && Ty
->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");
879 // Calculate the size of the memVT type from the aggregate
880 unsigned Pow2Elts
= 0;
881 unsigned ElementSize
;
882 switch (ElementType
->getTypeID()) {
884 llvm_unreachable("Unknown type!");
885 case Type::IntegerTyID
:
886 ElementSize
= cast
<IntegerType
>(ElementType
)->getBitWidth();
891 case Type::FloatTyID
:
895 unsigned AdditionalElts
= ElementSize
== 16 ? 2 : 1;
896 Pow2Elts
= 1 << Log2_32_Ceil(NumElts
+ AdditionalElts
);
898 return MVT::getVectorVT(MVT::getVT(ElementType
, false),
902 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo
&Info
,
905 unsigned IntrID
) const {
906 if (const AMDGPU::RsrcIntrinsic
*RsrcIntr
=
907 AMDGPU::lookupRsrcIntrinsic(IntrID
)) {
908 AttributeList Attr
= Intrinsic::getAttributes(CI
.getContext(),
909 (Intrinsic::ID
)IntrID
);
910 if (Attr
.hasFnAttribute(Attribute::ReadNone
))
913 SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
915 if (RsrcIntr
->IsImage
) {
916 Info
.ptrVal
= MFI
->getImagePSV(
917 *MF
.getSubtarget
<GCNSubtarget
>().getInstrInfo(),
918 CI
.getArgOperand(RsrcIntr
->RsrcArg
));
921 Info
.ptrVal
= MFI
->getBufferPSV(
922 *MF
.getSubtarget
<GCNSubtarget
>().getInstrInfo(),
923 CI
.getArgOperand(RsrcIntr
->RsrcArg
));
926 Info
.flags
= MachineMemOperand::MODereferenceable
;
927 if (Attr
.hasFnAttribute(Attribute::ReadOnly
)) {
928 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
929 Info
.memVT
= MVT::getVT(CI
.getType(), true);
930 if (Info
.memVT
== MVT::Other
) {
931 // Some intrinsics return an aggregate type - special case to work out
933 Info
.memVT
= memVTFromAggregate(CI
.getType());
935 Info
.flags
|= MachineMemOperand::MOLoad
;
936 } else if (Attr
.hasFnAttribute(Attribute::WriteOnly
)) {
937 Info
.opc
= ISD::INTRINSIC_VOID
;
938 Info
.memVT
= MVT::getVT(CI
.getArgOperand(0)->getType());
939 Info
.flags
|= MachineMemOperand::MOStore
;
942 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
943 Info
.memVT
= MVT::getVT(CI
.getType());
944 Info
.flags
= MachineMemOperand::MOLoad
|
945 MachineMemOperand::MOStore
|
946 MachineMemOperand::MODereferenceable
;
948 // XXX - Should this be volatile without known ordering?
949 Info
.flags
|= MachineMemOperand::MOVolatile
;
955 case Intrinsic::amdgcn_atomic_inc
:
956 case Intrinsic::amdgcn_atomic_dec
:
957 case Intrinsic::amdgcn_ds_ordered_add
:
958 case Intrinsic::amdgcn_ds_ordered_swap
:
959 case Intrinsic::amdgcn_ds_fadd
:
960 case Intrinsic::amdgcn_ds_fmin
:
961 case Intrinsic::amdgcn_ds_fmax
: {
962 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
963 Info
.memVT
= MVT::getVT(CI
.getType());
964 Info
.ptrVal
= CI
.getOperand(0);
966 Info
.flags
= MachineMemOperand::MOLoad
| MachineMemOperand::MOStore
;
968 const ConstantInt
*Vol
= cast
<ConstantInt
>(CI
.getOperand(4));
970 Info
.flags
|= MachineMemOperand::MOVolatile
;
974 case Intrinsic::amdgcn_buffer_atomic_fadd
: {
975 SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
977 Info
.opc
= ISD::INTRINSIC_VOID
;
978 Info
.memVT
= MVT::getVT(CI
.getOperand(0)->getType());
979 Info
.ptrVal
= MFI
->getBufferPSV(
980 *MF
.getSubtarget
<GCNSubtarget
>().getInstrInfo(),
981 CI
.getArgOperand(1));
983 Info
.flags
= MachineMemOperand::MOLoad
| MachineMemOperand::MOStore
;
985 const ConstantInt
*Vol
= dyn_cast
<ConstantInt
>(CI
.getOperand(4));
986 if (!Vol
|| !Vol
->isZero())
987 Info
.flags
|= MachineMemOperand::MOVolatile
;
991 case Intrinsic::amdgcn_global_atomic_fadd
: {
992 Info
.opc
= ISD::INTRINSIC_VOID
;
993 Info
.memVT
= MVT::getVT(CI
.getOperand(0)->getType()
994 ->getPointerElementType());
995 Info
.ptrVal
= CI
.getOperand(0);
997 Info
.flags
= MachineMemOperand::MOLoad
| MachineMemOperand::MOStore
;
1001 case Intrinsic::amdgcn_ds_append
:
1002 case Intrinsic::amdgcn_ds_consume
: {
1003 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
1004 Info
.memVT
= MVT::getVT(CI
.getType());
1005 Info
.ptrVal
= CI
.getOperand(0);
1007 Info
.flags
= MachineMemOperand::MOLoad
| MachineMemOperand::MOStore
;
1009 const ConstantInt
*Vol
= cast
<ConstantInt
>(CI
.getOperand(1));
1011 Info
.flags
|= MachineMemOperand::MOVolatile
;
1015 case Intrinsic::amdgcn_ds_gws_init
:
1016 case Intrinsic::amdgcn_ds_gws_barrier
:
1017 case Intrinsic::amdgcn_ds_gws_sema_v
:
1018 case Intrinsic::amdgcn_ds_gws_sema_br
:
1019 case Intrinsic::amdgcn_ds_gws_sema_p
:
1020 case Intrinsic::amdgcn_ds_gws_sema_release_all
: {
1021 Info
.opc
= ISD::INTRINSIC_VOID
;
1023 SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
1025 MFI
->getGWSPSV(*MF
.getSubtarget
<GCNSubtarget
>().getInstrInfo());
1027 // This is an abstract access, but we need to specify a type and size.
1028 Info
.memVT
= MVT::i32
;
1030 Info
.align
= Align(4);
1032 Info
.flags
= MachineMemOperand::MOStore
;
1033 if (IntrID
== Intrinsic::amdgcn_ds_gws_barrier
)
1034 Info
.flags
= MachineMemOperand::MOLoad
;
1042 bool SITargetLowering::getAddrModeArguments(IntrinsicInst
*II
,
1043 SmallVectorImpl
<Value
*> &Ops
,
1044 Type
*&AccessTy
) const {
1045 switch (II
->getIntrinsicID()) {
1046 case Intrinsic::amdgcn_atomic_inc
:
1047 case Intrinsic::amdgcn_atomic_dec
:
1048 case Intrinsic::amdgcn_ds_ordered_add
:
1049 case Intrinsic::amdgcn_ds_ordered_swap
:
1050 case Intrinsic::amdgcn_ds_fadd
:
1051 case Intrinsic::amdgcn_ds_fmin
:
1052 case Intrinsic::amdgcn_ds_fmax
: {
1053 Value
*Ptr
= II
->getArgOperand(0);
1054 AccessTy
= II
->getType();
1063 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode
&AM
) const {
1064 if (!Subtarget
->hasFlatInstOffsets()) {
1065 // Flat instructions do not have offsets, and only have the register
1067 return AM
.BaseOffs
== 0 && AM
.Scale
== 0;
1070 // GFX9 added a 13-bit signed offset. When using regular flat instructions,
1071 // the sign bit is ignored and is treated as a 12-bit unsigned offset.
1073 // GFX10 shrinked signed offset to 12 bits. When using regular flat
1074 // instructions, the sign bit is also ignored and is treated as 11-bit
1077 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
)
1078 return isUInt
<11>(AM
.BaseOffs
) && AM
.Scale
== 0;
1081 return isUInt
<12>(AM
.BaseOffs
) && AM
.Scale
== 0;
1084 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode
&AM
) const {
1085 if (Subtarget
->hasFlatGlobalInsts())
1086 return isInt
<13>(AM
.BaseOffs
) && AM
.Scale
== 0;
1088 if (!Subtarget
->hasAddr64() || Subtarget
->useFlatForGlobal()) {
1089 // Assume the we will use FLAT for all global memory accesses
1091 // FIXME: This assumption is currently wrong. On VI we still use
1092 // MUBUF instructions for the r + i addressing mode. As currently
1093 // implemented, the MUBUF instructions only work on buffer < 4GB.
1094 // It may be possible to support > 4GB buffers with MUBUF instructions,
1095 // by setting the stride value in the resource descriptor which would
1096 // increase the size limit to (stride * 4GB). However, this is risky,
1097 // because it has never been validated.
1098 return isLegalFlatAddressingMode(AM
);
1101 return isLegalMUBUFAddressingMode(AM
);
1104 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode
&AM
) const {
1105 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1106 // additionally can do r + r + i with addr64. 32-bit has more addressing
1107 // mode options. Depending on the resource constant, it can also do
1108 // (i64 r0) + (i32 r1) * (i14 i).
1110 // Private arrays end up using a scratch buffer most of the time, so also
1111 // assume those use MUBUF instructions. Scratch loads / stores are currently
1112 // implemented as mubuf instructions with offen bit set, so slightly
1113 // different than the normal addr64.
1114 if (!isUInt
<12>(AM
.BaseOffs
))
1117 // FIXME: Since we can split immediate into soffset and immediate offset,
1118 // would it make sense to allow any immediate?
1121 case 0: // r + i or just i, depending on HasBaseReg.
1124 return true; // We have r + r or r + i.
1126 if (AM
.HasBaseReg
) {
1127 // Reject 2 * r + r.
1131 // Allow 2 * r as r + r
1132 // Or 2 * r + i is allowed as r + r + i.
1134 default: // Don't allow n * r
1139 bool SITargetLowering::isLegalAddressingMode(const DataLayout
&DL
,
1140 const AddrMode
&AM
, Type
*Ty
,
1141 unsigned AS
, Instruction
*I
) const {
1142 // No global is ever allowed as a base.
1146 if (AS
== AMDGPUAS::GLOBAL_ADDRESS
)
1147 return isLegalGlobalAddressingMode(AM
);
1149 if (AS
== AMDGPUAS::CONSTANT_ADDRESS
||
1150 AS
== AMDGPUAS::CONSTANT_ADDRESS_32BIT
||
1151 AS
== AMDGPUAS::BUFFER_FAT_POINTER
) {
1152 // If the offset isn't a multiple of 4, it probably isn't going to be
1153 // correctly aligned.
1154 // FIXME: Can we get the real alignment here?
1155 if (AM
.BaseOffs
% 4 != 0)
1156 return isLegalMUBUFAddressingMode(AM
);
1158 // There are no SMRD extloads, so if we have to do a small type access we
1159 // will use a MUBUF load.
1160 // FIXME?: We also need to do this if unaligned, but we don't know the
1162 if (Ty
->isSized() && DL
.getTypeStoreSize(Ty
) < 4)
1163 return isLegalGlobalAddressingMode(AM
);
1165 if (Subtarget
->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS
) {
1166 // SMRD instructions have an 8-bit, dword offset on SI.
1167 if (!isUInt
<8>(AM
.BaseOffs
/ 4))
1169 } else if (Subtarget
->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS
) {
1170 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1171 // in 8-bits, it can use a smaller encoding.
1172 if (!isUInt
<32>(AM
.BaseOffs
/ 4))
1174 } else if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
) {
1175 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1176 if (!isUInt
<20>(AM
.BaseOffs
))
1179 llvm_unreachable("unhandled generation");
1181 if (AM
.Scale
== 0) // r + i or just i, depending on HasBaseReg.
1184 if (AM
.Scale
== 1 && AM
.HasBaseReg
)
1189 } else if (AS
== AMDGPUAS::PRIVATE_ADDRESS
) {
1190 return isLegalMUBUFAddressingMode(AM
);
1191 } else if (AS
== AMDGPUAS::LOCAL_ADDRESS
||
1192 AS
== AMDGPUAS::REGION_ADDRESS
) {
1193 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1195 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1196 // an 8-bit dword offset but we don't know the alignment here.
1197 if (!isUInt
<16>(AM
.BaseOffs
))
1200 if (AM
.Scale
== 0) // r + i or just i, depending on HasBaseReg.
1203 if (AM
.Scale
== 1 && AM
.HasBaseReg
)
1207 } else if (AS
== AMDGPUAS::FLAT_ADDRESS
||
1208 AS
== AMDGPUAS::UNKNOWN_ADDRESS_SPACE
) {
1209 // For an unknown address space, this usually means that this is for some
1210 // reason being used for pure arithmetic, and not based on some addressing
1211 // computation. We don't have instructions that compute pointers with any
1212 // addressing modes, so treat them as having no offset like flat
1214 return isLegalFlatAddressingMode(AM
);
1216 llvm_unreachable("unhandled address space");
1220 bool SITargetLowering::canMergeStoresTo(unsigned AS
, EVT MemVT
,
1221 const SelectionDAG
&DAG
) const {
1222 if (AS
== AMDGPUAS::GLOBAL_ADDRESS
|| AS
== AMDGPUAS::FLAT_ADDRESS
) {
1223 return (MemVT
.getSizeInBits() <= 4 * 32);
1224 } else if (AS
== AMDGPUAS::PRIVATE_ADDRESS
) {
1225 unsigned MaxPrivateBits
= 8 * getSubtarget()->getMaxPrivateElementSize();
1226 return (MemVT
.getSizeInBits() <= MaxPrivateBits
);
1227 } else if (AS
== AMDGPUAS::LOCAL_ADDRESS
|| AS
== AMDGPUAS::REGION_ADDRESS
) {
1228 return (MemVT
.getSizeInBits() <= 2 * 32);
1233 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1234 unsigned Size
, unsigned AddrSpace
, unsigned Align
,
1235 MachineMemOperand::Flags Flags
, bool *IsFast
) const {
1239 if (AddrSpace
== AMDGPUAS::LOCAL_ADDRESS
||
1240 AddrSpace
== AMDGPUAS::REGION_ADDRESS
) {
1241 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1242 // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1243 // with adjacent offsets.
1244 bool AlignedBy4
= (Align
% 4 == 0);
1246 *IsFast
= AlignedBy4
;
1251 // FIXME: We have to be conservative here and assume that flat operations
1252 // will access scratch. If we had access to the IR function, then we
1253 // could determine if any private memory was used in the function.
1254 if (!Subtarget
->hasUnalignedScratchAccess() &&
1255 (AddrSpace
== AMDGPUAS::PRIVATE_ADDRESS
||
1256 AddrSpace
== AMDGPUAS::FLAT_ADDRESS
)) {
1257 bool AlignedBy4
= Align
>= 4;
1259 *IsFast
= AlignedBy4
;
1264 if (Subtarget
->hasUnalignedBufferAccess()) {
1265 // If we have a uniform constant load, it still requires using a slow
1266 // buffer instruction if unaligned.
1268 *IsFast
= (AddrSpace
== AMDGPUAS::CONSTANT_ADDRESS
||
1269 AddrSpace
== AMDGPUAS::CONSTANT_ADDRESS_32BIT
) ?
1270 (Align
% 4 == 0) : true;
1276 // Smaller than dword value must be aligned.
1280 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1281 // byte-address are ignored, thus forcing Dword alignment.
1282 // This applies to private, global, and constant memory.
1286 return Size
>= 32 && Align
>= 4;
1289 bool SITargetLowering::allowsMisalignedMemoryAccesses(
1290 EVT VT
, unsigned AddrSpace
, unsigned Align
, MachineMemOperand::Flags Flags
,
1291 bool *IsFast
) const {
1295 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1296 // which isn't a simple VT.
1297 // Until MVT is extended to handle this, simply check for the size and
1298 // rely on the condition below: allow accesses if the size is a multiple of 4.
1299 if (VT
== MVT::Other
|| (VT
!= MVT::Other
&& VT
.getSizeInBits() > 1024 &&
1300 VT
.getStoreSize() > 16)) {
1304 return allowsMisalignedMemoryAccessesImpl(VT
.getSizeInBits(), AddrSpace
,
1305 Align
, Flags
, IsFast
);
1308 EVT
SITargetLowering::getOptimalMemOpType(
1309 uint64_t Size
, unsigned DstAlign
, unsigned SrcAlign
, bool IsMemset
,
1310 bool ZeroMemset
, bool MemcpyStrSrc
,
1311 const AttributeList
&FuncAttributes
) const {
1312 // FIXME: Should account for address space here.
1314 // The default fallback uses the private pointer size as a guess for a type to
1315 // use. Make sure we switch these to 64-bit accesses.
1317 if (Size
>= 16 && DstAlign
>= 4) // XXX: Should only do for global
1320 if (Size
>= 8 && DstAlign
>= 4)
1327 static bool isFlatGlobalAddrSpace(unsigned AS
) {
1328 return AS
== AMDGPUAS::GLOBAL_ADDRESS
||
1329 AS
== AMDGPUAS::FLAT_ADDRESS
||
1330 AS
== AMDGPUAS::CONSTANT_ADDRESS
||
1331 AS
> AMDGPUAS::MAX_AMDGPU_ADDRESS
;
1334 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS
,
1335 unsigned DestAS
) const {
1336 return isFlatGlobalAddrSpace(SrcAS
) && isFlatGlobalAddrSpace(DestAS
);
1339 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode
*N
) const {
1340 const MemSDNode
*MemNode
= cast
<MemSDNode
>(N
);
1341 const Value
*Ptr
= MemNode
->getMemOperand()->getValue();
1342 const Instruction
*I
= dyn_cast_or_null
<Instruction
>(Ptr
);
1343 return I
&& I
->getMetadata("amdgpu.noclobber");
1346 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS
,
1347 unsigned DestAS
) const {
1348 // Flat -> private/local is a simple truncate.
1349 // Flat -> global is no-op
1350 if (SrcAS
== AMDGPUAS::FLAT_ADDRESS
)
1353 return isNoopAddrSpaceCast(SrcAS
, DestAS
);
1356 bool SITargetLowering::isMemOpUniform(const SDNode
*N
) const {
1357 const MemSDNode
*MemNode
= cast
<MemSDNode
>(N
);
1359 return AMDGPUInstrInfo::isUniformMMO(MemNode
->getMemOperand());
1362 TargetLoweringBase::LegalizeTypeAction
1363 SITargetLowering::getPreferredVectorAction(MVT VT
) const {
1364 int NumElts
= VT
.getVectorNumElements();
1365 if (NumElts
!= 1 && VT
.getScalarType().bitsLE(MVT::i16
))
1366 return VT
.isPow2VectorType() ? TypeSplitVector
: TypeWidenVector
;
1367 return TargetLoweringBase::getPreferredVectorAction(VT
);
1370 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt
&Imm
,
1372 // FIXME: Could be smarter if called for vector constants.
1376 bool SITargetLowering::isTypeDesirableForOp(unsigned Op
, EVT VT
) const {
1377 if (Subtarget
->has16BitInsts() && VT
== MVT::i16
) {
1382 // These operations are done with 32-bit instructions anyway.
1387 // TODO: Extensions?
1394 // SimplifySetCC uses this function to determine whether or not it should
1395 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1396 if (VT
== MVT::i1
&& Op
== ISD::SETCC
)
1399 return TargetLowering::isTypeDesirableForOp(Op
, VT
);
1402 SDValue
SITargetLowering::lowerKernArgParameterPtr(SelectionDAG
&DAG
,
1405 uint64_t Offset
) const {
1406 const DataLayout
&DL
= DAG
.getDataLayout();
1407 MachineFunction
&MF
= DAG
.getMachineFunction();
1408 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
1410 const ArgDescriptor
*InputPtrReg
;
1411 const TargetRegisterClass
*RC
;
1413 std::tie(InputPtrReg
, RC
)
1414 = Info
->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
);
1416 MachineRegisterInfo
&MRI
= DAG
.getMachineFunction().getRegInfo();
1417 MVT PtrVT
= getPointerTy(DL
, AMDGPUAS::CONSTANT_ADDRESS
);
1418 SDValue BasePtr
= DAG
.getCopyFromReg(Chain
, SL
,
1419 MRI
.getLiveInVirtReg(InputPtrReg
->getRegister()), PtrVT
);
1421 return DAG
.getObjectPtrOffset(SL
, BasePtr
, Offset
);
1424 SDValue
SITargetLowering::getImplicitArgPtr(SelectionDAG
&DAG
,
1425 const SDLoc
&SL
) const {
1426 uint64_t Offset
= getImplicitParameterOffset(DAG
.getMachineFunction(),
1428 return lowerKernArgParameterPtr(DAG
, SL
, DAG
.getEntryNode(), Offset
);
1431 SDValue
SITargetLowering::convertArgType(SelectionDAG
&DAG
, EVT VT
, EVT MemVT
,
1432 const SDLoc
&SL
, SDValue Val
,
1434 const ISD::InputArg
*Arg
) const {
1435 // First, if it is a widened vector, narrow it.
1436 if (VT
.isVector() &&
1437 VT
.getVectorNumElements() != MemVT
.getVectorNumElements()) {
1439 EVT::getVectorVT(*DAG
.getContext(), MemVT
.getVectorElementType(),
1440 VT
.getVectorNumElements());
1441 Val
= DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, SL
, NarrowedVT
, Val
,
1442 DAG
.getConstant(0, SL
, MVT::i32
));
1445 // Then convert the vector elements or scalar value.
1446 if (Arg
&& (Arg
->Flags
.isSExt() || Arg
->Flags
.isZExt()) &&
1448 unsigned Opc
= Arg
->Flags
.isZExt() ? ISD::AssertZext
: ISD::AssertSext
;
1449 Val
= DAG
.getNode(Opc
, SL
, MemVT
, Val
, DAG
.getValueType(VT
));
1452 if (MemVT
.isFloatingPoint())
1453 Val
= getFPExtOrFPTrunc(DAG
, Val
, SL
, VT
);
1455 Val
= DAG
.getSExtOrTrunc(Val
, SL
, VT
);
1457 Val
= DAG
.getZExtOrTrunc(Val
, SL
, VT
);
1462 SDValue
SITargetLowering::lowerKernargMemParameter(
1463 SelectionDAG
&DAG
, EVT VT
, EVT MemVT
,
1464 const SDLoc
&SL
, SDValue Chain
,
1465 uint64_t Offset
, unsigned Align
, bool Signed
,
1466 const ISD::InputArg
*Arg
) const {
1467 Type
*Ty
= MemVT
.getTypeForEVT(*DAG
.getContext());
1468 PointerType
*PtrTy
= PointerType::get(Ty
, AMDGPUAS::CONSTANT_ADDRESS
);
1469 MachinePointerInfo
PtrInfo(UndefValue::get(PtrTy
));
1471 // Try to avoid using an extload by loading earlier than the argument address,
1472 // and extracting the relevant bits. The load should hopefully be merged with
1473 // the previous argument.
1474 if (MemVT
.getStoreSize() < 4 && Align
< 4) {
1475 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1476 int64_t AlignDownOffset
= alignDown(Offset
, 4);
1477 int64_t OffsetDiff
= Offset
- AlignDownOffset
;
1479 EVT IntVT
= MemVT
.changeTypeToInteger();
1481 // TODO: If we passed in the base kernel offset we could have a better
1482 // alignment than 4, but we don't really need it.
1483 SDValue Ptr
= lowerKernArgParameterPtr(DAG
, SL
, Chain
, AlignDownOffset
);
1484 SDValue Load
= DAG
.getLoad(MVT::i32
, SL
, Chain
, Ptr
, PtrInfo
, 4,
1485 MachineMemOperand::MODereferenceable
|
1486 MachineMemOperand::MOInvariant
);
1488 SDValue ShiftAmt
= DAG
.getConstant(OffsetDiff
* 8, SL
, MVT::i32
);
1489 SDValue Extract
= DAG
.getNode(ISD::SRL
, SL
, MVT::i32
, Load
, ShiftAmt
);
1491 SDValue ArgVal
= DAG
.getNode(ISD::TRUNCATE
, SL
, IntVT
, Extract
);
1492 ArgVal
= DAG
.getNode(ISD::BITCAST
, SL
, MemVT
, ArgVal
);
1493 ArgVal
= convertArgType(DAG
, VT
, MemVT
, SL
, ArgVal
, Signed
, Arg
);
1496 return DAG
.getMergeValues({ ArgVal
, Load
.getValue(1) }, SL
);
1499 SDValue Ptr
= lowerKernArgParameterPtr(DAG
, SL
, Chain
, Offset
);
1500 SDValue Load
= DAG
.getLoad(MemVT
, SL
, Chain
, Ptr
, PtrInfo
, Align
,
1501 MachineMemOperand::MODereferenceable
|
1502 MachineMemOperand::MOInvariant
);
1504 SDValue Val
= convertArgType(DAG
, VT
, MemVT
, SL
, Load
, Signed
, Arg
);
1505 return DAG
.getMergeValues({ Val
, Load
.getValue(1) }, SL
);
1508 SDValue
SITargetLowering::lowerStackParameter(SelectionDAG
&DAG
, CCValAssign
&VA
,
1509 const SDLoc
&SL
, SDValue Chain
,
1510 const ISD::InputArg
&Arg
) const {
1511 MachineFunction
&MF
= DAG
.getMachineFunction();
1512 MachineFrameInfo
&MFI
= MF
.getFrameInfo();
1514 if (Arg
.Flags
.isByVal()) {
1515 unsigned Size
= Arg
.Flags
.getByValSize();
1516 int FrameIdx
= MFI
.CreateFixedObject(Size
, VA
.getLocMemOffset(), false);
1517 return DAG
.getFrameIndex(FrameIdx
, MVT::i32
);
1520 unsigned ArgOffset
= VA
.getLocMemOffset();
1521 unsigned ArgSize
= VA
.getValVT().getStoreSize();
1523 int FI
= MFI
.CreateFixedObject(ArgSize
, ArgOffset
, true);
1525 // Create load nodes to retrieve arguments from the stack.
1526 SDValue FIN
= DAG
.getFrameIndex(FI
, MVT::i32
);
1529 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
1530 ISD::LoadExtType ExtType
= ISD::NON_EXTLOAD
;
1531 MVT MemVT
= VA
.getValVT();
1533 switch (VA
.getLocInfo()) {
1536 case CCValAssign::BCvt
:
1537 MemVT
= VA
.getLocVT();
1539 case CCValAssign::SExt
:
1540 ExtType
= ISD::SEXTLOAD
;
1542 case CCValAssign::ZExt
:
1543 ExtType
= ISD::ZEXTLOAD
;
1545 case CCValAssign::AExt
:
1546 ExtType
= ISD::EXTLOAD
;
1550 ArgValue
= DAG
.getExtLoad(
1551 ExtType
, SL
, VA
.getLocVT(), Chain
, FIN
,
1552 MachinePointerInfo::getFixedStack(DAG
.getMachineFunction(), FI
),
1557 SDValue
SITargetLowering::getPreloadedValue(SelectionDAG
&DAG
,
1558 const SIMachineFunctionInfo
&MFI
,
1560 AMDGPUFunctionArgInfo::PreloadedValue PVID
) const {
1561 const ArgDescriptor
*Reg
;
1562 const TargetRegisterClass
*RC
;
1564 std::tie(Reg
, RC
) = MFI
.getPreloadedValue(PVID
);
1565 return CreateLiveInRegister(DAG
, RC
, Reg
->getRegister(), VT
);
1568 static void processShaderInputArgs(SmallVectorImpl
<ISD::InputArg
> &Splits
,
1569 CallingConv::ID CallConv
,
1570 ArrayRef
<ISD::InputArg
> Ins
,
1572 FunctionType
*FType
,
1573 SIMachineFunctionInfo
*Info
) {
1574 for (unsigned I
= 0, E
= Ins
.size(), PSInputNum
= 0; I
!= E
; ++I
) {
1575 const ISD::InputArg
*Arg
= &Ins
[I
];
1577 assert((!Arg
->VT
.isVector() || Arg
->VT
.getScalarSizeInBits() == 16) &&
1578 "vector type argument should have been split");
1580 // First check if it's a PS input addr.
1581 if (CallConv
== CallingConv::AMDGPU_PS
&&
1582 !Arg
->Flags
.isInReg() && PSInputNum
<= 15) {
1583 bool SkipArg
= !Arg
->Used
&& !Info
->isPSInputAllocated(PSInputNum
);
1585 // Inconveniently only the first part of the split is marked as isSplit,
1586 // so skip to the end. We only want to increment PSInputNum once for the
1587 // entire split argument.
1588 if (Arg
->Flags
.isSplit()) {
1589 while (!Arg
->Flags
.isSplitEnd()) {
1590 assert((!Arg
->VT
.isVector() ||
1591 Arg
->VT
.getScalarSizeInBits() == 16) &&
1592 "unexpected vector split in ps argument type");
1594 Splits
.push_back(*Arg
);
1600 // We can safely skip PS inputs.
1601 Skipped
.set(Arg
->getOrigArgIndex());
1606 Info
->markPSInputAllocated(PSInputNum
);
1608 Info
->markPSInputEnabled(PSInputNum
);
1613 Splits
.push_back(*Arg
);
1617 // Allocate special inputs passed in VGPRs.
1618 void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState
&CCInfo
,
1619 MachineFunction
&MF
,
1620 const SIRegisterInfo
&TRI
,
1621 SIMachineFunctionInfo
&Info
) const {
1622 const LLT S32
= LLT::scalar(32);
1623 MachineRegisterInfo
&MRI
= MF
.getRegInfo();
1625 if (Info
.hasWorkItemIDX()) {
1626 Register Reg
= AMDGPU::VGPR0
;
1627 MRI
.setType(MF
.addLiveIn(Reg
, &AMDGPU::VGPR_32RegClass
), S32
);
1629 CCInfo
.AllocateReg(Reg
);
1630 Info
.setWorkItemIDX(ArgDescriptor::createRegister(Reg
));
1633 if (Info
.hasWorkItemIDY()) {
1634 Register Reg
= AMDGPU::VGPR1
;
1635 MRI
.setType(MF
.addLiveIn(Reg
, &AMDGPU::VGPR_32RegClass
), S32
);
1637 CCInfo
.AllocateReg(Reg
);
1638 Info
.setWorkItemIDY(ArgDescriptor::createRegister(Reg
));
1641 if (Info
.hasWorkItemIDZ()) {
1642 Register Reg
= AMDGPU::VGPR2
;
1643 MRI
.setType(MF
.addLiveIn(Reg
, &AMDGPU::VGPR_32RegClass
), S32
);
1645 CCInfo
.AllocateReg(Reg
);
1646 Info
.setWorkItemIDZ(ArgDescriptor::createRegister(Reg
));
1650 // Try to allocate a VGPR at the end of the argument list, or if no argument
1651 // VGPRs are left allocating a stack slot.
1652 // If \p Mask is given it indicates bitfield position in the register.
1653 // If \p Arg is given use it with new \p Mask instead of allocating new.
1654 static ArgDescriptor
allocateVGPR32Input(CCState
&CCInfo
, unsigned Mask
= ~0u,
1655 ArgDescriptor Arg
= ArgDescriptor()) {
1657 return ArgDescriptor::createArg(Arg
, Mask
);
1659 ArrayRef
<MCPhysReg
> ArgVGPRs
1660 = makeArrayRef(AMDGPU::VGPR_32RegClass
.begin(), 32);
1661 unsigned RegIdx
= CCInfo
.getFirstUnallocated(ArgVGPRs
);
1662 if (RegIdx
== ArgVGPRs
.size()) {
1663 // Spill to stack required.
1664 int64_t Offset
= CCInfo
.AllocateStack(4, 4);
1666 return ArgDescriptor::createStack(Offset
, Mask
);
1669 unsigned Reg
= ArgVGPRs
[RegIdx
];
1670 Reg
= CCInfo
.AllocateReg(Reg
);
1671 assert(Reg
!= AMDGPU::NoRegister
);
1673 MachineFunction
&MF
= CCInfo
.getMachineFunction();
1674 Register LiveInVReg
= MF
.addLiveIn(Reg
, &AMDGPU::VGPR_32RegClass
);
1675 MF
.getRegInfo().setType(LiveInVReg
, LLT::scalar(32));
1676 return ArgDescriptor::createRegister(Reg
, Mask
);
1679 static ArgDescriptor
allocateSGPR32InputImpl(CCState
&CCInfo
,
1680 const TargetRegisterClass
*RC
,
1681 unsigned NumArgRegs
) {
1682 ArrayRef
<MCPhysReg
> ArgSGPRs
= makeArrayRef(RC
->begin(), 32);
1683 unsigned RegIdx
= CCInfo
.getFirstUnallocated(ArgSGPRs
);
1684 if (RegIdx
== ArgSGPRs
.size())
1685 report_fatal_error("ran out of SGPRs for arguments");
1687 unsigned Reg
= ArgSGPRs
[RegIdx
];
1688 Reg
= CCInfo
.AllocateReg(Reg
);
1689 assert(Reg
!= AMDGPU::NoRegister
);
1691 MachineFunction
&MF
= CCInfo
.getMachineFunction();
1692 MF
.addLiveIn(Reg
, RC
);
1693 return ArgDescriptor::createRegister(Reg
);
1696 static ArgDescriptor
allocateSGPR32Input(CCState
&CCInfo
) {
1697 return allocateSGPR32InputImpl(CCInfo
, &AMDGPU::SGPR_32RegClass
, 32);
1700 static ArgDescriptor
allocateSGPR64Input(CCState
&CCInfo
) {
1701 return allocateSGPR32InputImpl(CCInfo
, &AMDGPU::SGPR_64RegClass
, 16);
1704 void SITargetLowering::allocateSpecialInputVGPRs(CCState
&CCInfo
,
1705 MachineFunction
&MF
,
1706 const SIRegisterInfo
&TRI
,
1707 SIMachineFunctionInfo
&Info
) const {
1708 const unsigned Mask
= 0x3ff;
1711 if (Info
.hasWorkItemIDX()) {
1712 Arg
= allocateVGPR32Input(CCInfo
, Mask
);
1713 Info
.setWorkItemIDX(Arg
);
1716 if (Info
.hasWorkItemIDY()) {
1717 Arg
= allocateVGPR32Input(CCInfo
, Mask
<< 10, Arg
);
1718 Info
.setWorkItemIDY(Arg
);
1721 if (Info
.hasWorkItemIDZ())
1722 Info
.setWorkItemIDZ(allocateVGPR32Input(CCInfo
, Mask
<< 20, Arg
));
1725 void SITargetLowering::allocateSpecialInputSGPRs(
1727 MachineFunction
&MF
,
1728 const SIRegisterInfo
&TRI
,
1729 SIMachineFunctionInfo
&Info
) const {
1730 auto &ArgInfo
= Info
.getArgInfo();
1732 // TODO: Unify handling with private memory pointers.
1734 if (Info
.hasDispatchPtr())
1735 ArgInfo
.DispatchPtr
= allocateSGPR64Input(CCInfo
);
1737 if (Info
.hasQueuePtr())
1738 ArgInfo
.QueuePtr
= allocateSGPR64Input(CCInfo
);
1740 if (Info
.hasKernargSegmentPtr())
1741 ArgInfo
.KernargSegmentPtr
= allocateSGPR64Input(CCInfo
);
1743 if (Info
.hasDispatchID())
1744 ArgInfo
.DispatchID
= allocateSGPR64Input(CCInfo
);
1746 // flat_scratch_init is not applicable for non-kernel functions.
1748 if (Info
.hasWorkGroupIDX())
1749 ArgInfo
.WorkGroupIDX
= allocateSGPR32Input(CCInfo
);
1751 if (Info
.hasWorkGroupIDY())
1752 ArgInfo
.WorkGroupIDY
= allocateSGPR32Input(CCInfo
);
1754 if (Info
.hasWorkGroupIDZ())
1755 ArgInfo
.WorkGroupIDZ
= allocateSGPR32Input(CCInfo
);
1757 if (Info
.hasImplicitArgPtr())
1758 ArgInfo
.ImplicitArgPtr
= allocateSGPR64Input(CCInfo
);
1761 // Allocate special inputs passed in user SGPRs.
1762 void SITargetLowering::allocateHSAUserSGPRs(CCState
&CCInfo
,
1763 MachineFunction
&MF
,
1764 const SIRegisterInfo
&TRI
,
1765 SIMachineFunctionInfo
&Info
) const {
1766 if (Info
.hasImplicitBufferPtr()) {
1767 unsigned ImplicitBufferPtrReg
= Info
.addImplicitBufferPtr(TRI
);
1768 MF
.addLiveIn(ImplicitBufferPtrReg
, &AMDGPU::SGPR_64RegClass
);
1769 CCInfo
.AllocateReg(ImplicitBufferPtrReg
);
1772 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1773 if (Info
.hasPrivateSegmentBuffer()) {
1774 unsigned PrivateSegmentBufferReg
= Info
.addPrivateSegmentBuffer(TRI
);
1775 MF
.addLiveIn(PrivateSegmentBufferReg
, &AMDGPU::SGPR_128RegClass
);
1776 CCInfo
.AllocateReg(PrivateSegmentBufferReg
);
1779 if (Info
.hasDispatchPtr()) {
1780 unsigned DispatchPtrReg
= Info
.addDispatchPtr(TRI
);
1781 MF
.addLiveIn(DispatchPtrReg
, &AMDGPU::SGPR_64RegClass
);
1782 CCInfo
.AllocateReg(DispatchPtrReg
);
1785 if (Info
.hasQueuePtr()) {
1786 unsigned QueuePtrReg
= Info
.addQueuePtr(TRI
);
1787 MF
.addLiveIn(QueuePtrReg
, &AMDGPU::SGPR_64RegClass
);
1788 CCInfo
.AllocateReg(QueuePtrReg
);
1791 if (Info
.hasKernargSegmentPtr()) {
1792 MachineRegisterInfo
&MRI
= MF
.getRegInfo();
1793 Register InputPtrReg
= Info
.addKernargSegmentPtr(TRI
);
1794 CCInfo
.AllocateReg(InputPtrReg
);
1796 Register VReg
= MF
.addLiveIn(InputPtrReg
, &AMDGPU::SGPR_64RegClass
);
1797 MRI
.setType(VReg
, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS
, 64));
1800 if (Info
.hasDispatchID()) {
1801 unsigned DispatchIDReg
= Info
.addDispatchID(TRI
);
1802 MF
.addLiveIn(DispatchIDReg
, &AMDGPU::SGPR_64RegClass
);
1803 CCInfo
.AllocateReg(DispatchIDReg
);
1806 if (Info
.hasFlatScratchInit()) {
1807 unsigned FlatScratchInitReg
= Info
.addFlatScratchInit(TRI
);
1808 MF
.addLiveIn(FlatScratchInitReg
, &AMDGPU::SGPR_64RegClass
);
1809 CCInfo
.AllocateReg(FlatScratchInitReg
);
1812 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1813 // these from the dispatch pointer.
1816 // Allocate special input registers that are initialized per-wave.
1817 void SITargetLowering::allocateSystemSGPRs(CCState
&CCInfo
,
1818 MachineFunction
&MF
,
1819 SIMachineFunctionInfo
&Info
,
1820 CallingConv::ID CallConv
,
1821 bool IsShader
) const {
1822 if (Info
.hasWorkGroupIDX()) {
1823 unsigned Reg
= Info
.addWorkGroupIDX();
1824 MF
.addLiveIn(Reg
, &AMDGPU::SReg_32_XM0RegClass
);
1825 CCInfo
.AllocateReg(Reg
);
1828 if (Info
.hasWorkGroupIDY()) {
1829 unsigned Reg
= Info
.addWorkGroupIDY();
1830 MF
.addLiveIn(Reg
, &AMDGPU::SReg_32_XM0RegClass
);
1831 CCInfo
.AllocateReg(Reg
);
1834 if (Info
.hasWorkGroupIDZ()) {
1835 unsigned Reg
= Info
.addWorkGroupIDZ();
1836 MF
.addLiveIn(Reg
, &AMDGPU::SReg_32_XM0RegClass
);
1837 CCInfo
.AllocateReg(Reg
);
1840 if (Info
.hasWorkGroupInfo()) {
1841 unsigned Reg
= Info
.addWorkGroupInfo();
1842 MF
.addLiveIn(Reg
, &AMDGPU::SReg_32_XM0RegClass
);
1843 CCInfo
.AllocateReg(Reg
);
1846 if (Info
.hasPrivateSegmentWaveByteOffset()) {
1847 // Scratch wave offset passed in system SGPR.
1848 unsigned PrivateSegmentWaveByteOffsetReg
;
1851 PrivateSegmentWaveByteOffsetReg
=
1852 Info
.getPrivateSegmentWaveByteOffsetSystemSGPR();
1854 // This is true if the scratch wave byte offset doesn't have a fixed
1856 if (PrivateSegmentWaveByteOffsetReg
== AMDGPU::NoRegister
) {
1857 PrivateSegmentWaveByteOffsetReg
= findFirstFreeSGPR(CCInfo
);
1858 Info
.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg
);
1861 PrivateSegmentWaveByteOffsetReg
= Info
.addPrivateSegmentWaveByteOffset();
1863 MF
.addLiveIn(PrivateSegmentWaveByteOffsetReg
, &AMDGPU::SGPR_32RegClass
);
1864 CCInfo
.AllocateReg(PrivateSegmentWaveByteOffsetReg
);
1868 static void reservePrivateMemoryRegs(const TargetMachine
&TM
,
1869 MachineFunction
&MF
,
1870 const SIRegisterInfo
&TRI
,
1871 SIMachineFunctionInfo
&Info
) {
1872 // Now that we've figured out where the scratch register inputs are, see if
1873 // should reserve the arguments and use them directly.
1874 MachineFrameInfo
&MFI
= MF
.getFrameInfo();
1875 bool HasStackObjects
= MFI
.hasStackObjects();
1876 const GCNSubtarget
&ST
= MF
.getSubtarget
<GCNSubtarget
>();
1878 // Record that we know we have non-spill stack objects so we don't need to
1879 // check all stack objects later.
1880 if (HasStackObjects
)
1881 Info
.setHasNonSpillStackObjects(true);
1883 // Everything live out of a block is spilled with fast regalloc, so it's
1884 // almost certain that spilling will be required.
1885 if (TM
.getOptLevel() == CodeGenOpt::None
)
1886 HasStackObjects
= true;
1888 // For now assume stack access is needed in any callee functions, so we need
1889 // the scratch registers to pass in.
1890 bool RequiresStackAccess
= HasStackObjects
|| MFI
.hasCalls();
1892 if (RequiresStackAccess
&& ST
.isAmdHsaOrMesa(MF
.getFunction())) {
1893 // If we have stack objects, we unquestionably need the private buffer
1894 // resource. For the Code Object V2 ABI, this will be the first 4 user
1895 // SGPR inputs. We can reserve those and use them directly.
1897 Register PrivateSegmentBufferReg
=
1898 Info
.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER
);
1899 Info
.setScratchRSrcReg(PrivateSegmentBufferReg
);
1901 unsigned ReservedBufferReg
= TRI
.reservedPrivateSegmentBufferReg(MF
);
1902 // We tentatively reserve the last registers (skipping the last registers
1903 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
1904 // we'll replace these with the ones immediately after those which were
1905 // really allocated. In the prologue copies will be inserted from the
1906 // argument to these reserved registers.
1908 // Without HSA, relocations are used for the scratch pointer and the
1909 // buffer resource setup is always inserted in the prologue. Scratch wave
1910 // offset is still in an input SGPR.
1911 Info
.setScratchRSrcReg(ReservedBufferReg
);
1914 // hasFP should be accurate for kernels even before the frame is finalized.
1915 if (ST
.getFrameLowering()->hasFP(MF
)) {
1916 MachineRegisterInfo
&MRI
= MF
.getRegInfo();
1918 // Try to use s32 as the SP, but move it if it would interfere with input
1919 // arguments. This won't work with calls though.
1921 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
1923 if (!MRI
.isLiveIn(AMDGPU::SGPR32
)) {
1924 Info
.setStackPtrOffsetReg(AMDGPU::SGPR32
);
1926 assert(AMDGPU::isShader(MF
.getFunction().getCallingConv()));
1929 report_fatal_error("call in graphics shader with too many input SGPRs");
1931 for (unsigned Reg
: AMDGPU::SGPR_32RegClass
) {
1932 if (!MRI
.isLiveIn(Reg
)) {
1933 Info
.setStackPtrOffsetReg(Reg
);
1938 if (Info
.getStackPtrOffsetReg() == AMDGPU::SP_REG
)
1939 report_fatal_error("failed to find register for SP");
1942 if (MFI
.hasCalls()) {
1943 Info
.setScratchWaveOffsetReg(AMDGPU::SGPR33
);
1944 Info
.setFrameOffsetReg(AMDGPU::SGPR33
);
1946 unsigned ReservedOffsetReg
=
1947 TRI
.reservedPrivateSegmentWaveByteOffsetReg(MF
);
1948 Info
.setScratchWaveOffsetReg(ReservedOffsetReg
);
1949 Info
.setFrameOffsetReg(ReservedOffsetReg
);
1951 } else if (RequiresStackAccess
) {
1952 assert(!MFI
.hasCalls());
1953 // We know there are accesses and they will be done relative to SP, so just
1954 // pin it to the input.
1956 // FIXME: Should not do this if inline asm is reading/writing these
1958 Register PreloadedSP
= Info
.getPreloadedReg(
1959 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET
);
1961 Info
.setStackPtrOffsetReg(PreloadedSP
);
1962 Info
.setScratchWaveOffsetReg(PreloadedSP
);
1963 Info
.setFrameOffsetReg(PreloadedSP
);
1965 assert(!MFI
.hasCalls());
1967 // There may not be stack access at all. There may still be spills, or
1968 // access of a constant pointer (in which cases an extra copy will be
1969 // emitted in the prolog).
1970 unsigned ReservedOffsetReg
1971 = TRI
.reservedPrivateSegmentWaveByteOffsetReg(MF
);
1972 Info
.setStackPtrOffsetReg(ReservedOffsetReg
);
1973 Info
.setScratchWaveOffsetReg(ReservedOffsetReg
);
1974 Info
.setFrameOffsetReg(ReservedOffsetReg
);
1978 bool SITargetLowering::supportSplitCSR(MachineFunction
*MF
) const {
1979 const SIMachineFunctionInfo
*Info
= MF
->getInfo
<SIMachineFunctionInfo
>();
1980 return !Info
->isEntryFunction();
1983 void SITargetLowering::initializeSplitCSR(MachineBasicBlock
*Entry
) const {
1987 void SITargetLowering::insertCopiesSplitCSR(
1988 MachineBasicBlock
*Entry
,
1989 const SmallVectorImpl
<MachineBasicBlock
*> &Exits
) const {
1990 const SIRegisterInfo
*TRI
= getSubtarget()->getRegisterInfo();
1992 const MCPhysReg
*IStart
= TRI
->getCalleeSavedRegsViaCopy(Entry
->getParent());
1996 const TargetInstrInfo
*TII
= Subtarget
->getInstrInfo();
1997 MachineRegisterInfo
*MRI
= &Entry
->getParent()->getRegInfo();
1998 MachineBasicBlock::iterator MBBI
= Entry
->begin();
1999 for (const MCPhysReg
*I
= IStart
; *I
; ++I
) {
2000 const TargetRegisterClass
*RC
= nullptr;
2001 if (AMDGPU::SReg_64RegClass
.contains(*I
))
2002 RC
= &AMDGPU::SGPR_64RegClass
;
2003 else if (AMDGPU::SReg_32RegClass
.contains(*I
))
2004 RC
= &AMDGPU::SGPR_32RegClass
;
2006 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2008 Register NewVR
= MRI
->createVirtualRegister(RC
);
2009 // Create copy from CSR to a virtual register.
2010 Entry
->addLiveIn(*I
);
2011 BuildMI(*Entry
, MBBI
, DebugLoc(), TII
->get(TargetOpcode::COPY
), NewVR
)
2014 // Insert the copy-back instructions right before the terminator.
2015 for (auto *Exit
: Exits
)
2016 BuildMI(*Exit
, Exit
->getFirstTerminator(), DebugLoc(),
2017 TII
->get(TargetOpcode::COPY
), *I
)
2022 SDValue
SITargetLowering::LowerFormalArguments(
2023 SDValue Chain
, CallingConv::ID CallConv
, bool isVarArg
,
2024 const SmallVectorImpl
<ISD::InputArg
> &Ins
, const SDLoc
&DL
,
2025 SelectionDAG
&DAG
, SmallVectorImpl
<SDValue
> &InVals
) const {
2026 const SIRegisterInfo
*TRI
= getSubtarget()->getRegisterInfo();
2028 MachineFunction
&MF
= DAG
.getMachineFunction();
2029 const Function
&Fn
= MF
.getFunction();
2030 FunctionType
*FType
= MF
.getFunction().getFunctionType();
2031 SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
2033 if (Subtarget
->isAmdHsaOS() && AMDGPU::isShader(CallConv
)) {
2034 DiagnosticInfoUnsupported
NoGraphicsHSA(
2035 Fn
, "unsupported non-compute shaders with HSA", DL
.getDebugLoc());
2036 DAG
.getContext()->diagnose(NoGraphicsHSA
);
2037 return DAG
.getEntryNode();
2040 SmallVector
<ISD::InputArg
, 16> Splits
;
2041 SmallVector
<CCValAssign
, 16> ArgLocs
;
2042 BitVector
Skipped(Ins
.size());
2043 CCState
CCInfo(CallConv
, isVarArg
, DAG
.getMachineFunction(), ArgLocs
,
2046 bool IsShader
= AMDGPU::isShader(CallConv
);
2047 bool IsKernel
= AMDGPU::isKernel(CallConv
);
2048 bool IsEntryFunc
= AMDGPU::isEntryFunctionCC(CallConv
);
2051 processShaderInputArgs(Splits
, CallConv
, Ins
, Skipped
, FType
, Info
);
2053 // At least one interpolation mode must be enabled or else the GPU will
2056 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2057 // set PSInputAddr, the user wants to enable some bits after the compilation
2058 // based on run-time states. Since we can't know what the final PSInputEna
2059 // will look like, so we shouldn't do anything here and the user should take
2060 // responsibility for the correct programming.
2062 // Otherwise, the following restrictions apply:
2063 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2064 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2066 if (CallConv
== CallingConv::AMDGPU_PS
) {
2067 if ((Info
->getPSInputAddr() & 0x7F) == 0 ||
2068 ((Info
->getPSInputAddr() & 0xF) == 0 &&
2069 Info
->isPSInputAllocated(11))) {
2070 CCInfo
.AllocateReg(AMDGPU::VGPR0
);
2071 CCInfo
.AllocateReg(AMDGPU::VGPR1
);
2072 Info
->markPSInputAllocated(0);
2073 Info
->markPSInputEnabled(0);
2075 if (Subtarget
->isAmdPalOS()) {
2076 // For isAmdPalOS, the user does not enable some bits after compilation
2077 // based on run-time states; the register values being generated here are
2078 // the final ones set in hardware. Therefore we need to apply the
2079 // workaround to PSInputAddr and PSInputEnable together. (The case where
2080 // a bit is set in PSInputAddr but not PSInputEnable is where the
2081 // frontend set up an input arg for a particular interpolation mode, but
2082 // nothing uses that input arg. Really we should have an earlier pass
2083 // that removes such an arg.)
2084 unsigned PsInputBits
= Info
->getPSInputAddr() & Info
->getPSInputEnable();
2085 if ((PsInputBits
& 0x7F) == 0 ||
2086 ((PsInputBits
& 0xF) == 0 &&
2087 (PsInputBits
>> 11 & 1)))
2088 Info
->markPSInputEnabled(
2089 countTrailingZeros(Info
->getPSInputAddr(), ZB_Undefined
));
2093 assert(!Info
->hasDispatchPtr() &&
2094 !Info
->hasKernargSegmentPtr() && !Info
->hasFlatScratchInit() &&
2095 !Info
->hasWorkGroupIDX() && !Info
->hasWorkGroupIDY() &&
2096 !Info
->hasWorkGroupIDZ() && !Info
->hasWorkGroupInfo() &&
2097 !Info
->hasWorkItemIDX() && !Info
->hasWorkItemIDY() &&
2098 !Info
->hasWorkItemIDZ());
2099 } else if (IsKernel
) {
2100 assert(Info
->hasWorkGroupIDX() && Info
->hasWorkItemIDX());
2102 Splits
.append(Ins
.begin(), Ins
.end());
2106 allocateSpecialEntryInputVGPRs(CCInfo
, MF
, *TRI
, *Info
);
2107 allocateHSAUserSGPRs(CCInfo
, MF
, *TRI
, *Info
);
2111 analyzeFormalArgumentsCompute(CCInfo
, Ins
);
2113 CCAssignFn
*AssignFn
= CCAssignFnForCall(CallConv
, isVarArg
);
2114 CCInfo
.AnalyzeFormalArguments(Splits
, AssignFn
);
2117 SmallVector
<SDValue
, 16> Chains
;
2119 // FIXME: This is the minimum kernel argument alignment. We should improve
2120 // this to the maximum alignment of the arguments.
2122 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2124 const unsigned KernelArgBaseAlign
= 16;
2126 for (unsigned i
= 0, e
= Ins
.size(), ArgIdx
= 0; i
!= e
; ++i
) {
2127 const ISD::InputArg
&Arg
= Ins
[i
];
2128 if (Arg
.isOrigArg() && Skipped
[Arg
.getOrigArgIndex()]) {
2129 InVals
.push_back(DAG
.getUNDEF(Arg
.VT
));
2133 CCValAssign
&VA
= ArgLocs
[ArgIdx
++];
2134 MVT VT
= VA
.getLocVT();
2136 if (IsEntryFunc
&& VA
.isMemLoc()) {
2138 EVT MemVT
= VA
.getLocVT();
2140 const uint64_t Offset
= VA
.getLocMemOffset();
2141 unsigned Align
= MinAlign(KernelArgBaseAlign
, Offset
);
2143 SDValue Arg
= lowerKernargMemParameter(
2144 DAG
, VT
, MemVT
, DL
, Chain
, Offset
, Align
, Ins
[i
].Flags
.isSExt(), &Ins
[i
]);
2145 Chains
.push_back(Arg
.getValue(1));
2148 dyn_cast
<PointerType
>(FType
->getParamType(Ins
[i
].getOrigArgIndex()));
2149 if (Subtarget
->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS
&&
2150 ParamTy
&& (ParamTy
->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS
||
2151 ParamTy
->getAddressSpace() == AMDGPUAS::REGION_ADDRESS
)) {
2152 // On SI local pointers are just offsets into LDS, so they are always
2153 // less than 16-bits. On CI and newer they could potentially be
2154 // real pointers, so we can't guarantee their size.
2155 Arg
= DAG
.getNode(ISD::AssertZext
, DL
, Arg
.getValueType(), Arg
,
2156 DAG
.getValueType(MVT::i16
));
2159 InVals
.push_back(Arg
);
2161 } else if (!IsEntryFunc
&& VA
.isMemLoc()) {
2162 SDValue Val
= lowerStackParameter(DAG
, VA
, DL
, Chain
, Arg
);
2163 InVals
.push_back(Val
);
2164 if (!Arg
.Flags
.isByVal())
2165 Chains
.push_back(Val
.getValue(1));
2169 assert(VA
.isRegLoc() && "Parameter must be in a register!");
2171 Register Reg
= VA
.getLocReg();
2172 const TargetRegisterClass
*RC
= TRI
->getMinimalPhysRegClass(Reg
, VT
);
2173 EVT ValVT
= VA
.getValVT();
2175 Reg
= MF
.addLiveIn(Reg
, RC
);
2176 SDValue Val
= DAG
.getCopyFromReg(Chain
, DL
, Reg
, VT
);
2178 if (Arg
.Flags
.isSRet()) {
2179 // The return object should be reasonably addressable.
2181 // FIXME: This helps when the return is a real sret. If it is a
2182 // automatically inserted sret (i.e. CanLowerReturn returns false), an
2183 // extra copy is inserted in SelectionDAGBuilder which obscures this.
2185 = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
2186 Val
= DAG
.getNode(ISD::AssertZext
, DL
, VT
, Val
,
2187 DAG
.getValueType(EVT::getIntegerVT(*DAG
.getContext(), NumBits
)));
2190 // If this is an 8 or 16-bit value, it is really passed promoted
2191 // to 32 bits. Insert an assert[sz]ext to capture this, then
2192 // truncate to the right size.
2193 switch (VA
.getLocInfo()) {
2194 case CCValAssign::Full
:
2196 case CCValAssign::BCvt
:
2197 Val
= DAG
.getNode(ISD::BITCAST
, DL
, ValVT
, Val
);
2199 case CCValAssign::SExt
:
2200 Val
= DAG
.getNode(ISD::AssertSext
, DL
, VT
, Val
,
2201 DAG
.getValueType(ValVT
));
2202 Val
= DAG
.getNode(ISD::TRUNCATE
, DL
, ValVT
, Val
);
2204 case CCValAssign::ZExt
:
2205 Val
= DAG
.getNode(ISD::AssertZext
, DL
, VT
, Val
,
2206 DAG
.getValueType(ValVT
));
2207 Val
= DAG
.getNode(ISD::TRUNCATE
, DL
, ValVT
, Val
);
2209 case CCValAssign::AExt
:
2210 Val
= DAG
.getNode(ISD::TRUNCATE
, DL
, ValVT
, Val
);
2213 llvm_unreachable("Unknown loc info!");
2216 InVals
.push_back(Val
);
2220 // Special inputs come after user arguments.
2221 allocateSpecialInputVGPRs(CCInfo
, MF
, *TRI
, *Info
);
2224 // Start adding system SGPRs.
2226 allocateSystemSGPRs(CCInfo
, MF
, *Info
, CallConv
, IsShader
);
2228 CCInfo
.AllocateReg(Info
->getScratchRSrcReg());
2229 CCInfo
.AllocateReg(Info
->getScratchWaveOffsetReg());
2230 CCInfo
.AllocateReg(Info
->getFrameOffsetReg());
2231 allocateSpecialInputSGPRs(CCInfo
, MF
, *TRI
, *Info
);
2234 auto &ArgUsageInfo
=
2235 DAG
.getPass()->getAnalysis
<AMDGPUArgumentUsageInfo
>();
2236 ArgUsageInfo
.setFuncArgInfo(Fn
, Info
->getArgInfo());
2238 unsigned StackArgSize
= CCInfo
.getNextStackOffset();
2239 Info
->setBytesInStackArgArea(StackArgSize
);
2241 return Chains
.empty() ? Chain
:
2242 DAG
.getNode(ISD::TokenFactor
, DL
, MVT::Other
, Chains
);
2245 // TODO: If return values can't fit in registers, we should return as many as
2246 // possible in registers before passing on stack.
2247 bool SITargetLowering::CanLowerReturn(
2248 CallingConv::ID CallConv
,
2249 MachineFunction
&MF
, bool IsVarArg
,
2250 const SmallVectorImpl
<ISD::OutputArg
> &Outs
,
2251 LLVMContext
&Context
) const {
2252 // Replacing returns with sret/stack usage doesn't make sense for shaders.
2253 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2254 // for shaders. Vector types should be explicitly handled by CC.
2255 if (AMDGPU::isEntryFunctionCC(CallConv
))
2258 SmallVector
<CCValAssign
, 16> RVLocs
;
2259 CCState
CCInfo(CallConv
, IsVarArg
, MF
, RVLocs
, Context
);
2260 return CCInfo
.CheckReturn(Outs
, CCAssignFnForReturn(CallConv
, IsVarArg
));
2264 SITargetLowering::LowerReturn(SDValue Chain
, CallingConv::ID CallConv
,
2266 const SmallVectorImpl
<ISD::OutputArg
> &Outs
,
2267 const SmallVectorImpl
<SDValue
> &OutVals
,
2268 const SDLoc
&DL
, SelectionDAG
&DAG
) const {
2269 MachineFunction
&MF
= DAG
.getMachineFunction();
2270 SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
2272 if (AMDGPU::isKernel(CallConv
)) {
2273 return AMDGPUTargetLowering::LowerReturn(Chain
, CallConv
, isVarArg
, Outs
,
2277 bool IsShader
= AMDGPU::isShader(CallConv
);
2279 Info
->setIfReturnsVoid(Outs
.empty());
2280 bool IsWaveEnd
= Info
->returnsVoid() && IsShader
;
2282 // CCValAssign - represent the assignment of the return value to a location.
2283 SmallVector
<CCValAssign
, 48> RVLocs
;
2284 SmallVector
<ISD::OutputArg
, 48> Splits
;
2286 // CCState - Info about the registers and stack slots.
2287 CCState
CCInfo(CallConv
, isVarArg
, DAG
.getMachineFunction(), RVLocs
,
2290 // Analyze outgoing return values.
2291 CCInfo
.AnalyzeReturn(Outs
, CCAssignFnForReturn(CallConv
, isVarArg
));
2294 SmallVector
<SDValue
, 48> RetOps
;
2295 RetOps
.push_back(Chain
); // Operand #0 = Chain (updated below)
2297 // Add return address for callable functions.
2298 if (!Info
->isEntryFunction()) {
2299 const SIRegisterInfo
*TRI
= getSubtarget()->getRegisterInfo();
2300 SDValue ReturnAddrReg
= CreateLiveInRegister(
2301 DAG
, &AMDGPU::SReg_64RegClass
, TRI
->getReturnAddressReg(MF
), MVT::i64
);
2303 SDValue ReturnAddrVirtualReg
= DAG
.getRegister(
2304 MF
.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass
),
2307 DAG
.getCopyToReg(Chain
, DL
, ReturnAddrVirtualReg
, ReturnAddrReg
, Flag
);
2308 Flag
= Chain
.getValue(1);
2309 RetOps
.push_back(ReturnAddrVirtualReg
);
2312 // Copy the result values into the output registers.
2313 for (unsigned I
= 0, RealRVLocIdx
= 0, E
= RVLocs
.size(); I
!= E
;
2314 ++I
, ++RealRVLocIdx
) {
2315 CCValAssign
&VA
= RVLocs
[I
];
2316 assert(VA
.isRegLoc() && "Can only return in registers!");
2317 // TODO: Partially return in registers if return values don't fit.
2318 SDValue Arg
= OutVals
[RealRVLocIdx
];
2320 // Copied from other backends.
2321 switch (VA
.getLocInfo()) {
2322 case CCValAssign::Full
:
2324 case CCValAssign::BCvt
:
2325 Arg
= DAG
.getNode(ISD::BITCAST
, DL
, VA
.getLocVT(), Arg
);
2327 case CCValAssign::SExt
:
2328 Arg
= DAG
.getNode(ISD::SIGN_EXTEND
, DL
, VA
.getLocVT(), Arg
);
2330 case CCValAssign::ZExt
:
2331 Arg
= DAG
.getNode(ISD::ZERO_EXTEND
, DL
, VA
.getLocVT(), Arg
);
2333 case CCValAssign::AExt
:
2334 Arg
= DAG
.getNode(ISD::ANY_EXTEND
, DL
, VA
.getLocVT(), Arg
);
2337 llvm_unreachable("Unknown loc info!");
2340 Chain
= DAG
.getCopyToReg(Chain
, DL
, VA
.getLocReg(), Arg
, Flag
);
2341 Flag
= Chain
.getValue(1);
2342 RetOps
.push_back(DAG
.getRegister(VA
.getLocReg(), VA
.getLocVT()));
2345 // FIXME: Does sret work properly?
2346 if (!Info
->isEntryFunction()) {
2347 const SIRegisterInfo
*TRI
= Subtarget
->getRegisterInfo();
2348 const MCPhysReg
*I
=
2349 TRI
->getCalleeSavedRegsViaCopy(&DAG
.getMachineFunction());
2352 if (AMDGPU::SReg_64RegClass
.contains(*I
))
2353 RetOps
.push_back(DAG
.getRegister(*I
, MVT::i64
));
2354 else if (AMDGPU::SReg_32RegClass
.contains(*I
))
2355 RetOps
.push_back(DAG
.getRegister(*I
, MVT::i32
));
2357 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2362 // Update chain and glue.
2365 RetOps
.push_back(Flag
);
2367 unsigned Opc
= AMDGPUISD::ENDPGM
;
2369 Opc
= IsShader
? AMDGPUISD::RETURN_TO_EPILOG
: AMDGPUISD::RET_FLAG
;
2370 return DAG
.getNode(Opc
, DL
, MVT::Other
, RetOps
);
2373 SDValue
SITargetLowering::LowerCallResult(
2374 SDValue Chain
, SDValue InFlag
, CallingConv::ID CallConv
, bool IsVarArg
,
2375 const SmallVectorImpl
<ISD::InputArg
> &Ins
, const SDLoc
&DL
,
2376 SelectionDAG
&DAG
, SmallVectorImpl
<SDValue
> &InVals
, bool IsThisReturn
,
2377 SDValue ThisVal
) const {
2378 CCAssignFn
*RetCC
= CCAssignFnForReturn(CallConv
, IsVarArg
);
2380 // Assign locations to each value returned by this call.
2381 SmallVector
<CCValAssign
, 16> RVLocs
;
2382 CCState
CCInfo(CallConv
, IsVarArg
, DAG
.getMachineFunction(), RVLocs
,
2384 CCInfo
.AnalyzeCallResult(Ins
, RetCC
);
2386 // Copy all of the result registers out of their specified physreg.
2387 for (unsigned i
= 0; i
!= RVLocs
.size(); ++i
) {
2388 CCValAssign VA
= RVLocs
[i
];
2391 if (VA
.isRegLoc()) {
2392 Val
= DAG
.getCopyFromReg(Chain
, DL
, VA
.getLocReg(), VA
.getLocVT(), InFlag
);
2393 Chain
= Val
.getValue(1);
2394 InFlag
= Val
.getValue(2);
2395 } else if (VA
.isMemLoc()) {
2396 report_fatal_error("TODO: return values in memory");
2398 llvm_unreachable("unknown argument location type");
2400 switch (VA
.getLocInfo()) {
2401 case CCValAssign::Full
:
2403 case CCValAssign::BCvt
:
2404 Val
= DAG
.getNode(ISD::BITCAST
, DL
, VA
.getValVT(), Val
);
2406 case CCValAssign::ZExt
:
2407 Val
= DAG
.getNode(ISD::AssertZext
, DL
, VA
.getLocVT(), Val
,
2408 DAG
.getValueType(VA
.getValVT()));
2409 Val
= DAG
.getNode(ISD::TRUNCATE
, DL
, VA
.getValVT(), Val
);
2411 case CCValAssign::SExt
:
2412 Val
= DAG
.getNode(ISD::AssertSext
, DL
, VA
.getLocVT(), Val
,
2413 DAG
.getValueType(VA
.getValVT()));
2414 Val
= DAG
.getNode(ISD::TRUNCATE
, DL
, VA
.getValVT(), Val
);
2416 case CCValAssign::AExt
:
2417 Val
= DAG
.getNode(ISD::TRUNCATE
, DL
, VA
.getValVT(), Val
);
2420 llvm_unreachable("Unknown loc info!");
2423 InVals
.push_back(Val
);
2429 // Add code to pass special inputs required depending on used features separate
2430 // from the explicit user arguments present in the IR.
2431 void SITargetLowering::passSpecialInputs(
2432 CallLoweringInfo
&CLI
,
2434 const SIMachineFunctionInfo
&Info
,
2435 SmallVectorImpl
<std::pair
<unsigned, SDValue
>> &RegsToPass
,
2436 SmallVectorImpl
<SDValue
> &MemOpChains
,
2437 SDValue Chain
) const {
2438 // If we don't have a call site, this was a call inserted by
2439 // legalization. These can never use special inputs.
2443 const Function
*CalleeFunc
= CLI
.CS
.getCalledFunction();
2446 SelectionDAG
&DAG
= CLI
.DAG
;
2447 const SDLoc
&DL
= CLI
.DL
;
2449 const SIRegisterInfo
*TRI
= Subtarget
->getRegisterInfo();
2451 auto &ArgUsageInfo
=
2452 DAG
.getPass()->getAnalysis
<AMDGPUArgumentUsageInfo
>();
2453 const AMDGPUFunctionArgInfo
&CalleeArgInfo
2454 = ArgUsageInfo
.lookupFuncArgInfo(*CalleeFunc
);
2456 const AMDGPUFunctionArgInfo
&CallerArgInfo
= Info
.getArgInfo();
2458 // TODO: Unify with private memory register handling. This is complicated by
2459 // the fact that at least in kernels, the input argument is not necessarily
2460 // in the same location as the input.
2461 AMDGPUFunctionArgInfo::PreloadedValue InputRegs
[] = {
2462 AMDGPUFunctionArgInfo::DISPATCH_PTR
,
2463 AMDGPUFunctionArgInfo::QUEUE_PTR
,
2464 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
,
2465 AMDGPUFunctionArgInfo::DISPATCH_ID
,
2466 AMDGPUFunctionArgInfo::WORKGROUP_ID_X
,
2467 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y
,
2468 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
,
2469 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
2472 for (auto InputID
: InputRegs
) {
2473 const ArgDescriptor
*OutgoingArg
;
2474 const TargetRegisterClass
*ArgRC
;
2476 std::tie(OutgoingArg
, ArgRC
) = CalleeArgInfo
.getPreloadedValue(InputID
);
2480 const ArgDescriptor
*IncomingArg
;
2481 const TargetRegisterClass
*IncomingArgRC
;
2482 std::tie(IncomingArg
, IncomingArgRC
)
2483 = CallerArgInfo
.getPreloadedValue(InputID
);
2484 assert(IncomingArgRC
== ArgRC
);
2486 // All special arguments are ints for now.
2487 EVT ArgVT
= TRI
->getSpillSize(*ArgRC
) == 8 ? MVT::i64
: MVT::i32
;
2491 InputReg
= loadInputValue(DAG
, ArgRC
, ArgVT
, DL
, *IncomingArg
);
2493 // The implicit arg ptr is special because it doesn't have a corresponding
2494 // input for kernels, and is computed from the kernarg segment pointer.
2495 assert(InputID
== AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
);
2496 InputReg
= getImplicitArgPtr(DAG
, DL
);
2499 if (OutgoingArg
->isRegister()) {
2500 RegsToPass
.emplace_back(OutgoingArg
->getRegister(), InputReg
);
2502 unsigned SpecialArgOffset
= CCInfo
.AllocateStack(ArgVT
.getStoreSize(), 4);
2503 SDValue ArgStore
= storeStackInputValue(DAG
, DL
, Chain
, InputReg
,
2505 MemOpChains
.push_back(ArgStore
);
2509 // Pack workitem IDs into a single register or pass it as is if already
2511 const ArgDescriptor
*OutgoingArg
;
2512 const TargetRegisterClass
*ArgRC
;
2514 std::tie(OutgoingArg
, ArgRC
) =
2515 CalleeArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X
);
2517 std::tie(OutgoingArg
, ArgRC
) =
2518 CalleeArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y
);
2520 std::tie(OutgoingArg
, ArgRC
) =
2521 CalleeArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z
);
2525 const ArgDescriptor
*IncomingArgX
2526 = CallerArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X
).first
;
2527 const ArgDescriptor
*IncomingArgY
2528 = CallerArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y
).first
;
2529 const ArgDescriptor
*IncomingArgZ
2530 = CallerArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z
).first
;
2535 // If incoming ids are not packed we need to pack them.
2536 if (IncomingArgX
&& !IncomingArgX
->isMasked() && CalleeArgInfo
.WorkItemIDX
)
2537 InputReg
= loadInputValue(DAG
, ArgRC
, MVT::i32
, DL
, *IncomingArgX
);
2539 if (IncomingArgY
&& !IncomingArgY
->isMasked() && CalleeArgInfo
.WorkItemIDY
) {
2540 SDValue Y
= loadInputValue(DAG
, ArgRC
, MVT::i32
, DL
, *IncomingArgY
);
2541 Y
= DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, Y
,
2542 DAG
.getShiftAmountConstant(10, MVT::i32
, SL
));
2543 InputReg
= InputReg
.getNode() ?
2544 DAG
.getNode(ISD::OR
, SL
, MVT::i32
, InputReg
, Y
) : Y
;
2547 if (IncomingArgZ
&& !IncomingArgZ
->isMasked() && CalleeArgInfo
.WorkItemIDZ
) {
2548 SDValue Z
= loadInputValue(DAG
, ArgRC
, MVT::i32
, DL
, *IncomingArgZ
);
2549 Z
= DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, Z
,
2550 DAG
.getShiftAmountConstant(20, MVT::i32
, SL
));
2551 InputReg
= InputReg
.getNode() ?
2552 DAG
.getNode(ISD::OR
, SL
, MVT::i32
, InputReg
, Z
) : Z
;
2555 if (!InputReg
.getNode()) {
2556 // Workitem ids are already packed, any of present incoming arguments
2557 // will carry all required fields.
2558 ArgDescriptor IncomingArg
= ArgDescriptor::createArg(
2559 IncomingArgX
? *IncomingArgX
:
2560 IncomingArgY
? *IncomingArgY
:
2561 *IncomingArgZ
, ~0u);
2562 InputReg
= loadInputValue(DAG
, ArgRC
, MVT::i32
, DL
, IncomingArg
);
2565 if (OutgoingArg
->isRegister()) {
2566 RegsToPass
.emplace_back(OutgoingArg
->getRegister(), InputReg
);
2568 unsigned SpecialArgOffset
= CCInfo
.AllocateStack(4, 4);
2569 SDValue ArgStore
= storeStackInputValue(DAG
, DL
, Chain
, InputReg
,
2571 MemOpChains
.push_back(ArgStore
);
2575 static bool canGuaranteeTCO(CallingConv::ID CC
) {
2576 return CC
== CallingConv::Fast
;
2579 /// Return true if we might ever do TCO for calls with this calling convention.
2580 static bool mayTailCallThisCC(CallingConv::ID CC
) {
2582 case CallingConv::C
:
2585 return canGuaranteeTCO(CC
);
2589 bool SITargetLowering::isEligibleForTailCallOptimization(
2590 SDValue Callee
, CallingConv::ID CalleeCC
, bool IsVarArg
,
2591 const SmallVectorImpl
<ISD::OutputArg
> &Outs
,
2592 const SmallVectorImpl
<SDValue
> &OutVals
,
2593 const SmallVectorImpl
<ISD::InputArg
> &Ins
, SelectionDAG
&DAG
) const {
2594 if (!mayTailCallThisCC(CalleeCC
))
2597 MachineFunction
&MF
= DAG
.getMachineFunction();
2598 const Function
&CallerF
= MF
.getFunction();
2599 CallingConv::ID CallerCC
= CallerF
.getCallingConv();
2600 const SIRegisterInfo
*TRI
= getSubtarget()->getRegisterInfo();
2601 const uint32_t *CallerPreserved
= TRI
->getCallPreservedMask(MF
, CallerCC
);
2603 // Kernels aren't callable, and don't have a live in return address so it
2604 // doesn't make sense to do a tail call with entry functions.
2605 if (!CallerPreserved
)
2608 bool CCMatch
= CallerCC
== CalleeCC
;
2610 if (DAG
.getTarget().Options
.GuaranteedTailCallOpt
) {
2611 if (canGuaranteeTCO(CalleeCC
) && CCMatch
)
2616 // TODO: Can we handle var args?
2620 for (const Argument
&Arg
: CallerF
.args()) {
2621 if (Arg
.hasByValAttr())
2625 LLVMContext
&Ctx
= *DAG
.getContext();
2627 // Check that the call results are passed in the same way.
2628 if (!CCState::resultsCompatible(CalleeCC
, CallerCC
, MF
, Ctx
, Ins
,
2629 CCAssignFnForCall(CalleeCC
, IsVarArg
),
2630 CCAssignFnForCall(CallerCC
, IsVarArg
)))
2633 // The callee has to preserve all registers the caller needs to preserve.
2635 const uint32_t *CalleePreserved
= TRI
->getCallPreservedMask(MF
, CalleeCC
);
2636 if (!TRI
->regmaskSubsetEqual(CallerPreserved
, CalleePreserved
))
2640 // Nothing more to check if the callee is taking no arguments.
2644 SmallVector
<CCValAssign
, 16> ArgLocs
;
2645 CCState
CCInfo(CalleeCC
, IsVarArg
, MF
, ArgLocs
, Ctx
);
2647 CCInfo
.AnalyzeCallOperands(Outs
, CCAssignFnForCall(CalleeCC
, IsVarArg
));
2649 const SIMachineFunctionInfo
*FuncInfo
= MF
.getInfo
<SIMachineFunctionInfo
>();
2650 // If the stack arguments for this call do not fit into our own save area then
2651 // the call cannot be made tail.
2652 // TODO: Is this really necessary?
2653 if (CCInfo
.getNextStackOffset() > FuncInfo
->getBytesInStackArgArea())
2656 const MachineRegisterInfo
&MRI
= MF
.getRegInfo();
2657 return parametersInCSRMatch(MRI
, CallerPreserved
, ArgLocs
, OutVals
);
2660 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst
*CI
) const {
2661 if (!CI
->isTailCall())
2664 const Function
*ParentFn
= CI
->getParent()->getParent();
2665 if (AMDGPU::isEntryFunctionCC(ParentFn
->getCallingConv()))
2668 auto Attr
= ParentFn
->getFnAttribute("disable-tail-calls");
2669 return (Attr
.getValueAsString() != "true");
2672 // The wave scratch offset register is used as the global base pointer.
2673 SDValue
SITargetLowering::LowerCall(CallLoweringInfo
&CLI
,
2674 SmallVectorImpl
<SDValue
> &InVals
) const {
2675 SelectionDAG
&DAG
= CLI
.DAG
;
2676 const SDLoc
&DL
= CLI
.DL
;
2677 SmallVector
<ISD::OutputArg
, 32> &Outs
= CLI
.Outs
;
2678 SmallVector
<SDValue
, 32> &OutVals
= CLI
.OutVals
;
2679 SmallVector
<ISD::InputArg
, 32> &Ins
= CLI
.Ins
;
2680 SDValue Chain
= CLI
.Chain
;
2681 SDValue Callee
= CLI
.Callee
;
2682 bool &IsTailCall
= CLI
.IsTailCall
;
2683 CallingConv::ID CallConv
= CLI
.CallConv
;
2684 bool IsVarArg
= CLI
.IsVarArg
;
2685 bool IsSibCall
= false;
2686 bool IsThisReturn
= false;
2687 MachineFunction
&MF
= DAG
.getMachineFunction();
2690 return lowerUnhandledCall(CLI
, InVals
,
2691 "unsupported call to variadic function ");
2694 if (!CLI
.CS
.getInstruction())
2695 report_fatal_error("unsupported libcall legalization");
2697 if (!CLI
.CS
.getCalledFunction()) {
2698 return lowerUnhandledCall(CLI
, InVals
,
2699 "unsupported indirect call to function ");
2702 if (IsTailCall
&& MF
.getTarget().Options
.GuaranteedTailCallOpt
) {
2703 return lowerUnhandledCall(CLI
, InVals
,
2704 "unsupported required tail call to function ");
2707 if (AMDGPU::isShader(MF
.getFunction().getCallingConv())) {
2708 // Note the issue is with the CC of the calling function, not of the call
2710 return lowerUnhandledCall(CLI
, InVals
,
2711 "unsupported call from graphics shader of function ");
2715 IsTailCall
= isEligibleForTailCallOptimization(
2716 Callee
, CallConv
, IsVarArg
, Outs
, OutVals
, Ins
, DAG
);
2717 if (!IsTailCall
&& CLI
.CS
&& CLI
.CS
.isMustTailCall()) {
2718 report_fatal_error("failed to perform tail call elimination on a call "
2719 "site marked musttail");
2722 bool TailCallOpt
= MF
.getTarget().Options
.GuaranteedTailCallOpt
;
2724 // A sibling call is one where we're under the usual C ABI and not planning
2725 // to change that but can still do a tail call:
2726 if (!TailCallOpt
&& IsTailCall
)
2733 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
2735 // Analyze operands of the call, assigning locations to each operand.
2736 SmallVector
<CCValAssign
, 16> ArgLocs
;
2737 CCState
CCInfo(CallConv
, IsVarArg
, MF
, ArgLocs
, *DAG
.getContext());
2738 CCAssignFn
*AssignFn
= CCAssignFnForCall(CallConv
, IsVarArg
);
2740 CCInfo
.AnalyzeCallOperands(Outs
, AssignFn
);
2742 // Get a count of how many bytes are to be pushed on the stack.
2743 unsigned NumBytes
= CCInfo
.getNextStackOffset();
2746 // Since we're not changing the ABI to make this a tail call, the memory
2747 // operands are already available in the caller's incoming argument space.
2751 // FPDiff is the byte offset of the call's argument area from the callee's.
2752 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2753 // by this amount for a tail call. In a sibling call it must be 0 because the
2754 // caller will deallocate the entire stack and the callee still expects its
2755 // arguments to begin at SP+0. Completely unused for non-tail calls.
2757 MachineFrameInfo
&MFI
= MF
.getFrameInfo();
2758 SmallVector
<std::pair
<unsigned, SDValue
>, 8> RegsToPass
;
2760 // Adjust the stack pointer for the new arguments...
2761 // These operations are automatically eliminated by the prolog/epilog pass
2763 Chain
= DAG
.getCALLSEQ_START(Chain
, 0, 0, DL
);
2765 SmallVector
<SDValue
, 4> CopyFromChains
;
2767 // In the HSA case, this should be an identity copy.
2768 SDValue ScratchRSrcReg
2769 = DAG
.getCopyFromReg(Chain
, DL
, Info
->getScratchRSrcReg(), MVT::v4i32
);
2770 RegsToPass
.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3
, ScratchRSrcReg
);
2771 CopyFromChains
.push_back(ScratchRSrcReg
.getValue(1));
2772 Chain
= DAG
.getTokenFactor(DL
, CopyFromChains
);
2775 SmallVector
<SDValue
, 8> MemOpChains
;
2776 MVT PtrVT
= MVT::i32
;
2778 // Walk the register/memloc assignments, inserting copies/loads.
2779 for (unsigned i
= 0, realArgIdx
= 0, e
= ArgLocs
.size(); i
!= e
;
2780 ++i
, ++realArgIdx
) {
2781 CCValAssign
&VA
= ArgLocs
[i
];
2782 SDValue Arg
= OutVals
[realArgIdx
];
2784 // Promote the value if needed.
2785 switch (VA
.getLocInfo()) {
2786 case CCValAssign::Full
:
2788 case CCValAssign::BCvt
:
2789 Arg
= DAG
.getNode(ISD::BITCAST
, DL
, VA
.getLocVT(), Arg
);
2791 case CCValAssign::ZExt
:
2792 Arg
= DAG
.getNode(ISD::ZERO_EXTEND
, DL
, VA
.getLocVT(), Arg
);
2794 case CCValAssign::SExt
:
2795 Arg
= DAG
.getNode(ISD::SIGN_EXTEND
, DL
, VA
.getLocVT(), Arg
);
2797 case CCValAssign::AExt
:
2798 Arg
= DAG
.getNode(ISD::ANY_EXTEND
, DL
, VA
.getLocVT(), Arg
);
2800 case CCValAssign::FPExt
:
2801 Arg
= DAG
.getNode(ISD::FP_EXTEND
, DL
, VA
.getLocVT(), Arg
);
2804 llvm_unreachable("Unknown loc info!");
2807 if (VA
.isRegLoc()) {
2808 RegsToPass
.push_back(std::make_pair(VA
.getLocReg(), Arg
));
2810 assert(VA
.isMemLoc());
2813 MachinePointerInfo DstInfo
;
2815 unsigned LocMemOffset
= VA
.getLocMemOffset();
2816 int32_t Offset
= LocMemOffset
;
2818 SDValue PtrOff
= DAG
.getConstant(Offset
, DL
, PtrVT
);
2822 ISD::ArgFlagsTy Flags
= Outs
[realArgIdx
].Flags
;
2823 unsigned OpSize
= Flags
.isByVal() ?
2824 Flags
.getByValSize() : VA
.getValVT().getStoreSize();
2826 // FIXME: We can have better than the minimum byval required alignment.
2827 Align
= Flags
.isByVal() ? Flags
.getByValAlign() :
2828 MinAlign(Subtarget
->getStackAlignment(), Offset
);
2830 Offset
= Offset
+ FPDiff
;
2831 int FI
= MFI
.CreateFixedObject(OpSize
, Offset
, true);
2833 DstAddr
= DAG
.getFrameIndex(FI
, PtrVT
);
2834 DstInfo
= MachinePointerInfo::getFixedStack(MF
, FI
);
2836 // Make sure any stack arguments overlapping with where we're storing
2837 // are loaded before this eventual operation. Otherwise they'll be
2840 // FIXME: Why is this really necessary? This seems to just result in a
2841 // lot of code to copy the stack and write them back to the same
2842 // locations, which are supposed to be immutable?
2843 Chain
= addTokenForArgument(Chain
, DAG
, MFI
, FI
);
2846 DstInfo
= MachinePointerInfo::getStack(MF
, LocMemOffset
);
2847 Align
= MinAlign(Subtarget
->getStackAlignment(), LocMemOffset
);
2850 if (Outs
[i
].Flags
.isByVal()) {
2852 DAG
.getConstant(Outs
[i
].Flags
.getByValSize(), DL
, MVT::i32
);
2853 SDValue Cpy
= DAG
.getMemcpy(
2854 Chain
, DL
, DstAddr
, Arg
, SizeNode
, Outs
[i
].Flags
.getByValAlign(),
2855 /*isVol = */ false, /*AlwaysInline = */ true,
2856 /*isTailCall = */ false, DstInfo
,
2857 MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
2858 *DAG
.getContext(), AMDGPUAS::PRIVATE_ADDRESS
))));
2860 MemOpChains
.push_back(Cpy
);
2862 SDValue Store
= DAG
.getStore(Chain
, DL
, Arg
, DstAddr
, DstInfo
, Align
);
2863 MemOpChains
.push_back(Store
);
2868 // Copy special input registers after user input arguments.
2869 passSpecialInputs(CLI
, CCInfo
, *Info
, RegsToPass
, MemOpChains
, Chain
);
2871 if (!MemOpChains
.empty())
2872 Chain
= DAG
.getNode(ISD::TokenFactor
, DL
, MVT::Other
, MemOpChains
);
2874 // Build a sequence of copy-to-reg nodes chained together with token chain
2875 // and flag operands which copy the outgoing args into the appropriate regs.
2877 for (auto &RegToPass
: RegsToPass
) {
2878 Chain
= DAG
.getCopyToReg(Chain
, DL
, RegToPass
.first
,
2879 RegToPass
.second
, InFlag
);
2880 InFlag
= Chain
.getValue(1);
2884 SDValue PhysReturnAddrReg
;
2886 // Since the return is being combined with the call, we need to pass on the
2889 const SIRegisterInfo
*TRI
= getSubtarget()->getRegisterInfo();
2890 SDValue ReturnAddrReg
= CreateLiveInRegister(
2891 DAG
, &AMDGPU::SReg_64RegClass
, TRI
->getReturnAddressReg(MF
), MVT::i64
);
2893 PhysReturnAddrReg
= DAG
.getRegister(TRI
->getReturnAddressReg(MF
),
2895 Chain
= DAG
.getCopyToReg(Chain
, DL
, PhysReturnAddrReg
, ReturnAddrReg
, InFlag
);
2896 InFlag
= Chain
.getValue(1);
2899 // We don't usually want to end the call-sequence here because we would tidy
2900 // the frame up *after* the call, however in the ABI-changing tail-call case
2901 // we've carefully laid out the parameters so that when sp is reset they'll be
2902 // in the correct location.
2903 if (IsTailCall
&& !IsSibCall
) {
2904 Chain
= DAG
.getCALLSEQ_END(Chain
,
2905 DAG
.getTargetConstant(NumBytes
, DL
, MVT::i32
),
2906 DAG
.getTargetConstant(0, DL
, MVT::i32
),
2908 InFlag
= Chain
.getValue(1);
2911 std::vector
<SDValue
> Ops
;
2912 Ops
.push_back(Chain
);
2913 Ops
.push_back(Callee
);
2914 // Add a redundant copy of the callee global which will not be legalized, as
2915 // we need direct access to the callee later.
2916 GlobalAddressSDNode
*GSD
= cast
<GlobalAddressSDNode
>(Callee
);
2917 const GlobalValue
*GV
= GSD
->getGlobal();
2918 Ops
.push_back(DAG
.getTargetGlobalAddress(GV
, DL
, MVT::i64
));
2921 // Each tail call may have to adjust the stack by a different amount, so
2922 // this information must travel along with the operation for eventual
2923 // consumption by emitEpilogue.
2924 Ops
.push_back(DAG
.getTargetConstant(FPDiff
, DL
, MVT::i32
));
2926 Ops
.push_back(PhysReturnAddrReg
);
2929 // Add argument registers to the end of the list so that they are known live
2931 for (auto &RegToPass
: RegsToPass
) {
2932 Ops
.push_back(DAG
.getRegister(RegToPass
.first
,
2933 RegToPass
.second
.getValueType()));
2936 // Add a register mask operand representing the call-preserved registers.
2938 auto *TRI
= static_cast<const SIRegisterInfo
*>(Subtarget
->getRegisterInfo());
2939 const uint32_t *Mask
= TRI
->getCallPreservedMask(MF
, CallConv
);
2940 assert(Mask
&& "Missing call preserved mask for calling convention");
2941 Ops
.push_back(DAG
.getRegisterMask(Mask
));
2943 if (InFlag
.getNode())
2944 Ops
.push_back(InFlag
);
2946 SDVTList NodeTys
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
2948 // If we're doing a tall call, use a TC_RETURN here rather than an
2949 // actual call instruction.
2951 MFI
.setHasTailCall();
2952 return DAG
.getNode(AMDGPUISD::TC_RETURN
, DL
, NodeTys
, Ops
);
2955 // Returns a chain and a flag for retval copy to use.
2956 SDValue Call
= DAG
.getNode(AMDGPUISD::CALL
, DL
, NodeTys
, Ops
);
2957 Chain
= Call
.getValue(0);
2958 InFlag
= Call
.getValue(1);
2960 uint64_t CalleePopBytes
= NumBytes
;
2961 Chain
= DAG
.getCALLSEQ_END(Chain
, DAG
.getTargetConstant(0, DL
, MVT::i32
),
2962 DAG
.getTargetConstant(CalleePopBytes
, DL
, MVT::i32
),
2965 InFlag
= Chain
.getValue(1);
2967 // Handle result values, copying them out of physregs into vregs that we
2969 return LowerCallResult(Chain
, InFlag
, CallConv
, IsVarArg
, Ins
, DL
, DAG
,
2970 InVals
, IsThisReturn
,
2971 IsThisReturn
? OutVals
[0] : SDValue());
2974 unsigned SITargetLowering::getRegisterByName(const char* RegName
, EVT VT
,
2975 SelectionDAG
&DAG
) const {
2976 unsigned Reg
= StringSwitch
<unsigned>(RegName
)
2977 .Case("m0", AMDGPU::M0
)
2978 .Case("exec", AMDGPU::EXEC
)
2979 .Case("exec_lo", AMDGPU::EXEC_LO
)
2980 .Case("exec_hi", AMDGPU::EXEC_HI
)
2981 .Case("flat_scratch", AMDGPU::FLAT_SCR
)
2982 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO
)
2983 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI
)
2984 .Default(AMDGPU::NoRegister
);
2986 if (Reg
== AMDGPU::NoRegister
) {
2987 report_fatal_error(Twine("invalid register name \""
2988 + StringRef(RegName
) + "\"."));
2992 if (!Subtarget
->hasFlatScrRegister() &&
2993 Subtarget
->getRegisterInfo()->regsOverlap(Reg
, AMDGPU::FLAT_SCR
)) {
2994 report_fatal_error(Twine("invalid register \""
2995 + StringRef(RegName
) + "\" for subtarget."));
3000 case AMDGPU::EXEC_LO
:
3001 case AMDGPU::EXEC_HI
:
3002 case AMDGPU::FLAT_SCR_LO
:
3003 case AMDGPU::FLAT_SCR_HI
:
3004 if (VT
.getSizeInBits() == 32)
3008 case AMDGPU::FLAT_SCR
:
3009 if (VT
.getSizeInBits() == 64)
3013 llvm_unreachable("missing register type checking");
3016 report_fatal_error(Twine("invalid type for register \""
3017 + StringRef(RegName
) + "\"."));
3020 // If kill is not the last instruction, split the block so kill is always a
3021 // proper terminator.
3022 MachineBasicBlock
*SITargetLowering::splitKillBlock(MachineInstr
&MI
,
3023 MachineBasicBlock
*BB
) const {
3024 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3026 MachineBasicBlock::iterator
SplitPoint(&MI
);
3029 if (SplitPoint
== BB
->end()) {
3030 // Don't bother with a new block.
3031 MI
.setDesc(TII
->getKillTerminatorFromPseudo(MI
.getOpcode()));
3035 MachineFunction
*MF
= BB
->getParent();
3036 MachineBasicBlock
*SplitBB
3037 = MF
->CreateMachineBasicBlock(BB
->getBasicBlock());
3039 MF
->insert(++MachineFunction::iterator(BB
), SplitBB
);
3040 SplitBB
->splice(SplitBB
->begin(), BB
, SplitPoint
, BB
->end());
3042 SplitBB
->transferSuccessorsAndUpdatePHIs(BB
);
3043 BB
->addSuccessor(SplitBB
);
3045 MI
.setDesc(TII
->getKillTerminatorFromPseudo(MI
.getOpcode()));
3049 // Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
3050 // \p MI will be the only instruction in the loop body block. Otherwise, it will
3051 // be the first instruction in the remainder block.
3053 /// \returns { LoopBody, Remainder }
3054 static std::pair
<MachineBasicBlock
*, MachineBasicBlock
*>
3055 splitBlockForLoop(MachineInstr
&MI
, MachineBasicBlock
&MBB
, bool InstInLoop
) {
3056 MachineFunction
*MF
= MBB
.getParent();
3057 MachineBasicBlock::iterator
I(&MI
);
3059 // To insert the loop we need to split the block. Move everything after this
3060 // point to a new block, and insert a new empty block between the two.
3061 MachineBasicBlock
*LoopBB
= MF
->CreateMachineBasicBlock();
3062 MachineBasicBlock
*RemainderBB
= MF
->CreateMachineBasicBlock();
3063 MachineFunction::iterator
MBBI(MBB
);
3066 MF
->insert(MBBI
, LoopBB
);
3067 MF
->insert(MBBI
, RemainderBB
);
3069 LoopBB
->addSuccessor(LoopBB
);
3070 LoopBB
->addSuccessor(RemainderBB
);
3072 // Move the rest of the block into a new block.
3073 RemainderBB
->transferSuccessorsAndUpdatePHIs(&MBB
);
3076 auto Next
= std::next(I
);
3078 // Move instruction to loop body.
3079 LoopBB
->splice(LoopBB
->begin(), &MBB
, I
, Next
);
3081 // Move the rest of the block.
3082 RemainderBB
->splice(RemainderBB
->begin(), &MBB
, Next
, MBB
.end());
3084 RemainderBB
->splice(RemainderBB
->begin(), &MBB
, I
, MBB
.end());
3087 MBB
.addSuccessor(LoopBB
);
3089 return std::make_pair(LoopBB
, RemainderBB
);
3092 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
3093 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr
&MI
) const {
3094 MachineBasicBlock
*MBB
= MI
.getParent();
3095 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3096 auto I
= MI
.getIterator();
3097 auto E
= std::next(I
);
3099 BuildMI(*MBB
, E
, MI
.getDebugLoc(), TII
->get(AMDGPU::S_WAITCNT
))
3102 MIBundleBuilder
Bundler(*MBB
, I
, E
);
3103 finalizeBundle(*MBB
, Bundler
.begin());
3107 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr
&MI
,
3108 MachineBasicBlock
*BB
) const {
3109 const DebugLoc
&DL
= MI
.getDebugLoc();
3111 MachineRegisterInfo
&MRI
= BB
->getParent()->getRegInfo();
3113 MachineBasicBlock
*LoopBB
;
3114 MachineBasicBlock
*RemainderBB
;
3115 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3117 // Apparently kill flags are only valid if the def is in the same block?
3118 if (MachineOperand
*Src
= TII
->getNamedOperand(MI
, AMDGPU::OpName::data0
))
3119 Src
->setIsKill(false);
3121 std::tie(LoopBB
, RemainderBB
) = splitBlockForLoop(MI
, *BB
, true);
3123 MachineBasicBlock::iterator I
= LoopBB
->end();
3125 const unsigned EncodedReg
= AMDGPU::Hwreg::encodeHwreg(
3126 AMDGPU::Hwreg::ID_TRAPSTS
, AMDGPU::Hwreg::OFFSET_MEM_VIOL
, 1);
3128 // Clear TRAP_STS.MEM_VIOL
3129 BuildMI(*LoopBB
, LoopBB
->begin(), DL
, TII
->get(AMDGPU::S_SETREG_IMM32_B32
))
3131 .addImm(EncodedReg
);
3133 bundleInstWithWaitcnt(MI
);
3135 Register Reg
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
3137 // Load and check TRAP_STS.MEM_VIOL
3138 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::S_GETREG_B32
), Reg
)
3139 .addImm(EncodedReg
);
3141 // FIXME: Do we need to use an isel pseudo that may clobber scc?
3142 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::S_CMP_LG_U32
))
3143 .addReg(Reg
, RegState::Kill
)
3145 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::S_CBRANCH_SCC1
))
3151 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
3152 // wavefront. If the value is uniform and just happens to be in a VGPR, this
3153 // will only do one iteration. In the worst case, this will loop 64 times.
3155 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
3156 static MachineBasicBlock::iterator
emitLoadM0FromVGPRLoop(
3157 const SIInstrInfo
*TII
,
3158 MachineRegisterInfo
&MRI
,
3159 MachineBasicBlock
&OrigBB
,
3160 MachineBasicBlock
&LoopBB
,
3162 const MachineOperand
&IdxReg
,
3166 unsigned InitSaveExecReg
,
3169 bool IsIndirectSrc
) {
3170 MachineFunction
*MF
= OrigBB
.getParent();
3171 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3172 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3173 MachineBasicBlock::iterator I
= LoopBB
.begin();
3175 const TargetRegisterClass
*BoolRC
= TRI
->getBoolRC();
3176 Register PhiExec
= MRI
.createVirtualRegister(BoolRC
);
3177 Register NewExec
= MRI
.createVirtualRegister(BoolRC
);
3178 Register CurrentIdxReg
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
3179 Register CondReg
= MRI
.createVirtualRegister(BoolRC
);
3181 BuildMI(LoopBB
, I
, DL
, TII
->get(TargetOpcode::PHI
), PhiReg
)
3187 BuildMI(LoopBB
, I
, DL
, TII
->get(TargetOpcode::PHI
), PhiExec
)
3188 .addReg(InitSaveExecReg
)
3193 // Read the next variant <- also loop target.
3194 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
), CurrentIdxReg
)
3195 .addReg(IdxReg
.getReg(), getUndefRegState(IdxReg
.isUndef()));
3197 // Compare the just read M0 value to all possible Idx values.
3198 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::V_CMP_EQ_U32_e64
), CondReg
)
3199 .addReg(CurrentIdxReg
)
3200 .addReg(IdxReg
.getReg(), 0, IdxReg
.getSubReg());
3202 // Update EXEC, save the original EXEC value to VCC.
3203 BuildMI(LoopBB
, I
, DL
, TII
->get(ST
.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
3204 : AMDGPU::S_AND_SAVEEXEC_B64
),
3206 .addReg(CondReg
, RegState::Kill
);
3208 MRI
.setSimpleHint(NewExec
, CondReg
);
3210 if (UseGPRIdxMode
) {
3213 IdxReg
= CurrentIdxReg
;
3215 IdxReg
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
3216 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::S_ADD_I32
), IdxReg
)
3217 .addReg(CurrentIdxReg
, RegState::Kill
)
3220 unsigned IdxMode
= IsIndirectSrc
?
3221 AMDGPU::VGPRIndexMode::SRC0_ENABLE
: AMDGPU::VGPRIndexMode::DST_ENABLE
;
3222 MachineInstr
*SetOn
=
3223 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_ON
))
3224 .addReg(IdxReg
, RegState::Kill
)
3226 SetOn
->getOperand(3).setIsUndef();
3228 // Move index from VCC into M0
3230 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::S_MOV_B32
), AMDGPU::M0
)
3231 .addReg(CurrentIdxReg
, RegState::Kill
);
3233 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::S_ADD_I32
), AMDGPU::M0
)
3234 .addReg(CurrentIdxReg
, RegState::Kill
)
3239 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3240 unsigned Exec
= ST
.isWave32() ? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
3241 MachineInstr
*InsertPt
=
3242 BuildMI(LoopBB
, I
, DL
, TII
->get(ST
.isWave32() ? AMDGPU::S_XOR_B32_term
3243 : AMDGPU::S_XOR_B64_term
), Exec
)
3247 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
3250 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3251 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::S_CBRANCH_EXECNZ
))
3254 return InsertPt
->getIterator();
3257 // This has slightly sub-optimal regalloc when the source vector is killed by
3258 // the read. The register allocator does not understand that the kill is
3259 // per-workitem, so is kept alive for the whole loop so we end up not re-using a
3260 // subregister from it, using 1 more VGPR than necessary. This was saved when
3261 // this was expanded after register allocation.
3262 static MachineBasicBlock::iterator
loadM0FromVGPR(const SIInstrInfo
*TII
,
3263 MachineBasicBlock
&MBB
,
3265 unsigned InitResultReg
,
3269 bool IsIndirectSrc
) {
3270 MachineFunction
*MF
= MBB
.getParent();
3271 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3272 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3273 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3274 const DebugLoc
&DL
= MI
.getDebugLoc();
3275 MachineBasicBlock::iterator
I(&MI
);
3277 const auto *BoolXExecRC
= TRI
->getRegClass(AMDGPU::SReg_1_XEXECRegClassID
);
3278 Register DstReg
= MI
.getOperand(0).getReg();
3279 Register SaveExec
= MRI
.createVirtualRegister(BoolXExecRC
);
3280 Register TmpExec
= MRI
.createVirtualRegister(BoolXExecRC
);
3281 unsigned Exec
= ST
.isWave32() ? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
3282 unsigned MovExecOpc
= ST
.isWave32() ? AMDGPU::S_MOV_B32
: AMDGPU::S_MOV_B64
;
3284 BuildMI(MBB
, I
, DL
, TII
->get(TargetOpcode::IMPLICIT_DEF
), TmpExec
);
3286 // Save the EXEC mask
3287 BuildMI(MBB
, I
, DL
, TII
->get(MovExecOpc
), SaveExec
)
3290 MachineBasicBlock
*LoopBB
;
3291 MachineBasicBlock
*RemainderBB
;
3292 std::tie(LoopBB
, RemainderBB
) = splitBlockForLoop(MI
, MBB
, false);
3294 const MachineOperand
*Idx
= TII
->getNamedOperand(MI
, AMDGPU::OpName::idx
);
3296 auto InsPt
= emitLoadM0FromVGPRLoop(TII
, MRI
, MBB
, *LoopBB
, DL
, *Idx
,
3297 InitResultReg
, DstReg
, PhiReg
, TmpExec
,
3298 Offset
, UseGPRIdxMode
, IsIndirectSrc
);
3300 MachineBasicBlock::iterator First
= RemainderBB
->begin();
3301 BuildMI(*RemainderBB
, First
, DL
, TII
->get(MovExecOpc
), Exec
)
3307 // Returns subreg index, offset
3308 static std::pair
<unsigned, int>
3309 computeIndirectRegAndOffset(const SIRegisterInfo
&TRI
,
3310 const TargetRegisterClass
*SuperRC
,
3313 int NumElts
= TRI
.getRegSizeInBits(*SuperRC
) / 32;
3315 // Skip out of bounds offsets, or else we would end up using an undefined
3317 if (Offset
>= NumElts
|| Offset
< 0)
3318 return std::make_pair(AMDGPU::sub0
, Offset
);
3320 return std::make_pair(AMDGPU::sub0
+ Offset
, 0);
3323 // Return true if the index is an SGPR and was set.
3324 static bool setM0ToIndexFromSGPR(const SIInstrInfo
*TII
,
3325 MachineRegisterInfo
&MRI
,
3329 bool IsIndirectSrc
) {
3330 MachineBasicBlock
*MBB
= MI
.getParent();
3331 const DebugLoc
&DL
= MI
.getDebugLoc();
3332 MachineBasicBlock::iterator
I(&MI
);
3334 const MachineOperand
*Idx
= TII
->getNamedOperand(MI
, AMDGPU::OpName::idx
);
3335 const TargetRegisterClass
*IdxRC
= MRI
.getRegClass(Idx
->getReg());
3337 assert(Idx
->getReg() != AMDGPU::NoRegister
);
3339 if (!TII
->getRegisterInfo().isSGPRClass(IdxRC
))
3342 if (UseGPRIdxMode
) {
3343 unsigned IdxMode
= IsIndirectSrc
?
3344 AMDGPU::VGPRIndexMode::SRC0_ENABLE
: AMDGPU::VGPRIndexMode::DST_ENABLE
;
3346 MachineInstr
*SetOn
=
3347 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_ON
))
3351 SetOn
->getOperand(3).setIsUndef();
3353 Register Tmp
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
3354 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_ADD_I32
), Tmp
)
3357 MachineInstr
*SetOn
=
3358 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_ON
))
3359 .addReg(Tmp
, RegState::Kill
)
3362 SetOn
->getOperand(3).setIsUndef();
3369 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_MOV_B32
), AMDGPU::M0
)
3372 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_ADD_I32
), AMDGPU::M0
)
3380 // Control flow needs to be inserted if indexing with a VGPR.
3381 static MachineBasicBlock
*emitIndirectSrc(MachineInstr
&MI
,
3382 MachineBasicBlock
&MBB
,
3383 const GCNSubtarget
&ST
) {
3384 const SIInstrInfo
*TII
= ST
.getInstrInfo();
3385 const SIRegisterInfo
&TRI
= TII
->getRegisterInfo();
3386 MachineFunction
*MF
= MBB
.getParent();
3387 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3389 Register Dst
= MI
.getOperand(0).getReg();
3390 Register SrcReg
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src
)->getReg();
3391 int Offset
= TII
->getNamedOperand(MI
, AMDGPU::OpName::offset
)->getImm();
3393 const TargetRegisterClass
*VecRC
= MRI
.getRegClass(SrcReg
);
3396 std::tie(SubReg
, Offset
)
3397 = computeIndirectRegAndOffset(TRI
, VecRC
, SrcReg
, Offset
);
3399 bool UseGPRIdxMode
= ST
.useVGPRIndexMode(EnableVGPRIndexMode
);
3401 if (setM0ToIndexFromSGPR(TII
, MRI
, MI
, Offset
, UseGPRIdxMode
, true)) {
3402 MachineBasicBlock::iterator
I(&MI
);
3403 const DebugLoc
&DL
= MI
.getDebugLoc();
3405 if (UseGPRIdxMode
) {
3406 // TODO: Look at the uses to avoid the copy. This may require rescheduling
3407 // to avoid interfering with other uses, so probably requires a new
3408 // optimization pass.
3409 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::V_MOV_B32_e32
), Dst
)
3410 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3411 .addReg(SrcReg
, RegState::Implicit
)
3412 .addReg(AMDGPU::M0
, RegState::Implicit
);
3413 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3415 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::V_MOVRELS_B32_e32
), Dst
)
3416 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3417 .addReg(SrcReg
, RegState::Implicit
);
3420 MI
.eraseFromParent();
3425 const DebugLoc
&DL
= MI
.getDebugLoc();
3426 MachineBasicBlock::iterator
I(&MI
);
3428 Register PhiReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3429 Register InitReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3431 BuildMI(MBB
, I
, DL
, TII
->get(TargetOpcode::IMPLICIT_DEF
), InitReg
);
3433 auto InsPt
= loadM0FromVGPR(TII
, MBB
, MI
, InitReg
, PhiReg
,
3434 Offset
, UseGPRIdxMode
, true);
3435 MachineBasicBlock
*LoopBB
= InsPt
->getParent();
3437 if (UseGPRIdxMode
) {
3438 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::V_MOV_B32_e32
), Dst
)
3439 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3440 .addReg(SrcReg
, RegState::Implicit
)
3441 .addReg(AMDGPU::M0
, RegState::Implicit
);
3442 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3444 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::V_MOVRELS_B32_e32
), Dst
)
3445 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3446 .addReg(SrcReg
, RegState::Implicit
);
3449 MI
.eraseFromParent();
3454 static unsigned getMOVRELDPseudo(const SIRegisterInfo
&TRI
,
3455 const TargetRegisterClass
*VecRC
) {
3456 switch (TRI
.getRegSizeInBits(*VecRC
)) {
3458 return AMDGPU::V_MOVRELD_B32_V1
;
3460 return AMDGPU::V_MOVRELD_B32_V2
;
3461 case 128: // 16 bytes
3462 return AMDGPU::V_MOVRELD_B32_V4
;
3463 case 256: // 32 bytes
3464 return AMDGPU::V_MOVRELD_B32_V8
;
3465 case 512: // 64 bytes
3466 return AMDGPU::V_MOVRELD_B32_V16
;
3468 llvm_unreachable("unsupported size for MOVRELD pseudos");
3472 static MachineBasicBlock
*emitIndirectDst(MachineInstr
&MI
,
3473 MachineBasicBlock
&MBB
,
3474 const GCNSubtarget
&ST
) {
3475 const SIInstrInfo
*TII
= ST
.getInstrInfo();
3476 const SIRegisterInfo
&TRI
= TII
->getRegisterInfo();
3477 MachineFunction
*MF
= MBB
.getParent();
3478 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3480 Register Dst
= MI
.getOperand(0).getReg();
3481 const MachineOperand
*SrcVec
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src
);
3482 const MachineOperand
*Idx
= TII
->getNamedOperand(MI
, AMDGPU::OpName::idx
);
3483 const MachineOperand
*Val
= TII
->getNamedOperand(MI
, AMDGPU::OpName::val
);
3484 int Offset
= TII
->getNamedOperand(MI
, AMDGPU::OpName::offset
)->getImm();
3485 const TargetRegisterClass
*VecRC
= MRI
.getRegClass(SrcVec
->getReg());
3487 // This can be an immediate, but will be folded later.
3488 assert(Val
->getReg());
3491 std::tie(SubReg
, Offset
) = computeIndirectRegAndOffset(TRI
, VecRC
,
3494 bool UseGPRIdxMode
= ST
.useVGPRIndexMode(EnableVGPRIndexMode
);
3496 if (Idx
->getReg() == AMDGPU::NoRegister
) {
3497 MachineBasicBlock::iterator
I(&MI
);
3498 const DebugLoc
&DL
= MI
.getDebugLoc();
3500 assert(Offset
== 0);
3502 BuildMI(MBB
, I
, DL
, TII
->get(TargetOpcode::INSERT_SUBREG
), Dst
)
3507 MI
.eraseFromParent();
3511 if (setM0ToIndexFromSGPR(TII
, MRI
, MI
, Offset
, UseGPRIdxMode
, false)) {
3512 MachineBasicBlock::iterator
I(&MI
);
3513 const DebugLoc
&DL
= MI
.getDebugLoc();
3515 if (UseGPRIdxMode
) {
3516 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::V_MOV_B32_indirect
))
3517 .addReg(SrcVec
->getReg(), RegState::Undef
, SubReg
) // vdst
3519 .addReg(Dst
, RegState::ImplicitDefine
)
3520 .addReg(SrcVec
->getReg(), RegState::Implicit
)
3521 .addReg(AMDGPU::M0
, RegState::Implicit
);
3523 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3525 const MCInstrDesc
&MovRelDesc
= TII
->get(getMOVRELDPseudo(TRI
, VecRC
));
3527 BuildMI(MBB
, I
, DL
, MovRelDesc
)
3528 .addReg(Dst
, RegState::Define
)
3529 .addReg(SrcVec
->getReg())
3531 .addImm(SubReg
- AMDGPU::sub0
);
3534 MI
.eraseFromParent();
3539 MRI
.clearKillFlags(Val
->getReg());
3541 const DebugLoc
&DL
= MI
.getDebugLoc();
3543 Register PhiReg
= MRI
.createVirtualRegister(VecRC
);
3545 auto InsPt
= loadM0FromVGPR(TII
, MBB
, MI
, SrcVec
->getReg(), PhiReg
,
3546 Offset
, UseGPRIdxMode
, false);
3547 MachineBasicBlock
*LoopBB
= InsPt
->getParent();
3549 if (UseGPRIdxMode
) {
3550 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::V_MOV_B32_indirect
))
3551 .addReg(PhiReg
, RegState::Undef
, SubReg
) // vdst
3553 .addReg(Dst
, RegState::ImplicitDefine
)
3554 .addReg(PhiReg
, RegState::Implicit
)
3555 .addReg(AMDGPU::M0
, RegState::Implicit
);
3556 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3558 const MCInstrDesc
&MovRelDesc
= TII
->get(getMOVRELDPseudo(TRI
, VecRC
));
3560 BuildMI(*LoopBB
, InsPt
, DL
, MovRelDesc
)
3561 .addReg(Dst
, RegState::Define
)
3564 .addImm(SubReg
- AMDGPU::sub0
);
3567 MI
.eraseFromParent();
3572 MachineBasicBlock
*SITargetLowering::EmitInstrWithCustomInserter(
3573 MachineInstr
&MI
, MachineBasicBlock
*BB
) const {
3575 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3576 MachineFunction
*MF
= BB
->getParent();
3577 SIMachineFunctionInfo
*MFI
= MF
->getInfo
<SIMachineFunctionInfo
>();
3579 if (TII
->isMIMG(MI
)) {
3580 if (MI
.memoperands_empty() && MI
.mayLoadOrStore()) {
3581 report_fatal_error("missing mem operand from MIMG instruction");
3583 // Add a memoperand for mimg instructions so that they aren't assumed to
3584 // be ordered memory instuctions.
3589 switch (MI
.getOpcode()) {
3590 case AMDGPU::S_ADD_U64_PSEUDO
:
3591 case AMDGPU::S_SUB_U64_PSEUDO
: {
3592 MachineRegisterInfo
&MRI
= BB
->getParent()->getRegInfo();
3593 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3594 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3595 const TargetRegisterClass
*BoolRC
= TRI
->getBoolRC();
3596 const DebugLoc
&DL
= MI
.getDebugLoc();
3598 MachineOperand
&Dest
= MI
.getOperand(0);
3599 MachineOperand
&Src0
= MI
.getOperand(1);
3600 MachineOperand
&Src1
= MI
.getOperand(2);
3602 Register DestSub0
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
3603 Register DestSub1
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
3605 MachineOperand Src0Sub0
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3606 Src0
, BoolRC
, AMDGPU::sub0
,
3607 &AMDGPU::SReg_32_XM0RegClass
);
3608 MachineOperand Src0Sub1
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3609 Src0
, BoolRC
, AMDGPU::sub1
,
3610 &AMDGPU::SReg_32_XM0RegClass
);
3612 MachineOperand Src1Sub0
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3613 Src1
, BoolRC
, AMDGPU::sub0
,
3614 &AMDGPU::SReg_32_XM0RegClass
);
3615 MachineOperand Src1Sub1
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3616 Src1
, BoolRC
, AMDGPU::sub1
,
3617 &AMDGPU::SReg_32_XM0RegClass
);
3619 bool IsAdd
= (MI
.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO
);
3621 unsigned LoOpc
= IsAdd
? AMDGPU::S_ADD_U32
: AMDGPU::S_SUB_U32
;
3622 unsigned HiOpc
= IsAdd
? AMDGPU::S_ADDC_U32
: AMDGPU::S_SUBB_U32
;
3623 BuildMI(*BB
, MI
, DL
, TII
->get(LoOpc
), DestSub0
)
3626 BuildMI(*BB
, MI
, DL
, TII
->get(HiOpc
), DestSub1
)
3629 BuildMI(*BB
, MI
, DL
, TII
->get(TargetOpcode::REG_SEQUENCE
), Dest
.getReg())
3631 .addImm(AMDGPU::sub0
)
3633 .addImm(AMDGPU::sub1
);
3634 MI
.eraseFromParent();
3637 case AMDGPU::SI_INIT_M0
: {
3638 BuildMI(*BB
, MI
.getIterator(), MI
.getDebugLoc(),
3639 TII
->get(AMDGPU::S_MOV_B32
), AMDGPU::M0
)
3640 .add(MI
.getOperand(0));
3641 MI
.eraseFromParent();
3644 case AMDGPU::SI_INIT_EXEC
:
3645 // This should be before all vector instructions.
3646 BuildMI(*BB
, &*BB
->begin(), MI
.getDebugLoc(), TII
->get(AMDGPU::S_MOV_B64
),
3648 .addImm(MI
.getOperand(0).getImm());
3649 MI
.eraseFromParent();
3652 case AMDGPU::SI_INIT_EXEC_LO
:
3653 // This should be before all vector instructions.
3654 BuildMI(*BB
, &*BB
->begin(), MI
.getDebugLoc(), TII
->get(AMDGPU::S_MOV_B32
),
3656 .addImm(MI
.getOperand(0).getImm());
3657 MI
.eraseFromParent();
3660 case AMDGPU::SI_INIT_EXEC_FROM_INPUT
: {
3661 // Extract the thread count from an SGPR input and set EXEC accordingly.
3662 // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3664 // S_BFE_U32 count, input, {shift, 7}
3665 // S_BFM_B64 exec, count, 0
3666 // S_CMP_EQ_U32 count, 64
3667 // S_CMOV_B64 exec, -1
3668 MachineInstr
*FirstMI
= &*BB
->begin();
3669 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3670 Register InputReg
= MI
.getOperand(0).getReg();
3671 Register CountReg
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
3674 // Move the COPY of the input reg to the beginning, so that we can use it.
3675 for (auto I
= BB
->begin(); I
!= &MI
; I
++) {
3676 if (I
->getOpcode() != TargetOpcode::COPY
||
3677 I
->getOperand(0).getReg() != InputReg
)
3681 FirstMI
= &*++BB
->begin();
3683 I
->removeFromParent();
3684 BB
->insert(FirstMI
, &*I
);
3692 // This should be before all vector instructions.
3693 unsigned Mask
= (getSubtarget()->getWavefrontSize() << 1) - 1;
3694 bool isWave32
= getSubtarget()->isWave32();
3695 unsigned Exec
= isWave32
? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
3696 BuildMI(*BB
, FirstMI
, DebugLoc(), TII
->get(AMDGPU::S_BFE_U32
), CountReg
)
3698 .addImm((MI
.getOperand(1).getImm() & Mask
) | 0x70000);
3699 BuildMI(*BB
, FirstMI
, DebugLoc(),
3700 TII
->get(isWave32
? AMDGPU::S_BFM_B32
: AMDGPU::S_BFM_B64
),
3704 BuildMI(*BB
, FirstMI
, DebugLoc(), TII
->get(AMDGPU::S_CMP_EQ_U32
))
3705 .addReg(CountReg
, RegState::Kill
)
3706 .addImm(getSubtarget()->getWavefrontSize());
3707 BuildMI(*BB
, FirstMI
, DebugLoc(),
3708 TII
->get(isWave32
? AMDGPU::S_CMOV_B32
: AMDGPU::S_CMOV_B64
),
3711 MI
.eraseFromParent();
3715 case AMDGPU::GET_GROUPSTATICSIZE
: {
3716 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA
||
3717 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL
);
3718 DebugLoc DL
= MI
.getDebugLoc();
3719 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::S_MOV_B32
))
3720 .add(MI
.getOperand(0))
3721 .addImm(MFI
->getLDSSize());
3722 MI
.eraseFromParent();
3725 case AMDGPU::SI_INDIRECT_SRC_V1
:
3726 case AMDGPU::SI_INDIRECT_SRC_V2
:
3727 case AMDGPU::SI_INDIRECT_SRC_V4
:
3728 case AMDGPU::SI_INDIRECT_SRC_V8
:
3729 case AMDGPU::SI_INDIRECT_SRC_V16
:
3730 return emitIndirectSrc(MI
, *BB
, *getSubtarget());
3731 case AMDGPU::SI_INDIRECT_DST_V1
:
3732 case AMDGPU::SI_INDIRECT_DST_V2
:
3733 case AMDGPU::SI_INDIRECT_DST_V4
:
3734 case AMDGPU::SI_INDIRECT_DST_V8
:
3735 case AMDGPU::SI_INDIRECT_DST_V16
:
3736 return emitIndirectDst(MI
, *BB
, *getSubtarget());
3737 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO
:
3738 case AMDGPU::SI_KILL_I1_PSEUDO
:
3739 return splitKillBlock(MI
, BB
);
3740 case AMDGPU::V_CNDMASK_B64_PSEUDO
: {
3741 MachineRegisterInfo
&MRI
= BB
->getParent()->getRegInfo();
3742 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3743 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3745 Register Dst
= MI
.getOperand(0).getReg();
3746 Register Src0
= MI
.getOperand(1).getReg();
3747 Register Src1
= MI
.getOperand(2).getReg();
3748 const DebugLoc
&DL
= MI
.getDebugLoc();
3749 Register SrcCond
= MI
.getOperand(3).getReg();
3751 Register DstLo
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3752 Register DstHi
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3753 const auto *CondRC
= TRI
->getRegClass(AMDGPU::SReg_1_XEXECRegClassID
);
3754 Register SrcCondCopy
= MRI
.createVirtualRegister(CondRC
);
3756 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::COPY
), SrcCondCopy
)
3758 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::V_CNDMASK_B32_e64
), DstLo
)
3760 .addReg(Src0
, 0, AMDGPU::sub0
)
3762 .addReg(Src1
, 0, AMDGPU::sub0
)
3763 .addReg(SrcCondCopy
);
3764 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::V_CNDMASK_B32_e64
), DstHi
)
3766 .addReg(Src0
, 0, AMDGPU::sub1
)
3768 .addReg(Src1
, 0, AMDGPU::sub1
)
3769 .addReg(SrcCondCopy
);
3771 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::REG_SEQUENCE
), Dst
)
3773 .addImm(AMDGPU::sub0
)
3775 .addImm(AMDGPU::sub1
);
3776 MI
.eraseFromParent();
3779 case AMDGPU::SI_BR_UNDEF
: {
3780 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3781 const DebugLoc
&DL
= MI
.getDebugLoc();
3782 MachineInstr
*Br
= BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::S_CBRANCH_SCC1
))
3783 .add(MI
.getOperand(0));
3784 Br
->getOperand(1).setIsUndef(true); // read undef SCC
3785 MI
.eraseFromParent();
3788 case AMDGPU::ADJCALLSTACKUP
:
3789 case AMDGPU::ADJCALLSTACKDOWN
: {
3790 const SIMachineFunctionInfo
*Info
= MF
->getInfo
<SIMachineFunctionInfo
>();
3791 MachineInstrBuilder
MIB(*MF
, &MI
);
3793 // Add an implicit use of the frame offset reg to prevent the restore copy
3794 // inserted after the call from being reorderd after stack operations in the
3795 // the caller's frame.
3796 MIB
.addReg(Info
->getStackPtrOffsetReg(), RegState::ImplicitDefine
)
3797 .addReg(Info
->getStackPtrOffsetReg(), RegState::Implicit
)
3798 .addReg(Info
->getFrameOffsetReg(), RegState::Implicit
);
3801 case AMDGPU::SI_CALL_ISEL
: {
3802 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3803 const DebugLoc
&DL
= MI
.getDebugLoc();
3805 unsigned ReturnAddrReg
= TII
->getRegisterInfo().getReturnAddressReg(*MF
);
3807 MachineInstrBuilder MIB
;
3808 MIB
= BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::SI_CALL
), ReturnAddrReg
);
3810 for (unsigned I
= 0, E
= MI
.getNumOperands(); I
!= E
; ++I
)
3811 MIB
.add(MI
.getOperand(I
));
3813 MIB
.cloneMemRefs(MI
);
3814 MI
.eraseFromParent();
3817 case AMDGPU::V_ADD_I32_e32
:
3818 case AMDGPU::V_SUB_I32_e32
:
3819 case AMDGPU::V_SUBREV_I32_e32
: {
3820 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
3821 const DebugLoc
&DL
= MI
.getDebugLoc();
3822 unsigned Opc
= MI
.getOpcode();
3824 bool NeedClampOperand
= false;
3825 if (TII
->pseudoToMCOpcode(Opc
) == -1) {
3826 Opc
= AMDGPU::getVOPe64(Opc
);
3827 NeedClampOperand
= true;
3830 auto I
= BuildMI(*BB
, MI
, DL
, TII
->get(Opc
), MI
.getOperand(0).getReg());
3831 if (TII
->isVOP3(*I
)) {
3832 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3833 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3834 I
.addReg(TRI
->getVCC(), RegState::Define
);
3836 I
.add(MI
.getOperand(1))
3837 .add(MI
.getOperand(2));
3838 if (NeedClampOperand
)
3839 I
.addImm(0); // clamp bit for e64 encoding
3841 TII
->legalizeOperands(*I
);
3843 MI
.eraseFromParent();
3846 case AMDGPU::DS_GWS_INIT
:
3847 case AMDGPU::DS_GWS_SEMA_V
:
3848 case AMDGPU::DS_GWS_SEMA_BR
:
3849 case AMDGPU::DS_GWS_SEMA_P
:
3850 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL
:
3851 case AMDGPU::DS_GWS_BARRIER
:
3852 // A s_waitcnt 0 is required to be the instruction immediately following.
3853 if (getSubtarget()->hasGWSAutoReplay()) {
3854 bundleInstWithWaitcnt(MI
);
3858 return emitGWSMemViolTestLoop(MI
, BB
);
3860 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI
, BB
);
3864 bool SITargetLowering::hasBitPreservingFPLogic(EVT VT
) const {
3865 return isTypeLegal(VT
.getScalarType());
3868 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT
) const {
3869 // This currently forces unfolding various combinations of fsub into fma with
3870 // free fneg'd operands. As long as we have fast FMA (controlled by
3871 // isFMAFasterThanFMulAndFAdd), we should perform these.
3873 // When fma is quarter rate, for f64 where add / sub are at best half rate,
3874 // most of these combines appear to be cycle neutral but save on instruction
3875 // count / code size.
3879 EVT
SITargetLowering::getSetCCResultType(const DataLayout
&DL
, LLVMContext
&Ctx
,
3881 if (!VT
.isVector()) {
3884 return EVT::getVectorVT(Ctx
, MVT::i1
, VT
.getVectorNumElements());
3887 MVT
SITargetLowering::getScalarShiftAmountTy(const DataLayout
&, EVT VT
) const {
3888 // TODO: Should i16 be used always if legal? For now it would force VALU
3890 return (VT
== MVT::i16
) ? MVT::i16
: MVT::i32
;
3893 // Answering this is somewhat tricky and depends on the specific device which
3894 // have different rates for fma or all f64 operations.
3896 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3897 // regardless of which device (although the number of cycles differs between
3898 // devices), so it is always profitable for f64.
3900 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3901 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
3902 // which we can always do even without fused FP ops since it returns the same
3903 // result as the separate operations and since it is always full
3904 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3905 // however does not support denormals, so we do report fma as faster if we have
3906 // a fast fma device and require denormals.
3908 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT
) const {
3909 VT
= VT
.getScalarType();
3911 switch (VT
.getSimpleVT().SimpleTy
) {
3913 // This is as fast on some subtargets. However, we always have full rate f32
3914 // mad available which returns the same result as the separate operations
3915 // which we should prefer over fma. We can't use this if we want to support
3916 // denormals, so only report this in these cases.
3917 if (Subtarget
->hasFP32Denormals())
3918 return Subtarget
->hasFastFMAF32() || Subtarget
->hasDLInsts();
3920 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3921 return Subtarget
->hasFastFMAF32() && Subtarget
->hasDLInsts();
3926 return Subtarget
->has16BitInsts() && Subtarget
->hasFP16Denormals();
3934 //===----------------------------------------------------------------------===//
3935 // Custom DAG Lowering Operations
3936 //===----------------------------------------------------------------------===//
3938 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3939 // wider vector type is legal.
3940 SDValue
SITargetLowering::splitUnaryVectorOp(SDValue Op
,
3941 SelectionDAG
&DAG
) const {
3942 unsigned Opc
= Op
.getOpcode();
3943 EVT VT
= Op
.getValueType();
3944 assert(VT
== MVT::v4f16
);
3947 std::tie(Lo
, Hi
) = DAG
.SplitVectorOperand(Op
.getNode(), 0);
3950 SDValue OpLo
= DAG
.getNode(Opc
, SL
, Lo
.getValueType(), Lo
,
3952 SDValue OpHi
= DAG
.getNode(Opc
, SL
, Hi
.getValueType(), Hi
,
3955 return DAG
.getNode(ISD::CONCAT_VECTORS
, SDLoc(Op
), VT
, OpLo
, OpHi
);
3958 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3959 // wider vector type is legal.
3960 SDValue
SITargetLowering::splitBinaryVectorOp(SDValue Op
,
3961 SelectionDAG
&DAG
) const {
3962 unsigned Opc
= Op
.getOpcode();
3963 EVT VT
= Op
.getValueType();
3964 assert(VT
== MVT::v4i16
|| VT
== MVT::v4f16
);
3967 std::tie(Lo0
, Hi0
) = DAG
.SplitVectorOperand(Op
.getNode(), 0);
3969 std::tie(Lo1
, Hi1
) = DAG
.SplitVectorOperand(Op
.getNode(), 1);
3973 SDValue OpLo
= DAG
.getNode(Opc
, SL
, Lo0
.getValueType(), Lo0
, Lo1
,
3975 SDValue OpHi
= DAG
.getNode(Opc
, SL
, Hi0
.getValueType(), Hi0
, Hi1
,
3978 return DAG
.getNode(ISD::CONCAT_VECTORS
, SDLoc(Op
), VT
, OpLo
, OpHi
);
3981 SDValue
SITargetLowering::splitTernaryVectorOp(SDValue Op
,
3982 SelectionDAG
&DAG
) const {
3983 unsigned Opc
= Op
.getOpcode();
3984 EVT VT
= Op
.getValueType();
3985 assert(VT
== MVT::v4i16
|| VT
== MVT::v4f16
);
3988 std::tie(Lo0
, Hi0
) = DAG
.SplitVectorOperand(Op
.getNode(), 0);
3990 std::tie(Lo1
, Hi1
) = DAG
.SplitVectorOperand(Op
.getNode(), 1);
3992 std::tie(Lo2
, Hi2
) = DAG
.SplitVectorOperand(Op
.getNode(), 2);
3996 SDValue OpLo
= DAG
.getNode(Opc
, SL
, Lo0
.getValueType(), Lo0
, Lo1
, Lo2
,
3998 SDValue OpHi
= DAG
.getNode(Opc
, SL
, Hi0
.getValueType(), Hi0
, Hi1
, Hi2
,
4001 return DAG
.getNode(ISD::CONCAT_VECTORS
, SDLoc(Op
), VT
, OpLo
, OpHi
);
4005 SDValue
SITargetLowering::LowerOperation(SDValue Op
, SelectionDAG
&DAG
) const {
4006 switch (Op
.getOpcode()) {
4007 default: return AMDGPUTargetLowering::LowerOperation(Op
, DAG
);
4008 case ISD::BRCOND
: return LowerBRCOND(Op
, DAG
);
4009 case ISD::RETURNADDR
: return LowerRETURNADDR(Op
, DAG
);
4011 SDValue Result
= LowerLOAD(Op
, DAG
);
4012 assert((!Result
.getNode() ||
4013 Result
.getNode()->getNumValues() == 2) &&
4014 "Load should return a value and a chain");
4020 return LowerTrig(Op
, DAG
);
4021 case ISD::SELECT
: return LowerSELECT(Op
, DAG
);
4022 case ISD::FDIV
: return LowerFDIV(Op
, DAG
);
4023 case ISD::ATOMIC_CMP_SWAP
: return LowerATOMIC_CMP_SWAP(Op
, DAG
);
4024 case ISD::STORE
: return LowerSTORE(Op
, DAG
);
4025 case ISD::GlobalAddress
: {
4026 MachineFunction
&MF
= DAG
.getMachineFunction();
4027 SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
4028 return LowerGlobalAddress(MFI
, Op
, DAG
);
4030 case ISD::INTRINSIC_WO_CHAIN
: return LowerINTRINSIC_WO_CHAIN(Op
, DAG
);
4031 case ISD::INTRINSIC_W_CHAIN
: return LowerINTRINSIC_W_CHAIN(Op
, DAG
);
4032 case ISD::INTRINSIC_VOID
: return LowerINTRINSIC_VOID(Op
, DAG
);
4033 case ISD::ADDRSPACECAST
: return lowerADDRSPACECAST(Op
, DAG
);
4034 case ISD::INSERT_SUBVECTOR
:
4035 return lowerINSERT_SUBVECTOR(Op
, DAG
);
4036 case ISD::INSERT_VECTOR_ELT
:
4037 return lowerINSERT_VECTOR_ELT(Op
, DAG
);
4038 case ISD::EXTRACT_VECTOR_ELT
:
4039 return lowerEXTRACT_VECTOR_ELT(Op
, DAG
);
4040 case ISD::VECTOR_SHUFFLE
:
4041 return lowerVECTOR_SHUFFLE(Op
, DAG
);
4042 case ISD::BUILD_VECTOR
:
4043 return lowerBUILD_VECTOR(Op
, DAG
);
4045 return lowerFP_ROUND(Op
, DAG
);
4047 return lowerTRAP(Op
, DAG
);
4048 case ISD::DEBUGTRAP
:
4049 return lowerDEBUGTRAP(Op
, DAG
);
4052 case ISD::FCANONICALIZE
:
4053 return splitUnaryVectorOp(Op
, DAG
);
4056 return lowerFMINNUM_FMAXNUM(Op
, DAG
);
4058 return splitTernaryVectorOp(Op
, DAG
);
4071 case ISD::FMINNUM_IEEE
:
4072 case ISD::FMAXNUM_IEEE
:
4073 return splitBinaryVectorOp(Op
, DAG
);
4078 static SDValue
adjustLoadValueTypeImpl(SDValue Result
, EVT LoadVT
,
4080 SelectionDAG
&DAG
, bool Unpacked
) {
4081 if (!LoadVT
.isVector())
4084 if (Unpacked
) { // From v2i32/v4i32 back to v2f16/v4f16.
4085 // Truncate to v2i16/v4i16.
4086 EVT IntLoadVT
= LoadVT
.changeTypeToInteger();
4088 // Workaround legalizer not scalarizing truncate after vector op
4089 // legalization byt not creating intermediate vector trunc.
4090 SmallVector
<SDValue
, 4> Elts
;
4091 DAG
.ExtractVectorElements(Result
, Elts
);
4092 for (SDValue
&Elt
: Elts
)
4093 Elt
= DAG
.getNode(ISD::TRUNCATE
, DL
, MVT::i16
, Elt
);
4095 Result
= DAG
.getBuildVector(IntLoadVT
, DL
, Elts
);
4097 // Bitcast to original type (v2f16/v4f16).
4098 return DAG
.getNode(ISD::BITCAST
, DL
, LoadVT
, Result
);
4101 // Cast back to the original packed type.
4102 return DAG
.getNode(ISD::BITCAST
, DL
, LoadVT
, Result
);
4105 SDValue
SITargetLowering::adjustLoadValueType(unsigned Opcode
,
4108 ArrayRef
<SDValue
> Ops
,
4109 bool IsIntrinsic
) const {
4112 bool Unpacked
= Subtarget
->hasUnpackedD16VMem();
4113 EVT LoadVT
= M
->getValueType(0);
4115 EVT EquivLoadVT
= LoadVT
;
4116 if (Unpacked
&& LoadVT
.isVector()) {
4117 EquivLoadVT
= LoadVT
.isVector() ?
4118 EVT::getVectorVT(*DAG
.getContext(), MVT::i32
,
4119 LoadVT
.getVectorNumElements()) : LoadVT
;
4122 // Change from v4f16/v2f16 to EquivLoadVT.
4123 SDVTList VTList
= DAG
.getVTList(EquivLoadVT
, MVT::Other
);
4126 = DAG
.getMemIntrinsicNode(
4127 IsIntrinsic
? (unsigned)ISD::INTRINSIC_W_CHAIN
: Opcode
, DL
,
4128 VTList
, Ops
, M
->getMemoryVT(),
4129 M
->getMemOperand());
4130 if (!Unpacked
) // Just adjusted the opcode.
4133 SDValue Adjusted
= adjustLoadValueTypeImpl(Load
, LoadVT
, DL
, DAG
, Unpacked
);
4135 return DAG
.getMergeValues({ Adjusted
, Load
.getValue(1) }, DL
);
4138 SDValue
SITargetLowering::lowerIntrinsicLoad(MemSDNode
*M
, bool IsFormat
,
4140 ArrayRef
<SDValue
> Ops
) const {
4142 EVT LoadVT
= M
->getValueType(0);
4143 EVT EltType
= LoadVT
.getScalarType();
4144 EVT IntVT
= LoadVT
.changeTypeToInteger();
4146 bool IsD16
= IsFormat
&& (EltType
.getSizeInBits() == 16);
4149 IsFormat
? AMDGPUISD::BUFFER_LOAD_FORMAT
: AMDGPUISD::BUFFER_LOAD
;
4152 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16
, M
, DAG
, Ops
);
4155 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
4156 if (!IsD16
&& !LoadVT
.isVector() && EltType
.getSizeInBits() < 32)
4157 return handleByteShortBufferLoads(DAG
, LoadVT
, DL
, Ops
, M
);
4159 if (isTypeLegal(LoadVT
)) {
4160 return getMemIntrinsicNode(Opc
, DL
, M
->getVTList(), Ops
, IntVT
,
4161 M
->getMemOperand(), DAG
);
4164 EVT CastVT
= getEquivalentMemType(*DAG
.getContext(), LoadVT
);
4165 SDVTList VTList
= DAG
.getVTList(CastVT
, MVT::Other
);
4166 SDValue MemNode
= getMemIntrinsicNode(Opc
, DL
, VTList
, Ops
, CastVT
,
4167 M
->getMemOperand(), DAG
);
4168 return DAG
.getMergeValues(
4169 {DAG
.getNode(ISD::BITCAST
, DL
, LoadVT
, MemNode
), MemNode
.getValue(1)},
4173 static SDValue
lowerICMPIntrinsic(const SITargetLowering
&TLI
,
4174 SDNode
*N
, SelectionDAG
&DAG
) {
4175 EVT VT
= N
->getValueType(0);
4176 const auto *CD
= cast
<ConstantSDNode
>(N
->getOperand(3));
4177 int CondCode
= CD
->getSExtValue();
4178 if (CondCode
< ICmpInst::Predicate::FIRST_ICMP_PREDICATE
||
4179 CondCode
> ICmpInst::Predicate::LAST_ICMP_PREDICATE
)
4180 return DAG
.getUNDEF(VT
);
4182 ICmpInst::Predicate IcInput
= static_cast<ICmpInst::Predicate
>(CondCode
);
4184 SDValue LHS
= N
->getOperand(1);
4185 SDValue RHS
= N
->getOperand(2);
4189 EVT CmpVT
= LHS
.getValueType();
4190 if (CmpVT
== MVT::i16
&& !TLI
.isTypeLegal(MVT::i16
)) {
4191 unsigned PromoteOp
= ICmpInst::isSigned(IcInput
) ?
4192 ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND
;
4193 LHS
= DAG
.getNode(PromoteOp
, DL
, MVT::i32
, LHS
);
4194 RHS
= DAG
.getNode(PromoteOp
, DL
, MVT::i32
, RHS
);
4197 ISD::CondCode CCOpcode
= getICmpCondCode(IcInput
);
4199 unsigned WavefrontSize
= TLI
.getSubtarget()->getWavefrontSize();
4200 EVT CCVT
= EVT::getIntegerVT(*DAG
.getContext(), WavefrontSize
);
4202 SDValue SetCC
= DAG
.getNode(AMDGPUISD::SETCC
, DL
, CCVT
, LHS
, RHS
,
4203 DAG
.getCondCode(CCOpcode
));
4204 if (VT
.bitsEq(CCVT
))
4206 return DAG
.getZExtOrTrunc(SetCC
, DL
, VT
);
4209 static SDValue
lowerFCMPIntrinsic(const SITargetLowering
&TLI
,
4210 SDNode
*N
, SelectionDAG
&DAG
) {
4211 EVT VT
= N
->getValueType(0);
4212 const auto *CD
= cast
<ConstantSDNode
>(N
->getOperand(3));
4214 int CondCode
= CD
->getSExtValue();
4215 if (CondCode
< FCmpInst::Predicate::FIRST_FCMP_PREDICATE
||
4216 CondCode
> FCmpInst::Predicate::LAST_FCMP_PREDICATE
) {
4217 return DAG
.getUNDEF(VT
);
4220 SDValue Src0
= N
->getOperand(1);
4221 SDValue Src1
= N
->getOperand(2);
4222 EVT CmpVT
= Src0
.getValueType();
4225 if (CmpVT
== MVT::f16
&& !TLI
.isTypeLegal(CmpVT
)) {
4226 Src0
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, Src0
);
4227 Src1
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, Src1
);
4230 FCmpInst::Predicate IcInput
= static_cast<FCmpInst::Predicate
>(CondCode
);
4231 ISD::CondCode CCOpcode
= getFCmpCondCode(IcInput
);
4232 unsigned WavefrontSize
= TLI
.getSubtarget()->getWavefrontSize();
4233 EVT CCVT
= EVT::getIntegerVT(*DAG
.getContext(), WavefrontSize
);
4234 SDValue SetCC
= DAG
.getNode(AMDGPUISD::SETCC
, SL
, CCVT
, Src0
,
4235 Src1
, DAG
.getCondCode(CCOpcode
));
4236 if (VT
.bitsEq(CCVT
))
4238 return DAG
.getZExtOrTrunc(SetCC
, SL
, VT
);
4241 void SITargetLowering::ReplaceNodeResults(SDNode
*N
,
4242 SmallVectorImpl
<SDValue
> &Results
,
4243 SelectionDAG
&DAG
) const {
4244 switch (N
->getOpcode()) {
4245 case ISD::INSERT_VECTOR_ELT
: {
4246 if (SDValue Res
= lowerINSERT_VECTOR_ELT(SDValue(N
, 0), DAG
))
4247 Results
.push_back(Res
);
4250 case ISD::EXTRACT_VECTOR_ELT
: {
4251 if (SDValue Res
= lowerEXTRACT_VECTOR_ELT(SDValue(N
, 0), DAG
))
4252 Results
.push_back(Res
);
4255 case ISD::INTRINSIC_WO_CHAIN
: {
4256 unsigned IID
= cast
<ConstantSDNode
>(N
->getOperand(0))->getZExtValue();
4258 case Intrinsic::amdgcn_cvt_pkrtz
: {
4259 SDValue Src0
= N
->getOperand(1);
4260 SDValue Src1
= N
->getOperand(2);
4262 SDValue Cvt
= DAG
.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32
, SL
, MVT::i32
,
4264 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2f16
, Cvt
));
4267 case Intrinsic::amdgcn_cvt_pknorm_i16
:
4268 case Intrinsic::amdgcn_cvt_pknorm_u16
:
4269 case Intrinsic::amdgcn_cvt_pk_i16
:
4270 case Intrinsic::amdgcn_cvt_pk_u16
: {
4271 SDValue Src0
= N
->getOperand(1);
4272 SDValue Src1
= N
->getOperand(2);
4276 if (IID
== Intrinsic::amdgcn_cvt_pknorm_i16
)
4277 Opcode
= AMDGPUISD::CVT_PKNORM_I16_F32
;
4278 else if (IID
== Intrinsic::amdgcn_cvt_pknorm_u16
)
4279 Opcode
= AMDGPUISD::CVT_PKNORM_U16_F32
;
4280 else if (IID
== Intrinsic::amdgcn_cvt_pk_i16
)
4281 Opcode
= AMDGPUISD::CVT_PK_I16_I32
;
4283 Opcode
= AMDGPUISD::CVT_PK_U16_U32
;
4285 EVT VT
= N
->getValueType(0);
4286 if (isTypeLegal(VT
))
4287 Results
.push_back(DAG
.getNode(Opcode
, SL
, VT
, Src0
, Src1
));
4289 SDValue Cvt
= DAG
.getNode(Opcode
, SL
, MVT::i32
, Src0
, Src1
);
4290 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i16
, Cvt
));
4297 case ISD::INTRINSIC_W_CHAIN
: {
4298 if (SDValue Res
= LowerINTRINSIC_W_CHAIN(SDValue(N
, 0), DAG
)) {
4299 if (Res
.getOpcode() == ISD::MERGE_VALUES
) {
4301 Results
.push_back(Res
.getOperand(0));
4302 Results
.push_back(Res
.getOperand(1));
4304 Results
.push_back(Res
);
4305 Results
.push_back(Res
.getValue(1));
4314 EVT VT
= N
->getValueType(0);
4315 EVT NewVT
= getEquivalentMemType(*DAG
.getContext(), VT
);
4316 SDValue LHS
= DAG
.getNode(ISD::BITCAST
, SL
, NewVT
, N
->getOperand(1));
4317 SDValue RHS
= DAG
.getNode(ISD::BITCAST
, SL
, NewVT
, N
->getOperand(2));
4319 EVT SelectVT
= NewVT
;
4320 if (NewVT
.bitsLT(MVT::i32
)) {
4321 LHS
= DAG
.getNode(ISD::ANY_EXTEND
, SL
, MVT::i32
, LHS
);
4322 RHS
= DAG
.getNode(ISD::ANY_EXTEND
, SL
, MVT::i32
, RHS
);
4323 SelectVT
= MVT::i32
;
4326 SDValue NewSelect
= DAG
.getNode(ISD::SELECT
, SL
, SelectVT
,
4327 N
->getOperand(0), LHS
, RHS
);
4329 if (NewVT
!= SelectVT
)
4330 NewSelect
= DAG
.getNode(ISD::TRUNCATE
, SL
, NewVT
, NewSelect
);
4331 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, VT
, NewSelect
));
4335 if (N
->getValueType(0) != MVT::v2f16
)
4339 SDValue BC
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, N
->getOperand(0));
4341 SDValue Op
= DAG
.getNode(ISD::XOR
, SL
, MVT::i32
,
4343 DAG
.getConstant(0x80008000, SL
, MVT::i32
));
4344 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2f16
, Op
));
4348 if (N
->getValueType(0) != MVT::v2f16
)
4352 SDValue BC
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, N
->getOperand(0));
4354 SDValue Op
= DAG
.getNode(ISD::AND
, SL
, MVT::i32
,
4356 DAG
.getConstant(0x7fff7fff, SL
, MVT::i32
));
4357 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2f16
, Op
));
4365 /// Helper function for LowerBRCOND
4366 static SDNode
*findUser(SDValue Value
, unsigned Opcode
) {
4368 SDNode
*Parent
= Value
.getNode();
4369 for (SDNode::use_iterator I
= Parent
->use_begin(), E
= Parent
->use_end();
4372 if (I
.getUse().get() != Value
)
4375 if (I
->getOpcode() == Opcode
)
4381 unsigned SITargetLowering::isCFIntrinsic(const SDNode
*Intr
) const {
4382 if (Intr
->getOpcode() == ISD::INTRINSIC_W_CHAIN
) {
4383 switch (cast
<ConstantSDNode
>(Intr
->getOperand(1))->getZExtValue()) {
4384 case Intrinsic::amdgcn_if
:
4385 return AMDGPUISD::IF
;
4386 case Intrinsic::amdgcn_else
:
4387 return AMDGPUISD::ELSE
;
4388 case Intrinsic::amdgcn_loop
:
4389 return AMDGPUISD::LOOP
;
4390 case Intrinsic::amdgcn_end_cf
:
4391 llvm_unreachable("should not occur");
4397 // break, if_break, else_break are all only used as inputs to loop, not
4398 // directly as branch conditions.
4402 bool SITargetLowering::shouldEmitFixup(const GlobalValue
*GV
) const {
4403 const Triple
&TT
= getTargetMachine().getTargetTriple();
4404 return (GV
->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS
||
4405 GV
->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT
) &&
4406 AMDGPU::shouldEmitConstantsToTextSection(TT
);
4409 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue
*GV
) const {
4410 // FIXME: Either avoid relying on address space here or change the default
4411 // address space for functions to avoid the explicit check.
4412 return (GV
->getValueType()->isFunctionTy() ||
4413 GV
->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS
||
4414 GV
->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS
||
4415 GV
->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT
) &&
4416 !shouldEmitFixup(GV
) &&
4417 !getTargetMachine().shouldAssumeDSOLocal(*GV
->getParent(), GV
);
4420 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue
*GV
) const {
4421 return !shouldEmitFixup(GV
) && !shouldEmitGOTReloc(GV
);
4424 /// This transforms the control flow intrinsics to get the branch destination as
4425 /// last parameter, also switches branch target with BR if the need arise
4426 SDValue
SITargetLowering::LowerBRCOND(SDValue BRCOND
,
4427 SelectionDAG
&DAG
) const {
4430 SDNode
*Intr
= BRCOND
.getOperand(1).getNode();
4431 SDValue Target
= BRCOND
.getOperand(2);
4432 SDNode
*BR
= nullptr;
4433 SDNode
*SetCC
= nullptr;
4435 if (Intr
->getOpcode() == ISD::SETCC
) {
4436 // As long as we negate the condition everything is fine
4438 Intr
= SetCC
->getOperand(0).getNode();
4441 // Get the target from BR if we don't negate the condition
4442 BR
= findUser(BRCOND
, ISD::BR
);
4443 Target
= BR
->getOperand(1);
4446 // FIXME: This changes the types of the intrinsics instead of introducing new
4447 // nodes with the correct types.
4448 // e.g. llvm.amdgcn.loop
4450 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
4451 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
4453 unsigned CFNode
= isCFIntrinsic(Intr
);
4455 // This is a uniform branch so we don't need to legalize.
4459 bool HaveChain
= Intr
->getOpcode() == ISD::INTRINSIC_VOID
||
4460 Intr
->getOpcode() == ISD::INTRINSIC_W_CHAIN
;
4463 (SetCC
->getConstantOperandVal(1) == 1 &&
4464 cast
<CondCodeSDNode
>(SetCC
->getOperand(2).getNode())->get() ==
4467 // operands of the new intrinsic call
4468 SmallVector
<SDValue
, 4> Ops
;
4470 Ops
.push_back(BRCOND
.getOperand(0));
4472 Ops
.append(Intr
->op_begin() + (HaveChain
? 2 : 1), Intr
->op_end());
4473 Ops
.push_back(Target
);
4475 ArrayRef
<EVT
> Res(Intr
->value_begin() + 1, Intr
->value_end());
4477 // build the new intrinsic call
4478 SDNode
*Result
= DAG
.getNode(CFNode
, DL
, DAG
.getVTList(Res
), Ops
).getNode();
4483 BRCOND
.getOperand(0)
4486 Result
= DAG
.getMergeValues(Ops
, DL
).getNode();
4490 // Give the branch instruction our target
4493 BRCOND
.getOperand(2)
4495 SDValue NewBR
= DAG
.getNode(ISD::BR
, DL
, BR
->getVTList(), Ops
);
4496 DAG
.ReplaceAllUsesWith(BR
, NewBR
.getNode());
4497 BR
= NewBR
.getNode();
4500 SDValue Chain
= SDValue(Result
, Result
->getNumValues() - 1);
4502 // Copy the intrinsic results to registers
4503 for (unsigned i
= 1, e
= Intr
->getNumValues() - 1; i
!= e
; ++i
) {
4504 SDNode
*CopyToReg
= findUser(SDValue(Intr
, i
), ISD::CopyToReg
);
4508 Chain
= DAG
.getCopyToReg(
4510 CopyToReg
->getOperand(1),
4511 SDValue(Result
, i
- 1),
4514 DAG
.ReplaceAllUsesWith(SDValue(CopyToReg
, 0), CopyToReg
->getOperand(0));
4517 // Remove the old intrinsic from the chain
4518 DAG
.ReplaceAllUsesOfValueWith(
4519 SDValue(Intr
, Intr
->getNumValues() - 1),
4520 Intr
->getOperand(0));
4525 SDValue
SITargetLowering::LowerRETURNADDR(SDValue Op
,
4526 SelectionDAG
&DAG
) const {
4527 MVT VT
= Op
.getSimpleValueType();
4529 // Checking the depth
4530 if (cast
<ConstantSDNode
>(Op
.getOperand(0))->getZExtValue() != 0)
4531 return DAG
.getConstant(0, DL
, VT
);
4533 MachineFunction
&MF
= DAG
.getMachineFunction();
4534 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
4535 // Check for kernel and shader functions
4536 if (Info
->isEntryFunction())
4537 return DAG
.getConstant(0, DL
, VT
);
4539 MachineFrameInfo
&MFI
= MF
.getFrameInfo();
4540 // There is a call to @llvm.returnaddress in this function
4541 MFI
.setReturnAddressIsTaken(true);
4543 const SIRegisterInfo
*TRI
= getSubtarget()->getRegisterInfo();
4544 // Get the return address reg and mark it as an implicit live-in
4545 unsigned Reg
= MF
.addLiveIn(TRI
->getReturnAddressReg(MF
), getRegClassFor(VT
, Op
.getNode()->isDivergent()));
4547 return DAG
.getCopyFromReg(DAG
.getEntryNode(), DL
, Reg
, VT
);
4550 SDValue
SITargetLowering::getFPExtOrFPTrunc(SelectionDAG
&DAG
,
4554 return Op
.getValueType().bitsLE(VT
) ?
4555 DAG
.getNode(ISD::FP_EXTEND
, DL
, VT
, Op
) :
4556 DAG
.getNode(ISD::FTRUNC
, DL
, VT
, Op
);
4559 SDValue
SITargetLowering::lowerFP_ROUND(SDValue Op
, SelectionDAG
&DAG
) const {
4560 assert(Op
.getValueType() == MVT::f16
&&
4561 "Do not know how to custom lower FP_ROUND for non-f16 type");
4563 SDValue Src
= Op
.getOperand(0);
4564 EVT SrcVT
= Src
.getValueType();
4565 if (SrcVT
!= MVT::f64
)
4570 SDValue FpToFp16
= DAG
.getNode(ISD::FP_TO_FP16
, DL
, MVT::i32
, Src
);
4571 SDValue Trunc
= DAG
.getNode(ISD::TRUNCATE
, DL
, MVT::i16
, FpToFp16
);
4572 return DAG
.getNode(ISD::BITCAST
, DL
, MVT::f16
, Trunc
);
4575 SDValue
SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op
,
4576 SelectionDAG
&DAG
) const {
4577 EVT VT
= Op
.getValueType();
4578 const MachineFunction
&MF
= DAG
.getMachineFunction();
4579 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
4580 bool IsIEEEMode
= Info
->getMode().IEEE
;
4582 // FIXME: Assert during eslection that this is only selected for
4583 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
4584 // mode functions, but this happens to be OK since it's only done in cases
4585 // where there is known no sNaN.
4587 return expandFMINNUM_FMAXNUM(Op
.getNode(), DAG
);
4589 if (VT
== MVT::v4f16
)
4590 return splitBinaryVectorOp(Op
, DAG
);
4594 SDValue
SITargetLowering::lowerTRAP(SDValue Op
, SelectionDAG
&DAG
) const {
4596 SDValue Chain
= Op
.getOperand(0);
4598 if (Subtarget
->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa
||
4599 !Subtarget
->isTrapHandlerEnabled())
4600 return DAG
.getNode(AMDGPUISD::ENDPGM
, SL
, MVT::Other
, Chain
);
4602 MachineFunction
&MF
= DAG
.getMachineFunction();
4603 SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
4604 unsigned UserSGPR
= Info
->getQueuePtrUserSGPR();
4605 assert(UserSGPR
!= AMDGPU::NoRegister
);
4606 SDValue QueuePtr
= CreateLiveInRegister(
4607 DAG
, &AMDGPU::SReg_64RegClass
, UserSGPR
, MVT::i64
);
4608 SDValue SGPR01
= DAG
.getRegister(AMDGPU::SGPR0_SGPR1
, MVT::i64
);
4609 SDValue ToReg
= DAG
.getCopyToReg(Chain
, SL
, SGPR01
,
4610 QueuePtr
, SDValue());
4613 DAG
.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap
, SL
, MVT::i16
),
4617 return DAG
.getNode(AMDGPUISD::TRAP
, SL
, MVT::Other
, Ops
);
4620 SDValue
SITargetLowering::lowerDEBUGTRAP(SDValue Op
, SelectionDAG
&DAG
) const {
4622 SDValue Chain
= Op
.getOperand(0);
4623 MachineFunction
&MF
= DAG
.getMachineFunction();
4625 if (Subtarget
->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa
||
4626 !Subtarget
->isTrapHandlerEnabled()) {
4627 DiagnosticInfoUnsupported
NoTrap(MF
.getFunction(),
4628 "debugtrap handler not supported",
4631 LLVMContext
&Ctx
= MF
.getFunction().getContext();
4632 Ctx
.diagnose(NoTrap
);
4638 DAG
.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap
, SL
, MVT::i16
)
4640 return DAG
.getNode(AMDGPUISD::TRAP
, SL
, MVT::Other
, Ops
);
4643 SDValue
SITargetLowering::getSegmentAperture(unsigned AS
, const SDLoc
&DL
,
4644 SelectionDAG
&DAG
) const {
4645 // FIXME: Use inline constants (src_{shared, private}_base) instead.
4646 if (Subtarget
->hasApertureRegs()) {
4647 unsigned Offset
= AS
== AMDGPUAS::LOCAL_ADDRESS
?
4648 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE
:
4649 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE
;
4650 unsigned WidthM1
= AS
== AMDGPUAS::LOCAL_ADDRESS
?
4651 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE
:
4652 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE
;
4654 AMDGPU::Hwreg::ID_MEM_BASES
<< AMDGPU::Hwreg::ID_SHIFT_
|
4655 Offset
<< AMDGPU::Hwreg::OFFSET_SHIFT_
|
4656 WidthM1
<< AMDGPU::Hwreg::WIDTH_M1_SHIFT_
;
4658 SDValue EncodingImm
= DAG
.getTargetConstant(Encoding
, DL
, MVT::i16
);
4659 SDValue ApertureReg
= SDValue(
4660 DAG
.getMachineNode(AMDGPU::S_GETREG_B32
, DL
, MVT::i32
, EncodingImm
), 0);
4661 SDValue ShiftAmount
= DAG
.getTargetConstant(WidthM1
+ 1, DL
, MVT::i32
);
4662 return DAG
.getNode(ISD::SHL
, DL
, MVT::i32
, ApertureReg
, ShiftAmount
);
4665 MachineFunction
&MF
= DAG
.getMachineFunction();
4666 SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
4667 unsigned UserSGPR
= Info
->getQueuePtrUserSGPR();
4668 assert(UserSGPR
!= AMDGPU::NoRegister
);
4670 SDValue QueuePtr
= CreateLiveInRegister(
4671 DAG
, &AMDGPU::SReg_64RegClass
, UserSGPR
, MVT::i64
);
4673 // Offset into amd_queue_t for group_segment_aperture_base_hi /
4674 // private_segment_aperture_base_hi.
4675 uint32_t StructOffset
= (AS
== AMDGPUAS::LOCAL_ADDRESS
) ? 0x40 : 0x44;
4677 SDValue Ptr
= DAG
.getObjectPtrOffset(DL
, QueuePtr
, StructOffset
);
4679 // TODO: Use custom target PseudoSourceValue.
4680 // TODO: We should use the value from the IR intrinsic call, but it might not
4681 // be available and how do we get it?
4682 Value
*V
= UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG
.getContext()),
4683 AMDGPUAS::CONSTANT_ADDRESS
));
4685 MachinePointerInfo
PtrInfo(V
, StructOffset
);
4686 return DAG
.getLoad(MVT::i32
, DL
, QueuePtr
.getValue(1), Ptr
, PtrInfo
,
4687 MinAlign(64, StructOffset
),
4688 MachineMemOperand::MODereferenceable
|
4689 MachineMemOperand::MOInvariant
);
4692 SDValue
SITargetLowering::lowerADDRSPACECAST(SDValue Op
,
4693 SelectionDAG
&DAG
) const {
4695 const AddrSpaceCastSDNode
*ASC
= cast
<AddrSpaceCastSDNode
>(Op
);
4697 SDValue Src
= ASC
->getOperand(0);
4698 SDValue FlatNullPtr
= DAG
.getConstant(0, SL
, MVT::i64
);
4700 const AMDGPUTargetMachine
&TM
=
4701 static_cast<const AMDGPUTargetMachine
&>(getTargetMachine());
4703 // flat -> local/private
4704 if (ASC
->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS
) {
4705 unsigned DestAS
= ASC
->getDestAddressSpace();
4707 if (DestAS
== AMDGPUAS::LOCAL_ADDRESS
||
4708 DestAS
== AMDGPUAS::PRIVATE_ADDRESS
) {
4709 unsigned NullVal
= TM
.getNullPointerValue(DestAS
);
4710 SDValue SegmentNullPtr
= DAG
.getConstant(NullVal
, SL
, MVT::i32
);
4711 SDValue NonNull
= DAG
.getSetCC(SL
, MVT::i1
, Src
, FlatNullPtr
, ISD::SETNE
);
4712 SDValue Ptr
= DAG
.getNode(ISD::TRUNCATE
, SL
, MVT::i32
, Src
);
4714 return DAG
.getNode(ISD::SELECT
, SL
, MVT::i32
,
4715 NonNull
, Ptr
, SegmentNullPtr
);
4719 // local/private -> flat
4720 if (ASC
->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS
) {
4721 unsigned SrcAS
= ASC
->getSrcAddressSpace();
4723 if (SrcAS
== AMDGPUAS::LOCAL_ADDRESS
||
4724 SrcAS
== AMDGPUAS::PRIVATE_ADDRESS
) {
4725 unsigned NullVal
= TM
.getNullPointerValue(SrcAS
);
4726 SDValue SegmentNullPtr
= DAG
.getConstant(NullVal
, SL
, MVT::i32
);
4729 = DAG
.getSetCC(SL
, MVT::i1
, Src
, SegmentNullPtr
, ISD::SETNE
);
4731 SDValue Aperture
= getSegmentAperture(ASC
->getSrcAddressSpace(), SL
, DAG
);
4733 = DAG
.getNode(ISD::BUILD_VECTOR
, SL
, MVT::v2i32
, Src
, Aperture
);
4735 return DAG
.getNode(ISD::SELECT
, SL
, MVT::i64
, NonNull
,
4736 DAG
.getNode(ISD::BITCAST
, SL
, MVT::i64
, CvtPtr
),
4741 // global <-> flat are no-ops and never emitted.
4743 const MachineFunction
&MF
= DAG
.getMachineFunction();
4744 DiagnosticInfoUnsupported
InvalidAddrSpaceCast(
4745 MF
.getFunction(), "invalid addrspacecast", SL
.getDebugLoc());
4746 DAG
.getContext()->diagnose(InvalidAddrSpaceCast
);
4748 return DAG
.getUNDEF(ASC
->getValueType(0));
4751 // This lowers an INSERT_SUBVECTOR by extracting the individual elements from
4752 // the small vector and inserting them into the big vector. That is better than
4753 // the default expansion of doing it via a stack slot. Even though the use of
4754 // the stack slot would be optimized away afterwards, the stack slot itself
4756 SDValue
SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op
,
4757 SelectionDAG
&DAG
) const {
4758 SDValue Vec
= Op
.getOperand(0);
4759 SDValue Ins
= Op
.getOperand(1);
4760 SDValue Idx
= Op
.getOperand(2);
4761 EVT VecVT
= Vec
.getValueType();
4762 EVT InsVT
= Ins
.getValueType();
4763 EVT EltVT
= VecVT
.getVectorElementType();
4764 unsigned InsNumElts
= InsVT
.getVectorNumElements();
4765 unsigned IdxVal
= cast
<ConstantSDNode
>(Idx
)->getZExtValue();
4768 for (unsigned I
= 0; I
!= InsNumElts
; ++I
) {
4769 SDValue Elt
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
, Ins
,
4770 DAG
.getConstant(I
, SL
, MVT::i32
));
4771 Vec
= DAG
.getNode(ISD::INSERT_VECTOR_ELT
, SL
, VecVT
, Vec
, Elt
,
4772 DAG
.getConstant(IdxVal
+ I
, SL
, MVT::i32
));
4777 SDValue
SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op
,
4778 SelectionDAG
&DAG
) const {
4779 SDValue Vec
= Op
.getOperand(0);
4780 SDValue InsVal
= Op
.getOperand(1);
4781 SDValue Idx
= Op
.getOperand(2);
4782 EVT VecVT
= Vec
.getValueType();
4783 EVT EltVT
= VecVT
.getVectorElementType();
4784 unsigned VecSize
= VecVT
.getSizeInBits();
4785 unsigned EltSize
= EltVT
.getSizeInBits();
4788 assert(VecSize
<= 64);
4790 unsigned NumElts
= VecVT
.getVectorNumElements();
4792 auto KIdx
= dyn_cast
<ConstantSDNode
>(Idx
);
4794 if (NumElts
== 4 && EltSize
== 16 && KIdx
) {
4795 SDValue BCVec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i32
, Vec
);
4797 SDValue LoHalf
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, BCVec
,
4798 DAG
.getConstant(0, SL
, MVT::i32
));
4799 SDValue HiHalf
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, BCVec
,
4800 DAG
.getConstant(1, SL
, MVT::i32
));
4802 SDValue LoVec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i16
, LoHalf
);
4803 SDValue HiVec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i16
, HiHalf
);
4805 unsigned Idx
= KIdx
->getZExtValue();
4806 bool InsertLo
= Idx
< 2;
4807 SDValue InsHalf
= DAG
.getNode(ISD::INSERT_VECTOR_ELT
, SL
, MVT::v2i16
,
4808 InsertLo
? LoVec
: HiVec
,
4809 DAG
.getNode(ISD::BITCAST
, SL
, MVT::i16
, InsVal
),
4810 DAG
.getConstant(InsertLo
? Idx
: (Idx
- 2), SL
, MVT::i32
));
4812 InsHalf
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, InsHalf
);
4814 SDValue Concat
= InsertLo
?
4815 DAG
.getBuildVector(MVT::v2i32
, SL
, { InsHalf
, HiHalf
}) :
4816 DAG
.getBuildVector(MVT::v2i32
, SL
, { LoHalf
, InsHalf
});
4818 return DAG
.getNode(ISD::BITCAST
, SL
, VecVT
, Concat
);
4821 if (isa
<ConstantSDNode
>(Idx
))
4824 MVT IntVT
= MVT::getIntegerVT(VecSize
);
4826 // Avoid stack access for dynamic indexing.
4827 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4829 // Create a congruent vector with the target value in each element so that
4830 // the required element can be masked and ORed into the target vector.
4831 SDValue ExtVal
= DAG
.getNode(ISD::BITCAST
, SL
, IntVT
,
4832 DAG
.getSplatBuildVector(VecVT
, SL
, InsVal
));
4834 assert(isPowerOf2_32(EltSize
));
4835 SDValue ScaleFactor
= DAG
.getConstant(Log2_32(EltSize
), SL
, MVT::i32
);
4837 // Convert vector index to bit-index.
4838 SDValue ScaledIdx
= DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, Idx
, ScaleFactor
);
4840 SDValue BCVec
= DAG
.getNode(ISD::BITCAST
, SL
, IntVT
, Vec
);
4841 SDValue BFM
= DAG
.getNode(ISD::SHL
, SL
, IntVT
,
4842 DAG
.getConstant(0xffff, SL
, IntVT
),
4845 SDValue LHS
= DAG
.getNode(ISD::AND
, SL
, IntVT
, BFM
, ExtVal
);
4846 SDValue RHS
= DAG
.getNode(ISD::AND
, SL
, IntVT
,
4847 DAG
.getNOT(SL
, BFM
, IntVT
), BCVec
);
4849 SDValue BFI
= DAG
.getNode(ISD::OR
, SL
, IntVT
, LHS
, RHS
);
4850 return DAG
.getNode(ISD::BITCAST
, SL
, VecVT
, BFI
);
4853 SDValue
SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op
,
4854 SelectionDAG
&DAG
) const {
4857 EVT ResultVT
= Op
.getValueType();
4858 SDValue Vec
= Op
.getOperand(0);
4859 SDValue Idx
= Op
.getOperand(1);
4860 EVT VecVT
= Vec
.getValueType();
4861 unsigned VecSize
= VecVT
.getSizeInBits();
4862 EVT EltVT
= VecVT
.getVectorElementType();
4863 assert(VecSize
<= 64);
4865 DAGCombinerInfo
DCI(DAG
, AfterLegalizeVectorOps
, true, nullptr);
4867 // Make sure we do any optimizations that will make it easier to fold
4868 // source modifiers before obscuring it with bit operations.
4870 // XXX - Why doesn't this get called when vector_shuffle is expanded?
4871 if (SDValue Combined
= performExtractVectorEltCombine(Op
.getNode(), DCI
))
4874 unsigned EltSize
= EltVT
.getSizeInBits();
4875 assert(isPowerOf2_32(EltSize
));
4877 MVT IntVT
= MVT::getIntegerVT(VecSize
);
4878 SDValue ScaleFactor
= DAG
.getConstant(Log2_32(EltSize
), SL
, MVT::i32
);
4880 // Convert vector index to bit-index (* EltSize)
4881 SDValue ScaledIdx
= DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, Idx
, ScaleFactor
);
4883 SDValue BC
= DAG
.getNode(ISD::BITCAST
, SL
, IntVT
, Vec
);
4884 SDValue Elt
= DAG
.getNode(ISD::SRL
, SL
, IntVT
, BC
, ScaledIdx
);
4886 if (ResultVT
== MVT::f16
) {
4887 SDValue Result
= DAG
.getNode(ISD::TRUNCATE
, SL
, MVT::i16
, Elt
);
4888 return DAG
.getNode(ISD::BITCAST
, SL
, ResultVT
, Result
);
4891 return DAG
.getAnyExtOrTrunc(Elt
, SL
, ResultVT
);
4894 static bool elementPairIsContiguous(ArrayRef
<int> Mask
, int Elt
) {
4895 assert(Elt
% 2 == 0);
4896 return Mask
[Elt
+ 1] == Mask
[Elt
] + 1 && (Mask
[Elt
] % 2 == 0);
4899 SDValue
SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op
,
4900 SelectionDAG
&DAG
) const {
4902 EVT ResultVT
= Op
.getValueType();
4903 ShuffleVectorSDNode
*SVN
= cast
<ShuffleVectorSDNode
>(Op
);
4905 EVT PackVT
= ResultVT
.isInteger() ? MVT::v2i16
: MVT::v2f16
;
4906 EVT EltVT
= PackVT
.getVectorElementType();
4907 int SrcNumElts
= Op
.getOperand(0).getValueType().getVectorNumElements();
4909 // vector_shuffle <0,1,6,7> lhs, rhs
4910 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
4912 // vector_shuffle <6,7,2,3> lhs, rhs
4913 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
4915 // vector_shuffle <6,7,0,1> lhs, rhs
4916 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
4918 // Avoid scalarizing when both halves are reading from consecutive elements.
4919 SmallVector
<SDValue
, 4> Pieces
;
4920 for (int I
= 0, N
= ResultVT
.getVectorNumElements(); I
!= N
; I
+= 2) {
4921 if (elementPairIsContiguous(SVN
->getMask(), I
)) {
4922 const int Idx
= SVN
->getMaskElt(I
);
4923 int VecIdx
= Idx
< SrcNumElts
? 0 : 1;
4924 int EltIdx
= Idx
< SrcNumElts
? Idx
: Idx
- SrcNumElts
;
4925 SDValue SubVec
= DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, SL
,
4926 PackVT
, SVN
->getOperand(VecIdx
),
4927 DAG
.getConstant(EltIdx
, SL
, MVT::i32
));
4928 Pieces
.push_back(SubVec
);
4930 const int Idx0
= SVN
->getMaskElt(I
);
4931 const int Idx1
= SVN
->getMaskElt(I
+ 1);
4932 int VecIdx0
= Idx0
< SrcNumElts
? 0 : 1;
4933 int VecIdx1
= Idx1
< SrcNumElts
? 0 : 1;
4934 int EltIdx0
= Idx0
< SrcNumElts
? Idx0
: Idx0
- SrcNumElts
;
4935 int EltIdx1
= Idx1
< SrcNumElts
? Idx1
: Idx1
- SrcNumElts
;
4937 SDValue Vec0
= SVN
->getOperand(VecIdx0
);
4938 SDValue Elt0
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
,
4939 Vec0
, DAG
.getConstant(EltIdx0
, SL
, MVT::i32
));
4941 SDValue Vec1
= SVN
->getOperand(VecIdx1
);
4942 SDValue Elt1
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
,
4943 Vec1
, DAG
.getConstant(EltIdx1
, SL
, MVT::i32
));
4944 Pieces
.push_back(DAG
.getBuildVector(PackVT
, SL
, { Elt0
, Elt1
}));
4948 return DAG
.getNode(ISD::CONCAT_VECTORS
, SL
, ResultVT
, Pieces
);
4951 SDValue
SITargetLowering::lowerBUILD_VECTOR(SDValue Op
,
4952 SelectionDAG
&DAG
) const {
4954 EVT VT
= Op
.getValueType();
4956 if (VT
== MVT::v4i16
|| VT
== MVT::v4f16
) {
4957 EVT HalfVT
= MVT::getVectorVT(VT
.getVectorElementType().getSimpleVT(), 2);
4959 // Turn into pair of packed build_vectors.
4960 // TODO: Special case for constants that can be materialized with s_mov_b64.
4961 SDValue Lo
= DAG
.getBuildVector(HalfVT
, SL
,
4962 { Op
.getOperand(0), Op
.getOperand(1) });
4963 SDValue Hi
= DAG
.getBuildVector(HalfVT
, SL
,
4964 { Op
.getOperand(2), Op
.getOperand(3) });
4966 SDValue CastLo
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, Lo
);
4967 SDValue CastHi
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, Hi
);
4969 SDValue Blend
= DAG
.getBuildVector(MVT::v2i32
, SL
, { CastLo
, CastHi
});
4970 return DAG
.getNode(ISD::BITCAST
, SL
, VT
, Blend
);
4973 assert(VT
== MVT::v2f16
|| VT
== MVT::v2i16
);
4974 assert(!Subtarget
->hasVOP3PInsts() && "this should be legal");
4976 SDValue Lo
= Op
.getOperand(0);
4977 SDValue Hi
= Op
.getOperand(1);
4979 // Avoid adding defined bits with the zero_extend.
4981 Lo
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i16
, Lo
);
4982 SDValue ExtLo
= DAG
.getNode(ISD::ANY_EXTEND
, SL
, MVT::i32
, Lo
);
4983 return DAG
.getNode(ISD::BITCAST
, SL
, VT
, ExtLo
);
4986 Hi
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i16
, Hi
);
4987 Hi
= DAG
.getNode(ISD::ZERO_EXTEND
, SL
, MVT::i32
, Hi
);
4989 SDValue ShlHi
= DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, Hi
,
4990 DAG
.getConstant(16, SL
, MVT::i32
));
4992 return DAG
.getNode(ISD::BITCAST
, SL
, VT
, ShlHi
);
4994 Lo
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i16
, Lo
);
4995 Lo
= DAG
.getNode(ISD::ZERO_EXTEND
, SL
, MVT::i32
, Lo
);
4997 SDValue Or
= DAG
.getNode(ISD::OR
, SL
, MVT::i32
, Lo
, ShlHi
);
4998 return DAG
.getNode(ISD::BITCAST
, SL
, VT
, Or
);
5002 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode
*GA
) const {
5003 // We can fold offsets for anything that doesn't require a GOT relocation.
5004 return (GA
->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS
||
5005 GA
->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS
||
5006 GA
->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT
) &&
5007 !shouldEmitGOTReloc(GA
->getGlobal());
5011 buildPCRelGlobalAddress(SelectionDAG
&DAG
, const GlobalValue
*GV
,
5012 const SDLoc
&DL
, unsigned Offset
, EVT PtrVT
,
5013 unsigned GAFlags
= SIInstrInfo::MO_NONE
) {
5014 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
5015 // lowered to the following code sequence:
5017 // For constant address space:
5018 // s_getpc_b64 s[0:1]
5019 // s_add_u32 s0, s0, $symbol
5020 // s_addc_u32 s1, s1, 0
5022 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
5023 // a fixup or relocation is emitted to replace $symbol with a literal
5024 // constant, which is a pc-relative offset from the encoding of the $symbol
5025 // operand to the global variable.
5027 // For global address space:
5028 // s_getpc_b64 s[0:1]
5029 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
5030 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
5032 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
5033 // fixups or relocations are emitted to replace $symbol@*@lo and
5034 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
5035 // which is a 64-bit pc-relative offset from the encoding of the $symbol
5036 // operand to the global variable.
5038 // What we want here is an offset from the value returned by s_getpc
5039 // (which is the address of the s_add_u32 instruction) to the global
5040 // variable, but since the encoding of $symbol starts 4 bytes after the start
5041 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
5042 // small. This requires us to add 4 to the global variable offset in order to
5043 // compute the correct address.
5045 DAG
.getTargetGlobalAddress(GV
, DL
, MVT::i32
, Offset
+ 4, GAFlags
);
5047 if (GAFlags
== SIInstrInfo::MO_NONE
) {
5048 PtrHi
= DAG
.getTargetConstant(0, DL
, MVT::i32
);
5051 DAG
.getTargetGlobalAddress(GV
, DL
, MVT::i32
, Offset
+ 4, GAFlags
+ 1);
5053 return DAG
.getNode(AMDGPUISD::PC_ADD_REL_OFFSET
, DL
, PtrVT
, PtrLo
, PtrHi
);
5056 SDValue
SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction
*MFI
,
5058 SelectionDAG
&DAG
) const {
5059 GlobalAddressSDNode
*GSD
= cast
<GlobalAddressSDNode
>(Op
);
5060 const GlobalValue
*GV
= GSD
->getGlobal();
5061 if ((GSD
->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS
&&
5062 (!GV
->hasExternalLinkage() ||
5063 getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA
||
5064 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL
)) ||
5065 GSD
->getAddressSpace() == AMDGPUAS::REGION_ADDRESS
||
5066 GSD
->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
)
5067 return AMDGPUTargetLowering::LowerGlobalAddress(MFI
, Op
, DAG
);
5070 EVT PtrVT
= Op
.getValueType();
5072 if (GSD
->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS
) {
5073 SDValue GA
= DAG
.getTargetGlobalAddress(GV
, DL
, MVT::i32
, GSD
->getOffset(),
5074 SIInstrInfo::MO_ABS32_LO
);
5075 return DAG
.getNode(AMDGPUISD::LDS
, DL
, MVT::i32
, GA
);
5078 if (shouldEmitFixup(GV
))
5079 return buildPCRelGlobalAddress(DAG
, GV
, DL
, GSD
->getOffset(), PtrVT
);
5080 else if (shouldEmitPCReloc(GV
))
5081 return buildPCRelGlobalAddress(DAG
, GV
, DL
, GSD
->getOffset(), PtrVT
,
5082 SIInstrInfo::MO_REL32
);
5084 SDValue GOTAddr
= buildPCRelGlobalAddress(DAG
, GV
, DL
, 0, PtrVT
,
5085 SIInstrInfo::MO_GOTPCREL32
);
5087 Type
*Ty
= PtrVT
.getTypeForEVT(*DAG
.getContext());
5088 PointerType
*PtrTy
= PointerType::get(Ty
, AMDGPUAS::CONSTANT_ADDRESS
);
5089 const DataLayout
&DataLayout
= DAG
.getDataLayout();
5090 unsigned Align
= DataLayout
.getABITypeAlignment(PtrTy
);
5091 MachinePointerInfo PtrInfo
5092 = MachinePointerInfo::getGOT(DAG
.getMachineFunction());
5094 return DAG
.getLoad(PtrVT
, DL
, DAG
.getEntryNode(), GOTAddr
, PtrInfo
, Align
,
5095 MachineMemOperand::MODereferenceable
|
5096 MachineMemOperand::MOInvariant
);
5099 SDValue
SITargetLowering::copyToM0(SelectionDAG
&DAG
, SDValue Chain
,
5100 const SDLoc
&DL
, SDValue V
) const {
5101 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
5102 // the destination register.
5104 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
5105 // so we will end up with redundant moves to m0.
5107 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
5109 // A Null SDValue creates a glue result.
5110 SDNode
*M0
= DAG
.getMachineNode(AMDGPU::SI_INIT_M0
, DL
, MVT::Other
, MVT::Glue
,
5112 return SDValue(M0
, 0);
5115 SDValue
SITargetLowering::lowerImplicitZextParam(SelectionDAG
&DAG
,
5118 unsigned Offset
) const {
5120 SDValue Param
= lowerKernargMemParameter(DAG
, MVT::i32
, MVT::i32
, SL
,
5121 DAG
.getEntryNode(), Offset
, 4, false);
5122 // The local size values will have the hi 16-bits as zero.
5123 return DAG
.getNode(ISD::AssertZext
, SL
, MVT::i32
, Param
,
5124 DAG
.getValueType(VT
));
5127 static SDValue
emitNonHSAIntrinsicError(SelectionDAG
&DAG
, const SDLoc
&DL
,
5129 DiagnosticInfoUnsupported
BadIntrin(DAG
.getMachineFunction().getFunction(),
5130 "non-hsa intrinsic with hsa target",
5132 DAG
.getContext()->diagnose(BadIntrin
);
5133 return DAG
.getUNDEF(VT
);
5136 static SDValue
emitRemovedIntrinsicError(SelectionDAG
&DAG
, const SDLoc
&DL
,
5138 DiagnosticInfoUnsupported
BadIntrin(DAG
.getMachineFunction().getFunction(),
5139 "intrinsic not supported on subtarget",
5141 DAG
.getContext()->diagnose(BadIntrin
);
5142 return DAG
.getUNDEF(VT
);
5145 static SDValue
getBuildDwordsVector(SelectionDAG
&DAG
, SDLoc DL
,
5146 ArrayRef
<SDValue
> Elts
) {
5147 assert(!Elts
.empty());
5151 if (Elts
.size() == 1) {
5154 } else if (Elts
.size() == 2) {
5157 } else if (Elts
.size() <= 4) {
5160 } else if (Elts
.size() <= 8) {
5164 assert(Elts
.size() <= 16);
5169 SmallVector
<SDValue
, 16> VecElts(NumElts
);
5170 for (unsigned i
= 0; i
< Elts
.size(); ++i
) {
5171 SDValue Elt
= Elts
[i
];
5172 if (Elt
.getValueType() != MVT::f32
)
5173 Elt
= DAG
.getBitcast(MVT::f32
, Elt
);
5176 for (unsigned i
= Elts
.size(); i
< NumElts
; ++i
)
5177 VecElts
[i
] = DAG
.getUNDEF(MVT::f32
);
5181 return DAG
.getBuildVector(Type
, DL
, VecElts
);
5184 static bool parseCachePolicy(SDValue CachePolicy
, SelectionDAG
&DAG
,
5185 SDValue
*GLC
, SDValue
*SLC
, SDValue
*DLC
) {
5186 auto CachePolicyConst
= cast
<ConstantSDNode
>(CachePolicy
.getNode());
5188 uint64_t Value
= CachePolicyConst
->getZExtValue();
5189 SDLoc
DL(CachePolicy
);
5191 *GLC
= DAG
.getTargetConstant((Value
& 0x1) ? 1 : 0, DL
, MVT::i32
);
5192 Value
&= ~(uint64_t)0x1;
5195 *SLC
= DAG
.getTargetConstant((Value
& 0x2) ? 1 : 0, DL
, MVT::i32
);
5196 Value
&= ~(uint64_t)0x2;
5199 *DLC
= DAG
.getTargetConstant((Value
& 0x4) ? 1 : 0, DL
, MVT::i32
);
5200 Value
&= ~(uint64_t)0x4;
5206 // Re-construct the required return value for a image load intrinsic.
5207 // This is more complicated due to the optional use TexFailCtrl which means the required
5208 // return type is an aggregate
5209 static SDValue
constructRetValue(SelectionDAG
&DAG
,
5210 MachineSDNode
*Result
,
5211 ArrayRef
<EVT
> ResultTypes
,
5212 bool IsTexFail
, bool Unpacked
, bool IsD16
,
5213 int DMaskPop
, int NumVDataDwords
,
5214 const SDLoc
&DL
, LLVMContext
&Context
) {
5215 // Determine the required return type. This is the same regardless of IsTexFail flag
5216 EVT ReqRetVT
= ResultTypes
[0];
5217 EVT ReqRetEltVT
= ReqRetVT
.isVector() ? ReqRetVT
.getVectorElementType() : ReqRetVT
;
5218 int ReqRetNumElts
= ReqRetVT
.isVector() ? ReqRetVT
.getVectorNumElements() : 1;
5219 EVT AdjEltVT
= Unpacked
&& IsD16
? MVT::i32
: ReqRetEltVT
;
5220 EVT AdjVT
= Unpacked
? ReqRetNumElts
> 1 ? EVT::getVectorVT(Context
, AdjEltVT
, ReqRetNumElts
)
5224 // Extract data part of the result
5225 // Bitcast the result to the same type as the required return type
5227 if (IsD16
&& !Unpacked
)
5228 NumElts
= NumVDataDwords
<< 1;
5230 NumElts
= NumVDataDwords
;
5232 EVT CastVT
= NumElts
> 1 ? EVT::getVectorVT(Context
, AdjEltVT
, NumElts
)
5235 // Special case for v6f16. Rather than add support for this, use v3i32 to
5236 // extract the data elements
5237 bool V6F16Special
= false;
5239 CastVT
= EVT::getVectorVT(Context
, MVT::i32
, NumElts
/ 2);
5241 ReqRetNumElts
>>= 1;
5242 V6F16Special
= true;
5246 SDValue N
= SDValue(Result
, 0);
5247 SDValue CastRes
= DAG
.getNode(ISD::BITCAST
, DL
, CastVT
, N
);
5249 // Iterate over the result
5250 SmallVector
<SDValue
, 4> BVElts
;
5252 if (CastVT
.isVector()) {
5253 DAG
.ExtractVectorElements(CastRes
, BVElts
, 0, DMaskPop
);
5255 BVElts
.push_back(CastRes
);
5257 int ExtraElts
= ReqRetNumElts
- DMaskPop
;
5259 BVElts
.push_back(DAG
.getUNDEF(AdjEltVT
));
5262 if (ReqRetNumElts
> 1) {
5263 SDValue NewVec
= DAG
.getBuildVector(AdjVT
, DL
, BVElts
);
5264 if (IsD16
&& Unpacked
)
5265 PreTFCRes
= adjustLoadValueTypeImpl(NewVec
, ReqRetVT
, DL
, DAG
, Unpacked
);
5269 PreTFCRes
= BVElts
[0];
5273 PreTFCRes
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::v4f16
, PreTFCRes
);
5276 if (Result
->getNumValues() > 1)
5277 return DAG
.getMergeValues({PreTFCRes
, SDValue(Result
, 1)}, DL
);
5282 // Extract the TexFail result and insert into aggregate return
5283 SmallVector
<SDValue
, 1> TFCElt
;
5284 DAG
.ExtractVectorElements(N
, TFCElt
, DMaskPop
, 1);
5285 SDValue TFCRes
= DAG
.getNode(ISD::BITCAST
, DL
, ResultTypes
[1], TFCElt
[0]);
5286 return DAG
.getMergeValues({PreTFCRes
, TFCRes
, SDValue(Result
, 1)}, DL
);
5289 static bool parseTexFail(SDValue TexFailCtrl
, SelectionDAG
&DAG
, SDValue
*TFE
,
5290 SDValue
*LWE
, bool &IsTexFail
) {
5291 auto TexFailCtrlConst
= cast
<ConstantSDNode
>(TexFailCtrl
.getNode());
5293 uint64_t Value
= TexFailCtrlConst
->getZExtValue();
5298 SDLoc
DL(TexFailCtrlConst
);
5299 *TFE
= DAG
.getTargetConstant((Value
& 0x1) ? 1 : 0, DL
, MVT::i32
);
5300 Value
&= ~(uint64_t)0x1;
5301 *LWE
= DAG
.getTargetConstant((Value
& 0x2) ? 1 : 0, DL
, MVT::i32
);
5302 Value
&= ~(uint64_t)0x2;
5307 SDValue
SITargetLowering::lowerImage(SDValue Op
,
5308 const AMDGPU::ImageDimIntrinsicInfo
*Intr
,
5309 SelectionDAG
&DAG
) const {
5311 MachineFunction
&MF
= DAG
.getMachineFunction();
5312 const GCNSubtarget
* ST
= &MF
.getSubtarget
<GCNSubtarget
>();
5313 const AMDGPU::MIMGBaseOpcodeInfo
*BaseOpcode
=
5314 AMDGPU::getMIMGBaseOpcodeInfo(Intr
->BaseOpcode
);
5315 const AMDGPU::MIMGDimInfo
*DimInfo
= AMDGPU::getMIMGDimInfo(Intr
->Dim
);
5316 const AMDGPU::MIMGLZMappingInfo
*LZMappingInfo
=
5317 AMDGPU::getMIMGLZMappingInfo(Intr
->BaseOpcode
);
5318 const AMDGPU::MIMGMIPMappingInfo
*MIPMappingInfo
=
5319 AMDGPU::getMIMGMIPMappingInfo(Intr
->BaseOpcode
);
5320 unsigned IntrOpcode
= Intr
->BaseOpcode
;
5321 bool IsGFX10
= Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
;
5323 SmallVector
<EVT
, 3> ResultTypes(Op
->value_begin(), Op
->value_end());
5324 SmallVector
<EVT
, 3> OrigResultTypes(Op
->value_begin(), Op
->value_end());
5329 bool AdjustRetType
= false;
5331 unsigned AddrIdx
; // Index of first address argument
5333 unsigned DMaskLanes
= 0;
5335 if (BaseOpcode
->Atomic
) {
5336 VData
= Op
.getOperand(2);
5338 bool Is64Bit
= VData
.getValueType() == MVT::i64
;
5339 if (BaseOpcode
->AtomicX2
) {
5340 SDValue VData2
= Op
.getOperand(3);
5341 VData
= DAG
.getBuildVector(Is64Bit
? MVT::v2i64
: MVT::v2i32
, DL
,
5344 VData
= DAG
.getBitcast(MVT::v4i32
, VData
);
5346 ResultTypes
[0] = Is64Bit
? MVT::v2i64
: MVT::v2i32
;
5347 DMask
= Is64Bit
? 0xf : 0x3;
5348 NumVDataDwords
= Is64Bit
? 4 : 2;
5351 DMask
= Is64Bit
? 0x3 : 0x1;
5352 NumVDataDwords
= Is64Bit
? 2 : 1;
5356 unsigned DMaskIdx
= BaseOpcode
->Store
? 3 : isa
<MemSDNode
>(Op
) ? 2 : 1;
5357 auto DMaskConst
= cast
<ConstantSDNode
>(Op
.getOperand(DMaskIdx
));
5358 DMask
= DMaskConst
->getZExtValue();
5359 DMaskLanes
= BaseOpcode
->Gather4
? 4 : countPopulation(DMask
);
5361 if (BaseOpcode
->Store
) {
5362 VData
= Op
.getOperand(2);
5364 MVT StoreVT
= VData
.getSimpleValueType();
5365 if (StoreVT
.getScalarType() == MVT::f16
) {
5366 if (!Subtarget
->hasD16Images() || !BaseOpcode
->HasD16
)
5367 return Op
; // D16 is unsupported for this instruction
5370 VData
= handleD16VData(VData
, DAG
);
5373 NumVDataDwords
= (VData
.getValueType().getSizeInBits() + 31) / 32;
5375 // Work out the num dwords based on the dmask popcount and underlying type
5376 // and whether packing is supported.
5377 MVT LoadVT
= ResultTypes
[0].getSimpleVT();
5378 if (LoadVT
.getScalarType() == MVT::f16
) {
5379 if (!Subtarget
->hasD16Images() || !BaseOpcode
->HasD16
)
5380 return Op
; // D16 is unsupported for this instruction
5385 // Confirm that the return type is large enough for the dmask specified
5386 if ((LoadVT
.isVector() && LoadVT
.getVectorNumElements() < DMaskLanes
) ||
5387 (!LoadVT
.isVector() && DMaskLanes
> 1))
5390 if (IsD16
&& !Subtarget
->hasUnpackedD16VMem())
5391 NumVDataDwords
= (DMaskLanes
+ 1) / 2;
5393 NumVDataDwords
= DMaskLanes
;
5395 AdjustRetType
= true;
5398 AddrIdx
= DMaskIdx
+ 1;
5401 unsigned NumGradients
= BaseOpcode
->Gradients
? DimInfo
->NumGradients
: 0;
5402 unsigned NumCoords
= BaseOpcode
->Coordinates
? DimInfo
->NumCoords
: 0;
5403 unsigned NumLCM
= BaseOpcode
->LodOrClampOrMip
? 1 : 0;
5404 unsigned NumVAddrs
= BaseOpcode
->NumExtraArgs
+ NumGradients
+
5406 unsigned NumMIVAddrs
= NumVAddrs
;
5408 SmallVector
<SDValue
, 4> VAddrs
;
5410 // Optimize _L to _LZ when _L is zero
5411 if (LZMappingInfo
) {
5412 if (auto ConstantLod
=
5413 dyn_cast
<ConstantFPSDNode
>(Op
.getOperand(AddrIdx
+NumVAddrs
-1))) {
5414 if (ConstantLod
->isZero() || ConstantLod
->isNegative()) {
5415 IntrOpcode
= LZMappingInfo
->LZ
; // set new opcode to _lz variant of _l
5416 NumMIVAddrs
--; // remove 'lod'
5421 // Optimize _mip away, when 'lod' is zero
5422 if (MIPMappingInfo
) {
5423 if (auto ConstantLod
=
5424 dyn_cast
<ConstantSDNode
>(Op
.getOperand(AddrIdx
+NumVAddrs
-1))) {
5425 if (ConstantLod
->isNullValue()) {
5426 IntrOpcode
= MIPMappingInfo
->NONMIP
; // set new opcode to variant without _mip
5427 NumMIVAddrs
--; // remove 'lod'
5432 // Check for 16 bit addresses and pack if true.
5433 unsigned DimIdx
= AddrIdx
+ BaseOpcode
->NumExtraArgs
;
5434 MVT VAddrVT
= Op
.getOperand(DimIdx
).getSimpleValueType();
5435 const MVT VAddrScalarVT
= VAddrVT
.getScalarType();
5436 if (((VAddrScalarVT
== MVT::f16
) || (VAddrScalarVT
== MVT::i16
)) &&
5437 ST
->hasFeature(AMDGPU::FeatureR128A16
)) {
5439 const MVT VectorVT
= VAddrScalarVT
== MVT::f16
? MVT::v2f16
: MVT::v2i16
;
5440 for (unsigned i
= AddrIdx
; i
< (AddrIdx
+ NumMIVAddrs
); ++i
) {
5441 SDValue AddrLo
, AddrHi
;
5442 // Push back extra arguments.
5444 AddrLo
= Op
.getOperand(i
);
5446 AddrLo
= Op
.getOperand(i
);
5447 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
5448 // in 1D, derivatives dx/dh and dx/dv are packed with undef.
5449 if (((i
+ 1) >= (AddrIdx
+ NumMIVAddrs
)) ||
5450 ((NumGradients
/ 2) % 2 == 1 &&
5451 (i
== DimIdx
+ (NumGradients
/ 2) - 1 ||
5452 i
== DimIdx
+ NumGradients
- 1))) {
5453 AddrHi
= DAG
.getUNDEF(MVT::f16
);
5455 AddrHi
= Op
.getOperand(i
+ 1);
5458 AddrLo
= DAG
.getNode(ISD::SCALAR_TO_VECTOR
, DL
, VectorVT
,
5460 AddrLo
= DAG
.getBitcast(MVT::i32
, AddrLo
);
5462 VAddrs
.push_back(AddrLo
);
5465 for (unsigned i
= 0; i
< NumMIVAddrs
; ++i
)
5466 VAddrs
.push_back(Op
.getOperand(AddrIdx
+ i
));
5469 // If the register allocator cannot place the address registers contiguously
5470 // without introducing moves, then using the non-sequential address encoding
5471 // is always preferable, since it saves VALU instructions and is usually a
5472 // wash in terms of code size or even better.
5474 // However, we currently have no way of hinting to the register allocator that
5475 // MIMG addresses should be placed contiguously when it is possible to do so,
5476 // so force non-NSA for the common 2-address case as a heuristic.
5478 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5479 // allocation when possible.
5481 ST
->hasFeature(AMDGPU::FeatureNSAEncoding
) && VAddrs
.size() >= 3;
5484 VAddr
= getBuildDwordsVector(DAG
, DL
, VAddrs
);
5486 SDValue True
= DAG
.getTargetConstant(1, DL
, MVT::i1
);
5487 SDValue False
= DAG
.getTargetConstant(0, DL
, MVT::i1
);
5488 unsigned CtrlIdx
; // Index of texfailctrl argument
5490 if (!BaseOpcode
->Sampler
) {
5492 CtrlIdx
= AddrIdx
+ NumVAddrs
+ 1;
5495 cast
<ConstantSDNode
>(Op
.getOperand(AddrIdx
+ NumVAddrs
+ 2));
5497 Unorm
= UnormConst
->getZExtValue() ? True
: False
;
5498 CtrlIdx
= AddrIdx
+ NumVAddrs
+ 3;
5503 SDValue TexFail
= Op
.getOperand(CtrlIdx
);
5504 bool IsTexFail
= false;
5505 if (!parseTexFail(TexFail
, DAG
, &TFE
, &LWE
, IsTexFail
))
5510 // Expecting to get an error flag since TFC is on - and dmask is 0
5511 // Force dmask to be at least 1 otherwise the instruction will fail
5516 NumVDataDwords
+= 1;
5517 AdjustRetType
= true;
5520 // Has something earlier tagged that the return type needs adjusting
5521 // This happens if the instruction is a load or has set TexFailCtrl flags
5522 if (AdjustRetType
) {
5523 // NumVDataDwords reflects the true number of dwords required in the return type
5524 if (DMaskLanes
== 0 && !BaseOpcode
->Store
) {
5525 // This is a no-op load. This can be eliminated
5526 SDValue Undef
= DAG
.getUNDEF(Op
.getValueType());
5527 if (isa
<MemSDNode
>(Op
))
5528 return DAG
.getMergeValues({Undef
, Op
.getOperand(0)}, DL
);
5532 EVT NewVT
= NumVDataDwords
> 1 ?
5533 EVT::getVectorVT(*DAG
.getContext(), MVT::f32
, NumVDataDwords
)
5536 ResultTypes
[0] = NewVT
;
5537 if (ResultTypes
.size() == 3) {
5538 // Original result was aggregate type used for TexFailCtrl results
5539 // The actual instruction returns as a vector type which has now been
5540 // created. Remove the aggregate result.
5541 ResultTypes
.erase(&ResultTypes
[1]);
5548 if (BaseOpcode
->Atomic
) {
5549 GLC
= True
; // TODO no-return optimization
5550 if (!parseCachePolicy(Op
.getOperand(CtrlIdx
+ 1), DAG
, nullptr, &SLC
,
5551 IsGFX10
? &DLC
: nullptr))
5554 if (!parseCachePolicy(Op
.getOperand(CtrlIdx
+ 1), DAG
, &GLC
, &SLC
,
5555 IsGFX10
? &DLC
: nullptr))
5559 SmallVector
<SDValue
, 26> Ops
;
5560 if (BaseOpcode
->Store
|| BaseOpcode
->Atomic
)
5561 Ops
.push_back(VData
); // vdata
5563 for (const SDValue
&Addr
: VAddrs
)
5564 Ops
.push_back(Addr
);
5566 Ops
.push_back(VAddr
);
5568 Ops
.push_back(Op
.getOperand(AddrIdx
+ NumVAddrs
)); // rsrc
5569 if (BaseOpcode
->Sampler
)
5570 Ops
.push_back(Op
.getOperand(AddrIdx
+ NumVAddrs
+ 1)); // sampler
5571 Ops
.push_back(DAG
.getTargetConstant(DMask
, DL
, MVT::i32
));
5573 Ops
.push_back(DAG
.getTargetConstant(DimInfo
->Encoding
, DL
, MVT::i32
));
5574 Ops
.push_back(Unorm
);
5579 Ops
.push_back(IsA16
&& // a16 or r128
5580 ST
->hasFeature(AMDGPU::FeatureR128A16
) ? True
: False
);
5581 Ops
.push_back(TFE
); // tfe
5582 Ops
.push_back(LWE
); // lwe
5584 Ops
.push_back(DimInfo
->DA
? True
: False
);
5585 if (BaseOpcode
->HasD16
)
5586 Ops
.push_back(IsD16
? True
: False
);
5587 if (isa
<MemSDNode
>(Op
))
5588 Ops
.push_back(Op
.getOperand(0)); // chain
5590 int NumVAddrDwords
=
5591 UseNSA
? VAddrs
.size() : VAddr
.getValueType().getSizeInBits() / 32;
5595 Opcode
= AMDGPU::getMIMGOpcode(IntrOpcode
,
5596 UseNSA
? AMDGPU::MIMGEncGfx10NSA
5597 : AMDGPU::MIMGEncGfx10Default
,
5598 NumVDataDwords
, NumVAddrDwords
);
5600 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5601 Opcode
= AMDGPU::getMIMGOpcode(IntrOpcode
, AMDGPU::MIMGEncGfx8
,
5602 NumVDataDwords
, NumVAddrDwords
);
5604 Opcode
= AMDGPU::getMIMGOpcode(IntrOpcode
, AMDGPU::MIMGEncGfx6
,
5605 NumVDataDwords
, NumVAddrDwords
);
5607 assert(Opcode
!= -1);
5609 MachineSDNode
*NewNode
= DAG
.getMachineNode(Opcode
, DL
, ResultTypes
, Ops
);
5610 if (auto MemOp
= dyn_cast
<MemSDNode
>(Op
)) {
5611 MachineMemOperand
*MemRef
= MemOp
->getMemOperand();
5612 DAG
.setNodeMemRefs(NewNode
, {MemRef
});
5615 if (BaseOpcode
->AtomicX2
) {
5616 SmallVector
<SDValue
, 1> Elt
;
5617 DAG
.ExtractVectorElements(SDValue(NewNode
, 0), Elt
, 0, 1);
5618 return DAG
.getMergeValues({Elt
[0], SDValue(NewNode
, 1)}, DL
);
5619 } else if (!BaseOpcode
->Store
) {
5620 return constructRetValue(DAG
, NewNode
,
5621 OrigResultTypes
, IsTexFail
,
5622 Subtarget
->hasUnpackedD16VMem(), IsD16
,
5623 DMaskLanes
, NumVDataDwords
, DL
,
5627 return SDValue(NewNode
, 0);
5630 SDValue
SITargetLowering::lowerSBuffer(EVT VT
, SDLoc DL
, SDValue Rsrc
,
5631 SDValue Offset
, SDValue GLC
, SDValue DLC
,
5632 SelectionDAG
&DAG
) const {
5633 MachineFunction
&MF
= DAG
.getMachineFunction();
5634 MachineMemOperand
*MMO
= MF
.getMachineMemOperand(
5635 MachinePointerInfo(),
5636 MachineMemOperand::MOLoad
| MachineMemOperand::MODereferenceable
|
5637 MachineMemOperand::MOInvariant
,
5638 VT
.getStoreSize(), VT
.getStoreSize());
5640 if (!Offset
->isDivergent()) {
5647 return DAG
.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD
, DL
,
5648 DAG
.getVTList(VT
), Ops
, VT
, MMO
);
5651 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
5652 // assume that the buffer is unswizzled.
5653 SmallVector
<SDValue
, 4> Loads
;
5654 unsigned NumLoads
= 1;
5655 MVT LoadVT
= VT
.getSimpleVT();
5656 unsigned NumElts
= LoadVT
.isVector() ? LoadVT
.getVectorNumElements() : 1;
5657 assert((LoadVT
.getScalarType() == MVT::i32
||
5658 LoadVT
.getScalarType() == MVT::f32
) &&
5659 isPowerOf2_32(NumElts
));
5661 if (NumElts
== 8 || NumElts
== 16) {
5662 NumLoads
= NumElts
== 16 ? 4 : 2;
5663 LoadVT
= MVT::v4i32
;
5666 SDVTList VTList
= DAG
.getVTList({LoadVT
, MVT::Glue
});
5667 unsigned CachePolicy
= cast
<ConstantSDNode
>(GLC
)->getZExtValue();
5669 DAG
.getEntryNode(), // Chain
5671 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
5675 DAG
.getTargetConstant(CachePolicy
, DL
, MVT::i32
), // cachepolicy
5676 DAG
.getTargetConstant(0, DL
, MVT::i1
), // idxen
5679 // Use the alignment to ensure that the required offsets will fit into the
5680 // immediate offsets.
5681 setBufferOffsets(Offset
, DAG
, &Ops
[3], NumLoads
> 1 ? 16 * NumLoads
: 4);
5683 uint64_t InstOffset
= cast
<ConstantSDNode
>(Ops
[5])->getZExtValue();
5684 for (unsigned i
= 0; i
< NumLoads
; ++i
) {
5685 Ops
[5] = DAG
.getTargetConstant(InstOffset
+ 16 * i
, DL
, MVT::i32
);
5686 Loads
.push_back(DAG
.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD
, DL
, VTList
,
5690 if (VT
== MVT::v8i32
|| VT
== MVT::v16i32
)
5691 return DAG
.getNode(ISD::CONCAT_VECTORS
, DL
, VT
, Loads
);
5696 SDValue
SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op
,
5697 SelectionDAG
&DAG
) const {
5698 MachineFunction
&MF
= DAG
.getMachineFunction();
5699 auto MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
5701 EVT VT
= Op
.getValueType();
5703 unsigned IntrinsicID
= cast
<ConstantSDNode
>(Op
.getOperand(0))->getZExtValue();
5705 // TODO: Should this propagate fast-math-flags?
5707 switch (IntrinsicID
) {
5708 case Intrinsic::amdgcn_implicit_buffer_ptr
: {
5709 if (getSubtarget()->isAmdHsaOrMesa(MF
.getFunction()))
5710 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5711 return getPreloadedValue(DAG
, *MFI
, VT
,
5712 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR
);
5714 case Intrinsic::amdgcn_dispatch_ptr
:
5715 case Intrinsic::amdgcn_queue_ptr
: {
5716 if (!Subtarget
->isAmdHsaOrMesa(MF
.getFunction())) {
5717 DiagnosticInfoUnsupported
BadIntrin(
5718 MF
.getFunction(), "unsupported hsa intrinsic without hsa target",
5720 DAG
.getContext()->diagnose(BadIntrin
);
5721 return DAG
.getUNDEF(VT
);
5724 auto RegID
= IntrinsicID
== Intrinsic::amdgcn_dispatch_ptr
?
5725 AMDGPUFunctionArgInfo::DISPATCH_PTR
: AMDGPUFunctionArgInfo::QUEUE_PTR
;
5726 return getPreloadedValue(DAG
, *MFI
, VT
, RegID
);
5728 case Intrinsic::amdgcn_implicitarg_ptr
: {
5729 if (MFI
->isEntryFunction())
5730 return getImplicitArgPtr(DAG
, DL
);
5731 return getPreloadedValue(DAG
, *MFI
, VT
,
5732 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
);
5734 case Intrinsic::amdgcn_kernarg_segment_ptr
: {
5735 return getPreloadedValue(DAG
, *MFI
, VT
,
5736 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
);
5738 case Intrinsic::amdgcn_dispatch_id
: {
5739 return getPreloadedValue(DAG
, *MFI
, VT
, AMDGPUFunctionArgInfo::DISPATCH_ID
);
5741 case Intrinsic::amdgcn_rcp
:
5742 return DAG
.getNode(AMDGPUISD::RCP
, DL
, VT
, Op
.getOperand(1));
5743 case Intrinsic::amdgcn_rsq
:
5744 return DAG
.getNode(AMDGPUISD::RSQ
, DL
, VT
, Op
.getOperand(1));
5745 case Intrinsic::amdgcn_rsq_legacy
:
5746 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5747 return emitRemovedIntrinsicError(DAG
, DL
, VT
);
5749 return DAG
.getNode(AMDGPUISD::RSQ_LEGACY
, DL
, VT
, Op
.getOperand(1));
5750 case Intrinsic::amdgcn_rcp_legacy
:
5751 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5752 return emitRemovedIntrinsicError(DAG
, DL
, VT
);
5753 return DAG
.getNode(AMDGPUISD::RCP_LEGACY
, DL
, VT
, Op
.getOperand(1));
5754 case Intrinsic::amdgcn_rsq_clamp
: {
5755 if (Subtarget
->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5756 return DAG
.getNode(AMDGPUISD::RSQ_CLAMP
, DL
, VT
, Op
.getOperand(1));
5758 Type
*Type
= VT
.getTypeForEVT(*DAG
.getContext());
5759 APFloat Max
= APFloat::getLargest(Type
->getFltSemantics());
5760 APFloat Min
= APFloat::getLargest(Type
->getFltSemantics(), true);
5762 SDValue Rsq
= DAG
.getNode(AMDGPUISD::RSQ
, DL
, VT
, Op
.getOperand(1));
5763 SDValue Tmp
= DAG
.getNode(ISD::FMINNUM
, DL
, VT
, Rsq
,
5764 DAG
.getConstantFP(Max
, DL
, VT
));
5765 return DAG
.getNode(ISD::FMAXNUM
, DL
, VT
, Tmp
,
5766 DAG
.getConstantFP(Min
, DL
, VT
));
5768 case Intrinsic::r600_read_ngroups_x
:
5769 if (Subtarget
->isAmdHsaOS())
5770 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5772 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5773 SI::KernelInputOffsets::NGROUPS_X
, 4, false);
5774 case Intrinsic::r600_read_ngroups_y
:
5775 if (Subtarget
->isAmdHsaOS())
5776 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5778 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5779 SI::KernelInputOffsets::NGROUPS_Y
, 4, false);
5780 case Intrinsic::r600_read_ngroups_z
:
5781 if (Subtarget
->isAmdHsaOS())
5782 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5784 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5785 SI::KernelInputOffsets::NGROUPS_Z
, 4, false);
5786 case Intrinsic::r600_read_global_size_x
:
5787 if (Subtarget
->isAmdHsaOS())
5788 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5790 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5791 SI::KernelInputOffsets::GLOBAL_SIZE_X
, 4, false);
5792 case Intrinsic::r600_read_global_size_y
:
5793 if (Subtarget
->isAmdHsaOS())
5794 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5796 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5797 SI::KernelInputOffsets::GLOBAL_SIZE_Y
, 4, false);
5798 case Intrinsic::r600_read_global_size_z
:
5799 if (Subtarget
->isAmdHsaOS())
5800 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5802 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5803 SI::KernelInputOffsets::GLOBAL_SIZE_Z
, 4, false);
5804 case Intrinsic::r600_read_local_size_x
:
5805 if (Subtarget
->isAmdHsaOS())
5806 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5808 return lowerImplicitZextParam(DAG
, Op
, MVT::i16
,
5809 SI::KernelInputOffsets::LOCAL_SIZE_X
);
5810 case Intrinsic::r600_read_local_size_y
:
5811 if (Subtarget
->isAmdHsaOS())
5812 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5814 return lowerImplicitZextParam(DAG
, Op
, MVT::i16
,
5815 SI::KernelInputOffsets::LOCAL_SIZE_Y
);
5816 case Intrinsic::r600_read_local_size_z
:
5817 if (Subtarget
->isAmdHsaOS())
5818 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5820 return lowerImplicitZextParam(DAG
, Op
, MVT::i16
,
5821 SI::KernelInputOffsets::LOCAL_SIZE_Z
);
5822 case Intrinsic::amdgcn_workgroup_id_x
:
5823 case Intrinsic::r600_read_tgid_x
:
5824 return getPreloadedValue(DAG
, *MFI
, VT
,
5825 AMDGPUFunctionArgInfo::WORKGROUP_ID_X
);
5826 case Intrinsic::amdgcn_workgroup_id_y
:
5827 case Intrinsic::r600_read_tgid_y
:
5828 return getPreloadedValue(DAG
, *MFI
, VT
,
5829 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y
);
5830 case Intrinsic::amdgcn_workgroup_id_z
:
5831 case Intrinsic::r600_read_tgid_z
:
5832 return getPreloadedValue(DAG
, *MFI
, VT
,
5833 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
);
5834 case Intrinsic::amdgcn_workitem_id_x
:
5835 case Intrinsic::r600_read_tidig_x
:
5836 return loadInputValue(DAG
, &AMDGPU::VGPR_32RegClass
, MVT::i32
,
5837 SDLoc(DAG
.getEntryNode()),
5838 MFI
->getArgInfo().WorkItemIDX
);
5839 case Intrinsic::amdgcn_workitem_id_y
:
5840 case Intrinsic::r600_read_tidig_y
:
5841 return loadInputValue(DAG
, &AMDGPU::VGPR_32RegClass
, MVT::i32
,
5842 SDLoc(DAG
.getEntryNode()),
5843 MFI
->getArgInfo().WorkItemIDY
);
5844 case Intrinsic::amdgcn_workitem_id_z
:
5845 case Intrinsic::r600_read_tidig_z
:
5846 return loadInputValue(DAG
, &AMDGPU::VGPR_32RegClass
, MVT::i32
,
5847 SDLoc(DAG
.getEntryNode()),
5848 MFI
->getArgInfo().WorkItemIDZ
);
5849 case Intrinsic::amdgcn_wavefrontsize
:
5850 return DAG
.getConstant(MF
.getSubtarget
<GCNSubtarget
>().getWavefrontSize(),
5851 SDLoc(Op
), MVT::i32
);
5852 case Intrinsic::amdgcn_s_buffer_load
: {
5853 bool IsGFX10
= Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
;
5855 SDValue DLC
= DAG
.getTargetConstant(0, DL
, MVT::i1
);
5856 if (!parseCachePolicy(Op
.getOperand(3), DAG
, &GLC
, nullptr,
5857 IsGFX10
? &DLC
: nullptr))
5859 return lowerSBuffer(VT
, DL
, Op
.getOperand(1), Op
.getOperand(2), GLC
, DLC
,
5862 case Intrinsic::amdgcn_fdiv_fast
:
5863 return lowerFDIV_FAST(Op
, DAG
);
5864 case Intrinsic::amdgcn_interp_mov
: {
5865 SDValue M0
= copyToM0(DAG
, DAG
.getEntryNode(), DL
, Op
.getOperand(4));
5866 SDValue Glue
= M0
.getValue(1);
5867 return DAG
.getNode(AMDGPUISD::INTERP_MOV
, DL
, MVT::f32
, Op
.getOperand(1),
5868 Op
.getOperand(2), Op
.getOperand(3), Glue
);
5870 case Intrinsic::amdgcn_interp_p1
: {
5871 SDValue M0
= copyToM0(DAG
, DAG
.getEntryNode(), DL
, Op
.getOperand(4));
5872 SDValue Glue
= M0
.getValue(1);
5873 return DAG
.getNode(AMDGPUISD::INTERP_P1
, DL
, MVT::f32
, Op
.getOperand(1),
5874 Op
.getOperand(2), Op
.getOperand(3), Glue
);
5876 case Intrinsic::amdgcn_interp_p2
: {
5877 SDValue M0
= copyToM0(DAG
, DAG
.getEntryNode(), DL
, Op
.getOperand(5));
5878 SDValue Glue
= SDValue(M0
.getNode(), 1);
5879 return DAG
.getNode(AMDGPUISD::INTERP_P2
, DL
, MVT::f32
, Op
.getOperand(1),
5880 Op
.getOperand(2), Op
.getOperand(3), Op
.getOperand(4),
5883 case Intrinsic::amdgcn_interp_p1_f16
: {
5884 SDValue M0
= copyToM0(DAG
, DAG
.getEntryNode(), DL
, Op
.getOperand(5));
5885 SDValue Glue
= M0
.getValue(1);
5886 if (getSubtarget()->getLDSBankCount() == 16) {
5888 SDValue S
= DAG
.getNode(AMDGPUISD::INTERP_MOV
, DL
, MVT::f32
,
5889 DAG
.getConstant(2, DL
, MVT::i32
), // P0
5890 Op
.getOperand(2), // Attrchan
5891 Op
.getOperand(3), // Attr
5894 Op
.getOperand(1), // Src0
5895 Op
.getOperand(2), // Attrchan
5896 Op
.getOperand(3), // Attr
5897 DAG
.getTargetConstant(0, DL
, MVT::i32
), // $src0_modifiers
5898 S
, // Src2 - holds two f16 values selected by high
5899 DAG
.getTargetConstant(0, DL
, MVT::i32
), // $src2_modifiers
5900 Op
.getOperand(4), // high
5901 DAG
.getTargetConstant(0, DL
, MVT::i1
), // $clamp
5902 DAG
.getTargetConstant(0, DL
, MVT::i32
) // $omod
5904 return DAG
.getNode(AMDGPUISD::INTERP_P1LV_F16
, DL
, MVT::f32
, Ops
);
5908 Op
.getOperand(1), // Src0
5909 Op
.getOperand(2), // Attrchan
5910 Op
.getOperand(3), // Attr
5911 DAG
.getTargetConstant(0, DL
, MVT::i32
), // $src0_modifiers
5912 Op
.getOperand(4), // high
5913 DAG
.getTargetConstant(0, DL
, MVT::i1
), // $clamp
5914 DAG
.getTargetConstant(0, DL
, MVT::i32
), // $omod
5917 return DAG
.getNode(AMDGPUISD::INTERP_P1LL_F16
, DL
, MVT::f32
, Ops
);
5920 case Intrinsic::amdgcn_interp_p2_f16
: {
5921 SDValue M0
= copyToM0(DAG
, DAG
.getEntryNode(), DL
, Op
.getOperand(6));
5922 SDValue Glue
= SDValue(M0
.getNode(), 1);
5924 Op
.getOperand(2), // Src0
5925 Op
.getOperand(3), // Attrchan
5926 Op
.getOperand(4), // Attr
5927 DAG
.getTargetConstant(0, DL
, MVT::i32
), // $src0_modifiers
5928 Op
.getOperand(1), // Src2
5929 DAG
.getTargetConstant(0, DL
, MVT::i32
), // $src2_modifiers
5930 Op
.getOperand(5), // high
5931 DAG
.getTargetConstant(0, DL
, MVT::i1
), // $clamp
5934 return DAG
.getNode(AMDGPUISD::INTERP_P2_F16
, DL
, MVT::f16
, Ops
);
5936 case Intrinsic::amdgcn_sin
:
5937 return DAG
.getNode(AMDGPUISD::SIN_HW
, DL
, VT
, Op
.getOperand(1));
5939 case Intrinsic::amdgcn_cos
:
5940 return DAG
.getNode(AMDGPUISD::COS_HW
, DL
, VT
, Op
.getOperand(1));
5942 case Intrinsic::amdgcn_mul_u24
:
5943 return DAG
.getNode(AMDGPUISD::MUL_U24
, DL
, VT
, Op
.getOperand(1), Op
.getOperand(2));
5944 case Intrinsic::amdgcn_mul_i24
:
5945 return DAG
.getNode(AMDGPUISD::MUL_I24
, DL
, VT
, Op
.getOperand(1), Op
.getOperand(2));
5947 case Intrinsic::amdgcn_log_clamp
: {
5948 if (Subtarget
->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5951 DiagnosticInfoUnsupported
BadIntrin(
5952 MF
.getFunction(), "intrinsic not supported on subtarget",
5954 DAG
.getContext()->diagnose(BadIntrin
);
5955 return DAG
.getUNDEF(VT
);
5957 case Intrinsic::amdgcn_ldexp
:
5958 return DAG
.getNode(AMDGPUISD::LDEXP
, DL
, VT
,
5959 Op
.getOperand(1), Op
.getOperand(2));
5961 case Intrinsic::amdgcn_fract
:
5962 return DAG
.getNode(AMDGPUISD::FRACT
, DL
, VT
, Op
.getOperand(1));
5964 case Intrinsic::amdgcn_class
:
5965 return DAG
.getNode(AMDGPUISD::FP_CLASS
, DL
, VT
,
5966 Op
.getOperand(1), Op
.getOperand(2));
5967 case Intrinsic::amdgcn_div_fmas
:
5968 return DAG
.getNode(AMDGPUISD::DIV_FMAS
, DL
, VT
,
5969 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3),
5972 case Intrinsic::amdgcn_div_fixup
:
5973 return DAG
.getNode(AMDGPUISD::DIV_FIXUP
, DL
, VT
,
5974 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
5976 case Intrinsic::amdgcn_trig_preop
:
5977 return DAG
.getNode(AMDGPUISD::TRIG_PREOP
, DL
, VT
,
5978 Op
.getOperand(1), Op
.getOperand(2));
5979 case Intrinsic::amdgcn_div_scale
: {
5980 const ConstantSDNode
*Param
= cast
<ConstantSDNode
>(Op
.getOperand(3));
5982 // Translate to the operands expected by the machine instruction. The
5983 // first parameter must be the same as the first instruction.
5984 SDValue Numerator
= Op
.getOperand(1);
5985 SDValue Denominator
= Op
.getOperand(2);
5987 // Note this order is opposite of the machine instruction's operations,
5988 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
5989 // intrinsic has the numerator as the first operand to match a normal
5990 // division operation.
5992 SDValue Src0
= Param
->isAllOnesValue() ? Numerator
: Denominator
;
5994 return DAG
.getNode(AMDGPUISD::DIV_SCALE
, DL
, Op
->getVTList(), Src0
,
5995 Denominator
, Numerator
);
5997 case Intrinsic::amdgcn_icmp
: {
5998 // There is a Pat that handles this variant, so return it as-is.
5999 if (Op
.getOperand(1).getValueType() == MVT::i1
&&
6000 Op
.getConstantOperandVal(2) == 0 &&
6001 Op
.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE
)
6003 return lowerICMPIntrinsic(*this, Op
.getNode(), DAG
);
6005 case Intrinsic::amdgcn_fcmp
: {
6006 return lowerFCMPIntrinsic(*this, Op
.getNode(), DAG
);
6008 case Intrinsic::amdgcn_fmed3
:
6009 return DAG
.getNode(AMDGPUISD::FMED3
, DL
, VT
,
6010 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
6011 case Intrinsic::amdgcn_fdot2
:
6012 return DAG
.getNode(AMDGPUISD::FDOT2
, DL
, VT
,
6013 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3),
6015 case Intrinsic::amdgcn_fmul_legacy
:
6016 return DAG
.getNode(AMDGPUISD::FMUL_LEGACY
, DL
, VT
,
6017 Op
.getOperand(1), Op
.getOperand(2));
6018 case Intrinsic::amdgcn_sffbh
:
6019 return DAG
.getNode(AMDGPUISD::FFBH_I32
, DL
, VT
, Op
.getOperand(1));
6020 case Intrinsic::amdgcn_sbfe
:
6021 return DAG
.getNode(AMDGPUISD::BFE_I32
, DL
, VT
,
6022 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
6023 case Intrinsic::amdgcn_ubfe
:
6024 return DAG
.getNode(AMDGPUISD::BFE_U32
, DL
, VT
,
6025 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
6026 case Intrinsic::amdgcn_cvt_pkrtz
:
6027 case Intrinsic::amdgcn_cvt_pknorm_i16
:
6028 case Intrinsic::amdgcn_cvt_pknorm_u16
:
6029 case Intrinsic::amdgcn_cvt_pk_i16
:
6030 case Intrinsic::amdgcn_cvt_pk_u16
: {
6031 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
6032 EVT VT
= Op
.getValueType();
6035 if (IntrinsicID
== Intrinsic::amdgcn_cvt_pkrtz
)
6036 Opcode
= AMDGPUISD::CVT_PKRTZ_F16_F32
;
6037 else if (IntrinsicID
== Intrinsic::amdgcn_cvt_pknorm_i16
)
6038 Opcode
= AMDGPUISD::CVT_PKNORM_I16_F32
;
6039 else if (IntrinsicID
== Intrinsic::amdgcn_cvt_pknorm_u16
)
6040 Opcode
= AMDGPUISD::CVT_PKNORM_U16_F32
;
6041 else if (IntrinsicID
== Intrinsic::amdgcn_cvt_pk_i16
)
6042 Opcode
= AMDGPUISD::CVT_PK_I16_I32
;
6044 Opcode
= AMDGPUISD::CVT_PK_U16_U32
;
6046 if (isTypeLegal(VT
))
6047 return DAG
.getNode(Opcode
, DL
, VT
, Op
.getOperand(1), Op
.getOperand(2));
6049 SDValue Node
= DAG
.getNode(Opcode
, DL
, MVT::i32
,
6050 Op
.getOperand(1), Op
.getOperand(2));
6051 return DAG
.getNode(ISD::BITCAST
, DL
, VT
, Node
);
6053 case Intrinsic::amdgcn_fmad_ftz
:
6054 return DAG
.getNode(AMDGPUISD::FMAD_FTZ
, DL
, VT
, Op
.getOperand(1),
6055 Op
.getOperand(2), Op
.getOperand(3));
6057 case Intrinsic::amdgcn_if_break
:
6058 return SDValue(DAG
.getMachineNode(AMDGPU::SI_IF_BREAK
, DL
, VT
,
6059 Op
->getOperand(1), Op
->getOperand(2)), 0);
6061 case Intrinsic::amdgcn_groupstaticsize
: {
6062 Triple::OSType OS
= getTargetMachine().getTargetTriple().getOS();
6063 if (OS
== Triple::AMDHSA
|| OS
== Triple::AMDPAL
)
6066 const Module
*M
= MF
.getFunction().getParent();
6067 const GlobalValue
*GV
=
6068 M
->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize
));
6069 SDValue GA
= DAG
.getTargetGlobalAddress(GV
, DL
, MVT::i32
, 0,
6070 SIInstrInfo::MO_ABS32_LO
);
6071 return {DAG
.getMachineNode(AMDGPU::S_MOV_B32
, DL
, MVT::i32
, GA
), 0};
6073 case Intrinsic::amdgcn_is_shared
:
6074 case Intrinsic::amdgcn_is_private
: {
6076 unsigned AS
= (IntrinsicID
== Intrinsic::amdgcn_is_shared
) ?
6077 AMDGPUAS::LOCAL_ADDRESS
: AMDGPUAS::PRIVATE_ADDRESS
;
6078 SDValue Aperture
= getSegmentAperture(AS
, SL
, DAG
);
6079 SDValue SrcVec
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::v2i32
,
6082 SDValue SrcHi
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, SrcVec
,
6083 DAG
.getConstant(1, SL
, MVT::i32
));
6084 return DAG
.getSetCC(SL
, MVT::i1
, SrcHi
, Aperture
, ISD::SETEQ
);
6087 if (const AMDGPU::ImageDimIntrinsicInfo
*ImageDimIntr
=
6088 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID
))
6089 return lowerImage(Op
, ImageDimIntr
, DAG
);
6095 SDValue
SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op
,
6096 SelectionDAG
&DAG
) const {
6097 unsigned IntrID
= cast
<ConstantSDNode
>(Op
.getOperand(1))->getZExtValue();
6101 case Intrinsic::amdgcn_ds_ordered_add
:
6102 case Intrinsic::amdgcn_ds_ordered_swap
: {
6103 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6104 SDValue Chain
= M
->getOperand(0);
6105 SDValue M0
= M
->getOperand(2);
6106 SDValue Value
= M
->getOperand(3);
6107 unsigned IndexOperand
= M
->getConstantOperandVal(7);
6108 unsigned WaveRelease
= M
->getConstantOperandVal(8);
6109 unsigned WaveDone
= M
->getConstantOperandVal(9);
6110 unsigned ShaderType
;
6111 unsigned Instruction
;
6113 unsigned OrderedCountIndex
= IndexOperand
& 0x3f;
6114 IndexOperand
&= ~0x3f;
6115 unsigned CountDw
= 0;
6117 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
) {
6118 CountDw
= (IndexOperand
>> 24) & 0xf;
6119 IndexOperand
&= ~(0xf << 24);
6121 if (CountDw
< 1 || CountDw
> 4) {
6123 "ds_ordered_count: dword count must be between 1 and 4");
6128 report_fatal_error("ds_ordered_count: bad index operand");
6131 case Intrinsic::amdgcn_ds_ordered_add
:
6134 case Intrinsic::amdgcn_ds_ordered_swap
:
6139 if (WaveDone
&& !WaveRelease
)
6140 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
6142 switch (DAG
.getMachineFunction().getFunction().getCallingConv()) {
6143 case CallingConv::AMDGPU_CS
:
6144 case CallingConv::AMDGPU_KERNEL
:
6147 case CallingConv::AMDGPU_PS
:
6150 case CallingConv::AMDGPU_VS
:
6153 case CallingConv::AMDGPU_GS
:
6157 report_fatal_error("ds_ordered_count unsupported for this calling conv");
6160 unsigned Offset0
= OrderedCountIndex
<< 2;
6161 unsigned Offset1
= WaveRelease
| (WaveDone
<< 1) | (ShaderType
<< 2) |
6164 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
)
6165 Offset1
|= (CountDw
- 1) << 6;
6167 unsigned Offset
= Offset0
| (Offset1
<< 8);
6172 DAG
.getTargetConstant(Offset
, DL
, MVT::i16
),
6173 copyToM0(DAG
, Chain
, DL
, M0
).getValue(1), // Glue
6175 return DAG
.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT
, DL
,
6176 M
->getVTList(), Ops
, M
->getMemoryVT(),
6177 M
->getMemOperand());
6179 case Intrinsic::amdgcn_ds_fadd
: {
6180 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6183 case Intrinsic::amdgcn_ds_fadd
:
6184 Opc
= ISD::ATOMIC_LOAD_FADD
;
6188 return DAG
.getAtomic(Opc
, SDLoc(Op
), M
->getMemoryVT(),
6189 M
->getOperand(0), M
->getOperand(2), M
->getOperand(3),
6190 M
->getMemOperand());
6192 case Intrinsic::amdgcn_atomic_inc
:
6193 case Intrinsic::amdgcn_atomic_dec
:
6194 case Intrinsic::amdgcn_ds_fmin
:
6195 case Intrinsic::amdgcn_ds_fmax
: {
6196 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6199 case Intrinsic::amdgcn_atomic_inc
:
6200 Opc
= AMDGPUISD::ATOMIC_INC
;
6202 case Intrinsic::amdgcn_atomic_dec
:
6203 Opc
= AMDGPUISD::ATOMIC_DEC
;
6205 case Intrinsic::amdgcn_ds_fmin
:
6206 Opc
= AMDGPUISD::ATOMIC_LOAD_FMIN
;
6208 case Intrinsic::amdgcn_ds_fmax
:
6209 Opc
= AMDGPUISD::ATOMIC_LOAD_FMAX
;
6212 llvm_unreachable("Unknown intrinsic!");
6215 M
->getOperand(0), // Chain
6216 M
->getOperand(2), // Ptr
6217 M
->getOperand(3) // Value
6220 return DAG
.getMemIntrinsicNode(Opc
, SDLoc(Op
), M
->getVTList(), Ops
,
6221 M
->getMemoryVT(), M
->getMemOperand());
6223 case Intrinsic::amdgcn_buffer_load
:
6224 case Intrinsic::amdgcn_buffer_load_format
: {
6225 unsigned Glc
= cast
<ConstantSDNode
>(Op
.getOperand(5))->getZExtValue();
6226 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(6))->getZExtValue();
6228 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(3)))
6229 IdxEn
= Idx
->getZExtValue() != 0;
6231 Op
.getOperand(0), // Chain
6232 Op
.getOperand(2), // rsrc
6233 Op
.getOperand(3), // vindex
6234 SDValue(), // voffset -- will be set by setBufferOffsets
6235 SDValue(), // soffset -- will be set by setBufferOffsets
6236 SDValue(), // offset -- will be set by setBufferOffsets
6237 DAG
.getTargetConstant(Glc
| (Slc
<< 1), DL
, MVT::i32
), // cachepolicy
6238 DAG
.getTargetConstant(IdxEn
, DL
, MVT::i1
), // idxen
6241 setBufferOffsets(Op
.getOperand(4), DAG
, &Ops
[3]);
6242 unsigned Opc
= (IntrID
== Intrinsic::amdgcn_buffer_load
) ?
6243 AMDGPUISD::BUFFER_LOAD
: AMDGPUISD::BUFFER_LOAD_FORMAT
;
6245 EVT VT
= Op
.getValueType();
6246 EVT IntVT
= VT
.changeTypeToInteger();
6247 auto *M
= cast
<MemSDNode
>(Op
);
6248 EVT LoadVT
= Op
.getValueType();
6250 if (LoadVT
.getScalarType() == MVT::f16
)
6251 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16
,
6254 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6255 if (LoadVT
.getScalarType() == MVT::i8
||
6256 LoadVT
.getScalarType() == MVT::i16
)
6257 return handleByteShortBufferLoads(DAG
, LoadVT
, DL
, Ops
, M
);
6259 return getMemIntrinsicNode(Opc
, DL
, Op
->getVTList(), Ops
, IntVT
,
6260 M
->getMemOperand(), DAG
);
6262 case Intrinsic::amdgcn_raw_buffer_load
:
6263 case Intrinsic::amdgcn_raw_buffer_load_format
: {
6264 const bool IsFormat
= IntrID
== Intrinsic::amdgcn_raw_buffer_load_format
;
6266 auto Offsets
= splitBufferOffsets(Op
.getOperand(3), DAG
);
6268 Op
.getOperand(0), // Chain
6269 Op
.getOperand(2), // rsrc
6270 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
6271 Offsets
.first
, // voffset
6272 Op
.getOperand(4), // soffset
6273 Offsets
.second
, // offset
6274 Op
.getOperand(5), // cachepolicy
6275 DAG
.getTargetConstant(0, DL
, MVT::i1
), // idxen
6278 return lowerIntrinsicLoad(cast
<MemSDNode
>(Op
), IsFormat
, DAG
, Ops
);
6280 case Intrinsic::amdgcn_struct_buffer_load
:
6281 case Intrinsic::amdgcn_struct_buffer_load_format
: {
6282 const bool IsFormat
= IntrID
== Intrinsic::amdgcn_struct_buffer_load_format
;
6284 auto Offsets
= splitBufferOffsets(Op
.getOperand(4), DAG
);
6286 Op
.getOperand(0), // Chain
6287 Op
.getOperand(2), // rsrc
6288 Op
.getOperand(3), // vindex
6289 Offsets
.first
, // voffset
6290 Op
.getOperand(5), // soffset
6291 Offsets
.second
, // offset
6292 Op
.getOperand(6), // cachepolicy
6293 DAG
.getTargetConstant(1, DL
, MVT::i1
), // idxen
6296 return lowerIntrinsicLoad(cast
<MemSDNode
>(Op
), IsFormat
, DAG
, Ops
);
6298 case Intrinsic::amdgcn_tbuffer_load
: {
6299 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6300 EVT LoadVT
= Op
.getValueType();
6302 unsigned Dfmt
= cast
<ConstantSDNode
>(Op
.getOperand(7))->getZExtValue();
6303 unsigned Nfmt
= cast
<ConstantSDNode
>(Op
.getOperand(8))->getZExtValue();
6304 unsigned Glc
= cast
<ConstantSDNode
>(Op
.getOperand(9))->getZExtValue();
6305 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(10))->getZExtValue();
6307 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(3)))
6308 IdxEn
= Idx
->getZExtValue() != 0;
6310 Op
.getOperand(0), // Chain
6311 Op
.getOperand(2), // rsrc
6312 Op
.getOperand(3), // vindex
6313 Op
.getOperand(4), // voffset
6314 Op
.getOperand(5), // soffset
6315 Op
.getOperand(6), // offset
6316 DAG
.getTargetConstant(Dfmt
| (Nfmt
<< 4), DL
, MVT::i32
), // format
6317 DAG
.getTargetConstant(Glc
| (Slc
<< 1), DL
, MVT::i32
), // cachepolicy
6318 DAG
.getTargetConstant(IdxEn
, DL
, MVT::i1
) // idxen
6321 if (LoadVT
.getScalarType() == MVT::f16
)
6322 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16
,
6324 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT
, DL
,
6325 Op
->getVTList(), Ops
, LoadVT
, M
->getMemOperand(),
6328 case Intrinsic::amdgcn_raw_tbuffer_load
: {
6329 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6330 EVT LoadVT
= Op
.getValueType();
6331 auto Offsets
= splitBufferOffsets(Op
.getOperand(3), DAG
);
6334 Op
.getOperand(0), // Chain
6335 Op
.getOperand(2), // rsrc
6336 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
6337 Offsets
.first
, // voffset
6338 Op
.getOperand(4), // soffset
6339 Offsets
.second
, // offset
6340 Op
.getOperand(5), // format
6341 Op
.getOperand(6), // cachepolicy
6342 DAG
.getTargetConstant(0, DL
, MVT::i1
), // idxen
6345 if (LoadVT
.getScalarType() == MVT::f16
)
6346 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16
,
6348 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT
, DL
,
6349 Op
->getVTList(), Ops
, LoadVT
, M
->getMemOperand(),
6352 case Intrinsic::amdgcn_struct_tbuffer_load
: {
6353 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6354 EVT LoadVT
= Op
.getValueType();
6355 auto Offsets
= splitBufferOffsets(Op
.getOperand(4), DAG
);
6358 Op
.getOperand(0), // Chain
6359 Op
.getOperand(2), // rsrc
6360 Op
.getOperand(3), // vindex
6361 Offsets
.first
, // voffset
6362 Op
.getOperand(5), // soffset
6363 Offsets
.second
, // offset
6364 Op
.getOperand(6), // format
6365 Op
.getOperand(7), // cachepolicy
6366 DAG
.getTargetConstant(1, DL
, MVT::i1
), // idxen
6369 if (LoadVT
.getScalarType() == MVT::f16
)
6370 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16
,
6372 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT
, DL
,
6373 Op
->getVTList(), Ops
, LoadVT
, M
->getMemOperand(),
6376 case Intrinsic::amdgcn_buffer_atomic_swap
:
6377 case Intrinsic::amdgcn_buffer_atomic_add
:
6378 case Intrinsic::amdgcn_buffer_atomic_sub
:
6379 case Intrinsic::amdgcn_buffer_atomic_smin
:
6380 case Intrinsic::amdgcn_buffer_atomic_umin
:
6381 case Intrinsic::amdgcn_buffer_atomic_smax
:
6382 case Intrinsic::amdgcn_buffer_atomic_umax
:
6383 case Intrinsic::amdgcn_buffer_atomic_and
:
6384 case Intrinsic::amdgcn_buffer_atomic_or
:
6385 case Intrinsic::amdgcn_buffer_atomic_xor
: {
6386 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(6))->getZExtValue();
6388 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(4)))
6389 IdxEn
= Idx
->getZExtValue() != 0;
6391 Op
.getOperand(0), // Chain
6392 Op
.getOperand(2), // vdata
6393 Op
.getOperand(3), // rsrc
6394 Op
.getOperand(4), // vindex
6395 SDValue(), // voffset -- will be set by setBufferOffsets
6396 SDValue(), // soffset -- will be set by setBufferOffsets
6397 SDValue(), // offset -- will be set by setBufferOffsets
6398 DAG
.getTargetConstant(Slc
<< 1, DL
, MVT::i32
), // cachepolicy
6399 DAG
.getTargetConstant(IdxEn
, DL
, MVT::i1
), // idxen
6401 setBufferOffsets(Op
.getOperand(5), DAG
, &Ops
[4]);
6402 EVT VT
= Op
.getValueType();
6404 auto *M
= cast
<MemSDNode
>(Op
);
6405 unsigned Opcode
= 0;
6408 case Intrinsic::amdgcn_buffer_atomic_swap
:
6409 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SWAP
;
6411 case Intrinsic::amdgcn_buffer_atomic_add
:
6412 Opcode
= AMDGPUISD::BUFFER_ATOMIC_ADD
;
6414 case Intrinsic::amdgcn_buffer_atomic_sub
:
6415 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SUB
;
6417 case Intrinsic::amdgcn_buffer_atomic_smin
:
6418 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SMIN
;
6420 case Intrinsic::amdgcn_buffer_atomic_umin
:
6421 Opcode
= AMDGPUISD::BUFFER_ATOMIC_UMIN
;
6423 case Intrinsic::amdgcn_buffer_atomic_smax
:
6424 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SMAX
;
6426 case Intrinsic::amdgcn_buffer_atomic_umax
:
6427 Opcode
= AMDGPUISD::BUFFER_ATOMIC_UMAX
;
6429 case Intrinsic::amdgcn_buffer_atomic_and
:
6430 Opcode
= AMDGPUISD::BUFFER_ATOMIC_AND
;
6432 case Intrinsic::amdgcn_buffer_atomic_or
:
6433 Opcode
= AMDGPUISD::BUFFER_ATOMIC_OR
;
6435 case Intrinsic::amdgcn_buffer_atomic_xor
:
6436 Opcode
= AMDGPUISD::BUFFER_ATOMIC_XOR
;
6439 llvm_unreachable("unhandled atomic opcode");
6442 return DAG
.getMemIntrinsicNode(Opcode
, DL
, Op
->getVTList(), Ops
, VT
,
6443 M
->getMemOperand());
6445 case Intrinsic::amdgcn_raw_buffer_atomic_swap
:
6446 case Intrinsic::amdgcn_raw_buffer_atomic_add
:
6447 case Intrinsic::amdgcn_raw_buffer_atomic_sub
:
6448 case Intrinsic::amdgcn_raw_buffer_atomic_smin
:
6449 case Intrinsic::amdgcn_raw_buffer_atomic_umin
:
6450 case Intrinsic::amdgcn_raw_buffer_atomic_smax
:
6451 case Intrinsic::amdgcn_raw_buffer_atomic_umax
:
6452 case Intrinsic::amdgcn_raw_buffer_atomic_and
:
6453 case Intrinsic::amdgcn_raw_buffer_atomic_or
:
6454 case Intrinsic::amdgcn_raw_buffer_atomic_xor
:
6455 case Intrinsic::amdgcn_raw_buffer_atomic_inc
:
6456 case Intrinsic::amdgcn_raw_buffer_atomic_dec
: {
6457 auto Offsets
= splitBufferOffsets(Op
.getOperand(4), DAG
);
6459 Op
.getOperand(0), // Chain
6460 Op
.getOperand(2), // vdata
6461 Op
.getOperand(3), // rsrc
6462 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
6463 Offsets
.first
, // voffset
6464 Op
.getOperand(5), // soffset
6465 Offsets
.second
, // offset
6466 Op
.getOperand(6), // cachepolicy
6467 DAG
.getTargetConstant(0, DL
, MVT::i1
), // idxen
6469 EVT VT
= Op
.getValueType();
6471 auto *M
= cast
<MemSDNode
>(Op
);
6472 unsigned Opcode
= 0;
6475 case Intrinsic::amdgcn_raw_buffer_atomic_swap
:
6476 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SWAP
;
6478 case Intrinsic::amdgcn_raw_buffer_atomic_add
:
6479 Opcode
= AMDGPUISD::BUFFER_ATOMIC_ADD
;
6481 case Intrinsic::amdgcn_raw_buffer_atomic_sub
:
6482 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SUB
;
6484 case Intrinsic::amdgcn_raw_buffer_atomic_smin
:
6485 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SMIN
;
6487 case Intrinsic::amdgcn_raw_buffer_atomic_umin
:
6488 Opcode
= AMDGPUISD::BUFFER_ATOMIC_UMIN
;
6490 case Intrinsic::amdgcn_raw_buffer_atomic_smax
:
6491 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SMAX
;
6493 case Intrinsic::amdgcn_raw_buffer_atomic_umax
:
6494 Opcode
= AMDGPUISD::BUFFER_ATOMIC_UMAX
;
6496 case Intrinsic::amdgcn_raw_buffer_atomic_and
:
6497 Opcode
= AMDGPUISD::BUFFER_ATOMIC_AND
;
6499 case Intrinsic::amdgcn_raw_buffer_atomic_or
:
6500 Opcode
= AMDGPUISD::BUFFER_ATOMIC_OR
;
6502 case Intrinsic::amdgcn_raw_buffer_atomic_xor
:
6503 Opcode
= AMDGPUISD::BUFFER_ATOMIC_XOR
;
6505 case Intrinsic::amdgcn_raw_buffer_atomic_inc
:
6506 Opcode
= AMDGPUISD::BUFFER_ATOMIC_INC
;
6508 case Intrinsic::amdgcn_raw_buffer_atomic_dec
:
6509 Opcode
= AMDGPUISD::BUFFER_ATOMIC_DEC
;
6512 llvm_unreachable("unhandled atomic opcode");
6515 return DAG
.getMemIntrinsicNode(Opcode
, DL
, Op
->getVTList(), Ops
, VT
,
6516 M
->getMemOperand());
6518 case Intrinsic::amdgcn_struct_buffer_atomic_swap
:
6519 case Intrinsic::amdgcn_struct_buffer_atomic_add
:
6520 case Intrinsic::amdgcn_struct_buffer_atomic_sub
:
6521 case Intrinsic::amdgcn_struct_buffer_atomic_smin
:
6522 case Intrinsic::amdgcn_struct_buffer_atomic_umin
:
6523 case Intrinsic::amdgcn_struct_buffer_atomic_smax
:
6524 case Intrinsic::amdgcn_struct_buffer_atomic_umax
:
6525 case Intrinsic::amdgcn_struct_buffer_atomic_and
:
6526 case Intrinsic::amdgcn_struct_buffer_atomic_or
:
6527 case Intrinsic::amdgcn_struct_buffer_atomic_xor
:
6528 case Intrinsic::amdgcn_struct_buffer_atomic_inc
:
6529 case Intrinsic::amdgcn_struct_buffer_atomic_dec
: {
6530 auto Offsets
= splitBufferOffsets(Op
.getOperand(5), DAG
);
6532 Op
.getOperand(0), // Chain
6533 Op
.getOperand(2), // vdata
6534 Op
.getOperand(3), // rsrc
6535 Op
.getOperand(4), // vindex
6536 Offsets
.first
, // voffset
6537 Op
.getOperand(6), // soffset
6538 Offsets
.second
, // offset
6539 Op
.getOperand(7), // cachepolicy
6540 DAG
.getTargetConstant(1, DL
, MVT::i1
), // idxen
6542 EVT VT
= Op
.getValueType();
6544 auto *M
= cast
<MemSDNode
>(Op
);
6545 unsigned Opcode
= 0;
6548 case Intrinsic::amdgcn_struct_buffer_atomic_swap
:
6549 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SWAP
;
6551 case Intrinsic::amdgcn_struct_buffer_atomic_add
:
6552 Opcode
= AMDGPUISD::BUFFER_ATOMIC_ADD
;
6554 case Intrinsic::amdgcn_struct_buffer_atomic_sub
:
6555 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SUB
;
6557 case Intrinsic::amdgcn_struct_buffer_atomic_smin
:
6558 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SMIN
;
6560 case Intrinsic::amdgcn_struct_buffer_atomic_umin
:
6561 Opcode
= AMDGPUISD::BUFFER_ATOMIC_UMIN
;
6563 case Intrinsic::amdgcn_struct_buffer_atomic_smax
:
6564 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SMAX
;
6566 case Intrinsic::amdgcn_struct_buffer_atomic_umax
:
6567 Opcode
= AMDGPUISD::BUFFER_ATOMIC_UMAX
;
6569 case Intrinsic::amdgcn_struct_buffer_atomic_and
:
6570 Opcode
= AMDGPUISD::BUFFER_ATOMIC_AND
;
6572 case Intrinsic::amdgcn_struct_buffer_atomic_or
:
6573 Opcode
= AMDGPUISD::BUFFER_ATOMIC_OR
;
6575 case Intrinsic::amdgcn_struct_buffer_atomic_xor
:
6576 Opcode
= AMDGPUISD::BUFFER_ATOMIC_XOR
;
6578 case Intrinsic::amdgcn_struct_buffer_atomic_inc
:
6579 Opcode
= AMDGPUISD::BUFFER_ATOMIC_INC
;
6581 case Intrinsic::amdgcn_struct_buffer_atomic_dec
:
6582 Opcode
= AMDGPUISD::BUFFER_ATOMIC_DEC
;
6585 llvm_unreachable("unhandled atomic opcode");
6588 return DAG
.getMemIntrinsicNode(Opcode
, DL
, Op
->getVTList(), Ops
, VT
,
6589 M
->getMemOperand());
6591 case Intrinsic::amdgcn_buffer_atomic_cmpswap
: {
6592 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(7))->getZExtValue();
6594 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(5)))
6595 IdxEn
= Idx
->getZExtValue() != 0;
6597 Op
.getOperand(0), // Chain
6598 Op
.getOperand(2), // src
6599 Op
.getOperand(3), // cmp
6600 Op
.getOperand(4), // rsrc
6601 Op
.getOperand(5), // vindex
6602 SDValue(), // voffset -- will be set by setBufferOffsets
6603 SDValue(), // soffset -- will be set by setBufferOffsets
6604 SDValue(), // offset -- will be set by setBufferOffsets
6605 DAG
.getTargetConstant(Slc
<< 1, DL
, MVT::i32
), // cachepolicy
6606 DAG
.getTargetConstant(IdxEn
, DL
, MVT::i1
), // idxen
6608 setBufferOffsets(Op
.getOperand(6), DAG
, &Ops
[5]);
6609 EVT VT
= Op
.getValueType();
6610 auto *M
= cast
<MemSDNode
>(Op
);
6612 return DAG
.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP
, DL
,
6613 Op
->getVTList(), Ops
, VT
, M
->getMemOperand());
6615 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap
: {
6616 auto Offsets
= splitBufferOffsets(Op
.getOperand(5), DAG
);
6618 Op
.getOperand(0), // Chain
6619 Op
.getOperand(2), // src
6620 Op
.getOperand(3), // cmp
6621 Op
.getOperand(4), // rsrc
6622 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
6623 Offsets
.first
, // voffset
6624 Op
.getOperand(6), // soffset
6625 Offsets
.second
, // offset
6626 Op
.getOperand(7), // cachepolicy
6627 DAG
.getTargetConstant(0, DL
, MVT::i1
), // idxen
6629 EVT VT
= Op
.getValueType();
6630 auto *M
= cast
<MemSDNode
>(Op
);
6632 return DAG
.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP
, DL
,
6633 Op
->getVTList(), Ops
, VT
, M
->getMemOperand());
6635 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap
: {
6636 auto Offsets
= splitBufferOffsets(Op
.getOperand(6), DAG
);
6638 Op
.getOperand(0), // Chain
6639 Op
.getOperand(2), // src
6640 Op
.getOperand(3), // cmp
6641 Op
.getOperand(4), // rsrc
6642 Op
.getOperand(5), // vindex
6643 Offsets
.first
, // voffset
6644 Op
.getOperand(7), // soffset
6645 Offsets
.second
, // offset
6646 Op
.getOperand(8), // cachepolicy
6647 DAG
.getTargetConstant(1, DL
, MVT::i1
), // idxen
6649 EVT VT
= Op
.getValueType();
6650 auto *M
= cast
<MemSDNode
>(Op
);
6652 return DAG
.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP
, DL
,
6653 Op
->getVTList(), Ops
, VT
, M
->getMemOperand());
6657 if (const AMDGPU::ImageDimIntrinsicInfo
*ImageDimIntr
=
6658 AMDGPU::getImageDimIntrinsicInfo(IntrID
))
6659 return lowerImage(Op
, ImageDimIntr
, DAG
);
// Builds a memory intrinsic node for a load via DAG.getMemIntrinsicNode. When
// the subtarget reports no dwordx3 load/store support and the result type is
// v3i32 or v3f32, the node is created with 4-element result and memory types
// and the original-width value is extracted from the widened result.
// NOTE(review): VTList and WidenedVT are used below but their declarations are
// not visible in this copy of the function -- confirm against the full source.
6665 // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
6666 // dwordx4 if on SI.
6667 SDValue
SITargetLowering::getMemIntrinsicNode(unsigned Opcode
, const SDLoc
&DL
,
6669 ArrayRef
<SDValue
> Ops
, EVT MemVT
,
6670 MachineMemOperand
*MMO
,
6671 SelectionDAG
&DAG
) const {
// VT is the first result type of the requested node.
6672 EVT VT
= VTList
.VTs
[0];
6674 EVT WidenedMemVT
= MemVT
;
// Only 3 x 32-bit vectors are widened, and only without dwordx3 support.
6675 if (!Subtarget
->hasDwordx3LoadStores() &&
6676 (WidenedVT
== MVT::v3i32
|| WidenedVT
== MVT::v3f32
)) {
// Same element type, 4 elements, for both the result and the memory type.
6677 WidenedVT
= EVT::getVectorVT(*DAG
.getContext(),
6678 WidenedVT
.getVectorElementType(), 4);
6679 WidenedMemVT
= EVT::getVectorVT(*DAG
.getContext(),
6680 WidenedMemVT
.getVectorElementType(), 4);
// Derive a new memory operand from the original with arguments (0, 16) --
// presumably offset 0 and a 16-byte size for the widened access; confirm
// against MachineFunction::getMachineMemOperand.
6681 MMO
= DAG
.getMachineFunction().getMachineMemOperand(MMO
, 0, 16);
// Exactly two result types are expected; the second one is carried over
// unchanged into the widened VT list.
6684 assert(VTList
.NumVTs
== 2);
6685 SDVTList WidenedVTList
= DAG
.getVTList(WidenedVT
, VTList
.VTs
[1]);
6687 auto NewOp
= DAG
.getMemIntrinsicNode(Opcode
, DL
, WidenedVTList
, Ops
,
// If widening happened, take the subvector starting at element 0 as the
// original-typed value and merge it with result #1 of the widened node.
6689 if (WidenedVT
!= VT
) {
6690 auto Extract
= DAG
.getNode(
6691 ISD::EXTRACT_SUBVECTOR
, DL
, VT
, NewOp
,
6692 DAG
.getConstant(0, DL
, getVectorIdxTy(DAG
.getDataLayout())));
6693 NewOp
= DAG
.getMergeValues({ Extract
, SDValue(NewOp
.getNode(), 1) }, DL
);
// Prepares the data operand of a D16 store. Non-vector data takes the early
// path guarded by the isVector() check. On subtargets with unpacked D16 VMEM,
// vector data is bitcast to the matching integer vector, zero-extended to an
// i32 vector with the same element count, and unrolled. Otherwise the store
// type is asserted to be legal.
// NOTE(review): DL and the statements following the isVector() check and the
// final assert are not visible in this copy -- confirm against the full source.
6698 SDValue
SITargetLowering::handleD16VData(SDValue VData
,
6699 SelectionDAG
&DAG
) const {
6700 EVT StoreVT
= VData
.getValueType();
6702 // No change for f16 and legal vector D16 types.
6703 if (!StoreVT
.isVector())
// 3-element vectors are not handled here.
6707 assert((StoreVT
.getVectorNumElements() != 3) && "Handle v3f16");
6709 if (Subtarget
->hasUnpackedD16VMem()) {
6710 // We need to unpack the packed data to store.
6711 EVT IntStoreVT
= StoreVT
.changeTypeToInteger();
6712 SDValue IntVData
= DAG
.getNode(ISD::BITCAST
, DL
, IntStoreVT
, VData
);
// One i32 lane per original element.
6714 EVT EquivStoreVT
= EVT::getVectorVT(*DAG
.getContext(), MVT::i32
,
6715 StoreVT
.getVectorNumElements());
6716 SDValue ZExt
= DAG
.getNode(ISD::ZERO_EXTEND
, DL
, EquivStoreVT
, IntVData
);
6717 return DAG
.UnrollVectorOp(ZExt
.getNode());
6720 assert(isTypeLegal(StoreVT
));
// Custom lowering for intrinsic nodes handled by this function. Operand 0 is
// the chain and operand 1 is the constant intrinsic ID, which selects one of
// the cases below. Most cases rebuild the operand list in the layout expected
// by the corresponding AMDGPUISD node and create that node.
// NOTE(review): several lines of this function (DL, the openings of the Ops
// arrays, IdxEn declarations, closing braces) are not visible in this copy.
6724 SDValue
SITargetLowering::LowerINTRINSIC_VOID(SDValue Op
,
6725 SelectionDAG
&DAG
) const {
6727 SDValue Chain
= Op
.getOperand(0);
6728 unsigned IntrinsicID
= cast
<ConstantSDNode
>(Op
.getOperand(1))->getZExtValue();
6729 MachineFunction
&MF
= DAG
.getMachineFunction();
6731 switch (IntrinsicID
) {
// Export: tgt and en become i8 target constants, the four sources are passed
// through, compr is 0, and the "done" operand selects EXPORT vs EXPORT_DONE.
6732 case Intrinsic::amdgcn_exp
: {
6733 const ConstantSDNode
*Tgt
= cast
<ConstantSDNode
>(Op
.getOperand(2));
6734 const ConstantSDNode
*En
= cast
<ConstantSDNode
>(Op
.getOperand(3));
6735 const ConstantSDNode
*Done
= cast
<ConstantSDNode
>(Op
.getOperand(8));
6736 const ConstantSDNode
*VM
= cast
<ConstantSDNode
>(Op
.getOperand(9));
6738 const SDValue Ops
[] = {
6740 DAG
.getTargetConstant(Tgt
->getZExtValue(), DL
, MVT::i8
), // tgt
6741 DAG
.getTargetConstant(En
->getZExtValue(), DL
, MVT::i8
), // en
6742 Op
.getOperand(4), // src0
6743 Op
.getOperand(5), // src1
6744 Op
.getOperand(6), // src2
6745 Op
.getOperand(7), // src3
6746 DAG
.getTargetConstant(0, DL
, MVT::i1
), // compr
6747 DAG
.getTargetConstant(VM
->getZExtValue(), DL
, MVT::i1
)
// A zero "done" operand selects EXPORT, anything else EXPORT_DONE.
6750 unsigned Opc
= Done
->isNullValue() ?
6751 AMDGPUISD::EXPORT
: AMDGPUISD::EXPORT_DONE
;
6752 return DAG
.getNode(Opc
, DL
, Op
->getVTList(), Ops
);
// Compressed export: the two sources are bitcast to f32 and compr is 1.
6754 case Intrinsic::amdgcn_exp_compr
: {
6755 const ConstantSDNode
*Tgt
= cast
<ConstantSDNode
>(Op
.getOperand(2));
6756 const ConstantSDNode
*En
= cast
<ConstantSDNode
>(Op
.getOperand(3));
6757 SDValue Src0
= Op
.getOperand(4);
6758 SDValue Src1
= Op
.getOperand(5);
6759 const ConstantSDNode
*Done
= cast
<ConstantSDNode
>(Op
.getOperand(6));
6760 const ConstantSDNode
*VM
= cast
<ConstantSDNode
>(Op
.getOperand(7));
6762 SDValue Undef
= DAG
.getUNDEF(MVT::f32
);
6763 const SDValue Ops
[] = {
6765 DAG
.getTargetConstant(Tgt
->getZExtValue(), DL
, MVT::i8
), // tgt
6766 DAG
.getTargetConstant(En
->getZExtValue(), DL
, MVT::i8
), // en
6767 DAG
.getNode(ISD::BITCAST
, DL
, MVT::f32
, Src0
),
6768 DAG
.getNode(ISD::BITCAST
, DL
, MVT::f32
, Src1
),
6771 DAG
.getTargetConstant(1, DL
, MVT::i1
), // compr
6772 DAG
.getTargetConstant(VM
->getZExtValue(), DL
, MVT::i1
)
6775 unsigned Opc
= Done
->isNullValue() ?
6776 AMDGPUISD::EXPORT
: AMDGPUISD::EXPORT_DONE
;
6777 return DAG
.getNode(Opc
, DL
, Op
->getVTList(), Ops
);
// Barrier: when optimizing and the upper bound of the flat work-group size is
// no larger than the wavefront size, emit a WAVE_BARRIER machine node on the
// incoming chain instead.
6779 case Intrinsic::amdgcn_s_barrier
: {
6780 if (getTargetMachine().getOptLevel() > CodeGenOpt::None
) {
6781 const GCNSubtarget
&ST
= MF
.getSubtarget
<GCNSubtarget
>();
6782 unsigned WGSize
= ST
.getFlatWorkGroupSizes(MF
.getFunction()).second
;
6783 if (WGSize
<= ST
.getWavefrontSize())
6784 return SDValue(DAG
.getMachineNode(AMDGPU::WAVE_BARRIER
, DL
, MVT::Other
,
6785 Op
.getOperand(0)), 0);
// tbuffer store: dfmt/nfmt are packed into one format immediate
// (dfmt | nfmt << 4) and glc/slc into one cachepolicy immediate
// (glc | slc << 1). Data with an f16 scalar type selects the D16 opcode.
6789 case Intrinsic::amdgcn_tbuffer_store
: {
6790 SDValue VData
= Op
.getOperand(2);
6791 bool IsD16
= (VData
.getValueType().getScalarType() == MVT::f16
);
6793 VData
= handleD16VData(VData
, DAG
);
6794 unsigned Dfmt
= cast
<ConstantSDNode
>(Op
.getOperand(8))->getZExtValue();
6795 unsigned Nfmt
= cast
<ConstantSDNode
>(Op
.getOperand(9))->getZExtValue();
6796 unsigned Glc
= cast
<ConstantSDNode
>(Op
.getOperand(10))->getZExtValue();
6797 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(11))->getZExtValue();
// When vindex is a constant, idxen reflects whether it is non-zero.
6799 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(4)))
6800 IdxEn
= Idx
->getZExtValue() != 0;
6804 Op
.getOperand(3), // rsrc
6805 Op
.getOperand(4), // vindex
6806 Op
.getOperand(5), // voffset
6807 Op
.getOperand(6), // soffset
6808 Op
.getOperand(7), // offset
6809 DAG
.getTargetConstant(Dfmt
| (Nfmt
<< 4), DL
, MVT::i32
), // format
6810 DAG
.getTargetConstant(Glc
| (Slc
<< 1), DL
, MVT::i32
), // cachepolicy
6811 DAG
.getTargetConstant(IdxEn
, DL
, MVT::i1
), // idxen
6813 unsigned Opc
= IsD16
? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
:
6814 AMDGPUISD::TBUFFER_STORE_FORMAT
;
6815 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6816 return DAG
.getMemIntrinsicNode(Opc
, DL
, Op
->getVTList(), Ops
,
6817 M
->getMemoryVT(), M
->getMemOperand());
// struct tbuffer store: the offset operand is split by splitBufferOffsets into
// voffset and an immediate offset; format and cachepolicy operands are passed
// through and idxen is the constant 1.
6820 case Intrinsic::amdgcn_struct_tbuffer_store
: {
6821 SDValue VData
= Op
.getOperand(2);
6822 bool IsD16
= (VData
.getValueType().getScalarType() == MVT::f16
);
6824 VData
= handleD16VData(VData
, DAG
);
6825 auto Offsets
= splitBufferOffsets(Op
.getOperand(5), DAG
);
6829 Op
.getOperand(3), // rsrc
6830 Op
.getOperand(4), // vindex
6831 Offsets
.first
, // voffset
6832 Op
.getOperand(6), // soffset
6833 Offsets
.second
, // offset
6834 Op
.getOperand(7), // format
6835 Op
.getOperand(8), // cachepolicy
6836 DAG
.getTargetConstant(1, DL
, MVT::i1
), // idxen
6838 unsigned Opc
= IsD16
? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
:
6839 AMDGPUISD::TBUFFER_STORE_FORMAT
;
6840 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6841 return DAG
.getMemIntrinsicNode(Opc
, DL
, Op
->getVTList(), Ops
,
6842 M
->getMemoryVT(), M
->getMemOperand());
// raw tbuffer store: same as the struct form but with a constant-0 vindex and
// idxen 0; the operand indices shift down by one accordingly.
6845 case Intrinsic::amdgcn_raw_tbuffer_store
: {
6846 SDValue VData
= Op
.getOperand(2);
6847 bool IsD16
= (VData
.getValueType().getScalarType() == MVT::f16
);
6849 VData
= handleD16VData(VData
, DAG
);
6850 auto Offsets
= splitBufferOffsets(Op
.getOperand(4), DAG
);
6854 Op
.getOperand(3), // rsrc
6855 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
6856 Offsets
.first
, // voffset
6857 Op
.getOperand(5), // soffset
6858 Offsets
.second
, // offset
6859 Op
.getOperand(6), // format
6860 Op
.getOperand(7), // cachepolicy
6861 DAG
.getTargetConstant(0, DL
, MVT::i1
), // idxen
6863 unsigned Opc
= IsD16
? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
:
6864 AMDGPUISD::TBUFFER_STORE_FORMAT
;
6865 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6866 return DAG
.getMemIntrinsicNode(Opc
, DL
, Op
->getVTList(), Ops
,
6867 M
->getMemoryVT(), M
->getMemOperand());
// buffer store / buffer store format: glc/slc are packed into the cachepolicy
// immediate and the combined offset operand is decomposed by setBufferOffsets,
// which writes the three placeholder offset entries through &Ops[4].
6870 case Intrinsic::amdgcn_buffer_store
:
6871 case Intrinsic::amdgcn_buffer_store_format
: {
6872 SDValue VData
= Op
.getOperand(2);
6873 bool IsD16
= (VData
.getValueType().getScalarType() == MVT::f16
);
6875 VData
= handleD16VData(VData
, DAG
);
6876 unsigned Glc
= cast
<ConstantSDNode
>(Op
.getOperand(6))->getZExtValue();
6877 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(7))->getZExtValue();
// When vindex is a constant, idxen reflects whether it is non-zero.
6879 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(4)))
6880 IdxEn
= Idx
->getZExtValue() != 0;
6884 Op
.getOperand(3), // rsrc
6885 Op
.getOperand(4), // vindex
6886 SDValue(), // voffset -- will be set by setBufferOffsets
6887 SDValue(), // soffset -- will be set by setBufferOffsets
6888 SDValue(), // offset -- will be set by setBufferOffsets
6889 DAG
.getTargetConstant(Glc
| (Slc
<< 1), DL
, MVT::i32
), // cachepolicy
6890 DAG
.getTargetConstant(IdxEn
, DL
, MVT::i1
), // idxen
6892 setBufferOffsets(Op
.getOperand(5), DAG
, &Ops
[4]);
// Pick the opcode from the intrinsic, then override it for D16 data.
6893 unsigned Opc
= IntrinsicID
== Intrinsic::amdgcn_buffer_store
?
6894 AMDGPUISD::BUFFER_STORE
: AMDGPUISD::BUFFER_STORE_FORMAT
;
6895 Opc
= IsD16
? AMDGPUISD::BUFFER_STORE_FORMAT_D16
: Opc
;
6896 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6898 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
6899 EVT VDataType
= VData
.getValueType().getScalarType();
6900 if (VDataType
== MVT::i8
|| VDataType
== MVT::i16
)
6901 return handleByteShortBufferStores(DAG
, VDataType
, DL
, Ops
, M
);
6903 return DAG
.getMemIntrinsicNode(Opc
, DL
, Op
->getVTList(), Ops
,
6904 M
->getMemoryVT(), M
->getMemOperand());
// raw buffer store / store format: D16 applies only to the format variant with
// 16-bit elements. Data of a non-legal type is bitcast to the type returned by
// getEquivalentMemType. vindex is constant 0 and idxen is 0.
6907 case Intrinsic::amdgcn_raw_buffer_store
:
6908 case Intrinsic::amdgcn_raw_buffer_store_format
: {
6909 const bool IsFormat
=
6910 IntrinsicID
== Intrinsic::amdgcn_raw_buffer_store_format
;
6912 SDValue VData
= Op
.getOperand(2);
6913 EVT VDataVT
= VData
.getValueType();
6914 EVT EltType
= VDataVT
.getScalarType();
6915 bool IsD16
= IsFormat
&& (EltType
.getSizeInBits() == 16);
6917 VData
= handleD16VData(VData
, DAG
);
6919 if (!isTypeLegal(VDataVT
)) {
6921 DAG
.getNode(ISD::BITCAST
, DL
,
6922 getEquivalentMemType(*DAG
.getContext(), VDataVT
), VData
);
6925 auto Offsets
= splitBufferOffsets(Op
.getOperand(4), DAG
);
6929 Op
.getOperand(3), // rsrc
6930 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
6931 Offsets
.first
, // voffset
6932 Op
.getOperand(5), // soffset
6933 Offsets
.second
, // offset
6934 Op
.getOperand(6), // cachepolicy
6935 DAG
.getTargetConstant(0, DL
, MVT::i1
), // idxen
6938 IsFormat
? AMDGPUISD::BUFFER_STORE_FORMAT
: AMDGPUISD::BUFFER_STORE
;
6939 Opc
= IsD16
? AMDGPUISD::BUFFER_STORE_FORMAT_D16
: Opc
;
6940 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6942 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
// Only non-D16 scalar data narrower than 32 bits takes the byte/short path.
6943 if (!IsD16
&& !VDataVT
.isVector() && EltType
.getSizeInBits() < 32)
6944 return handleByteShortBufferStores(DAG
, VDataVT
, DL
, Ops
, M
);
6946 return DAG
.getMemIntrinsicNode(Opc
, DL
, Op
->getVTList(), Ops
,
6947 M
->getMemoryVT(), M
->getMemOperand());
// struct buffer store / store format: like the raw form, but vindex comes from
// operand 4 and idxen is the constant 1.
6950 case Intrinsic::amdgcn_struct_buffer_store
:
6951 case Intrinsic::amdgcn_struct_buffer_store_format
: {
6952 const bool IsFormat
=
6953 IntrinsicID
== Intrinsic::amdgcn_struct_buffer_store_format
;
6955 SDValue VData
= Op
.getOperand(2);
6956 EVT VDataVT
= VData
.getValueType();
6957 EVT EltType
= VDataVT
.getScalarType();
6958 bool IsD16
= IsFormat
&& (EltType
.getSizeInBits() == 16);
6961 VData
= handleD16VData(VData
, DAG
);
6963 if (!isTypeLegal(VDataVT
)) {
6965 DAG
.getNode(ISD::BITCAST
, DL
,
6966 getEquivalentMemType(*DAG
.getContext(), VDataVT
), VData
);
6969 auto Offsets
= splitBufferOffsets(Op
.getOperand(5), DAG
);
6973 Op
.getOperand(3), // rsrc
6974 Op
.getOperand(4), // vindex
6975 Offsets
.first
, // voffset
6976 Op
.getOperand(6), // soffset
6977 Offsets
.second
, // offset
6978 Op
.getOperand(7), // cachepolicy
6979 DAG
.getTargetConstant(1, DL
, MVT::i1
), // idxen
6981 unsigned Opc
= IntrinsicID
== Intrinsic::amdgcn_struct_buffer_store
?
6982 AMDGPUISD::BUFFER_STORE
: AMDGPUISD::BUFFER_STORE_FORMAT
;
6983 Opc
= IsD16
? AMDGPUISD::BUFFER_STORE_FORMAT_D16
: Opc
;
6984 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6986 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
6987 EVT VDataType
= VData
.getValueType().getScalarType();
6988 if (!IsD16
&& !VDataVT
.isVector() && EltType
.getSizeInBits() < 32)
6989 return handleByteShortBufferStores(DAG
, VDataType
, DL
, Ops
, M
);
6991 return DAG
.getMemIntrinsicNode(Opc
, DL
, Op
->getVTList(), Ops
,
6992 M
->getMemoryVT(), M
->getMemOperand());
// buffer atomic fadd: only slc contributes to cachepolicy (slc << 1). The
// memory VT is the type of the data operand, and a vector data type selects
// the packed (PK) opcode.
6995 case Intrinsic::amdgcn_buffer_atomic_fadd
: {
6996 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(6))->getZExtValue();
6998 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(4)))
6999 IdxEn
= Idx
->getZExtValue() != 0;
7002 Op
.getOperand(2), // vdata
7003 Op
.getOperand(3), // rsrc
7004 Op
.getOperand(4), // vindex
7005 SDValue(), // voffset -- will be set by setBufferOffsets
7006 SDValue(), // soffset -- will be set by setBufferOffsets
7007 SDValue(), // offset -- will be set by setBufferOffsets
7008 DAG
.getTargetConstant(Slc
<< 1, DL
, MVT::i32
), // cachepolicy
7009 DAG
.getTargetConstant(IdxEn
, DL
, MVT::i1
), // idxen
7011 setBufferOffsets(Op
.getOperand(5), DAG
, &Ops
[4]);
7012 EVT VT
= Op
.getOperand(2).getValueType();
7014 auto *M
= cast
<MemSDNode
>(Op
);
7015 unsigned Opcode
= VT
.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD
7016 : AMDGPUISD::BUFFER_ATOMIC_FADD
;
7018 return DAG
.getMemIntrinsicNode(Opcode
, DL
, Op
->getVTList(), Ops
, VT
,
7019 M
->getMemOperand());
// global atomic fadd: pointer is operand 2 and data is operand 3; the data
// type is the memory VT and a vector type selects the packed (PK) opcode.
7022 case Intrinsic::amdgcn_global_atomic_fadd
: {
7025 Op
.getOperand(2), // ptr
7026 Op
.getOperand(3) // vdata
7028 EVT VT
= Op
.getOperand(3).getValueType();
7030 auto *M
= cast
<MemSDNode
>(Op
);
7031 unsigned Opcode
= VT
.isVector() ? AMDGPUISD::ATOMIC_PK_FADD
7032 : AMDGPUISD::ATOMIC_FADD
;
7034 return DAG
.getMemIntrinsicNode(Opcode
, DL
, Op
->getVTList(), Ops
, VT
,
7035 M
->getMemOperand());
// end.cf is turned directly into an SI_END_CF machine node taking operand 2
// and the chain.
7038 case Intrinsic::amdgcn_end_cf
:
7039 return SDValue(DAG
.getMachineNode(AMDGPU::SI_END_CF
, DL
, MVT::Other
,
7040 Op
->getOperand(2), Chain
), 0);
// Intrinsics with an image-dimension info entry are handed to lowerImage.
7043 if (const AMDGPU::ImageDimIntrinsicInfo
*ImageDimIntr
=
7044 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID
))
7045 return lowerImage(Op
, ImageDimIntr
, DAG
);
// Splits a buffer offset into a (voffset, immediate offset) pair. The second
// element of the returned pair is always an i32 target constant.
// NOTE(review): DL and parts of the control flow around the constant handling
// are not visible in this copy -- confirm against the full source.
7052 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
7053 // offset (the offset that is included in bounds checking and swizzling, to be
7054 // split between the instruction's voffset and immoffset fields) and soffset
7055 // (the offset that is excluded from bounds checking and swizzling, to go in
7056 // the instruction's soffset field). This function takes the first kind of
7057 // offset and figures out how to split it between voffset and immoffset.
7058 std::pair
<SDValue
, SDValue
> SITargetLowering::splitBufferOffsets(
7059 SDValue Offset
, SelectionDAG
&DAG
) const {
// Largest value kept in the immediate offset field.
7061 const unsigned MaxImm
= 4095;
7062 SDValue N0
= Offset
;
7063 ConstantSDNode
*C1
= nullptr;
// C1 receives the constant part: either the whole offset, or the constant
// operand of a base-plus-constant expression (N0 then becomes the base).
7065 if ((C1
= dyn_cast
<ConstantSDNode
>(N0
)))
7067 else if (DAG
.isBaseWithConstantOffset(N0
)) {
7068 C1
= cast
<ConstantSDNode
>(N0
.getOperand(1));
7069 N0
= N0
.getOperand(0);
7073 unsigned ImmOffset
= C1
->getZExtValue();
7074 // If the immediate value is too big for the immoffset field, put the value
7075 // and -4096 into the immoffset field so that the value that is copied/added
7076 // for the voffset field is a multiple of 4096, and it stands more chance
7077 // of being CSEd with the copy/add for another similar load/store.
7078 // However, do not do that rounding down to a multiple of 4096 if that is a
7079 // negative number, as it appears to be illegal to have a negative offset
7080 // in the vgpr, even if adding the immediate offset makes it positive.
// Overflow holds the bits above the low 12; ImmOffset keeps the low 12 bits.
7081 unsigned Overflow
= ImmOffset
& ~MaxImm
;
7082 ImmOffset
-= Overflow
;
// A rounded-down part that is negative as int32 gets the low bits added back.
7083 if ((int32_t)Overflow
< 0) {
7084 Overflow
+= ImmOffset
;
7087 C1
= cast
<ConstantSDNode
>(DAG
.getTargetConstant(ImmOffset
, DL
, MVT::i32
));
7089 auto OverflowVal
= DAG
.getConstant(Overflow
, DL
, MVT::i32
);
// The part that does not fit in the immediate is added to the base.
7093 SDValue Ops
[] = { N0
, OverflowVal
};
7094 N0
= DAG
.getNode(ISD::ADD
, DL
, MVT::i32
, Ops
);
// Zero constants used for the voffset and immediate offset respectively.
7099 N0
= DAG
.getConstant(0, DL
, MVT::i32
);
7101 C1
= cast
<ConstantSDNode
>(DAG
.getTargetConstant(0, DL
, MVT::i32
));
7102 return {N0
, SDValue(C1
, 0)};
// Decomposes CombinedOffset into Offsets[0] (voffset), Offsets[1] (soffset)
// and Offsets[2] (immediate offset, an i32 target constant). Three shapes are
// handled: a plain constant, a base plus non-negative constant, and the
// fallback where the whole value goes into voffset.
// NOTE(review): the early returns and the Offsets[0] assignment of the
// base-plus-constant case are not visible in this copy -- confirm.
7105 // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
7106 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
7107 // pointed to by Offsets.
7108 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset
,
7109 SelectionDAG
&DAG
, SDValue
*Offsets
,
7110 unsigned Align
) const {
7111 SDLoc
DL(CombinedOffset
);
// Case 1: the combined offset is a constant. If splitMUBUFOffset accepts it,
// voffset is 0 and the constant is distributed over soffset and the immediate.
7112 if (auto C
= dyn_cast
<ConstantSDNode
>(CombinedOffset
)) {
7113 uint32_t Imm
= C
->getZExtValue();
7114 uint32_t SOffset
, ImmOffset
;
7115 if (AMDGPU::splitMUBUFOffset(Imm
, SOffset
, ImmOffset
, Subtarget
, Align
)) {
7116 Offsets
[0] = DAG
.getConstant(0, DL
, MVT::i32
);
7117 Offsets
[1] = DAG
.getConstant(SOffset
, DL
, MVT::i32
);
7118 Offsets
[2] = DAG
.getTargetConstant(ImmOffset
, DL
, MVT::i32
);
// Case 2: base + constant. Only a non-negative constant that
// splitMUBUFOffset accepts is split into soffset and the immediate.
7122 if (DAG
.isBaseWithConstantOffset(CombinedOffset
)) {
7123 SDValue N0
= CombinedOffset
.getOperand(0);
7124 SDValue N1
= CombinedOffset
.getOperand(1);
7125 uint32_t SOffset
, ImmOffset
;
7126 int Offset
= cast
<ConstantSDNode
>(N1
)->getSExtValue();
7127 if (Offset
>= 0 && AMDGPU::splitMUBUFOffset(Offset
, SOffset
, ImmOffset
,
7128 Subtarget
, Align
)) {
7130 Offsets
[1] = DAG
.getConstant(SOffset
, DL
, MVT::i32
);
7131 Offsets
[2] = DAG
.getTargetConstant(ImmOffset
, DL
, MVT::i32
);
// Fallback: the whole combined offset goes into voffset; the other two are 0.
7135 Offsets
[0] = CombinedOffset
;
7136 Offsets
[1] = DAG
.getConstant(0, DL
, MVT::i32
);
7137 Offsets
[2] = DAG
.getTargetConstant(0, DL
, MVT::i32
);
// Emits a BUFFER_LOAD_UBYTE (scalar type i8) or BUFFER_LOAD_USHORT (anything
// else) node producing an i32 value plus a chain, truncates the value to the
// integer form of LoadVT, bitcasts it to LoadVT and returns it merged with the
// load's chain result.
// NOTE(review): part of the getMemIntrinsicNode argument list is not visible
// in this copy -- confirm against the full source.
7140 // Handle 8 bit and 16 bit buffer loads
7141 SDValue
SITargetLowering::handleByteShortBufferLoads(SelectionDAG
&DAG
,
7142 EVT LoadVT
, SDLoc DL
,
7143 ArrayRef
<SDValue
> Ops
,
7144 MemSDNode
*M
) const {
// Integer type of the same shape as LoadVT, used as the truncation target.
7145 EVT IntVT
= LoadVT
.changeTypeToInteger();
7146 unsigned Opc
= (LoadVT
.getScalarType() == MVT::i8
) ?
7147 AMDGPUISD::BUFFER_LOAD_UBYTE
: AMDGPUISD::BUFFER_LOAD_USHORT
;
// The load node itself yields an i32 value and a chain.
7149 SDVTList ResList
= DAG
.getVTList(MVT::i32
, MVT::Other
);
7150 SDValue BufferLoad
= DAG
.getMemIntrinsicNode(Opc
, DL
, ResList
,
7152 M
->getMemOperand());
// Narrow to the requested width, then convert back to the requested type.
7153 SDValue LoadVal
= DAG
.getNode(ISD::TRUNCATE
, DL
, IntVT
, BufferLoad
);
7154 LoadVal
= DAG
.getNode(ISD::BITCAST
, DL
, LoadVT
, LoadVal
);
7156 return DAG
.getMergeValues({LoadVal
, BufferLoad
.getValue(1)}, DL
);
// Emits a BUFFER_STORE_BYTE (VDataType i8) or BUFFER_STORE_SHORT (anything
// else) node. The data operand Ops[1] is rewritten in place: f16 data is first
// bitcast to i16, then the value is any-extended to i32. The first 9 entries
// of Ops are passed to the new node.
// NOTE(review): the declaration of the Ops parameter is not visible in this
// copy; callers must supply at least 9 elements -- confirm.
7159 // Handle 8 bit and 16 bit buffer stores
7160 SDValue
SITargetLowering::handleByteShortBufferStores(SelectionDAG
&DAG
,
7161 EVT VDataType
, SDLoc DL
,
7163 MemSDNode
*M
) const {
// Reinterpret half-precision data as a 16-bit integer before extending.
7164 if (VDataType
== MVT::f16
)
7165 Ops
[1] = DAG
.getNode(ISD::BITCAST
, DL
, MVT::i16
, Ops
[1]);
7167 SDValue BufferStoreExt
= DAG
.getNode(ISD::ANY_EXTEND
, DL
, MVT::i32
, Ops
[1]);
7168 Ops
[1] = BufferStoreExt
;
7169 unsigned Opc
= (VDataType
== MVT::i8
) ? AMDGPUISD::BUFFER_STORE_BYTE
:
7170 AMDGPUISD::BUFFER_STORE_SHORT
;
7171 ArrayRef
<SDValue
> OpsRef
= makeArrayRef(&Ops
[0], 9);
// VDataType is passed as the memory VT of the new node.
7172 return DAG
.getMemIntrinsicNode(Opc
, DL
, M
->getVTList(), OpsRef
, VDataType
,
7173 M
->getMemOperand());
// Converts Op to VT. If VT is narrower than Op's type, the value is truncated.
// Otherwise one of SIGN_EXTEND, ZERO_EXTEND or ANY_EXTEND is emitted.
// NOTE(review): the switch header and most case labels are not visible in this
// copy; presumably the choice is keyed on ExtType -- confirm.
7176 static SDValue
getLoadExtOrTrunc(SelectionDAG
&DAG
,
7177 ISD::LoadExtType ExtType
, SDValue Op
,
7178 const SDLoc
&SL
, EVT VT
) {
// Narrowing never needs an extension kind.
7179 if (VT
.bitsLT(Op
.getValueType()))
7180 return DAG
.getNode(ISD::TRUNCATE
, SL
, VT
, Op
);
7184 return DAG
.getNode(ISD::SIGN_EXTEND
, SL
, VT
, Op
);
7186 return DAG
.getNode(ISD::ZERO_EXTEND
, SL
, VT
, Op
);
7188 return DAG
.getNode(ISD::ANY_EXTEND
, SL
, VT
, Op
);
7189 case ISD::NON_EXTLOAD
:
7193 llvm_unreachable("invalid ext type");
// DAG combine that replaces a sub-32-bit load with an i32 load followed by the
// conversions needed to recover the original value. The guards below test for
// alignment under 4, a divergent load, an address space other than constant /
// 32-bit constant / invariant global, and a memory type that is simple before
// DAG legalization or at least 32 bits wide.
// NOTE(review): the statements executed when a guard fires, the SL declaration
// and some getLoad arguments are not visible in this copy -- confirm.
7196 SDValue
SITargetLowering::widenLoad(LoadSDNode
*Ld
, DAGCombinerInfo
&DCI
) const {
7197 SelectionDAG
&DAG
= DCI
.DAG
;
7198 if (Ld
->getAlignment() < 4 || Ld
->isDivergent())
7201 // FIXME: Constant loads should all be marked invariant.
7202 unsigned AS
= Ld
->getAddressSpace();
7203 if (AS
!= AMDGPUAS::CONSTANT_ADDRESS
&&
7204 AS
!= AMDGPUAS::CONSTANT_ADDRESS_32BIT
&&
7205 (AS
!= AMDGPUAS::GLOBAL_ADDRESS
|| !Ld
->isInvariant()))
7208 // Don't do this early, since it may interfere with adjacent load merging for
7209 // illegal types. We can avoid losing alignment information for exotic types
7211 EVT MemVT
= Ld
->getMemoryVT();
7212 if ((MemVT
.isSimple() && !DCI
.isAfterLegalizeDAG()) ||
7213 MemVT
.getSizeInBits() >= 32)
7218 assert((!MemVT
.isVector() || Ld
->getExtensionType() == ISD::NON_EXTLOAD
) &&
7219 "unexpected vector extload");
7221 // TODO: Drop only high part of range.
// The replacement is a plain (non-extending, unindexed) i32 load from the
// same chain and base pointer, reusing the original memory-operand flags.
7222 SDValue Ptr
= Ld
->getBasePtr();
7223 SDValue NewLoad
= DAG
.getLoad(ISD::UNINDEXED
, ISD::NON_EXTLOAD
,
7224 MVT::i32
, SL
, Ld
->getChain(), Ptr
,
7226 Ld
->getPointerInfo(), MVT::i32
,
7228 Ld
->getMemOperand()->getFlags(),
7230 nullptr); // Drop ranges
// TruncVT is the integer type as wide as the original memory type.
7232 EVT TruncVT
= EVT::getIntegerVT(*DAG
.getContext(), MemVT
.getSizeInBits());
7233 if (MemVT
.isFloatingPoint()) {
7234 assert(Ld
->getExtensionType() == ISD::NON_EXTLOAD
&&
7235 "unexpected fp extload");
7236 TruncVT
= MemVT
.changeTypeToInteger();
// Recreate the original extension within the 32-bit value: sign-extend in
// register for SEXTLOAD, zero-extend in register for ZEXTLOAD/NON_EXTLOAD;
// the remaining kind is asserted to be EXTLOAD.
7239 SDValue Cvt
= NewLoad
;
7240 if (Ld
->getExtensionType() == ISD::SEXTLOAD
) {
7241 Cvt
= DAG
.getNode(ISD::SIGN_EXTEND_INREG
, SL
, MVT::i32
, NewLoad
,
7242 DAG
.getValueType(TruncVT
));
7243 } else if (Ld
->getExtensionType() == ISD::ZEXTLOAD
||
7244 Ld
->getExtensionType() == ISD::NON_EXTLOAD
) {
7245 Cvt
= DAG
.getZeroExtendInReg(NewLoad
, SL
, TruncVT
);
7247 assert(Ld
->getExtensionType() == ISD::EXTLOAD
);
// IntVT is the integer type as wide as the load's result type.
7250 EVT VT
= Ld
->getValueType(0);
7251 EVT IntVT
= EVT::getIntegerVT(*DAG
.getContext(), VT
.getSizeInBits());
7253 DCI
.AddToWorklist(Cvt
.getNode());
7255 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
7256 // the appropriate extension from the 32-bit load.
7257 Cvt
= getLoadExtOrTrunc(DAG
, Ld
->getExtensionType(), Cvt
, SL
, IntVT
);
7258 DCI
.AddToWorklist(Cvt
.getNode());
7260 // Handle conversion back to floating point if necessary.
7261 Cvt
= DAG
.getNode(ISD::BITCAST
, SL
, VT
, Cvt
);
// Return the converted value together with the new load's chain result.
7263 return DAG
.getMergeValues({ Cvt
, NewLoad
.getValue(1) }, SL
);
7266 SDValue
SITargetLowering::LowerLOAD(SDValue Op
, SelectionDAG
&DAG
) const {
7268 LoadSDNode
*Load
= cast
<LoadSDNode
>(Op
);
7269 ISD::LoadExtType ExtType
= Load
->getExtensionType();
7270 EVT MemVT
= Load
->getMemoryVT();
7272 if (ExtType
== ISD::NON_EXTLOAD
&& MemVT
.getSizeInBits() < 32) {
7273 if (MemVT
== MVT::i16
&& isTypeLegal(MVT::i16
))
7276 // FIXME: Copied from PPC
7277 // First, load into 32 bits, then truncate to 1 bit.
7279 SDValue Chain
= Load
->getChain();
7280 SDValue BasePtr
= Load
->getBasePtr();
7281 MachineMemOperand
*MMO
= Load
->getMemOperand();
7283 EVT RealMemVT
= (MemVT
== MVT::i1
) ? MVT::i8
: MVT::i16
;
7285 SDValue NewLD
= DAG
.getExtLoad(ISD::EXTLOAD
, DL
, MVT::i32
, Chain
,
7286 BasePtr
, RealMemVT
, MMO
);
7288 if (!MemVT
.isVector()) {
7290 DAG
.getNode(ISD::TRUNCATE
, DL
, MemVT
, NewLD
),
7294 return DAG
.getMergeValues(Ops
, DL
);
7297 SmallVector
<SDValue
, 3> Elts
;
7298 for (unsigned I
= 0, N
= MemVT
.getVectorNumElements(); I
!= N
; ++I
) {
7299 SDValue Elt
= DAG
.getNode(ISD::SRL
, DL
, MVT::i32
, NewLD
,
7300 DAG
.getConstant(I
, DL
, MVT::i32
));
7302 Elts
.push_back(DAG
.getNode(ISD::TRUNCATE
, DL
, MVT::i1
, Elt
));
7306 DAG
.getBuildVector(MemVT
, DL
, Elts
),
7310 return DAG
.getMergeValues(Ops
, DL
);
7313 if (!MemVT
.isVector())
7316 assert(Op
.getValueType().getVectorElementType() == MVT::i32
&&
7317 "Custom lowering for non-i32 vectors hasn't been implemented.");
7319 if (!allowsMemoryAccess(*DAG
.getContext(), DAG
.getDataLayout(), MemVT
,
7320 *Load
->getMemOperand())) {
7322 std::tie(Ops
[0], Ops
[1]) = expandUnalignedLoad(Load
, DAG
);
7323 return DAG
.getMergeValues(Ops
, DL
);
7326 unsigned Alignment
= Load
->getAlignment();
7327 unsigned AS
= Load
->getAddressSpace();
7328 if (Subtarget
->hasLDSMisalignedBug() &&
7329 AS
== AMDGPUAS::FLAT_ADDRESS
&&
7330 Alignment
< MemVT
.getStoreSize() && MemVT
.getSizeInBits() > 32) {
7331 return SplitVectorLoad(Op
, DAG
);
7334 MachineFunction
&MF
= DAG
.getMachineFunction();
7335 SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
7336 // If there is a possibility that a flat instruction accesses scratch memory
7337 // then we need to use the same legalization rules we use for private.
7338 if (AS
== AMDGPUAS::FLAT_ADDRESS
)
7339 AS
= MFI
->hasFlatScratchInit() ?
7340 AMDGPUAS::PRIVATE_ADDRESS
: AMDGPUAS::GLOBAL_ADDRESS
;
7342 unsigned NumElements
= MemVT
.getVectorNumElements();
7344 if (AS
== AMDGPUAS::CONSTANT_ADDRESS
||
7345 AS
== AMDGPUAS::CONSTANT_ADDRESS_32BIT
) {
7346 if (!Op
->isDivergent() && Alignment
>= 4 && NumElements
< 32) {
7347 if (MemVT
.isPow2VectorType())
7349 if (NumElements
== 3)
7350 return WidenVectorLoad(Op
, DAG
);
7351 return SplitVectorLoad(Op
, DAG
);
7353 // Non-uniform loads will be selected to MUBUF instructions, so they
7354 // have the same legalization requirements as global and private
7359 if (AS
== AMDGPUAS::CONSTANT_ADDRESS
||
7360 AS
== AMDGPUAS::CONSTANT_ADDRESS_32BIT
||
7361 AS
== AMDGPUAS::GLOBAL_ADDRESS
) {
7362 if (Subtarget
->getScalarizeGlobalBehavior() && !Op
->isDivergent() &&
7363 !Load
->isVolatile() && isMemOpHasNoClobberedMemOperand(Load
) &&
7364 Alignment
>= 4 && NumElements
< 32) {
7365 if (MemVT
.isPow2VectorType())
7367 if (NumElements
== 3)
7368 return WidenVectorLoad(Op
, DAG
);
7369 return SplitVectorLoad(Op
, DAG
);
7371 // Non-uniform loads will be selected to MUBUF instructions, so they
7372 // have the same legalization requirements as global and private
7376 if (AS
== AMDGPUAS::CONSTANT_ADDRESS
||
7377 AS
== AMDGPUAS::CONSTANT_ADDRESS_32BIT
||
7378 AS
== AMDGPUAS::GLOBAL_ADDRESS
||
7379 AS
== AMDGPUAS::FLAT_ADDRESS
) {
7380 if (NumElements
> 4)
7381 return SplitVectorLoad(Op
, DAG
);
7382 // v3 loads not supported on SI.
7383 if (NumElements
== 3 && !Subtarget
->hasDwordx3LoadStores())
7384 return WidenVectorLoad(Op
, DAG
);
7385 // v3 and v4 loads are supported for private and global memory.
7388 if (AS
== AMDGPUAS::PRIVATE_ADDRESS
) {
7389 // Depending on the setting of the private_element_size field in the
7390 // resource descriptor, we can only make private accesses up to a certain
7392 switch (Subtarget
->getMaxPrivateElementSize()) {
7394 return scalarizeVectorLoad(Load
, DAG
);
7396 if (NumElements
> 2)
7397 return SplitVectorLoad(Op
, DAG
);
7400 // Same as global/flat
7401 if (NumElements
> 4)
7402 return SplitVectorLoad(Op
, DAG
);
7403 // v3 loads not supported on SI.
7404 if (NumElements
== 3 && !Subtarget
->hasDwordx3LoadStores())
7405 return WidenVectorLoad(Op
, DAG
);
7408 llvm_unreachable("unsupported private_element_size");
7410 } else if (AS
== AMDGPUAS::LOCAL_ADDRESS
|| AS
== AMDGPUAS::REGION_ADDRESS
) {
7411 // Use ds_read_b128 if possible.
7412 if (Subtarget
->useDS128() && Load
->getAlignment() >= 16 &&
7413 MemVT
.getStoreSize() == 16)
7416 if (NumElements
> 2)
7417 return SplitVectorLoad(Op
, DAG
);
7419 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
7420 // address is negative, then the instruction is incorrectly treated as
7421 // out-of-bounds even if base + offsets is in bounds. Split vectorized
7422 // loads here to avoid emitting ds_read2_b32. We may re-combine the
7423 // load later in the SILoadStoreOptimizer.
7424 if (Subtarget
->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS
&&
7425 NumElements
== 2 && MemVT
.getStoreSize() == 8 &&
7426 Load
->getAlignment() < 8) {
7427 return SplitVectorLoad(Op
, DAG
);
7433 SDValue
SITargetLowering::LowerSELECT(SDValue Op
, SelectionDAG
&DAG
) const {
7434 EVT VT
= Op
.getValueType();
7435 assert(VT
.getSizeInBits() == 64);
7438 SDValue Cond
= Op
.getOperand(0);
7440 SDValue Zero
= DAG
.getConstant(0, DL
, MVT::i32
);
7441 SDValue One
= DAG
.getConstant(1, DL
, MVT::i32
);
7443 SDValue LHS
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::v2i32
, Op
.getOperand(1));
7444 SDValue RHS
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::v2i32
, Op
.getOperand(2));
7446 SDValue Lo0
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i32
, LHS
, Zero
);
7447 SDValue Lo1
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i32
, RHS
, Zero
);
7449 SDValue Lo
= DAG
.getSelect(DL
, MVT::i32
, Cond
, Lo0
, Lo1
);
7451 SDValue Hi0
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i32
, LHS
, One
);
7452 SDValue Hi1
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i32
, RHS
, One
);
7454 SDValue Hi
= DAG
.getSelect(DL
, MVT::i32
, Cond
, Hi0
, Hi1
);
7456 SDValue Res
= DAG
.getBuildVector(MVT::v2i32
, DL
, {Lo
, Hi
});
7457 return DAG
.getNode(ISD::BITCAST
, DL
, VT
, Res
);
7460 // Catch division cases where we can use shortcuts with rcp and rsq
7462 SDValue
SITargetLowering::lowerFastUnsafeFDIV(SDValue Op
,
7463 SelectionDAG
&DAG
) const {
7465 SDValue LHS
= Op
.getOperand(0);
7466 SDValue RHS
= Op
.getOperand(1);
7467 EVT VT
= Op
.getValueType();
7468 const SDNodeFlags Flags
= Op
->getFlags();
7469 bool Unsafe
= DAG
.getTarget().Options
.UnsafeFPMath
|| Flags
.hasAllowReciprocal();
7471 if (!Unsafe
&& VT
== MVT::f32
&& Subtarget
->hasFP32Denormals())
7474 if (const ConstantFPSDNode
*CLHS
= dyn_cast
<ConstantFPSDNode
>(LHS
)) {
7475 if (Unsafe
|| VT
== MVT::f32
|| VT
== MVT::f16
) {
7476 if (CLHS
->isExactlyValue(1.0)) {
7477 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
7478 // the CI documentation has a worst case error of 1 ulp.
7479 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
7480 // use it as long as we aren't trying to use denormals.
7482 // v_rcp_f16 and v_rsq_f16 DO support denormals.
7484 // 1.0 / sqrt(x) -> rsq(x)
7486 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
7487 // error seems really high at 2^29 ULP.
7488 if (RHS
.getOpcode() == ISD::FSQRT
)
7489 return DAG
.getNode(AMDGPUISD::RSQ
, SL
, VT
, RHS
.getOperand(0));
7491 // 1.0 / x -> rcp(x)
7492 return DAG
.getNode(AMDGPUISD::RCP
, SL
, VT
, RHS
);
7495 // Same as for 1.0, but expand the sign out of the constant.
7496 if (CLHS
->isExactlyValue(-1.0)) {
7497 // -1.0 / x -> rcp (fneg x)
7498 SDValue FNegRHS
= DAG
.getNode(ISD::FNEG
, SL
, VT
, RHS
);
7499 return DAG
.getNode(AMDGPUISD::RCP
, SL
, VT
, FNegRHS
);
7505 // Turn into multiply by the reciprocal.
7506 // x / y -> x * (1.0 / y)
7507 SDValue Recip
= DAG
.getNode(AMDGPUISD::RCP
, SL
, VT
, RHS
);
7508 return DAG
.getNode(ISD::FMUL
, SL
, VT
, LHS
, Recip
, Flags
);
7514 static SDValue
getFPBinOp(SelectionDAG
&DAG
, unsigned Opcode
, const SDLoc
&SL
,
7515 EVT VT
, SDValue A
, SDValue B
, SDValue GlueChain
) {
7516 if (GlueChain
->getNumValues() <= 1) {
7517 return DAG
.getNode(Opcode
, SL
, VT
, A
, B
);
7520 assert(GlueChain
->getNumValues() == 3);
7522 SDVTList VTList
= DAG
.getVTList(VT
, MVT::Other
, MVT::Glue
);
7524 default: llvm_unreachable("no chain equivalent for opcode");
7526 Opcode
= AMDGPUISD::FMUL_W_CHAIN
;
7530 return DAG
.getNode(Opcode
, SL
, VTList
, GlueChain
.getValue(1), A
, B
,
7531 GlueChain
.getValue(2));
7534 static SDValue
getFPTernOp(SelectionDAG
&DAG
, unsigned Opcode
, const SDLoc
&SL
,
7535 EVT VT
, SDValue A
, SDValue B
, SDValue C
,
7536 SDValue GlueChain
) {
7537 if (GlueChain
->getNumValues() <= 1) {
7538 return DAG
.getNode(Opcode
, SL
, VT
, A
, B
, C
);
7541 assert(GlueChain
->getNumValues() == 3);
7543 SDVTList VTList
= DAG
.getVTList(VT
, MVT::Other
, MVT::Glue
);
7545 default: llvm_unreachable("no chain equivalent for opcode");
7547 Opcode
= AMDGPUISD::FMA_W_CHAIN
;
7551 return DAG
.getNode(Opcode
, SL
, VTList
, GlueChain
.getValue(1), A
, B
, C
,
7552 GlueChain
.getValue(2));
7555 SDValue
SITargetLowering::LowerFDIV16(SDValue Op
, SelectionDAG
&DAG
) const {
7556 if (SDValue FastLowered
= lowerFastUnsafeFDIV(Op
, DAG
))
7560 SDValue Src0
= Op
.getOperand(0);
7561 SDValue Src1
= Op
.getOperand(1);
7563 SDValue CvtSrc0
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, Src0
);
7564 SDValue CvtSrc1
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, Src1
);
7566 SDValue RcpSrc1
= DAG
.getNode(AMDGPUISD::RCP
, SL
, MVT::f32
, CvtSrc1
);
7567 SDValue Quot
= DAG
.getNode(ISD::FMUL
, SL
, MVT::f32
, CvtSrc0
, RcpSrc1
);
7569 SDValue FPRoundFlag
= DAG
.getTargetConstant(0, SL
, MVT::i32
);
7570 SDValue BestQuot
= DAG
.getNode(ISD::FP_ROUND
, SL
, MVT::f16
, Quot
, FPRoundFlag
);
7572 return DAG
.getNode(AMDGPUISD::DIV_FIXUP
, SL
, MVT::f16
, BestQuot
, Src1
, Src0
);
7575 // Faster 2.5 ULP division that does not support denormals.
7576 SDValue
SITargetLowering::lowerFDIV_FAST(SDValue Op
, SelectionDAG
&DAG
) const {
7578 SDValue LHS
= Op
.getOperand(1);
7579 SDValue RHS
= Op
.getOperand(2);
7581 SDValue r1
= DAG
.getNode(ISD::FABS
, SL
, MVT::f32
, RHS
);
7583 const APFloat
K0Val(BitsToFloat(0x6f800000));
7584 const SDValue K0
= DAG
.getConstantFP(K0Val
, SL
, MVT::f32
);
7586 const APFloat
K1Val(BitsToFloat(0x2f800000));
7587 const SDValue K1
= DAG
.getConstantFP(K1Val
, SL
, MVT::f32
);
7589 const SDValue One
= DAG
.getConstantFP(1.0, SL
, MVT::f32
);
7592 getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), MVT::f32
);
7594 SDValue r2
= DAG
.getSetCC(SL
, SetCCVT
, r1
, K0
, ISD::SETOGT
);
7596 SDValue r3
= DAG
.getNode(ISD::SELECT
, SL
, MVT::f32
, r2
, K1
, One
);
7598 // TODO: Should this propagate fast-math-flags?
7599 r1
= DAG
.getNode(ISD::FMUL
, SL
, MVT::f32
, RHS
, r3
);
7601 // rcp does not support denormals.
7602 SDValue r0
= DAG
.getNode(AMDGPUISD::RCP
, SL
, MVT::f32
, r1
);
7604 SDValue Mul
= DAG
.getNode(ISD::FMUL
, SL
, MVT::f32
, LHS
, r0
);
7606 return DAG
.getNode(ISD::FMUL
, SL
, MVT::f32
, r3
, Mul
);
7609 // Returns immediate value for setting the F32 denorm mode when using the
7610 // S_DENORM_MODE instruction.
7611 static const SDValue
getSPDenormModeValue(int SPDenormMode
, SelectionDAG
&DAG
,
7612 const SDLoc
&SL
, const GCNSubtarget
*ST
) {
7613 assert(ST
->hasDenormModeInst() && "Requires S_DENORM_MODE");
7614 int DPDenormModeDefault
= ST
->hasFP64Denormals()
7615 ? FP_DENORM_FLUSH_NONE
7616 : FP_DENORM_FLUSH_IN_FLUSH_OUT
;
7618 int Mode
= SPDenormMode
| (DPDenormModeDefault
<< 2);
7619 return DAG
.getTargetConstant(Mode
, SL
, MVT::i32
);
7622 SDValue
SITargetLowering::LowerFDIV32(SDValue Op
, SelectionDAG
&DAG
) const {
7623 if (SDValue FastLowered
= lowerFastUnsafeFDIV(Op
, DAG
))
7627 SDValue LHS
= Op
.getOperand(0);
7628 SDValue RHS
= Op
.getOperand(1);
7630 const SDValue One
= DAG
.getConstantFP(1.0, SL
, MVT::f32
);
7632 SDVTList ScaleVT
= DAG
.getVTList(MVT::f32
, MVT::i1
);
7634 SDValue DenominatorScaled
= DAG
.getNode(AMDGPUISD::DIV_SCALE
, SL
, ScaleVT
,
7636 SDValue NumeratorScaled
= DAG
.getNode(AMDGPUISD::DIV_SCALE
, SL
, ScaleVT
,
7639 // Denominator is scaled to not be denormal, so using rcp is ok.
7640 SDValue ApproxRcp
= DAG
.getNode(AMDGPUISD::RCP
, SL
, MVT::f32
,
7642 SDValue NegDivScale0
= DAG
.getNode(ISD::FNEG
, SL
, MVT::f32
,
7645 const unsigned Denorm32Reg
= AMDGPU::Hwreg::ID_MODE
|
7646 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_
) |
7647 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_
);
7648 const SDValue BitField
= DAG
.getTargetConstant(Denorm32Reg
, SL
, MVT::i16
);
7650 if (!Subtarget
->hasFP32Denormals()) {
7651 SDVTList BindParamVTs
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
7653 SDValue EnableDenorm
;
7654 if (Subtarget
->hasDenormModeInst()) {
7655 const SDValue EnableDenormValue
=
7656 getSPDenormModeValue(FP_DENORM_FLUSH_NONE
, DAG
, SL
, Subtarget
);
7658 EnableDenorm
= DAG
.getNode(AMDGPUISD::DENORM_MODE
, SL
, BindParamVTs
,
7659 DAG
.getEntryNode(), EnableDenormValue
);
7661 const SDValue EnableDenormValue
= DAG
.getConstant(FP_DENORM_FLUSH_NONE
,
7663 EnableDenorm
= DAG
.getNode(AMDGPUISD::SETREG
, SL
, BindParamVTs
,
7664 DAG
.getEntryNode(), EnableDenormValue
,
7670 EnableDenorm
.getValue(0),
7671 EnableDenorm
.getValue(1)
7674 NegDivScale0
= DAG
.getMergeValues(Ops
, SL
);
7677 SDValue Fma0
= getFPTernOp(DAG
, ISD::FMA
, SL
, MVT::f32
, NegDivScale0
,
7678 ApproxRcp
, One
, NegDivScale0
);
7680 SDValue Fma1
= getFPTernOp(DAG
, ISD::FMA
, SL
, MVT::f32
, Fma0
, ApproxRcp
,
7683 SDValue Mul
= getFPBinOp(DAG
, ISD::FMUL
, SL
, MVT::f32
, NumeratorScaled
,
7686 SDValue Fma2
= getFPTernOp(DAG
, ISD::FMA
, SL
, MVT::f32
, NegDivScale0
, Mul
,
7687 NumeratorScaled
, Mul
);
7689 SDValue Fma3
= getFPTernOp(DAG
, ISD::FMA
, SL
, MVT::f32
, Fma2
, Fma1
, Mul
, Fma2
);
7691 SDValue Fma4
= getFPTernOp(DAG
, ISD::FMA
, SL
, MVT::f32
, NegDivScale0
, Fma3
,
7692 NumeratorScaled
, Fma3
);
7694 if (!Subtarget
->hasFP32Denormals()) {
7696 SDValue DisableDenorm
;
7697 if (Subtarget
->hasDenormModeInst()) {
7698 const SDValue DisableDenormValue
=
7699 getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT
, DAG
, SL
, Subtarget
);
7701 DisableDenorm
= DAG
.getNode(AMDGPUISD::DENORM_MODE
, SL
, MVT::Other
,
7702 Fma4
.getValue(1), DisableDenormValue
,
7705 const SDValue DisableDenormValue
=
7706 DAG
.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT
, SL
, MVT::i32
);
7708 DisableDenorm
= DAG
.getNode(AMDGPUISD::SETREG
, SL
, MVT::Other
,
7709 Fma4
.getValue(1), DisableDenormValue
,
7710 BitField
, Fma4
.getValue(2));
7713 SDValue OutputChain
= DAG
.getNode(ISD::TokenFactor
, SL
, MVT::Other
,
7714 DisableDenorm
, DAG
.getRoot());
7715 DAG
.setRoot(OutputChain
);
7718 SDValue Scale
= NumeratorScaled
.getValue(1);
7719 SDValue Fmas
= DAG
.getNode(AMDGPUISD::DIV_FMAS
, SL
, MVT::f32
,
7720 Fma4
, Fma1
, Fma3
, Scale
);
7722 return DAG
.getNode(AMDGPUISD::DIV_FIXUP
, SL
, MVT::f32
, Fmas
, RHS
, LHS
);
7725 SDValue
SITargetLowering::LowerFDIV64(SDValue Op
, SelectionDAG
&DAG
) const {
7726 if (DAG
.getTarget().Options
.UnsafeFPMath
)
7727 return lowerFastUnsafeFDIV(Op
, DAG
);
7730 SDValue X
= Op
.getOperand(0);
7731 SDValue Y
= Op
.getOperand(1);
7733 const SDValue One
= DAG
.getConstantFP(1.0, SL
, MVT::f64
);
7735 SDVTList ScaleVT
= DAG
.getVTList(MVT::f64
, MVT::i1
);
7737 SDValue DivScale0
= DAG
.getNode(AMDGPUISD::DIV_SCALE
, SL
, ScaleVT
, Y
, Y
, X
);
7739 SDValue NegDivScale0
= DAG
.getNode(ISD::FNEG
, SL
, MVT::f64
, DivScale0
);
7741 SDValue Rcp
= DAG
.getNode(AMDGPUISD::RCP
, SL
, MVT::f64
, DivScale0
);
7743 SDValue Fma0
= DAG
.getNode(ISD::FMA
, SL
, MVT::f64
, NegDivScale0
, Rcp
, One
);
7745 SDValue Fma1
= DAG
.getNode(ISD::FMA
, SL
, MVT::f64
, Rcp
, Fma0
, Rcp
);
7747 SDValue Fma2
= DAG
.getNode(ISD::FMA
, SL
, MVT::f64
, NegDivScale0
, Fma1
, One
);
7749 SDValue DivScale1
= DAG
.getNode(AMDGPUISD::DIV_SCALE
, SL
, ScaleVT
, X
, Y
, X
);
7751 SDValue Fma3
= DAG
.getNode(ISD::FMA
, SL
, MVT::f64
, Fma1
, Fma2
, Fma1
);
7752 SDValue Mul
= DAG
.getNode(ISD::FMUL
, SL
, MVT::f64
, DivScale1
, Fma3
);
7754 SDValue Fma4
= DAG
.getNode(ISD::FMA
, SL
, MVT::f64
,
7755 NegDivScale0
, Mul
, DivScale1
);
7759 if (!Subtarget
->hasUsableDivScaleConditionOutput()) {
7760 // Workaround a hardware bug on SI where the condition output from div_scale
7763 const SDValue Hi
= DAG
.getConstant(1, SL
, MVT::i32
);
7765 // Figure out if the scale to use for div_fmas.
7766 SDValue NumBC
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i32
, X
);
7767 SDValue DenBC
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i32
, Y
);
7768 SDValue Scale0BC
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i32
, DivScale0
);
7769 SDValue Scale1BC
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i32
, DivScale1
);
7771 SDValue NumHi
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, NumBC
, Hi
);
7772 SDValue DenHi
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, DenBC
, Hi
);
7775 = DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, Scale0BC
, Hi
);
7777 = DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, Scale1BC
, Hi
);
7779 SDValue CmpDen
= DAG
.getSetCC(SL
, MVT::i1
, DenHi
, Scale0Hi
, ISD::SETEQ
);
7780 SDValue CmpNum
= DAG
.getSetCC(SL
, MVT::i1
, NumHi
, Scale1Hi
, ISD::SETEQ
);
7781 Scale
= DAG
.getNode(ISD::XOR
, SL
, MVT::i1
, CmpNum
, CmpDen
);
7783 Scale
= DivScale1
.getValue(1);
7786 SDValue Fmas
= DAG
.getNode(AMDGPUISD::DIV_FMAS
, SL
, MVT::f64
,
7787 Fma4
, Fma3
, Mul
, Scale
);
7789 return DAG
.getNode(AMDGPUISD::DIV_FIXUP
, SL
, MVT::f64
, Fmas
, Y
, X
);
7792 SDValue
SITargetLowering::LowerFDIV(SDValue Op
, SelectionDAG
&DAG
) const {
7793 EVT VT
= Op
.getValueType();
7796 return LowerFDIV32(Op
, DAG
);
7799 return LowerFDIV64(Op
, DAG
);
7802 return LowerFDIV16(Op
, DAG
);
7804 llvm_unreachable("Unexpected type for fdiv");
7807 SDValue
SITargetLowering::LowerSTORE(SDValue Op
, SelectionDAG
&DAG
) const {
7809 StoreSDNode
*Store
= cast
<StoreSDNode
>(Op
);
7810 EVT VT
= Store
->getMemoryVT();
7812 if (VT
== MVT::i1
) {
7813 return DAG
.getTruncStore(Store
->getChain(), DL
,
7814 DAG
.getSExtOrTrunc(Store
->getValue(), DL
, MVT::i32
),
7815 Store
->getBasePtr(), MVT::i1
, Store
->getMemOperand());
7818 assert(VT
.isVector() &&
7819 Store
->getValue().getValueType().getScalarType() == MVT::i32
);
7821 if (!allowsMemoryAccess(*DAG
.getContext(), DAG
.getDataLayout(), VT
,
7822 *Store
->getMemOperand())) {
7823 return expandUnalignedStore(Store
, DAG
);
7826 unsigned AS
= Store
->getAddressSpace();
7827 if (Subtarget
->hasLDSMisalignedBug() &&
7828 AS
== AMDGPUAS::FLAT_ADDRESS
&&
7829 Store
->getAlignment() < VT
.getStoreSize() && VT
.getSizeInBits() > 32) {
7830 return SplitVectorStore(Op
, DAG
);
7833 MachineFunction
&MF
= DAG
.getMachineFunction();
7834 SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
7835 // If there is a possibilty that flat instruction access scratch memory
7836 // then we need to use the same legalization rules we use for private.
7837 if (AS
== AMDGPUAS::FLAT_ADDRESS
)
7838 AS
= MFI
->hasFlatScratchInit() ?
7839 AMDGPUAS::PRIVATE_ADDRESS
: AMDGPUAS::GLOBAL_ADDRESS
;
7841 unsigned NumElements
= VT
.getVectorNumElements();
7842 if (AS
== AMDGPUAS::GLOBAL_ADDRESS
||
7843 AS
== AMDGPUAS::FLAT_ADDRESS
) {
7844 if (NumElements
> 4)
7845 return SplitVectorStore(Op
, DAG
);
7846 // v3 stores not supported on SI.
7847 if (NumElements
== 3 && !Subtarget
->hasDwordx3LoadStores())
7848 return SplitVectorStore(Op
, DAG
);
7850 } else if (AS
== AMDGPUAS::PRIVATE_ADDRESS
) {
7851 switch (Subtarget
->getMaxPrivateElementSize()) {
7853 return scalarizeVectorStore(Store
, DAG
);
7855 if (NumElements
> 2)
7856 return SplitVectorStore(Op
, DAG
);
7859 if (NumElements
> 4 || NumElements
== 3)
7860 return SplitVectorStore(Op
, DAG
);
7863 llvm_unreachable("unsupported private_element_size");
7865 } else if (AS
== AMDGPUAS::LOCAL_ADDRESS
|| AS
== AMDGPUAS::REGION_ADDRESS
) {
7866 // Use ds_write_b128 if possible.
7867 if (Subtarget
->useDS128() && Store
->getAlignment() >= 16 &&
7868 VT
.getStoreSize() == 16 && NumElements
!= 3)
7871 if (NumElements
> 2)
7872 return SplitVectorStore(Op
, DAG
);
7874 // SI has a hardware bug in the LDS / GDS boounds checking: if the base
7875 // address is negative, then the instruction is incorrectly treated as
7876 // out-of-bounds even if base + offsets is in bounds. Split vectorized
7877 // stores here to avoid emitting ds_write2_b32. We may re-combine the
7878 // store later in the SILoadStoreOptimizer.
7879 if (!Subtarget
->hasUsableDSOffset() &&
7880 NumElements
== 2 && VT
.getStoreSize() == 8 &&
7881 Store
->getAlignment() < 8) {
7882 return SplitVectorStore(Op
, DAG
);
7887 llvm_unreachable("unhandled address space");
7891 SDValue
SITargetLowering::LowerTrig(SDValue Op
, SelectionDAG
&DAG
) const {
7893 EVT VT
= Op
.getValueType();
7894 SDValue Arg
= Op
.getOperand(0);
7897 // TODO: Should this propagate fast-math-flags?
7899 SDValue OneOver2Pi
= DAG
.getConstantFP(0.5 / M_PI
, DL
, VT
);
7901 if (Subtarget
->hasTrigReducedRange()) {
7902 SDValue MulVal
= DAG
.getNode(ISD::FMUL
, DL
, VT
, Arg
, OneOver2Pi
);
7903 TrigVal
= DAG
.getNode(AMDGPUISD::FRACT
, DL
, VT
, MulVal
);
7905 TrigVal
= DAG
.getNode(ISD::FMUL
, DL
, VT
, Arg
, OneOver2Pi
);
7908 switch (Op
.getOpcode()) {
7910 return DAG
.getNode(AMDGPUISD::COS_HW
, SDLoc(Op
), VT
, TrigVal
);
7912 return DAG
.getNode(AMDGPUISD::SIN_HW
, SDLoc(Op
), VT
, TrigVal
);
7914 llvm_unreachable("Wrong trig opcode");
7918 SDValue
SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op
, SelectionDAG
&DAG
) const {
7919 AtomicSDNode
*AtomicNode
= cast
<AtomicSDNode
>(Op
);
7920 assert(AtomicNode
->isCompareAndSwap());
7921 unsigned AS
= AtomicNode
->getAddressSpace();
7923 // No custom lowering required for local address space
7924 if (!isFlatGlobalAddrSpace(AS
))
7927 // Non-local address space requires custom lowering for atomic compare
7928 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
7930 SDValue ChainIn
= Op
.getOperand(0);
7931 SDValue Addr
= Op
.getOperand(1);
7932 SDValue Old
= Op
.getOperand(2);
7933 SDValue New
= Op
.getOperand(3);
7934 EVT VT
= Op
.getValueType();
7935 MVT SimpleVT
= VT
.getSimpleVT();
7936 MVT VecType
= MVT::getVectorVT(SimpleVT
, 2);
7938 SDValue NewOld
= DAG
.getBuildVector(VecType
, DL
, {New
, Old
});
7939 SDValue Ops
[] = { ChainIn
, Addr
, NewOld
};
7941 return DAG
.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP
, DL
, Op
->getVTList(),
7942 Ops
, VT
, AtomicNode
->getMemOperand());
7945 //===----------------------------------------------------------------------===//
7946 // Custom DAG optimizations
7947 //===----------------------------------------------------------------------===//
7949 SDValue
SITargetLowering::performUCharToFloatCombine(SDNode
*N
,
7950 DAGCombinerInfo
&DCI
) const {
7951 EVT VT
= N
->getValueType(0);
7952 EVT ScalarVT
= VT
.getScalarType();
7953 if (ScalarVT
!= MVT::f32
)
7956 SelectionDAG
&DAG
= DCI
.DAG
;
7959 SDValue Src
= N
->getOperand(0);
7960 EVT SrcVT
= Src
.getValueType();
7962 // TODO: We could try to match extracting the higher bytes, which would be
7963 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
7964 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
7965 // about in practice.
7966 if (DCI
.isAfterLegalizeDAG() && SrcVT
== MVT::i32
) {
7967 if (DAG
.MaskedValueIsZero(Src
, APInt::getHighBitsSet(32, 24))) {
7968 SDValue Cvt
= DAG
.getNode(AMDGPUISD::CVT_F32_UBYTE0
, DL
, VT
, Src
);
7969 DCI
.AddToWorklist(Cvt
.getNode());
7977 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
7979 // This is a variant of
7980 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
7982 // The normal DAG combiner will do this, but only if the add has one use since
7983 // that would increase the number of instructions.
7985 // This prevents us from seeing a constant offset that can be folded into a
7986 // memory instruction's addressing mode. If we know the resulting add offset of
7987 // a pointer can be folded into an addressing offset, we can replace the pointer
7988 // operand with the add of new constant offset. This eliminates one of the uses,
7989 // and may allow the remaining use to also be simplified.
7991 SDValue
SITargetLowering::performSHLPtrCombine(SDNode
*N
,
7994 DAGCombinerInfo
&DCI
) const {
7995 SDValue N0
= N
->getOperand(0);
7996 SDValue N1
= N
->getOperand(1);
7998 // We only do this to handle cases where it's profitable when there are
7999 // multiple uses of the add, so defer to the standard combine.
8000 if ((N0
.getOpcode() != ISD::ADD
&& N0
.getOpcode() != ISD::OR
) ||
8004 const ConstantSDNode
*CN1
= dyn_cast
<ConstantSDNode
>(N1
);
8008 const ConstantSDNode
*CAdd
= dyn_cast
<ConstantSDNode
>(N0
.getOperand(1));
8012 // If the resulting offset is too large, we can't fold it into the addressing
8014 APInt Offset
= CAdd
->getAPIntValue() << CN1
->getAPIntValue();
8015 Type
*Ty
= MemVT
.getTypeForEVT(*DCI
.DAG
.getContext());
8018 AM
.HasBaseReg
= true;
8019 AM
.BaseOffs
= Offset
.getSExtValue();
8020 if (!isLegalAddressingMode(DCI
.DAG
.getDataLayout(), AM
, Ty
, AddrSpace
))
8023 SelectionDAG
&DAG
= DCI
.DAG
;
8025 EVT VT
= N
->getValueType(0);
8027 SDValue ShlX
= DAG
.getNode(ISD::SHL
, SL
, VT
, N0
.getOperand(0), N1
);
8028 SDValue COffset
= DAG
.getConstant(Offset
, SL
, MVT::i32
);
8031 Flags
.setNoUnsignedWrap(N
->getFlags().hasNoUnsignedWrap() &&
8032 (N0
.getOpcode() == ISD::OR
||
8033 N0
->getFlags().hasNoUnsignedWrap()));
8035 return DAG
.getNode(ISD::ADD
, SL
, VT
, ShlX
, COffset
, Flags
);
8038 SDValue
SITargetLowering::performMemSDNodeCombine(MemSDNode
*N
,
8039 DAGCombinerInfo
&DCI
) const {
8040 SDValue Ptr
= N
->getBasePtr();
8041 SelectionDAG
&DAG
= DCI
.DAG
;
8044 // TODO: We could also do this for multiplies.
8045 if (Ptr
.getOpcode() == ISD::SHL
) {
8046 SDValue NewPtr
= performSHLPtrCombine(Ptr
.getNode(), N
->getAddressSpace(),
8047 N
->getMemoryVT(), DCI
);
8049 SmallVector
<SDValue
, 8> NewOps(N
->op_begin(), N
->op_end());
8051 NewOps
[N
->getOpcode() == ISD::STORE
? 2 : 1] = NewPtr
;
8052 return SDValue(DAG
.UpdateNodeOperands(N
, NewOps
), 0);
8059 static bool bitOpWithConstantIsReducible(unsigned Opc
, uint32_t Val
) {
8060 return (Opc
== ISD::AND
&& (Val
== 0 || Val
== 0xffffffff)) ||
8061 (Opc
== ISD::OR
&& (Val
== 0xffffffff || Val
== 0)) ||
8062 (Opc
== ISD::XOR
&& Val
== 0);
8065 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
8066 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
8067 // integer combine opportunities since most 64-bit operations are decomposed
8068 // this way. TODO: We won't want this for SALU especially if it is an inline
8070 SDValue
SITargetLowering::splitBinaryBitConstantOp(
8071 DAGCombinerInfo
&DCI
,
8073 unsigned Opc
, SDValue LHS
,
8074 const ConstantSDNode
*CRHS
) const {
8075 uint64_t Val
= CRHS
->getZExtValue();
8076 uint32_t ValLo
= Lo_32(Val
);
8077 uint32_t ValHi
= Hi_32(Val
);
8078 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
8080 if ((bitOpWithConstantIsReducible(Opc
, ValLo
) ||
8081 bitOpWithConstantIsReducible(Opc
, ValHi
)) ||
8082 (CRHS
->hasOneUse() && !TII
->isInlineConstant(CRHS
->getAPIntValue()))) {
8083 // If we need to materialize a 64-bit immediate, it will be split up later
8084 // anyway. Avoid creating the harder to understand 64-bit immediate
8086 return splitBinaryBitConstantOpImpl(DCI
, SL
, Opc
, LHS
, ValLo
, ValHi
);
8092 // Returns true if argument is a boolean value which is not serialized into
8093 // memory or argument and does not require v_cmdmask_b32 to be deserialized.
8094 static bool isBoolSGPR(SDValue V
) {
8095 if (V
.getValueType() != MVT::i1
)
8097 switch (V
.getOpcode()) {
8103 case AMDGPUISD::FP_CLASS
:
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  // Walk the four bytes; any byte that is neither 0x00 nor 0xff means the
  // constant selects only part of a byte.
  for (unsigned Shift = 0; Shift != 32; Shift += 8) {
    const uint32_t Byte = (C >> Shift) & 0xff;
    if (Byte != 0x00 && Byte != 0xff)
      return 0; // Partial bytes selected.
  }
  return C;
}
8124 // Check if a node selects whole bytes from its operand 0 starting at a byte
8125 // boundary while masking the rest. Returns select mask as in the v_perm_b32
8126 // or -1 if not succeeded.
8127 // Note byte select encoding:
8128 // value 0-3 selects corresponding source byte;
8129 // value 0xc selects zero;
8130 // value 0xff selects 0xff.
8131 static uint32_t getPermuteMask(SelectionDAG
&DAG
, SDValue V
) {
8132 assert(V
.getValueSizeInBits() == 32);
8134 if (V
.getNumOperands() != 2)
8137 ConstantSDNode
*N1
= dyn_cast
<ConstantSDNode
>(V
.getOperand(1));
8141 uint32_t C
= N1
->getZExtValue();
8143 switch (V
.getOpcode()) {
8147 if (uint32_t ConstMask
= getConstantPermuteMask(C
)) {
8148 return (0x03020100 & ConstMask
) | (0x0c0c0c0c & ~ConstMask
);
8153 if (uint32_t ConstMask
= getConstantPermuteMask(C
)) {
8154 return (0x03020100 & ~ConstMask
) | ConstMask
;
8162 return uint32_t((0x030201000c0c0c0cull
<< C
) >> 32);
8168 return uint32_t(0x0c0c0c0c03020100ull
>> C
);
8174 SDValue
SITargetLowering::performAndCombine(SDNode
*N
,
8175 DAGCombinerInfo
&DCI
) const {
8176 if (DCI
.isBeforeLegalize())
8179 SelectionDAG
&DAG
= DCI
.DAG
;
8180 EVT VT
= N
->getValueType(0);
8181 SDValue LHS
= N
->getOperand(0);
8182 SDValue RHS
= N
->getOperand(1);
8185 const ConstantSDNode
*CRHS
= dyn_cast
<ConstantSDNode
>(RHS
);
8186 if (VT
== MVT::i64
&& CRHS
) {
8188 = splitBinaryBitConstantOp(DCI
, SDLoc(N
), ISD::AND
, LHS
, CRHS
))
8192 if (CRHS
&& VT
== MVT::i32
) {
8193 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
8194 // nb = number of trailing zeroes in mask
8195 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
8196 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
8197 uint64_t Mask
= CRHS
->getZExtValue();
8198 unsigned Bits
= countPopulation(Mask
);
8199 if (getSubtarget()->hasSDWA() && LHS
->getOpcode() == ISD::SRL
&&
8200 (Bits
== 8 || Bits
== 16) && isShiftedMask_64(Mask
) && !(Mask
& 1)) {
8201 if (auto *CShift
= dyn_cast
<ConstantSDNode
>(LHS
->getOperand(1))) {
8202 unsigned Shift
= CShift
->getZExtValue();
8203 unsigned NB
= CRHS
->getAPIntValue().countTrailingZeros();
8204 unsigned Offset
= NB
+ Shift
;
8205 if ((Offset
& (Bits
- 1)) == 0) { // Starts at a byte or word boundary.
8207 SDValue BFE
= DAG
.getNode(AMDGPUISD::BFE_U32
, SL
, MVT::i32
,
8209 DAG
.getConstant(Offset
, SL
, MVT::i32
),
8210 DAG
.getConstant(Bits
, SL
, MVT::i32
));
8211 EVT NarrowVT
= EVT::getIntegerVT(*DAG
.getContext(), Bits
);
8212 SDValue Ext
= DAG
.getNode(ISD::AssertZext
, SL
, VT
, BFE
,
8213 DAG
.getValueType(NarrowVT
));
8214 SDValue Shl
= DAG
.getNode(ISD::SHL
, SDLoc(LHS
), VT
, Ext
,
8215 DAG
.getConstant(NB
, SDLoc(CRHS
), MVT::i32
));
8221 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
8222 if (LHS
.hasOneUse() && LHS
.getOpcode() == AMDGPUISD::PERM
&&
8223 isa
<ConstantSDNode
>(LHS
.getOperand(2))) {
8224 uint32_t Sel
= getConstantPermuteMask(Mask
);
8228 // Select 0xc for all zero bytes
8229 Sel
= (LHS
.getConstantOperandVal(2) & Sel
) | (~Sel
& 0x0c0c0c0c);
8231 return DAG
.getNode(AMDGPUISD::PERM
, DL
, MVT::i32
, LHS
.getOperand(0),
8232 LHS
.getOperand(1), DAG
.getConstant(Sel
, DL
, MVT::i32
));
8236 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
8237 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
8238 if (LHS
.getOpcode() == ISD::SETCC
&& RHS
.getOpcode() == ISD::SETCC
) {
8239 ISD::CondCode LCC
= cast
<CondCodeSDNode
>(LHS
.getOperand(2))->get();
8240 ISD::CondCode RCC
= cast
<CondCodeSDNode
>(RHS
.getOperand(2))->get();
8242 SDValue X
= LHS
.getOperand(0);
8243 SDValue Y
= RHS
.getOperand(0);
8244 if (Y
.getOpcode() != ISD::FABS
|| Y
.getOperand(0) != X
)
8247 if (LCC
== ISD::SETO
) {
8248 if (X
!= LHS
.getOperand(1))
8251 if (RCC
== ISD::SETUNE
) {
8252 const ConstantFPSDNode
*C1
= dyn_cast
<ConstantFPSDNode
>(RHS
.getOperand(1));
8253 if (!C1
|| !C1
->isInfinity() || C1
->isNegative())
8256 const uint32_t Mask
= SIInstrFlags::N_NORMAL
|
8257 SIInstrFlags::N_SUBNORMAL
|
8258 SIInstrFlags::N_ZERO
|
8259 SIInstrFlags::P_ZERO
|
8260 SIInstrFlags::P_SUBNORMAL
|
8261 SIInstrFlags::P_NORMAL
;
8263 static_assert(((~(SIInstrFlags::S_NAN
|
8264 SIInstrFlags::Q_NAN
|
8265 SIInstrFlags::N_INFINITY
|
8266 SIInstrFlags::P_INFINITY
)) & 0x3ff) == Mask
,
8270 return DAG
.getNode(AMDGPUISD::FP_CLASS
, DL
, MVT::i1
,
8271 X
, DAG
.getConstant(Mask
, DL
, MVT::i32
));
8276 if (RHS
.getOpcode() == ISD::SETCC
&& LHS
.getOpcode() == AMDGPUISD::FP_CLASS
)
8277 std::swap(LHS
, RHS
);
8279 if (LHS
.getOpcode() == ISD::SETCC
&& RHS
.getOpcode() == AMDGPUISD::FP_CLASS
&&
8281 ISD::CondCode LCC
= cast
<CondCodeSDNode
>(LHS
.getOperand(2))->get();
8282 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
8283 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
8284 const ConstantSDNode
*Mask
= dyn_cast
<ConstantSDNode
>(RHS
.getOperand(1));
8285 if ((LCC
== ISD::SETO
|| LCC
== ISD::SETUO
) && Mask
&&
8286 (RHS
.getOperand(0) == LHS
.getOperand(0) &&
8287 LHS
.getOperand(0) == LHS
.getOperand(1))) {
8288 const unsigned OrdMask
= SIInstrFlags::S_NAN
| SIInstrFlags::Q_NAN
;
8289 unsigned NewMask
= LCC
== ISD::SETO
?
8290 Mask
->getZExtValue() & ~OrdMask
:
8291 Mask
->getZExtValue() & OrdMask
;
8294 return DAG
.getNode(AMDGPUISD::FP_CLASS
, DL
, MVT::i1
, RHS
.getOperand(0),
8295 DAG
.getConstant(NewMask
, DL
, MVT::i32
));
8299 if (VT
== MVT::i32
&&
8300 (RHS
.getOpcode() == ISD::SIGN_EXTEND
|| LHS
.getOpcode() == ISD::SIGN_EXTEND
)) {
8301 // and x, (sext cc from i1) => select cc, x, 0
8302 if (RHS
.getOpcode() != ISD::SIGN_EXTEND
)
8303 std::swap(LHS
, RHS
);
8304 if (isBoolSGPR(RHS
.getOperand(0)))
8305 return DAG
.getSelect(SDLoc(N
), MVT::i32
, RHS
.getOperand(0),
8306 LHS
, DAG
.getConstant(0, SDLoc(N
), MVT::i32
));
8309 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
8310 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
8311 if (VT
== MVT::i32
&& LHS
.hasOneUse() && RHS
.hasOneUse() &&
8312 N
->isDivergent() && TII
->pseudoToMCOpcode(AMDGPU::V_PERM_B32
) != -1) {
8313 uint32_t LHSMask
= getPermuteMask(DAG
, LHS
);
8314 uint32_t RHSMask
= getPermuteMask(DAG
, RHS
);
8315 if (LHSMask
!= ~0u && RHSMask
!= ~0u) {
8316 // Canonicalize the expression in an attempt to have fewer unique masks
8317 // and therefore fewer registers used to hold the masks.
8318 if (LHSMask
> RHSMask
) {
8319 std::swap(LHSMask
, RHSMask
);
8320 std::swap(LHS
, RHS
);
8323 // Select 0xc for each lane used from source operand. Zero has 0xc mask
8324 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
8325 uint32_t LHSUsedLanes
= ~(LHSMask
& 0x0c0c0c0c) & 0x0c0c0c0c;
8326 uint32_t RHSUsedLanes
= ~(RHSMask
& 0x0c0c0c0c) & 0x0c0c0c0c;
8328 // Check if we need to combine values from two sources within a byte.
8329 if (!(LHSUsedLanes
& RHSUsedLanes
) &&
8330 // If we select high and lower word keep it for SDWA.
8331 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
8332 !(LHSUsedLanes
== 0x0c0c0000 && RHSUsedLanes
== 0x00000c0c)) {
8333 // Each byte in each mask is either selector mask 0-3, or has higher
8334 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
8335 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
8336 // mask which is not 0xff wins. By anding both masks we have a correct
8337 // result except that 0x0c shall be corrected to give 0x0c only.
8338 uint32_t Mask
= LHSMask
& RHSMask
;
8339 for (unsigned I
= 0; I
< 32; I
+= 8) {
8340 uint32_t ByteSel
= 0xff << I
;
8341 if ((LHSMask
& ByteSel
) == 0x0c || (RHSMask
& ByteSel
) == 0x0c)
8342 Mask
&= (0x0c << I
) & 0xffffffff;
8345 // Add 4 to each active LHS lane. It will not affect any existing 0xff
8347 uint32_t Sel
= Mask
| (LHSUsedLanes
& 0x04040404);
8350 return DAG
.getNode(AMDGPUISD::PERM
, DL
, MVT::i32
,
8351 LHS
.getOperand(0), RHS
.getOperand(0),
8352 DAG
.getConstant(Sel
, DL
, MVT::i32
));
8360 SDValue
SITargetLowering::performOrCombine(SDNode
*N
,
8361 DAGCombinerInfo
&DCI
) const {
8362 SelectionDAG
&DAG
= DCI
.DAG
;
8363 SDValue LHS
= N
->getOperand(0);
8364 SDValue RHS
= N
->getOperand(1);
8366 EVT VT
= N
->getValueType(0);
8367 if (VT
== MVT::i1
) {
8368 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
8369 if (LHS
.getOpcode() == AMDGPUISD::FP_CLASS
&&
8370 RHS
.getOpcode() == AMDGPUISD::FP_CLASS
) {
8371 SDValue Src
= LHS
.getOperand(0);
8372 if (Src
!= RHS
.getOperand(0))
8375 const ConstantSDNode
*CLHS
= dyn_cast
<ConstantSDNode
>(LHS
.getOperand(1));
8376 const ConstantSDNode
*CRHS
= dyn_cast
<ConstantSDNode
>(RHS
.getOperand(1));
8380 // Only 10 bits are used.
8381 static const uint32_t MaxMask
= 0x3ff;
8383 uint32_t NewMask
= (CLHS
->getZExtValue() | CRHS
->getZExtValue()) & MaxMask
;
8385 return DAG
.getNode(AMDGPUISD::FP_CLASS
, DL
, MVT::i1
,
8386 Src
, DAG
.getConstant(NewMask
, DL
, MVT::i32
));
8392 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
8393 if (isa
<ConstantSDNode
>(RHS
) && LHS
.hasOneUse() &&
8394 LHS
.getOpcode() == AMDGPUISD::PERM
&&
8395 isa
<ConstantSDNode
>(LHS
.getOperand(2))) {
8396 uint32_t Sel
= getConstantPermuteMask(N
->getConstantOperandVal(1));
8400 Sel
|= LHS
.getConstantOperandVal(2);
8402 return DAG
.getNode(AMDGPUISD::PERM
, DL
, MVT::i32
, LHS
.getOperand(0),
8403 LHS
.getOperand(1), DAG
.getConstant(Sel
, DL
, MVT::i32
));
8406 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
8407 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
8408 if (VT
== MVT::i32
&& LHS
.hasOneUse() && RHS
.hasOneUse() &&
8409 N
->isDivergent() && TII
->pseudoToMCOpcode(AMDGPU::V_PERM_B32
) != -1) {
8410 uint32_t LHSMask
= getPermuteMask(DAG
, LHS
);
8411 uint32_t RHSMask
= getPermuteMask(DAG
, RHS
);
8412 if (LHSMask
!= ~0u && RHSMask
!= ~0u) {
8413 // Canonicalize the expression in an attempt to have fewer unique masks
8414 // and therefore fewer registers used to hold the masks.
8415 if (LHSMask
> RHSMask
) {
8416 std::swap(LHSMask
, RHSMask
);
8417 std::swap(LHS
, RHS
);
8420 // Select 0xc for each lane used from source operand. Zero has 0xc mask
8421 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
8422 uint32_t LHSUsedLanes
= ~(LHSMask
& 0x0c0c0c0c) & 0x0c0c0c0c;
8423 uint32_t RHSUsedLanes
= ~(RHSMask
& 0x0c0c0c0c) & 0x0c0c0c0c;
8425 // Check of we need to combine values from two sources within a byte.
8426 if (!(LHSUsedLanes
& RHSUsedLanes
) &&
8427 // If we select high and lower word keep it for SDWA.
8428 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
8429 !(LHSUsedLanes
== 0x0c0c0000 && RHSUsedLanes
== 0x00000c0c)) {
8430 // Kill zero bytes selected by other mask. Zero value is 0xc.
8431 LHSMask
&= ~RHSUsedLanes
;
8432 RHSMask
&= ~LHSUsedLanes
;
8433 // Add 4 to each active LHS lane
8434 LHSMask
|= LHSUsedLanes
& 0x04040404;
8436 uint32_t Sel
= LHSMask
| RHSMask
;
8439 return DAG
.getNode(AMDGPUISD::PERM
, DL
, MVT::i32
,
8440 LHS
.getOperand(0), RHS
.getOperand(0),
8441 DAG
.getConstant(Sel
, DL
, MVT::i32
));
8449 // TODO: This could be a generic combine with a predicate for extracting the
8450 // high half of an integer being free.
8452 // (or i64:x, (zero_extend i32:y)) ->
8453 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
8454 if (LHS
.getOpcode() == ISD::ZERO_EXTEND
&&
8455 RHS
.getOpcode() != ISD::ZERO_EXTEND
)
8456 std::swap(LHS
, RHS
);
8458 if (RHS
.getOpcode() == ISD::ZERO_EXTEND
) {
8459 SDValue ExtSrc
= RHS
.getOperand(0);
8460 EVT SrcVT
= ExtSrc
.getValueType();
8461 if (SrcVT
== MVT::i32
) {
8463 SDValue LowLHS
, HiBits
;
8464 std::tie(LowLHS
, HiBits
) = split64BitValue(LHS
, DAG
);
8465 SDValue LowOr
= DAG
.getNode(ISD::OR
, SL
, MVT::i32
, LowLHS
, ExtSrc
);
8467 DCI
.AddToWorklist(LowOr
.getNode());
8468 DCI
.AddToWorklist(HiBits
.getNode());
8470 SDValue Vec
= DAG
.getNode(ISD::BUILD_VECTOR
, SL
, MVT::v2i32
,
8472 return DAG
.getNode(ISD::BITCAST
, SL
, MVT::i64
, Vec
);
8476 const ConstantSDNode
*CRHS
= dyn_cast
<ConstantSDNode
>(N
->getOperand(1));
8479 = splitBinaryBitConstantOp(DCI
, SDLoc(N
), ISD::OR
, LHS
, CRHS
))
8486 SDValue
SITargetLowering::performXorCombine(SDNode
*N
,
8487 DAGCombinerInfo
&DCI
) const {
8488 EVT VT
= N
->getValueType(0);
8492 SDValue LHS
= N
->getOperand(0);
8493 SDValue RHS
= N
->getOperand(1);
8495 const ConstantSDNode
*CRHS
= dyn_cast
<ConstantSDNode
>(RHS
);
8498 = splitBinaryBitConstantOp(DCI
, SDLoc(N
), ISD::XOR
, LHS
, CRHS
))
8505 // Instructions that will be lowered with a final instruction that zeros the
8506 // high result bits.
8507 // XXX - probably only need to list legal operations.
8508 static bool fp16SrcZerosHighBits(unsigned Opc
) {
8517 case ISD::FCANONICALIZE
:
8519 case ISD::UINT_TO_FP
:
8520 case ISD::SINT_TO_FP
:
8522 // Fabs is lowered to a bit operation, but it's an and which will clear the
8523 // high bits anyway.
8537 case ISD::FNEARBYINT
:
8542 case AMDGPUISD::FRACT
:
8543 case AMDGPUISD::CLAMP
:
8544 case AMDGPUISD::COS_HW
:
8545 case AMDGPUISD::SIN_HW
:
8546 case AMDGPUISD::FMIN3
:
8547 case AMDGPUISD::FMAX3
:
8548 case AMDGPUISD::FMED3
:
8549 case AMDGPUISD::FMAD_FTZ
:
8550 case AMDGPUISD::RCP
:
8551 case AMDGPUISD::RSQ
:
8552 case AMDGPUISD::RCP_IFLAG
:
8553 case AMDGPUISD::LDEXP
:
8556 // fcopysign, select and others may be lowered to 32-bit bit operations
8557 // which don't zero the high bits.
8562 SDValue
SITargetLowering::performZeroExtendCombine(SDNode
*N
,
8563 DAGCombinerInfo
&DCI
) const {
8564 if (!Subtarget
->has16BitInsts() ||
8565 DCI
.getDAGCombineLevel() < AfterLegalizeDAG
)
8568 EVT VT
= N
->getValueType(0);
8572 SDValue Src
= N
->getOperand(0);
8573 if (Src
.getValueType() != MVT::i16
)
8576 // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
8577 // FIXME: It is not universally true that the high bits are zeroed on gfx9.
8578 if (Src
.getOpcode() == ISD::BITCAST
) {
8579 SDValue BCSrc
= Src
.getOperand(0);
8580 if (BCSrc
.getValueType() == MVT::f16
&&
8581 fp16SrcZerosHighBits(BCSrc
.getOpcode()))
8582 return DCI
.DAG
.getNode(AMDGPUISD::FP16_ZEXT
, SDLoc(N
), VT
, BCSrc
);
8588 SDValue
SITargetLowering::performSignExtendInRegCombine(SDNode
*N
,
8589 DAGCombinerInfo
&DCI
)
8591 SDValue Src
= N
->getOperand(0);
8592 auto *VTSign
= cast
<VTSDNode
>(N
->getOperand(1));
8594 if (((Src
.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE
&&
8595 VTSign
->getVT() == MVT::i8
) ||
8596 (Src
.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT
&&
8597 VTSign
->getVT() == MVT::i16
)) &&
8599 auto *M
= cast
<MemSDNode
>(Src
);
8601 Src
.getOperand(0), // Chain
8602 Src
.getOperand(1), // rsrc
8603 Src
.getOperand(2), // vindex
8604 Src
.getOperand(3), // voffset
8605 Src
.getOperand(4), // soffset
8606 Src
.getOperand(5), // offset
8610 // replace with BUFFER_LOAD_BYTE/SHORT
8611 SDVTList ResList
= DCI
.DAG
.getVTList(MVT::i32
,
8612 Src
.getOperand(0).getValueType());
8613 unsigned Opc
= (Src
.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE
) ?
8614 AMDGPUISD::BUFFER_LOAD_BYTE
: AMDGPUISD::BUFFER_LOAD_SHORT
;
8615 SDValue BufferLoadSignExt
= DCI
.DAG
.getMemIntrinsicNode(Opc
, SDLoc(N
),
8617 Ops
, M
->getMemoryVT(),
8618 M
->getMemOperand());
8619 return DCI
.DAG
.getMergeValues({BufferLoadSignExt
,
8620 BufferLoadSignExt
.getValue(1)}, SDLoc(N
));
8625 SDValue
SITargetLowering::performClassCombine(SDNode
*N
,
8626 DAGCombinerInfo
&DCI
) const {
8627 SelectionDAG
&DAG
= DCI
.DAG
;
8628 SDValue Mask
= N
->getOperand(1);
8630 // fp_class x, 0 -> false
8631 if (const ConstantSDNode
*CMask
= dyn_cast
<ConstantSDNode
>(Mask
)) {
8632 if (CMask
->isNullValue())
8633 return DAG
.getConstant(0, SDLoc(N
), MVT::i1
);
8636 if (N
->getOperand(0).isUndef())
8637 return DAG
.getUNDEF(MVT::i1
);
8642 SDValue
SITargetLowering::performRcpCombine(SDNode
*N
,
8643 DAGCombinerInfo
&DCI
) const {
8644 EVT VT
= N
->getValueType(0);
8645 SDValue N0
= N
->getOperand(0);
8650 if (VT
== MVT::f32
&& (N0
.getOpcode() == ISD::UINT_TO_FP
||
8651 N0
.getOpcode() == ISD::SINT_TO_FP
)) {
8652 return DCI
.DAG
.getNode(AMDGPUISD::RCP_IFLAG
, SDLoc(N
), VT
, N0
,
8656 return AMDGPUTargetLowering::performRcpCombine(N
, DCI
);
8659 bool SITargetLowering::isCanonicalized(SelectionDAG
&DAG
, SDValue Op
,
8660 unsigned MaxDepth
) const {
8661 unsigned Opcode
= Op
.getOpcode();
8662 if (Opcode
== ISD::FCANONICALIZE
)
8665 if (auto *CFP
= dyn_cast
<ConstantFPSDNode
>(Op
)) {
8666 auto F
= CFP
->getValueAPF();
8667 if (F
.isNaN() && F
.isSignaling())
8669 return !F
.isDenormal() || denormalsEnabledForType(Op
.getValueType());
8672 // If source is a result of another standard FP operation it is already in
8678 // These will flush denorms if required.
8690 case ISD::FP_EXTEND
:
8691 case AMDGPUISD::FMUL_LEGACY
:
8692 case AMDGPUISD::FMAD_FTZ
:
8693 case AMDGPUISD::RCP
:
8694 case AMDGPUISD::RSQ
:
8695 case AMDGPUISD::RSQ_CLAMP
:
8696 case AMDGPUISD::RCP_LEGACY
:
8697 case AMDGPUISD::RSQ_LEGACY
:
8698 case AMDGPUISD::RCP_IFLAG
:
8699 case AMDGPUISD::TRIG_PREOP
:
8700 case AMDGPUISD::DIV_SCALE
:
8701 case AMDGPUISD::DIV_FMAS
:
8702 case AMDGPUISD::DIV_FIXUP
:
8703 case AMDGPUISD::FRACT
:
8704 case AMDGPUISD::LDEXP
:
8705 case AMDGPUISD::CVT_PKRTZ_F16_F32
:
8706 case AMDGPUISD::CVT_F32_UBYTE0
:
8707 case AMDGPUISD::CVT_F32_UBYTE1
:
8708 case AMDGPUISD::CVT_F32_UBYTE2
:
8709 case AMDGPUISD::CVT_F32_UBYTE3
:
8712 // It can/will be lowered or combined as a bit operation.
8713 // Need to check their input recursively to handle.
8716 case ISD::FCOPYSIGN
:
8717 return isCanonicalized(DAG
, Op
.getOperand(0), MaxDepth
- 1);
8722 return Op
.getValueType().getScalarType() != MVT::f16
;
8726 case ISD::FMINNUM_IEEE
:
8727 case ISD::FMAXNUM_IEEE
:
8728 case AMDGPUISD::CLAMP
:
8729 case AMDGPUISD::FMED3
:
8730 case AMDGPUISD::FMAX3
:
8731 case AMDGPUISD::FMIN3
: {
8732 // FIXME: Shouldn't treat the generic operations different based these.
8733 // However, we aren't really required to flush the result from
8736 // snans will be quieted, so we only need to worry about denormals.
8737 if (Subtarget
->supportsMinMaxDenormModes() ||
8738 denormalsEnabledForType(Op
.getValueType()))
8741 // Flushing may be required.
8742 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
8743 // targets need to check their input recursively.
8745 // FIXME: Does this apply with clamp? It's implemented with max.
8746 for (unsigned I
= 0, E
= Op
.getNumOperands(); I
!= E
; ++I
) {
8747 if (!isCanonicalized(DAG
, Op
.getOperand(I
), MaxDepth
- 1))
8754 return isCanonicalized(DAG
, Op
.getOperand(1), MaxDepth
- 1) &&
8755 isCanonicalized(DAG
, Op
.getOperand(2), MaxDepth
- 1);
8757 case ISD::BUILD_VECTOR
: {
8758 for (unsigned i
= 0, e
= Op
.getNumOperands(); i
!= e
; ++i
) {
8759 SDValue SrcOp
= Op
.getOperand(i
);
8760 if (!isCanonicalized(DAG
, SrcOp
, MaxDepth
- 1))
8766 case ISD::EXTRACT_VECTOR_ELT
:
8767 case ISD::EXTRACT_SUBVECTOR
: {
8768 return isCanonicalized(DAG
, Op
.getOperand(0), MaxDepth
- 1);
8770 case ISD::INSERT_VECTOR_ELT
: {
8771 return isCanonicalized(DAG
, Op
.getOperand(0), MaxDepth
- 1) &&
8772 isCanonicalized(DAG
, Op
.getOperand(1), MaxDepth
- 1);
8775 // Could be anything.
8778 case ISD::BITCAST
: {
8779 // Hack round the mess we make when legalizing extract_vector_elt
8780 SDValue Src
= Op
.getOperand(0);
8781 if (Src
.getValueType() == MVT::i16
&&
8782 Src
.getOpcode() == ISD::TRUNCATE
) {
8783 SDValue TruncSrc
= Src
.getOperand(0);
8784 if (TruncSrc
.getValueType() == MVT::i32
&&
8785 TruncSrc
.getOpcode() == ISD::BITCAST
&&
8786 TruncSrc
.getOperand(0).getValueType() == MVT::v2f16
) {
8787 return isCanonicalized(DAG
, TruncSrc
.getOperand(0), MaxDepth
- 1);
8793 case ISD::INTRINSIC_WO_CHAIN
: {
8794 unsigned IntrinsicID
8795 = cast
<ConstantSDNode
>(Op
.getOperand(0))->getZExtValue();
8796 // TODO: Handle more intrinsics
8797 switch (IntrinsicID
) {
8798 case Intrinsic::amdgcn_cvt_pkrtz
:
8799 case Intrinsic::amdgcn_cubeid
:
8800 case Intrinsic::amdgcn_frexp_mant
:
8801 case Intrinsic::amdgcn_fdot2
:
8810 return denormalsEnabledForType(Op
.getValueType()) &&
8811 DAG
.isKnownNeverSNaN(Op
);
8814 llvm_unreachable("invalid operation");
8817 // Constant fold canonicalize.
8818 SDValue
SITargetLowering::getCanonicalConstantFP(
8819 SelectionDAG
&DAG
, const SDLoc
&SL
, EVT VT
, const APFloat
&C
) const {
8820 // Flush denormals to 0 if not enabled.
8821 if (C
.isDenormal() && !denormalsEnabledForType(VT
))
8822 return DAG
.getConstantFP(0.0, SL
, VT
);
8825 APFloat CanonicalQNaN
= APFloat::getQNaN(C
.getSemantics());
8826 if (C
.isSignaling()) {
8827 // Quiet a signaling NaN.
8828 // FIXME: Is this supposed to preserve payload bits?
8829 return DAG
.getConstantFP(CanonicalQNaN
, SL
, VT
);
8832 // Make sure it is the canonical NaN bitpattern.
8834 // TODO: Can we use -1 as the canonical NaN value since it's an inline
8836 if (C
.bitcastToAPInt() != CanonicalQNaN
.bitcastToAPInt())
8837 return DAG
.getConstantFP(CanonicalQNaN
, SL
, VT
);
8840 // Already canonical.
8841 return DAG
.getConstantFP(C
, SL
, VT
);
8844 static bool vectorEltWillFoldAway(SDValue Op
) {
8845 return Op
.isUndef() || isa
<ConstantFPSDNode
>(Op
);
8848 SDValue
SITargetLowering::performFCanonicalizeCombine(
8850 DAGCombinerInfo
&DCI
) const {
8851 SelectionDAG
&DAG
= DCI
.DAG
;
8852 SDValue N0
= N
->getOperand(0);
8853 EVT VT
= N
->getValueType(0);
8855 // fcanonicalize undef -> qnan
8857 APFloat QNaN
= APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT
));
8858 return DAG
.getConstantFP(QNaN
, SDLoc(N
), VT
);
8861 if (ConstantFPSDNode
*CFP
= isConstOrConstSplatFP(N0
)) {
8862 EVT VT
= N
->getValueType(0);
8863 return getCanonicalConstantFP(DAG
, SDLoc(N
), VT
, CFP
->getValueAPF());
8866 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
8867 // (fcanonicalize k)
8869 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
8871 // TODO: This could be better with wider vectors that will be split to v2f16,
8872 // and to consider uses since there aren't that many packed operations.
8873 if (N0
.getOpcode() == ISD::BUILD_VECTOR
&& VT
== MVT::v2f16
&&
8874 isTypeLegal(MVT::v2f16
)) {
8877 SDValue Lo
= N0
.getOperand(0);
8878 SDValue Hi
= N0
.getOperand(1);
8879 EVT EltVT
= Lo
.getValueType();
8881 if (vectorEltWillFoldAway(Lo
) || vectorEltWillFoldAway(Hi
)) {
8882 for (unsigned I
= 0; I
!= 2; ++I
) {
8883 SDValue Op
= N0
.getOperand(I
);
8884 if (ConstantFPSDNode
*CFP
= dyn_cast
<ConstantFPSDNode
>(Op
)) {
8885 NewElts
[I
] = getCanonicalConstantFP(DAG
, SL
, EltVT
,
8886 CFP
->getValueAPF());
8887 } else if (Op
.isUndef()) {
8888 // Handled below based on what the other operand is.
8891 NewElts
[I
] = DAG
.getNode(ISD::FCANONICALIZE
, SL
, EltVT
, Op
);
8895 // If one half is undef, and one is constant, perfer a splat vector rather
8896 // than the normal qNaN. If it's a register, prefer 0.0 since that's
8897 // cheaper to use and may be free with a packed operation.
8898 if (NewElts
[0].isUndef()) {
8899 if (isa
<ConstantFPSDNode
>(NewElts
[1]))
8900 NewElts
[0] = isa
<ConstantFPSDNode
>(NewElts
[1]) ?
8901 NewElts
[1]: DAG
.getConstantFP(0.0f
, SL
, EltVT
);
8904 if (NewElts
[1].isUndef()) {
8905 NewElts
[1] = isa
<ConstantFPSDNode
>(NewElts
[0]) ?
8906 NewElts
[0] : DAG
.getConstantFP(0.0f
, SL
, EltVT
);
8909 return DAG
.getBuildVector(VT
, SL
, NewElts
);
8913 unsigned SrcOpc
= N0
.getOpcode();
8915 // If it's free to do so, push canonicalizes further up the source, which may
8916 // find a canonical source.
8918 // TODO: More opcodes. Note this is unsafe for the the _ieee minnum/maxnum for
8920 if (SrcOpc
== ISD::FMINNUM
|| SrcOpc
== ISD::FMAXNUM
) {
8921 auto *CRHS
= dyn_cast
<ConstantFPSDNode
>(N0
.getOperand(1));
8922 if (CRHS
&& N0
.hasOneUse()) {
8924 SDValue Canon0
= DAG
.getNode(ISD::FCANONICALIZE
, SL
, VT
,
8926 SDValue Canon1
= getCanonicalConstantFP(DAG
, SL
, VT
, CRHS
->getValueAPF());
8927 DCI
.AddToWorklist(Canon0
.getNode());
8929 return DAG
.getNode(N0
.getOpcode(), SL
, VT
, Canon0
, Canon1
);
8933 return isCanonicalized(DAG
, N0
) ? N0
: SDValue();
8936 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc
) {
8939 case ISD::FMAXNUM_IEEE
:
8940 return AMDGPUISD::FMAX3
;
8942 return AMDGPUISD::SMAX3
;
8944 return AMDGPUISD::UMAX3
;
8946 case ISD::FMINNUM_IEEE
:
8947 return AMDGPUISD::FMIN3
;
8949 return AMDGPUISD::SMIN3
;
8951 return AMDGPUISD::UMIN3
;
8953 llvm_unreachable("Not a min/max opcode");
8957 SDValue
SITargetLowering::performIntMed3ImmCombine(
8958 SelectionDAG
&DAG
, const SDLoc
&SL
,
8959 SDValue Op0
, SDValue Op1
, bool Signed
) const {
8960 ConstantSDNode
*K1
= dyn_cast
<ConstantSDNode
>(Op1
);
8964 ConstantSDNode
*K0
= dyn_cast
<ConstantSDNode
>(Op0
.getOperand(1));
8969 if (K0
->getAPIntValue().sge(K1
->getAPIntValue()))
8972 if (K0
->getAPIntValue().uge(K1
->getAPIntValue()))
8976 EVT VT
= K0
->getValueType(0);
8977 unsigned Med3Opc
= Signed
? AMDGPUISD::SMED3
: AMDGPUISD::UMED3
;
8978 if (VT
== MVT::i32
|| (VT
== MVT::i16
&& Subtarget
->hasMed3_16())) {
8979 return DAG
.getNode(Med3Opc
, SL
, VT
,
8980 Op0
.getOperand(0), SDValue(K0
, 0), SDValue(K1
, 0));
8983 // If there isn't a 16-bit med3 operation, convert to 32-bit.
8985 unsigned ExtOp
= Signed
? ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND
;
8987 SDValue Tmp1
= DAG
.getNode(ExtOp
, SL
, NVT
, Op0
->getOperand(0));
8988 SDValue Tmp2
= DAG
.getNode(ExtOp
, SL
, NVT
, Op0
->getOperand(1));
8989 SDValue Tmp3
= DAG
.getNode(ExtOp
, SL
, NVT
, Op1
);
8991 SDValue Med3
= DAG
.getNode(Med3Opc
, SL
, NVT
, Tmp1
, Tmp2
, Tmp3
);
8992 return DAG
.getNode(ISD::TRUNCATE
, SL
, VT
, Med3
);
8995 static ConstantFPSDNode
*getSplatConstantFP(SDValue Op
) {
8996 if (ConstantFPSDNode
*C
= dyn_cast
<ConstantFPSDNode
>(Op
))
8999 if (BuildVectorSDNode
*BV
= dyn_cast
<BuildVectorSDNode
>(Op
)) {
9000 if (ConstantFPSDNode
*C
= BV
->getConstantFPSplatNode())
9007 SDValue
SITargetLowering::performFPMed3ImmCombine(SelectionDAG
&DAG
,
9010 SDValue Op1
) const {
9011 ConstantFPSDNode
*K1
= getSplatConstantFP(Op1
);
9015 ConstantFPSDNode
*K0
= getSplatConstantFP(Op0
.getOperand(1));
9019 // Ordered >= (although NaN inputs should have folded away by now).
9020 APFloat::cmpResult Cmp
= K0
->getValueAPF().compare(K1
->getValueAPF());
9021 if (Cmp
== APFloat::cmpGreaterThan
)
9024 const MachineFunction
&MF
= DAG
.getMachineFunction();
9025 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
9027 // TODO: Check IEEE bit enabled?
9028 EVT VT
= Op0
.getValueType();
9029 if (Info
->getMode().DX10Clamp
) {
9030 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
9031 // hardware fmed3 behavior converting to a min.
9032 // FIXME: Should this be allowing -0.0?
9033 if (K1
->isExactlyValue(1.0) && K0
->isExactlyValue(0.0))
9034 return DAG
.getNode(AMDGPUISD::CLAMP
, SL
, VT
, Op0
.getOperand(0));
9037 // med3 for f16 is only available on gfx9+, and not available for v2f16.
9038 if (VT
== MVT::f32
|| (VT
== MVT::f16
&& Subtarget
->hasMed3_16())) {
9039 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
9040 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
9041 // then give the other result, which is different from med3 with a NaN
9043 SDValue Var
= Op0
.getOperand(0);
9044 if (!DAG
.isKnownNeverSNaN(Var
))
9047 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
9049 if ((!K0
->hasOneUse() ||
9050 TII
->isInlineConstant(K0
->getValueAPF().bitcastToAPInt())) &&
9051 (!K1
->hasOneUse() ||
9052 TII
->isInlineConstant(K1
->getValueAPF().bitcastToAPInt()))) {
9053 return DAG
.getNode(AMDGPUISD::FMED3
, SL
, K0
->getValueType(0),
9054 Var
, SDValue(K0
, 0), SDValue(K1
, 0));
9061 SDValue
SITargetLowering::performMinMaxCombine(SDNode
*N
,
9062 DAGCombinerInfo
&DCI
) const {
9063 SelectionDAG
&DAG
= DCI
.DAG
;
9065 EVT VT
= N
->getValueType(0);
9066 unsigned Opc
= N
->getOpcode();
9067 SDValue Op0
= N
->getOperand(0);
9068 SDValue Op1
= N
->getOperand(1);
9070 // Only do this if the inner op has one use since this will just increases
9071 // register pressure for no benefit.
9073 if (Opc
!= AMDGPUISD::FMIN_LEGACY
&& Opc
!= AMDGPUISD::FMAX_LEGACY
&&
9075 (VT
== MVT::i32
|| VT
== MVT::f32
||
9076 ((VT
== MVT::f16
|| VT
== MVT::i16
) && Subtarget
->hasMin3Max3_16()))) {
9077 // max(max(a, b), c) -> max3(a, b, c)
9078 // min(min(a, b), c) -> min3(a, b, c)
9079 if (Op0
.getOpcode() == Opc
&& Op0
.hasOneUse()) {
9081 return DAG
.getNode(minMaxOpcToMin3Max3Opc(Opc
),
9090 // max(a, max(b, c)) -> max3(a, b, c)
9091 // min(a, min(b, c)) -> min3(a, b, c)
9092 if (Op1
.getOpcode() == Opc
&& Op1
.hasOneUse()) {
9094 return DAG
.getNode(minMaxOpcToMin3Max3Opc(Opc
),
9103 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
9104 if (Opc
== ISD::SMIN
&& Op0
.getOpcode() == ISD::SMAX
&& Op0
.hasOneUse()) {
9105 if (SDValue Med3
= performIntMed3ImmCombine(DAG
, SDLoc(N
), Op0
, Op1
, true))
9109 if (Opc
== ISD::UMIN
&& Op0
.getOpcode() == ISD::UMAX
&& Op0
.hasOneUse()) {
9110 if (SDValue Med3
= performIntMed3ImmCombine(DAG
, SDLoc(N
), Op0
, Op1
, false))
9114 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
9115 if (((Opc
== ISD::FMINNUM
&& Op0
.getOpcode() == ISD::FMAXNUM
) ||
9116 (Opc
== ISD::FMINNUM_IEEE
&& Op0
.getOpcode() == ISD::FMAXNUM_IEEE
) ||
9117 (Opc
== AMDGPUISD::FMIN_LEGACY
&&
9118 Op0
.getOpcode() == AMDGPUISD::FMAX_LEGACY
)) &&
9119 (VT
== MVT::f32
|| VT
== MVT::f64
||
9120 (VT
== MVT::f16
&& Subtarget
->has16BitInsts()) ||
9121 (VT
== MVT::v2f16
&& Subtarget
->hasVOP3PInsts())) &&
9123 if (SDValue Res
= performFPMed3ImmCombine(DAG
, SDLoc(N
), Op0
, Op1
))
9130 static bool isClampZeroToOne(SDValue A
, SDValue B
) {
9131 if (ConstantFPSDNode
*CA
= dyn_cast
<ConstantFPSDNode
>(A
)) {
9132 if (ConstantFPSDNode
*CB
= dyn_cast
<ConstantFPSDNode
>(B
)) {
9133 // FIXME: Should this be allowing -0.0?
9134 return (CA
->isExactlyValue(0.0) && CB
->isExactlyValue(1.0)) ||
9135 (CA
->isExactlyValue(1.0) && CB
->isExactlyValue(0.0));
9142 // FIXME: Should only worry about snans for version with chain.
9143 SDValue
SITargetLowering::performFMed3Combine(SDNode
*N
,
9144 DAGCombinerInfo
&DCI
) const {
9145 EVT VT
= N
->getValueType(0);
9146 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
9147 // NaNs. With a NaN input, the order of the operands may change the result.
9149 SelectionDAG
&DAG
= DCI
.DAG
;
9152 SDValue Src0
= N
->getOperand(0);
9153 SDValue Src1
= N
->getOperand(1);
9154 SDValue Src2
= N
->getOperand(2);
9156 if (isClampZeroToOne(Src0
, Src1
)) {
9157 // const_a, const_b, x -> clamp is safe in all cases including signaling
9159 // FIXME: Should this be allowing -0.0?
9160 return DAG
.getNode(AMDGPUISD::CLAMP
, SL
, VT
, Src2
);
9163 const MachineFunction
&MF
= DAG
.getMachineFunction();
9164 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
9166 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
9167 // handling no dx10-clamp?
9168 if (Info
->getMode().DX10Clamp
) {
9169 // If NaNs is clamped to 0, we are free to reorder the inputs.
9171 if (isa
<ConstantFPSDNode
>(Src0
) && !isa
<ConstantFPSDNode
>(Src1
))
9172 std::swap(Src0
, Src1
);
9174 if (isa
<ConstantFPSDNode
>(Src1
) && !isa
<ConstantFPSDNode
>(Src2
))
9175 std::swap(Src1
, Src2
);
9177 if (isa
<ConstantFPSDNode
>(Src0
) && !isa
<ConstantFPSDNode
>(Src1
))
9178 std::swap(Src0
, Src1
);
9180 if (isClampZeroToOne(Src1
, Src2
))
9181 return DAG
.getNode(AMDGPUISD::CLAMP
, SL
, VT
, Src0
);
9187 SDValue
SITargetLowering::performCvtPkRTZCombine(SDNode
*N
,
9188 DAGCombinerInfo
&DCI
) const {
9189 SDValue Src0
= N
->getOperand(0);
9190 SDValue Src1
= N
->getOperand(1);
9191 if (Src0
.isUndef() && Src1
.isUndef())
9192 return DCI
.DAG
.getUNDEF(N
->getValueType(0));
9196 SDValue
SITargetLowering::performExtractVectorEltCombine(
9197 SDNode
*N
, DAGCombinerInfo
&DCI
) const {
9198 SDValue Vec
= N
->getOperand(0);
9199 SelectionDAG
&DAG
= DCI
.DAG
;
9201 EVT VecVT
= Vec
.getValueType();
9202 EVT EltVT
= VecVT
.getVectorElementType();
9204 if ((Vec
.getOpcode() == ISD::FNEG
||
9205 Vec
.getOpcode() == ISD::FABS
) && allUsesHaveSourceMods(N
)) {
9207 EVT EltVT
= N
->getValueType(0);
9208 SDValue Idx
= N
->getOperand(1);
9209 SDValue Elt
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
,
9210 Vec
.getOperand(0), Idx
);
9211 return DAG
.getNode(Vec
.getOpcode(), SL
, EltVT
, Elt
);
9214 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
9216 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
9217 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
9218 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
9219 if (Vec
.hasOneUse() && DCI
.isBeforeLegalize()) {
9221 EVT EltVT
= N
->getValueType(0);
9222 SDValue Idx
= N
->getOperand(1);
9223 unsigned Opc
= Vec
.getOpcode();
9228 // TODO: Support other binary operations.
9239 case ISD::FMAXNUM_IEEE
:
9240 case ISD::FMINNUM_IEEE
: {
9241 SDValue Elt0
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
,
9242 Vec
.getOperand(0), Idx
);
9243 SDValue Elt1
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
,
9244 Vec
.getOperand(1), Idx
);
9246 DCI
.AddToWorklist(Elt0
.getNode());
9247 DCI
.AddToWorklist(Elt1
.getNode());
9248 return DAG
.getNode(Opc
, SL
, EltVT
, Elt0
, Elt1
, Vec
->getFlags());
9253 unsigned VecSize
= VecVT
.getSizeInBits();
9254 unsigned EltSize
= EltVT
.getSizeInBits();
9256 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
9257 // This elminates non-constant index and subsequent movrel or scratch access.
9258 // Sub-dword vectors of size 2 dword or less have better implementation.
9259 // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
9261 if (VecSize
<= 256 && (VecSize
> 64 || EltSize
>= 32) &&
9262 !isa
<ConstantSDNode
>(N
->getOperand(1))) {
9264 SDValue Idx
= N
->getOperand(1);
9265 EVT IdxVT
= Idx
.getValueType();
9267 for (unsigned I
= 0, E
= VecVT
.getVectorNumElements(); I
< E
; ++I
) {
9268 SDValue IC
= DAG
.getConstant(I
, SL
, IdxVT
);
9269 SDValue Elt
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
, Vec
, IC
);
9273 V
= DAG
.getSelectCC(SL
, Idx
, IC
, Elt
, V
, ISD::SETEQ
);
9278 if (!DCI
.isBeforeLegalize())
9281 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
9282 // elements. This exposes more load reduction opportunities by replacing
9283 // multiple small extract_vector_elements with a single 32-bit extract.
9284 auto *Idx
= dyn_cast
<ConstantSDNode
>(N
->getOperand(1));
9285 if (isa
<MemSDNode
>(Vec
) &&
9287 EltVT
.isByteSized() &&
9289 VecSize
% 32 == 0 &&
9291 EVT NewVT
= getEquivalentMemType(*DAG
.getContext(), VecVT
);
9293 unsigned BitIndex
= Idx
->getZExtValue() * EltSize
;
9294 unsigned EltIdx
= BitIndex
/ 32;
9295 unsigned LeftoverBitIdx
= BitIndex
% 32;
9298 SDValue Cast
= DAG
.getNode(ISD::BITCAST
, SL
, NewVT
, Vec
);
9299 DCI
.AddToWorklist(Cast
.getNode());
9301 SDValue Elt
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, Cast
,
9302 DAG
.getConstant(EltIdx
, SL
, MVT::i32
));
9303 DCI
.AddToWorklist(Elt
.getNode());
9304 SDValue Srl
= DAG
.getNode(ISD::SRL
, SL
, MVT::i32
, Elt
,
9305 DAG
.getConstant(LeftoverBitIdx
, SL
, MVT::i32
));
9306 DCI
.AddToWorklist(Srl
.getNode());
9308 SDValue Trunc
= DAG
.getNode(ISD::TRUNCATE
, SL
, EltVT
.changeTypeToInteger(), Srl
);
9309 DCI
.AddToWorklist(Trunc
.getNode());
9310 return DAG
.getNode(ISD::BITCAST
, SL
, EltVT
, Trunc
);
9317 SITargetLowering::performInsertVectorEltCombine(SDNode
*N
,
9318 DAGCombinerInfo
&DCI
) const {
9319 SDValue Vec
= N
->getOperand(0);
9320 SDValue Idx
= N
->getOperand(2);
9321 EVT VecVT
= Vec
.getValueType();
9322 EVT EltVT
= VecVT
.getVectorElementType();
9323 unsigned VecSize
= VecVT
.getSizeInBits();
9324 unsigned EltSize
= EltVT
.getSizeInBits();
9326 // INSERT_VECTOR_ELT (<n x e>, var-idx)
9327 // => BUILD_VECTOR n x select (e, const-idx)
9328 // This elminates non-constant index and subsequent movrel or scratch access.
9329 // Sub-dword vectors of size 2 dword or less have better implementation.
9330 // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
9332 if (isa
<ConstantSDNode
>(Idx
) ||
9333 VecSize
> 256 || (VecSize
<= 64 && EltSize
< 32))
9336 SelectionDAG
&DAG
= DCI
.DAG
;
9338 SDValue Ins
= N
->getOperand(1);
9339 EVT IdxVT
= Idx
.getValueType();
9341 SmallVector
<SDValue
, 16> Ops
;
9342 for (unsigned I
= 0, E
= VecVT
.getVectorNumElements(); I
< E
; ++I
) {
9343 SDValue IC
= DAG
.getConstant(I
, SL
, IdxVT
);
9344 SDValue Elt
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
, Vec
, IC
);
9345 SDValue V
= DAG
.getSelectCC(SL
, Idx
, IC
, Ins
, Elt
, ISD::SETEQ
);
9349 return DAG
.getBuildVector(VecVT
, SL
, Ops
);
9352 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG
&DAG
,
9354 const SDNode
*N1
) const {
9355 EVT VT
= N0
->getValueType(0);
9357 // Only do this if we are not trying to support denormals. v_mad_f32 does not
9358 // support denormals ever.
9359 if (((VT
== MVT::f32
&& !Subtarget
->hasFP32Denormals()) ||
9360 (VT
== MVT::f16
&& !Subtarget
->hasFP16Denormals() &&
9361 getSubtarget()->hasMadF16())) &&
9362 isOperationLegal(ISD::FMAD
, VT
))
9365 const TargetOptions
&Options
= DAG
.getTarget().Options
;
9366 if ((Options
.AllowFPOpFusion
== FPOpFusion::Fast
|| Options
.UnsafeFPMath
||
9367 (N0
->getFlags().hasAllowContract() &&
9368 N1
->getFlags().hasAllowContract())) &&
9369 isFMAFasterThanFMulAndFAdd(VT
)) {
9376 // For a reassociatable opcode perform:
9377 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform
9378 SDValue
SITargetLowering::reassociateScalarOps(SDNode
*N
,
9379 SelectionDAG
&DAG
) const {
9380 EVT VT
= N
->getValueType(0);
9381 if (VT
!= MVT::i32
&& VT
!= MVT::i64
)
9384 unsigned Opc
= N
->getOpcode();
9385 SDValue Op0
= N
->getOperand(0);
9386 SDValue Op1
= N
->getOperand(1);
9388 if (!(Op0
->isDivergent() ^ Op1
->isDivergent()))
9391 if (Op0
->isDivergent())
9392 std::swap(Op0
, Op1
);
9394 if (Op1
.getOpcode() != Opc
|| !Op1
.hasOneUse())
9397 SDValue Op2
= Op1
.getOperand(1);
9398 Op1
= Op1
.getOperand(0);
9399 if (!(Op1
->isDivergent() ^ Op2
->isDivergent()))
9402 if (Op1
->isDivergent())
9403 std::swap(Op1
, Op2
);
9405 // If either operand is constant this will conflict with
9406 // DAGCombiner::ReassociateOps().
9407 if (DAG
.isConstantIntBuildVectorOrConstantInt(Op0
) ||
9408 DAG
.isConstantIntBuildVectorOrConstantInt(Op1
))
9412 SDValue Add1
= DAG
.getNode(Opc
, SL
, VT
, Op0
, Op1
);
9413 return DAG
.getNode(Opc
, SL
, VT
, Add1
, Op2
);
9416 static SDValue
getMad64_32(SelectionDAG
&DAG
, const SDLoc
&SL
,
9418 SDValue N0
, SDValue N1
, SDValue N2
,
9420 unsigned MadOpc
= Signed
? AMDGPUISD::MAD_I64_I32
: AMDGPUISD::MAD_U64_U32
;
9421 SDVTList VTs
= DAG
.getVTList(MVT::i64
, MVT::i1
);
9422 SDValue Mad
= DAG
.getNode(MadOpc
, SL
, VTs
, N0
, N1
, N2
);
9423 return DAG
.getNode(ISD::TRUNCATE
, SL
, VT
, Mad
);
9426 SDValue
SITargetLowering::performAddCombine(SDNode
*N
,
9427 DAGCombinerInfo
&DCI
) const {
9428 SelectionDAG
&DAG
= DCI
.DAG
;
9429 EVT VT
= N
->getValueType(0);
9431 SDValue LHS
= N
->getOperand(0);
9432 SDValue RHS
= N
->getOperand(1);
9434 if ((LHS
.getOpcode() == ISD::MUL
|| RHS
.getOpcode() == ISD::MUL
)
9435 && Subtarget
->hasMad64_32() &&
9436 !VT
.isVector() && VT
.getScalarSizeInBits() > 32 &&
9437 VT
.getScalarSizeInBits() <= 64) {
9438 if (LHS
.getOpcode() != ISD::MUL
)
9439 std::swap(LHS
, RHS
);
9441 SDValue MulLHS
= LHS
.getOperand(0);
9442 SDValue MulRHS
= LHS
.getOperand(1);
9443 SDValue AddRHS
= RHS
;
9445 // TODO: Maybe restrict if SGPR inputs.
9446 if (numBitsUnsigned(MulLHS
, DAG
) <= 32 &&
9447 numBitsUnsigned(MulRHS
, DAG
) <= 32) {
9448 MulLHS
= DAG
.getZExtOrTrunc(MulLHS
, SL
, MVT::i32
);
9449 MulRHS
= DAG
.getZExtOrTrunc(MulRHS
, SL
, MVT::i32
);
9450 AddRHS
= DAG
.getZExtOrTrunc(AddRHS
, SL
, MVT::i64
);
9451 return getMad64_32(DAG
, SL
, VT
, MulLHS
, MulRHS
, AddRHS
, false);
9454 if (numBitsSigned(MulLHS
, DAG
) < 32 && numBitsSigned(MulRHS
, DAG
) < 32) {
9455 MulLHS
= DAG
.getSExtOrTrunc(MulLHS
, SL
, MVT::i32
);
9456 MulRHS
= DAG
.getSExtOrTrunc(MulRHS
, SL
, MVT::i32
);
9457 AddRHS
= DAG
.getSExtOrTrunc(AddRHS
, SL
, MVT::i64
);
9458 return getMad64_32(DAG
, SL
, VT
, MulLHS
, MulRHS
, AddRHS
, true);
9464 if (SDValue V
= reassociateScalarOps(N
, DAG
)) {
9468 if (VT
!= MVT::i32
|| !DCI
.isAfterLegalizeDAG())
9471 // add x, zext (setcc) => addcarry x, 0, setcc
9472 // add x, sext (setcc) => subcarry x, 0, setcc
9473 unsigned Opc
= LHS
.getOpcode();
9474 if (Opc
== ISD::ZERO_EXTEND
|| Opc
== ISD::SIGN_EXTEND
||
9475 Opc
== ISD::ANY_EXTEND
|| Opc
== ISD::ADDCARRY
)
9476 std::swap(RHS
, LHS
);
9478 Opc
= RHS
.getOpcode();
9481 case ISD::ZERO_EXTEND
:
9482 case ISD::SIGN_EXTEND
:
9483 case ISD::ANY_EXTEND
: {
9484 auto Cond
= RHS
.getOperand(0);
9485 if (!isBoolSGPR(Cond
))
9487 SDVTList VTList
= DAG
.getVTList(MVT::i32
, MVT::i1
);
9488 SDValue Args
[] = { LHS
, DAG
.getConstant(0, SL
, MVT::i32
), Cond
};
9489 Opc
= (Opc
== ISD::SIGN_EXTEND
) ? ISD::SUBCARRY
: ISD::ADDCARRY
;
9490 return DAG
.getNode(Opc
, SL
, VTList
, Args
);
9492 case ISD::ADDCARRY
: {
9493 // add x, (addcarry y, 0, cc) => addcarry x, y, cc
9494 auto C
= dyn_cast
<ConstantSDNode
>(RHS
.getOperand(1));
9495 if (!C
|| C
->getZExtValue() != 0) break;
9496 SDValue Args
[] = { LHS
, RHS
.getOperand(0), RHS
.getOperand(2) };
9497 return DAG
.getNode(ISD::ADDCARRY
, SDLoc(N
), RHS
->getVTList(), Args
);
9503 SDValue
SITargetLowering::performSubCombine(SDNode
*N
,
9504 DAGCombinerInfo
&DCI
) const {
9505 SelectionDAG
&DAG
= DCI
.DAG
;
9506 EVT VT
= N
->getValueType(0);
9512 SDValue LHS
= N
->getOperand(0);
9513 SDValue RHS
= N
->getOperand(1);
9515 if (LHS
.getOpcode() == ISD::SUBCARRY
) {
9516 // sub (subcarry x, 0, cc), y => subcarry x, y, cc
9517 auto C
= dyn_cast
<ConstantSDNode
>(LHS
.getOperand(1));
9518 if (!C
|| !C
->isNullValue())
9520 SDValue Args
[] = { LHS
.getOperand(0), RHS
, LHS
.getOperand(2) };
9521 return DAG
.getNode(ISD::SUBCARRY
, SDLoc(N
), LHS
->getVTList(), Args
);
9526 SDValue
SITargetLowering::performAddCarrySubCarryCombine(SDNode
*N
,
9527 DAGCombinerInfo
&DCI
) const {
9529 if (N
->getValueType(0) != MVT::i32
)
9532 auto C
= dyn_cast
<ConstantSDNode
>(N
->getOperand(1));
9533 if (!C
|| C
->getZExtValue() != 0)
9536 SelectionDAG
&DAG
= DCI
.DAG
;
9537 SDValue LHS
= N
->getOperand(0);
9539 // addcarry (add x, y), 0, cc => addcarry x, y, cc
9540 // subcarry (sub x, y), 0, cc => subcarry x, y, cc
9541 unsigned LHSOpc
= LHS
.getOpcode();
9542 unsigned Opc
= N
->getOpcode();
9543 if ((LHSOpc
== ISD::ADD
&& Opc
== ISD::ADDCARRY
) ||
9544 (LHSOpc
== ISD::SUB
&& Opc
== ISD::SUBCARRY
)) {
9545 SDValue Args
[] = { LHS
.getOperand(0), LHS
.getOperand(1), N
->getOperand(2) };
9546 return DAG
.getNode(Opc
, SDLoc(N
), N
->getVTList(), Args
);
9551 SDValue
SITargetLowering::performFAddCombine(SDNode
*N
,
9552 DAGCombinerInfo
&DCI
) const {
9553 if (DCI
.getDAGCombineLevel() < AfterLegalizeDAG
)
9556 SelectionDAG
&DAG
= DCI
.DAG
;
9557 EVT VT
= N
->getValueType(0);
9560 SDValue LHS
= N
->getOperand(0);
9561 SDValue RHS
= N
->getOperand(1);
9563 // These should really be instruction patterns, but writing patterns with
9564 // source modiifiers is a pain.
9566 // fadd (fadd (a, a), b) -> mad 2.0, a, b
9567 if (LHS
.getOpcode() == ISD::FADD
) {
9568 SDValue A
= LHS
.getOperand(0);
9569 if (A
== LHS
.getOperand(1)) {
9570 unsigned FusedOp
= getFusedOpcode(DAG
, N
, LHS
.getNode());
9572 const SDValue Two
= DAG
.getConstantFP(2.0, SL
, VT
);
9573 return DAG
.getNode(FusedOp
, SL
, VT
, A
, Two
, RHS
);
9578 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
9579 if (RHS
.getOpcode() == ISD::FADD
) {
9580 SDValue A
= RHS
.getOperand(0);
9581 if (A
== RHS
.getOperand(1)) {
9582 unsigned FusedOp
= getFusedOpcode(DAG
, N
, RHS
.getNode());
9584 const SDValue Two
= DAG
.getConstantFP(2.0, SL
, VT
);
9585 return DAG
.getNode(FusedOp
, SL
, VT
, A
, Two
, LHS
);
9593 SDValue
SITargetLowering::performFSubCombine(SDNode
*N
,
9594 DAGCombinerInfo
&DCI
) const {
9595 if (DCI
.getDAGCombineLevel() < AfterLegalizeDAG
)
9598 SelectionDAG
&DAG
= DCI
.DAG
;
9600 EVT VT
= N
->getValueType(0);
9601 assert(!VT
.isVector());
9603 // Try to get the fneg to fold into the source modifier. This undoes generic
9604 // DAG combines and folds them into the mad.
9606 // Only do this if we are not trying to support denormals. v_mad_f32 does
9607 // not support denormals ever.
9608 SDValue LHS
= N
->getOperand(0);
9609 SDValue RHS
= N
->getOperand(1);
9610 if (LHS
.getOpcode() == ISD::FADD
) {
9611 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
9612 SDValue A
= LHS
.getOperand(0);
9613 if (A
== LHS
.getOperand(1)) {
9614 unsigned FusedOp
= getFusedOpcode(DAG
, N
, LHS
.getNode());
9616 const SDValue Two
= DAG
.getConstantFP(2.0, SL
, VT
);
9617 SDValue NegRHS
= DAG
.getNode(ISD::FNEG
, SL
, VT
, RHS
);
9619 return DAG
.getNode(FusedOp
, SL
, VT
, A
, Two
, NegRHS
);
9624 if (RHS
.getOpcode() == ISD::FADD
) {
9625 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
9627 SDValue A
= RHS
.getOperand(0);
9628 if (A
== RHS
.getOperand(1)) {
9629 unsigned FusedOp
= getFusedOpcode(DAG
, N
, RHS
.getNode());
9631 const SDValue NegTwo
= DAG
.getConstantFP(-2.0, SL
, VT
);
9632 return DAG
.getNode(FusedOp
, SL
, VT
, A
, NegTwo
, LHS
);
9640 SDValue
SITargetLowering::performFMACombine(SDNode
*N
,
9641 DAGCombinerInfo
&DCI
) const {
9642 SelectionDAG
&DAG
= DCI
.DAG
;
9643 EVT VT
= N
->getValueType(0);
9646 if (!Subtarget
->hasDot2Insts() || VT
!= MVT::f32
)
9649 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
9650 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
9651 SDValue Op1
= N
->getOperand(0);
9652 SDValue Op2
= N
->getOperand(1);
9653 SDValue FMA
= N
->getOperand(2);
9655 if (FMA
.getOpcode() != ISD::FMA
||
9656 Op1
.getOpcode() != ISD::FP_EXTEND
||
9657 Op2
.getOpcode() != ISD::FP_EXTEND
)
9660 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
9661 // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
9662 // is sufficient to allow generaing fdot2.
9663 const TargetOptions
&Options
= DAG
.getTarget().Options
;
9664 if (Options
.AllowFPOpFusion
== FPOpFusion::Fast
|| Options
.UnsafeFPMath
||
9665 (N
->getFlags().hasAllowContract() &&
9666 FMA
->getFlags().hasAllowContract())) {
9667 Op1
= Op1
.getOperand(0);
9668 Op2
= Op2
.getOperand(0);
9669 if (Op1
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
||
9670 Op2
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
)
9673 SDValue Vec1
= Op1
.getOperand(0);
9674 SDValue Idx1
= Op1
.getOperand(1);
9675 SDValue Vec2
= Op2
.getOperand(0);
9677 SDValue FMAOp1
= FMA
.getOperand(0);
9678 SDValue FMAOp2
= FMA
.getOperand(1);
9679 SDValue FMAAcc
= FMA
.getOperand(2);
9681 if (FMAOp1
.getOpcode() != ISD::FP_EXTEND
||
9682 FMAOp2
.getOpcode() != ISD::FP_EXTEND
)
9685 FMAOp1
= FMAOp1
.getOperand(0);
9686 FMAOp2
= FMAOp2
.getOperand(0);
9687 if (FMAOp1
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
||
9688 FMAOp2
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
)
9691 SDValue Vec3
= FMAOp1
.getOperand(0);
9692 SDValue Vec4
= FMAOp2
.getOperand(0);
9693 SDValue Idx2
= FMAOp1
.getOperand(1);
9695 if (Idx1
!= Op2
.getOperand(1) || Idx2
!= FMAOp2
.getOperand(1) ||
9696 // Idx1 and Idx2 cannot be the same.
9700 if (Vec1
== Vec2
|| Vec3
== Vec4
)
9703 if (Vec1
.getValueType() != MVT::v2f16
|| Vec2
.getValueType() != MVT::v2f16
)
9706 if ((Vec1
== Vec3
&& Vec2
== Vec4
) ||
9707 (Vec1
== Vec4
&& Vec2
== Vec3
)) {
9708 return DAG
.getNode(AMDGPUISD::FDOT2
, SL
, MVT::f32
, Vec1
, Vec2
, FMAAcc
,
9709 DAG
.getTargetConstant(0, SL
, MVT::i1
));
9715 SDValue
SITargetLowering::performSetCCCombine(SDNode
*N
,
9716 DAGCombinerInfo
&DCI
) const {
9717 SelectionDAG
&DAG
= DCI
.DAG
;
9720 SDValue LHS
= N
->getOperand(0);
9721 SDValue RHS
= N
->getOperand(1);
9722 EVT VT
= LHS
.getValueType();
9723 ISD::CondCode CC
= cast
<CondCodeSDNode
>(N
->getOperand(2))->get();
9725 auto CRHS
= dyn_cast
<ConstantSDNode
>(RHS
);
9727 CRHS
= dyn_cast
<ConstantSDNode
>(LHS
);
9729 std::swap(LHS
, RHS
);
9730 CC
= getSetCCSwappedOperands(CC
);
9735 if (VT
== MVT::i32
&& LHS
.getOpcode() == ISD::SIGN_EXTEND
&&
9736 isBoolSGPR(LHS
.getOperand(0))) {
9737 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
9738 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
9739 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
9740 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
9741 if ((CRHS
->isAllOnesValue() &&
9742 (CC
== ISD::SETNE
|| CC
== ISD::SETGT
|| CC
== ISD::SETULT
)) ||
9743 (CRHS
->isNullValue() &&
9744 (CC
== ISD::SETEQ
|| CC
== ISD::SETGE
|| CC
== ISD::SETULE
)))
9745 return DAG
.getNode(ISD::XOR
, SL
, MVT::i1
, LHS
.getOperand(0),
9746 DAG
.getConstant(-1, SL
, MVT::i1
));
9747 if ((CRHS
->isAllOnesValue() &&
9748 (CC
== ISD::SETEQ
|| CC
== ISD::SETLE
|| CC
== ISD::SETUGE
)) ||
9749 (CRHS
->isNullValue() &&
9750 (CC
== ISD::SETNE
|| CC
== ISD::SETUGT
|| CC
== ISD::SETLT
)))
9751 return LHS
.getOperand(0);
9754 uint64_t CRHSVal
= CRHS
->getZExtValue();
9755 if ((CC
== ISD::SETEQ
|| CC
== ISD::SETNE
) &&
9756 LHS
.getOpcode() == ISD::SELECT
&&
9757 isa
<ConstantSDNode
>(LHS
.getOperand(1)) &&
9758 isa
<ConstantSDNode
>(LHS
.getOperand(2)) &&
9759 LHS
.getConstantOperandVal(1) != LHS
.getConstantOperandVal(2) &&
9760 isBoolSGPR(LHS
.getOperand(0))) {
9762 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
9763 // setcc (select cc, CT, CF), CF, ne => cc
9764 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
9765 // setcc (select cc, CT, CF), CT, eq => cc
9766 uint64_t CT
= LHS
.getConstantOperandVal(1);
9767 uint64_t CF
= LHS
.getConstantOperandVal(2);
9769 if ((CF
== CRHSVal
&& CC
== ISD::SETEQ
) ||
9770 (CT
== CRHSVal
&& CC
== ISD::SETNE
))
9771 return DAG
.getNode(ISD::XOR
, SL
, MVT::i1
, LHS
.getOperand(0),
9772 DAG
.getConstant(-1, SL
, MVT::i1
));
9773 if ((CF
== CRHSVal
&& CC
== ISD::SETNE
) ||
9774 (CT
== CRHSVal
&& CC
== ISD::SETEQ
))
9775 return LHS
.getOperand(0);
9779 if (VT
!= MVT::f32
&& VT
!= MVT::f64
&& (Subtarget
->has16BitInsts() &&
9783 // Match isinf/isfinite pattern
9784 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
9785 // (fcmp one (fabs x), inf) -> (fp_class x,
9786 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
9787 if ((CC
== ISD::SETOEQ
|| CC
== ISD::SETONE
) && LHS
.getOpcode() == ISD::FABS
) {
9788 const ConstantFPSDNode
*CRHS
= dyn_cast
<ConstantFPSDNode
>(RHS
);
9792 const APFloat
&APF
= CRHS
->getValueAPF();
9793 if (APF
.isInfinity() && !APF
.isNegative()) {
9794 const unsigned IsInfMask
= SIInstrFlags::P_INFINITY
|
9795 SIInstrFlags::N_INFINITY
;
9796 const unsigned IsFiniteMask
= SIInstrFlags::N_ZERO
|
9797 SIInstrFlags::P_ZERO
|
9798 SIInstrFlags::N_NORMAL
|
9799 SIInstrFlags::P_NORMAL
|
9800 SIInstrFlags::N_SUBNORMAL
|
9801 SIInstrFlags::P_SUBNORMAL
;
9802 unsigned Mask
= CC
== ISD::SETOEQ
? IsInfMask
: IsFiniteMask
;
9803 return DAG
.getNode(AMDGPUISD::FP_CLASS
, SL
, MVT::i1
, LHS
.getOperand(0),
9804 DAG
.getConstant(Mask
, SL
, MVT::i32
));
9811 SDValue
SITargetLowering::performCvtF32UByteNCombine(SDNode
*N
,
9812 DAGCombinerInfo
&DCI
) const {
9813 SelectionDAG
&DAG
= DCI
.DAG
;
9815 unsigned Offset
= N
->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0
;
9817 SDValue Src
= N
->getOperand(0);
9818 SDValue Srl
= N
->getOperand(0);
9819 if (Srl
.getOpcode() == ISD::ZERO_EXTEND
)
9820 Srl
= Srl
.getOperand(0);
9822 // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
9823 if (Srl
.getOpcode() == ISD::SRL
) {
9824 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
9825 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
9826 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
9828 if (const ConstantSDNode
*C
=
9829 dyn_cast
<ConstantSDNode
>(Srl
.getOperand(1))) {
9830 Srl
= DAG
.getZExtOrTrunc(Srl
.getOperand(0), SDLoc(Srl
.getOperand(0)),
9833 unsigned SrcOffset
= C
->getZExtValue() + 8 * Offset
;
9834 if (SrcOffset
< 32 && SrcOffset
% 8 == 0) {
9835 return DAG
.getNode(AMDGPUISD::CVT_F32_UBYTE0
+ SrcOffset
/ 8, SL
,
9841 APInt Demanded
= APInt::getBitsSet(32, 8 * Offset
, 8 * Offset
+ 8);
9844 TargetLowering::TargetLoweringOpt
TLO(DAG
, !DCI
.isBeforeLegalize(),
9845 !DCI
.isBeforeLegalizeOps());
9846 const TargetLowering
&TLI
= DAG
.getTargetLoweringInfo();
9847 if (TLI
.SimplifyDemandedBits(Src
, Demanded
, Known
, TLO
)) {
9848 DCI
.CommitTargetLoweringOpt(TLO
);
9854 SDValue
SITargetLowering::performClampCombine(SDNode
*N
,
9855 DAGCombinerInfo
&DCI
) const {
9856 ConstantFPSDNode
*CSrc
= dyn_cast
<ConstantFPSDNode
>(N
->getOperand(0));
9860 const MachineFunction
&MF
= DCI
.DAG
.getMachineFunction();
9861 const APFloat
&F
= CSrc
->getValueAPF();
9862 APFloat Zero
= APFloat::getZero(F
.getSemantics());
9863 APFloat::cmpResult Cmp0
= F
.compare(Zero
);
9864 if (Cmp0
== APFloat::cmpLessThan
||
9865 (Cmp0
== APFloat::cmpUnordered
&&
9866 MF
.getInfo
<SIMachineFunctionInfo
>()->getMode().DX10Clamp
)) {
9867 return DCI
.DAG
.getConstantFP(Zero
, SDLoc(N
), N
->getValueType(0));
9870 APFloat
One(F
.getSemantics(), "1.0");
9871 APFloat::cmpResult Cmp1
= F
.compare(One
);
9872 if (Cmp1
== APFloat::cmpGreaterThan
)
9873 return DCI
.DAG
.getConstantFP(One
, SDLoc(N
), N
->getValueType(0));
9875 return SDValue(CSrc
, 0);
9879 SDValue
SITargetLowering::PerformDAGCombine(SDNode
*N
,
9880 DAGCombinerInfo
&DCI
) const {
9881 if (getTargetMachine().getOptLevel() == CodeGenOpt::None
)
9883 switch (N
->getOpcode()) {
9885 return AMDGPUTargetLowering::PerformDAGCombine(N
, DCI
);
9887 return performAddCombine(N
, DCI
);
9889 return performSubCombine(N
, DCI
);
9892 return performAddCarrySubCarryCombine(N
, DCI
);
9894 return performFAddCombine(N
, DCI
);
9896 return performFSubCombine(N
, DCI
);
9898 return performSetCCCombine(N
, DCI
);
9901 case ISD::FMAXNUM_IEEE
:
9902 case ISD::FMINNUM_IEEE
:
9907 case AMDGPUISD::FMIN_LEGACY
:
9908 case AMDGPUISD::FMAX_LEGACY
:
9909 return performMinMaxCombine(N
, DCI
);
9911 return performFMACombine(N
, DCI
);
9913 if (SDValue Widended
= widenLoad(cast
<LoadSDNode
>(N
), DCI
))
9918 case ISD::ATOMIC_LOAD
:
9919 case ISD::ATOMIC_STORE
:
9920 case ISD::ATOMIC_CMP_SWAP
:
9921 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS
:
9922 case ISD::ATOMIC_SWAP
:
9923 case ISD::ATOMIC_LOAD_ADD
:
9924 case ISD::ATOMIC_LOAD_SUB
:
9925 case ISD::ATOMIC_LOAD_AND
:
9926 case ISD::ATOMIC_LOAD_OR
:
9927 case ISD::ATOMIC_LOAD_XOR
:
9928 case ISD::ATOMIC_LOAD_NAND
:
9929 case ISD::ATOMIC_LOAD_MIN
:
9930 case ISD::ATOMIC_LOAD_MAX
:
9931 case ISD::ATOMIC_LOAD_UMIN
:
9932 case ISD::ATOMIC_LOAD_UMAX
:
9933 case ISD::ATOMIC_LOAD_FADD
:
9934 case AMDGPUISD::ATOMIC_INC
:
9935 case AMDGPUISD::ATOMIC_DEC
:
9936 case AMDGPUISD::ATOMIC_LOAD_FMIN
:
9937 case AMDGPUISD::ATOMIC_LOAD_FMAX
: // TODO: Target mem intrinsics.
9938 if (DCI
.isBeforeLegalize())
9940 return performMemSDNodeCombine(cast
<MemSDNode
>(N
), DCI
);
9942 return performAndCombine(N
, DCI
);
9944 return performOrCombine(N
, DCI
);
9946 return performXorCombine(N
, DCI
);
9947 case ISD::ZERO_EXTEND
:
9948 return performZeroExtendCombine(N
, DCI
);
9949 case ISD::SIGN_EXTEND_INREG
:
9950 return performSignExtendInRegCombine(N
, DCI
);
9951 case AMDGPUISD::FP_CLASS
:
9952 return performClassCombine(N
, DCI
);
9953 case ISD::FCANONICALIZE
:
9954 return performFCanonicalizeCombine(N
, DCI
);
9955 case AMDGPUISD::RCP
:
9956 return performRcpCombine(N
, DCI
);
9957 case AMDGPUISD::FRACT
:
9958 case AMDGPUISD::RSQ
:
9959 case AMDGPUISD::RCP_LEGACY
:
9960 case AMDGPUISD::RSQ_LEGACY
:
9961 case AMDGPUISD::RCP_IFLAG
:
9962 case AMDGPUISD::RSQ_CLAMP
:
9963 case AMDGPUISD::LDEXP
: {
9964 SDValue Src
= N
->getOperand(0);
9969 case ISD::SINT_TO_FP
:
9970 case ISD::UINT_TO_FP
:
9971 return performUCharToFloatCombine(N
, DCI
);
9972 case AMDGPUISD::CVT_F32_UBYTE0
:
9973 case AMDGPUISD::CVT_F32_UBYTE1
:
9974 case AMDGPUISD::CVT_F32_UBYTE2
:
9975 case AMDGPUISD::CVT_F32_UBYTE3
:
9976 return performCvtF32UByteNCombine(N
, DCI
);
9977 case AMDGPUISD::FMED3
:
9978 return performFMed3Combine(N
, DCI
);
9979 case AMDGPUISD::CVT_PKRTZ_F16_F32
:
9980 return performCvtPkRTZCombine(N
, DCI
);
9981 case AMDGPUISD::CLAMP
:
9982 return performClampCombine(N
, DCI
);
9983 case ISD::SCALAR_TO_VECTOR
: {
9984 SelectionDAG
&DAG
= DCI
.DAG
;
9985 EVT VT
= N
->getValueType(0);
9987 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
9988 if (VT
== MVT::v2i16
|| VT
== MVT::v2f16
) {
9990 SDValue Src
= N
->getOperand(0);
9991 EVT EltVT
= Src
.getValueType();
9992 if (EltVT
== MVT::f16
)
9993 Src
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i16
, Src
);
9995 SDValue Ext
= DAG
.getNode(ISD::ANY_EXTEND
, SL
, MVT::i32
, Src
);
9996 return DAG
.getNode(ISD::BITCAST
, SL
, VT
, Ext
);
10001 case ISD::EXTRACT_VECTOR_ELT
:
10002 return performExtractVectorEltCombine(N
, DCI
);
10003 case ISD::INSERT_VECTOR_ELT
:
10004 return performInsertVectorEltCombine(N
, DCI
);
10006 return AMDGPUTargetLowering::PerformDAGCombine(N
, DCI
);
10009 /// Helper function for adjustWritemask
10010 static unsigned SubIdx2Lane(unsigned Idx
) {
10013 case AMDGPU::sub0
: return 0;
10014 case AMDGPU::sub1
: return 1;
10015 case AMDGPU::sub2
: return 2;
10016 case AMDGPU::sub3
: return 3;
10017 case AMDGPU::sub4
: return 4; // Possible with TFE/LWE
10021 /// Adjust the writemask of MIMG instructions
10022 SDNode
*SITargetLowering::adjustWritemask(MachineSDNode
*&Node
,
10023 SelectionDAG
&DAG
) const {
10024 unsigned Opcode
= Node
->getMachineOpcode();
10026 // Subtract 1 because the vdata output is not a MachineSDNode operand.
10027 int D16Idx
= AMDGPU::getNamedOperandIdx(Opcode
, AMDGPU::OpName::d16
) - 1;
10028 if (D16Idx
>= 0 && Node
->getConstantOperandVal(D16Idx
))
10029 return Node
; // not implemented for D16
10031 SDNode
*Users
[5] = { nullptr };
10033 unsigned DmaskIdx
= AMDGPU::getNamedOperandIdx(Opcode
, AMDGPU::OpName::dmask
) - 1;
10034 unsigned OldDmask
= Node
->getConstantOperandVal(DmaskIdx
);
10035 unsigned NewDmask
= 0;
10036 unsigned TFEIdx
= AMDGPU::getNamedOperandIdx(Opcode
, AMDGPU::OpName::tfe
) - 1;
10037 unsigned LWEIdx
= AMDGPU::getNamedOperandIdx(Opcode
, AMDGPU::OpName::lwe
) - 1;
10038 bool UsesTFC
= (Node
->getConstantOperandVal(TFEIdx
) ||
10039 Node
->getConstantOperandVal(LWEIdx
)) ? 1 : 0;
10040 unsigned TFCLane
= 0;
10041 bool HasChain
= Node
->getNumValues() > 1;
10043 if (OldDmask
== 0) {
10044 // These are folded out, but on the chance it happens don't assert.
10048 unsigned OldBitsSet
= countPopulation(OldDmask
);
10049 // Work out which is the TFE/LWE lane if that is enabled.
10051 TFCLane
= OldBitsSet
;
10054 // Try to figure out the used register components
10055 for (SDNode::use_iterator I
= Node
->use_begin(), E
= Node
->use_end();
10058 // Don't look at users of the chain.
10059 if (I
.getUse().getResNo() != 0)
10062 // Abort if we can't understand the usage
10063 if (!I
->isMachineOpcode() ||
10064 I
->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG
)
10067 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
10068 // Note that subregs are packed, i.e. Lane==0 is the first bit set
10069 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
10071 Lane
= SubIdx2Lane(I
->getConstantOperandVal(1));
10073 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
10074 if (UsesTFC
&& Lane
== TFCLane
) {
10077 // Set which texture component corresponds to the lane.
10079 for (unsigned i
= 0, Dmask
= OldDmask
; (i
<= Lane
) && (Dmask
!= 0); i
++) {
10080 Comp
= countTrailingZeros(Dmask
);
10081 Dmask
&= ~(1 << Comp
);
10084 // Abort if we have more than one user per component.
10089 NewDmask
|= 1 << Comp
;
10093 // Don't allow 0 dmask, as hardware assumes one channel enabled.
10094 bool NoChannels
= !NewDmask
;
10097 // No uses of the result and not using TFC. Then do nothing.
10100 // If the original dmask has one channel - then nothing to do
10101 if (OldBitsSet
== 1)
10103 // Use an arbitrary dmask - required for the instruction to work
10106 // Abort if there's no change
10107 if (NewDmask
== OldDmask
)
10110 unsigned BitsSet
= countPopulation(NewDmask
);
10112 // Check for TFE or LWE - increase the number of channels by one to account
10113 // for the extra return value
10114 // This will need adjustment for D16 if this is also included in
10115 // adjustWriteMask (this function) but at present D16 are excluded.
10116 unsigned NewChannels
= BitsSet
+ UsesTFC
;
10119 AMDGPU::getMaskedMIMGOp(Node
->getMachineOpcode(), NewChannels
);
10120 assert(NewOpcode
!= -1 &&
10121 NewOpcode
!= static_cast<int>(Node
->getMachineOpcode()) &&
10122 "failed to find equivalent MIMG op");
10124 // Adjust the writemask in the node
10125 SmallVector
<SDValue
, 12> Ops
;
10126 Ops
.insert(Ops
.end(), Node
->op_begin(), Node
->op_begin() + DmaskIdx
);
10127 Ops
.push_back(DAG
.getTargetConstant(NewDmask
, SDLoc(Node
), MVT::i32
));
10128 Ops
.insert(Ops
.end(), Node
->op_begin() + DmaskIdx
+ 1, Node
->op_end());
10130 MVT SVT
= Node
->getValueType(0).getVectorElementType().getSimpleVT();
10132 MVT ResultVT
= NewChannels
== 1 ?
10133 SVT
: MVT::getVectorVT(SVT
, NewChannels
== 3 ? 4 :
10134 NewChannels
== 5 ? 8 : NewChannels
);
10135 SDVTList NewVTList
= HasChain
?
10136 DAG
.getVTList(ResultVT
, MVT::Other
) : DAG
.getVTList(ResultVT
);
10139 MachineSDNode
*NewNode
= DAG
.getMachineNode(NewOpcode
, SDLoc(Node
),
10144 DAG
.setNodeMemRefs(NewNode
, Node
->memoperands());
10145 DAG
.ReplaceAllUsesOfValueWith(SDValue(Node
, 1), SDValue(NewNode
, 1));
10148 if (NewChannels
== 1) {
10149 assert(Node
->hasNUsesOfValue(1, 0));
10150 SDNode
*Copy
= DAG
.getMachineNode(TargetOpcode::COPY
,
10151 SDLoc(Node
), Users
[Lane
]->getValueType(0),
10152 SDValue(NewNode
, 0));
10153 DAG
.ReplaceAllUsesWith(Users
[Lane
], Copy
);
10157 // Update the users of the node with the new indices
10158 for (unsigned i
= 0, Idx
= AMDGPU::sub0
; i
< 5; ++i
) {
10159 SDNode
*User
= Users
[i
];
10161 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
10162 // Users[0] is still nullptr because channel 0 doesn't really have a use.
10163 if (i
|| !NoChannels
)
10166 SDValue Op
= DAG
.getTargetConstant(Idx
, SDLoc(User
), MVT::i32
);
10167 DAG
.UpdateNodeOperands(User
, SDValue(NewNode
, 0), Op
);
10172 case AMDGPU::sub0
: Idx
= AMDGPU::sub1
; break;
10173 case AMDGPU::sub1
: Idx
= AMDGPU::sub2
; break;
10174 case AMDGPU::sub2
: Idx
= AMDGPU::sub3
; break;
10175 case AMDGPU::sub3
: Idx
= AMDGPU::sub4
; break;
10179 DAG
.RemoveDeadNode(Node
);
10183 static bool isFrameIndexOp(SDValue Op
) {
10184 if (Op
.getOpcode() == ISD::AssertZext
)
10185 Op
= Op
.getOperand(0);
10187 return isa
<FrameIndexSDNode
>(Op
);
10190 /// Legalize target independent instructions (e.g. INSERT_SUBREG)
10191 /// with frame index operands.
10192 /// LLVM assumes that inputs are to these instructions are registers.
10193 SDNode
*SITargetLowering::legalizeTargetIndependentNode(SDNode
*Node
,
10194 SelectionDAG
&DAG
) const {
10195 if (Node
->getOpcode() == ISD::CopyToReg
) {
10196 RegisterSDNode
*DestReg
= cast
<RegisterSDNode
>(Node
->getOperand(1));
10197 SDValue SrcVal
= Node
->getOperand(2);
10199 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
10200 // to try understanding copies to physical registers.
10201 if (SrcVal
.getValueType() == MVT::i1
&&
10202 Register::isPhysicalRegister(DestReg
->getReg())) {
10204 MachineRegisterInfo
&MRI
= DAG
.getMachineFunction().getRegInfo();
10205 SDValue VReg
= DAG
.getRegister(
10206 MRI
.createVirtualRegister(&AMDGPU::VReg_1RegClass
), MVT::i1
);
10208 SDNode
*Glued
= Node
->getGluedNode();
10210 = DAG
.getCopyToReg(Node
->getOperand(0), SL
, VReg
, SrcVal
,
10211 SDValue(Glued
, Glued
? Glued
->getNumValues() - 1 : 0));
10212 SDValue ToResultReg
10213 = DAG
.getCopyToReg(ToVReg
, SL
, SDValue(DestReg
, 0),
10214 VReg
, ToVReg
.getValue(1));
10215 DAG
.ReplaceAllUsesWith(Node
, ToResultReg
.getNode());
10216 DAG
.RemoveDeadNode(Node
);
10217 return ToResultReg
.getNode();
10221 SmallVector
<SDValue
, 8> Ops
;
10222 for (unsigned i
= 0; i
< Node
->getNumOperands(); ++i
) {
10223 if (!isFrameIndexOp(Node
->getOperand(i
))) {
10224 Ops
.push_back(Node
->getOperand(i
));
10229 Ops
.push_back(SDValue(DAG
.getMachineNode(AMDGPU::S_MOV_B32
, DL
,
10230 Node
->getOperand(i
).getValueType(),
10231 Node
->getOperand(i
)), 0));
10234 return DAG
.UpdateNodeOperands(Node
, Ops
);
10237 /// Fold the instructions after selecting them.
10238 /// Returns null if users were already updated.
10239 SDNode
*SITargetLowering::PostISelFolding(MachineSDNode
*Node
,
10240 SelectionDAG
&DAG
) const {
10241 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
10242 unsigned Opcode
= Node
->getMachineOpcode();
10244 if (TII
->isMIMG(Opcode
) && !TII
->get(Opcode
).mayStore() &&
10245 !TII
->isGather4(Opcode
)) {
10246 return adjustWritemask(Node
, DAG
);
10249 if (Opcode
== AMDGPU::INSERT_SUBREG
||
10250 Opcode
== AMDGPU::REG_SEQUENCE
) {
10251 legalizeTargetIndependentNode(Node
, DAG
);
10256 case AMDGPU::V_DIV_SCALE_F32
:
10257 case AMDGPU::V_DIV_SCALE_F64
: {
10258 // Satisfy the operand register constraint when one of the inputs is
10259 // undefined. Ordinarily each undef value will have its own implicit_def of
10260 // a vreg, so force these to use a single register.
10261 SDValue Src0
= Node
->getOperand(0);
10262 SDValue Src1
= Node
->getOperand(1);
10263 SDValue Src2
= Node
->getOperand(2);
10265 if ((Src0
.isMachineOpcode() &&
10266 Src0
.getMachineOpcode() != AMDGPU::IMPLICIT_DEF
) &&
10267 (Src0
== Src1
|| Src0
== Src2
))
10270 MVT VT
= Src0
.getValueType().getSimpleVT();
10271 const TargetRegisterClass
*RC
=
10272 getRegClassFor(VT
, Src0
.getNode()->isDivergent());
10274 MachineRegisterInfo
&MRI
= DAG
.getMachineFunction().getRegInfo();
10275 SDValue UndefReg
= DAG
.getRegister(MRI
.createVirtualRegister(RC
), VT
);
10277 SDValue ImpDef
= DAG
.getCopyToReg(DAG
.getEntryNode(), SDLoc(Node
),
10278 UndefReg
, Src0
, SDValue());
10280 // src0 must be the same register as src1 or src2, even if the value is
10281 // undefined, so make sure we don't violate this constraint.
10282 if (Src0
.isMachineOpcode() &&
10283 Src0
.getMachineOpcode() == AMDGPU::IMPLICIT_DEF
) {
10284 if (Src1
.isMachineOpcode() &&
10285 Src1
.getMachineOpcode() != AMDGPU::IMPLICIT_DEF
)
10287 else if (Src2
.isMachineOpcode() &&
10288 Src2
.getMachineOpcode() != AMDGPU::IMPLICIT_DEF
)
10291 assert(Src1
.getMachineOpcode() == AMDGPU::IMPLICIT_DEF
);
10298 SmallVector
<SDValue
, 4> Ops
= { Src0
, Src1
, Src2
};
10299 for (unsigned I
= 3, N
= Node
->getNumOperands(); I
!= N
; ++I
)
10300 Ops
.push_back(Node
->getOperand(I
));
10302 Ops
.push_back(ImpDef
.getValue(1));
10303 return DAG
.getMachineNode(Opcode
, SDLoc(Node
), Node
->getVTList(), Ops
);
10305 case AMDGPU::V_PERMLANE16_B32
:
10306 case AMDGPU::V_PERMLANEX16_B32
: {
10307 ConstantSDNode
*FI
= cast
<ConstantSDNode
>(Node
->getOperand(0));
10308 ConstantSDNode
*BC
= cast
<ConstantSDNode
>(Node
->getOperand(2));
10309 if (!FI
->getZExtValue() && !BC
->getZExtValue())
10311 SDValue VDstIn
= Node
->getOperand(6);
10312 if (VDstIn
.isMachineOpcode()
10313 && VDstIn
.getMachineOpcode() == AMDGPU::IMPLICIT_DEF
)
10315 MachineSDNode
*ImpDef
= DAG
.getMachineNode(TargetOpcode::IMPLICIT_DEF
,
10316 SDLoc(Node
), MVT::i32
);
10317 SmallVector
<SDValue
, 8> Ops
= { SDValue(FI
, 0), Node
->getOperand(1),
10318 SDValue(BC
, 0), Node
->getOperand(3),
10319 Node
->getOperand(4), Node
->getOperand(5),
10320 SDValue(ImpDef
, 0), Node
->getOperand(7) };
10321 return DAG
.getMachineNode(Opcode
, SDLoc(Node
), Node
->getVTList(), Ops
);
10330 /// Assign the register class depending on the number of
10331 /// bits set in the writemask
10332 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr
&MI
,
10333 SDNode
*Node
) const {
10334 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
10336 MachineRegisterInfo
&MRI
= MI
.getParent()->getParent()->getRegInfo();
10338 if (TII
->isVOP3(MI
.getOpcode())) {
10339 // Make sure constant bus requirements are respected.
10340 TII
->legalizeOperandsVOP3(MRI
, MI
);
10342 // Prefer VGPRs over AGPRs in mAI instructions where possible.
10343 // This saves a chain-copy of registers and better ballance register
10344 // use between vgpr and agpr as agpr tuples tend to be big.
10345 if (const MCOperandInfo
*OpInfo
= MI
.getDesc().OpInfo
) {
10346 unsigned Opc
= MI
.getOpcode();
10347 const SIRegisterInfo
*TRI
= Subtarget
->getRegisterInfo();
10348 for (auto I
: { AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::src0
),
10349 AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::src1
) }) {
10352 MachineOperand
&Op
= MI
.getOperand(I
);
10353 if ((OpInfo
[I
].RegClass
!= llvm::AMDGPU::AV_64RegClassID
&&
10354 OpInfo
[I
].RegClass
!= llvm::AMDGPU::AV_32RegClassID
) ||
10355 !Register::isVirtualRegister(Op
.getReg()) ||
10356 !TRI
->isAGPR(MRI
, Op
.getReg()))
10358 auto *Src
= MRI
.getUniqueVRegDef(Op
.getReg());
10359 if (!Src
|| !Src
->isCopy() ||
10360 !TRI
->isSGPRReg(MRI
, Src
->getOperand(1).getReg()))
10362 auto *RC
= TRI
->getRegClassForReg(MRI
, Op
.getReg());
10363 auto *NewRC
= TRI
->getEquivalentVGPRClass(RC
);
10364 // All uses of agpr64 and agpr32 can also accept vgpr except for
10365 // v_accvgpr_read, but we do not produce agpr reads during selection,
10366 // so no use checks are needed.
10367 MRI
.setRegClass(Op
.getReg(), NewRC
);
10374 // Replace unused atomics with the no return version.
10375 int NoRetAtomicOp
= AMDGPU::getAtomicNoRetOp(MI
.getOpcode());
10376 if (NoRetAtomicOp
!= -1) {
10377 if (!Node
->hasAnyUseOfValue(0)) {
10378 MI
.setDesc(TII
->get(NoRetAtomicOp
));
10379 MI
.RemoveOperand(0);
10383 // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
10384 // instruction, because the return type of these instructions is a vec2 of
10385 // the memory type, so it can be tied to the input operand.
10386 // This means these instructions always have a use, so we need to add a
10387 // special case to check if the atomic has only one extract_subreg use,
10388 // which itself has no uses.
10389 if ((Node
->hasNUsesOfValue(1, 0) &&
10390 Node
->use_begin()->isMachineOpcode() &&
10391 Node
->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG
&&
10392 !Node
->use_begin()->hasAnyUseOfValue(0))) {
10393 Register Def
= MI
.getOperand(0).getReg();
10395 // Change this into a noret atomic.
10396 MI
.setDesc(TII
->get(NoRetAtomicOp
));
10397 MI
.RemoveOperand(0);
10399 // If we only remove the def operand from the atomic instruction, the
10400 // extract_subreg will be left with a use of a vreg without a def.
10401 // So we need to insert an implicit_def to avoid machine verifier
10403 BuildMI(*MI
.getParent(), MI
, MI
.getDebugLoc(),
10404 TII
->get(AMDGPU::IMPLICIT_DEF
), Def
);
10410 static SDValue
buildSMovImm32(SelectionDAG
&DAG
, const SDLoc
&DL
,
10412 SDValue K
= DAG
.getTargetConstant(Val
, DL
, MVT::i32
);
10413 return SDValue(DAG
.getMachineNode(AMDGPU::S_MOV_B32
, DL
, MVT::i32
, K
), 0);
10416 MachineSDNode
*SITargetLowering::wrapAddr64Rsrc(SelectionDAG
&DAG
,
10418 SDValue Ptr
) const {
10419 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
10421 // Build the half of the subregister with the constants before building the
10422 // full 128-bit register. If we are building multiple resource descriptors,
10423 // this will allow CSEing of the 2-component register.
10424 const SDValue Ops0
[] = {
10425 DAG
.getTargetConstant(AMDGPU::SGPR_64RegClassID
, DL
, MVT::i32
),
10426 buildSMovImm32(DAG
, DL
, 0),
10427 DAG
.getTargetConstant(AMDGPU::sub0
, DL
, MVT::i32
),
10428 buildSMovImm32(DAG
, DL
, TII
->getDefaultRsrcDataFormat() >> 32),
10429 DAG
.getTargetConstant(AMDGPU::sub1
, DL
, MVT::i32
)
10432 SDValue SubRegHi
= SDValue(DAG
.getMachineNode(AMDGPU::REG_SEQUENCE
, DL
,
10433 MVT::v2i32
, Ops0
), 0);
10435 // Combine the constants and the pointer.
10436 const SDValue Ops1
[] = {
10437 DAG
.getTargetConstant(AMDGPU::SReg_128RegClassID
, DL
, MVT::i32
),
10439 DAG
.getTargetConstant(AMDGPU::sub0_sub1
, DL
, MVT::i32
),
10441 DAG
.getTargetConstant(AMDGPU::sub2_sub3
, DL
, MVT::i32
)
10444 return DAG
.getMachineNode(AMDGPU::REG_SEQUENCE
, DL
, MVT::v4i32
, Ops1
);
10447 /// Return a resource descriptor with the 'Add TID' bit enabled
10448 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
10449 /// of the resource descriptor) to create an offset, which is added to
10450 /// the resource pointer.
10451 MachineSDNode
*SITargetLowering::buildRSRC(SelectionDAG
&DAG
, const SDLoc
&DL
,
10452 SDValue Ptr
, uint32_t RsrcDword1
,
10453 uint64_t RsrcDword2And3
) const {
10454 SDValue PtrLo
= DAG
.getTargetExtractSubreg(AMDGPU::sub0
, DL
, MVT::i32
, Ptr
);
10455 SDValue PtrHi
= DAG
.getTargetExtractSubreg(AMDGPU::sub1
, DL
, MVT::i32
, Ptr
);
10457 PtrHi
= SDValue(DAG
.getMachineNode(AMDGPU::S_OR_B32
, DL
, MVT::i32
, PtrHi
,
10458 DAG
.getConstant(RsrcDword1
, DL
, MVT::i32
)),
10462 SDValue DataLo
= buildSMovImm32(DAG
, DL
,
10463 RsrcDword2And3
& UINT64_C(0xFFFFFFFF));
10464 SDValue DataHi
= buildSMovImm32(DAG
, DL
, RsrcDword2And3
>> 32);
10466 const SDValue Ops
[] = {
10467 DAG
.getTargetConstant(AMDGPU::SReg_128RegClassID
, DL
, MVT::i32
),
10469 DAG
.getTargetConstant(AMDGPU::sub0
, DL
, MVT::i32
),
10471 DAG
.getTargetConstant(AMDGPU::sub1
, DL
, MVT::i32
),
10473 DAG
.getTargetConstant(AMDGPU::sub2
, DL
, MVT::i32
),
10475 DAG
.getTargetConstant(AMDGPU::sub3
, DL
, MVT::i32
)
10478 return DAG
.getMachineNode(AMDGPU::REG_SEQUENCE
, DL
, MVT::v4i32
, Ops
);
10481 //===----------------------------------------------------------------------===//
10482 // SI Inline Assembly Support
10483 //===----------------------------------------------------------------------===//
10485 std::pair
<unsigned, const TargetRegisterClass
*>
10486 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo
*TRI
,
10487 StringRef Constraint
,
10489 const TargetRegisterClass
*RC
= nullptr;
10490 if (Constraint
.size() == 1) {
10491 switch (Constraint
[0]) {
10493 return TargetLowering::getRegForInlineAsmConstraint(TRI
, Constraint
, VT
);
10496 switch (VT
.getSizeInBits()) {
10498 return std::make_pair(0U, nullptr);
10501 RC
= &AMDGPU::SReg_32_XM0RegClass
;
10504 RC
= &AMDGPU::SGPR_64RegClass
;
10507 RC
= &AMDGPU::SReg_96RegClass
;
10510 RC
= &AMDGPU::SReg_128RegClass
;
10513 RC
= &AMDGPU::SReg_160RegClass
;
10516 RC
= &AMDGPU::SReg_256RegClass
;
10519 RC
= &AMDGPU::SReg_512RegClass
;
10524 switch (VT
.getSizeInBits()) {
10526 return std::make_pair(0U, nullptr);
10529 RC
= &AMDGPU::VGPR_32RegClass
;
10532 RC
= &AMDGPU::VReg_64RegClass
;
10535 RC
= &AMDGPU::VReg_96RegClass
;
10538 RC
= &AMDGPU::VReg_128RegClass
;
10541 RC
= &AMDGPU::VReg_160RegClass
;
10544 RC
= &AMDGPU::VReg_256RegClass
;
10547 RC
= &AMDGPU::VReg_512RegClass
;
10552 if (!Subtarget
->hasMAIInsts())
10554 switch (VT
.getSizeInBits()) {
10556 return std::make_pair(0U, nullptr);
10559 RC
= &AMDGPU::AGPR_32RegClass
;
10562 RC
= &AMDGPU::AReg_64RegClass
;
10565 RC
= &AMDGPU::AReg_128RegClass
;
10568 RC
= &AMDGPU::AReg_512RegClass
;
10571 RC
= &AMDGPU::AReg_1024RegClass
;
10572 // v32 types are not legal but we support them here.
10573 return std::make_pair(0U, RC
);
10577 // We actually support i128, i16 and f16 as inline parameters
10578 // even if they are not reported as legal
10579 if (RC
&& (isTypeLegal(VT
) || VT
.SimpleTy
== MVT::i128
||
10580 VT
.SimpleTy
== MVT::i16
|| VT
.SimpleTy
== MVT::f16
))
10581 return std::make_pair(0U, RC
);
10584 if (Constraint
.size() > 1) {
10585 if (Constraint
[1] == 'v') {
10586 RC
= &AMDGPU::VGPR_32RegClass
;
10587 } else if (Constraint
[1] == 's') {
10588 RC
= &AMDGPU::SGPR_32RegClass
;
10589 } else if (Constraint
[1] == 'a') {
10590 RC
= &AMDGPU::AGPR_32RegClass
;
10595 bool Failed
= Constraint
.substr(2).getAsInteger(10, Idx
);
10596 if (!Failed
&& Idx
< RC
->getNumRegs())
10597 return std::make_pair(RC
->getRegister(Idx
), RC
);
10600 return TargetLowering::getRegForInlineAsmConstraint(TRI
, Constraint
, VT
);
10603 SITargetLowering::ConstraintType
10604 SITargetLowering::getConstraintType(StringRef Constraint
) const {
10605 if (Constraint
.size() == 1) {
10606 switch (Constraint
[0]) {
10611 return C_RegisterClass
;
10614 return TargetLowering::getConstraintType(Constraint
);
10617 // Figure out which registers should be reserved for stack access. Only after
10618 // the function is legalized do we know all of the non-spill stack objects or if
10619 // calls are present.
10620 void SITargetLowering::finalizeLowering(MachineFunction
&MF
) const {
10621 MachineRegisterInfo
&MRI
= MF
.getRegInfo();
10622 SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
10623 const GCNSubtarget
&ST
= MF
.getSubtarget
<GCNSubtarget
>();
10624 const SIRegisterInfo
*TRI
= Subtarget
->getRegisterInfo();
10626 if (Info
->isEntryFunction()) {
10627 // Callable functions have fixed registers used for stack access.
10628 reservePrivateMemoryRegs(getTargetMachine(), MF
, *TRI
, *Info
);
10631 assert(!TRI
->isSubRegister(Info
->getScratchRSrcReg(),
10632 Info
->getStackPtrOffsetReg()));
10633 if (Info
->getStackPtrOffsetReg() != AMDGPU::SP_REG
)
10634 MRI
.replaceRegWith(AMDGPU::SP_REG
, Info
->getStackPtrOffsetReg());
10636 // We need to worry about replacing the default register with itself in case
10637 // of MIR testcases missing the MFI.
10638 if (Info
->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG
)
10639 MRI
.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG
, Info
->getScratchRSrcReg());
10641 if (Info
->getFrameOffsetReg() != AMDGPU::FP_REG
)
10642 MRI
.replaceRegWith(AMDGPU::FP_REG
, Info
->getFrameOffsetReg());
10644 if (Info
->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG
) {
10645 MRI
.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG
,
10646 Info
->getScratchWaveOffsetReg());
10649 Info
->limitOccupancy(MF
);
10651 if (ST
.isWave32() && !MF
.empty()) {
10652 // Add VCC_HI def because many instructions marked as imp-use VCC where
10653 // we may only define VCC_LO. If nothing defines VCC_HI we may end up
10654 // having a use of undef.
10656 const SIInstrInfo
*TII
= ST
.getInstrInfo();
10659 MachineBasicBlock
&MBB
= MF
.front();
10660 MachineBasicBlock::iterator I
= MBB
.getFirstNonDebugInstr();
10661 BuildMI(MBB
, I
, DL
, TII
->get(TargetOpcode::IMPLICIT_DEF
), AMDGPU::VCC_HI
);
10663 for (auto &MBB
: MF
) {
10664 for (auto &MI
: MBB
) {
10665 TII
->fixImplicitOperands(MI
);
10670 TargetLoweringBase::finalizeLowering(MF
);
10673 void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op
,
10675 const APInt
&DemandedElts
,
10676 const SelectionDAG
&DAG
,
10677 unsigned Depth
) const {
10678 TargetLowering::computeKnownBitsForFrameIndex(Op
, Known
, DemandedElts
,
10681 // Set the high bits to zero based on the maximum allowed scratch size per
10682 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
10683 // calculation won't overflow, so assume the sign bit is never set.
10684 Known
.Zero
.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
10687 llvm::Align
SITargetLowering::getPrefLoopAlignment(MachineLoop
*ML
) const {
10688 const llvm::Align PrefAlign
= TargetLowering::getPrefLoopAlignment(ML
);
10689 const llvm::Align CacheLineAlign
= llvm::Align(64);
10691 // Pre-GFX10 target did not benefit from loop alignment
10692 if (!ML
|| DisableLoopAlignment
||
10693 (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10
) ||
10694 getSubtarget()->hasInstFwdPrefetchBug())
10697 // On GFX10 I$ is 4 x 64 bytes cache lines.
10698 // By default prefetcher keeps one cache line behind and reads two ahead.
10699 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
10700 // behind and one ahead.
10701 // Therefor we can benefit from aligning loop headers if loop fits 192 bytes.
10702 // If loop fits 64 bytes it always spans no more than two cache lines and
10703 // does not need an alignment.
10704 // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
10705 // Else if loop is less or equal 192 bytes we need two lines behind.
10707 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
10708 const MachineBasicBlock
*Header
= ML
->getHeader();
10709 if (Header
->getAlignment() != PrefAlign
)
10710 return Header
->getAlignment(); // Already processed.
10712 unsigned LoopSize
= 0;
10713 for (const MachineBasicBlock
*MBB
: ML
->blocks()) {
10714 // If inner loop block is aligned assume in average half of the alignment
10715 // size to be added as nops.
10717 LoopSize
+= MBB
->getAlignment().value() / 2;
10719 for (const MachineInstr
&MI
: *MBB
) {
10720 LoopSize
+= TII
->getInstSizeInBytes(MI
);
10721 if (LoopSize
> 192)
10726 if (LoopSize
<= 64)
10729 if (LoopSize
<= 128)
10730 return CacheLineAlign
;
10732 // If any of parent loops is surrounded by prefetch instructions do not
10733 // insert new for inner loop, which would reset parent's settings.
10734 for (MachineLoop
*P
= ML
->getParentLoop(); P
; P
= P
->getParentLoop()) {
10735 if (MachineBasicBlock
*Exit
= P
->getExitBlock()) {
10736 auto I
= Exit
->getFirstNonDebugInstr();
10737 if (I
!= Exit
->end() && I
->getOpcode() == AMDGPU::S_INST_PREFETCH
)
10738 return CacheLineAlign
;
10742 MachineBasicBlock
*Pre
= ML
->getLoopPreheader();
10743 MachineBasicBlock
*Exit
= ML
->getExitBlock();
10746 BuildMI(*Pre
, Pre
->getFirstTerminator(), DebugLoc(),
10747 TII
->get(AMDGPU::S_INST_PREFETCH
))
10748 .addImm(1); // prefetch 2 lines behind PC
10750 BuildMI(*Exit
, Exit
->getFirstNonDebugInstr(), DebugLoc(),
10751 TII
->get(AMDGPU::S_INST_PREFETCH
))
10752 .addImm(2); // prefetch 1 line behind PC
10755 return CacheLineAlign
;
10758 LLVM_ATTRIBUTE_UNUSED
10759 static bool isCopyFromRegOfInlineAsm(const SDNode
*N
) {
10760 assert(N
->getOpcode() == ISD::CopyFromReg
);
10762 // Follow the chain until we find an INLINEASM node.
10763 N
= N
->getOperand(0).getNode();
10764 if (N
->getOpcode() == ISD::INLINEASM
||
10765 N
->getOpcode() == ISD::INLINEASM_BR
)
10767 } while (N
->getOpcode() == ISD::CopyFromReg
);
10771 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode
* N
,
10772 FunctionLoweringInfo
* FLI
, LegacyDivergenceAnalysis
* KDA
) const
10774 switch (N
->getOpcode()) {
10775 case ISD::CopyFromReg
:
10777 const RegisterSDNode
*R
= cast
<RegisterSDNode
>(N
->getOperand(1));
10778 const MachineFunction
* MF
= FLI
->MF
;
10779 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
10780 const MachineRegisterInfo
&MRI
= MF
->getRegInfo();
10781 const SIRegisterInfo
&TRI
= ST
.getInstrInfo()->getRegisterInfo();
10782 unsigned Reg
= R
->getReg();
10783 if (Register::isPhysicalRegister(Reg
))
10784 return !TRI
.isSGPRReg(MRI
, Reg
);
10786 if (MRI
.isLiveIn(Reg
)) {
10787 // workitem.id.x workitem.id.y workitem.id.z
10788 // Any VGPR formal argument is also considered divergent
10789 if (!TRI
.isSGPRReg(MRI
, Reg
))
10791 // Formal arguments of non-entry functions
10792 // are conservatively considered divergent
10793 else if (!AMDGPU::isEntryFunctionCC(FLI
->Fn
->getCallingConv()))
10797 const Value
*V
= FLI
->getValueFromVirtualReg(Reg
);
10799 return KDA
->isDivergent(V
);
10800 assert(Reg
== FLI
->DemoteRegister
|| isCopyFromRegOfInlineAsm(N
));
10801 return !TRI
.isSGPRReg(MRI
, Reg
);
10805 const LoadSDNode
*L
= cast
<LoadSDNode
>(N
);
10806 unsigned AS
= L
->getAddressSpace();
10807 // A flat load may access private memory.
10808 return AS
== AMDGPUAS::PRIVATE_ADDRESS
|| AS
== AMDGPUAS::FLAT_ADDRESS
;
10810 case ISD::CALLSEQ_END
:
10813 case ISD::INTRINSIC_WO_CHAIN
:
10817 return AMDGPU::isIntrinsicSourceOfDivergence(
10818 cast
<ConstantSDNode
>(N
->getOperand(0))->getZExtValue());
10819 case ISD::INTRINSIC_W_CHAIN
:
10820 return AMDGPU::isIntrinsicSourceOfDivergence(
10821 cast
<ConstantSDNode
>(N
->getOperand(1))->getZExtValue());
10822 // In some cases intrinsics that are a source of divergence have been
10823 // lowered to AMDGPUISD so we also need to check those too.
10824 case AMDGPUISD::INTERP_MOV
:
10825 case AMDGPUISD::INTERP_P1
:
10826 case AMDGPUISD::INTERP_P2
:
10832 bool SITargetLowering::denormalsEnabledForType(EVT VT
) const {
10833 switch (VT
.getScalarType().getSimpleVT().SimpleTy
) {
10835 return Subtarget
->hasFP32Denormals();
10837 return Subtarget
->hasFP64Denormals();
10839 return Subtarget
->hasFP16Denormals();
10845 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op
,
10846 const SelectionDAG
&DAG
,
10848 unsigned Depth
) const {
10849 if (Op
.getOpcode() == AMDGPUISD::CLAMP
) {
10850 const MachineFunction
&MF
= DAG
.getMachineFunction();
10851 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
10853 if (Info
->getMode().DX10Clamp
)
10854 return true; // Clamped to 0.
10855 return DAG
.isKnownNeverNaN(Op
.getOperand(0), SNaN
, Depth
+ 1);
10858 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op
, DAG
,
10862 TargetLowering::AtomicExpansionKind
10863 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst
*RMW
) const {
10864 switch (RMW
->getOperation()) {
10865 case AtomicRMWInst::FAdd
: {
10866 Type
*Ty
= RMW
->getType();
10868 // We don't have a way to support 16-bit atomics now, so just leave them
10870 if (Ty
->isHalfTy())
10871 return AtomicExpansionKind::None
;
10873 if (!Ty
->isFloatTy())
10874 return AtomicExpansionKind::CmpXChg
;
10876 // TODO: Do have these for flat. Older targets also had them for buffers.
10877 unsigned AS
= RMW
->getPointerAddressSpace();
10878 return (AS
== AMDGPUAS::LOCAL_ADDRESS
&& Subtarget
->hasLDSFPAtomics()) ?
10879 AtomicExpansionKind::None
: AtomicExpansionKind::CmpXChg
;
10885 return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW
);