//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//
#if defined(_MSC_VER) || defined(__MINGW32__)
// Provide M_PI.
#define _USE_MATH_DEFINES
#endif
#include "SIISelLowering.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower"
STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> DisableLoopAlignment(
  "amdgpu-disable-loop-alignment",
  cl::desc("Do not align and prefetch loops"),
  cl::init(false));
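
// The helper below walks the 32-bit SGPRs in register-number order and
// returns the first one that the calling-convention state has not yet
// allocated; llvm_unreachable fires only if every SGPR is already taken.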
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}
SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI),
      Subtarget(&STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
  addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);

  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
  addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  if (Subtarget->has16BitInsts()) {
    addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);

    // Unless there are also VOP3P operations, not every operation on these
    // types is really legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
  }

  if (Subtarget->hasMAIInsts()) {
    addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
    addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());
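
  // Note on the register-class choices above: integer types are deliberately
  // mapped to scalar (SGPR) classes and same-width floating-point types to
  // vector (VGPR) classes; computeRegisterProperties() derives the default
  // legality of every operation from those assignments, and the
  // setOperationAction calls below only override the exceptions.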

  // We need to custom lower vector loads and stores from local memory.
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::LOAD, MVT::v32i32, Custom);

  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v3i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setOperationAction(ISD::STORE, MVT::v5i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::v32i32, Custom);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);

  setOperationAction(ISD::UADDO, MVT::i32, Legal);
  setOperationAction(ISD::USUBO, MVT::i32, Legal);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);

  setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
                  MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
                  MVT::v32i32, MVT::v32f32 }) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::SCALAR_TO_VECTOR:
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
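
  // For the wide vector types in the loop above, only the listed ops remain
  // legal or Custom; everything else is Expanded, i.e. the legalizer splits
  // or unrolls it (for example, a v8i32 add ends up as individual i32 adds).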

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }
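
  // Roughly speaking, after this promotion a v2i64/v2f64 build, insert, or
  // extract is redone on the same bits viewed as v4i32, so no 64-bit vector
  // ALU support is needed (a sketch of the intent, not an exact description
  // of the legalizer's steps).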

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  // Deal with vec3 vector operations when widened to vec4.
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);

  // Deal with vec5 vector operations when widened to vec8.
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling
  // and output demarshalling.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison.
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

  if (Subtarget->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // On SI this is s_memtime; on VI it is s_memrealtime.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  setOperationAction(ISD::TRAP, MVT::Other, Custom);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::FLOG, MVT::f16, Custom);
    setOperationAction(ISD::FEXP, MVT::f16, Custom);
    setOperationAction(ISD::FLOG10, MVT::f16, Custom);
  }

  // v_mad_f32 does not support denormals according to some sources.
  if (!Subtarget->hasFP32Denormals())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // for now.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);

  setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);

  // These are really only legal for ieee_mode functions. We should be avoiding
  // them for functions that don't have ieee_mode enabled, so just say they are
  // legal.
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);

  if (Subtarget->haveRoundOpsF64()) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  } else {
    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
    setOperationAction(ISD::FRINT, MVT::f64, Custom);
    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::Constant, MVT::i16, Legal);

    setOperationAction(ISD::SMIN, MVT::i16, Legal);
    setOperationAction(ISD::SMAX, MVT::i16, Legal);

    setOperationAction(ISD::UMIN, MVT::i16, Legal);
    setOperationAction(ISD::UMAX, MVT::i16, Legal);

    setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction(ISD::ROTR, MVT::i16, Promote);
    setOperationAction(ISD::ROTL, MVT::i16, Promote);

    setOperationAction(ISD::SDIV, MVT::i16, Promote);
    setOperationAction(ISD::UDIV, MVT::i16, Promote);
    setOperationAction(ISD::SREM, MVT::i16, Promote);
    setOperationAction(ISD::UREM, MVT::i16, Promote);

    setOperationAction(ISD::BSWAP, MVT::i16, Promote);
    setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);

    setOperationAction(ISD::CTTZ, MVT::i16, Promote);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTPOP, MVT::i16, Promote);

    setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);

    setOperationAction(ISD::BR_CC, MVT::i16, Expand);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FCOS, MVT::f16, Promote);
    setOperationAction(ISD::FSIN, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::FROUND, MVT::f16, Custom);

    // F16 - VOP2 Actions.
    setOperationAction(ISD::BR_CC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);

    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);

    for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
          break;
        case ISD::CONCAT_VECTORS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }

    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, MVT::v2i16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);

    setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
    setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);

    if (!Subtarget->hasVOP3PInsts()) {
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
    }

    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns).
    setOperationAction(ISD::FABS, MVT::v2f16, Legal);

    setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
    setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
  }

  if (Subtarget->hasVOP3PInsts()) {
    setOperationAction(ISD::ADD, MVT::v2i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i16, Legal);
    setOperationAction(ISD::MUL, MVT::v2i16, Legal);
    setOperationAction(ISD::SHL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRA, MVT::v2i16, Legal);
    setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v2i16, Legal);

    setOperationAction(ISD::FADD, MVT::v2f16, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
    setOperationAction(ISD::FMA, MVT::v2f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);

    setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);

    setOperationAction(ISD::SHL, MVT::v4i16, Custom);
    setOperationAction(ISD::SRA, MVT::v4i16, Custom);
    setOperationAction(ISD::SRL, MVT::v4i16, Custom);
    setOperationAction(ISD::ADD, MVT::v4i16, Custom);
    setOperationAction(ISD::SUB, MVT::v4i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i16, Custom);

    setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
    setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::UMAX, MVT::v4i16, Custom);

    setOperationAction(ISD::FADD, MVT::v4f16, Custom);
    setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
    setOperationAction(ISD::FMA, MVT::v4f16, Custom);

    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);

    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
  }

  setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
  setOperationAction(ISD::FABS, MVT::v4f16, Custom);
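
  // With VOP3P available, the v2i16/v2f16 arithmetic marked Legal above maps
  // onto single packed instructions; the v4i16/v4f16 cases are Custom, which
  // lets the lowering handle them (typically by splitting them into two
  // packed halves).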

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
  } else {
    // Legalization hack.
    setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v2f16, Custom);

    setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
    setOperationAction(ISD::FABS, MVT::v2f16, Custom);
  }

  for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
  }

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);

  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ADDCARRY);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::SUBCARRY);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::FMINNUM_IEEE);
  setTargetDAGCombine(ISD::FMAXNUM_IEEE);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::SMIN);
  setTargetDAGCombine(ISD::SMAX);
  setTargetDAGCombine(ISD::UMIN);
  setTargetDAGCombine(ISD::UMAX);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::FCANONICALIZE);
  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);

  setSchedulingPreference(Sched::RegPressure);
}

const GCNSubtarget *SITargetLowering::getSubtarget() const {
  return Subtarget;
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case, when denormals are enabled, where this would
// still be OK to use, and we don't currently handle it.
bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
         DestVT.getScalarType() == MVT::f32 &&
         !Subtarget->hasFP32Denormals() &&
         SrcVT.getScalarType() == MVT::f16;
}
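
// Informally: with the mixed-precision mad/fma instructions available and
// fp32 denormals disabled, an f16 -> f32 extension feeding an FMA/FMAD can be
// folded into the instruction instead of being emitted as a separate convert.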

bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32)
      return ScalarVT.getSimpleVT();

    if (Size > 32)
      return MVT::i32;

    if (Size == 16 && Subtarget->has16BitInsts())
      return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
  } else if (VT.getSizeInBits() > 32)
    return MVT::i32;

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    if (Size == 32)
      return NumElts;

    if (Size > 32)
      return NumElts * ((Size + 31) / 32);

    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;
  } else if (VT.getSizeInBits() > 32)
    return (VT.getSizeInBits() + 31) / 32;

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
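
// Worked example for the two hooks above (non-kernel calling convention,
// 16-bit instructions available): a v3f16 argument reports a register type of
// v2f16 and (3 + 1) / 2 = 2 registers, i.e. it travels as two packed halves.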

unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
  LLVMContext &Context, CallingConv::ID CC,
  EVT VT, EVT &IntermediateVT,
  unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size > 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
    }

    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // inconsistent.
    if (Size == 16 && Subtarget->has16BitInsts()) {
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
    Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}

// Work out the memory type to use for an intrinsic that returns a struct of a
// data vector (or scalar) and a trailing i32, based only on the data
// component rounded up to a power-of-two element count.
static MVT memVTFromAggregate(Type *Ty) {
  // Only limited forms of aggregate type currently expected.
  assert(Ty->isStructTy() && "Expected struct type");

  Type *ElementType = nullptr;
  unsigned NumElts;
  if (Ty->getContainedType(0)->isVectorTy()) {
    VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
    ElementType = VecComponent->getElementType();
    NumElts = VecComponent->getNumElements();
  } else {
    ElementType = Ty->getContainedType(0);
    NumElts = 1;
  }

  assert((Ty->getContainedType(1) &&
          Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");

  // Calculate the size of the memVT type from the aggregate.
  unsigned Pow2Elts = 0;
  unsigned ElementSize;
  switch (ElementType->getTypeID()) {
  default:
    llvm_unreachable("Unknown type!");
  case Type::IntegerTyID:
    ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
    break;
  case Type::HalfTyID:
    ElementSize = 16;
    break;
  case Type::FloatTyID:
    ElementSize = 32;
    break;
  }
  unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
  Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);

  return MVT::getVectorVT(MVT::getVT(ElementType, false),
                          Pow2Elts);
}
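
// Example of the rounding above: a { <3 x float>, i32 } return has
// NumElts = 3 and ElementSize = 32, so Pow2Elts = 1 << Log2_32_Ceil(4) = 4 and
// the reported memory type is v4f32.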

bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {
  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
          AMDGPU::lookupRsrcIntrinsic(IntrID)) {
    AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
                                                  (Intrinsic::ID)IntrID);
    if (Attr.hasFnAttribute(Attribute::ReadNone))
      return false;

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (RsrcIntr->IsImage) {
      Info.ptrVal = MFI->getImagePSV(
        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
        CI.getArgOperand(RsrcIntr->RsrcArg));
    } else {
      Info.ptrVal = MFI->getBufferPSV(
        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
        CI.getArgOperand(RsrcIntr->RsrcArg));
    }

    Info.flags = MachineMemOperand::MODereferenceable;
    if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getType(), true);
      if (Info.memVT == MVT::Other) {
        // Some intrinsics return an aggregate type - special case to work out
        // the correct memVT.
        Info.memVT = memVTFromAggregate(CI.getType());
      }
      Info.flags |= MachineMemOperand::MOLoad;
    } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
      Info.opc = ISD::INTRINSIC_VOID;
      Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
      Info.flags |= MachineMemOperand::MOStore;
    } else {
      // Atomic.
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getType());
      Info.flags = MachineMemOperand::MOLoad |
                   MachineMemOperand::MOStore |
                   MachineMemOperand::MODereferenceable;

      // XXX - Should this be volatile without known ordering?
      Info.flags |= MachineMemOperand::MOVolatile;
    }
    return true;
  }

  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_buffer_atomic_fadd: {
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
    Info.ptrVal = MFI->getBufferPSV(
      *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
      CI.getArgOperand(1));
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
    if (!Vol || !Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_global_atomic_fadd: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
                            ->getPointerElementType());
    Info.ptrVal = CI.getOperand(0);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    return true;
  }
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.opc = ISD::INTRINSIC_VOID;

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Info.ptrVal =
        MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());

    // This is an abstract access, but we need to specify a type and size.
    Info.memVT = MVT::i32;
    Info.align = Align(4);

    Info.flags = MachineMemOperand::MOStore;
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
      Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  default:
    return false;
  }
}

bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                            SmallVectorImpl<Value*> &Ops,
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    Value *Ptr = II->getArgOperand(0);
    AccessTy = II->getType();
    Ops.push_back(Ptr);
    return true;
  }
  default:
    return false;
  }
}

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
  if (!Subtarget->hasFlatInstOffsets()) {
    // Flat instructions do not have offsets, and only have the register
    // address.
    return AM.BaseOffs == 0 && AM.Scale == 0;
  }

  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
  // the sign bit is ignored and is treated as a 12-bit unsigned offset.
  //
  // GFX10 shrank the signed offset to 12 bits. When using regular flat
  // instructions, the sign bit is also ignored and is treated as an 11-bit
  // unsigned offset.
  if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
    return isUInt<11>(AM.BaseOffs) && AM.Scale == 0;

  // Just r + i.
  return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}
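
// Quick reference for the checks above: with flat instruction offsets but
// pre-GFX10, a constant offset up to 4095 (isUInt<12>) can be folded into the
// flat access; on GFX10 the limit drops to 2047 (isUInt<11>); without flat
// offsets no immediate can be folded at all.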

bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
  if (Subtarget->hasFlatGlobalInsts())
    return isInt<13>(AM.BaseOffs) && AM.Scale == 0;

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
    // Assume that we will use FLAT for all global memory accesses on VI.
    //
    // FIXME: This assumption is currently wrong. On VI we still use
    // MUBUF instructions for the r + i addressing mode. As currently
    // implemented, the MUBUF instructions only work on buffers < 4GB.
    // It may be possible to support > 4GB buffers with MUBUF instructions,
    // by setting the stride value in the resource descriptor which would
    // increase the size limit to (stride * 4GB). However, this is risky,
    // because it has never been validated.
    return isLegalFlatAddressingMode(AM);
  }

  return isLegalMUBUFAddressingMode(AM);
}

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
  // additionally can do r + r + i with addr64. 32-bit has more addressing
  // mode options. Depending on the resource constant, it can also do
  // (i64 r0) + (i32 r1) * (i14 i).
  //
  // Private arrays end up using a scratch buffer most of the time, so also
  // assume those use MUBUF instructions. Scratch loads / stores are currently
  // implemented as mubuf instructions with the offen bit set, so slightly
  // different than the normal addr64.
  if (!isUInt<12>(AM.BaseOffs))
    return false;

  // FIXME: Since we can split the immediate into soffset and immediate offset,
  // would it make sense to allow any immediate?

  switch (AM.Scale) {
  case 0: // r + i or just i, depending on HasBaseReg.
    return true;
  case 1:
    return true; // We have r + r or r + i.
  case 2:
    if (AM.HasBaseReg) {
      // Reject 2 * r + r.
      return false;
    }

    // Allow 2 * r as r + r,
    // or  2 * r + i as r + r + i.
    return true;
  default: // Don't allow n * r.
    return false;
  }
}
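
// Example for the scale handling above: an address of the form 2*r + 16 is
// accepted because it can be emitted as r + r + 16, while 2*r + r (a base
// register plus a scaled register) is rejected.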

bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS, Instruction *I) const {
  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return isLegalGlobalAddressingMode(AM);

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::BUFFER_FAT_POINTER) {
    // If the offset isn't a multiple of 4, it probably isn't going to be
    // correctly aligned.
    // FIXME: Can we get the real alignment here?
    if (AM.BaseOffs % 4 != 0)
      return isLegalMUBUFAddressingMode(AM);

    // There are no SMRD extloads, so if we have to do a small type access we
    // will use a MUBUF load.
    // FIXME?: We also need to do this if unaligned, but we don't know the
    // alignment here.
    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
      return isLegalGlobalAddressingMode(AM);

    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
      if (!isUInt<8>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
      // in 8-bits, it can use a smaller encoding.
      if (!isUInt<32>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
      if (!isUInt<20>(AM.BaseOffs))
        return false;
    } else
      llvm_unreachable("unhandled generation");

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;

  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    return isLegalMUBUFAddressingMode(AM);
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
             AS == AMDGPUAS::REGION_ADDRESS) {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
    // offset.
    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
    // an 8-bit dword offset but we don't know the alignment here.
    if (!isUInt<16>(AM.BaseOffs))
      return false;

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
             AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
    // For an unknown address space, this usually means that this is for some
    // reason being used for pure arithmetic, and not based on some addressing
    // computation. We don't have instructions that compute pointers with any
    // addressing modes, so treat them as having no offset like flat
    // instructions.
    return isLegalFlatAddressingMode(AM);
  }

  llvm_unreachable("unhandled address space");
}

bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
                                        const SelectionDAG &DAG) const {
  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
    return (MemVT.getSizeInBits() <= 4 * 32);
  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
    return (MemVT.getSizeInBits() <= MaxPrivateBits);
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    return (MemVT.getSizeInBits() <= 2 * 32);
  }
  return true;
}

bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
    unsigned Size, unsigned AddrSpace, unsigned Align,
    MachineMemOperand::Flags Flags, bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
    // with adjacent offsets.
    bool AlignedBy4 = (Align % 4 == 0);
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4;
  }

  // FIXME: We have to be conservative here and assume that flat operations
  // will access scratch. If we had access to the IR function, then we
  // could determine if any private memory was used in the function.
  if (!Subtarget->hasUnalignedScratchAccess() &&
      (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
       AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
    bool AlignedBy4 = Align >= 4;
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4;
  }

  if (Subtarget->hasUnalignedBufferAccess()) {
    // If we have a uniform constant load, it still requires using a slow
    // buffer instruction if unaligned.
    if (IsFast) {
      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
                 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
        (Align % 4 == 0) : true;
    }

    return true;
  }

  // Smaller than dword values must be aligned.
  if (Size < 32)
    return false;

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.
  if (IsFast)
    *IsFast = true;

  return Size >= 32 && Align >= 4;
}
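
// Illustration of the LDS case above: an 8-byte local load that is only
// 4-byte aligned is still reported as fast, because it can be emitted as a
// single ds_read2_b32 covering two adjacent dwords.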

bool SITargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
    bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
  // which isn't a simple VT.
  // Until MVT is extended to handle this, simply check for the size and
  // rely on the condition below: allow accesses if the size is a multiple of 4.
  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
                           VT.getStoreSize() > 16)) {
    return false;
  }

  return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
                                            Align, Flags, IsFast);
}

EVT SITargetLowering::getOptimalMemOpType(
    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
    bool ZeroMemset, bool MemcpyStrSrc,
    const AttributeList &FuncAttributes) const {
  // FIXME: Should account for address space here.

  // The default fallback uses the private pointer size as a guess for a type to
  // use. Make sure we switch these to 64-bit accesses.

  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global.
    return MVT::v4i32;

  if (Size >= 8 && DstAlign >= 4)
    return MVT::v2i32;

  // Use the default.
  return MVT::Other;
}

static bool isFlatGlobalAddrSpace(unsigned AS) {
  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
         AS == AMDGPUAS::FLAT_ADDRESS ||
         AS == AMDGPUAS::CONSTANT_ADDRESS ||
         AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
}

bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
}

bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);
  const Value *Ptr = MemNode->getMemOperand()->getValue();
  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.noclobber");
}

bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
  // Flat -> private/local is a simple truncate.
  // Flat -> global is a no-op.
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
    return true;

  return isNoopAddrSpaceCast(SrcAS, DestAS);
}

bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);

  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
}

TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(MVT VT) const {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16))
    return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
  return TargetLoweringBase::getPreferredVectorAction(VT);
}

bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
  // FIXME: Could be smarter if called for vector constants.
  return true;
}

bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
    switch (Op) {
    case ISD::LOAD:
    case ISD::STORE:

    // These operations are done with 32-bit instructions anyway.
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SELECT:
      // TODO: Extensions?
      return true;
    default:
      return false;
    }
  }

  // SimplifySetCC uses this function to determine whether or not it should
  // create setcc with i1 operands. We don't have instructions for i1 setcc.
  if (VT == MVT::i1 && Op == ISD::SETCC)
    return false;

  return TargetLowering::isTypeDesirableForOp(Op, VT);
}

SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
                                                   const SDLoc &SL,
                                                   SDValue Chain,
                                                   uint64_t Offset) const {
  const DataLayout &DL = DAG.getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *InputPtrReg;
  const TargetRegisterClass *RC;

  std::tie(InputPtrReg, RC)
    = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);

  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
    MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

  return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
}

SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
                                            const SDLoc &SL) const {
  uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
                                               FIRST_IMPLICIT);
  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
}

SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Val,
                                         bool Signed,
                                         const ISD::InputArg *Arg) const {
  // First, if it is a widened vector, narrow it.
  if (VT.isVector() &&
      VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
    EVT NarrowedVT =
        EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
                         VT.getVectorNumElements());
    Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
                      DAG.getConstant(0, SL, MVT::i32));
  }

  // Then convert the vector elements or scalar value.
  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
      VT.bitsLT(MemVT)) {
    unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
    Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
  }

  if (MemVT.isFloatingPoint())
    Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
  else if (Signed)
    Val = DAG.getSExtOrTrunc(Val, SL, VT);
  else
    Val = DAG.getZExtOrTrunc(Val, SL, VT);

  return Val;
}
SITargetLowering::lowerKernargMemParameter(
1463 SelectionDAG
&DAG
, EVT VT
, EVT MemVT
,
1464 const SDLoc
&SL
, SDValue Chain
,
1465 uint64_t Offset
, unsigned Align
, bool Signed
,
1466 const ISD::InputArg
*Arg
) const {
1467 Type
*Ty
= MemVT
.getTypeForEVT(*DAG
.getContext());
1468 PointerType
*PtrTy
= PointerType::get(Ty
, AMDGPUAS::CONSTANT_ADDRESS
);
1469 MachinePointerInfo
PtrInfo(UndefValue::get(PtrTy
));
1471 // Try to avoid using an extload by loading earlier than the argument address,
1472 // and extracting the relevant bits. The load should hopefully be merged with
1473 // the previous argument.
1474 if (MemVT
.getStoreSize() < 4 && Align
< 4) {
1475 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1476 int64_t AlignDownOffset
= alignDown(Offset
, 4);
1477 int64_t OffsetDiff
= Offset
- AlignDownOffset
;
1479 EVT IntVT
= MemVT
.changeTypeToInteger();
1481 // TODO: If we passed in the base kernel offset we could have a better
1482 // alignment than 4, but we don't really need it.
1483 SDValue Ptr
= lowerKernArgParameterPtr(DAG
, SL
, Chain
, AlignDownOffset
);
1484 SDValue Load
= DAG
.getLoad(MVT::i32
, SL
, Chain
, Ptr
, PtrInfo
, 4,
1485 MachineMemOperand::MODereferenceable
|
1486 MachineMemOperand::MOInvariant
);
1488 SDValue ShiftAmt
= DAG
.getConstant(OffsetDiff
* 8, SL
, MVT::i32
);
1489 SDValue Extract
= DAG
.getNode(ISD::SRL
, SL
, MVT::i32
, Load
, ShiftAmt
);
1491 SDValue ArgVal
= DAG
.getNode(ISD::TRUNCATE
, SL
, IntVT
, Extract
);
1492 ArgVal
= DAG
.getNode(ISD::BITCAST
, SL
, MemVT
, ArgVal
);
1493 ArgVal
= convertArgType(DAG
, VT
, MemVT
, SL
, ArgVal
, Signed
, Arg
);
1496 return DAG
.getMergeValues({ ArgVal
, Load
.getValue(1) }, SL
);
1499 SDValue Ptr
= lowerKernArgParameterPtr(DAG
, SL
, Chain
, Offset
);
1500 SDValue Load
= DAG
.getLoad(MemVT
, SL
, Chain
, Ptr
, PtrInfo
, Align
,
1501 MachineMemOperand::MODereferenceable
|
1502 MachineMemOperand::MOInvariant
);
1504 SDValue Val
= convertArgType(DAG
, VT
, MemVT
, SL
, Load
, Signed
, Arg
);
1505 return DAG
.getMergeValues({ Val
, Load
.getValue(1) }, SL
);
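
// Example of the sub-dword path above: an i16 kernel argument at byte offset
// 2 with 2-byte alignment loads the aligned i32 at offset 0, shifts right by
// (2 - 0) * 8 = 16 bits, and truncates to i16, avoiding a 16-bit extending
// load.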
SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
                                              const SDLoc &SL, SDValue Chain,
                                              const ISD::InputArg &Arg) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  if (Arg.Flags.isByVal()) {
    unsigned Size = Arg.Flags.getByValSize();
    int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
    return DAG.getFrameIndex(FrameIdx, MVT::i32);
  }

  unsigned ArgOffset = VA.getLocMemOffset();
  unsigned ArgSize = VA.getValVT().getStoreSize();

  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);

  // Create load nodes to retrieve arguments from the stack.
  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
  SDValue ArgValue;

  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
  MVT MemVT = VA.getValVT();

  switch (VA.getLocInfo()) {
  default:
    break;
  case CCValAssign::BCvt:
    MemVT = VA.getLocVT();
    break;
  case CCValAssign::SExt:
    ExtType = ISD::SEXTLOAD;
    break;
  case CCValAssign::ZExt:
    ExtType = ISD::ZEXTLOAD;
    break;
  case CCValAssign::AExt:
    ExtType = ISD::EXTLOAD;
    break;
  }

  ArgValue = DAG.getExtLoad(
      ExtType, SL, VA.getLocVT(), Chain, FIN,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
      MemVT);
  return ArgValue;
}
SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
  const SIMachineFunctionInfo &MFI,
  EVT VT,
  AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
  const ArgDescriptor *Reg;
  const TargetRegisterClass *RC;

  std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
}
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
                                   CallingConv::ID CallConv,
                                   ArrayRef<ISD::InputArg> Ins,
                                   BitVector &Skipped,
                                   FunctionType *FType,
                                   SIMachineFunctionInfo *Info) {
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
    const ISD::InputArg *Arg = &Ins[I];

    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
           "vector type argument should have been split");

    // First check if it's a PS input addr.
    if (CallConv == CallingConv::AMDGPU_PS &&
        !Arg->Flags.isInReg() && PSInputNum <= 15) {
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

      // Inconveniently only the first part of the split is marked as isSplit,
      // so skip to the end. We only want to increment PSInputNum once for the
      // entire split argument.
      if (Arg->Flags.isSplit()) {
        while (!Arg->Flags.isSplitEnd()) {
          assert((!Arg->VT.isVector() ||
                  Arg->VT.getScalarSizeInBits() == 16) &&
                 "unexpected vector split in ps argument type");
          if (!SkipArg)
            Splits.push_back(*Arg);
          Arg = &Ins[++I];
        }
      }

      if (SkipArg) {
        // We can safely skip PS inputs.
        Skipped.set(Arg->getOrigArgIndex());
        ++PSInputNum;
        continue;
      }

      Info->markPSInputAllocated(PSInputNum);
      if (Arg->Used)
        Info->markPSInputEnabled(PSInputNum);

      ++PSInputNum;
    }

    Splits.push_back(*Arg);
  }
}
// Allocate special inputs passed in VGPRs.
void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
                                                      MachineFunction &MF,
                                                      const SIRegisterInfo &TRI,
                                                      SIMachineFunctionInfo &Info) const {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (Info.hasWorkItemIDX()) {
    Register Reg = AMDGPU::VGPR0;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
  }

  if (Info.hasWorkItemIDY()) {
    Register Reg = AMDGPU::VGPR1;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
  }

  if (Info.hasWorkItemIDZ()) {
    Register Reg = AMDGPU::VGPR2;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
  }
}
// Try to allocate a VGPR at the end of the argument list, or if no argument
// VGPRs are left, allocate a stack slot instead.
// If \p Mask is given it indicates the bitfield position in the register.
// If \p Arg is given, use it with the new \p Mask instead of allocating new.
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
                                         ArgDescriptor Arg = ArgDescriptor()) {
  if (Arg.isSet())
    return ArgDescriptor::createArg(Arg, Mask);

  ArrayRef<MCPhysReg> ArgVGPRs
    = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
  if (RegIdx == ArgVGPRs.size()) {
    // Spill to stack required.
    int64_t Offset = CCInfo.AllocateStack(4, 4);

    return ArgDescriptor::createStack(Offset, Mask);
  }

  unsigned Reg = ArgVGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  MachineFunction &MF = CCInfo.getMachineFunction();
  Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
  MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
  return ArgDescriptor::createRegister(Reg, Mask);
}
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
                                             const TargetRegisterClass *RC,
                                             unsigned NumArgRegs) {
  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
  if (RegIdx == ArgSGPRs.size())
    report_fatal_error("ran out of SGPRs for arguments");

  unsigned Reg = ArgSGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  MachineFunction &MF = CCInfo.getMachineFunction();
  MF.addLiveIn(Reg, RC);
  return ArgDescriptor::createRegister(Reg);
}
static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
}

static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
}
void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo,
                                                 MachineFunction &MF,
                                                 const SIRegisterInfo &TRI,
                                                 SIMachineFunctionInfo &Info) const {
  const unsigned Mask = 0x3ff;
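  // The three workitem IDs share a single VGPR: X occupies bits [9:0], Y bits
  // [19:10], and Z bits [29:20], which is why the 10-bit mask is shifted by 0,
  // 10, and 20 below.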
  ArgDescriptor Arg;

  if (Info.hasWorkItemIDX()) {
    Arg = allocateVGPR32Input(CCInfo, Mask);
    Info.setWorkItemIDX(Arg);
  }

  if (Info.hasWorkItemIDY()) {
    Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
    Info.setWorkItemIDY(Arg);
  }

  if (Info.hasWorkItemIDZ())
    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
}
void SITargetLowering::allocateSpecialInputSGPRs(
  CCState &CCInfo,
  MachineFunction &MF,
  const SIRegisterInfo &TRI,
  SIMachineFunctionInfo &Info) const {
  auto &ArgInfo = Info.getArgInfo();

  // TODO: Unify handling with private memory pointers.

  if (Info.hasDispatchPtr())
    ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);

  if (Info.hasQueuePtr())
    ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);

  if (Info.hasKernargSegmentPtr())
    ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);

  if (Info.hasDispatchID())
    ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);

  // flat_scratch_init is not applicable for non-kernel functions.

  if (Info.hasWorkGroupIDX())
    ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);

  if (Info.hasWorkGroupIDY())
    ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);

  if (Info.hasWorkGroupIDZ())
    ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);

  if (Info.hasImplicitArgPtr())
    ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
}
// Allocate special inputs passed in user SGPRs.
void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
                                            MachineFunction &MF,
                                            const SIRegisterInfo &TRI,
                                            SIMachineFunctionInfo &Info) const {
  if (Info.hasImplicitBufferPtr()) {
    unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    CCInfo.AllocateReg(InputPtrReg);

    Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
    MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  }

  if (Info.hasDispatchID()) {
    unsigned DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}
// Allocate special input registers that are initialized per-wave.
void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
                                           MachineFunction &MF,
                                           SIMachineFunctionInfo &Info,
                                           CallingConv::ID CallConv,
                                           bool IsShader) const {
  if (Info.hasWorkGroupIDX()) {
    unsigned Reg = Info.addWorkGroupIDX();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDY()) {
    unsigned Reg = Info.addWorkGroupIDY();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDZ()) {
    unsigned Reg = Info.addWorkGroupIDZ();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupInfo()) {
    unsigned Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (IsShader) {
      PrivateSegmentWaveByteOffsetReg =
        Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      // This is true if the scratch wave byte offset doesn't have a fixed
      // location.
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      }
    } else
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }
}
static void reservePrivateMemoryRegs(const TargetMachine &TM,
                                     MachineFunction &MF,
                                     const SIRegisterInfo &TRI,
                                     SIMachineFunctionInfo &Info) {
  // Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool HasStackObjects = MFI.hasStackObjects();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

  // Everything live out of a block is spilled with fast regalloc, so it's
  // almost certain that spilling will be required.
  if (TM.getOptLevel() == CodeGenOpt::None)
    HasStackObjects = true;

  // For now assume stack access is needed in any callee functions, so we need
  // the scratch registers to pass in.
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
    // If we have stack objects, we unquestionably need the private buffer
    // resource. For the Code Object V2 ABI, this will be the first 4 user
    // SGPR inputs. We can reserve those and use them directly.

    Register PrivateSegmentBufferReg =
        Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    Info.setScratchRSrcReg(PrivateSegmentBufferReg);
  } else {
    unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
    // We tentatively reserve the last registers (skipping the last registers
    // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
    // we'll replace these with the ones immediately after those which were
    // really allocated. In the prologue copies will be inserted from the
    // argument to these reserved registers.

    // Without HSA, relocations are used for the scratch pointer and the
    // buffer resource setup is always inserted in the prologue. Scratch wave
    // offset is still in an input SGPR.
    Info.setScratchRSrcReg(ReservedBufferReg);
  }

  // hasFP should be accurate for kernels even before the frame is finalized.
  if (ST.getFrameLowering()->hasFP(MF)) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    // Try to use s32 as the SP, but move it if it would interfere with input
    // arguments. This won't work with calls though.
    //
    // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
    // registers.
    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
    } else {
      assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));

      if (MFI.hasCalls())
        report_fatal_error("call in graphics shader with too many input SGPRs");

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);
          break;
        }
      }

      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
        report_fatal_error("failed to find register for SP");
    }

    if (MFI.hasCalls()) {
      Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
      Info.setFrameOffsetReg(AMDGPU::SGPR33);
    } else {
      unsigned ReservedOffsetReg =
        TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
      Info.setFrameOffsetReg(ReservedOffsetReg);
    }
  } else if (RequiresStackAccess) {
    assert(!MFI.hasCalls());
    // We know there are accesses and they will be done relative to SP, so just
    // pin it to the input.
    //
    // FIXME: Should not do this if inline asm is reading/writing these
    // registers.
    Register PreloadedSP = Info.getPreloadedReg(
        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

    Info.setStackPtrOffsetReg(PreloadedSP);
    Info.setScratchWaveOffsetReg(PreloadedSP);
    Info.setFrameOffsetReg(PreloadedSP);
  } else {
    assert(!MFI.hasCalls());

    // There may not be stack access at all. There may still be spills, or
    // access of a constant pointer (in which cases an extra copy will be
    // emitted in the prolog).
    unsigned ReservedOffsetReg
      = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    Info.setStackPtrOffsetReg(ReservedOffsetReg);
    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    Info.setFrameOffsetReg(ReservedOffsetReg);
  }
}
bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  return !Info->isEntryFunction();
}

void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
}
void SITargetLowering::insertCopiesSplitCSR(
  MachineBasicBlock *Entry,
  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
      .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
        .addReg(NewVR);
  }
}
SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &Fn = MF.getFunction();
  FunctionType *FType = MF.getFunction().getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
    DiagnosticInfoUnsupported NoGraphicsHSA(
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    DAG.getContext()->diagnose(NoGraphicsHSA);
    return DAG.getEntryNode();
  }

  SmallVector<ISD::InputArg, 16> Splits;
  SmallVector<CCValAssign, 16> ArgLocs;
  BitVector Skipped(Ins.size());
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  bool IsShader = AMDGPU::isShader(CallConv);
  bool IsKernel = AMDGPU::isKernel(CallConv);
  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);

  if (IsShader) {
    processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);

    // At least one interpolation mode must be enabled or else the GPU will
    // hang.
    //
    // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
    // set PSInputAddr, the user wants to enable some bits after the compilation
    // based on run-time states. Since we can't know what the final PSInputEna
    // will look like, we shouldn't do anything here and the user should take
    // responsibility for the correct programming.
    //
    // Otherwise, the following restrictions apply:
    // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
    // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
    //   enabled too.
    if (CallConv == CallingConv::AMDGPU_PS) {
      if ((Info->getPSInputAddr() & 0x7F) == 0 ||
          ((Info->getPSInputAddr() & 0xF) == 0 &&
           Info->isPSInputAllocated(11))) {
        CCInfo.AllocateReg(AMDGPU::VGPR0);
        CCInfo.AllocateReg(AMDGPU::VGPR1);
        Info->markPSInputAllocated(0);
        Info->markPSInputEnabled(0);
      }
      if (Subtarget->isAmdPalOS()) {
        // For isAmdPalOS, the user does not enable some bits after compilation
        // based on run-time states; the register values being generated here
        // are the final ones set in hardware. Therefore we need to apply the
        // workaround to PSInputAddr and PSInputEnable together. (The case
        // where a bit is set in PSInputAddr but not PSInputEnable is where the
        // frontend set up an input arg for a particular interpolation mode,
        // but nothing uses that input arg. Really we should have an earlier
        // pass that removes such an arg.)
        unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
        if ((PsInputBits & 0x7F) == 0 ||
            ((PsInputBits & 0xF) == 0 &&
             (PsInputBits >> 11 & 1)))
          Info->markPSInputEnabled(
              countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
      }
    }

    assert(!Info->hasDispatchPtr() &&
           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
           !Info->hasWorkItemIDZ());
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
  } else {
    Splits.append(Ins.begin(), Ins.end());
  }

  if (IsEntryFunc) {
    allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
    allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
  }

  if (IsKernel) {
    analyzeFormalArgumentsCompute(CCInfo, Ins);
  } else {
    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
    CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
  }

  SmallVector<SDValue, 16> Chains;

  // FIXME: This is the minimum kernel argument alignment. We should improve
  // this to the maximum alignment of the arguments.
  //
  // FIXME: Alignment of explicit arguments is totally broken with a non-0
  // explicit kern arg offset.
  const unsigned KernelArgBaseAlign = 16;
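  // MinAlign(KernelArgBaseAlign, Offset) below yields the largest power of two
  // dividing both values; e.g. an argument at byte offset 4 is treated as
  // 4-byte aligned, while one at offset 8 gets 8-byte alignment.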
  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];
    if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    MVT VT = VA.getLocVT();

    if (IsEntryFunc && VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = VA.getLocVT();

      const uint64_t Offset = VA.getLocMemOffset();
      unsigned Align = MinAlign(KernelArgBaseAlign, Offset);

      SDValue Arg = lowerKernargMemParameter(
          DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
      Chains.push_back(Arg.getValue(1));

      auto *ParamTy =
        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
      if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
          ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
                      ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
        // On SI local pointers are just offsets into LDS, so they are always
        // less than 16 bits. On CI and newer they could potentially be
        // real pointers, so we can't guarantee their size.
        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
                          DAG.getValueType(MVT::i16));
      }

      InVals.push_back(Arg);
      continue;
    } else if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
      InVals.push_back(Val);
      if (!Arg.Flags.isByVal())
        Chains.push_back(Val.getValue(1));
      continue;
    }

    assert(VA.isRegLoc() && "Parameter must be in a register!");

    Register Reg = VA.getLocReg();
    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
    EVT ValVT = VA.getValVT();

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.Flags.isSRet()) {
      // The return object should be reasonably addressable.

      // FIXME: This helps when the return is a real sret. If it is an
      // automatically inserted sret (i.e. CanLowerReturn returns false), an
      // extra copy is inserted in SelectionDAGBuilder which obscures this.
      unsigned NumBits
        = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
        DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
    }

    // If this is an 8 or 16-bit value, it is really passed promoted
    // to 32 bits. Insert an assert[sz]ext to capture this, then
    // truncate to the right size.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
                        DAG.getValueType(ValVT));
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
                        DAG.getValueType(ValVT));
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Val);
  }

  if (!IsEntryFunc) {
    // Special inputs come after user arguments.
    allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
    CCInfo.AllocateReg(Info->getFrameOffsetReg());
    allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  auto &ArgUsageInfo =
    DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());

  unsigned StackArgSize = CCInfo.getNextStackOffset();
  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain :
    DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
// TODO: If return values can't fit in registers, we should return as many as
// possible in registers before passing on stack.
bool SITargetLowering::CanLowerReturn(
  CallingConv::ID CallConv,
  MachineFunction &MF, bool IsVarArg,
  const SmallVectorImpl<ISD::OutputArg> &Outs,
  LLVMContext &Context) const {
  // Replacing returns with sret/stack usage doesn't make sense for shaders.
  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
  // for shaders. Vector types should be explicitly handled by CC.
  if (AMDGPU::isEntryFunctionCC(CallConv))
    return true;

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
}
SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (AMDGPU::isKernel(CallConv)) {
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);
  }

  bool IsShader = AMDGPU::isShader(CallConv);

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;
  SmallVector<ISD::OutputArg, 48> Splits;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Flag;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

  // Add return address for callable functions.
  if (!Info->isEntryFunction()) {
    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    SDValue ReturnAddrReg = CreateLiveInRegister(
      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);

    SDValue ReturnAddrVirtualReg = DAG.getRegister(
        MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
        MVT::i64);
    Chain =
        DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(ReturnAddrVirtualReg);
  }

  // Copy the result values into the output registers.
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");
    // TODO: Partially return in registers if return values don't fit.
    SDValue Arg = OutVals[RealRVLocIdx];

    // Copied from other backends.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // FIXME: Does sret work properly?
  if (!Info->isEntryFunction()) {
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
    if (I) {
      for (; *I; ++I) {
        if (AMDGPU::SReg_64RegClass.contains(*I))
          RetOps.push_back(DAG.getRegister(*I, MVT::i64));
        else if (AMDGPU::SReg_32RegClass.contains(*I))
          RetOps.push_back(DAG.getRegister(*I, MVT::i32));
        else
          llvm_unreachable("Unexpected register class in CSRsViaCopy!");
      }
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);
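  // Select the return opcode: ENDPGM ends the wave for void-returning shaders,
  // RETURN_TO_EPILOG hands shader return values to the epilog, and RET_FLAG is
  // the normal return for callable functions.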
  unsigned Opc = AMDGPUISD::ENDPGM;
  if (!IsWaveEnd)
    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
SDValue SITargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    SDValue ThisVal) const {
  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];
    SDValue Val;

    if (VA.isRegLoc()) {
      Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    } else if (VA.isMemLoc()) {
      report_fatal_error("TODO: return values in memory");
    } else
      llvm_unreachable("unknown argument location type");

    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Val);
  }

  return Chain;
}
// Add code to pass special inputs required depending on used features separate
// from the explicit user arguments present in the IR.
void SITargetLowering::passSpecialInputs(
    CallLoweringInfo &CLI,
    CCState &CCInfo,
    const SIMachineFunctionInfo &Info,
    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
    SmallVectorImpl<SDValue> &MemOpChains,
    SDValue Chain) const {
  // If we don't have a call site, this was a call inserted by
  // legalization. These can never use special inputs.
  if (!CLI.CS)
    return;

  const Function *CalleeFunc = CLI.CS.getCalledFunction();
  assert(CalleeFunc);

  SelectionDAG &DAG = CLI.DAG;
  const SDLoc &DL = CLI.DL;

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  auto &ArgUsageInfo =
    DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
  const AMDGPUFunctionArgInfo &CalleeArgInfo
    = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
  };

  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;

    std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC)
      = CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    // All special arguments are ints for now.
    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
    SDValue InputReg;

    if (IncomingArg) {
      InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
    } else {
      // The implicit arg ptr is special because it doesn't have a corresponding
      // input for kernels, and is computed from the kernarg segment pointer.
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
      InputReg = getImplicitArgPtr(DAG, DL);
    }

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
    } else {
      unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
      SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
                                              SpecialArgOffset);
      MemOpChains.push_back(ArgStore);
    }
  }

  // Pack workitem IDs into a single register, or pass them as-is if already
  // packed.
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;

  std::tie(OutgoingArg, ArgRC) =
    CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC) =
      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC) =
      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return;

  const ArgDescriptor *IncomingArgX
    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
  const ArgDescriptor *IncomingArgY
    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
  const ArgDescriptor *IncomingArgZ
    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;

  SDValue InputReg;
  SDLoc SL;

  // If incoming ids are not packed we need to pack them.
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
    SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
    Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
                    DAG.getShiftAmountConstant(10, MVT::i32, SL));
    InputReg = InputReg.getNode() ?
                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
    SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
    Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
                    DAG.getShiftAmountConstant(20, MVT::i32, SL));
    InputReg = InputReg.getNode() ?
                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
  }

  if (!InputReg.getNode()) {
    // Workitem ids are already packed, any of present incoming arguments
    // will carry all required fields.
    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
      IncomingArgX ? *IncomingArgX :
      IncomingArgY ? *IncomingArgY :
                     *IncomingArgZ, ~0u);
    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
  }

  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
  } else {
    unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
    SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
                                            SpecialArgOffset);
    MemOpChains.push_back(ArgStore);
  }
}
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return CC == CallingConv::Fast;
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::C:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}
bool SITargetLowering::isEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  // Kernels aren't callable, and don't have a live in return address so it
  // doesn't make sense to do a tail call with entry functions.
  if (!CallerPreserved)
    return false;

  bool CCMatch = CallerCC == CalleeCC;

  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // TODO: Can we handle var args?
  if (IsVarArg)
    return false;

  for (const Argument &Arg : CallerF.args()) {
    if (Arg.hasByValAttr())
      return false;
  }

  LLVMContext &Ctx = *DAG.getContext();

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
                                  CCAssignFnForCall(CalleeCC, IsVarArg),
                                  CCAssignFnForCall(CallerCC, IsVarArg)))
    return false;

  // The callee has to preserve all registers the caller needs to preserve.
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // Nothing more to check if the callee is taking no arguments.
  if (Outs.empty())
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made tail.
  // TODO: Is this really necessary?
  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
    return false;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
}

bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!CI->isTailCall())
    return false;

  const Function *ParentFn = CI->getParent()->getParent();
  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
    return false;

  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
  return (Attr.getValueAsString() != "true");
}
// The wave scratch offset register is used as the global base pointer.
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                                    SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  const SDLoc &DL = CLI.DL;
  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &IsTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool IsVarArg = CLI.IsVarArg;
  bool IsSibCall = false;
  bool IsThisReturn = false;
  MachineFunction &MF = DAG.getMachineFunction();

  if (IsVarArg) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported call to variadic function ");
  }

  if (!CLI.CS.getInstruction())
    report_fatal_error("unsupported libcall legalization");

  if (!CLI.CS.getCalledFunction()) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported indirect call to function ");
  }

  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported required tail call to function ");
  }

  if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
    // Note the issue is with the CC of the calling function, not of the call
    // itself.
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported call from graphics shader of function ");
  }

  if (IsTailCall) {
    IsTailCall = isEligibleForTailCallOptimization(
      Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
    if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
      report_fatal_error("failed to perform tail call elimination on a call "
                         "site marked musttail");
    }

    bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;

    // A sibling call is one where we're under the usual C ABI and not planning
    // to change that but can still do a tail call:
    if (!TailCallOpt && IsTailCall)
      IsSibCall = true;

    if (IsTailCall)
      ++NumTailCalls;
  }

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);

  CCInfo.AnalyzeCallOperands(Outs, AssignFn);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  if (IsSibCall) {
    // Since we're not changing the ABI to make this a tail call, the memory
    // operands are already available in the caller's incoming argument space.
    NumBytes = 0;
  }

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0. Completely unused for non-tail calls.
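  // Since ABI-changing (GuaranteedTailCallOpt) tail calls were rejected above,
  // only sibling calls reach the tail-call path here and FPDiff is expected to
  // remain 0.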
  int32_t FPDiff = 0;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass.
  if (!IsSibCall) {
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);

    SmallVector<SDValue, 4> CopyFromChains;

    // In the HSA case, this should be an identity copy.
    SDValue ScratchRSrcReg
      = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
    RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
    CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
    Chain = DAG.getTokenFactor(DL, CopyFromChains);
  }

  SmallVector<SDValue, 8> MemOpChains;
  MVT PtrVT = MVT::i32;

  // Walk the register/memloc assignments, inserting copies/loads.
  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
       ++i, ++realArgIdx) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[realArgIdx];

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::FPExt:
      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      assert(VA.isMemLoc());

      SDValue DstAddr;
      MachinePointerInfo DstInfo;

      unsigned LocMemOffset = VA.getLocMemOffset();
      int32_t Offset = LocMemOffset;

      SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
      unsigned Align = 0;

      if (IsTailCall) {
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
        unsigned OpSize = Flags.isByVal() ?
          Flags.getByValSize() : VA.getValVT().getStoreSize();

        // FIXME: We can have better than the minimum byval required alignment.
        Align = Flags.isByVal() ? Flags.getByValAlign() :
          MinAlign(Subtarget->getStackAlignment(), Offset);

        Offset = Offset + FPDiff;
        int FI = MFI.CreateFixedObject(OpSize, Offset, true);

        DstAddr = DAG.getFrameIndex(FI, PtrVT);
        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);

        // Make sure any stack arguments overlapping with where we're storing
        // are loaded before this eventual operation. Otherwise they'll be
        // clobbered.

        // FIXME: Why is this really necessary? This seems to just result in a
        // lot of code to copy the stack and write them back to the same
        // locations, which are supposed to be immutable?
        Chain = addTokenForArgument(Chain, DAG, MFI, FI);
      } else {
        DstAddr = PtrOff;
        DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
        Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
      }

      if (Outs[i].Flags.isByVal()) {
        SDValue SizeNode =
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
        SDValue Cpy = DAG.getMemcpy(
            Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
            /*isVol = */ false, /*AlwaysInline = */ true,
            /*isTailCall = */ false, DstInfo,
            MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
                *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));

        MemOpChains.push_back(Cpy);
      } else {
        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
        MemOpChains.push_back(Store);
      }
    }
  }

  // Copy special input registers after user input arguments.
  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (auto &RegToPass : RegsToPass) {
    Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
                             RegToPass.second, InFlag);
    InFlag = Chain.getValue(1);
  }

  SDValue PhysReturnAddrReg;
  if (IsTailCall) {
    // Since the return is being combined with the call, we need to pass on the
    // return address.
    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    SDValue ReturnAddrReg = CreateLiveInRegister(
      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);

    PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
                                        MVT::i64);
    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
    InFlag = Chain.getValue(1);
  }

  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call, however in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll be
  // in the correct location.
  if (IsTailCall && !IsSibCall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getTargetConstant(NumBytes, DL, MVT::i32),
                               DAG.getTargetConstant(0, DL, MVT::i32),
                               InFlag, DL);
    InFlag = Chain.getValue(1);
  }

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);
  // Add a redundant copy of the callee global which will not be legalized, as
  // we need direct access to the callee later.
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
  const GlobalValue *GV = GSD->getGlobal();
  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));

  if (IsTailCall) {
    // Each tail call may have to adjust the stack by a different amount, so
    // this information must travel along with the operation for eventual
    // consumption by emitEpilogue.
    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));

    Ops.push_back(PhysReturnAddrReg);
  }

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (auto &RegToPass : RegsToPass) {
    Ops.push_back(DAG.getRegister(RegToPass.first,
                                  RegToPass.second.getValueType()));
  }

  // Add a register mask operand representing the call-preserved registers.
  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
  if (IsTailCall) {
    MFI.setHasTailCall();
    return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
  }

  // Returns a chain and a flag for retval copy to use.
  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
  Chain = Call.getValue(0);
  InFlag = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
                             DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
                             InFlag, DL);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
                         InVals, IsThisReturn,
                         IsThisReturn ? OutVals[0] : SDValue());
}
Register SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                             const MachineFunction &MF) const {
  Register Reg = StringSwitch<Register>(RegName)
    .Case("m0", AMDGPU::M0)
    .Case("exec", AMDGPU::EXEC)
    .Case("exec_lo", AMDGPU::EXEC_LO)
    .Case("exec_hi", AMDGPU::EXEC_HI)
    .Case("flat_scratch", AMDGPU::FLAT_SCR)
    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    .Default(Register());

  if (Reg == AMDGPU::NoRegister) {
    report_fatal_error(Twine("invalid register name \""
                             + StringRef(RegName) + "\"."));
  }

  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    report_fatal_error(Twine("invalid register \""
                             + StringRef(RegName) + "\" for subtarget."));
  }

  switch (Reg) {
  case AMDGPU::M0:
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
    if (VT.getSizeInBits() == 32)
      return Reg;
    break;
  case AMDGPU::EXEC:
  case AMDGPU::FLAT_SCR:
    if (VT.getSizeInBits() == 64)
      return Reg;
    break;
  default:
    llvm_unreachable("missing register type checking");
  }

  report_fatal_error(Twine("invalid type for register \""
                           + StringRef(RegName) + "\"."));
}
// If kill is not the last instruction, split the block so kill is always a
// proper terminator.
MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
                                                    MachineBasicBlock *BB) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineBasicBlock::iterator SplitPoint(&MI);
  ++SplitPoint;

  if (SplitPoint == BB->end()) {
    // Don't bother with a new block.
    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
    return BB;
  }

  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *SplitBB
    = MF->CreateMachineBasicBlock(BB->getBasicBlock());

  MF->insert(++MachineFunction::iterator(BB), SplitBB);
  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());

  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(SplitBB);

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
  return SplitBB;
}
// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is
// true, \p MI will be the only instruction in the loop body block. Otherwise,
// it will be the first instruction in the remainder block.
//
/// \returns { LoopBody, Remainder }
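//
// The resulting CFG is: \p MBB falls through to LoopBody; LoopBody has a back
// edge to itself and an exit edge to Remainder, which inherits \p MBB's
// original successors.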
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
  MachineFunction *MF = MBB.getParent();
  MachineBasicBlock::iterator I(&MI);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);

  if (InstInLoop) {
    auto Next = std::next(I);

    // Move instruction to loop body.
    LoopBB->splice(LoopBB->begin(), &MBB, I, Next);

    // Move the rest of the block.
    RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
  } else {
    RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
  }

  MBB.addSuccessor(LoopBB);

  return std::make_pair(LoopBB, RemainderBB);
}
/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
  MachineBasicBlock *MBB = MI.getParent();
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  auto I = MI.getIterator();
  auto E = std::next(I);

  BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
    .addImm(0);

  MIBundleBuilder Bundler(*MBB, I, E);
  finalizeBundle(*MBB, Bundler.begin());
}
MachineBasicBlock *
SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
  const DebugLoc &DL = MI.getDebugLoc();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

  MachineBasicBlock *LoopBB;
  MachineBasicBlock *RemainderBB;
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Apparently kill flags are only valid if the def is in the same block?
  if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
    Src->setIsKill(false);

  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);

  MachineBasicBlock::iterator I = LoopBB->end();

  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
    AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
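  // The encoded operand selects a 1-bit field at OFFSET_MEM_VIOL within the
  // TRAPSTS hardware register, so the s_setreg/s_getreg below only touch the
  // MEM_VIOL bit.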
  // Clear TRAP_STS.MEM_VIOL
  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
    .addImm(0)
    .addImm(EncodedReg);

  bundleInstWithWaitcnt(MI);

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  // Load and check TRAP_STS.MEM_VIOL
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
    .addImm(EncodedReg);

  // FIXME: Do we need to use an isel pseudo that may clobber scc?
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
    .addReg(Reg, RegState::Kill)
    .addImm(0);

  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
    .addMBB(LoopBB);

  return RemainderBB;
}
// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
// wavefront. If the value is uniform and just happens to be in a VGPR, this
// will only do one iteration. In the worst case, this will loop 64 times.
//
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
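//
// Each iteration reads the index of the first active lane with
// v_readfirstlane_b32, selects every lane holding that same index into EXEC
// with v_cmp_eq + s_and_saveexec, lets the indexed operation placed in the
// loop run for those lanes, and then clears them from the work mask with
// s_xor until no active lanes remain.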
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
  const SIInstrInfo *TII,
  MachineRegisterInfo &MRI,
  MachineBasicBlock &OrigBB,
  MachineBasicBlock &LoopBB,
  const DebugLoc &DL,
  const MachineOperand &IdxReg,
  unsigned InitReg,
  unsigned ResultReg,
  unsigned PhiReg,
  unsigned InitSaveExecReg,
  int Offset,
  bool UseGPRIdxMode,
  bool IsIndirectSrc) {
  MachineFunction *MF = OrigBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineBasicBlock::iterator I = LoopBB.begin();

  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
    .addReg(InitReg)
    .addMBB(&OrigBB)
    .addReg(ResultReg)
    .addMBB(&LoopBB);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&OrigBB)
    .addReg(NewExec)
    .addMBB(&LoopBB);

  // Read the next variant <- also loop target.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Compare the just read M0 value to all possible Idx values.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
    .addReg(CurrentIdxReg)
    .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());

  // Update EXEC, save the original EXEC value to VCC.
  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                                : AMDGPU::S_AND_SAVEEXEC_B64),
          NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
    unsigned IdxReg;
    if (Offset == 0) {
      IdxReg = CurrentIdxReg;
    } else {
      IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
        .addReg(CurrentIdxReg, RegState::Kill)
        .addImm(Offset);
    }
    unsigned IdxMode = IsIndirectSrc ?
      AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
    MachineInstr *SetOn =
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
      .addReg(IdxReg, RegState::Kill)
      .addImm(IdxMode);
    SetOn->getOperand(3).setIsUndef();
  } else {
    // Move index from VCC into M0
    if (Offset == 0) {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(CurrentIdxReg, RegState::Kill);
    } else {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(CurrentIdxReg, RegState::Kill)
        .addImm(Offset);
    }
  }

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  MachineInstr *InsertPt =
    BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                                  : AMDGPU::S_XOR_B64_term), Exec)
      .addReg(Exec)
      .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);

  return InsertPt->getIterator();
}
3257 // This has slightly sub-optimal regalloc when the source vector is killed by
3258 // the read. The register allocator does not understand that the kill is
3259 // per-workitem, so is kept alive for the whole loop so we end up not re-using a
3260 // subregister from it, using 1 more VGPR than necessary. This was saved when
3261 // this was expanded after register allocation.
3262 static MachineBasicBlock::iterator
loadM0FromVGPR(const SIInstrInfo
*TII
,
3263 MachineBasicBlock
&MBB
,
3265 unsigned InitResultReg
,
3269 bool IsIndirectSrc
) {
3270 MachineFunction
*MF
= MBB
.getParent();
3271 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3272 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3273 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3274 const DebugLoc
&DL
= MI
.getDebugLoc();
3275 MachineBasicBlock::iterator
I(&MI
);
3277 const auto *BoolXExecRC
= TRI
->getRegClass(AMDGPU::SReg_1_XEXECRegClassID
);
3278 Register DstReg
= MI
.getOperand(0).getReg();
3279 Register SaveExec
= MRI
.createVirtualRegister(BoolXExecRC
);
3280 Register TmpExec
= MRI
.createVirtualRegister(BoolXExecRC
);
3281 unsigned Exec
= ST
.isWave32() ? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
3282 unsigned MovExecOpc
= ST
.isWave32() ? AMDGPU::S_MOV_B32
: AMDGPU::S_MOV_B64
;
3284 BuildMI(MBB
, I
, DL
, TII
->get(TargetOpcode::IMPLICIT_DEF
), TmpExec
);
3286 // Save the EXEC mask
3287 BuildMI(MBB
, I
, DL
, TII
->get(MovExecOpc
), SaveExec
)
3290 MachineBasicBlock
*LoopBB
;
3291 MachineBasicBlock
*RemainderBB
;
3292 std::tie(LoopBB
, RemainderBB
) = splitBlockForLoop(MI
, MBB
, false);
3294 const MachineOperand
*Idx
= TII
->getNamedOperand(MI
, AMDGPU::OpName::idx
);
3296 auto InsPt
= emitLoadM0FromVGPRLoop(TII
, MRI
, MBB
, *LoopBB
, DL
, *Idx
,
3297 InitResultReg
, DstReg
, PhiReg
, TmpExec
,
3298 Offset
, UseGPRIdxMode
, IsIndirectSrc
);
3300 MachineBasicBlock::iterator First
= RemainderBB
->begin();
3301 BuildMI(*RemainderBB
, First
, DL
, TII
->get(MovExecOpc
), Exec
)
// Returns subreg index, offset
static std::pair<unsigned, int>
computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
                            const TargetRegisterClass *SuperRC,
                            unsigned VecReg,
                            int Offset) {
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (Offset >= NumElts || Offset < 0)
    return std::make_pair(AMDGPU::sub0, Offset);

  return std::make_pair(AMDGPU::sub0 + Offset, 0);
}
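// For example, with a 128-bit register class NumElts is 4, so an offset of 2
// maps to (sub0 + 2, 0), while an out-of-range offset such as 5 is returned
// unchanged as (sub0, 5) so it can still be applied to the index at run time.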
3323 // Return true if the index is an SGPR and was set.
3324 static bool setM0ToIndexFromSGPR(const SIInstrInfo
*TII
,
3325 MachineRegisterInfo
&MRI
,
3329 bool IsIndirectSrc
) {
3330 MachineBasicBlock
*MBB
= MI
.getParent();
3331 const DebugLoc
&DL
= MI
.getDebugLoc();
3332 MachineBasicBlock::iterator
I(&MI
);
3334 const MachineOperand
*Idx
= TII
->getNamedOperand(MI
, AMDGPU::OpName::idx
);
3335 const TargetRegisterClass
*IdxRC
= MRI
.getRegClass(Idx
->getReg());
3337 assert(Idx
->getReg() != AMDGPU::NoRegister
);
3339 if (!TII
->getRegisterInfo().isSGPRClass(IdxRC
))
3342 if (UseGPRIdxMode
) {
3343 unsigned IdxMode
= IsIndirectSrc
?
3344 AMDGPU::VGPRIndexMode::SRC0_ENABLE
: AMDGPU::VGPRIndexMode::DST_ENABLE
;
3346 MachineInstr
*SetOn
=
3347 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_ON
))
3351 SetOn
->getOperand(3).setIsUndef();
3353 Register Tmp
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
3354 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_ADD_I32
), Tmp
)
3357 MachineInstr
*SetOn
=
3358 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_ON
))
3359 .addReg(Tmp
, RegState::Kill
)
3362 SetOn
->getOperand(3).setIsUndef();
3369 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_MOV_B32
), AMDGPU::M0
)
3372 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_ADD_I32
), AMDGPU::M0
)
3380 // Control flow needs to be inserted if indexing with a VGPR.
3381 static MachineBasicBlock
*emitIndirectSrc(MachineInstr
&MI
,
3382 MachineBasicBlock
&MBB
,
3383 const GCNSubtarget
&ST
) {
3384 const SIInstrInfo
*TII
= ST
.getInstrInfo();
3385 const SIRegisterInfo
&TRI
= TII
->getRegisterInfo();
3386 MachineFunction
*MF
= MBB
.getParent();
3387 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3389 Register Dst
= MI
.getOperand(0).getReg();
3390 Register SrcReg
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src
)->getReg();
3391 int Offset
= TII
->getNamedOperand(MI
, AMDGPU::OpName::offset
)->getImm();
3393 const TargetRegisterClass
*VecRC
= MRI
.getRegClass(SrcReg
);
3396 std::tie(SubReg
, Offset
)
3397 = computeIndirectRegAndOffset(TRI
, VecRC
, SrcReg
, Offset
);
3399 bool UseGPRIdxMode
= ST
.useVGPRIndexMode(EnableVGPRIndexMode
);
3401 if (setM0ToIndexFromSGPR(TII
, MRI
, MI
, Offset
, UseGPRIdxMode
, true)) {
3402 MachineBasicBlock::iterator
I(&MI
);
3403 const DebugLoc
&DL
= MI
.getDebugLoc();
3405 if (UseGPRIdxMode
) {
3406 // TODO: Look at the uses to avoid the copy. This may require rescheduling
3407 // to avoid interfering with other uses, so probably requires a new
3408 // optimization pass.
3409 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::V_MOV_B32_e32
), Dst
)
3410 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3411 .addReg(SrcReg
, RegState::Implicit
)
3412 .addReg(AMDGPU::M0
, RegState::Implicit
);
3413 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3415 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::V_MOVRELS_B32_e32
), Dst
)
3416 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3417 .addReg(SrcReg
, RegState::Implicit
);
3420 MI
.eraseFromParent();
3425 const DebugLoc
&DL
= MI
.getDebugLoc();
3426 MachineBasicBlock::iterator
I(&MI
);
3428 Register PhiReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3429 Register InitReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3431 BuildMI(MBB
, I
, DL
, TII
->get(TargetOpcode::IMPLICIT_DEF
), InitReg
);
3433 auto InsPt
= loadM0FromVGPR(TII
, MBB
, MI
, InitReg
, PhiReg
,
3434 Offset
, UseGPRIdxMode
, true);
3435 MachineBasicBlock
*LoopBB
= InsPt
->getParent();
3437 if (UseGPRIdxMode
) {
3438 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::V_MOV_B32_e32
), Dst
)
3439 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3440 .addReg(SrcReg
, RegState::Implicit
)
3441 .addReg(AMDGPU::M0
, RegState::Implicit
);
3442 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3444 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::V_MOVRELS_B32_e32
), Dst
)
3445 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3446 .addReg(SrcReg
, RegState::Implicit
);
  }

  MI.eraseFromParent();

  return LoopBB;
}

static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
                                 const TargetRegisterClass *VecRC) {
  switch (TRI.getRegSizeInBits(*VecRC)) {
  case 32: // 4 bytes
    return AMDGPU::V_MOVRELD_B32_V1;
  case 64: // 8 bytes
    return AMDGPU::V_MOVRELD_B32_V2;
  case 128: // 16 bytes
    return AMDGPU::V_MOVRELD_B32_V4;
  case 256: // 32 bytes
    return AMDGPU::V_MOVRELD_B32_V8;
  case 512: // 64 bytes
    return AMDGPU::V_MOVRELD_B32_V16;
  default:
    llvm_unreachable("unsupported size for MOVRELD pseudos");
  }
}
3472 static MachineBasicBlock
*emitIndirectDst(MachineInstr
&MI
,
3473 MachineBasicBlock
&MBB
,
3474 const GCNSubtarget
&ST
) {
3475 const SIInstrInfo
*TII
= ST
.getInstrInfo();
3476 const SIRegisterInfo
&TRI
= TII
->getRegisterInfo();
3477 MachineFunction
*MF
= MBB
.getParent();
3478 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3480 Register Dst
= MI
.getOperand(0).getReg();
3481 const MachineOperand
*SrcVec
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src
);
3482 const MachineOperand
*Idx
= TII
->getNamedOperand(MI
, AMDGPU::OpName::idx
);
3483 const MachineOperand
*Val
= TII
->getNamedOperand(MI
, AMDGPU::OpName::val
);
3484 int Offset
= TII
->getNamedOperand(MI
, AMDGPU::OpName::offset
)->getImm();
3485 const TargetRegisterClass
*VecRC
= MRI
.getRegClass(SrcVec
->getReg());
3487 // This can be an immediate, but will be folded later.
3488 assert(Val
->getReg());
3491 std::tie(SubReg
, Offset
) = computeIndirectRegAndOffset(TRI
, VecRC
,
3494 bool UseGPRIdxMode
= ST
.useVGPRIndexMode(EnableVGPRIndexMode
);
3496 if (Idx
->getReg() == AMDGPU::NoRegister
) {
3497 MachineBasicBlock::iterator
I(&MI
);
3498 const DebugLoc
&DL
= MI
.getDebugLoc();
3500 assert(Offset
== 0);
3502 BuildMI(MBB
, I
, DL
, TII
->get(TargetOpcode::INSERT_SUBREG
), Dst
)
3507 MI
.eraseFromParent();
3511 if (setM0ToIndexFromSGPR(TII
, MRI
, MI
, Offset
, UseGPRIdxMode
, false)) {
3512 MachineBasicBlock::iterator
I(&MI
);
3513 const DebugLoc
&DL
= MI
.getDebugLoc();
3515 if (UseGPRIdxMode
) {
3516 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::V_MOV_B32_indirect
))
3517 .addReg(SrcVec
->getReg(), RegState::Undef
, SubReg
) // vdst
3519 .addReg(Dst
, RegState::ImplicitDefine
)
3520 .addReg(SrcVec
->getReg(), RegState::Implicit
)
3521 .addReg(AMDGPU::M0
, RegState::Implicit
);
3523 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3525 const MCInstrDesc
&MovRelDesc
= TII
->get(getMOVRELDPseudo(TRI
, VecRC
));
3527 BuildMI(MBB
, I
, DL
, MovRelDesc
)
3528 .addReg(Dst
, RegState::Define
)
3529 .addReg(SrcVec
->getReg())
3531 .addImm(SubReg
- AMDGPU::sub0
);
3534 MI
.eraseFromParent();
3539 MRI
.clearKillFlags(Val
->getReg());
3541 const DebugLoc
&DL
= MI
.getDebugLoc();
3543 Register PhiReg
= MRI
.createVirtualRegister(VecRC
);
3545 auto InsPt
= loadM0FromVGPR(TII
, MBB
, MI
, SrcVec
->getReg(), PhiReg
,
3546 Offset
, UseGPRIdxMode
, false);
3547 MachineBasicBlock
*LoopBB
= InsPt
->getParent();
3549 if (UseGPRIdxMode
) {
3550 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::V_MOV_B32_indirect
))
3551 .addReg(PhiReg
, RegState::Undef
, SubReg
) // vdst
3553 .addReg(Dst
, RegState::ImplicitDefine
)
3554 .addReg(PhiReg
, RegState::Implicit
)
3555 .addReg(AMDGPU::M0
, RegState::Implicit
);
3556 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3558 const MCInstrDesc
&MovRelDesc
= TII
->get(getMOVRELDPseudo(TRI
, VecRC
));
3560 BuildMI(*LoopBB
, InsPt
, DL
, MovRelDesc
)
3561 .addReg(Dst
, RegState::Define
)
3564 .addImm(SubReg
- AMDGPU::sub0
);
3567 MI
.eraseFromParent();
3572 MachineBasicBlock
*SITargetLowering::EmitInstrWithCustomInserter(
3573 MachineInstr
&MI
, MachineBasicBlock
*BB
) const {
3575 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3576 MachineFunction
*MF
= BB
->getParent();
3577 SIMachineFunctionInfo
*MFI
= MF
->getInfo
<SIMachineFunctionInfo
>();
3579 if (TII
->isMIMG(MI
)) {
3580 if (MI
.memoperands_empty() && MI
.mayLoadOrStore()) {
3581 report_fatal_error("missing mem operand from MIMG instruction");
3583 // Add a memoperand for mimg instructions so that they aren't assumed to
3584 // be ordered memory instuctions.
3589 switch (MI
.getOpcode()) {
3590 case AMDGPU::S_ADD_U64_PSEUDO
:
3591 case AMDGPU::S_SUB_U64_PSEUDO
: {
3592 MachineRegisterInfo
&MRI
= BB
->getParent()->getRegInfo();
3593 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3594 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3595 const TargetRegisterClass
*BoolRC
= TRI
->getBoolRC();
3596 const DebugLoc
&DL
= MI
.getDebugLoc();
3598 MachineOperand
&Dest
= MI
.getOperand(0);
3599 MachineOperand
&Src0
= MI
.getOperand(1);
3600 MachineOperand
&Src1
= MI
.getOperand(2);
3602 Register DestSub0
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
3603 Register DestSub1
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
3605 MachineOperand Src0Sub0
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3606 Src0
, BoolRC
, AMDGPU::sub0
,
3607 &AMDGPU::SReg_32_XM0RegClass
);
3608 MachineOperand Src0Sub1
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3609 Src0
, BoolRC
, AMDGPU::sub1
,
3610 &AMDGPU::SReg_32_XM0RegClass
);
3612 MachineOperand Src1Sub0
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3613 Src1
, BoolRC
, AMDGPU::sub0
,
3614 &AMDGPU::SReg_32_XM0RegClass
);
3615 MachineOperand Src1Sub1
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3616 Src1
, BoolRC
, AMDGPU::sub1
,
3617 &AMDGPU::SReg_32_XM0RegClass
);
3619 bool IsAdd
= (MI
.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO
);
3621 unsigned LoOpc
= IsAdd
? AMDGPU::S_ADD_U32
: AMDGPU::S_SUB_U32
;
3622 unsigned HiOpc
= IsAdd
? AMDGPU::S_ADDC_U32
: AMDGPU::S_SUBB_U32
;
3623 BuildMI(*BB
, MI
, DL
, TII
->get(LoOpc
), DestSub0
)
3626 BuildMI(*BB
, MI
, DL
, TII
->get(HiOpc
), DestSub1
)
3629 BuildMI(*BB
, MI
, DL
, TII
->get(TargetOpcode::REG_SEQUENCE
), Dest
.getReg())
3631 .addImm(AMDGPU::sub0
)
3633 .addImm(AMDGPU::sub1
);
3634 MI
.eraseFromParent();
3637 case AMDGPU::SI_INIT_M0
: {
3638 BuildMI(*BB
, MI
.getIterator(), MI
.getDebugLoc(),
3639 TII
->get(AMDGPU::S_MOV_B32
), AMDGPU::M0
)
3640 .add(MI
.getOperand(0));
3641 MI
.eraseFromParent();
3644 case AMDGPU::SI_INIT_EXEC
:
3645 // This should be before all vector instructions.
3646 BuildMI(*BB
, &*BB
->begin(), MI
.getDebugLoc(), TII
->get(AMDGPU::S_MOV_B64
),
3648 .addImm(MI
.getOperand(0).getImm());
3649 MI
.eraseFromParent();
3652 case AMDGPU::SI_INIT_EXEC_LO
:
3653 // This should be before all vector instructions.
3654 BuildMI(*BB
, &*BB
->begin(), MI
.getDebugLoc(), TII
->get(AMDGPU::S_MOV_B32
),
3656 .addImm(MI
.getOperand(0).getImm());
3657 MI
.eraseFromParent();
3660 case AMDGPU::SI_INIT_EXEC_FROM_INPUT
: {
3661 // Extract the thread count from an SGPR input and set EXEC accordingly.
3662 // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3664 // S_BFE_U32 count, input, {shift, 7}
3665 // S_BFM_B64 exec, count, 0
3666 // S_CMP_EQ_U32 count, 64
3667 // S_CMOV_B64 exec, -1
3668 MachineInstr
*FirstMI
= &*BB
->begin();
3669 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3670 Register InputReg
= MI
.getOperand(0).getReg();
3671 Register CountReg
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
3674 // Move the COPY of the input reg to the beginning, so that we can use it.
3675 for (auto I
= BB
->begin(); I
!= &MI
; I
++) {
3676 if (I
->getOpcode() != TargetOpcode::COPY
||
3677 I
->getOperand(0).getReg() != InputReg
)
3681 FirstMI
= &*++BB
->begin();
3683 I
->removeFromParent();
3684 BB
->insert(FirstMI
, &*I
);
3692 // This should be before all vector instructions.
3693 unsigned Mask
= (getSubtarget()->getWavefrontSize() << 1) - 1;
3694 bool isWave32
= getSubtarget()->isWave32();
3695 unsigned Exec
= isWave32
? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
3696 BuildMI(*BB
, FirstMI
, DebugLoc(), TII
->get(AMDGPU::S_BFE_U32
), CountReg
)
3698 .addImm((MI
.getOperand(1).getImm() & Mask
) | 0x70000);
3699 BuildMI(*BB
, FirstMI
, DebugLoc(),
3700 TII
->get(isWave32
? AMDGPU::S_BFM_B32
: AMDGPU::S_BFM_B64
),
3704 BuildMI(*BB
, FirstMI
, DebugLoc(), TII
->get(AMDGPU::S_CMP_EQ_U32
))
3705 .addReg(CountReg
, RegState::Kill
)
3706 .addImm(getSubtarget()->getWavefrontSize());
3707 BuildMI(*BB
, FirstMI
, DebugLoc(),
3708 TII
->get(isWave32
? AMDGPU::S_CMOV_B32
: AMDGPU::S_CMOV_B64
),
3711 MI
.eraseFromParent();
3715 case AMDGPU::GET_GROUPSTATICSIZE
: {
3716 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA
||
3717 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL
);
3718 DebugLoc DL
= MI
.getDebugLoc();
3719 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::S_MOV_B32
))
3720 .add(MI
.getOperand(0))
3721 .addImm(MFI
->getLDSSize());
3722 MI
.eraseFromParent();
3725 case AMDGPU::SI_INDIRECT_SRC_V1
:
3726 case AMDGPU::SI_INDIRECT_SRC_V2
:
3727 case AMDGPU::SI_INDIRECT_SRC_V4
:
3728 case AMDGPU::SI_INDIRECT_SRC_V8
:
3729 case AMDGPU::SI_INDIRECT_SRC_V16
:
3730 return emitIndirectSrc(MI
, *BB
, *getSubtarget());
3731 case AMDGPU::SI_INDIRECT_DST_V1
:
3732 case AMDGPU::SI_INDIRECT_DST_V2
:
3733 case AMDGPU::SI_INDIRECT_DST_V4
:
3734 case AMDGPU::SI_INDIRECT_DST_V8
:
3735 case AMDGPU::SI_INDIRECT_DST_V16
:
3736 return emitIndirectDst(MI
, *BB
, *getSubtarget());
3737 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO
:
3738 case AMDGPU::SI_KILL_I1_PSEUDO
:
3739 return splitKillBlock(MI
, BB
);
3740 case AMDGPU::V_CNDMASK_B64_PSEUDO
: {
3741 MachineRegisterInfo
&MRI
= BB
->getParent()->getRegInfo();
3742 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3743 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3745 Register Dst
= MI
.getOperand(0).getReg();
3746 Register Src0
= MI
.getOperand(1).getReg();
3747 Register Src1
= MI
.getOperand(2).getReg();
3748 const DebugLoc
&DL
= MI
.getDebugLoc();
3749 Register SrcCond
= MI
.getOperand(3).getReg();
3751 Register DstLo
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3752 Register DstHi
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3753 const auto *CondRC
= TRI
->getRegClass(AMDGPU::SReg_1_XEXECRegClassID
);
3754 Register SrcCondCopy
= MRI
.createVirtualRegister(CondRC
);
3756 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::COPY
), SrcCondCopy
)
3758 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::V_CNDMASK_B32_e64
), DstLo
)
3760 .addReg(Src0
, 0, AMDGPU::sub0
)
3762 .addReg(Src1
, 0, AMDGPU::sub0
)
3763 .addReg(SrcCondCopy
);
3764 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::V_CNDMASK_B32_e64
), DstHi
)
3766 .addReg(Src0
, 0, AMDGPU::sub1
)
3768 .addReg(Src1
, 0, AMDGPU::sub1
)
3769 .addReg(SrcCondCopy
);
3771 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::REG_SEQUENCE
), Dst
)
3773 .addImm(AMDGPU::sub0
)
3775 .addImm(AMDGPU::sub1
);
3776 MI
.eraseFromParent();
3779 case AMDGPU::SI_BR_UNDEF
: {
3780 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3781 const DebugLoc
&DL
= MI
.getDebugLoc();
3782 MachineInstr
*Br
= BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::S_CBRANCH_SCC1
))
3783 .add(MI
.getOperand(0));
3784 Br
->getOperand(1).setIsUndef(true); // read undef SCC
3785 MI
.eraseFromParent();
3788 case AMDGPU::ADJCALLSTACKUP
:
3789 case AMDGPU::ADJCALLSTACKDOWN
: {
3790 const SIMachineFunctionInfo
*Info
= MF
->getInfo
<SIMachineFunctionInfo
>();
3791 MachineInstrBuilder
MIB(*MF
, &MI
);
3793 // Add an implicit use of the frame offset reg to prevent the restore copy
3794 // inserted after the call from being reorderd after stack operations in the
3795 // the caller's frame.
3796 MIB
.addReg(Info
->getStackPtrOffsetReg(), RegState::ImplicitDefine
)
3797 .addReg(Info
->getStackPtrOffsetReg(), RegState::Implicit
)
3798 .addReg(Info
->getFrameOffsetReg(), RegState::Implicit
);
3801 case AMDGPU::SI_CALL_ISEL
: {
3802 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3803 const DebugLoc
&DL
= MI
.getDebugLoc();
3805 unsigned ReturnAddrReg
= TII
->getRegisterInfo().getReturnAddressReg(*MF
);
3807 MachineInstrBuilder MIB
;
3808 MIB
= BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::SI_CALL
), ReturnAddrReg
);
3810 for (unsigned I
= 0, E
= MI
.getNumOperands(); I
!= E
; ++I
)
3811 MIB
.add(MI
.getOperand(I
));
3813 MIB
.cloneMemRefs(MI
);
3814 MI
.eraseFromParent();
3817 case AMDGPU::V_ADD_I32_e32
:
3818 case AMDGPU::V_SUB_I32_e32
:
3819 case AMDGPU::V_SUBREV_I32_e32
: {
3820 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
3821 const DebugLoc
&DL
= MI
.getDebugLoc();
3822 unsigned Opc
= MI
.getOpcode();
3824 bool NeedClampOperand
= false;
3825 if (TII
->pseudoToMCOpcode(Opc
) == -1) {
3826 Opc
= AMDGPU::getVOPe64(Opc
);
3827 NeedClampOperand
= true;
3830 auto I
= BuildMI(*BB
, MI
, DL
, TII
->get(Opc
), MI
.getOperand(0).getReg());
3831 if (TII
->isVOP3(*I
)) {
3832 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3833 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3834 I
.addReg(TRI
->getVCC(), RegState::Define
);
3836 I
.add(MI
.getOperand(1))
3837 .add(MI
.getOperand(2));
3838 if (NeedClampOperand
)
3839 I
.addImm(0); // clamp bit for e64 encoding
3841 TII
->legalizeOperands(*I
);
3843 MI
.eraseFromParent();
3846 case AMDGPU::DS_GWS_INIT
:
3847 case AMDGPU::DS_GWS_SEMA_V
:
3848 case AMDGPU::DS_GWS_SEMA_BR
:
3849 case AMDGPU::DS_GWS_SEMA_P
:
3850 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL
:
3851 case AMDGPU::DS_GWS_BARRIER
:
3852 // A s_waitcnt 0 is required to be the instruction immediately following.
3853 if (getSubtarget()->hasGWSAutoReplay()) {
3854 bundleInstWithWaitcnt(MI
);
3858 return emitGWSMemViolTestLoop(MI
, BB
);
3860 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI
, BB
);
bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
  return isTypeLegal(VT.getScalarType());
}

bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  // This currently forces unfolding various combinations of fsub into fma with
  // free fneg'd operands. As long as we have fast FMA (controlled by
  // isFMAFasterThanFMulAndFAdd), we should perform these.

  // When fma is quarter rate, for f64 where add / sub are at best half rate,
  // most of these combines appear to be cycle neutral but save on instruction
  // count / code size.
  return true;
}

EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                                         EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
  // TODO: Should i16 be used always if legal? For now it would force VALU
  // shifts.
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
}

// Answering this is somewhat tricky and depends on the specific device, which
// can have different rates for fma or all f64 operations.
//
// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
// regardless of which device (although the number of cycles differs between
// devices), so it is always profitable for f64.
//
// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
// only on full rate devices. Normally, we should prefer selecting v_mad_f32
// which we can always do even without fused FP ops since it returns the same
// result as the separate operations and since it is always full
// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
// however does not support denormals, so we do report fma as faster if we have
// a fast fma device and require denormals.
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32: {
    // This is as fast on some subtargets. However, we always have full rate f32
    // mad available which returns the same result as the separate operations
    // which we should prefer over fma. We can't use this if we want to support
    // denormals, so only report this in these cases.
    if (Subtarget->hasFP32Denormals())
      return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();

    // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
    return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
  }
  case MVT::f64:
    return true;
  case MVT::f16:
    return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
  default:
    break;
  }

  return false;
}
//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
                                             SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4f16);

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);

  SDLoc SL(Op);
  SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
                             Op->getFlags());
  SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
                             Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
                                              SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16);

  SDValue Lo0, Hi0;
  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
  SDValue Lo1, Hi1;
  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);

  SDLoc SL(Op);

  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
                             Op->getFlags());
  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
                             Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}

SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
                                               SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16);

  SDValue Lo0, Hi0;
  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
  SDValue Lo1, Hi1;
  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
  SDValue Lo2, Hi2;
  std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);

  SDLoc SL(Op);

  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2,
                             Op->getFlags());
  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2,
                             Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}
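// For instance, a v4f16 operation is handled by splitting it into two v2f16
// halves with SplitVectorOperand, applying the same opcode to each half, and
// concatenating the results, rather than letting LegalizeDAG scalarize it.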
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }
  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::FDIV: return LowerFDIV(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: {
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    return LowerGlobalAddress(MFI, Op, DAG);
  }
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
  case ISD::INSERT_SUBVECTOR:
    return lowerINSERT_SUBVECTOR(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return lowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
    return lowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::BUILD_VECTOR:
    return lowerBUILD_VECTOR(Op, DAG);
  case ISD::FP_ROUND:
    return lowerFP_ROUND(Op, DAG);
  case ISD::TRAP:
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
  case ISD::FCANONICALIZE:
    return splitUnaryVectorOp(Op, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
    return lowerFMINNUM_FMAXNUM(Op, DAG);
  case ISD::FMA:
    return splitTernaryVectorOp(Op, DAG);
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
    return splitBinaryVectorOp(Op, DAG);
  }
  return SDValue();
}
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
                                       const SDLoc &DL,
                                       SelectionDAG &DAG, bool Unpacked) {
  if (!LoadVT.isVector())
    return Result;

  if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
    // Truncate to v2i16/v4i16.
    EVT IntLoadVT = LoadVT.changeTypeToInteger();

    // Workaround legalizer not scalarizing truncate after vector op
    // legalization but not creating intermediate vector trunc.
    SmallVector<SDValue, 4> Elts;
    DAG.ExtractVectorElements(Result, Elts);
    for (SDValue &Elt : Elts)
      Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);

    Result = DAG.getBuildVector(IntLoadVT, DL, Elts);

    // Bitcast to original type (v2f16/v4f16).
    return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
  }

  // Cast back to the original packed type.
  return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
}
4105 SDValue
SITargetLowering::adjustLoadValueType(unsigned Opcode
,
4108 ArrayRef
<SDValue
> Ops
,
4109 bool IsIntrinsic
) const {
4112 bool Unpacked
= Subtarget
->hasUnpackedD16VMem();
4113 EVT LoadVT
= M
->getValueType(0);
4115 EVT EquivLoadVT
= LoadVT
;
4116 if (Unpacked
&& LoadVT
.isVector()) {
4117 EquivLoadVT
= LoadVT
.isVector() ?
4118 EVT::getVectorVT(*DAG
.getContext(), MVT::i32
,
4119 LoadVT
.getVectorNumElements()) : LoadVT
;
4122 // Change from v4f16/v2f16 to EquivLoadVT.
4123 SDVTList VTList
= DAG
.getVTList(EquivLoadVT
, MVT::Other
);
4126 = DAG
.getMemIntrinsicNode(
4127 IsIntrinsic
? (unsigned)ISD::INTRINSIC_W_CHAIN
: Opcode
, DL
,
4128 VTList
, Ops
, M
->getMemoryVT(),
4129 M
->getMemOperand());
4130 if (!Unpacked
) // Just adjusted the opcode.
4133 SDValue Adjusted
= adjustLoadValueTypeImpl(Load
, LoadVT
, DL
, DAG
, Unpacked
);
4135 return DAG
.getMergeValues({ Adjusted
, Load
.getValue(1) }, DL
);
4138 SDValue
SITargetLowering::lowerIntrinsicLoad(MemSDNode
*M
, bool IsFormat
,
4140 ArrayRef
<SDValue
> Ops
) const {
4142 EVT LoadVT
= M
->getValueType(0);
4143 EVT EltType
= LoadVT
.getScalarType();
4144 EVT IntVT
= LoadVT
.changeTypeToInteger();
4146 bool IsD16
= IsFormat
&& (EltType
.getSizeInBits() == 16);
4149 IsFormat
? AMDGPUISD::BUFFER_LOAD_FORMAT
: AMDGPUISD::BUFFER_LOAD
;
4152 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16
, M
, DAG
, Ops
);
4155 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
4156 if (!IsD16
&& !LoadVT
.isVector() && EltType
.getSizeInBits() < 32)
4157 return handleByteShortBufferLoads(DAG
, LoadVT
, DL
, Ops
, M
);
4159 if (isTypeLegal(LoadVT
)) {
4160 return getMemIntrinsicNode(Opc
, DL
, M
->getVTList(), Ops
, IntVT
,
4161 M
->getMemOperand(), DAG
);
4164 EVT CastVT
= getEquivalentMemType(*DAG
.getContext(), LoadVT
);
4165 SDVTList VTList
= DAG
.getVTList(CastVT
, MVT::Other
);
4166 SDValue MemNode
= getMemIntrinsicNode(Opc
, DL
, VTList
, Ops
, CastVT
,
4167 M
->getMemOperand(), DAG
);
4168 return DAG
.getMergeValues(
4169 {DAG
.getNode(ISD::BITCAST
, DL
, LoadVT
, MemNode
), MemNode
.getValue(1)},
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
                                  SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
  int CondCode = CD->getSExtValue();
  if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
      CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
    return DAG.getUNDEF(VT);

  ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);

  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);

  SDLoc DL(N);

  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
      ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
    RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
  }

  ISD::CondCode CCOpcode = getICmpCondCode(IcInput);

  unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
  EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);

  SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
                              DAG.getCondCode(CCOpcode));
  if (VT.bitsEq(CCVT))
    return SetCC;
  return DAG.getZExtOrTrunc(SetCC, DL, VT);
}

static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
                                  SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  const auto *CD = cast<ConstantSDNode>(N->getOperand(3));

  int CondCode = CD->getSExtValue();
  if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
      CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
    return DAG.getUNDEF(VT);
  }

  SDValue Src0 = N->getOperand(1);
  SDValue Src1 = N->getOperand(2);
  EVT CmpVT = Src0.getValueType();
  SDLoc SL(N);

  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
    Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
    Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
  }

  FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
  ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
  unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
  EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
  SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
                              Src1, DAG.getCondCode(CCOpcode));
  if (VT.bitsEq(CCVT))
    return SetCC;
  return DAG.getZExtOrTrunc(SetCC, SL, VT);
}
4241 void SITargetLowering::ReplaceNodeResults(SDNode
*N
,
4242 SmallVectorImpl
<SDValue
> &Results
,
4243 SelectionDAG
&DAG
) const {
4244 switch (N
->getOpcode()) {
4245 case ISD::INSERT_VECTOR_ELT
: {
4246 if (SDValue Res
= lowerINSERT_VECTOR_ELT(SDValue(N
, 0), DAG
))
4247 Results
.push_back(Res
);
4250 case ISD::EXTRACT_VECTOR_ELT
: {
4251 if (SDValue Res
= lowerEXTRACT_VECTOR_ELT(SDValue(N
, 0), DAG
))
4252 Results
.push_back(Res
);
4255 case ISD::INTRINSIC_WO_CHAIN
: {
4256 unsigned IID
= cast
<ConstantSDNode
>(N
->getOperand(0))->getZExtValue();
4258 case Intrinsic::amdgcn_cvt_pkrtz
: {
4259 SDValue Src0
= N
->getOperand(1);
4260 SDValue Src1
= N
->getOperand(2);
4262 SDValue Cvt
= DAG
.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32
, SL
, MVT::i32
,
4264 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2f16
, Cvt
));
4267 case Intrinsic::amdgcn_cvt_pknorm_i16
:
4268 case Intrinsic::amdgcn_cvt_pknorm_u16
:
4269 case Intrinsic::amdgcn_cvt_pk_i16
:
4270 case Intrinsic::amdgcn_cvt_pk_u16
: {
4271 SDValue Src0
= N
->getOperand(1);
4272 SDValue Src1
= N
->getOperand(2);
4276 if (IID
== Intrinsic::amdgcn_cvt_pknorm_i16
)
4277 Opcode
= AMDGPUISD::CVT_PKNORM_I16_F32
;
4278 else if (IID
== Intrinsic::amdgcn_cvt_pknorm_u16
)
4279 Opcode
= AMDGPUISD::CVT_PKNORM_U16_F32
;
4280 else if (IID
== Intrinsic::amdgcn_cvt_pk_i16
)
4281 Opcode
= AMDGPUISD::CVT_PK_I16_I32
;
4283 Opcode
= AMDGPUISD::CVT_PK_U16_U32
;
4285 EVT VT
= N
->getValueType(0);
4286 if (isTypeLegal(VT
))
4287 Results
.push_back(DAG
.getNode(Opcode
, SL
, VT
, Src0
, Src1
));
4289 SDValue Cvt
= DAG
.getNode(Opcode
, SL
, MVT::i32
, Src0
, Src1
);
4290 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i16
, Cvt
));
4297 case ISD::INTRINSIC_W_CHAIN
: {
4298 if (SDValue Res
= LowerINTRINSIC_W_CHAIN(SDValue(N
, 0), DAG
)) {
4299 if (Res
.getOpcode() == ISD::MERGE_VALUES
) {
4301 Results
.push_back(Res
.getOperand(0));
4302 Results
.push_back(Res
.getOperand(1));
4304 Results
.push_back(Res
);
4305 Results
.push_back(Res
.getValue(1));
4314 EVT VT
= N
->getValueType(0);
4315 EVT NewVT
= getEquivalentMemType(*DAG
.getContext(), VT
);
4316 SDValue LHS
= DAG
.getNode(ISD::BITCAST
, SL
, NewVT
, N
->getOperand(1));
4317 SDValue RHS
= DAG
.getNode(ISD::BITCAST
, SL
, NewVT
, N
->getOperand(2));
4319 EVT SelectVT
= NewVT
;
4320 if (NewVT
.bitsLT(MVT::i32
)) {
4321 LHS
= DAG
.getNode(ISD::ANY_EXTEND
, SL
, MVT::i32
, LHS
);
4322 RHS
= DAG
.getNode(ISD::ANY_EXTEND
, SL
, MVT::i32
, RHS
);
4323 SelectVT
= MVT::i32
;
4326 SDValue NewSelect
= DAG
.getNode(ISD::SELECT
, SL
, SelectVT
,
4327 N
->getOperand(0), LHS
, RHS
);
4329 if (NewVT
!= SelectVT
)
4330 NewSelect
= DAG
.getNode(ISD::TRUNCATE
, SL
, NewVT
, NewSelect
);
4331 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, VT
, NewSelect
));
4335 if (N
->getValueType(0) != MVT::v2f16
)
4339 SDValue BC
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, N
->getOperand(0));
4341 SDValue Op
= DAG
.getNode(ISD::XOR
, SL
, MVT::i32
,
4343 DAG
.getConstant(0x80008000, SL
, MVT::i32
));
4344 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2f16
, Op
));
4348 if (N
->getValueType(0) != MVT::v2f16
)
4352 SDValue BC
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, N
->getOperand(0));
4354 SDValue Op
= DAG
.getNode(ISD::AND
, SL
, MVT::i32
,
4356 DAG
.getConstant(0x7fff7fff, SL
, MVT::i32
));
4357 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2f16
, Op
));
/// Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {

  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return nullptr;
}

unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
    case Intrinsic::amdgcn_if:
      return AMDGPUISD::IF;
    case Intrinsic::amdgcn_else:
      return AMDGPUISD::ELSE;
    case Intrinsic::amdgcn_loop:
      return AMDGPUISD::LOOP;
    case Intrinsic::amdgcn_end_cf:
      llvm_unreachable("should not occur");
    default:
      return 0;
    }
  }

  // break, if_break, else_break are all only used as inputs to loop, not
  // directly as branch conditions.
  return 0;
}

bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
  const Triple &TT = getTargetMachine().getTargetTriple();
  return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         AMDGPU::shouldEmitConstantsToTextSection(TT);
}

bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
  // FIXME: Either avoid relying on address space here or change the default
  // address space for functions to avoid the explicit check.
  return (GV->getValueType()->isFunctionTy() ||
          GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         !shouldEmitFixup(GV) &&
         !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}

bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
}
4424 /// This transforms the control flow intrinsics to get the branch destination as
4425 /// last parameter, also switches branch target with BR if the need arise
4426 SDValue
SITargetLowering::LowerBRCOND(SDValue BRCOND
,
4427 SelectionDAG
&DAG
) const {
4430 SDNode
*Intr
= BRCOND
.getOperand(1).getNode();
4431 SDValue Target
= BRCOND
.getOperand(2);
4432 SDNode
*BR
= nullptr;
4433 SDNode
*SetCC
= nullptr;
4435 if (Intr
->getOpcode() == ISD::SETCC
) {
4436 // As long as we negate the condition everything is fine
4438 Intr
= SetCC
->getOperand(0).getNode();
4441 // Get the target from BR if we don't negate the condition
4442 BR
= findUser(BRCOND
, ISD::BR
);
4443 Target
= BR
->getOperand(1);
4446 // FIXME: This changes the types of the intrinsics instead of introducing new
4447 // nodes with the correct types.
4448 // e.g. llvm.amdgcn.loop
4450 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
4451 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
4453 unsigned CFNode
= isCFIntrinsic(Intr
);
4455 // This is a uniform branch so we don't need to legalize.
4459 bool HaveChain
= Intr
->getOpcode() == ISD::INTRINSIC_VOID
||
4460 Intr
->getOpcode() == ISD::INTRINSIC_W_CHAIN
;
4463 (SetCC
->getConstantOperandVal(1) == 1 &&
4464 cast
<CondCodeSDNode
>(SetCC
->getOperand(2).getNode())->get() ==
4467 // operands of the new intrinsic call
4468 SmallVector
<SDValue
, 4> Ops
;
4470 Ops
.push_back(BRCOND
.getOperand(0));
4472 Ops
.append(Intr
->op_begin() + (HaveChain
? 2 : 1), Intr
->op_end());
4473 Ops
.push_back(Target
);
4475 ArrayRef
<EVT
> Res(Intr
->value_begin() + 1, Intr
->value_end());
4477 // build the new intrinsic call
4478 SDNode
*Result
= DAG
.getNode(CFNode
, DL
, DAG
.getVTList(Res
), Ops
).getNode();
4483 BRCOND
.getOperand(0)
4486 Result
= DAG
.getMergeValues(Ops
, DL
).getNode();
4490 // Give the branch instruction our target
4493 BRCOND
.getOperand(2)
4495 SDValue NewBR
= DAG
.getNode(ISD::BR
, DL
, BR
->getVTList(), Ops
);
4496 DAG
.ReplaceAllUsesWith(BR
, NewBR
.getNode());
4497 BR
= NewBR
.getNode();
4500 SDValue Chain
= SDValue(Result
, Result
->getNumValues() - 1);
4502 // Copy the intrinsic results to registers
4503 for (unsigned i
= 1, e
= Intr
->getNumValues() - 1; i
!= e
; ++i
) {
4504 SDNode
*CopyToReg
= findUser(SDValue(Intr
, i
), ISD::CopyToReg
);
4508 Chain
= DAG
.getCopyToReg(
4510 CopyToReg
->getOperand(1),
4511 SDValue(Result
, i
- 1),
4514 DAG
.ReplaceAllUsesWith(SDValue(CopyToReg
, 0), CopyToReg
->getOperand(0));
4517 // Remove the old intrinsic from the chain
4518 DAG
.ReplaceAllUsesOfValueWith(
4519 SDValue(Intr
, Intr
->getNumValues() - 1),
4520 Intr
->getOperand(0));
SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
                                          SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  // Checking the depth
  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0)
    return DAG.getConstant(0, DL, VT);

  MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // Check for kernel and shader functions
  if (Info->isEntryFunction())
    return DAG.getConstant(0, DL, VT);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  // Get the return address reg and mark it as an implicit live-in
  unsigned Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                              getRegClassFor(VT, Op.getNode()->isDivergent()));

  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}
SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
                                            SDValue Op,
                                            const SDLoc &DL,
                                            EVT VT) const {
  return Op.getValueType().bitsLE(VT) ?
    DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
    DAG.getNode(ISD::FTRUNC, DL, VT, Op);
}

SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");

  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)
    return Op;

  SDLoc DL(Op);

  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
}
SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
                                               SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;

  // FIXME: Assert during selection that this is only selected for
  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
  // mode functions, but this happens to be OK since it's only done in cases
  // where there is known no sNaN.
  if (IsIEEEMode)
    return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);

  if (VT == MVT::v4f16)
    return splitBinaryVectorOp(Op, DAG);
  return Op;
}
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);

  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !Subtarget->isTrapHandlerEnabled())
    return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
  assert(UserSGPR != AMDGPU::NoRegister);
  SDValue QueuePtr = CreateLiveInRegister(
    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
                                   QueuePtr, SDValue());
  SDValue Ops[] = {
    ToReg,
    DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
    SGPR01,
    ToReg.getValue(1)
  };
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}

SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);
  MachineFunction &MF = DAG.getMachineFunction();

  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !Subtarget->isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
                                     "debugtrap handler not supported",
                                     Op.getDebugLoc(),
                                     DS_Warning);
    LLVMContext &Ctx = MF.getFunction().getContext();
    Ctx.diagnose(NoTrap);
    return Chain;
  }

  SDValue Ops[] = {
    Chain,
    DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
  };
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {
  // FIXME: Use inline constants (src_{shared, private}_base) instead.
  if (Subtarget->hasApertureRegs()) {
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
    SDValue ApertureReg = SDValue(
      DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
    SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
    return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
  assert(UserSGPR != AMDGPU::NoRegister);

  SDValue QueuePtr = CreateLiveInRegister(
    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
                                              AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
                     MinAlign(64, StructOffset),
                     MachineMemOperand::MODereferenceable |
                     MachineMemOperand::MOInvariant);
}
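// With aperture registers the aperture base is read with s_getreg_b32; the
// encoded immediate packs the hwreg id together with the field offset and
// (width - 1), and the result is shifted left by WidthM1 + 1 to rebuild the
// full 32-bit aperture base.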
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);
  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);

  SDValue Src = ASC->getOperand(0);
  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

  const AMDGPUTargetMachine &TM =
    static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

  // flat -> local/private
  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
    unsigned DestAS = ASC->getDestAddressSpace();

    if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
      unsigned NullVal = TM.getNullPointerValue(DestAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

      return DAG.getNode(ISD::SELECT, SL, MVT::i32,
                         NonNull, Ptr, SegmentNullPtr);
    }
  }

  // local/private -> flat
  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
    unsigned SrcAS = ASC->getSrcAddressSpace();

    if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
      unsigned NullVal = TM.getNullPointerValue(SrcAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);

      SDValue NonNull
        = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
      SDValue CvtPtr
        = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);

      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
                         DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
                         FlatNullPtr);
    }
  }

  // global <-> flat are no-ops and never emitted.

  const MachineFunction &MF = DAG.getMachineFunction();
  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
    MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
  DAG.getContext()->diagnose(InvalidAddrSpaceCast);

  return DAG.getUNDEF(ASC->getValueType(0));
}
// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
// the small vector and inserting them into the big vector. That is better than
// the default expansion of doing it via a stack slot. Even though the use of
// the stack slot would be optimized away afterwards, the stack slot itself
// remains.
SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDValue Ins = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT InsVT = Ins.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned InsNumElts = InsVT.getVectorNumElements();
  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  SDLoc SL(Op);

  for (unsigned I = 0; I != InsNumElts; ++I) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
                              DAG.getConstant(I, SL, MVT::i32));
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
                      DAG.getConstant(IdxVal + I, SL, MVT::i32));
  }

  return Vec;
}
4777 SDValue
SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op
,
4778 SelectionDAG
&DAG
) const {
4779 SDValue Vec
= Op
.getOperand(0);
4780 SDValue InsVal
= Op
.getOperand(1);
4781 SDValue Idx
= Op
.getOperand(2);
4782 EVT VecVT
= Vec
.getValueType();
4783 EVT EltVT
= VecVT
.getVectorElementType();
4784 unsigned VecSize
= VecVT
.getSizeInBits();
4785 unsigned EltSize
= EltVT
.getSizeInBits();
4788 assert(VecSize
<= 64);
4790 unsigned NumElts
= VecVT
.getVectorNumElements();
4792 auto KIdx
= dyn_cast
<ConstantSDNode
>(Idx
);
4794 if (NumElts
== 4 && EltSize
== 16 && KIdx
) {
4795 SDValue BCVec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i32
, Vec
);
4797 SDValue LoHalf
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, BCVec
,
4798 DAG
.getConstant(0, SL
, MVT::i32
));
4799 SDValue HiHalf
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, BCVec
,
4800 DAG
.getConstant(1, SL
, MVT::i32
));
4802 SDValue LoVec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i16
, LoHalf
);
4803 SDValue HiVec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i16
, HiHalf
);
4805 unsigned Idx
= KIdx
->getZExtValue();
4806 bool InsertLo
= Idx
< 2;
4807 SDValue InsHalf
= DAG
.getNode(ISD::INSERT_VECTOR_ELT
, SL
, MVT::v2i16
,
4808 InsertLo
? LoVec
: HiVec
,
4809 DAG
.getNode(ISD::BITCAST
, SL
, MVT::i16
, InsVal
),
4810 DAG
.getConstant(InsertLo
? Idx
: (Idx
- 2), SL
, MVT::i32
));
4812 InsHalf
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, InsHalf
);
4814 SDValue Concat
= InsertLo
?
4815 DAG
.getBuildVector(MVT::v2i32
, SL
, { InsHalf
, HiHalf
}) :
4816 DAG
.getBuildVector(MVT::v2i32
, SL
, { LoHalf
, InsHalf
});
4818 return DAG
.getNode(ISD::BITCAST
, SL
, VecVT
, Concat
);
4821 if (isa
<ConstantSDNode
>(Idx
))
4824 MVT IntVT
= MVT::getIntegerVT(VecSize
);
4826 // Avoid stack access for dynamic indexing.
4827 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4829 // Create a congruent vector with the target value in each element so that
4830 // the required element can be masked and ORed into the target vector.
4831 SDValue ExtVal
= DAG
.getNode(ISD::BITCAST
, SL
, IntVT
,
4832 DAG
.getSplatBuildVector(VecVT
, SL
, InsVal
));
4834 assert(isPowerOf2_32(EltSize
));
4835 SDValue ScaleFactor
= DAG
.getConstant(Log2_32(EltSize
), SL
, MVT::i32
);
4837 // Convert vector index to bit-index.
4838 SDValue ScaledIdx
= DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, Idx
, ScaleFactor
);
4840 SDValue BCVec
= DAG
.getNode(ISD::BITCAST
, SL
, IntVT
, Vec
);
4841 SDValue BFM
= DAG
.getNode(ISD::SHL
, SL
, IntVT
,
4842 DAG
.getConstant(0xffff, SL
, IntVT
),
4845 SDValue LHS
= DAG
.getNode(ISD::AND
, SL
, IntVT
, BFM
, ExtVal
);
4846 SDValue RHS
= DAG
.getNode(ISD::AND
, SL
, IntVT
,
4847 DAG
.getNOT(SL
, BFM
, IntVT
), BCVec
);
4849 SDValue BFI
= DAG
.getNode(ISD::OR
, SL
, IntVT
, LHS
, RHS
);
4850 return DAG
.getNode(ISD::BITCAST
, SL
, VecVT
, BFI
);
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);

  EVT ResultVT = Op.getValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  EVT VecVT = Vec.getValueType();
  unsigned VecSize = VecVT.getSizeInBits();
  EVT EltVT = VecVT.getVectorElementType();
  assert(VecSize <= 64);

  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.

  // XXX - Why doesn't this get called when vector_shuffle is expanded?
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    return Combined;

  unsigned EltSize = EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltSize));

  MVT IntVT = MVT::getIntegerVT(VecSize);
  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);

  // Convert vector index to bit-index (* EltSize)
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);

  if (ResultVT == MVT::f16) {
    SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
  }

  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}
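// For example, extracting element 1 of a v4f16: EltSize is 16, so the index is
// shifted left by Log2_32(16) = 4 to get a bit offset of 16, the bitcast
// integer value is shifted right by that amount, and the low 16 bits are
// truncated and bitcast back to f16.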
4894 static bool elementPairIsContiguous(ArrayRef
<int> Mask
, int Elt
) {
4895 assert(Elt
% 2 == 0);
4896 return Mask
[Elt
+ 1] == Mask
[Elt
] + 1 && (Mask
[Elt
] % 2 == 0);
SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT ResultVT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);

  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
  EVT EltVT = PackVT.getVectorElementType();
  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

  // vector_shuffle <0,1,6,7> lhs, rhs
  // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
  //
  // vector_shuffle <6,7,2,3> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
  //
  // vector_shuffle <6,7,0,1> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)

  // Avoid scalarizing when both halves are reading from consecutive elements.
  SmallVector<SDValue, 4> Pieces;
  for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
    if (elementPairIsContiguous(SVN->getMask(), I)) {
      const int Idx = SVN->getMaskElt(I);
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
                                   PackVT, SVN->getOperand(VecIdx),
                                   DAG.getConstant(EltIdx, SL, MVT::i32));
      Pieces.push_back(SubVec);
    } else {
      const int Idx0 = SVN->getMaskElt(I);
      const int Idx1 = SVN->getMaskElt(I + 1);
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

      SDValue Vec0 = SVN->getOperand(VecIdx0);
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));

      SDValue Vec1 = SVN->getOperand(VecIdx1);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
      Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
    }
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
}
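// When VOP3P instructions are unavailable, packed 16-bit build_vectors are
// lowered with plain integer bit math: a v2i16/v2f16 (Lo, Hi) becomes the i32
// value (zext(Hi) << 16) | zext(Lo) bitcast back to the vector type, and a
// v4i16/v4f16 is built as two such packed halves blended into a v2i32.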
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
    EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);

    // Turn into pair of packed build_vectors.
    // TODO: Special case for constants that can be materialized with s_mov_b64.
    SDValue Lo = DAG.getBuildVector(HalfVT, SL,
                                    { Op.getOperand(0), Op.getOperand(1) });
    SDValue Hi = DAG.getBuildVector(HalfVT, SL,
                                    { Op.getOperand(2), Op.getOperand(3) });

    SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
    SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);

    SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
    return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
  }

  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
  assert(!Subtarget->hasVOP3PInsts() && "this should be legal");

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);

  // Avoid adding defined bits with the zero_extend.
  if (Hi.isUndef()) {
    Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
    SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
    return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
  }

  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);

  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
                              DAG.getConstant(16, SL, MVT::i32));

  if (Lo.isUndef())
    return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);

  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);

  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
  return DAG.getNode(ISD::BITCAST, SL, VT, Or);
}
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // We can fold offsets for anything that doesn't require a GOT relocation.
  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         !shouldEmitGOTReloc(GA->getGlobal());
}
static SDValue
buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
                        const SDLoc &DL, unsigned Offset, EVT PtrVT,
                        unsigned GAFlags = SIInstrInfo::MO_NONE) {
  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.
  SDValue PtrLo =
      DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags);
  SDValue PtrHi;
  if (GAFlags == SIInstrInfo::MO_NONE) {
    PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
  } else {
    PtrHi =
        DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags + 1);
  }
  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
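// Global addresses are lowered one of three ways below: a plain pc-relative
// symbol when a fixup is sufficient (shouldEmitFixup), REL32 lo/hi
// relocations when a pc-relative relocation is required (shouldEmitPCReloc),
// or otherwise a pc-relative GOT entry whose contents are loaded as an
// invariant, dereferenceable constant-address-space pointer.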
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GSD->getGlobal();
  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
       (!GV->hasExternalLinkage() ||
        getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
        getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
      GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

  SDLoc DL(GSD);
  EVT PtrVT = Op.getValueType();

  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
                                            SIInstrInfo::MO_ABS32_LO);
    return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
  }

  if (shouldEmitFixup(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
  else if (shouldEmitPCReloc(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
                                   SIInstrInfo::MO_REL32);

  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
                                            SIInstrInfo::MO_GOTPCREL32);

  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  const DataLayout &DataLayout = DAG.getDataLayout();
  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getGOT(DAG.getMachineFunction());

  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
                                   const SDLoc &DL, SDValue V) const {
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
  // the destination register.
  //
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
  // so we will end up with redundant moves to m0.
  //
  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.

  // A Null SDValue creates a glue result.
  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
                                  V, Chain);
  return SDValue(M0, 0);
}
SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
                                                 SDValue Op,
                                                 MVT VT,
                                                 unsigned Offset) const {
  SDLoc SL(Op);
  SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
                                           DAG.getEntryNode(), Offset, 4, false);
  // The local size values will have the hi 16-bits as zero.
  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                     DAG.getValueType(VT));
}
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                        EVT VT) {
  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
                                      "non-hsa intrinsic with hsa target",
                                      DL.getDebugLoc());
  DAG.getContext()->diagnose(BadIntrin);
  return DAG.getUNDEF(VT);
}

static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                         EVT VT) {
  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
                                      "intrinsic not supported on subtarget",
                                      DL.getDebugLoc());
  DAG.getContext()->diagnose(BadIntrin);
  return DAG.getUNDEF(VT);
}
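// getBuildDwordsVector gathers the MIMG address operands into a vector of f32
// dwords; element counts between the supported sizes are padded with undef.
// For example, three address dwords fall into the "<= 4" bucket, so the
// result is a four-element vector whose last lane is undef.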
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
                                    ArrayRef<SDValue> Elts) {
  assert(!Elts.empty());
  MVT Type;
  unsigned NumElts;

  if (Elts.size() == 1) {
    Type = MVT::f32;
    NumElts = 1;
  } else if (Elts.size() == 2) {
    Type = MVT::v2f32;
    NumElts = 2;
  } else if (Elts.size() <= 4) {
    Type = MVT::v4f32;
    NumElts = 4;
  } else if (Elts.size() <= 8) {
    Type = MVT::v8f32;
    NumElts = 8;
  } else {
    assert(Elts.size() <= 16);
    Type = MVT::v16f32;
    NumElts = 16;
  }

  SmallVector<SDValue, 16> VecElts(NumElts);
  for (unsigned i = 0; i < Elts.size(); ++i) {
    SDValue Elt = Elts[i];
    if (Elt.getValueType() != MVT::f32)
      Elt = DAG.getBitcast(MVT::f32, Elt);
    VecElts[i] = Elt;
  }
  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getUNDEF(MVT::f32);

  if (NumElts == 1)
    return VecElts[0];
  return DAG.getBuildVector(Type, DL, VecElts);
}
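// parseCachePolicy decodes the cachepolicy immediate bit by bit: bit 0 is
// glc, bit 1 is slc and bit 2 is dlc (GFX10 only). Each requested bit is
// cleared as it is consumed; any leftover bits make the helper report
// failure, in which case the caller leaves the intrinsic untouched.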
static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
                             SDValue *GLC, SDValue *SLC, SDValue *DLC) {
  auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());

  uint64_t Value = CachePolicyConst->getZExtValue();
  SDLoc DL(CachePolicy);
  if (GLC) {
    *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
    Value &= ~(uint64_t)0x1;
  }
  if (SLC) {
    *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
    Value &= ~(uint64_t)0x2;
  }
  if (DLC) {
    *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
    Value &= ~(uint64_t)0x4;
  }

  return Value == 0;
}
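// When TexFailCtrl is in use, the MIMG instruction returns one dword more
// than the dmask asks for. For example, a dmask of 0x7 with tfe set produces
// a 4-dword result: three data dwords plus the status dword that is split
// back out into the second member of the aggregate return value below.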
// Re-construct the required return value for an image load intrinsic.
// This is more complicated due to the optional use of TexFailCtrl, which means
// the required return type is an aggregate.
static SDValue constructRetValue(SelectionDAG &DAG,
                                 MachineSDNode *Result,
                                 ArrayRef<EVT> ResultTypes,
                                 bool IsTexFail, bool Unpacked, bool IsD16,
                                 int DMaskPop, int NumVDataDwords,
                                 const SDLoc &DL, LLVMContext &Context) {
  // Determine the required return type. This is the same regardless of the
  // IsTexFail flag.
  EVT ReqRetVT = ResultTypes[0];
  EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
  EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
  EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
                                           : AdjEltVT
                       : ReqRetVT;

  // Extract data part of the result
  // Bitcast the result to the same type as the required return type
  int NumElts;
  if (IsD16 && !Unpacked)
    NumElts = NumVDataDwords << 1;
  else
    NumElts = NumVDataDwords;

  EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
                           : AdjEltVT;

  // Special case for v6f16. Rather than add support for this, use v3i32 to
  // extract the data elements
  bool V6F16Special = false;
  if (NumElts == 6) {
    CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
    DMaskPop >>= 1;
    ReqRetNumElts >>= 1;
    V6F16Special = true;
    AdjVT = MVT::v2i32;
  }

  SDValue N = SDValue(Result, 0);
  SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);

  // Iterate over the result
  SmallVector<SDValue, 4> BVElts;

  if (CastVT.isVector()) {
    DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
  } else {
    BVElts.push_back(CastRes);
  }
  int ExtraElts = ReqRetNumElts - DMaskPop;
  while (ExtraElts--)
    BVElts.push_back(DAG.getUNDEF(AdjEltVT));

  SDValue PreTFCRes;
  if (ReqRetNumElts > 1) {
    SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
    if (IsD16 && Unpacked)
      PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
    else
      PreTFCRes = NewVec;
  } else {
    PreTFCRes = BVElts[0];
  }

  if (V6F16Special)
    PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);

  if (!IsTexFail) {
    if (Result->getNumValues() > 1)
      return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
    return PreTFCRes;
  }

  // Extract the TexFail result and insert into aggregate return
  SmallVector<SDValue, 1> TFCElt;
  DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
  SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
  return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
}
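// texfailctrl is decoded the same way as cachepolicy: bit 0 is tfe and bit 1
// is lwe. A nonzero value marks the operation as needing the TexFail result
// handling done in constructRetValue above.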
static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
                         SDValue *LWE, bool &IsTexFail) {
  auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());

  uint64_t Value = TexFailCtrlConst->getZExtValue();
  if (Value)
    IsTexFail = true;

  SDLoc DL(TexFailCtrlConst);
  *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
  Value &= ~(uint64_t)0x1;
  *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
  Value &= ~(uint64_t)0x2;

  return Value == 0;
}
SDValue SITargetLowering::lowerImage(SDValue Op,
                                     const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                     SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MachineFunction &MF = DAG.getMachineFunction();
  const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
      AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
      AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
  unsigned IntrOpcode = Intr->BaseOpcode;
  bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;

  SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
  SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
  bool IsD16 = false;
  bool IsA16 = false;
  SDValue VData;
  int NumVDataDwords;
  bool AdjustRetType = false;

  unsigned AddrIdx; // Index of first address argument
  unsigned DMask;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VData = Op.getOperand(2);

    bool Is64Bit = VData.getValueType() == MVT::i64;
    if (BaseOpcode->AtomicX2) {
      SDValue VData2 = Op.getOperand(3);
      VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
                                 {VData, VData2});
      if (Is64Bit)
        VData = DAG.getBitcast(MVT::v4i32, VData);

      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
      AddrIdx = 4;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
      AddrIdx = 3;
    }
  } else {
    unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
    auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
    DMask = DMaskConst->getZExtValue();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);

    if (BaseOpcode->Store) {
      VData = Op.getOperand(2);

      MVT StoreVT = VData.getSimpleValueType();
      if (StoreVT.getScalarType() == MVT::f16) {
        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction

        IsD16 = true;
        VData = handleD16VData(VData, DAG);
      }

      NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
    } else {
      // Work out the num dwords based on the dmask popcount and underlying type
      // and whether packing is supported.
      MVT LoadVT = ResultTypes[0].getSimpleVT();
      if (LoadVT.getScalarType() == MVT::f16) {
        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction

        IsD16 = true;
      }

      // Confirm that the return type is large enough for the dmask specified
      if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
          (!LoadVT.isVector() && DMaskLanes > 1))
        return Op;

      if (IsD16 && !Subtarget->hasUnpackedD16VMem())
        NumVDataDwords = (DMaskLanes + 1) / 2;
      else
        NumVDataDwords = DMaskLanes;

      AdjustRetType = true;
    }

    AddrIdx = DMaskIdx + 1;
  }

  unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
  unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
  unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
  unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
                       NumCoords + NumLCM;
  unsigned NumMIVAddrs = NumVAddrs;

  SmallVector<SDValue, 4> VAddrs;

  // Optimize _L to _LZ when _L is zero
  if (LZMappingInfo) {
    if (auto ConstantLod =
         dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
        NumMIVAddrs--;                   // remove 'lod'
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (MIPMappingInfo) {
    if (auto ConstantLod =
         dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
      if (ConstantLod->isNullValue()) {
        IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
        NumMIVAddrs--;                        // remove 'lod'
      }
    }
  }

  // Check for 16 bit addresses and pack if true.
  unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
  const MVT VAddrScalarVT = VAddrVT.getScalarType();
  if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
      ST->hasFeature(AMDGPU::FeatureR128A16)) {
    IsA16 = true;
    const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
    for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
      SDValue AddrLo, AddrHi;
      // Push back extra arguments.
      if (i < DimIdx) {
        AddrLo = Op.getOperand(i);
      } else {
        AddrLo = Op.getOperand(i);
        // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
        // in 1D, derivatives dx/dh and dx/dv are packed with undef.
        if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
            ((NumGradients / 2) % 2 == 1 &&
            (i == DimIdx + (NumGradients / 2) - 1 ||
             i == DimIdx + NumGradients - 1))) {
          AddrHi = DAG.getUNDEF(MVT::f16);
        } else {
          AddrHi = Op.getOperand(i + 1);
          i++;
        }
        AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
                             {AddrLo, AddrHi});
        AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
      }
      VAddrs.push_back(AddrLo);
    }
  } else {
    for (unsigned i = 0; i < NumMIVAddrs; ++i)
      VAddrs.push_back(Op.getOperand(AddrIdx + i));
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator that
  // MIMG addresses should be placed contiguously when it is possible to do so,
  // so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  bool UseNSA =
      ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;

  SDValue VAddr;
  if (!UseNSA)
    VAddr = getBuildDwordsVector(DAG, DL, VAddrs);

  SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
  SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
  unsigned CtrlIdx; // Index of texfailctrl argument
  SDValue Unorm;
  if (!BaseOpcode->Sampler) {
    Unorm = True;
    CtrlIdx = AddrIdx + NumVAddrs + 1;
  } else {
    auto UnormConst =
        cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));

    Unorm = UnormConst->getZExtValue() ? True : False;
    CtrlIdx = AddrIdx + NumVAddrs + 3;
  }

  SDValue TFE;
  SDValue LWE;
  SDValue TexFail = Op.getOperand(CtrlIdx);
  bool IsTexFail = false;
  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
    return Op;

  if (IsTexFail) {
    if (!DMaskLanes) {
      // Expecting to get an error flag since TFC is on - and dmask is 0
      // Force dmask to be at least 1 otherwise the instruction will fail
      DMask = 0x1;
      DMaskLanes = 1;
      NumVDataDwords = 1;
    }
    NumVDataDwords += 1;
    AdjustRetType = true;
  }

  // Has something earlier tagged that the return type needs adjusting
  // This happens if the instruction is a load or has set TexFailCtrl flags
  if (AdjustRetType) {
    // NumVDataDwords reflects the true number of dwords required in the return type
    if (DMaskLanes == 0 && !BaseOpcode->Store) {
      // This is a no-op load. This can be eliminated
      SDValue Undef = DAG.getUNDEF(Op.getValueType());
      if (isa<MemSDNode>(Op))
        return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
      return Undef;
    }

    EVT NewVT = NumVDataDwords > 1 ?
                  EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
                : MVT::f32;

    ResultTypes[0] = NewVT;
    if (ResultTypes.size() == 3) {
      // Original result was aggregate type used for TexFailCtrl results
      // The actual instruction returns as a vector type which has now been
      // created. Remove the aggregate result.
      ResultTypes.erase(&ResultTypes[1]);
    }
  }

  SDValue GLC;
  SDValue SLC;
  SDValue DLC;
  if (BaseOpcode->Atomic) {
    GLC = True; // TODO no-return optimization
    if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC,
                          IsGFX10 ? &DLC : nullptr))
      return Op;
  } else {
    if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC,
                          IsGFX10 ? &DLC : nullptr))
      return Op;
  }

  SmallVector<SDValue, 26> Ops;
  if (BaseOpcode->Store || BaseOpcode->Atomic)
    Ops.push_back(VData); // vdata
  if (UseNSA) {
    for (const SDValue &Addr : VAddrs)
      Ops.push_back(Addr);
  } else {
    Ops.push_back(VAddr);
  }
  Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
  if (BaseOpcode->Sampler)
    Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
  Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
  if (IsGFX10)
    Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
  Ops.push_back(Unorm);
  if (IsGFX10)
    Ops.push_back(DLC);
  Ops.push_back(GLC);
  Ops.push_back(SLC);
  Ops.push_back(IsA16 &&  // a16 or r128
                ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
  Ops.push_back(TFE); // tfe
  Ops.push_back(LWE); // lwe
  if (!IsGFX10)
    Ops.push_back(DimInfo->DA ? True : False);
  if (BaseOpcode->HasD16)
    Ops.push_back(IsD16 ? True : False);
  if (isa<MemSDNode>(Op))
    Ops.push_back(Op.getOperand(0)); // chain

  int NumVAddrDwords =
      UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
  int Opcode = -1;

  if (IsGFX10) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
  if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
    MachineMemOperand *MemRef = MemOp->getMemOperand();
    DAG.setNodeMemRefs(NewNode, {MemRef});
  }

  if (BaseOpcode->AtomicX2) {
    SmallVector<SDValue, 1> Elt;
    DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
    return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
  } else if (!BaseOpcode->Store) {
    return constructRetValue(DAG, NewNode,
                             OrigResultTypes, IsTexFail,
                             Subtarget->hasUnpackedD16VMem(), IsD16,
                             DMaskLanes, NumVDataDwords, DL,
                             *DAG.getContext());
  }

  return SDValue(NewNode, 0);
}
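// s_buffer_load with a uniform offset is emitted directly as an SBUFFER_LOAD
// node. With a divergent offset it is rewritten below as one or more
// unswizzled MUBUF buffer loads: v8i32/v16i32 results are split into two or
// four dwordx4 loads at offsets +0, +16, ... and rejoined with
// concat_vectors.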
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
                                       SDValue Offset, SDValue GLC, SDValue DLC,
                                       SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      VT.getStoreSize(), VT.getStoreSize());

  if (!Offset->isDivergent()) {
    SDValue Ops[] = {
        Rsrc,
        Offset, // Offset
        GLC,
        DLC,
    };
    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
                                   DAG.getVTList(VT), Ops, VT, MMO);
  }

  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.
  SmallVector<SDValue, 4> Loads;
  unsigned NumLoads = 1;
  MVT LoadVT = VT.getSimpleVT();
  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
  assert((LoadVT.getScalarType() == MVT::i32 ||
          LoadVT.getScalarType() == MVT::f32) &&
         isPowerOf2_32(NumElts));

  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts == 16 ? 4 : 2;
    LoadVT = MVT::v4i32;
  }

  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
  unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
  SDValue Ops[] = {
      DAG.getEntryNode(),                               // Chain
      Rsrc,                                             // rsrc
      DAG.getConstant(0, DL, MVT::i32),                 // vindex
      SDValue(),        // voffset -- will be set by setBufferOffsets
      SDValue(),        // soffset -- will be set by setBufferOffsets
      SDValue(),        // offset -- will be set by setBufferOffsets
      DAG.getTargetConstant(CachePolicy, DL, MVT::i32), // cachepolicy
      DAG.getTargetConstant(0, DL, MVT::i1),            // idxen
  };

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);

  uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
  for (unsigned i = 0; i < NumLoads; ++i) {
    Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
    Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
                                            Ops, LoadVT, MMO));
  }

  if (VT == MVT::v8i32 || VT == MVT::v16i32)
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);

  return Loads[0];
}
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto MFI = MF.getInfo<SIMachineFunctionInfo>();

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  // TODO: Should this propagate fast-math-flags?

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
    if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
      return emitNonHSAIntrinsicError(DAG, DL, VT);
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  }
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
    if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
      DiagnosticInfoUnsupported BadIntrin(
          MF.getFunction(), "unsupported hsa intrinsic without hsa target",
          DL.getDebugLoc());
      DAG.getContext()->diagnose(BadIntrin);
      return DAG.getUNDEF(VT);
    }

    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
      AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
    return getPreloadedValue(DAG, *MFI, VT, RegID);
  }
  case Intrinsic::amdgcn_implicitarg_ptr: {
    if (MFI->isEntryFunction())
      return getImplicitArgPtr(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  }
  case Intrinsic::amdgcn_dispatch_id: {
    return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
  }
  case Intrinsic::amdgcn_rcp:
    return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq:
    return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);

    return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rcp_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);
    return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));

    Type *Type = VT.getTypeForEVT(*DAG.getContext());
    APFloat Max = APFloat::getLargest(Type->getFltSemantics());
    APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);

    SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
    SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
                              DAG.getConstantFP(Max, DL, VT));
    return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
                       DAG.getConstantFP(Min, DL, VT));
  }
  case Intrinsic::r600_read_ngroups_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_X, 4, false);
  case Intrinsic::r600_read_ngroups_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_Y, 4, false);
  case Intrinsic::r600_read_ngroups_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_Z, 4, false);
  case Intrinsic::r600_read_global_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
  case Intrinsic::r600_read_global_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
  case Intrinsic::r600_read_global_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
  case Intrinsic::r600_read_local_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
  case Intrinsic::r600_read_tgid_x:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::r600_read_tidig_x:
    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                          SDLoc(DAG.getEntryNode()),
                          MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                          SDLoc(DAG.getEntryNode()),
                          MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                          SDLoc(DAG.getEntryNode()),
                          MFI->getArgInfo().WorkItemIDZ);
  case Intrinsic::amdgcn_wavefrontsize:
    return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
                           SDLoc(Op), MVT::i32);
  case Intrinsic::amdgcn_s_buffer_load: {
    bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
    SDValue GLC;
    SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1);
    if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr,
                          IsGFX10 ? &DLC : nullptr))
      return Op;
    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), GLC, DLC,
                        DAG);
  }
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_interp_mov: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    SDValue Glue = M0.getValue(1);
    return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Glue);
  }
  case Intrinsic::amdgcn_interp_p1: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    SDValue Glue = M0.getValue(1);
    return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Glue);
  }
  case Intrinsic::amdgcn_interp_p2: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
    SDValue Glue = SDValue(M0.getNode(), 1);
    return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
                       Glue);
  }
  case Intrinsic::amdgcn_interp_p1_f16: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
    SDValue Glue = M0.getValue(1);
    if (getSubtarget()->getLDSBankCount() == 16) {
      // 16 bank LDS
      SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
                              DAG.getConstant(2, DL, MVT::i32), // P0
                              Op.getOperand(2), // Attrchan
                              Op.getOperand(3), // Attr
                              Glue);
      SDValue Ops[] = {
        Op.getOperand(1), // Src0
        Op.getOperand(2), // Attrchan
        Op.getOperand(3), // Attr
        DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
        S, // Src2 - holds two f16 values selected by high
        DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
        Op.getOperand(4), // high
        DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
        DAG.getTargetConstant(0, DL, MVT::i32) // $omod
      };
      return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
    } else {
      // 32 bank LDS
      SDValue Ops[] = {
        Op.getOperand(1), // Src0
        Op.getOperand(2), // Attrchan
        Op.getOperand(3), // Attr
        DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
        Op.getOperand(4), // high
        DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
        DAG.getTargetConstant(0, DL, MVT::i32), // $omod
        Glue
      };
      return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
    }
  }
  case Intrinsic::amdgcn_interp_p2_f16: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
    SDValue Glue = SDValue(M0.getNode(), 1);
    SDValue Ops[] = {
      Op.getOperand(2), // Src0
      Op.getOperand(3), // Attrchan
      Op.getOperand(4), // Attr
      DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
      Op.getOperand(1), // Src2
      DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
      Op.getOperand(5), // high
      DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
      Glue
    };
    return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
  }
  case Intrinsic::amdgcn_sin:
    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_cos:
    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_mul_u24:
    return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_mul_i24:
    return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));

  case Intrinsic::amdgcn_log_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return SDValue();

    DiagnosticInfoUnsupported BadIntrin(
      MF.getFunction(), "intrinsic not supported on subtarget",
      DL.getDebugLoc());
    DAG.getContext()->diagnose(BadIntrin);
    return DAG.getUNDEF(VT);
  }
  case Intrinsic::amdgcn_ldexp:
    return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));

  case Intrinsic::amdgcn_fract:
    return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_class:
    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_div_fmas:
    return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(4));

  case Intrinsic::amdgcn_div_fixup:
    return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

  case Intrinsic::amdgcn_trig_preop:
    return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_div_scale: {
    const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));

    // Translate to the operands expected by the machine instruction. The
    // first parameter must be the same as the first instruction.
    SDValue Numerator = Op.getOperand(1);
    SDValue Denominator = Op.getOperand(2);

    // Note this order is opposite of the machine instruction's operations,
    // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    // intrinsic has the numerator as the first operand to match a normal
    // division operation.

    SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;

    return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
                       Denominator, Numerator);
  }
  case Intrinsic::amdgcn_icmp: {
    // There is a Pat that handles this variant, so return it as-is.
    if (Op.getOperand(1).getValueType() == MVT::i1 &&
        Op.getConstantOperandVal(2) == 0 &&
        Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
      return Op;
    return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_fcmp: {
    return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_fmed3:
    return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_fdot2:
    return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(4));
  case Intrinsic::amdgcn_fmul_legacy:
    return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_sffbh:
    return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_sbfe:
    return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_ubfe:
    return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    // FIXME: Stop adding cast if v2f16/v2i16 are legal.
    EVT VT = Op.getValueType();
    unsigned Opcode;

    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
    else
      Opcode = AMDGPUISD::CVT_PK_U16_U32;

    if (isTypeLegal(VT))
      return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));

    SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
                               Op.getOperand(1), Op.getOperand(2));
    return DAG.getNode(ISD::BITCAST, DL, VT, Node);
  }
  case Intrinsic::amdgcn_fmad_ftz:
    return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));

  case Intrinsic::amdgcn_if_break:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
                                      Op->getOperand(1), Op->getOperand(2)), 0);

  case Intrinsic::amdgcn_groupstaticsize: {
    Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return Op;

    const Module *M = MF.getFunction().getParent();
    const GlobalValue *GV =
        M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
                                            SIInstrInfo::MO_ABS32_LO);
    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    SDLoc SL(Op);
    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
    SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
                                 Op.getOperand(1));

    SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
                                DAG.getConstant(1, SL, MVT::i32));
    return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
  }
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG);

    return Op;
  }
}
// This function computes an appropriate offset to pass to
// MachineMemOperand::setOffset() based on the offset inputs to
// an intrinsic. If any of the offsets are non-constant or
// if VIndex is non-zero then this function returns 0. Otherwise,
// it returns the sum of VOffset, SOffset, and Offset.
static unsigned getBufferOffsetForMMO(SDValue VOffset,
                                      SDValue SOffset,
                                      SDValue Offset,
                                      SDValue VIndex = SDValue()) {

  if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) ||
      !isa<ConstantSDNode>(Offset))
    return 0;

  if (VIndex) {
    if (!isa<ConstantSDNode>(VIndex) ||
        !cast<ConstantSDNode>(VIndex)->isNullValue())
      return 0;
  }

  return cast<ConstantSDNode>(VOffset)->getSExtValue() +
         cast<ConstantSDNode>(SOffset)->getSExtValue() +
         cast<ConstantSDNode>(Offset)->getSExtValue();
}
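// getBufferOffsetForMMO example: with voffset = 16, soffset = 4, offset = 8
// and no (or zero) vindex, the memory operand offset is 28; if any input is
// non-constant, or vindex is nonzero, the result is conservatively 0.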
6119 SDValue
SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op
,
6120 SelectionDAG
&DAG
) const {
6121 unsigned IntrID
= cast
<ConstantSDNode
>(Op
.getOperand(1))->getZExtValue();
6125 case Intrinsic::amdgcn_ds_ordered_add
:
6126 case Intrinsic::amdgcn_ds_ordered_swap
: {
6127 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6128 SDValue Chain
= M
->getOperand(0);
6129 SDValue M0
= M
->getOperand(2);
6130 SDValue Value
= M
->getOperand(3);
6131 unsigned IndexOperand
= M
->getConstantOperandVal(7);
6132 unsigned WaveRelease
= M
->getConstantOperandVal(8);
6133 unsigned WaveDone
= M
->getConstantOperandVal(9);
6134 unsigned ShaderType
;
6135 unsigned Instruction
;
6137 unsigned OrderedCountIndex
= IndexOperand
& 0x3f;
6138 IndexOperand
&= ~0x3f;
6139 unsigned CountDw
= 0;
6141 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
) {
6142 CountDw
= (IndexOperand
>> 24) & 0xf;
6143 IndexOperand
&= ~(0xf << 24);
6145 if (CountDw
< 1 || CountDw
> 4) {
6147 "ds_ordered_count: dword count must be between 1 and 4");
6152 report_fatal_error("ds_ordered_count: bad index operand");
6155 case Intrinsic::amdgcn_ds_ordered_add
:
6158 case Intrinsic::amdgcn_ds_ordered_swap
:
6163 if (WaveDone
&& !WaveRelease
)
6164 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
6166 switch (DAG
.getMachineFunction().getFunction().getCallingConv()) {
6167 case CallingConv::AMDGPU_CS
:
6168 case CallingConv::AMDGPU_KERNEL
:
6171 case CallingConv::AMDGPU_PS
:
6174 case CallingConv::AMDGPU_VS
:
6177 case CallingConv::AMDGPU_GS
:
6181 report_fatal_error("ds_ordered_count unsupported for this calling conv");
6184 unsigned Offset0
= OrderedCountIndex
<< 2;
6185 unsigned Offset1
= WaveRelease
| (WaveDone
<< 1) | (ShaderType
<< 2) |
6188 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
)
6189 Offset1
|= (CountDw
- 1) << 6;
6191 unsigned Offset
= Offset0
| (Offset1
<< 8);
6196 DAG
.getTargetConstant(Offset
, DL
, MVT::i16
),
6197 copyToM0(DAG
, Chain
, DL
, M0
).getValue(1), // Glue
6199 return DAG
.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT
, DL
,
6200 M
->getVTList(), Ops
, M
->getMemoryVT(),
6201 M
->getMemOperand());
6203 case Intrinsic::amdgcn_ds_fadd
: {
6204 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6207 case Intrinsic::amdgcn_ds_fadd
:
6208 Opc
= ISD::ATOMIC_LOAD_FADD
;
6212 return DAG
.getAtomic(Opc
, SDLoc(Op
), M
->getMemoryVT(),
6213 M
->getOperand(0), M
->getOperand(2), M
->getOperand(3),
6214 M
->getMemOperand());
6216 case Intrinsic::amdgcn_atomic_inc
:
6217 case Intrinsic::amdgcn_atomic_dec
:
6218 case Intrinsic::amdgcn_ds_fmin
:
6219 case Intrinsic::amdgcn_ds_fmax
: {
6220 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6223 case Intrinsic::amdgcn_atomic_inc
:
6224 Opc
= AMDGPUISD::ATOMIC_INC
;
6226 case Intrinsic::amdgcn_atomic_dec
:
6227 Opc
= AMDGPUISD::ATOMIC_DEC
;
6229 case Intrinsic::amdgcn_ds_fmin
:
6230 Opc
= AMDGPUISD::ATOMIC_LOAD_FMIN
;
6232 case Intrinsic::amdgcn_ds_fmax
:
6233 Opc
= AMDGPUISD::ATOMIC_LOAD_FMAX
;
6236 llvm_unreachable("Unknown intrinsic!");
6239 M
->getOperand(0), // Chain
6240 M
->getOperand(2), // Ptr
6241 M
->getOperand(3) // Value
6244 return DAG
.getMemIntrinsicNode(Opc
, SDLoc(Op
), M
->getVTList(), Ops
,
6245 M
->getMemoryVT(), M
->getMemOperand());
6247 case Intrinsic::amdgcn_buffer_load
:
6248 case Intrinsic::amdgcn_buffer_load_format
: {
6249 unsigned Glc
= cast
<ConstantSDNode
>(Op
.getOperand(5))->getZExtValue();
6250 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(6))->getZExtValue();
6252 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(3)))
6253 IdxEn
= Idx
->getZExtValue() != 0;
6255 Op
.getOperand(0), // Chain
6256 Op
.getOperand(2), // rsrc
6257 Op
.getOperand(3), // vindex
6258 SDValue(), // voffset -- will be set by setBufferOffsets
6259 SDValue(), // soffset -- will be set by setBufferOffsets
6260 SDValue(), // offset -- will be set by setBufferOffsets
6261 DAG
.getTargetConstant(Glc
| (Slc
<< 1), DL
, MVT::i32
), // cachepolicy
6262 DAG
.getTargetConstant(IdxEn
, DL
, MVT::i1
), // idxen
6265 unsigned Offset
= setBufferOffsets(Op
.getOperand(4), DAG
, &Ops
[3]);
6266 // We don't know the offset if vindex is non-zero, so clear it.
6270 unsigned Opc
= (IntrID
== Intrinsic::amdgcn_buffer_load
) ?
6271 AMDGPUISD::BUFFER_LOAD
: AMDGPUISD::BUFFER_LOAD_FORMAT
;
6273 EVT VT
= Op
.getValueType();
6274 EVT IntVT
= VT
.changeTypeToInteger();
6275 auto *M
= cast
<MemSDNode
>(Op
);
6276 M
->getMemOperand()->setOffset(Offset
);
6277 EVT LoadVT
= Op
.getValueType();
6279 if (LoadVT
.getScalarType() == MVT::f16
)
6280 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16
,
6283 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6284 if (LoadVT
.getScalarType() == MVT::i8
||
6285 LoadVT
.getScalarType() == MVT::i16
)
6286 return handleByteShortBufferLoads(DAG
, LoadVT
, DL
, Ops
, M
);
6288 return getMemIntrinsicNode(Opc
, DL
, Op
->getVTList(), Ops
, IntVT
,
6289 M
->getMemOperand(), DAG
);
6291 case Intrinsic::amdgcn_raw_buffer_load
:
6292 case Intrinsic::amdgcn_raw_buffer_load_format
: {
6293 const bool IsFormat
= IntrID
== Intrinsic::amdgcn_raw_buffer_load_format
;
6295 auto Offsets
= splitBufferOffsets(Op
.getOperand(3), DAG
);
6297 Op
.getOperand(0), // Chain
6298 Op
.getOperand(2), // rsrc
6299 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
6300 Offsets
.first
, // voffset
6301 Op
.getOperand(4), // soffset
6302 Offsets
.second
, // offset
6303 Op
.getOperand(5), // cachepolicy, swizzled buffer
6304 DAG
.getTargetConstant(0, DL
, MVT::i1
), // idxen
6307 auto *M
= cast
<MemSDNode
>(Op
);
6308 M
->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops
[3], Ops
[4], Ops
[5]));
6309 return lowerIntrinsicLoad(M
, IsFormat
, DAG
, Ops
);
6311 case Intrinsic::amdgcn_struct_buffer_load
:
6312 case Intrinsic::amdgcn_struct_buffer_load_format
: {
6313 const bool IsFormat
= IntrID
== Intrinsic::amdgcn_struct_buffer_load_format
;
6315 auto Offsets
= splitBufferOffsets(Op
.getOperand(4), DAG
);
6317 Op
.getOperand(0), // Chain
6318 Op
.getOperand(2), // rsrc
6319 Op
.getOperand(3), // vindex
6320 Offsets
.first
, // voffset
6321 Op
.getOperand(5), // soffset
6322 Offsets
.second
, // offset
6323 Op
.getOperand(6), // cachepolicy, swizzled buffer
6324 DAG
.getTargetConstant(1, DL
, MVT::i1
), // idxen
6327 auto *M
= cast
<MemSDNode
>(Op
);
6328 M
->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops
[3], Ops
[4], Ops
[5],
6330 return lowerIntrinsicLoad(cast
<MemSDNode
>(Op
), IsFormat
, DAG
, Ops
);
6332 case Intrinsic::amdgcn_tbuffer_load
: {
6333 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6334 EVT LoadVT
= Op
.getValueType();
6336 unsigned Dfmt
= cast
<ConstantSDNode
>(Op
.getOperand(7))->getZExtValue();
6337 unsigned Nfmt
= cast
<ConstantSDNode
>(Op
.getOperand(8))->getZExtValue();
6338 unsigned Glc
= cast
<ConstantSDNode
>(Op
.getOperand(9))->getZExtValue();
6339 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(10))->getZExtValue();
6341 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(3)))
6342 IdxEn
= Idx
->getZExtValue() != 0;
6344 Op
.getOperand(0), // Chain
6345 Op
.getOperand(2), // rsrc
6346 Op
.getOperand(3), // vindex
6347 Op
.getOperand(4), // voffset
6348 Op
.getOperand(5), // soffset
6349 Op
.getOperand(6), // offset
6350 DAG
.getTargetConstant(Dfmt
| (Nfmt
<< 4), DL
, MVT::i32
), // format
6351 DAG
.getTargetConstant(Glc
| (Slc
<< 1), DL
, MVT::i32
), // cachepolicy
6352 DAG
.getTargetConstant(IdxEn
, DL
, MVT::i1
) // idxen
6355 if (LoadVT
.getScalarType() == MVT::f16
)
6356 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16
,
6358 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT
, DL
,
6359 Op
->getVTList(), Ops
, LoadVT
, M
->getMemOperand(),
6362 case Intrinsic::amdgcn_raw_tbuffer_load
: {
6363 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6364 EVT LoadVT
= Op
.getValueType();
6365 auto Offsets
= splitBufferOffsets(Op
.getOperand(3), DAG
);
6368 Op
.getOperand(0), // Chain
6369 Op
.getOperand(2), // rsrc
6370 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
6371 Offsets
.first
, // voffset
6372 Op
.getOperand(4), // soffset
6373 Offsets
.second
, // offset
6374 Op
.getOperand(5), // format
6375 Op
.getOperand(6), // cachepolicy, swizzled buffer
6376 DAG
.getTargetConstant(0, DL
, MVT::i1
), // idxen
6379 if (LoadVT
.getScalarType() == MVT::f16
)
6380 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16
,
6382 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT
, DL
,
6383 Op
->getVTList(), Ops
, LoadVT
, M
->getMemOperand(),
6386 case Intrinsic::amdgcn_struct_tbuffer_load
: {
6387 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6388 EVT LoadVT
= Op
.getValueType();
6389 auto Offsets
= splitBufferOffsets(Op
.getOperand(4), DAG
);
6392 Op
.getOperand(0), // Chain
6393 Op
.getOperand(2), // rsrc
6394 Op
.getOperand(3), // vindex
6395 Offsets
.first
, // voffset
6396 Op
.getOperand(5), // soffset
6397 Offsets
.second
, // offset
6398 Op
.getOperand(6), // format
6399 Op
.getOperand(7), // cachepolicy, swizzled buffer
6400 DAG
.getTargetConstant(1, DL
, MVT::i1
), // idxen
6403 if (LoadVT
.getScalarType() == MVT::f16
)
6404 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16
,
6406 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT
, DL
,
6407 Op
->getVTList(), Ops
, LoadVT
, M
->getMemOperand(),
6410 case Intrinsic::amdgcn_buffer_atomic_swap
:
6411 case Intrinsic::amdgcn_buffer_atomic_add
:
6412 case Intrinsic::amdgcn_buffer_atomic_sub
:
6413 case Intrinsic::amdgcn_buffer_atomic_smin
:
6414 case Intrinsic::amdgcn_buffer_atomic_umin
:
6415 case Intrinsic::amdgcn_buffer_atomic_smax
:
6416 case Intrinsic::amdgcn_buffer_atomic_umax
:
6417 case Intrinsic::amdgcn_buffer_atomic_and
:
6418 case Intrinsic::amdgcn_buffer_atomic_or
:
6419 case Intrinsic::amdgcn_buffer_atomic_xor
: {
6420 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(6))->getZExtValue();
6422 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(4)))
6423 IdxEn
= Idx
->getZExtValue() != 0;
6425 Op
.getOperand(0), // Chain
6426 Op
.getOperand(2), // vdata
6427 Op
.getOperand(3), // rsrc
6428 Op
.getOperand(4), // vindex
6429 SDValue(), // voffset -- will be set by setBufferOffsets
6430 SDValue(), // soffset -- will be set by setBufferOffsets
6431 SDValue(), // offset -- will be set by setBufferOffsets
6432 DAG
.getTargetConstant(Slc
<< 1, DL
, MVT::i32
), // cachepolicy
6433 DAG
.getTargetConstant(IdxEn
, DL
, MVT::i1
), // idxen
6435 unsigned Offset
= setBufferOffsets(Op
.getOperand(5), DAG
, &Ops
[4]);
6436 // We don't know the offset if vindex is non-zero, so clear it.
6439 EVT VT
= Op
.getValueType();
6441 auto *M
= cast
<MemSDNode
>(Op
);
6442 M
->getMemOperand()->setOffset(Offset
);
6443 unsigned Opcode
= 0;
6446 case Intrinsic::amdgcn_buffer_atomic_swap
:
6447 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SWAP
;
6449 case Intrinsic::amdgcn_buffer_atomic_add
:
6450 Opcode
= AMDGPUISD::BUFFER_ATOMIC_ADD
;
6452 case Intrinsic::amdgcn_buffer_atomic_sub
:
6453 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SUB
;
6455 case Intrinsic::amdgcn_buffer_atomic_smin
:
6456 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SMIN
;
6458 case Intrinsic::amdgcn_buffer_atomic_umin
:
6459 Opcode
= AMDGPUISD::BUFFER_ATOMIC_UMIN
;
6461 case Intrinsic::amdgcn_buffer_atomic_smax
:
6462 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SMAX
;
6464 case Intrinsic::amdgcn_buffer_atomic_umax
:
6465 Opcode
= AMDGPUISD::BUFFER_ATOMIC_UMAX
;
6467 case Intrinsic::amdgcn_buffer_atomic_and
:
6468 Opcode
= AMDGPUISD::BUFFER_ATOMIC_AND
;
6470 case Intrinsic::amdgcn_buffer_atomic_or
:
6471 Opcode
= AMDGPUISD::BUFFER_ATOMIC_OR
;
6473 case Intrinsic::amdgcn_buffer_atomic_xor
:
6474 Opcode
= AMDGPUISD::BUFFER_ATOMIC_XOR
;
6477 llvm_unreachable("unhandled atomic opcode");
6480 return DAG
.getMemIntrinsicNode(Opcode
, DL
, Op
->getVTList(), Ops
, VT
,
6481 M
->getMemOperand());
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec: {
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // cachepolicy
      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();

    auto *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
    unsigned Opcode = 0;

    switch (IntrID) {
    case Intrinsic::amdgcn_raw_buffer_atomic_swap:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_add:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_sub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_smin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_umin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_smax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_umax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_and:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_or:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_xor:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_inc:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_dec:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
      break;
    default:
      llvm_unreachable("unhandled atomic opcode");
    }

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec: {
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy
      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();

    auto *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
                                                        Ops[3]));
    unsigned Opcode = 0;

    switch (IntrID) {
    case Intrinsic::amdgcn_struct_buffer_atomic_swap:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_add:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_sub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_smin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_umin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_smax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_umax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_and:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_or:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_xor:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_inc:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_dec:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
      break;
    default:
      llvm_unreachable("unhandled atomic opcode");
    }

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      Op.getOperand(5), // vindex
      SDValue(),        // voffset -- will be set by setBufferOffsets
      SDValue(),        // soffset -- will be set by setBufferOffsets
      SDValue(),        // offset -- will be set by setBufferOffsets
      DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
      DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
    };
    unsigned Offset = setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
    // We don't know the offset if vindex is non-zero, so clear it.
    if (IdxEn)
      Offset = 0;
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(Offset);

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy
      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7]));

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
    auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      Op.getOperand(5), // vindex
      Offsets.first,    // voffset
      Op.getOperand(7), // soffset
      Offsets.second,   // offset
      Op.getOperand(8), // cachepolicy
      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7],
                                                        Ops[4]));

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return lowerImage(Op, ImageDimIntr, DAG);

    return SDValue();
  }
}
// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI.
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {
  EVT VT = VTList.VTs[0];
  EVT WidenedVT = VT;
  EVT WidenedMemVT = MemVT;
  if (!Subtarget->hasDwordx3LoadStores() &&
      (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) {
    WidenedVT = EVT::getVectorVT(*DAG.getContext(),
                                 WidenedVT.getVectorElementType(), 4);
    WidenedMemVT = EVT::getVectorVT(*DAG.getContext(),
                                    WidenedMemVT.getVectorElementType(), 4);
    MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16);
  }

  assert(VTList.NumVTs == 2);
  SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);

  auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
                                       WidenedMemVT, MMO);
  if (WidenedVT != VT) {
    auto Extract = DAG.getNode(
        ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
        DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
    NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
  }

  return NewOp;
}
SDValue SITargetLowering::handleD16VData(SDValue VData,
                                         SelectionDAG &DAG) const {
  EVT StoreVT = VData.getValueType();

  // No change for f16 and legal vector D16 types.
  if (!StoreVT.isVector())
    return VData;

  SDLoc DL(VData);
  assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");

  if (Subtarget->hasUnpackedD16VMem()) {
    // We need to unpack the packed data to store.
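    // A value such as <2 x half> is bitcast to <2 x i16> and zero-extended to
    // <2 x i32>, so each 16-bit element ends up in the low half of its own
    // dword, which is the layout the unpacked D16 memory instructions expect.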
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                        StoreVT.getVectorNumElements());
    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
    return DAG.UnrollVectorOp(ZExt.getNode());
  }

  assert(isTypeLegal(StoreVT));
  return VData;
}
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  MachineFunction &MF = DAG.getMachineFunction();

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp: {
    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));

    const SDValue Ops[] = {
      Chain,
      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
      Op.getOperand(4), // src0
      Op.getOperand(5), // src1
      Op.getOperand(6), // src2
      Op.getOperand(7), // src3
      DAG.getTargetConstant(0, DL, MVT::i1), // compr
      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    };

    unsigned Opc = Done->isNullValue() ?
      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
  }
  case Intrinsic::amdgcn_exp_compr: {
    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    SDValue Src0 = Op.getOperand(4);
    SDValue Src1 = Op.getOperand(5);
    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));

    SDValue Undef = DAG.getUNDEF(MVT::f32);
    const SDValue Ops[] = {
      Chain,
      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
      Undef, // src2
      Undef, // src3
      DAG.getTargetConstant(1, DL, MVT::i1), // compr
      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    };

    unsigned Opc = Done->isNullValue() ?
      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
  }
  case Intrinsic::amdgcn_s_barrier: {
    if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
      const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
      unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
      if (WGSize <= ST.getWavefrontSize())
        return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
                                          Op.getOperand(0)), 0);
    }
    return SDValue();
  }
  case Intrinsic::amdgcn_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
    unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Chain,
      VData,             // vdata
      Op.getOperand(3),  // rsrc
      Op.getOperand(4),  // vindex
      Op.getOperand(5),  // voffset
      Op.getOperand(6),  // soffset
      Op.getOperand(7),  // offset
      DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
      DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }

  case Intrinsic::amdgcn_struct_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Chain,
      VData,             // vdata
      Op.getOperand(3),  // rsrc
      Op.getOperand(4),  // vindex
      Offsets.first,     // voffset
      Op.getOperand(6),  // soffset
      Offsets.second,    // offset
      Op.getOperand(7),  // format
      Op.getOperand(8),  // cachepolicy, swizzled buffer
      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }

  case Intrinsic::amdgcn_raw_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Chain,
      VData,             // vdata
      Op.getOperand(3),  // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,     // voffset
      Op.getOperand(5),  // soffset
      Offsets.second,    // offset
      Op.getOperand(6),  // format
      Op.getOperand(7),  // cachepolicy, swizzled buffer
      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }

  case Intrinsic::amdgcn_buffer_store:
  case Intrinsic::amdgcn_buffer_store_format: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Chain,
      VData,            // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      SDValue(),        // voffset -- will be set by setBufferOffsets
      SDValue(),        // soffset -- will be set by setBufferOffsets
      SDValue(),        // offset -- will be set by setBufferOffsets
      DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
    };
    unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    // We don't know the offset if vindex is non-zero, so clear it.
    if (IdxEn)
      Offset = 0;
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(Offset);

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    EVT VDataType = VData.getValueType().getScalarType();
    if (VDataType == MVT::i8 || VDataType == MVT::i16)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }

  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format;

    SDValue VData = Op.getOperand(2);
    EVT VDataVT = VData.getValueType();
    EVT EltType = VDataVT.getScalarType();
    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);

    if (!isTypeLegal(VDataVT)) {
      VData =
          DAG.getNode(ISD::BITCAST, DL,
                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
    }

    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Chain,
      VData,            // vdata
      Op.getOperand(3), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // cachepolicy, swizzled buffer
      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc =
        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }

  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format;

    SDValue VData = Op.getOperand(2);
    EVT VDataVT = VData.getValueType();
    EVT EltType = VDataVT.getScalarType();
    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);

    if (IsD16)
      VData = handleD16VData(VData, DAG);

    if (!isTypeLegal(VDataVT)) {
      VData =
          DAG.getNode(ISD::BITCAST, DL,
                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
    }

    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Chain,
      VData,            // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy, swizzled buffer
      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
                                                        Ops[3]));

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    EVT VDataType = VData.getValueType().getScalarType();
    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }

  case Intrinsic::amdgcn_buffer_atomic_fadd: {
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Chain,
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      SDValue(),        // voffset -- will be set by setBufferOffsets
      SDValue(),        // soffset -- will be set by setBufferOffsets
      SDValue(),        // offset -- will be set by setBufferOffsets
      DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
      DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
    };
    unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    // We don't know the offset if vindex is non-zero, so clear it.
    if (IdxEn)
      Offset = 0;
    EVT VT = Op.getOperand(2).getValueType();

    auto *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(Offset);
    unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD
                                    : AMDGPUISD::BUFFER_ATOMIC_FADD;

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }

  case Intrinsic::amdgcn_global_atomic_fadd: {
    SDValue Ops[] = {
      Chain,
      Op.getOperand(2), // ptr
      Op.getOperand(3)  // vdata
    };
    EVT VT = Op.getOperand(3).getValueType();

    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = VT.isVector() ? AMDGPUISD::ATOMIC_PK_FADD
                                    : AMDGPUISD::ATOMIC_FADD;

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }

  case Intrinsic::amdgcn_end_cf:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                      Op->getOperand(2), Chain), 0);

  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG);

    return Op;
  }
  }
}
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
    SDValue Offset, SelectionDAG &DAG) const {
  SDLoc DL(Offset);
  const unsigned MaxImm = 4095;
  SDValue N0 = Offset;
  ConstantSDNode *C1 = nullptr;

  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    N0 = SDValue();
  else if (DAG.isBaseWithConstantOffset(N0)) {
    C1 = cast<ConstantSDNode>(N0.getOperand(1));
    N0 = N0.getOperand(0);
  }

  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, put the value
    // and -4096 into the immoffset field so that the value that is copied/added
    // for the voffset field is a multiple of 4096, and it stands more chance
    // of being CSEd with the copy/add for another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
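    // For example (illustrative values): a combined offset of 4660 gives
    // Overflow = 4660 & ~4095 = 4096 and ImmOffset = 4660 - 4096 = 564, so the
    // voffset add materializes 4096 and the instruction immediate carries 564.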
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
    if (Overflow) {
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      if (!N0)
        N0 = OverflowVal;
      else {
        SDValue Ops[] = { N0, OverflowVal };
        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
      }
    }
  }
  if (!N0)
    N0 = DAG.getConstant(0, DL, MVT::i32);
  if (!C1)
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
  return {N0, SDValue(C1, 0)};
}
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
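// On return, Offsets[0] holds the voffset node, Offsets[1] the soffset node
// and Offsets[2] the immediate instruction offset; the returned value is the
// known constant byte offset, or 0 when it cannot be determined.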
unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                            SelectionDAG &DAG, SDValue *Offsets,
                                            unsigned Align) const {
  SDLoc DL(CombinedOffset);
  if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
      Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return SOffset + ImmOffset;
    }
  }
  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
    SDValue N0 = CombinedOffset.getOperand(0);
    SDValue N1 = CombinedOffset.getOperand(1);
    uint32_t SOffset, ImmOffset;
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
    if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
                                                Subtarget, Align)) {
      Offsets[0] = N0;
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return 0;
    }
  }
  Offsets[0] = CombinedOffset;
  Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
  Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
  return 0;
}
// Handle 8 bit and 16 bit buffer loads
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
                                                     ArrayRef<SDValue> Ops,
                                                     MemSDNode *M) const {
  EVT IntVT = LoadVT.changeTypeToInteger();
  unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
         AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;

  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
                                               Ops, IntVT,
                                               M->getMemOperand());
  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);

  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
}
// Handle 8 bit and 16 bit buffer stores
SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
                                                      EVT VDataType, SDLoc DL,
                                                      SDValue Ops[],
                                                      MemSDNode *M) const {
  if (VDataType == MVT::f16)
    Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);

  SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
  Ops[1] = BufferStoreExt;
  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
                                          AMDGPUISD::BUFFER_STORE_SHORT;
  ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
  return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
                                 M->getMemOperand());
}
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
                                 ISD::LoadExtType ExtType, SDValue Op,
                                 const SDLoc &SL, EVT VT) {
  if (VT.bitsLT(Op.getValueType()))
    return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);

  switch (ExtType) {
  case ISD::SEXTLOAD:
    return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
  case ISD::ZEXTLOAD:
    return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
  case ISD::EXTLOAD:
    return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
  case ISD::NON_EXTLOAD:
    return Op;
  }

  llvm_unreachable("invalid ext type");
}
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if (Ld->getAlignment() < 4 || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // pre-legalize.
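  // The effect is that a sub-dword (e.g. i16) uniform load from a constant
  // address space is widened to a full i32 load, followed by the matching
  // in-register extend, truncate and bitcast emitted below.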
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
  SDValue Ptr = Ld->getBasePtr();
  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
                                MVT::i32, SL, Ld->getChain(), Ptr,
                                Ld->getOffset(),
                                Ld->getPointerInfo(), MVT::i32,
                                Ld->getAlignment(),
                                Ld->getMemOperand()->getFlags(),
                                Ld->getAAInfo(),
                                nullptr); // Drop ranges

  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
           "unexpected fp extload");
    TruncVT = MemVT.changeTypeToInteger();
  }

  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
                      DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
  } else {
    assert(Ld->getExtensionType() == ISD::EXTLOAD);
  }

  EVT VT = Ld->getValueType(0);
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());

  DCI.AddToWorklist(Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
  DCI.AddToWorklist(Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);

  return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
}
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();

  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();
    MachineMemOperand *MMO = Load->getMemOperand();

    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                   BasePtr, RealMemVT, MMO);

    if (!MemVT.isVector()) {
      SDValue Ops[] = {
        DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
        NewLD.getValue(1)
      };

      return DAG.getMergeValues(Ops, DL);
    }

    SmallVector<SDValue, 3> Elts;
    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
      SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
                                DAG.getConstant(I, DL, MVT::i32));

      Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
    }

    SDValue Ops[] = {
      DAG.getBuildVector(MemVT, DL, Elts),
      NewLD.getValue(1)
    };

    return DAG.getMergeValues(Ops, DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      MemVT, *Load->getMemOperand())) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
    return DAG.getMergeValues(Ops, DL);
  }

  unsigned Alignment = Load->getAlignment();
  unsigned AS = Load->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBug() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
    return SplitVectorLoad(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instructions access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    AS = MFI->hasFlatScratchInit() ?
         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = MemVT.getVectorNumElements();

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
      if (MemVT.isPow2VectorType())
        return SDValue();
      if (NumElements == 3)
        return WidenVectorLoad(Op, DAG);
      return SplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
  }

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS) {
    if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
        !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
        Alignment >= 4 && NumElements < 32) {
      if (MemVT.isPow2VectorType())
        return SDValue();
      if (NumElements == 3)
        return WidenVectorLoad(Op, DAG);
      return SplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
  }

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS ||
      AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v3 loads not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return WidenVectorLoad(Op, DAG);
    // v3 and v4 loads are supported for private and global memory.
    return SDValue();
  }

  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorLoad(Load, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      // v3 loads not supported on SI.
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        return WidenVectorLoad(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // Use ds_read_b128 if possible.
    if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
        MemVT.getStoreSize() == 16)
      return SDValue();

    if (NumElements > 2)
      return SplitVectorLoad(Op, DAG);

    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
    // address is negative, then the instruction is incorrectly treated as
    // out-of-bounds even if base + offsets is in bounds. Split vectorized
    // loads here to avoid emitting ds_read2_b32. We may re-combine the
    // load later in the SILoadStoreOptimizer.
    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
        NumElements == 2 && MemVT.getStoreSize() == 8 &&
        Load->getAlignment() < 8) {
      return SplitVectorLoad(Op, DAG);
    }
  }
  return SDValue();
}
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.getSizeInBits() == 64);
= Op
.getOperand(0);
7500 SDValue Zero
= DAG
.getConstant(0, DL
, MVT::i32
);
7501 SDValue One
= DAG
.getConstant(1, DL
, MVT::i32
);
7503 SDValue LHS
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::v2i32
, Op
.getOperand(1));
7504 SDValue RHS
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::v2i32
, Op
.getOperand(2));
7506 SDValue Lo0
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i32
, LHS
, Zero
);
7507 SDValue Lo1
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i32
, RHS
, Zero
);
7509 SDValue Lo
= DAG
.getSelect(DL
, MVT::i32
, Cond
, Lo0
, Lo1
);
7511 SDValue Hi0
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i32
, LHS
, One
);
7512 SDValue Hi1
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i32
, RHS
, One
);
7514 SDValue Hi
= DAG
.getSelect(DL
, MVT::i32
, Cond
, Hi0
, Hi1
);
7516 SDValue Res
= DAG
.getBuildVector(MVT::v2i32
, DL
, {Lo
, Hi
});
7517 return DAG
.getNode(ISD::BITCAST
, DL
, VT
, Res
);
// Catch division cases where we can use shortcuts with rcp and rsq
// instructions.
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();

  if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
    return SDValue();

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
      if (CLHS->isExactlyValue(1.0)) {
        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation have a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
        // use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // 1.0 / sqrt(x) -> rsq(x)

        // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
        // error seems really high at 2^29 ULP.
        if (RHS.getOpcode() == ISD::FSQRT)
          return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));

        // 1.0 / x -> rcp(x)
        return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
      }

      // Same as for 1.0, but expand the sign out of the constant.
      if (CLHS->isExactlyValue(-1.0)) {
        // -1.0 / x -> rcp (fneg x)
        SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
        return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
      }
    }
  }

  if (!Unsafe)
    return SDValue();

  // Turn into multiply by the reciprocal.
  // x / y -> x * (1.0 / y)
  SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
  return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
}
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, A, B);
  }

  assert(GlueChain->getNumValues() == 3);

  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
  switch (Opcode) {
  default: llvm_unreachable("no chain equivalent for opcode");
  case ISD::FMUL:
    Opcode = AMDGPUISD::FMUL_W_CHAIN;
    break;
  }

  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
                     GlueChain.getValue(2));
}
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                           EVT VT, SDValue A, SDValue B, SDValue C,
                           SDValue GlueChain) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, A, B, C);
  }

  assert(GlueChain->getNumValues() == 3);

  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
  switch (Opcode) {
  default: llvm_unreachable("no chain equivalent for opcode");
  case ISD::FMA:
    Opcode = AMDGPUISD::FMA_W_CHAIN;
    break;
  }

  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
                     GlueChain.getValue(2));
}
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue Src0 = Op.getOperand(0);
  SDValue Src1 = Op.getOperand(1);

  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);

  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);

  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
  SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
}
// Faster 2.5 ULP division that does not support denormals.
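// The two magic constants below are 0x6f800000 = 2^96 and 0x2f800000 = 2^-32:
// when |rhs| exceeds 2^96 the divisor is pre-scaled by 2^-32 so the rcp stays
// in range, and the final product is multiplied by the same 2^-32 factor to
// compensate.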
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);

  const APFloat K0Val(BitsToFloat(0x6f800000));
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

  const APFloat K1Val(BitsToFloat(0x2f800000));
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);

  // TODO: Should this propagate fast-math-flags?
  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);

  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
}
// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
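// The returned constant packs the requested single-precision mode into bits
// [1:0] and the current double/half default into bits [3:2], which together
// form the immediate consumed by S_DENORM_MODE.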
static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
                                          const SDLoc &SL,
                                          const GCNSubtarget *ST) {
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  int DPDenormModeDefault = ST->hasFP64Denormals()
                                ? FP_DENORM_FLUSH_NONE
                                : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  int Mode = SPDenormMode | (DPDenormModeDefault << 2);
  return DAG.getTargetConstant(Mode, SL, MVT::i32);
}
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
                                          RHS, RHS, LHS);
  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
                                        LHS, RHS, LHS);

  // Denominator is scaled to not be denormal, so using rcp is ok.
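  // The rcp approximation below is then refined with a Newton-Raphson style
  // sequence of fmas before DIV_FMAS/DIV_FIXUP produce the final quotient.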
  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
                                  DenominatorScaled);
  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
                                     DenominatorScaled);

  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
                               (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                               (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);

  if (!Subtarget->hasFP32Denormals()) {
    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);

    SDValue EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);

      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
                                 DAG.getEntryNode(), EnableDenormValue);
    } else {
      const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
                                                        SL, MVT::i32);
      EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
                                 DAG.getEntryNode(), EnableDenormValue,
                                 BitField);
    }

    SDValue Ops[3] = {
      NegDivScale0,
      EnableDenorm.getValue(0),
      EnableDenorm.getValue(1)
    };

    NegDivScale0 = DAG.getMergeValues(Ops, SL);
  }

  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
                             ApproxRcp, One, NegDivScale0);

  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
                             ApproxRcp, Fma0);

  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
                           Fma1, Fma1);

  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                             NumeratorScaled, Mul);

  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);

  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                             NumeratorScaled, Fma3);

  if (!Subtarget->hasFP32Denormals()) {
    SDValue DisableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue DisableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);

      DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
                                  Fma4.getValue(1), DisableDenormValue,
                                  Fma4.getValue(2));
    } else {
      const SDValue DisableDenormValue =
          DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);

      DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
                                  Fma4.getValue(1), DisableDenormValue,
                                  BitField, Fma4.getValue(2));
    }

    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                      DisableDenorm, DAG.getRoot());
    DAG.setRoot(OutputChain);
  }

  SDValue Scale = NumeratorScaled.getValue(1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
                             Fma4, Fma1, Fma3, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
}
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getTarget().Options.UnsafeFPMath)
    return lowerFastUnsafeFDIV(Op, DAG);

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);

  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);

  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
                             NegDivScale0, Mul, DivScale1);

  SDValue Scale;

  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

    // Figure out which scale to use for div_fmas.
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getValue(1);
  }

  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
                             Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}
SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  if (VT == MVT::f16)
    return LowerFDIV16(Op, DAG);

  llvm_unreachable("Unexpected type for fdiv");
}
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    return DAG.getTruncStore(Store->getChain(), DL,
                             DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
                             Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      VT, *Store->getMemOperand())) {
    return expandUnalignedStore(Store, DAG);
  }

  unsigned AS = Store->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBug() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
    return SplitVectorStore(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instructions access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    AS = MFI->hasFlatScratchInit() ?
         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = VT.getVectorNumElements();
  if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
      AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorStore(Op, DAG);
    // v3 stores not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return SplitVectorStore(Op, DAG);
    return SDValue();
  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4 || NumElements == 3)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // Use ds_write_b128 if possible.
    if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
        VT.getStoreSize() == 16 && NumElements != 3)
      return SDValue();

    if (NumElements > 2)
      return SplitVectorStore(Op, DAG);

    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
    // address is negative, then the instruction is incorrectly treated as
    // out-of-bounds even if base + offsets is in bounds. Split vectorized
    // stores here to avoid emitting ds_write2_b32. We may re-combine the
    // store later in the SILoadStoreOptimizer.
    if (!Subtarget->hasUsableDSOffset() &&
        NumElements == 2 && VT.getStoreSize() == 8 &&
        Store->getAlignment() < 8) {
      return SplitVectorStore(Op, DAG);
    }

    return SDValue();
  } else {
    llvm_unreachable("unhandled address space");
  }
}
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue TrigVal;

  // TODO: Should this propagate fast-math-flags?
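  // On subtargets where the hardware sin/cos has a reduced input range the
  // scaled argument is first wrapped into [0, 1) with FRACT; other subtargets
  // consume the scaled value directly.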
  SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);

  if (Subtarget->hasTrigReducedRange()) {
    SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
    TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
  } else {
    TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
  }

  switch (Op.getOpcode()) {
  case ISD::FCOS:
    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
  case ISD::FSIN:
    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
  default:
    llvm_unreachable("Wrong trig opcode");
  }
}
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                               SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
  if (!isFlatGlobalAddrSpace(AS))
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  SDValue Old = Op.getOperand(2);
  SDValue New = Op.getOperand(3);
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  MVT VecType = MVT::getVectorVT(SimpleVT, 2);

  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
  SDValue Ops[] = { ChainIn, Addr, NewOld };

  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
                                 Op->getVTList(), Ops, VT,
                                 AtomicNode->getMemOperand());
}
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  EVT ScalarVT = VT.getScalarType();
  if (ScalarVT != MVT::f32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
      DCI.AddToWorklist(Cvt.getNode());
      return Cvt;
    }
  }

  return SDValue();
}
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
//
// This is a variant of
// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
//
// The normal DAG combiner will do this, but only if the add has one use since
// that would increase the number of instructions.
//
// This prevents us from seeing a constant offset that can be folded into a
// memory instruction's addressing mode. If we know the resulting add offset of
// a pointer can be folded into an addressing offset, we can replace the pointer
// operand with the add of new constant offset. This eliminates one of the uses,
// and may allow the remaining use to also be simplified.
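//
// For example, (shl (add x, 16), 2) used as an address becomes
// (add (shl x, 2), 64), and the constant 64 can then be folded into the
// memory instruction's immediate offset field when it is in range.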
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
                                               unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We only do this to handle cases where it's profitable when there are
  // multiple uses of the add, so defer to the standard combine.
  if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
      N0->hasOneUse())
    return SDValue();

  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
  if (!CN1)
    return SDValue();

  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!CAdd)
    return SDValue();

  // If the resulting offset is too large, we can't fold it into the addressing
  // mode offset.
  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
  Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());

  AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();
  if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
  SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);

  SDNodeFlags Flags;
  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
                          (N0.getOpcode() == ISD::OR ||
                           N0->getFlags().hasNoUnsignedWrap()));

  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue Ptr = N->getBasePtr();
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  // TODO: We could also do this for multiplies.
  if (Ptr.getOpcode() == ISD::SHL) {
    SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                          N->getMemoryVT(), DCI);
    if (NewPtr) {
      SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());

      NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
      return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
    }
  }

  return SDValue();
}
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
         (Opc == ISD::XOR && Val == 0);
}
// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
// integer combine opportunities since most 64-bit operations are decomposed
// this way. TODO: We won't want this for SALU especially if it is an inline
// immediate.
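//
// For example, (and x:i64, 0x00000000ffffffff) splits into an AND of the low
// half with -1 (a no-op) and an AND of the high half with 0, both of which
// fold away.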
SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI,
    const SDLoc &SL,
    unsigned Opc, SDValue LHS,
    const ConstantSDNode *CRHS) const {
  uint64_t Val = CRHS->getZExtValue();
  uint32_t ValLo = Lo_32(Val);
  uint32_t ValHi = Hi_32(Val);
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
       bitOpWithConstantIsReducible(Opc, ValHi)) ||
      (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
    // If we need to materialize a 64-bit immediate, it will be split up later
    // anyway. Avoid creating the harder to understand 64-bit immediate
    // materialization.
    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
  }

  return SDValue();
}
// Returns true if argument is a boolean value which is not serialized into
// memory or argument and does not require v_cndmask_b32 to be deserialized.
static bool isBoolSGPR(SDValue V) {
  if (V.getValueType() != MVT::i1)
    return false;
  switch (V.getOpcode()) {
  default:
    break;
  case ISD::SETCC:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case AMDGPUISD::FP_CLASS:
    return true;
  }
  return false;
}
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
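// For example, C = 0x00ff00ff has all-ones bytes 0 and 2 and all-zero bytes 1
// and 3, so it is returned unchanged, while C = 0x00f000ff mixes set and clear
// bits within byte 2 and therefore returns 0.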
8171 static uint32_t getConstantPermuteMask(uint32_t C
) {
8172 // 0xff for any zero byte in the mask
8173 uint32_t ZeroByteMask
= 0;
8174 if (!(C
& 0x000000ff)) ZeroByteMask
|= 0x000000ff;
8175 if (!(C
& 0x0000ff00)) ZeroByteMask
|= 0x0000ff00;
8176 if (!(C
& 0x00ff0000)) ZeroByteMask
|= 0x00ff0000;
8177 if (!(C
& 0xff000000)) ZeroByteMask
|= 0xff000000;
8178 uint32_t NonZeroByteMask
= ~ZeroByteMask
; // 0xff for any non-zero byte
8179 if ((NonZeroByteMask
& C
) != NonZeroByteMask
)
8180 return 0; // Partial bytes selected.
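// Worked example: C = 0x00ff00ff gives ZeroByteMask = 0xff00ff00, so every
// non-zero byte is entirely ones and the constant is returned unchanged.
// C = 0x00ff00f0 has a partially-set low byte, so 0 is returned.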
// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns the select mask as used by
// v_perm_b32, or -1 if it did not succeed.
// Note byte select encoding:
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)
    return ~0;

  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!N1)
    return ~0;

  uint32_t C = N1->getZExtValue();

  switch (V.getOpcode()) {
  default:
    break;
  case ISD::AND:
    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    }
    break;

  case ISD::OR:
    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
      return (0x03020100 & ~ConstMask) | ConstMask;
    }
    break;

  case ISD::SHL:
    if (C % 8)
      return ~0;

    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

  case ISD::SRL:
    if (C % 8)
      return ~0;

    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }

  return ~0;
}
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (VT == MVT::i64 && CRHS) {
    if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
      return Split;
  }

  if (CRHS && VT == MVT::i32) {
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    // nb = number of trailing zeroes in mask
    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
    uint64_t Mask = CRHS->getZExtValue();
    unsigned Bits = countPopulation(Mask);
    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
        unsigned Shift = CShift->getZExtValue();
        unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
        unsigned Offset = NB + Shift;
        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
          SDLoc SL(N);
          SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
                                    LHS->getOperand(0),
                                    DAG.getConstant(Offset, SL, MVT::i32),
                                    DAG.getConstant(Bits, SL, MVT::i32));
          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
                                    DAG.getValueType(NarrowVT));
          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
          return Shl;
        }
      }
    }

    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
        isa<ConstantSDNode>(LHS.getOperand(2))) {
      uint32_t Sel = getConstantPermuteMask(Mask);
      if (!Sel)
        return SDValue();

      // Select 0xc for all zero bytes
      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                         LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();

    SDValue X = LHS.getOperand(0);
    SDValue Y = RHS.getOperand(0);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
      return SDValue();

    if (LCC == ISD::SETO) {
      if (X != LHS.getOperand(1))
        return SDValue();

      if (RCC == ISD::SETUNE) {
        const ConstantFPSDNode *C1
          = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        const uint32_t Mask = SIInstrFlags::N_NORMAL |
                              SIInstrFlags::N_SUBNORMAL |
                              SIInstrFlags::N_ZERO |
                              SIInstrFlags::P_ZERO |
                              SIInstrFlags::P_SUBNORMAL |
                              SIInstrFlags::P_NORMAL;

        static_assert(((~(SIInstrFlags::S_NAN |
                          SIInstrFlags::Q_NAN |
                          SIInstrFlags::N_INFINITY |
                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
                      "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                           X, DAG.getConstant(Mask, DL, MVT::i32));
      }
    }
  }

  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
    std::swap(LHS, RHS);

  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      RHS.hasOneUse()) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
    // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
    const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
        (RHS.getOperand(0) == LHS.getOperand(0) &&
         LHS.getOperand(0) == LHS.getOperand(1))) {
      const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
      unsigned NewMask = LCC == ISD::SETO ?
        Mask->getZExtValue() & ~OrdMask :
        Mask->getZExtValue() & OrdMask;

      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
                         DAG.getConstant(NewMask, DL, MVT::i32));
    }
  }

  if (VT == MVT::i32 &&
      (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    // and x, (sext cc from i1) => select cc, x, 0
    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
      std::swap(LHS, RHS);
    if (isBoolSGPR(RHS.getOperand(0)))
      return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
                           LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
  }

  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
    uint32_t LHSMask = getPermuteMask(DAG, LHS);
    uint32_t RHSMask = getPermuteMask(DAG, RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte in each mask is either selector mask 0-3, or has higher
        // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
        // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
        // mask which is not 0xff wins. By anding both masks we have a correct
        // result except that 0x0c shall be corrected to give 0x0c only.
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }

        // Add 4 to each active LHS lane. It will not affect any existing 0xff
        // or 0x0c.
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
                           LHS.getOperand(0), RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  return SDValue();
}
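// Worked example of the BFE rewrite above: (and (srl x, 8), 0xff00) has an
// 8-bit mask with 8 trailing zeroes, so Offset = 8 + 8 = 16 and the combine
// emits (shl (AssertZext (bfe_u32 x, 16, 8)), 8).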
SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {
    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
      SDValue Src = LHS.getOperand(0);
      if (Src != RHS.getOperand(0))
        return SDValue();

      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
      if (!CLHS || !CRHS)
        return SDValue();

      // Only 10 bits are used.
      static const uint32_t MaxMask = 0x3ff;

      uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                         Src, DAG.getConstant(NewMask, DL, MVT::i32));
    }

    return SDValue();
  }

  // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
  if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
      LHS.getOpcode() == AMDGPUISD::PERM &&
      isa<ConstantSDNode>(LHS.getOperand(2))) {
    uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
    if (!Sel)
      return SDValue();

    Sel |= LHS.getConstantOperandVal(2);
    SDLoc DL(N);
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                       LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
  }

  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
    uint32_t LHSMask = getPermuteMask(DAG, LHS);
    uint32_t RHSMask = getPermuteMask(DAG, RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Kill zero bytes selected by other mask. Zero value is 0xc.
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        // Add 4 to each active LHS lane
        LHSMask |= LHSUsedLanes & 0x04040404;

        uint32_t Sel = LHSMask | RHSMask;
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
                           LHS.getOperand(0), RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  if (VT != MVT::i64)
    return SDValue();

  // TODO: This could be a generic combine with a predicate for extracting the
  // high half of an integer being free.

  // (or i64:x, (zero_extend i32:y)) ->
  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
      RHS.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(LHS, RHS);

  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue ExtSrc = RHS.getOperand(0);
    EVT SrcVT = ExtSrc.getValueType();
    if (SrcVT == MVT::i32) {
      SDLoc SL(N);
      SDValue LowLHS, HiBits;
      std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);

      DCI.AddToWorklist(LowOr.getNode());
      DCI.AddToWorklist(HiBits.getNode());

      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                LowOr, HiBits);
      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    }
  }

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (CRHS) {
    if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
      return Split;
  }

  return SDValue();
}
SDValue SITargetLowering::performXorCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (CRHS) {
    if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
      return Split;
  }

  return SDValue();
}
// Instructions that will be lowered with a final instruction that zeros the
// high result bits.
// XXX - probably only need to list legal operations.
static bool fp16SrcZerosHighBits(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FCANONICALIZE:
  case ISD::FP_ROUND:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LDEXP:
    return true;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}
SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  if (!Subtarget->has16BitInsts() ||
      DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i32)
    return SDValue();

  SDValue Src = N->getOperand(0);
  if (Src.getValueType() != MVT::i16)
    return SDValue();

  // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
  // FIXME: It is not universally true that the high bits are zeroed on gfx9.
  if (Src.getOpcode() == ISD::BITCAST) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::f16 &&
        fp16SrcZerosHighBits(BCSrc.getOpcode()))
      return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
  }

  return SDValue();
}
SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                        DAGCombinerInfo &DCI)
                                                        const {
  SDValue Src = N->getOperand(0);
  auto *VTSign = cast<VTSDNode>(N->getOperand(1));

  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
      Src.hasOneUse()) {
    auto *M = cast<MemSDNode>(Src);
    SDValue Ops[] = {
      Src.getOperand(0), // Chain
      Src.getOperand(1), // rsrc
      Src.getOperand(2), // vindex
      Src.getOperand(3), // voffset
      Src.getOperand(4), // soffset
      Src.getOperand(5), // offset
      Src.getOperand(6), // cachepolicy
      Src.getOperand(7)  // idxen
    };
    // replace with BUFFER_LOAD_BYTE/SHORT
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
                                         Src.getOperand(0).getValueType());
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
                   AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
                                                            ResList,
                                                            Ops, M->getMemoryVT(),
                                                            M->getMemOperand());
    return DCI.DAG.getMergeValues({BufferLoadSignExt,
                                   BufferLoadSignExt.getValue(1)}, SDLoc(N));
  }
  return SDValue();
}
SDValue SITargetLowering::performClassCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Mask = N->getOperand(1);

  // fp_class x, 0 -> false
  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
    if (CMask->isNullValue())
      return DAG.getConstant(0, SDLoc(N), MVT::i1);
  }

  if (N->getOperand(0).isUndef())
    return DAG.getUNDEF(MVT::i1);

  return SDValue();
}
SDValue SITargetLowering::performRcpCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);

  if (N0.isUndef())
    return N0;

  if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
                         N0.getOpcode() == ISD::SINT_TO_FP)) {
    return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
                           N->getFlags());
  }

  return AMDGPUTargetLowering::performRcpCombine(N, DCI);
}
bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
                                       unsigned MaxDepth) const {
  unsigned Opcode = Op.getOpcode();
  if (Opcode == ISD::FCANONICALIZE)
    return true;

  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    auto F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
      return false;
    return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
  }

  // If source is a result of another standard FP operation it is already in
  // canonical form.
  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // These will flush denorms if required.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FCEIL:
  case ISD::FFLOOR:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FSQRT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FP_ROUND:
  case ISD::FP_EXTEND:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::TRIG_PREOP:
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::LDEXP:
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return true;

  // It can/will be lowered or combined as a bit operation.
  // Need to check their input recursively to handle.
  case ISD::FNEG:
  case ISD::FABS:
  case ISD::FCOPYSIGN:
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);

  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FSINCOS:
    return Op.getValueType().getScalarType() != MVT::f16;

  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMIN3: {
    // FIXME: Shouldn't treat the generic operations differently based on
    // these. However, we aren't really required to flush the result from
    // minnum/maxnum.

    // snans will be quieted, so we only need to worry about denormals.
    if (Subtarget->supportsMinMaxDenormModes() ||
        denormalsEnabledForType(Op.getValueType()))
      return true;

    // Flushing may be required.
    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
    // targets need to check their input recursively.

    // FIXME: Does this apply with clamp? It's implemented with max.
    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::SELECT: {
    return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
  }
  case ISD::BUILD_VECTOR: {
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      SDValue SrcOp = Op.getOperand(i);
      if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::EXTRACT_VECTOR_ELT:
  case ISD::EXTRACT_SUBVECTOR: {
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
  }
  case ISD::INSERT_VECTOR_ELT: {
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
  }
  case ISD::UNDEF:
    // Could be anything.
    return false;

  case ISD::BITCAST: {
    // Hack round the mess we make when legalizing extract_vector_elt
    SDValue Src = Op.getOperand(0);
    if (Src.getValueType() == MVT::i16 &&
        Src.getOpcode() == ISD::TRUNCATE) {
      SDValue TruncSrc = Src.getOperand(0);
      if (TruncSrc.getValueType() == MVT::i32 &&
          TruncSrc.getOpcode() == ISD::BITCAST &&
          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
      }
    }

    return false;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID
      = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
      return true;
    default:
      break;
    }

    LLVM_FALLTHROUGH;
  }
  default:
    return denormalsEnabledForType(Op.getValueType()) &&
           DAG.isKnownNeverSNaN(Op);
  }

  llvm_unreachable("invalid operation");
}
// Constant fold canonicalize.
SDValue SITargetLowering::getCanonicalConstantFP(
  SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
  // Flush denormals to 0 if not enabled.
  if (C.isDenormal() && !denormalsEnabledForType(VT))
    return DAG.getConstantFP(0.0, SL, VT);

  if (C.isNaN()) {
    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
    if (C.isSignaling()) {
      // Quiet a signaling NaN.
      // FIXME: Is this supposed to preserve payload bits?
      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
    }

    // Make sure it is the canonical NaN bitpattern.
    //
    // TODO: Can we use -1 as the canonical NaN value since it's an inline
    // immediate?
    if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
  }

  // Already canonical.
  return DAG.getConstantFP(C, SL, VT);
}
static bool vectorEltWillFoldAway(SDValue Op) {
  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
}
SDValue SITargetLowering::performFCanonicalizeCombine(SDNode *N,
                                                      DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fcanonicalize undef -> qnan
  if (N0.isUndef()) {
    APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
    return DAG.getConstantFP(QNaN, SDLoc(N), VT);
  }

  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
  }

  // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
  //                                                   (fcanonicalize k)
  //
  // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0

  // TODO: This could be better with wider vectors that will be split to v2f16,
  // and to consider uses since there aren't that many packed operations.
  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
      isTypeLegal(MVT::v2f16)) {
    SDLoc SL(N);
    SDValue NewElts[2];
    SDValue Lo = N0.getOperand(0);
    SDValue Hi = N0.getOperand(1);
    EVT EltVT = Lo.getValueType();

    if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
      for (unsigned I = 0; I != 2; ++I) {
        SDValue Op = N0.getOperand(I);
        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
          NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
                                              CFP->getValueAPF());
        } else if (Op.isUndef()) {
          // Handled below based on what the other operand is.
          NewElts[I] = Op;
        } else {
          NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
        }
      }

      // If one half is undef, and one is constant, prefer a splat vector
      // rather than the normal qNaN. If it's a register, prefer 0.0 since
      // that's cheaper to use and may be free with a packed operation.
      if (NewElts[0].isUndef()) {
        if (isa<ConstantFPSDNode>(NewElts[1]))
          NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
            NewElts[1] : DAG.getConstantFP(0.0f, SL, EltVT);
      }

      if (NewElts[1].isUndef()) {
        NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
          NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
      }

      return DAG.getBuildVector(VT, SL, NewElts);
    }
  }

  unsigned SrcOpc = N0.getOpcode();

  // If it's free to do so, push canonicalizes further up the source, which may
  // find a canonical source.
  //
  // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
  // sNaNs.
  if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
    auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
    if (CRHS && N0.hasOneUse()) {
      SDLoc SL(N);
      SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
                                   N0.getOperand(0));
      SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
      DCI.AddToWorklist(Canon0.getNode());

      return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
    }
  }

  return isCanonicalized(DAG, N0) ? N0 : SDValue();
}
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
  switch (Opc) {
  case ISD::FMAXNUM:
  case ISD::FMAXNUM_IEEE:
    return AMDGPUISD::FMAX3;
  case ISD::SMAX:
    return AMDGPUISD::SMAX3;
  case ISD::UMAX:
    return AMDGPUISD::UMAX3;
  case ISD::FMINNUM:
  case ISD::FMINNUM_IEEE:
    return AMDGPUISD::FMIN3;
  case ISD::SMIN:
    return AMDGPUISD::SMIN3;
  case ISD::UMIN:
    return AMDGPUISD::UMIN3;
  default:
    llvm_unreachable("Not a min/max opcode");
  }
}
SDValue SITargetLowering::performIntMed3ImmCombine(
  SelectionDAG &DAG, const SDLoc &SL,
  SDValue Op0, SDValue Op1, bool Signed) const {
  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
  if (!K1)
    return SDValue();

  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  if (Signed) {
    if (K0->getAPIntValue().sge(K1->getAPIntValue()))
      return SDValue();
  } else {
    if (K0->getAPIntValue().uge(K1->getAPIntValue()))
      return SDValue();
  }

  EVT VT = K0->getValueType(0);
  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
    return DAG.getNode(Med3Opc, SL, VT,
                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
  }

  // If there isn't a 16-bit med3 operation, convert to 32-bit.
  MVT NVT = MVT::i32;
  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

  SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
  SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
  SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);

  SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
}
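// Worked example: (smin (smax x, -5), 7) satisfies K0 = -5 < K1 = 7, so for
// i32 it becomes (smed3 x, -5, 7); the matching unsigned pattern with
// umin/umax becomes umed3.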
static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
    return C;

  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
    if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
      return C;
  }

  return nullptr;
}
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
                                                  const SDLoc &SL,
                                                  SDValue Op0,
                                                  SDValue Op1) const {
  ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
  if (!K1)
    return SDValue();

  ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  // Ordered >= (although NaN inputs should have folded away by now).
  APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
  if (Cmp == APFloat::cmpGreaterThan)
    return SDValue();

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Check IEEE bit enabled?
  EVT VT = Op0.getValueType();
  if (Info->getMode().DX10Clamp) {
    // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
    // hardware fmed3 behavior converting to a min.
    // FIXME: Should this be allowing -0.0?
    if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
  }

  // med3 for f16 is only available on gfx9+, and not available for v2f16.
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
    // then give the other result, which is different from med3 with a NaN
    // input.
    SDValue Var = Op0.getOperand(0);
    if (!DAG.isKnownNeverSNaN(Var))
      return SDValue();

    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    if ((!K0->hasOneUse() ||
         TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
        (!K1->hasOneUse() ||
         TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
      return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
                         Var, SDValue(K0, 0), SDValue(K1, 0));
    }
  }

  return SDValue();
}
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Only do this if the inner op has one use since this will just increase
  // register pressure for no benefit.

  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
      !VT.isVector() &&
      (VT == MVT::i32 || VT == MVT::f32 ||
       ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
    // max(max(a, b), c) -> max3(a, b, c)
    // min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
                         DL,
                         N->getValueType(0),
                         Op0.getOperand(0),
                         Op0.getOperand(1),
                         Op1);
    }

    // Try commuted.
    // max(a, max(b, c)) -> max3(a, b, c)
    // min(a, min(b, c)) -> min3(a, b, c)
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
                         DL,
                         N->getValueType(0),
                         Op0,
                         Op1.getOperand(0),
                         Op1.getOperand(1));
    }
  }

  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
      return Med3;
  }
  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
      return Med3;
  }

  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
       (Opc == AMDGPUISD::FMIN_LEGACY &&
        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
      (VT == MVT::f32 || VT == MVT::f64 ||
       (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
       (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
      Op0.hasOneUse()) {
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
      return Res;
  }

  return SDValue();
}
static bool isClampZeroToOne(SDValue A, SDValue B) {
  if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
    if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
      // FIXME: Should this be allowing -0.0?
      return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
             (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
    }
  }

  return false;
}
// FIXME: Should only worry about snans for version with chain.
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
  // NaNs. With a NaN input, the order of the operands may change the result.

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  SDValue Src2 = N->getOperand(2);

  if (isClampZeroToOne(Src0, Src1)) {
    // const_a, const_b, x -> clamp is safe in all cases including signaling
    // nans.
    // FIXME: Should this be allowing -0.0?
    return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
  }

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
  // handling no dx10-clamp?
  if (Info->getMode().DX10Clamp) {
    // If NaNs are clamped to 0, we are free to reorder the inputs.

    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);

    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
      std::swap(Src1, Src2);

    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);

    if (isClampZeroToOne(Src1, Src2))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
  }

  return SDValue();
}
SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  if (Src0.isUndef() && Src1.isUndef())
    return DCI.DAG.getUNDEF(N->getValueType(0));

  return SDValue();
}
SDValue SITargetLowering::performExtractVectorEltCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;

  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();

  if ((Vec.getOpcode() == ISD::FNEG ||
       Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
    SDLoc SL(N);
    EVT EltVT = N->getValueType(0);
    SDValue Idx = N->getOperand(1);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                              Vec.getOperand(0), Idx);
    return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
  }

  // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
  //    =>
  // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
  // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
  // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
  if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
    SDLoc SL(N);
    EVT EltVT = N->getValueType(0);
    SDValue Idx = N->getOperand(1);
    unsigned Opc = Vec.getOpcode();

    switch(Opc) {
    default:
      break;
      // TODO: Support other binary operations.
    case ISD::FADD:
    case ISD::FSUB:
    case ISD::FMUL:
    case ISD::ADD:
    case ISD::UMIN:
    case ISD::UMAX:
    case ISD::SMIN:
    case ISD::SMAX:
    case ISD::FMAXNUM:
    case ISD::FMINNUM:
    case ISD::FMAXNUM_IEEE:
    case ISD::FMINNUM_IEEE: {
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec.getOperand(0), Idx);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec.getOperand(1), Idx);

      DCI.AddToWorklist(Elt0.getNode());
      DCI.AddToWorklist(Elt1.getNode());
      return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
    }
    }
  }

  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();

  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
  // This eliminates non-constant index and subsequent movrel or scratch
  // access.
  // Sub-dword vectors of size 2 dword or less have better implementation.
  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
  // instructions.
  if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
      !isa<ConstantSDNode>(N->getOperand(1))) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    EVT IdxVT = Idx.getValueType();
    SDValue V;
    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
      SDValue IC = DAG.getConstant(I, SL, IdxVT);
      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
      if (I == 0)
        V = Elt;
      else
        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
    }
    return V;
  }

  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
  // elements. This exposes more load reduction opportunities by replacing
  // multiple small extract_vector_elements with a single 32-bit extract.
  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (isa<MemSDNode>(Vec) &&
      EltSize <= 16 &&
      EltVT.isByteSized() &&
      VecSize > 32 &&
      VecSize % 32 == 0 &&
      Idx) {
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);

    unsigned BitIndex = Idx->getZExtValue() * EltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    SDLoc SL(N);

    SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
    DCI.AddToWorklist(Cast.getNode());

    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
                              DAG.getConstant(EltIdx, SL, MVT::i32));
    DCI.AddToWorklist(Elt.getNode());
    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
    DCI.AddToWorklist(Srl.getNode());

    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(),
                                Srl);
    DCI.AddToWorklist(Trunc.getNode());
    return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
  }

  return SDValue();
}
SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SDValue Idx = N->getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();

  // INSERT_VECTOR_ELT (<n x e>, var-idx)
  // => BUILD_VECTOR n x select (e, const-idx)
  // This eliminates non-constant index and subsequent movrel or scratch
  // access.
  // Sub-dword vectors of size 2 dword or less have better implementation.
  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
  // instructions.
  if (isa<ConstantSDNode>(Idx) ||
      VecSize > 256 || (VecSize <= 64 && EltSize < 32))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  SDValue Ins = N->getOperand(1);
  EVT IdxVT = Idx.getValueType();

  SmallVector<SDValue, 16> Ops;
  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
    SDValue IC = DAG.getConstant(I, SL, IdxVT);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
    SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
    Ops.push_back(V);
  }

  return DAG.getBuildVector(VecVT, SL, Ops);
}
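// Worked example: inserting i32:ins into v4i32:vec at a variable index idx
// becomes a build_vector of four elements where element I is
// (select (setcc idx, I, eq), ins, (extract_vector_elt vec, I)).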
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N0,
                                          const SDNode *N1) const {
  EVT VT = N0->getValueType(0);

  // Only do this if we are not trying to support denormals. v_mad_f32 does not
  // support denormals ever.
  if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
       (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
        getSubtarget()->hasMadF16())) &&
      isOperationLegal(ISD::FMAD, VT))
    return ISD::FMAD;

  const TargetOptions &Options = DAG.getTarget().Options;
  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
       (N0->getFlags().hasAllowContract() &&
        N1->getFlags().hasAllowContract())) &&
      isFMAFasterThanFMulAndFAdd(VT)) {
    return ISD::FMA;
  }

  return 0;
}
// For a reassociatable opcode perform:
// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
                                               SelectionDAG &DAG) const {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  if (!(Op0->isDivergent() ^ Op1->isDivergent()))
    return SDValue();

  if (Op0->isDivergent())
    std::swap(Op0, Op1);

  if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
    return SDValue();

  SDValue Op2 = Op1.getOperand(1);
  Op1 = Op1.getOperand(0);
  if (!(Op1->isDivergent() ^ Op2->isDivergent()))
    return SDValue();

  if (Op1->isDivergent())
    std::swap(Op1, Op2);

  // If either operand is constant this will conflict with
  // DAGCombiner::ReassociateOps().
  if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
      DAG.isConstantIntBuildVectorOrConstantInt(Op1))
    return SDValue();

  SDLoc SL(N);
  SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
  return DAG.getNode(Opc, SL, VT, Add1, Op2);
}
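// Worked example: with u0, u1 uniform and d divergent, (add u0, (add d, u1))
// is rewritten to (add (add u0, u1), d) when the inner add has a single use,
// so the uniform part can be selected as a scalar add and only the final add
// stays on the VALU.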
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
                           EVT VT,
                           SDValue N0, SDValue N1, SDValue N2,
                           bool Signed) {
  unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
  SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}
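// Worked example: performAddCombine below turns (add i64 (mul a, b), c),
// where a and b are known to fit in 32 unsigned bits, into this helper's
// (trunc (mad_u64_u32 a32, b32, c)), i.e. a single 32-bit x 32-bit + 64-bit
// multiply-add.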
SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
      && Subtarget->hasMad64_32() &&
      !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
      VT.getScalarSizeInBits() <= 64) {
    if (LHS.getOpcode() != ISD::MUL)
      std::swap(LHS, RHS);

    SDValue MulLHS = LHS.getOperand(0);
    SDValue MulRHS = LHS.getOperand(1);
    SDValue AddRHS = RHS;

    // TODO: Maybe restrict if SGPR inputs.
    if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
        numBitsUnsigned(MulRHS, DAG) <= 32) {
      MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
      MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
      AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
      return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
    }

    if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
      MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
      MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
      AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
      return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
    }

    return SDValue();
  }

  if (SDValue V = reassociateScalarOps(N, DAG)) {
    return V;
  }

  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
    return SDValue();

  // add x, zext (setcc) => addcarry x, 0, setcc
  // add x, sext (setcc) => subcarry x, 0, setcc
  unsigned Opc = LHS.getOpcode();
  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
      Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
    std::swap(RHS, LHS);

  Opc = RHS.getOpcode();
  switch (Opc) {
  default: break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(0);
    if (!isBoolSGPR(Cond))
      break;
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
    return DAG.getNode(Opc, SL, VTList, Args);
  }
  case ISD::ADDCARRY: {
    // add x, (addcarry y, 0, cc) => addcarry x, y, cc
    auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if (!C || C->getZExtValue() != 0) break;
    SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
    return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
  }
  }
  return SDValue();
}
SDValue SITargetLowering::performSubCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (VT != MVT::i32)
    return SDValue();

  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (LHS.getOpcode() == ISD::SUBCARRY) {
    // sub (subcarry x, 0, cc), y => subcarry x, y, cc
    auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    if (!C || !C->isNullValue())
      return SDValue();
    SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
    return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
  }
  return SDValue();
}
SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
                                                         DAGCombinerInfo &DCI) const {

  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C || C->getZExtValue() != 0)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);

  // addcarry (add x, y), 0, cc => addcarry x, y, cc
  // subcarry (sub x, y), 0, cc => subcarry x, y, cc
  unsigned LHSOpc = LHS.getOpcode();
  unsigned Opc = N->getOpcode();
  if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
      (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
    SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
  }
  return SDValue();
}
SDValue SITargetLowering::performFAddCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // These should really be instruction patterns, but writing patterns with
  // source modifiers is a pain.

  // fadd (fadd (a, a), b) -> mad 2.0, a, b
  if (LHS.getOpcode() == ISD::FADD) {
    SDValue A = LHS.getOperand(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
      }
    }
  }

  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
  if (RHS.getOpcode() == ISD::FADD) {
    SDValue A = RHS.getOperand(0);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
      }
    }
  }

  return SDValue();
}
SDValue SITargetLowering::performFSubCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);
  assert(!VT.isVector());

  // Try to get the fneg to fold into the source modifier. This undoes generic
  // DAG combines and folds them into the mad.
  //
  // Only do this if we are not trying to support denormals. v_mad_f32 does
  // not support denormals ever.
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  if (LHS.getOpcode() == ISD::FADD) {
    // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
    SDValue A = LHS.getOperand(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

        return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
      }
    }
  }

  if (RHS.getOpcode() == ISD::FADD) {
    // (fsub c, (fadd a, a)) -> mad -2.0, a, c

    SDValue A = RHS.getOperand(0);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
      }
    }
  }

  return SDValue();
}
SDValue SITargetLowering::performFMACombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);

  if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
    return SDValue();

  // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
  //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
  SDValue Op1 = N->getOperand(0);
  SDValue Op2 = N->getOperand(1);
  SDValue FMA = N->getOperand(2);

  if (FMA.getOpcode() != ISD::FMA ||
      Op1.getOpcode() != ISD::FP_EXTEND ||
      Op2.getOpcode() != ISD::FP_EXTEND)
    return SDValue();

  // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
  // regardless of the denorm mode setting. Therefore,
  // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
  const TargetOptions &Options = DAG.getTarget().Options;
  if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
      (N->getFlags().hasAllowContract() &&
       FMA->getFlags().hasAllowContract())) {
    Op1 = Op1.getOperand(0);
    Op2 = Op2.getOperand(0);
    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec1 = Op1.getOperand(0);
    SDValue Idx1 = Op1.getOperand(1);
    SDValue Vec2 = Op2.getOperand(0);

    SDValue FMAOp1 = FMA.getOperand(0);
    SDValue FMAOp2 = FMA.getOperand(1);
    SDValue FMAAcc = FMA.getOperand(2);

    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
        FMAOp2.getOpcode() != ISD::FP_EXTEND)
      return SDValue();

    FMAOp1 = FMAOp1.getOperand(0);
    FMAOp2 = FMAOp2.getOperand(0);
    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec3 = FMAOp1.getOperand(0);
    SDValue Vec4 = FMAOp2.getOperand(0);
    SDValue Idx2 = FMAOp1.getOperand(1);

    if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
        // Idx1 and Idx2 cannot be the same.
        Idx1 == Idx2)
      return SDValue();

    if (Vec1 == Vec2 || Vec3 == Vec4)
      return SDValue();

    if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
      return SDValue();

    if ((Vec1 == Vec3 && Vec2 == Vec4) ||
        (Vec1 == Vec4 && Vec2 == Vec3)) {
      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
                         DAG.getTargetConstant(0, SL, MVT::i1));
    }
  }
  return SDValue();
}
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = LHS.getValueType();
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();

  auto CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (!CRHS) {
    CRHS = dyn_cast<ConstantSDNode>(LHS);
    if (CRHS) {
      std::swap(LHS, RHS);
      CC = getSetCCSwappedOperands(CC);
    }
  }

  if (CRHS) {
    if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
        isBoolSGPR(LHS.getOperand(0))) {
      // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
      // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
      // setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
      // setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
      if ((CRHS->isAllOnesValue() &&
           (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
          (CRHS->isNullValue() &&
           (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                           DAG.getConstant(-1, SL, MVT::i1));
      if ((CRHS->isAllOnesValue() &&
           (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
          (CRHS->isNullValue() &&
           (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
        return LHS.getOperand(0);
    }

    uint64_t CRHSVal = CRHS->getZExtValue();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        LHS.getOpcode() == ISD::SELECT &&
        isa<ConstantSDNode>(LHS.getOperand(1)) &&
        isa<ConstantSDNode>(LHS.getOperand(2)) &&
        LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
        isBoolSGPR(LHS.getOperand(0))) {
      // Given CT != FT:
      // setcc (select cc, CT, CF), CF, eq => xor cc, -1
      // setcc (select cc, CT, CF), CF, ne => cc
      // setcc (select cc, CT, CF), CT, ne => xor cc, -1
      // setcc (select cc, CT, CF), CT, eq => cc
      uint64_t CT = LHS.getConstantOperandVal(1);
      uint64_t CF = LHS.getConstantOperandVal(2);

      if ((CF == CRHSVal && CC == ISD::SETEQ) ||
          (CT == CRHSVal && CC == ISD::SETNE))
        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                           DAG.getConstant(-1, SL, MVT::i1));
      if ((CF == CRHSVal && CC == ISD::SETNE) ||
          (CT == CRHSVal && CC == ISD::SETEQ))
        return LHS.getOperand(0);
    }
  }

  if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
                                           VT != MVT::f16))
    return SDValue();

  // Match isinf/isfinite pattern
  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
  // (fcmp one (fabs x), inf) -> (fp_class x,
  //   (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
  if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    if (!CRHS)
      return SDValue();

    const APFloat &APF = CRHS->getValueAPF();
    if (APF.isInfinity() && !APF.isNegative()) {
      const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
                                 SIInstrFlags::N_INFINITY;
      const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
                                    SIInstrFlags::P_ZERO |
                                    SIInstrFlags::N_NORMAL |
                                    SIInstrFlags::P_NORMAL |
                                    SIInstrFlags::N_SUBNORMAL |
                                    SIInstrFlags::P_SUBNORMAL;
      unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
                         DAG.getConstant(Mask, SL, MVT::i32));
    }
  }

  return SDValue();
}
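// Worked example of the select fold above: (setcc (select cc, 5, 3), 3, eq)
// matches CF == CRHSVal with SETEQ, so it becomes (xor cc, -1); with SETNE it
// folds directly to cc.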
SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

  SDValue Src = N->getOperand(0);
  SDValue Srl = N->getOperand(0);
  if (Srl.getOpcode() == ISD::ZERO_EXTEND)
    Srl = Srl.getOperand(0);

  // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
  if (Srl.getOpcode() == ISD::SRL) {
    // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x

    if (const ConstantSDNode *C =
          dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
                               EVT(MVT::i32));

      unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
      if (SrcOffset < 32 && SrcOffset % 8 == 0) {
        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
                           MVT::f32, Srl);
      }
    }
  }

  APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);

  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
    DCI.CommitTargetLoweringOpt(TLO);
  }

  return SDValue();
}
SDValue SITargetLowering::performClampCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
  if (!CSrc)
    return SDValue();

  const MachineFunction &MF = DCI.DAG.getMachineFunction();
  const APFloat &F = CSrc->getValueAPF();
  APFloat Zero = APFloat::getZero(F.getSemantics());
  APFloat::cmpResult Cmp0 = F.compare(Zero);
  if (Cmp0 == APFloat::cmpLessThan ||
      (Cmp0 == APFloat::cmpUnordered &&
       MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
  }

  APFloat One(F.getSemantics(), "1.0");
  APFloat::cmpResult Cmp1 = F.compare(One);
  if (Cmp1 == APFloat::cmpGreaterThan)
    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));

  return SDValue(CSrc, 0);
}
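// Worked example: a clamp of the constant 2.5 folds to 1.0 and a clamp of
// -0.5 folds to 0.0; a NaN source folds to 0.0 only when DX10Clamp is set,
// otherwise the constant is returned unchanged.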
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
    return SDValue();
  switch (N->getOpcode()) {
  default:
    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  case ISD::ADD:
    return performAddCombine(N, DCI);
  case ISD::SUB:
    return performSubCombine(N, DCI);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:
    return performAddCarrySubCarryCombine(N, DCI);
  case ISD::FADD:
    return performFAddCombine(N, DCI);
  case ISD::FSUB:
    return performFSubCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
    return performMinMaxCombine(N, DCI);
  case ISD::FMA:
    return performFMACombine(N, DCI);
  case ISD::LOAD: {
    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
      return Widened;
    LLVM_FALLTHROUGH;
  }
  case ISD::STORE:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
  case ISD::ATOMIC_CMP_SWAP:
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_LOAD_FADD:
  case AMDGPUISD::ATOMIC_INC:
  case AMDGPUISD::ATOMIC_DEC:
  case AMDGPUISD::ATOMIC_LOAD_FMIN:
  case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
    if (DCI.isBeforeLegalize())
      break;
    return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case ISD::XOR:
    return performXorCombine(N, DCI);
  case ISD::ZERO_EXTEND:
    return performZeroExtendCombine(N, DCI);
  case ISD::SIGN_EXTEND_INREG:
    return performSignExtendInRegCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  case AMDGPUISD::RCP:
    return performRcpCombine(N, DCI);
  case AMDGPUISD::FRACT:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::LDEXP: {
    SDValue Src = N->getOperand(0);
    if (Src.isUndef())
      return Src;
    break;
  }
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performUCharToFloatCombine(N, DCI);
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return performCvtF32UByteNCombine(N, DCI);
  case AMDGPUISD::FMED3:
    return performFMed3Combine(N, DCI);
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
    return performCvtPkRTZCombine(N, DCI);
  case AMDGPUISD::CLAMP:
    return performClampCombine(N, DCI);
  case ISD::SCALAR_TO_VECTOR: {
    SelectionDAG &DAG = DCI.DAG;
    EVT VT = N->getValueType(0);

    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    if (VT == MVT::v2i16 || VT == MVT::v2f16) {
      SDLoc SL(N);
      SDValue Src = N->getOperand(0);
      EVT EltVT = Src.getValueType();
      if (EltVT == MVT::f16)
        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);

      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
      return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
    }

    break;
  }
  case ISD::EXTRACT_VECTOR_ELT:
    return performExtractVectorEltCombine(N, DCI);
  case ISD::INSERT_VECTOR_ELT:
    return performInsertVectorEltCombine(N, DCI);
  }
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
/// Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  case AMDGPU::sub4: return 4; // Possible with TFE/LWE
  }
}
/// Adjust the writemask of MIMG instructions
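/// For example (an illustrative case, not taken from a test): an image load
/// returning v4f32 with dmask = 0xf whose only users are EXTRACT_SUBREGs of
/// two of the channels is rewritten below to the two-channel opcode with a
/// narrower dmask and a v2f32 result, and the users are retargeted to the
/// new node.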
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                          SelectionDAG &DAG) const {
  unsigned Opcode = Node->getMachineOpcode();

  // Subtract 1 because the vdata output is not a MachineSDNode operand.
  int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
    return Node; // not implemented for D16

  SDNode *Users[5] = { nullptr };
  unsigned Lane = 0;
  unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;
  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
  bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
                  Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
  unsigned TFCLane = 0;
  bool HasChain = Node->getNumValues() > 1;

  if (OldDmask == 0) {
    // These are folded out, but on the chance it happens don't assert.
    return Node;
  }

  unsigned OldBitsSet = countPopulation(OldDmask);
  // Work out which is the TFE/LWE lane if that is enabled.
  if (UsesTFC) {
    TFCLane = OldBitsSet;
  }

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Don't look at users of the chain.
    if (I.getUse().getResNo() != 0)
      continue;

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return Node;

    // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
    if (UsesTFC && Lane == TFCLane) {
      Users[Lane] = *I;
    } else {
      // Set which texture component corresponds to the lane.
      unsigned Comp;
      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
        Comp = countTrailingZeros(Dmask);
        Dmask &= ~(1 << Comp);
      }

      // Abort if we have more than one user per component.
      if (Users[Lane])
        return Node;

      Users[Lane] = *I;
      NewDmask |= 1 << Comp;
    }
  }

  // Don't allow 0 dmask, as hardware assumes one channel enabled.
  bool NoChannels = !NewDmask;
  if (NoChannels) {
    if (!UsesTFC) {
      // No uses of the result and not using TFC. Then do nothing.
      return Node;
    }
    // If the original dmask has one channel - then nothing to do
    if (OldBitsSet == 1)
      return Node;
    // Use an arbitrary dmask - required for the instruction to work
    NewDmask = 1;
  }
  // Abort if there's no change
  if (NewDmask == OldDmask)
    return Node;

  unsigned BitsSet = countPopulation(NewDmask);

  // Check for TFE or LWE - increase the number of channels by one to account
  // for the extra return value.
  // This will need adjustment for D16 if this is also included in
  // adjustWriteMask (this function) but at present D16 are excluded.
  unsigned NewChannels = BitsSet + UsesTFC;

  int NewOpcode =
      AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
  assert(NewOpcode != -1 &&
         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
         "failed to find equivalent MIMG op");

  // Adjust the writemask in the node
  SmallVector<SDValue, 12> Ops;
  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());

  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
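  // Channel counts of 3 and 5 have no matching result register width here, so
  // they are rounded up to the next supported vector size (4 and 8 elements).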
  MVT ResultVT = NewChannels == 1 ?
    SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
                           NewChannels == 5 ? 8 : NewChannels);
  SDVTList NewVTList = HasChain ?
    DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);

  MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
                                              NewVTList, Ops);

  if (HasChain) {
    // Update chain.
    DAG.setNodeMemRefs(NewNode, Node->memoperands());
    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
  }

  if (NewChannels == 1) {
    assert(Node->hasNUsesOfValue(1, 0));
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
                                      SDLoc(Node), Users[Lane]->getValueType(0),
                                      SDValue(NewNode, 0));
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return nullptr;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
    SDNode *User = Users[i];
    if (!User) {
      // Handle the special case of NoChannels. We set NewDmask to 1 above, but
      // Users[0] is still nullptr because channel 0 doesn't really have a use.
      if (i || !NoChannels)
        continue;
    } else {
      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
      DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
    }

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
    }
  }

  DAG.RemoveDeadNode(Node);
  return nullptr;
}
static bool isFrameIndexOp(SDValue Op) {
  if (Op.getOpcode() == ISD::AssertZext)
    Op = Op.getOperand(0);

  return isa<FrameIndexSDNode>(Op);
}
/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
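/// As a hypothetical illustration of the rewrite below, a selected
///   REG_SEQUENCE ..., TargetFrameIndex:i32<0>, ...
/// has its frame-index operand replaced with the result of an
///   S_MOV_B32 TargetFrameIndex:i32<0>
/// so the consuming instruction only ever sees register operands.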
SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                        SelectionDAG &DAG) const {
  if (Node->getOpcode() == ISD::CopyToReg) {
    RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
    SDValue SrcVal = Node->getOperand(2);

    // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    // to try understanding copies to physical registers.
    if (SrcVal.getValueType() == MVT::i1 &&
        Register::isPhysicalRegister(DestReg->getReg())) {
      SDLoc SL(Node);
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      SDValue VReg = DAG.getRegister(
        MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);

      SDNode *Glued = Node->getGluedNode();
      SDValue ToVReg
        = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
                           SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
      SDValue ToResultReg
        = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
                           VReg, ToVReg.getValue(1));
      DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
      DAG.RemoveDeadNode(Node);
      return ToResultReg.getNode();
    }
  }

  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                             Node->getOperand(i).getValueType(),
                                             Node->getOperand(i)), 0));
  }

  return DAG.UpdateNodeOperands(Node, Ops);
}
/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  unsigned Opcode = Node->getMachineOpcode();

  if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
      !TII->isGather4(Opcode)) {
    return adjustWritemask(Node, DAG);
  }

  if (Opcode == AMDGPU::INSERT_SUBREG ||
      Opcode == AMDGPU::REG_SEQUENCE) {
    legalizeTargetIndependentNode(Node, DAG);
    return Node;
  }

  switch (Opcode) {
  case AMDGPU::V_DIV_SCALE_F32:
  case AMDGPU::V_DIV_SCALE_F64: {
    // Satisfy the operand register constraint when one of the inputs is
    // undefined. Ordinarily each undef value will have its own implicit_def of
    // a vreg, so force these to use a single register.
    SDValue Src0 = Node->getOperand(0);
    SDValue Src1 = Node->getOperand(1);
    SDValue Src2 = Node->getOperand(2);

    if ((Src0.isMachineOpcode() &&
         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
        (Src0 == Src1 || Src0 == Src2))
      break;

    MVT VT = Src0.getValueType().getSimpleVT();
    const TargetRegisterClass *RC =
        getRegClassFor(VT, Src0.getNode()->isDivergent());

    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);

    SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
                                      UndefReg, Src0, SDValue());

    // src0 must be the same register as src1 or src2, even if the value is
    // undefined, so make sure we don't violate this constraint.
    if (Src0.isMachineOpcode() &&
        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
      if (Src1.isMachineOpcode() &&
          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src1;
      else if (Src2.isMachineOpcode() &&
               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src2;
      else {
        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
        Src0 = UndefReg;
        Src1 = UndefReg;
      }
    } else
      break;

    SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
    for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
      Ops.push_back(Node->getOperand(I));

    Ops.push_back(ImpDef.getValue(1));
    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
  }
  case AMDGPU::V_PERMLANE16_B32:
  case AMDGPU::V_PERMLANEX16_B32: {
    ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0));
    ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2));
    if (!FI->getZExtValue() && !BC->getZExtValue())
      break;
    SDValue VDstIn = Node->getOperand(6);
    if (VDstIn.isMachineOpcode()
        && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF)
      break;
    MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                               SDLoc(Node), MVT::i32);
    SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1),
                                    SDValue(BC, 0), Node->getOperand(3),
                                    Node->getOperand(4), Node->getOperand(5),
                                    SDValue(ImpDef, 0), Node->getOperand(7) };
    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
  }
  }

  return Node;
}
/// Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  if (TII->isVOP3(MI.getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);

    // Prefer VGPRs over AGPRs in mAI instructions where possible.
    // This saves a chain-copy of registers and better balances register
    // use between vgpr and agpr, as agpr tuples tend to be big.
    if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) {
      unsigned Opc = MI.getOpcode();
      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
                      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
        if (I == -1)
          break;
        MachineOperand &Op = MI.getOperand(I);
        if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
             OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
            !Register::isVirtualRegister(Op.getReg()) ||
            !TRI->isAGPR(MRI, Op.getReg()))
          continue;
        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
        if (!Src || !Src->isCopy() ||
            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
          continue;
        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
        // All uses of agpr64 and agpr32 can also accept vgpr except for
        // v_accvgpr_read, but we do not produce agpr reads during selection,
        // so no use checks are needed.
        MRI.setRegClass(Op.getReg(), NewRC);
      }
    }

    return;
  }

  // Replace unused atomics with the no return version.
  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
  if (NoRetAtomicOp != -1) {
    if (!Node->hasAnyUseOfValue(0)) {
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);
      return;
    }

    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
    // instruction, because the return type of these instructions is a vec2 of
    // the memory type, so it can be tied to the input operand.
    // This means these instructions always have a use, so we need to add a
    // special case to check if the atomic has only one extract_subreg use,
    // which itself has no uses.
    if ((Node->hasNUsesOfValue(1, 0) &&
         Node->use_begin()->isMachineOpcode() &&
         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
         !Node->use_begin()->hasAnyUseOfValue(0))) {
      Register Def = MI.getOperand(0).getReg();

      // Change this into a noret atomic.
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);

      // If we only remove the def operand from the atomic instruction, the
      // extract_subreg will be left with a use of a vreg without a def.
      // So we need to insert an implicit_def to avoid machine verifier
      // errors.
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(AMDGPU::IMPLICIT_DEF), Def);
    }
    return;
  }
}
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
                              uint64_t Val) {
  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}
MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  const SDValue Ops0[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
    buildSMovImm32(DAG, DL, 0),
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
  };

  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::v2i32, Ops0), 0);

  // Combine the constants and the pointer.
  const SDValue Ops1[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
    Ptr,
    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
    SubRegHi,
    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
/// Return a resource descriptor with the 'Add TID' bit enabled.
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
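/// As built below, the four dwords of the descriptor are: sub0 = the low half
/// of Ptr, sub1 = the high half of Ptr with RsrcDword1 OR'd in, and
/// sub2/sub3 = the low and high halves of RsrcDword2And3.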
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
                                           SDValue Ptr, uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
  if (RsrcDword1) {
    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
                                       DAG.getConstant(RsrcDword1, DL, MVT::i32)),
                    0);
  }

  SDValue DataLo = buildSMovImm32(DAG, DL,
                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

  const SDValue Ops[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
    PtrLo,
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    PtrHi,
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
    DataLo,
    DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
    DataHi,
    DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
//===----------------------------------------------------------------------===//
// SI Inline Assembly Support
//===----------------------------------------------------------------------===//
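// Examples of constraints handled below (illustrative, not exhaustive): "s",
// "v" and "a" map to SGPR, VGPR and AGPR register classes sized to fit the
// operand type, while longer constraints beginning with '{' followed by 'v',
// 's' or 'a' and a register index select that specific register.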
std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                               StringRef Constraint,
                                               MVT VT) const {
  const TargetRegisterClass *RC = nullptr;
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's':
    case 'r':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::SReg_32_XM0RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      case 96:
        RC = &AMDGPU::SReg_96RegClass;
        break;
      case 128:
        RC = &AMDGPU::SGPR_128RegClass;
        break;
      case 160:
        RC = &AMDGPU::SReg_160RegClass;
        break;
      case 256:
        RC = &AMDGPU::SReg_256RegClass;
        break;
      case 512:
        RC = &AMDGPU::SReg_512RegClass;
        break;
      }
      break;
    case 'v':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::VGPR_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::VReg_64RegClass;
        break;
      case 96:
        RC = &AMDGPU::VReg_96RegClass;
        break;
      case 128:
        RC = &AMDGPU::VReg_128RegClass;
        break;
      case 160:
        RC = &AMDGPU::VReg_160RegClass;
        break;
      case 256:
        RC = &AMDGPU::VReg_256RegClass;
        break;
      case 512:
        RC = &AMDGPU::VReg_512RegClass;
        break;
      }
      break;
    case 'a':
      if (!Subtarget->hasMAIInsts())
        break;
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::AGPR_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::AReg_64RegClass;
        break;
      case 128:
        RC = &AMDGPU::AReg_128RegClass;
        break;
      case 512:
        RC = &AMDGPU::AReg_512RegClass;
        break;
      case 1024:
        RC = &AMDGPU::AReg_1024RegClass;
        // v32 types are not legal but we support them here.
        return std::make_pair(0U, RC);
      }
      break;
    }
    // We actually support i128, i16 and f16 as inline parameters
    // even if they are not reported as legal.
    if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
               VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
      return std::make_pair(0U, RC);
  }

  if (Constraint.size() > 1) {
    if (Constraint[1] == 'v') {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (Constraint[1] == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    } else if (Constraint[1] == 'a') {
      RC = &AMDGPU::AGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
      if (!Failed && Idx < RC->getNumRegs())
        return std::make_pair(RC->getRegister(Idx), RC);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 's':
    case 'v':
    case 'a':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
  }

  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                             Info->getStackPtrOffsetReg()));
  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());

  // We need to worry about replacing the default register with itself in case
  // of MIR testcases missing the MFI.
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());

  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());

  if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
    MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
                       Info->getScratchWaveOffsetReg());
  }

  Info->limitOccupancy(MF);

  if (ST.isWave32() && !MF.empty()) {
    // Add a VCC_HI def because many instructions are marked as implicitly
    // using VCC, while we may only define VCC_LO. If nothing defines VCC_HI
    // we may end up with a use of undef.

    const SIInstrInfo *TII = ST.getInstrInfo();
    DebugLoc DL;

    MachineBasicBlock &MBB = MF.front();
    MachineBasicBlock::iterator I = MBB.getFirstNonDebugInstr();
    BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), AMDGPU::VCC_HI);

    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);
      }
    }
  }

  TargetLoweringBase::finalizeLowering(MF);
}
void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
                                                     KnownBits &Known,
                                                     const APInt &DemandedElts,
                                                     const SelectionDAG &DAG,
                                                     unsigned Depth) const {
  TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
                                                DAG, Depth);

  // Set the high bits to zero based on the maximum allowed scratch size per
  // wave. We can't use vaddr in MUBUF instructions if we don't know the address
  // calculation won't overflow, so assume the sign bit is never set.
  Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
  const Align CacheLineAlign = Align(64);

  // Pre-GFX10 targets did not benefit from loop alignment.
  if (!ML || DisableLoopAlignment ||
      (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;
  // On GFX10 the instruction cache consists of 4 x 64-byte cache lines.
  // By default the prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH so that larger loops have two lines
  // behind and one ahead.
  // Therefore we can benefit from aligning loop headers if the loop fits in
  // 192 bytes.
  // If the loop fits in 64 bytes it always spans no more than two cache lines
  // and does not need alignment.
  // Else, if the loop is at most 128 bytes, we do not need to modify the
  // prefetch settings.
  // Else, if the loop is at most 192 bytes, we need two lines behind.
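  // For example, under the thresholds above a 100-byte inner loop is only
  // aligned to the 64-byte cache line, while a 160-byte loop additionally
  // gets the S_INST_PREFETCH adjustments emitted at the end of this function.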

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If an inner loop block is aligned, assume on average half of the
    // alignment size to be added as nops.
    if (MBB != Header)
      LoopSize += MBB->getAlignment().value() / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of the parent loops is surrounded by prefetch instructions do not
  // insert new ones for the inner loop, which would reset the parent's
  // settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  if (Pre && Exit) {
    BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
            TII->get(AMDGPU::S_INST_PREFETCH))
      .addImm(1); // prefetch 2 lines behind PC

    BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
            TII->get(AMDGPU::S_INST_PREFETCH))
      .addImm(2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}
LLVM_ATTRIBUTE_UNUSED
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
  assert(N->getOpcode() == ISD::CopyFromReg);
  do {
    // Follow the chain until we find an INLINEASM node.
    N = N->getOperand(0).getNode();
    if (N->getOpcode() == ISD::INLINEASM ||
        N->getOpcode() == ISD::INLINEASM_BR)
      return true;
  } while (N->getOpcode() == ISD::CopyFromReg);
  return false;
}
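// Roughly speaking, values that can differ between the lanes of a wave
// (copies from VGPRs, flat or private loads, divergent intrinsics such as the
// workitem id, and the interpolation nodes below) are reported as divergent,
// while SGPR copies and uniform intrinsics are not.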
bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
    FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *KDA) const {
  switch (N->getOpcode()) {
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
    const MachineFunction *MF = FLI->MF;
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const MachineRegisterInfo &MRI = MF->getRegInfo();
    const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
    unsigned Reg = R->getReg();
    if (Register::isPhysicalRegister(Reg))
      return !TRI.isSGPRReg(MRI, Reg);

    if (MRI.isLiveIn(Reg)) {
      // workitem.id.x workitem.id.y workitem.id.z
      // Any VGPR formal argument is also considered divergent
      if (!TRI.isSGPRReg(MRI, Reg))
        return true;
      // Formal arguments of non-entry functions
      // are conservatively considered divergent
      else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
        return true;
      return false;
    }
    const Value *V = FLI->getValueFromVirtualReg(Reg);
    if (V)
      return KDA->isDivergent(V);
    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
    return !TRI.isSGPRReg(MRI, Reg);
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  }
  case ISD::CALLSEQ_END:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(
        cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
  case ISD::INTRINSIC_W_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(
        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
  // In some cases intrinsics that are a source of divergence have been
  // lowered to AMDGPUISD, so we also need to check those too.
  case AMDGPUISD::INTERP_MOV:
  case AMDGPUISD::INTERP_P1:
  case AMDGPUISD::INTERP_P2:
    return true;
  }
  return false;
}
bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
  case MVT::f32:
    return Subtarget->hasFP32Denormals();
  case MVT::f64:
    return Subtarget->hasFP64Denormals();
  case MVT::f16:
    return Subtarget->hasFP16Denormals();
  default:
    return false;
  }
}
bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                    const SelectionDAG &DAG,
                                                    bool SNaN,
                                                    unsigned Depth) const {
  if (Op.getOpcode() == AMDGPUISD::CLAMP) {
    const MachineFunction &MF = DAG.getMachineFunction();
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

    if (Info->getMode().DX10Clamp)
      return true; // Clamped to 0.
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }

  return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
                                                            SNaN, Depth);
}
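// A rough sketch of what the hook below decides (the IR is illustrative, not
// taken from this file):
//   %old = atomicrmw fadd float addrspace(3)* %p, float %v seq_cst
// is left alone when the subtarget supports LDS FP atomics, while float
// atomicrmw in other address spaces is expanded to a cmpxchg loop.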
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::FAdd: {
    Type *Ty = RMW->getType();

    // We don't have a way to support 16-bit atomics now, so just leave them
    // as-is.
    if (Ty->isHalfTy())
      return AtomicExpansionKind::None;

    if (!Ty->isFloatTy())
      return AtomicExpansionKind::CmpXChg;

    // TODO: Do have these for flat. Older targets also had them for buffers.
    unsigned AS = RMW->getPointerAddressSpace();
    return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
      AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
  }
  default:
    break;
  }

  return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
}