//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//
#if defined(_MSC_VER) || defined(__MINGW32__)
#define _USE_MATH_DEFINES
#endif
19 #include "SIISelLowering.h"
21 #include "AMDGPUSubtarget.h"
22 #include "AMDGPUTargetMachine.h"
23 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24 #include "SIDefines.h"
25 #include "SIInstrInfo.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "SIRegisterInfo.h"
28 #include "Utils/AMDGPUBaseInfo.h"
29 #include "llvm/ADT/APFloat.h"
30 #include "llvm/ADT/APInt.h"
31 #include "llvm/ADT/ArrayRef.h"
32 #include "llvm/ADT/BitVector.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
35 #include "llvm/ADT/StringRef.h"
36 #include "llvm/ADT/StringSwitch.h"
37 #include "llvm/ADT/Twine.h"
38 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
39 #include "llvm/CodeGen/Analysis.h"
40 #include "llvm/CodeGen/CallingConvLower.h"
41 #include "llvm/CodeGen/DAGCombine.h"
42 #include "llvm/CodeGen/ISDOpcodes.h"
43 #include "llvm/CodeGen/MachineBasicBlock.h"
44 #include "llvm/CodeGen/MachineFrameInfo.h"
45 #include "llvm/CodeGen/MachineFunction.h"
46 #include "llvm/CodeGen/MachineInstr.h"
47 #include "llvm/CodeGen/MachineInstrBuilder.h"
48 #include "llvm/CodeGen/MachineLoopInfo.h"
49 #include "llvm/CodeGen/MachineMemOperand.h"
50 #include "llvm/CodeGen/MachineModuleInfo.h"
51 #include "llvm/CodeGen/MachineOperand.h"
52 #include "llvm/CodeGen/MachineRegisterInfo.h"
53 #include "llvm/CodeGen/SelectionDAG.h"
54 #include "llvm/CodeGen/SelectionDAGNodes.h"
55 #include "llvm/CodeGen/TargetCallingConv.h"
56 #include "llvm/CodeGen/TargetRegisterInfo.h"
57 #include "llvm/CodeGen/ValueTypes.h"
58 #include "llvm/IR/Constants.h"
59 #include "llvm/IR/DataLayout.h"
60 #include "llvm/IR/DebugLoc.h"
61 #include "llvm/IR/DerivedTypes.h"
62 #include "llvm/IR/DiagnosticInfo.h"
63 #include "llvm/IR/Function.h"
64 #include "llvm/IR/GlobalValue.h"
65 #include "llvm/IR/InstrTypes.h"
66 #include "llvm/IR/Instruction.h"
67 #include "llvm/IR/Instructions.h"
68 #include "llvm/IR/IntrinsicInst.h"
69 #include "llvm/IR/Type.h"
70 #include "llvm/Support/Casting.h"
71 #include "llvm/Support/CodeGen.h"
72 #include "llvm/Support/CommandLine.h"
73 #include "llvm/Support/Compiler.h"
74 #include "llvm/Support/ErrorHandling.h"
75 #include "llvm/Support/KnownBits.h"
76 #include "llvm/Support/MachineValueType.h"
77 #include "llvm/Support/MathExtras.h"
78 #include "llvm/Target/TargetOptions.h"
using namespace llvm;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> DisableLoopAlignment(
  "amdgpu-disable-loop-alignment",
  cl::desc("Do not align and prefetch loops"),
  cl::init(false));
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}
SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI),
      Subtarget(&STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
  addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);

  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
  addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
  if (Subtarget->has16BitInsts()) {
    addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);

    // Unless there are also VOP3P operations, not all operations on these
    // vector types are really legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
  }

  if (Subtarget->hasMAIInsts()) {
    addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
    addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
  }
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::LOAD, MVT::v32i32, Custom);

  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v3i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setOperationAction(ISD::STORE, MVT::v5i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::v32i32, Custom);
  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);

  setOperationAction(ISD::UADDO, MVT::i32, Legal);
  setOperationAction(ISD::USUBO, MVT::i32, Legal);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);

  setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
                  MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
                  MVT::v32i32, MVT::v32f32 }) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::BUILD_VECTOR:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::SCALAR_TO_VECTOR:
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }
  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  // Deal with vec3 vector operations when widened to vec4.
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);

  // Deal with vec5 vector operations when widened to vec8.
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison.
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
  if (Subtarget->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // This is s_memtime on SI and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  setOperationAction(ISD::TRAP, MVT::Other, Custom);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::FLOG, MVT::f16, Custom);
    setOperationAction(ISD::FEXP, MVT::f16, Custom);
    setOperationAction(ISD::FLOG10, MVT::f16, Custom);
  }

  // v_mad_f32 does not support denormals according to some sources.
  if (!Subtarget->hasFP32Denormals())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);
  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);
  setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);

  // These are really only legal for ieee_mode functions. We should be avoiding
  // them for functions that don't have ieee_mode enabled, so just say they are
  // legal.
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
  if (Subtarget->haveRoundOpsF64()) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  } else {
    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
    setOperationAction(ISD::FRINT, MVT::f64, Custom);
    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);
  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::Constant, MVT::i16, Legal);

    setOperationAction(ISD::SMIN, MVT::i16, Legal);
    setOperationAction(ISD::SMAX, MVT::i16, Legal);

    setOperationAction(ISD::UMIN, MVT::i16, Legal);
    setOperationAction(ISD::UMAX, MVT::i16, Legal);

    setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction(ISD::ROTR, MVT::i16, Promote);
    setOperationAction(ISD::ROTL, MVT::i16, Promote);

    setOperationAction(ISD::SDIV, MVT::i16, Promote);
    setOperationAction(ISD::UDIV, MVT::i16, Promote);
    setOperationAction(ISD::SREM, MVT::i16, Promote);
    setOperationAction(ISD::UREM, MVT::i16, Promote);

    setOperationAction(ISD::BSWAP, MVT::i16, Promote);
    setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);

    setOperationAction(ISD::CTTZ, MVT::i16, Promote);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTPOP, MVT::i16, Promote);

    setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);

    setOperationAction(ISD::BR_CC, MVT::i16, Expand);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FCOS, MVT::f16, Promote);
    setOperationAction(ISD::FSIN, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::FROUND, MVT::f16, Custom);

    // F16 - VOP2 Actions.
    setOperationAction(ISD::BR_CC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);

    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);
    for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::BUILD_VECTOR:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
          break;
        case ISD::CONCAT_VECTORS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }
    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, MVT::v2i16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);

    setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
    setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
    if (!Subtarget->hasVOP3PInsts()) {
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
    }

    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns).
    setOperationAction(ISD::FABS, MVT::v2f16, Legal);

    setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
    setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
  }
  if (Subtarget->hasVOP3PInsts()) {
    setOperationAction(ISD::ADD, MVT::v2i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i16, Legal);
    setOperationAction(ISD::MUL, MVT::v2i16, Legal);
    setOperationAction(ISD::SHL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRA, MVT::v2i16, Legal);
    setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v2i16, Legal);

    setOperationAction(ISD::FADD, MVT::v2f16, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
    setOperationAction(ISD::FMA, MVT::v2f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);

    setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);

    setOperationAction(ISD::SHL, MVT::v4i16, Custom);
    setOperationAction(ISD::SRA, MVT::v4i16, Custom);
    setOperationAction(ISD::SRL, MVT::v4i16, Custom);
    setOperationAction(ISD::ADD, MVT::v4i16, Custom);
    setOperationAction(ISD::SUB, MVT::v4i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i16, Custom);

    setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
    setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::UMAX, MVT::v4i16, Custom);

    setOperationAction(ISD::FADD, MVT::v4f16, Custom);
    setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
    setOperationAction(ISD::FMA, MVT::v4f16, Custom);

    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);

    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
  }

  setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
  setOperationAction(ISD::FABS, MVT::v4f16, Custom);
  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
  } else {
    // Legalization hack.
    setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v2f16, Custom);

    setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
    setOperationAction(ISD::FABS, MVT::v2f16, Custom);
  }

  for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
  }
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ADDCARRY);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::SUBCARRY);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::FMINNUM_IEEE);
  setTargetDAGCombine(ISD::FMAXNUM_IEEE);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::SMIN);
  setTargetDAGCombine(ISD::SMAX);
  setTargetDAGCombine(ISD::UMIN);
  setTargetDAGCombine(ISD::UMAX);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::FCANONICALIZE);
  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);

  setSchedulingPreference(Sched::RegPressure);
}
const GCNSubtarget *SITargetLowering::getSubtarget() const {
  return Subtarget;
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//
// v_mad_mix* support a conversion from f16 to f32.
//
// There is one special case, when denormals are enabled, where this would
// still be OK to use, but we don't currently handle it.
bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
         DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
         SrcVT.getScalarType() == MVT::f16;
}
bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}
MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32)
      return ScalarVT.getSimpleVT();

    if (Size > 32)
      return MVT::i32;

    if (Size == 16 && Subtarget->has16BitInsts())
      return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
  } else if (VT.getSizeInBits() > 32)
    return MVT::i32;

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    if (Size == 32)
      return NumElts;

    if (Size > 32)
      return NumElts * ((Size + 31) / 32);

    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;
  } else if (VT.getSizeInBits() > 32)
    return (VT.getSizeInBits() + 31) / 32;

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
  LLVMContext &Context, CallingConv::ID CC,
  EVT VT, EVT &IntermediateVT,
  unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size > 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
    }

    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // inconsistent.
    if (Size == 16 && Subtarget->has16BitInsts()) {
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
    Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
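
// Derive the memory VT for an intrinsic that returns a struct of a data
// vector (or scalar) plus an i32, rounding the element count up to a power of
// two as computed below.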
static MVT memVTFromAggregate(Type *Ty) {
  // Only limited forms of aggregate type currently expected.
  assert(Ty->isStructTy() && "Expected struct type");

  Type *ElementType = nullptr;
  unsigned NumElts;
  if (Ty->getContainedType(0)->isVectorTy()) {
    VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
    ElementType = VecComponent->getElementType();
    NumElts = VecComponent->getNumElements();
  } else {
    ElementType = Ty->getContainedType(0);
    NumElts = 1;
  }

  assert((Ty->getContainedType(1) &&
          Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");

  // Calculate the size of the memVT type from the aggregate.
  unsigned Pow2Elts = 0;
  unsigned ElementSize;
  switch (ElementType->getTypeID()) {
  default:
    llvm_unreachable("Unknown type!");
  case Type::IntegerTyID:
    ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
    break;
  case Type::FloatTyID:
    ElementSize = 32;
    break;
  }
  unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
  Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);

  return MVT::getVectorVT(MVT::getVT(ElementType, false),
                          Pow2Elts);
}
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {
  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
          AMDGPU::lookupRsrcIntrinsic(IntrID)) {
    AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
                                                  (Intrinsic::ID)IntrID);
    if (Attr.hasFnAttribute(Attribute::ReadNone))
      return false;

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (RsrcIntr->IsImage) {
      Info.ptrVal = MFI->getImagePSV(
        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
        CI.getArgOperand(RsrcIntr->RsrcArg));
    } else {
      Info.ptrVal = MFI->getBufferPSV(
        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
        CI.getArgOperand(RsrcIntr->RsrcArg));
    }

    Info.flags = MachineMemOperand::MODereferenceable;
    if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getType(), true);
      if (Info.memVT == MVT::Other) {
        // Some intrinsics return an aggregate type - special case to work out
        // the correct memVT.
        Info.memVT = memVTFromAggregate(CI.getType());
      }
      Info.flags |= MachineMemOperand::MOLoad;
    } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
      Info.opc = ISD::INTRINSIC_VOID;
      Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
      Info.flags |= MachineMemOperand::MOStore;
    } else {
      // Atomic.
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getType());
      Info.flags = MachineMemOperand::MOLoad |
                   MachineMemOperand::MOStore |
                   MachineMemOperand::MODereferenceable;

      // XXX - Should this be volatile without known ordering?
      Info.flags |= MachineMemOperand::MOVolatile;
    }
    return true;
  }

  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_buffer_atomic_fadd: {
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
    Info.ptrVal = MFI->getBufferPSV(
      *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
      CI.getArgOperand(1));
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
    if (!Vol || !Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_global_atomic_fadd: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
                            ->getPointerElementType());
    Info.ptrVal = CI.getOperand(0);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    return true;
  }
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.opc = ISD::INTRINSIC_VOID;

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Info.ptrVal =
        MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());

    // This is an abstract access, but we need to specify a type and size.
    Info.memVT = MVT::i32;
    Info.align = Align(4);

    Info.flags = MachineMemOperand::MOStore;
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
      Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  default:
    return false;
  }
}
*II
,
1045 SmallVectorImpl
<Value
*> &Ops
,
1046 Type
*&AccessTy
) const {
1047 switch (II
->getIntrinsicID()) {
1048 case Intrinsic::amdgcn_atomic_inc
:
1049 case Intrinsic::amdgcn_atomic_dec
:
1050 case Intrinsic::amdgcn_ds_ordered_add
:
1051 case Intrinsic::amdgcn_ds_ordered_swap
:
1052 case Intrinsic::amdgcn_ds_fadd
:
1053 case Intrinsic::amdgcn_ds_fmin
:
1054 case Intrinsic::amdgcn_ds_fmax
: {
1055 Value
*Ptr
= II
->getArgOperand(0);
1056 AccessTy
= II
->getType();
1065 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode
&AM
) const {
1066 if (!Subtarget
->hasFlatInstOffsets()) {
1067 // Flat instructions do not have offsets, and only have the register
1069 return AM
.BaseOffs
== 0 && AM
.Scale
== 0;
1072 // GFX9 added a 13-bit signed offset. When using regular flat instructions,
1073 // the sign bit is ignored and is treated as a 12-bit unsigned offset.
1075 // GFX10 shrinked signed offset to 12 bits. When using regular flat
1076 // instructions, the sign bit is also ignored and is treated as 11-bit
1079 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
)
1080 return isUInt
<11>(AM
.BaseOffs
) && AM
.Scale
== 0;
1083 return isUInt
<12>(AM
.BaseOffs
) && AM
.Scale
== 0;
bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
  if (Subtarget->hasFlatGlobalInsts())
    return isInt<13>(AM.BaseOffs) && AM.Scale == 0;

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
    // Assume that we will use FLAT for all global memory accesses.
    //
    // FIXME: This assumption is currently wrong. On VI we still use
    // MUBUF instructions for the r + i addressing mode. As currently
    // implemented, the MUBUF instructions only work on buffer < 4GB.
    // It may be possible to support > 4GB buffers with MUBUF instructions,
    // by setting the stride value in the resource descriptor which would
    // increase the size limit to (stride * 4GB). However, this is risky,
    // because it has never been validated.
    return isLegalFlatAddressingMode(AM);
  }

  return isLegalMUBUFAddressingMode(AM);
}
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
  // additionally can do r + r + i with addr64. 32-bit has more addressing
  // mode options. Depending on the resource constant, it can also do
  // (i64 r0) + (i32 r1) * (i14 i).
  //
  // Private arrays end up using a scratch buffer most of the time, so also
  // assume those use MUBUF instructions. Scratch loads / stores are currently
  // implemented as mubuf instructions with offen bit set, so slightly
  // different than the normal addr64.
  if (!isUInt<12>(AM.BaseOffs))
    return false;

  // FIXME: Since we can split immediate into soffset and immediate offset,
  // would it make sense to allow any immediate?

  switch (AM.Scale) {
  case 0: // r + i or just i, depending on HasBaseReg.
    return true;
  case 1:
    return true; // We have r + r or r + i.
  case 2:
    if (AM.HasBaseReg) {
      // Reject 2 * r + r.
      return false;
    }

    // Allow 2 * r as r + r.
    // Or 2 * r + i is allowed as r + r + i.
    return true;
  default: // Don't allow n * r
    return false;
  }
}
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS, Instruction *I) const {
  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return isLegalGlobalAddressingMode(AM);

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::BUFFER_FAT_POINTER) {
    // If the offset isn't a multiple of 4, it probably isn't going to be
    // correctly aligned.
    // FIXME: Can we get the real alignment here?
    if (AM.BaseOffs % 4 != 0)
      return isLegalMUBUFAddressingMode(AM);

    // There are no SMRD extloads, so if we have to do a small type access we
    // will use a MUBUF load.
    // FIXME?: We also need to do this if unaligned, but we don't know the
    // alignment here.
    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
      return isLegalGlobalAddressingMode(AM);

    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
      if (!isUInt<8>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
      // in 8-bits, it can use a smaller encoding.
      if (!isUInt<32>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
      if (!isUInt<20>(AM.BaseOffs))
        return false;
    } else
      llvm_unreachable("unhandled generation");

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    return isLegalMUBUFAddressingMode(AM);
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
             AS == AMDGPUAS::REGION_ADDRESS) {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
    // offset.
    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
    // an 8-bit dword offset but we don't know the alignment here.
    if (!isUInt<16>(AM.BaseOffs))
      return false;

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
             AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
    // For an unknown address space, this usually means that this is for some
    // reason being used for pure arithmetic, and not based on some addressing
    // computation. We don't have instructions that compute pointers with any
    // addressing modes, so treat them as having no offset like flat
    // instructions.
    return isLegalFlatAddressingMode(AM);
  }

  llvm_unreachable("unhandled address space");
}
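
// Cap the width of merged stores per address space: 4 dwords for global/flat,
// the subtarget's max private element size for scratch, and 2 dwords for
// LDS/region.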
bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
                                        const SelectionDAG &DAG) const {
  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
    return (MemVT.getSizeInBits() <= 4 * 32);
  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
    return (MemVT.getSizeInBits() <= MaxPrivateBits);
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    return (MemVT.getSizeInBits() <= 2 * 32);
  }
  return true;
}
bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
    unsigned Size, unsigned AddrSpace, unsigned Align,
    MachineMemOperand::Flags Flags, bool *IsFast) const {
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
    // with adjacent offsets.
    bool AlignedBy4 = (Align % 4 == 0);
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4;
  }

  // FIXME: We have to be conservative here and assume that flat operations
  // will access scratch. If we had access to the IR function, then we
  // could determine if any private memory was used in the function.
  if (!Subtarget->hasUnalignedScratchAccess() &&
      (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
       AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
    bool AlignedBy4 = Align >= 4;
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4;
  }

  if (Subtarget->hasUnalignedBufferAccess()) {
    // If we have a uniform constant load, it still requires using a slow
    // buffer instruction if unaligned.
    if (IsFast) {
      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
                 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
        (Align % 4 == 0) : true;
    }

    return true;
  }

  // Smaller than dword value must be aligned.
  //
  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.
  return Size >= 32 && Align >= 4;
}
bool SITargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
    bool *IsFast) const {
  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
  // which isn't a simple VT.
  // Until MVT is extended to handle this, simply check for the size and
  // rely on the condition below: allow accesses if the size is a multiple of 4.
  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
                           VT.getStoreSize() > 16)) {
    return false;
  }

  return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
                                            Align, Flags, IsFast);
}
EVT SITargetLowering::getOptimalMemOpType(
    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
    bool ZeroMemset, bool MemcpyStrSrc,
    const AttributeList &FuncAttributes) const {
  // FIXME: Should account for address space here.

  // The default fallback uses the private pointer size as a guess for a type to
  // use. Make sure we switch these to 64-bit accesses.

  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
    return MVT::v4i32;

  if (Size >= 8 && DstAlign >= 4)
    return MVT::v2i32;

  // Use the default.
  return MVT::Other;
}
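
// Address spaces covered by the flat/global aperture (or unknown,
// target-specific ones), between which an address space cast is a no-op.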
static bool isFlatGlobalAddrSpace(unsigned AS) {
  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
         AS == AMDGPUAS::FLAT_ADDRESS ||
         AS == AMDGPUAS::CONSTANT_ADDRESS ||
         AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
}
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
}

bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);
  const Value *Ptr = MemNode->getMemOperand()->getValue();
  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.noclobber");
}

bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
  // Flat -> private/local is a simple truncate.
  // Flat -> global is no-op.
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
    return true;

  return isNoopAddrSpaceCast(SrcAS, DestAS);
}

bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);

  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
}
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(MVT VT) const {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16))
    return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
  return TargetLoweringBase::getPreferredVectorAction(VT);
}

bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
  // FIXME: Could be smarter if called for vector constants.
  return true;
}
bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
    switch (Op) {
    // These operations are done with 32-bit instructions anyway.
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SELECT:
      // TODO: Extensions?
      return true;
    default:
      return false;
    }
  }

  // SimplifySetCC uses this function to determine whether or not it should
  // create setcc with i1 operands. We don't have instructions for i1 setcc.
  if (VT == MVT::i1 && Op == ISD::SETCC)
    return false;

  return TargetLowering::isTypeDesirableForOp(Op, VT);
}
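
// Build a pointer into the kernarg segment at the given byte offset, based on
// the preloaded KERNARG_SEGMENT_PTR input register.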
SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
                                                   const SDLoc &SL,
                                                   SDValue Chain,
                                                   uint64_t Offset) const {
  const DataLayout &DL = DAG.getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *InputPtrReg;
  const TargetRegisterClass *RC;

  std::tie(InputPtrReg, RC)
    = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);

  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
    MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

  return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
}
SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
                                            const SDLoc &SL) const {
  uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
                                               FIRST_IMPLICIT);
  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
}
SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Val,
                                         bool Signed,
                                         const ISD::InputArg *Arg) const {
  // First, if it is a widened vector, narrow it.
  if (VT.isVector() &&
      VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
    EVT NarrowedVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
                       VT.getVectorNumElements());
    Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
                      DAG.getConstant(0, SL, MVT::i32));
  }

  // Then convert the vector elements or scalar value.
  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
      VT.bitsLT(MemVT)) {
    unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
    Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
  }

  if (MemVT.isFloatingPoint())
    Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
  else if (Signed)
    Val = DAG.getSExtOrTrunc(Val, SL, VT);
  else
    Val = DAG.getZExtOrTrunc(Val, SL, VT);

  return Val;
}

SDValue SITargetLowering::lowerKernargMemParameter(
  SelectionDAG &DAG, EVT VT, EVT MemVT,
  const SDLoc &SL, SDValue Chain,
  uint64_t Offset, unsigned Align, bool Signed,
  const ISD::InputArg *Arg) const {
  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

  // Try to avoid using an extload by loading earlier than the argument address,
  // and extracting the relevant bits. The load should hopefully be merged with
  // the previous argument.
  if (MemVT.getStoreSize() < 4 && Align < 4) {
    // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
    int64_t AlignDownOffset = alignDown(Offset, 4);
    int64_t OffsetDiff = Offset - AlignDownOffset;

    EVT IntVT = MemVT.changeTypeToInteger();

    // TODO: If we passed in the base kernel offset we could have a better
    // alignment than 4, but we don't really need it.
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
                               MachineMemOperand::MODereferenceable |
                               MachineMemOperand::MOInvariant);

    SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
    SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);

    SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

    return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
  }

  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
                             MachineMemOperand::MODereferenceable |
                             MachineMemOperand::MOInvariant);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
}
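
// Lower an argument that was passed on the stack: create a fixed frame object
// at its location and load it with the extension implied by the CCValAssign.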
SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
                                              const SDLoc &SL, SDValue Chain,
                                              const ISD::InputArg &Arg) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  if (Arg.Flags.isByVal()) {
    unsigned Size = Arg.Flags.getByValSize();
    int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
    return DAG.getFrameIndex(FrameIdx, MVT::i32);
  }

  unsigned ArgOffset = VA.getLocMemOffset();
  unsigned ArgSize = VA.getValVT().getStoreSize();

  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);

  // Create load nodes to retrieve arguments from the stack.
  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
  SDValue ArgValue;

  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
  MVT MemVT = VA.getValVT();

  switch (VA.getLocInfo()) {
  default:
    break;
  case CCValAssign::BCvt:
    MemVT = VA.getLocVT();
    break;
  case CCValAssign::SExt:
    ExtType = ISD::SEXTLOAD;
    break;
  case CCValAssign::ZExt:
    ExtType = ISD::ZEXTLOAD;
    break;
  case CCValAssign::AExt:
    ExtType = ISD::EXTLOAD;
    break;
  }

  ArgValue = DAG.getExtLoad(
    ExtType, SL, VA.getLocVT(), Chain, FIN,
    MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
    MemVT);
  return ArgValue;
}
SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
  const SIMachineFunctionInfo &MFI,
  EVT VT,
  AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
  const ArgDescriptor *Reg;
  const TargetRegisterClass *RC;

  std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
}
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
                                   CallingConv::ID CallConv,
                                   ArrayRef<ISD::InputArg> Ins,
                                   BitVector &Skipped,
                                   FunctionType *FType,
                                   SIMachineFunctionInfo *Info) {
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
    const ISD::InputArg *Arg = &Ins[I];

    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
           "vector type argument should have been split");

    // First check if it's a PS input addr.
    if (CallConv == CallingConv::AMDGPU_PS &&
        !Arg->Flags.isInReg() && PSInputNum <= 15) {
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

      // Inconveniently only the first part of the split is marked as isSplit,
      // so skip to the end. We only want to increment PSInputNum once for the
      // entire split argument.
      if (Arg->Flags.isSplit()) {
        while (!Arg->Flags.isSplitEnd()) {
          assert((!Arg->VT.isVector() ||
                  Arg->VT.getScalarSizeInBits() == 16) &&
                 "unexpected vector split in ps argument type");
          if (!SkipArg)
            Splits.push_back(*Arg);
          Arg = &Ins[++I];
        }
      }

      if (SkipArg) {
        // We can safely skip PS inputs.
        Skipped.set(Arg->getOrigArgIndex());
        ++PSInputNum;
        continue;
      }

      Info->markPSInputAllocated(PSInputNum);
      if (Arg->Used)
        Info->markPSInputEnabled(PSInputNum);

      ++PSInputNum;
    }

    Splits.push_back(*Arg);
  }
}
// Allocate special inputs passed in VGPRs.
void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
                                                      MachineFunction &MF,
                                                      const SIRegisterInfo &TRI,
                                                      SIMachineFunctionInfo &Info) const {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (Info.hasWorkItemIDX()) {
    Register Reg = AMDGPU::VGPR0;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
  }

  if (Info.hasWorkItemIDY()) {
    Register Reg = AMDGPU::VGPR1;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
  }

  if (Info.hasWorkItemIDZ()) {
    Register Reg = AMDGPU::VGPR2;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
  }
}
// Try to allocate a VGPR at the end of the argument list, or if no argument
// VGPRs are left allocating a stack slot.
// If \p Mask is given it indicates bitfield position in the register.
// If \p Arg is given use it with new \p Mask instead of allocating new.
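// For example, workitem ID Y can be allocated with Mask = 0x3ff << 10 and the
// descriptor previously created for workitem ID X, so that all three IDs share
// one register, each occupying its own 10-bit field.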
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
                                         ArgDescriptor Arg = ArgDescriptor()) {
  if (Arg.isSet())
    return ArgDescriptor::createArg(Arg, Mask);

  ArrayRef<MCPhysReg> ArgVGPRs
    = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
  if (RegIdx == ArgVGPRs.size()) {
    // Spill to stack required.
    int64_t Offset = CCInfo.AllocateStack(4, 4);

    return ArgDescriptor::createStack(Offset, Mask);
  }

  unsigned Reg = ArgVGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  MachineFunction &MF = CCInfo.getMachineFunction();
  Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
  MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
  return ArgDescriptor::createRegister(Reg, Mask);
}
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
                                             const TargetRegisterClass *RC,
                                             unsigned NumArgRegs) {
  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
  if (RegIdx == ArgSGPRs.size())
    report_fatal_error("ran out of SGPRs for arguments");

  unsigned Reg = ArgSGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  MachineFunction &MF = CCInfo.getMachineFunction();
  MF.addLiveIn(Reg, RC);
  return ArgDescriptor::createRegister(Reg);
}

static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
}

static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
}
void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo,
                                                 MachineFunction &MF,
                                                 const SIRegisterInfo &TRI,
                                                 SIMachineFunctionInfo &Info) const {
  const unsigned Mask = 0x3ff;
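  // The three workitem IDs are packed into a single register: X in bits [9:0],
  // Y in bits [19:10] and Z in bits [29:20], as the shifted masks below show.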
  ArgDescriptor Arg;

  if (Info.hasWorkItemIDX()) {
    Arg = allocateVGPR32Input(CCInfo, Mask);
    Info.setWorkItemIDX(Arg);
  }

  if (Info.hasWorkItemIDY()) {
    Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
    Info.setWorkItemIDY(Arg);
  }

  if (Info.hasWorkItemIDZ())
    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
}
void SITargetLowering::allocateSpecialInputSGPRs(
  CCState &CCInfo,
  MachineFunction &MF,
  const SIRegisterInfo &TRI,
  SIMachineFunctionInfo &Info) const {
  auto &ArgInfo = Info.getArgInfo();

  // TODO: Unify handling with private memory pointers.

  if (Info.hasDispatchPtr())
    ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);

  if (Info.hasQueuePtr())
    ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);

  if (Info.hasKernargSegmentPtr())
    ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);

  if (Info.hasDispatchID())
    ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);

  // flat_scratch_init is not applicable for non-kernel functions.

  if (Info.hasWorkGroupIDX())
    ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);

  if (Info.hasWorkGroupIDY())
    ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);

  if (Info.hasWorkGroupIDZ())
    ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);

  if (Info.hasImplicitArgPtr())
    ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
}
// Allocate special inputs passed in user SGPRs.
void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
                                            MachineFunction &MF,
                                            const SIRegisterInfo &TRI,
                                            SIMachineFunctionInfo &Info) const {
  if (Info.hasImplicitBufferPtr()) {
    unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    CCInfo.AllocateReg(InputPtrReg);

    Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
    MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  }

  if (Info.hasDispatchID()) {
    unsigned DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}
// Allocate special input registers that are initialized per-wave.
void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
                                           MachineFunction &MF,
                                           SIMachineFunctionInfo &Info,
                                           CallingConv::ID CallConv,
                                           bool IsShader) const {
  if (Info.hasWorkGroupIDX()) {
    unsigned Reg = Info.addWorkGroupIDX();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDY()) {
    unsigned Reg = Info.addWorkGroupIDY();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDZ()) {
    unsigned Reg = Info.addWorkGroupIDZ();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupInfo()) {
    unsigned Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (IsShader) {
      PrivateSegmentWaveByteOffsetReg =
        Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      // This is true if the scratch wave byte offset doesn't have a fixed
      // location.
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      }
    } else
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }
}
static void reservePrivateMemoryRegs(const TargetMachine &TM,
                                     MachineFunction &MF,
                                     const SIRegisterInfo &TRI,
                                     SIMachineFunctionInfo &Info) {
  // Now that we've figured out where the scratch register inputs are, see if we
  // should reserve the arguments and use them directly.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool HasStackObjects = MFI.hasStackObjects();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

  // Everything live out of a block is spilled with fast regalloc, so it's
  // almost certain that spilling will be required.
  if (TM.getOptLevel() == CodeGenOpt::None)
    HasStackObjects = true;

  // For now assume stack access is needed in any callee functions, so we need
  // the scratch registers to pass in.
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
    // If we have stack objects, we unquestionably need the private buffer
    // resource. For the Code Object V2 ABI, this will be the first 4 user
    // SGPR inputs. We can reserve those and use them directly.

    Register PrivateSegmentBufferReg =
      Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    Info.setScratchRSrcReg(PrivateSegmentBufferReg);
  } else {
    unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
    // We tentatively reserve the last registers (skipping the last registers
    // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
    // we'll replace these with the ones immediately after those which were
    // really allocated. In the prologue copies will be inserted from the
    // argument to these reserved registers.

    // Without HSA, relocations are used for the scratch pointer and the
    // buffer resource setup is always inserted in the prologue. Scratch wave
    // offset is still in an input SGPR.
    Info.setScratchRSrcReg(ReservedBufferReg);
  }

  // hasFP should be accurate for kernels even before the frame is finalized.
  if (ST.getFrameLowering()->hasFP(MF)) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    // Try to use s32 as the SP, but move it if it would interfere with input
    // arguments. This won't work with calls though.
    //
    // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
    // registers.
    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
    } else {
      assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));

      if (MFI.hasCalls())
        report_fatal_error("call in graphics shader with too many input SGPRs");

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);
          break;
        }
      }

      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
        report_fatal_error("failed to find register for SP");
    }

    if (MFI.hasCalls()) {
      Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
      Info.setFrameOffsetReg(AMDGPU::SGPR33);
    } else {
      unsigned ReservedOffsetReg =
        TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
      Info.setFrameOffsetReg(ReservedOffsetReg);
    }
  } else if (RequiresStackAccess) {
    assert(!MFI.hasCalls());
    // We know there are accesses and they will be done relative to SP, so just
    // pin it to the input.
    //
    // FIXME: Should not do this if inline asm is reading/writing these
    // registers.
    Register PreloadedSP = Info.getPreloadedReg(
        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

    Info.setStackPtrOffsetReg(PreloadedSP);
    Info.setScratchWaveOffsetReg(PreloadedSP);
    Info.setFrameOffsetReg(PreloadedSP);
  } else {
    assert(!MFI.hasCalls());

    // There may not be stack access at all. There may still be spills, or
    // access of a constant pointer (in which cases an extra copy will be
    // emitted in the prolog).
    unsigned ReservedOffsetReg
      = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    Info.setStackPtrOffsetReg(ReservedOffsetReg);
    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    Info.setFrameOffsetReg(ReservedOffsetReg);
  }
}
bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  return !Info->isEntryFunction();
}

void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
}

void SITargetLowering::insertCopiesSplitCSR(
  MachineBasicBlock *Entry,
  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
      .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
        .addReg(NewVR);
  }
}
SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &Fn = MF.getFunction();
  FunctionType *FType = MF.getFunction().getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
    DiagnosticInfoUnsupported NoGraphicsHSA(
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    DAG.getContext()->diagnose(NoGraphicsHSA);
    return DAG.getEntryNode();
  }

  SmallVector<ISD::InputArg, 16> Splits;
  SmallVector<CCValAssign, 16> ArgLocs;
  BitVector Skipped(Ins.size());
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  bool IsShader = AMDGPU::isShader(CallConv);
  bool IsKernel = AMDGPU::isKernel(CallConv);
  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);

  if (IsShader) {
    processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);

    // At least one interpolation mode must be enabled or else the GPU will
    // hang.
    //
    // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
    // set PSInputAddr, the user wants to enable some bits after the compilation
    // based on run-time states. Since we can't know what the final PSInputEna
    // will look like, we shouldn't do anything here and the user should take
    // responsibility for the correct programming.
    //
    // Otherwise, the following restrictions apply:
    // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
    // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
    //   enabled too.
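    //
    // For example, if a pixel shader enabled only POS_W_FLOAT (bit 11), the
    // code below force-enables input 0 (PERSP_SAMPLE) and allocates
    // VGPR0/VGPR1 so that at least one PERSP_* mode stays live.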
    if (CallConv == CallingConv::AMDGPU_PS) {
      if ((Info->getPSInputAddr() & 0x7F) == 0 ||
          ((Info->getPSInputAddr() & 0xF) == 0 &&
           Info->isPSInputAllocated(11))) {
        CCInfo.AllocateReg(AMDGPU::VGPR0);
        CCInfo.AllocateReg(AMDGPU::VGPR1);
        Info->markPSInputAllocated(0);
        Info->markPSInputEnabled(0);
      }
      if (Subtarget->isAmdPalOS()) {
        // For isAmdPalOS, the user does not enable some bits after compilation
        // based on run-time states; the register values being generated here are
        // the final ones set in hardware. Therefore we need to apply the
        // workaround to PSInputAddr and PSInputEnable together. (The case where
        // a bit is set in PSInputAddr but not PSInputEnable is where the
        // frontend set up an input arg for a particular interpolation mode, but
        // nothing uses that input arg. Really we should have an earlier pass
        // that removes such an arg.)
        unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
        if ((PsInputBits & 0x7F) == 0 ||
            ((PsInputBits & 0xF) == 0 &&
             (PsInputBits >> 11 & 1)))
          Info->markPSInputEnabled(
              countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
      }
    }

    assert(!Info->hasDispatchPtr() &&
           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
           !Info->hasWorkItemIDZ());
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
  } else {
    Splits.append(Ins.begin(), Ins.end());
  }

  if (IsEntryFunc) {
    allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
    allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
  }

  if (IsKernel) {
    analyzeFormalArgumentsCompute(CCInfo, Ins);
  } else {
    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
    CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
  }

  SmallVector<SDValue, 16> Chains;

  // FIXME: This is the minimum kernel argument alignment. We should improve
  // this to the maximum alignment of the arguments.
  //
  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
  // kern arg offset.
  const unsigned KernelArgBaseAlign = 16;
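  // For example, an argument at byte offset 4 gets MinAlign(16, 4) = 4, while
  // one at offset 16 keeps the full 16-byte base alignment.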
  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];
    if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    MVT VT = VA.getLocVT();

    if (IsEntryFunc && VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = VA.getLocVT();

      const uint64_t Offset = VA.getLocMemOffset();
      unsigned Align = MinAlign(KernelArgBaseAlign, Offset);

      SDValue Arg = lowerKernargMemParameter(
        DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
      Chains.push_back(Arg.getValue(1));

      auto *ParamTy =
        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
      if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
          ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
                      ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
        // On SI local pointers are just offsets into LDS, so they are always
        // less than 16-bits. On CI and newer they could potentially be
        // real pointers, so we can't guarantee their size.
        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
                          DAG.getValueType(MVT::i16));
      }

      InVals.push_back(Arg);
      continue;
    } else if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
      InVals.push_back(Val);
      if (!Arg.Flags.isByVal())
        Chains.push_back(Val.getValue(1));
      continue;
    }

    assert(VA.isRegLoc() && "Parameter must be in a register!");

    Register Reg = VA.getLocReg();
    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
    EVT ValVT = VA.getValVT();

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.Flags.isSRet()) {
      // The return object should be reasonably addressable.

      // FIXME: This helps when the return is a real sret. If it is an
      // automatically inserted sret (i.e. CanLowerReturn returns false), an
      // extra copy is inserted in SelectionDAGBuilder which obscures this.
      unsigned NumBits
        = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
        DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
    }

    // If this is an 8 or 16-bit value, it is really passed promoted
    // to 32 bits. Insert an assert[sz]ext to capture this, then
    // truncate to the right size.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
                        DAG.getValueType(ValVT));
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
                        DAG.getValueType(ValVT));
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Val);
  }

  if (!IsEntryFunc) {
    // Special inputs come after user arguments.
    allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
    CCInfo.AllocateReg(Info->getFrameOffsetReg());
    allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  auto &ArgUsageInfo =
    DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());

  unsigned StackArgSize = CCInfo.getNextStackOffset();
  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain :
    DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
// TODO: If return values can't fit in registers, we should return as many as
// possible in registers before passing on stack.
bool SITargetLowering::CanLowerReturn(
  CallingConv::ID CallConv,
  MachineFunction &MF, bool IsVarArg,
  const SmallVectorImpl<ISD::OutputArg> &Outs,
  LLVMContext &Context) const {
  // Replacing returns with sret/stack usage doesn't make sense for shaders.
  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
  // for shaders. Vector types should be explicitly handled by CC.
  if (AMDGPU::isEntryFunctionCC(CallConv))
    return true;

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
}
SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (AMDGPU::isKernel(CallConv)) {
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);
  }

  bool IsShader = AMDGPU::isShader(CallConv);

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;
  SmallVector<ISD::OutputArg, 48> Splits;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Flag;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

  // Add return address for callable functions.
  if (!Info->isEntryFunction()) {
    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    SDValue ReturnAddrReg = CreateLiveInRegister(
      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);

    SDValue ReturnAddrVirtualReg = DAG.getRegister(
        MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
        MVT::i64);
    Chain =
        DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(ReturnAddrVirtualReg);
  }

  // Copy the result values into the output registers.
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");
    // TODO: Partially return in registers if return values don't fit.
    SDValue Arg = OutVals[RealRVLocIdx];

    // Copied from other backends.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // FIXME: Does sret work properly?
  if (!Info->isEntryFunction()) {
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
    if (I) {
      for (; *I; ++I) {
        if (AMDGPU::SReg_64RegClass.contains(*I))
          RetOps.push_back(DAG.getRegister(*I, MVT::i64));
        else if (AMDGPU::SReg_32RegClass.contains(*I))
          RetOps.push_back(DAG.getRegister(*I, MVT::i32));
        else
          llvm_unreachable("Unexpected register class in CSRsViaCopy!");
      }
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  unsigned Opc = AMDGPUISD::ENDPGM;
  if (!IsWaveEnd)
    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
SDValue SITargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    SDValue ThisVal) const {
  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];
    SDValue Val;

    if (VA.isRegLoc()) {
      Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    } else if (VA.isMemLoc()) {
      report_fatal_error("TODO: return values in memory");
    } else
      llvm_unreachable("unknown argument location type");

    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Val);
  }

  return Chain;
}
// Add code to pass special inputs required depending on used features separate
// from the explicit user arguments present in the IR.
void SITargetLowering::passSpecialInputs(
    CallLoweringInfo &CLI,
    CCState &CCInfo,
    const SIMachineFunctionInfo &Info,
    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
    SmallVectorImpl<SDValue> &MemOpChains,
    SDValue Chain) const {
  // If we don't have a call site, this was a call inserted by
  // legalization. These can never use special inputs.
  if (!CLI.CS)
    return;

  const Function *CalleeFunc = CLI.CS.getCalledFunction();
  assert(CalleeFunc);

  SelectionDAG &DAG = CLI.DAG;
  const SDLoc &DL = CLI.DL;

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  auto &ArgUsageInfo =
    DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
  const AMDGPUFunctionArgInfo &CalleeArgInfo
    = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
  };

  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;

    std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC)
      = CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    // All special arguments are ints for now.
    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
    SDValue InputReg;

    if (IncomingArg) {
      InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
    } else {
      // The implicit arg ptr is special because it doesn't have a corresponding
      // input for kernels, and is computed from the kernarg segment pointer.
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
      InputReg = getImplicitArgPtr(DAG, DL);
    }

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
    } else {
      unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
      SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
                                              SpecialArgOffset);
      MemOpChains.push_back(ArgStore);
    }
  }

  // Pack workitem IDs into a single register or pass it as is if already
  // packed.
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;

  std::tie(OutgoingArg, ArgRC) =
    CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC) =
      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC) =
      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return;

  const ArgDescriptor *IncomingArgX
    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
  const ArgDescriptor *IncomingArgY
    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
  const ArgDescriptor *IncomingArgZ
    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;

  SDValue InputReg;
  SDLoc SL;

  // If incoming ids are not packed we need to pack them.
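  // The packed layout mirrors the entry-point convention: X in bits [9:0],
  // Y in bits [19:10] and Z in bits [29:20], hence the shifts by 10 and 20.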
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
    SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
    Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
                    DAG.getShiftAmountConstant(10, MVT::i32, SL));
    InputReg = InputReg.getNode() ?
                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
    SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
    Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
                    DAG.getShiftAmountConstant(20, MVT::i32, SL));
    InputReg = InputReg.getNode() ?
                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
  }

  if (!InputReg.getNode()) {
    // Workitem ids are already packed, any of present incoming arguments
    // will carry all required fields.
    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
      IncomingArgX ? *IncomingArgX :
      IncomingArgY ? *IncomingArgY :
                     *IncomingArgZ, ~0u);
    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
  }

  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
  } else {
    unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
    SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
                                            SpecialArgOffset);
    MemOpChains.push_back(ArgStore);
  }
}
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return CC == CallingConv::Fast;
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::C:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}
bool SITargetLowering::isEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  // Kernels aren't callable, and don't have a live in return address so it
  // doesn't make sense to do a tail call with entry functions.
  if (!CallerPreserved)
    return false;

  bool CCMatch = CallerCC == CalleeCC;

  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // TODO: Can we handle var args?
  if (IsVarArg)
    return false;

  for (const Argument &Arg : CallerF.args()) {
    if (Arg.hasByValAttr())
      return false;
  }

  LLVMContext &Ctx = *DAG.getContext();

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
                                  CCAssignFnForCall(CalleeCC, IsVarArg),
                                  CCAssignFnForCall(CallerCC, IsVarArg)))
    return false;

  // The callee has to preserve all registers the caller needs to preserve.
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // Nothing more to check if the callee is taking no arguments.
  if (Outs.empty())
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made tail.
  // TODO: Is this really necessary?
  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
    return false;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
}
bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!CI->isTailCall())
    return false;

  const Function *ParentFn = CI->getParent()->getParent();
  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
    return false;

  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
  return (Attr.getValueAsString() != "true");
}
// The wave scratch offset register is used as the global base pointer.
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                                    SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  const SDLoc &DL = CLI.DL;
  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &IsTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool IsVarArg = CLI.IsVarArg;
  bool IsSibCall = false;
  bool IsThisReturn = false;
  MachineFunction &MF = DAG.getMachineFunction();

  if (Callee.isUndef() || isNullConstant(Callee)) {
    if (!CLI.IsTailCall) {
      for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
        InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
    }

    return Chain;
  }

  if (IsVarArg) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported call to variadic function ");
  }

  if (!CLI.CS.getInstruction())
    report_fatal_error("unsupported libcall legalization");

  if (!CLI.CS.getCalledFunction()) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported indirect call to function ");
  }

  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported required tail call to function ");
  }

  if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
    // Note the issue is with the CC of the calling function, not of the call
    // itself.
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported call from graphics shader of function ");
  }

  if (IsTailCall) {
    IsTailCall = isEligibleForTailCallOptimization(
      Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
    if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
      report_fatal_error("failed to perform tail call elimination on a call "
                         "site marked musttail");
    }

    bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;

    // A sibling call is one where we're under the usual C ABI and not planning
    // to change that but can still do a tail call:
    if (!TailCallOpt && IsTailCall)
      IsSibCall = true;

    if (IsTailCall)
      ++NumTailCalls;
  }

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);

  CCInfo.AnalyzeCallOperands(Outs, AssignFn);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  if (IsSibCall) {
    // Since we're not changing the ABI to make this a tail call, the memory
    // operands are already available in the caller's incoming argument space.
    NumBytes = 0;
  }

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0. Completely unused for non-tail calls.
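  //
  // In this lowering only sibling calls reach the tail-call path (a required
  // tail call under GuaranteedTailCallOpt is rejected above), so FPDiff stays
  // 0 and outgoing stack arguments simply reuse the caller's fixed slots.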
  int32_t FPDiff = 0;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  if (!IsSibCall) {
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);

    SmallVector<SDValue, 4> CopyFromChains;

    // In the HSA case, this should be an identity copy.
    SDValue ScratchRSrcReg
      = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
    RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
    CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
    Chain = DAG.getTokenFactor(DL, CopyFromChains);
  }

  SmallVector<SDValue, 8> MemOpChains;
  MVT PtrVT = MVT::i32;

  // Walk the register/memloc assignments, inserting copies/loads.
  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
       ++i, ++realArgIdx) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[realArgIdx];

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::FPExt:
      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      assert(VA.isMemLoc());

      SDValue DstAddr;
      MachinePointerInfo DstInfo;

      unsigned LocMemOffset = VA.getLocMemOffset();
      int32_t Offset = LocMemOffset;

      SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
      MaybeAlign Alignment;

      if (IsTailCall) {
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
        unsigned OpSize = Flags.isByVal() ?
          Flags.getByValSize() : VA.getValVT().getStoreSize();

        // FIXME: We can have better than the minimum byval required alignment.
        Alignment =
            Flags.isByVal()
                ? MaybeAlign(Flags.getByValAlign())
                : commonAlignment(Subtarget->getStackAlignment(), Offset);

        Offset = Offset + FPDiff;
        int FI = MFI.CreateFixedObject(OpSize, Offset, true);

        DstAddr = DAG.getFrameIndex(FI, PtrVT);
        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);

        // Make sure any stack arguments overlapping with where we're storing
        // are loaded before this eventual operation. Otherwise they'll be
        // clobbered.

        // FIXME: Why is this really necessary? This seems to just result in a
        // lot of code to copy the stack and write them back to the same
        // locations, which are supposed to be immutable?
        Chain = addTokenForArgument(Chain, DAG, MFI, FI);
      } else {
        DstAddr = PtrOff;
        DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
        Alignment =
            commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
      }

      if (Outs[i].Flags.isByVal()) {
        SDValue SizeNode =
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
        SDValue Cpy = DAG.getMemcpy(
            Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
            /*isVol = */ false, /*AlwaysInline = */ true,
            /*isTailCall = */ false, DstInfo,
            MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
                *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));

        MemOpChains.push_back(Cpy);
      } else {
        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo,
                                     Alignment ? Alignment->value() : 0);
        MemOpChains.push_back(Store);
      }
    }
  }

  // Copy special input registers after user input arguments.
  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (auto &RegToPass : RegsToPass) {
    Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
                             RegToPass.second, InFlag);
    InFlag = Chain.getValue(1);
  }

  SDValue PhysReturnAddrReg;
  if (IsTailCall) {
    // Since the return is being combined with the call, we need to pass on the
    // return address.

    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    SDValue ReturnAddrReg = CreateLiveInRegister(
      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);

    PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
                                        MVT::i64);
    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
    InFlag = Chain.getValue(1);
  }

  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call, however in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll be
  // in the correct location.
  if (IsTailCall && !IsSibCall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getTargetConstant(NumBytes, DL, MVT::i32),
                               DAG.getTargetConstant(0, DL, MVT::i32),
                               InFlag, DL);
    InFlag = Chain.getValue(1);
  }

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);
  // Add a redundant copy of the callee global which will not be legalized, as
  // we need direct access to the callee later.
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
  const GlobalValue *GV = GSD->getGlobal();
  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));

  if (IsTailCall) {
    // Each tail call may have to adjust the stack by a different amount, so
    // this information must travel along with the operation for eventual
    // consumption by emitEpilogue.
    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));

    Ops.push_back(PhysReturnAddrReg);
  }

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (auto &RegToPass : RegsToPass) {
    Ops.push_back(DAG.getRegister(RegToPass.first,
                                  RegToPass.second.getValueType()));
  }

  // Add a register mask operand representing the call-preserved registers.
  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
  if (IsTailCall) {
    MFI.setHasTailCall();
    return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
  }

  // Returns a chain and a flag for retval copy to use.
  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
  Chain = Call.getValue(0);
  InFlag = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
                             DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
                             InFlag, DL);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
                         InVals, IsThisReturn,
                         IsThisReturn ? OutVals[0] : SDValue());
}
Register SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                             const MachineFunction &MF) const {
  Register Reg = StringSwitch<Register>(RegName)
    .Case("m0", AMDGPU::M0)
    .Case("exec", AMDGPU::EXEC)
    .Case("exec_lo", AMDGPU::EXEC_LO)
    .Case("exec_hi", AMDGPU::EXEC_HI)
    .Case("flat_scratch", AMDGPU::FLAT_SCR)
    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    .Default(Register());

  if (Reg == AMDGPU::NoRegister) {
    report_fatal_error(Twine("invalid register name \""
                             + StringRef(RegName) + "\"."));
  }

  if (!Subtarget->hasFlatScrRegister() &&
       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    report_fatal_error(Twine("invalid register \""
                             + StringRef(RegName) + "\" for subtarget."));
  }

  switch (Reg) {
  case AMDGPU::M0:
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
    if (VT.getSizeInBits() == 32)
      return Reg;
    break;
  case AMDGPU::EXEC:
  case AMDGPU::FLAT_SCR:
    if (VT.getSizeInBits() == 64)
      return Reg;
    break;
  default:
    llvm_unreachable("missing register type checking");
  }

  report_fatal_error(Twine("invalid type for register \""
                           + StringRef(RegName) + "\"."));
}
// If kill is not the last instruction, split the block so kill is always a
// proper terminator.
MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
                                                    MachineBasicBlock *BB) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineBasicBlock::iterator SplitPoint(&MI);
  ++SplitPoint;

  if (SplitPoint == BB->end()) {
    // Don't bother with a new block.
    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
    return BB;
  }

  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *SplitBB
    = MF->CreateMachineBasicBlock(BB->getBasicBlock());

  MF->insert(++MachineFunction::iterator(BB), SplitBB);
  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());

  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(SplitBB);

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
  return SplitBB;
}
// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
// \p MI will be the only instruction in the loop body block. Otherwise, it will
// be the first instruction in the remainder block.
//
/// \returns { LoopBody, Remainder }
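///
/// After the split, \p MBB falls through to LoopBody, LoopBody branches either
/// back to itself or on to Remainder, and Remainder inherits \p MBB's original
/// successors.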
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
  MachineFunction *MF = MBB.getParent();
  MachineBasicBlock::iterator I(&MI);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);

  if (InstInLoop) {
    auto Next = std::next(I);

    // Move instruction to loop body.
    LoopBB->splice(LoopBB->begin(), &MBB, I, Next);

    // Move the rest of the block.
    RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
  } else {
    RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
  }

  MBB.addSuccessor(LoopBB);

  return std::make_pair(LoopBB, RemainderBB);
}
/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
  MachineBasicBlock *MBB = MI.getParent();
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  auto I = MI.getIterator();
  auto E = std::next(I);

  BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
    .addImm(0);

  MIBundleBuilder Bundler(*MBB, I, E);
  finalizeBundle(*MBB, Bundler.begin());
}
MachineBasicBlock *
SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
  const DebugLoc &DL = MI.getDebugLoc();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

  MachineBasicBlock *LoopBB;
  MachineBasicBlock *RemainderBB;
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Apparently kill flags are only valid if the def is in the same block?
  if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
    Src->setIsKill(false);

  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);

  MachineBasicBlock::iterator I = LoopBB->end();

  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
    AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);

  // Clear TRAP_STS.MEM_VIOL
  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
    .addImm(0)
    .addImm(EncodedReg);

  bundleInstWithWaitcnt(MI);

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  // Load and check TRAP_STS.MEM_VIOL
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
    .addImm(EncodedReg);

  // FIXME: Do we need to use an isel pseudo that may clobber scc?
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
    .addReg(Reg, RegState::Kill)
    .addImm(0);

  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
    .addMBB(LoopBB);

  return RemainderBB;
}
// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
// wavefront. If the value is uniform and just happens to be in a VGPR, this
// will only do one iteration. In the worst case, this will loop 64 times.
//
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
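//
// Each iteration uses V_READFIRSTLANE_B32 to pick one lane's index value,
// masks EXEC down to the lanes sharing that value, performs the indexed move,
// then XORs those lanes out of EXEC; the loop exits once EXEC reaches zero.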
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
  const SIInstrInfo *TII,
  MachineRegisterInfo &MRI,
  MachineBasicBlock &OrigBB,
  MachineBasicBlock &LoopBB,
  const DebugLoc &DL,
  const MachineOperand &IdxReg,
  unsigned InitReg,
  unsigned ResultReg,
  unsigned PhiReg,
  unsigned InitSaveExecReg,
  int Offset,
  bool UseGPRIdxMode,
  bool IsIndirectSrc) {
  MachineFunction *MF = OrigBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineBasicBlock::iterator I = LoopBB.begin();

  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
    .addReg(InitReg)
    .addMBB(&OrigBB)
    .addReg(ResultReg)
    .addMBB(&LoopBB);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&OrigBB)
    .addReg(NewExec)
    .addMBB(&LoopBB);

  // Read the next variant <- also loop target.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Compare the just read M0 value to all possible Idx values.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
    .addReg(CurrentIdxReg)
    .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());

  // Update EXEC, save the original EXEC value to VCC.
  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                                : AMDGPU::S_AND_SAVEEXEC_B64),
          NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
    unsigned IdxReg;
    if (Offset == 0) {
      IdxReg = CurrentIdxReg;
    } else {
      IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
        .addReg(CurrentIdxReg, RegState::Kill)
        .addImm(Offset);
    }
    unsigned IdxMode = IsIndirectSrc ?
      AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
    MachineInstr *SetOn =
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
      .addReg(IdxReg, RegState::Kill)
      .addImm(IdxMode);
    SetOn->getOperand(3).setIsUndef();
  } else {
    // Move index from VCC into M0
    if (Offset == 0) {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(CurrentIdxReg, RegState::Kill);
    } else {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(CurrentIdxReg, RegState::Kill)
        .addImm(Offset);
    }
  }

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  MachineInstr *InsertPt =
    BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                                  : AMDGPU::S_XOR_B64_term), Exec)
      .addReg(Exec)
      .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);

  return InsertPt->getIterator();
}
3272 // This has slightly sub-optimal regalloc when the source vector is killed by
3273 // the read. The register allocator does not understand that the kill is
3274 // per-workitem, so is kept alive for the whole loop so we end up not re-using a
3275 // subregister from it, using 1 more VGPR than necessary. This was saved when
3276 // this was expanded after register allocation.
3277 static MachineBasicBlock::iterator
loadM0FromVGPR(const SIInstrInfo
*TII
,
3278 MachineBasicBlock
&MBB
,
3280 unsigned InitResultReg
,
3284 bool IsIndirectSrc
) {
3285 MachineFunction
*MF
= MBB
.getParent();
3286 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3287 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3288 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3289 const DebugLoc
&DL
= MI
.getDebugLoc();
3290 MachineBasicBlock::iterator
I(&MI
);
3292 const auto *BoolXExecRC
= TRI
->getRegClass(AMDGPU::SReg_1_XEXECRegClassID
);
3293 Register DstReg
= MI
.getOperand(0).getReg();
3294 Register SaveExec
= MRI
.createVirtualRegister(BoolXExecRC
);
3295 Register TmpExec
= MRI
.createVirtualRegister(BoolXExecRC
);
3296 unsigned Exec
= ST
.isWave32() ? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
3297 unsigned MovExecOpc
= ST
.isWave32() ? AMDGPU::S_MOV_B32
: AMDGPU::S_MOV_B64
;
3299 BuildMI(MBB
, I
, DL
, TII
->get(TargetOpcode::IMPLICIT_DEF
), TmpExec
);
3301 // Save the EXEC mask
3302 BuildMI(MBB
, I
, DL
, TII
->get(MovExecOpc
), SaveExec
)
3305 MachineBasicBlock
*LoopBB
;
3306 MachineBasicBlock
*RemainderBB
;
3307 std::tie(LoopBB
, RemainderBB
) = splitBlockForLoop(MI
, MBB
, false);
3309 const MachineOperand
*Idx
= TII
->getNamedOperand(MI
, AMDGPU::OpName::idx
);
3311 auto InsPt
= emitLoadM0FromVGPRLoop(TII
, MRI
, MBB
, *LoopBB
, DL
, *Idx
,
3312 InitResultReg
, DstReg
, PhiReg
, TmpExec
,
3313 Offset
, UseGPRIdxMode
, IsIndirectSrc
);
3315 MachineBasicBlock::iterator First
= RemainderBB
->begin();
3316 BuildMI(*RemainderBB
, First
, DL
, TII
->get(MovExecOpc
), Exec
)
3322 // Returns subreg index, offset
3323 static std::pair
<unsigned, int>
3324 computeIndirectRegAndOffset(const SIRegisterInfo
&TRI
,
3325 const TargetRegisterClass
*SuperRC
,
3328 int NumElts
= TRI
.getRegSizeInBits(*SuperRC
) / 32;
3330 // Skip out of bounds offsets, or else we would end up using an undefined
3332 if (Offset
>= NumElts
|| Offset
< 0)
3333 return std::make_pair(AMDGPU::sub0
, Offset
);
3335 return std::make_pair(AMDGPU::sub0
+ Offset
, 0);
3338 // Return true if the index is an SGPR and was set.
3339 static bool setM0ToIndexFromSGPR(const SIInstrInfo
*TII
,
3340 MachineRegisterInfo
&MRI
,
3344 bool IsIndirectSrc
) {
3345 MachineBasicBlock
*MBB
= MI
.getParent();
3346 const DebugLoc
&DL
= MI
.getDebugLoc();
3347 MachineBasicBlock::iterator
I(&MI
);
3349 const MachineOperand
*Idx
= TII
->getNamedOperand(MI
, AMDGPU::OpName::idx
);
3350 const TargetRegisterClass
*IdxRC
= MRI
.getRegClass(Idx
->getReg());
3352 assert(Idx
->getReg() != AMDGPU::NoRegister
);
3354 if (!TII
->getRegisterInfo().isSGPRClass(IdxRC
))
3357 if (UseGPRIdxMode
) {
3358 unsigned IdxMode
= IsIndirectSrc
?
3359 AMDGPU::VGPRIndexMode::SRC0_ENABLE
: AMDGPU::VGPRIndexMode::DST_ENABLE
;
3361 MachineInstr
*SetOn
=
3362 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_ON
))
3366 SetOn
->getOperand(3).setIsUndef();
3368 Register Tmp
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
3369 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_ADD_I32
), Tmp
)
3372 MachineInstr
*SetOn
=
3373 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_ON
))
3374 .addReg(Tmp
, RegState::Kill
)
3377 SetOn
->getOperand(3).setIsUndef();
3384 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_MOV_B32
), AMDGPU::M0
)
3387 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_ADD_I32
), AMDGPU::M0
)
3395 // Control flow needs to be inserted if indexing with a VGPR.
3396 static MachineBasicBlock
*emitIndirectSrc(MachineInstr
&MI
,
3397 MachineBasicBlock
&MBB
,
3398 const GCNSubtarget
&ST
) {
3399 const SIInstrInfo
*TII
= ST
.getInstrInfo();
3400 const SIRegisterInfo
&TRI
= TII
->getRegisterInfo();
3401 MachineFunction
*MF
= MBB
.getParent();
3402 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3404 Register Dst
= MI
.getOperand(0).getReg();
3405 Register SrcReg
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src
)->getReg();
3406 int Offset
= TII
->getNamedOperand(MI
, AMDGPU::OpName::offset
)->getImm();
3408 const TargetRegisterClass
*VecRC
= MRI
.getRegClass(SrcReg
);
3411 std::tie(SubReg
, Offset
)
3412 = computeIndirectRegAndOffset(TRI
, VecRC
, SrcReg
, Offset
);
3414 bool UseGPRIdxMode
= ST
.useVGPRIndexMode(EnableVGPRIndexMode
);
3416 if (setM0ToIndexFromSGPR(TII
, MRI
, MI
, Offset
, UseGPRIdxMode
, true)) {
3417 MachineBasicBlock::iterator
I(&MI
);
3418 const DebugLoc
&DL
= MI
.getDebugLoc();
3420 if (UseGPRIdxMode
) {
3421 // TODO: Look at the uses to avoid the copy. This may require rescheduling
3422 // to avoid interfering with other uses, so probably requires a new
3423 // optimization pass.
3424 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::V_MOV_B32_e32
), Dst
)
3425 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3426 .addReg(SrcReg
, RegState::Implicit
)
3427 .addReg(AMDGPU::M0
, RegState::Implicit
);
3428 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3430 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::V_MOVRELS_B32_e32
), Dst
)
3431 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3432 .addReg(SrcReg
, RegState::Implicit
);
3435 MI
.eraseFromParent();
3440 const DebugLoc
&DL
= MI
.getDebugLoc();
3441 MachineBasicBlock::iterator
I(&MI
);
3443 Register PhiReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3444 Register InitReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3446 BuildMI(MBB
, I
, DL
, TII
->get(TargetOpcode::IMPLICIT_DEF
), InitReg
);
3448 auto InsPt
= loadM0FromVGPR(TII
, MBB
, MI
, InitReg
, PhiReg
,
3449 Offset
, UseGPRIdxMode
, true);
3450 MachineBasicBlock
*LoopBB
= InsPt
->getParent();
3452 if (UseGPRIdxMode
) {
3453 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::V_MOV_B32_e32
), Dst
)
3454 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3455 .addReg(SrcReg
, RegState::Implicit
)
3456 .addReg(AMDGPU::M0
, RegState::Implicit
);
3457 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3459 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::V_MOVRELS_B32_e32
), Dst
)
3460 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3461 .addReg(SrcReg
, RegState::Implicit
);
3464 MI
.eraseFromParent();
3469 static unsigned getMOVRELDPseudo(const SIRegisterInfo
&TRI
,
3470 const TargetRegisterClass
*VecRC
) {
3471 switch (TRI
.getRegSizeInBits(*VecRC
)) {
3473 return AMDGPU::V_MOVRELD_B32_V1
;
3475 return AMDGPU::V_MOVRELD_B32_V2
;
3476 case 128: // 16 bytes
3477 return AMDGPU::V_MOVRELD_B32_V4
;
3478 case 256: // 32 bytes
3479 return AMDGPU::V_MOVRELD_B32_V8
;
3480 case 512: // 64 bytes
3481 return AMDGPU::V_MOVRELD_B32_V16
;
3483 llvm_unreachable("unsupported size for MOVRELD pseudos");
3487 static MachineBasicBlock
*emitIndirectDst(MachineInstr
&MI
,
3488 MachineBasicBlock
&MBB
,
3489 const GCNSubtarget
&ST
) {
3490 const SIInstrInfo
*TII
= ST
.getInstrInfo();
3491 const SIRegisterInfo
&TRI
= TII
->getRegisterInfo();
3492 MachineFunction
*MF
= MBB
.getParent();
3493 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3495 Register Dst
= MI
.getOperand(0).getReg();
3496 const MachineOperand
*SrcVec
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src
);
3497 const MachineOperand
*Idx
= TII
->getNamedOperand(MI
, AMDGPU::OpName::idx
);
3498 const MachineOperand
*Val
= TII
->getNamedOperand(MI
, AMDGPU::OpName::val
);
3499 int Offset
= TII
->getNamedOperand(MI
, AMDGPU::OpName::offset
)->getImm();
3500 const TargetRegisterClass
*VecRC
= MRI
.getRegClass(SrcVec
->getReg());
3502 // This can be an immediate, but will be folded later.
3503 assert(Val
->getReg());
3506 std::tie(SubReg
, Offset
) = computeIndirectRegAndOffset(TRI
, VecRC
,
3509 bool UseGPRIdxMode
= ST
.useVGPRIndexMode(EnableVGPRIndexMode
);
3511 if (Idx
->getReg() == AMDGPU::NoRegister
) {
3512 MachineBasicBlock::iterator
I(&MI
);
3513 const DebugLoc
&DL
= MI
.getDebugLoc();
3515 assert(Offset
== 0);
3517 BuildMI(MBB
, I
, DL
, TII
->get(TargetOpcode::INSERT_SUBREG
), Dst
)
3522 MI
.eraseFromParent();
3526 if (setM0ToIndexFromSGPR(TII
, MRI
, MI
, Offset
, UseGPRIdxMode
, false)) {
3527 MachineBasicBlock::iterator
I(&MI
);
3528 const DebugLoc
&DL
= MI
.getDebugLoc();
3530 if (UseGPRIdxMode
) {
3531 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::V_MOV_B32_indirect
))
3532 .addReg(SrcVec
->getReg(), RegState::Undef
, SubReg
) // vdst
3534 .addReg(Dst
, RegState::ImplicitDefine
)
3535 .addReg(SrcVec
->getReg(), RegState::Implicit
)
3536 .addReg(AMDGPU::M0
, RegState::Implicit
);
3538 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3540 const MCInstrDesc
&MovRelDesc
= TII
->get(getMOVRELDPseudo(TRI
, VecRC
));
3542 BuildMI(MBB
, I
, DL
, MovRelDesc
)
3543 .addReg(Dst
, RegState::Define
)
3544 .addReg(SrcVec
->getReg())
3546 .addImm(SubReg
- AMDGPU::sub0
);
3549 MI
.eraseFromParent();
3554 MRI
.clearKillFlags(Val
->getReg());
3556 const DebugLoc
&DL
= MI
.getDebugLoc();
3558 Register PhiReg
= MRI
.createVirtualRegister(VecRC
);
3560 auto InsPt
= loadM0FromVGPR(TII
, MBB
, MI
, SrcVec
->getReg(), PhiReg
,
3561 Offset
, UseGPRIdxMode
, false);
3562 MachineBasicBlock
*LoopBB
= InsPt
->getParent();
3564 if (UseGPRIdxMode
) {
3565 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::V_MOV_B32_indirect
))
3566 .addReg(PhiReg
, RegState::Undef
, SubReg
) // vdst
3568 .addReg(Dst
, RegState::ImplicitDefine
)
3569 .addReg(PhiReg
, RegState::Implicit
)
3570 .addReg(AMDGPU::M0
, RegState::Implicit
);
3571 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3573 const MCInstrDesc
&MovRelDesc
= TII
->get(getMOVRELDPseudo(TRI
, VecRC
));
3575 BuildMI(*LoopBB
, InsPt
, DL
, MovRelDesc
)
3576 .addReg(Dst
, RegState::Define
)
3579 .addImm(SubReg
- AMDGPU::sub0
);
3582 MI
.eraseFromParent();
3587 MachineBasicBlock
*SITargetLowering::EmitInstrWithCustomInserter(
3588 MachineInstr
&MI
, MachineBasicBlock
*BB
) const {
3590 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3591 MachineFunction
*MF
= BB
->getParent();
3592 SIMachineFunctionInfo
*MFI
= MF
->getInfo
<SIMachineFunctionInfo
>();
3594 if (TII
->isMIMG(MI
)) {
3595 if (MI
.memoperands_empty() && MI
.mayLoadOrStore()) {
3596 report_fatal_error("missing mem operand from MIMG instruction");
3598 // Add a memoperand for mimg instructions so that they aren't assumed to
3599 // be ordered memory instuctions.
3604 switch (MI
.getOpcode()) {
3605 case AMDGPU::S_ADD_U64_PSEUDO
:
3606 case AMDGPU::S_SUB_U64_PSEUDO
: {
3607 MachineRegisterInfo
&MRI
= BB
->getParent()->getRegInfo();
3608 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3609 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3610 const TargetRegisterClass
*BoolRC
= TRI
->getBoolRC();
3611 const DebugLoc
&DL
= MI
.getDebugLoc();
3613 MachineOperand
&Dest
= MI
.getOperand(0);
3614 MachineOperand
&Src0
= MI
.getOperand(1);
3615 MachineOperand
&Src1
= MI
.getOperand(2);
3617 Register DestSub0
= MRI
.createVirtualRegister(&AMDGPU::SReg_32RegClass
);
3618 Register DestSub1
= MRI
.createVirtualRegister(&AMDGPU::SReg_32RegClass
);
3620 MachineOperand Src0Sub0
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3621 Src0
, BoolRC
, AMDGPU::sub0
,
3622 &AMDGPU::SReg_32RegClass
);
3623 MachineOperand Src0Sub1
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3624 Src0
, BoolRC
, AMDGPU::sub1
,
3625 &AMDGPU::SReg_32RegClass
);
3627 MachineOperand Src1Sub0
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3628 Src1
, BoolRC
, AMDGPU::sub0
,
3629 &AMDGPU::SReg_32RegClass
);
3630 MachineOperand Src1Sub1
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3631 Src1
, BoolRC
, AMDGPU::sub1
,
3632 &AMDGPU::SReg_32RegClass
);
3634 bool IsAdd
= (MI
.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO
);
3636 unsigned LoOpc
= IsAdd
? AMDGPU::S_ADD_U32
: AMDGPU::S_SUB_U32
;
3637 unsigned HiOpc
= IsAdd
? AMDGPU::S_ADDC_U32
: AMDGPU::S_SUBB_U32
;
3638 BuildMI(*BB
, MI
, DL
, TII
->get(LoOpc
), DestSub0
)
3641 BuildMI(*BB
, MI
, DL
, TII
->get(HiOpc
), DestSub1
)
3644 BuildMI(*BB
, MI
, DL
, TII
->get(TargetOpcode::REG_SEQUENCE
), Dest
.getReg())
3646 .addImm(AMDGPU::sub0
)
3648 .addImm(AMDGPU::sub1
);
3649 MI
.eraseFromParent();
3652 case AMDGPU::SI_INIT_M0
: {
3653 BuildMI(*BB
, MI
.getIterator(), MI
.getDebugLoc(),
3654 TII
->get(AMDGPU::S_MOV_B32
), AMDGPU::M0
)
3655 .add(MI
.getOperand(0));
3656 MI
.eraseFromParent();
3659 case AMDGPU::SI_INIT_EXEC
:
3660 // This should be before all vector instructions.
3661 BuildMI(*BB
, &*BB
->begin(), MI
.getDebugLoc(), TII
->get(AMDGPU::S_MOV_B64
),
3663 .addImm(MI
.getOperand(0).getImm());
3664 MI
.eraseFromParent();
3667 case AMDGPU::SI_INIT_EXEC_LO
:
3668 // This should be before all vector instructions.
3669 BuildMI(*BB
, &*BB
->begin(), MI
.getDebugLoc(), TII
->get(AMDGPU::S_MOV_B32
),
3671 .addImm(MI
.getOperand(0).getImm());
3672 MI
.eraseFromParent();
3675 case AMDGPU::SI_INIT_EXEC_FROM_INPUT
: {
3676 // Extract the thread count from an SGPR input and set EXEC accordingly.
3677 // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3679 // S_BFE_U32 count, input, {shift, 7}
3680 // S_BFM_B64 exec, count, 0
3681 // S_CMP_EQ_U32 count, 64
3682 // S_CMOV_B64 exec, -1
3683 MachineInstr
*FirstMI
= &*BB
->begin();
3684 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3685 Register InputReg
= MI
.getOperand(0).getReg();
3686 Register CountReg
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
3689 // Move the COPY of the input reg to the beginning, so that we can use it.
3690 for (auto I
= BB
->begin(); I
!= &MI
; I
++) {
3691 if (I
->getOpcode() != TargetOpcode::COPY
||
3692 I
->getOperand(0).getReg() != InputReg
)
3696 FirstMI
= &*++BB
->begin();
3698 I
->removeFromParent();
3699 BB
->insert(FirstMI
, &*I
);
3707 // This should be before all vector instructions.
3708 unsigned Mask
= (getSubtarget()->getWavefrontSize() << 1) - 1;
3709 bool isWave32
= getSubtarget()->isWave32();
3710 unsigned Exec
= isWave32
? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
3711 BuildMI(*BB
, FirstMI
, DebugLoc(), TII
->get(AMDGPU::S_BFE_U32
), CountReg
)
3713 .addImm((MI
.getOperand(1).getImm() & Mask
) | 0x70000);
3714 BuildMI(*BB
, FirstMI
, DebugLoc(),
3715 TII
->get(isWave32
? AMDGPU::S_BFM_B32
: AMDGPU::S_BFM_B64
),
3719 BuildMI(*BB
, FirstMI
, DebugLoc(), TII
->get(AMDGPU::S_CMP_EQ_U32
))
3720 .addReg(CountReg
, RegState::Kill
)
3721 .addImm(getSubtarget()->getWavefrontSize());
3722 BuildMI(*BB
, FirstMI
, DebugLoc(),
3723 TII
->get(isWave32
? AMDGPU::S_CMOV_B32
: AMDGPU::S_CMOV_B64
),
3726 MI
.eraseFromParent();
3730 case AMDGPU::GET_GROUPSTATICSIZE
: {
3731 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA
||
3732 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL
);
3733 DebugLoc DL
= MI
.getDebugLoc();
3734 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::S_MOV_B32
))
3735 .add(MI
.getOperand(0))
3736 .addImm(MFI
->getLDSSize());
3737 MI
.eraseFromParent();
3740 case AMDGPU::SI_INDIRECT_SRC_V1
:
3741 case AMDGPU::SI_INDIRECT_SRC_V2
:
3742 case AMDGPU::SI_INDIRECT_SRC_V4
:
3743 case AMDGPU::SI_INDIRECT_SRC_V8
:
3744 case AMDGPU::SI_INDIRECT_SRC_V16
:
3745 return emitIndirectSrc(MI
, *BB
, *getSubtarget());
3746 case AMDGPU::SI_INDIRECT_DST_V1
:
3747 case AMDGPU::SI_INDIRECT_DST_V2
:
3748 case AMDGPU::SI_INDIRECT_DST_V4
:
3749 case AMDGPU::SI_INDIRECT_DST_V8
:
3750 case AMDGPU::SI_INDIRECT_DST_V16
:
3751 return emitIndirectDst(MI
, *BB
, *getSubtarget());
3752 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO
:
3753 case AMDGPU::SI_KILL_I1_PSEUDO
:
3754 return splitKillBlock(MI
, BB
);
3755 case AMDGPU::V_CNDMASK_B64_PSEUDO
: {
3756 MachineRegisterInfo
&MRI
= BB
->getParent()->getRegInfo();
3757 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3758 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3760 Register Dst
= MI
.getOperand(0).getReg();
3761 Register Src0
= MI
.getOperand(1).getReg();
3762 Register Src1
= MI
.getOperand(2).getReg();
3763 const DebugLoc
&DL
= MI
.getDebugLoc();
3764 Register SrcCond
= MI
.getOperand(3).getReg();
3766 Register DstLo
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3767 Register DstHi
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3768 const auto *CondRC
= TRI
->getRegClass(AMDGPU::SReg_1_XEXECRegClassID
);
3769 Register SrcCondCopy
= MRI
.createVirtualRegister(CondRC
);
3771 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::COPY
), SrcCondCopy
)
3773 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::V_CNDMASK_B32_e64
), DstLo
)
3775 .addReg(Src0
, 0, AMDGPU::sub0
)
3777 .addReg(Src1
, 0, AMDGPU::sub0
)
3778 .addReg(SrcCondCopy
);
3779 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::V_CNDMASK_B32_e64
), DstHi
)
3781 .addReg(Src0
, 0, AMDGPU::sub1
)
3783 .addReg(Src1
, 0, AMDGPU::sub1
)
3784 .addReg(SrcCondCopy
);
3786 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::REG_SEQUENCE
), Dst
)
3788 .addImm(AMDGPU::sub0
)
3790 .addImm(AMDGPU::sub1
);
3791 MI
.eraseFromParent();
3794 case AMDGPU::SI_BR_UNDEF
: {
3795 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3796 const DebugLoc
&DL
= MI
.getDebugLoc();
3797 MachineInstr
*Br
= BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::S_CBRANCH_SCC1
))
3798 .add(MI
.getOperand(0));
3799 Br
->getOperand(1).setIsUndef(true); // read undef SCC
3800 MI
.eraseFromParent();
3803 case AMDGPU::ADJCALLSTACKUP
:
3804 case AMDGPU::ADJCALLSTACKDOWN
: {
3805 const SIMachineFunctionInfo
*Info
= MF
->getInfo
<SIMachineFunctionInfo
>();
3806 MachineInstrBuilder
MIB(*MF
, &MI
);
3808 // Add an implicit use of the frame offset reg to prevent the restore copy
3809 // inserted after the call from being reorderd after stack operations in the
3810 // the caller's frame.
3811 MIB
.addReg(Info
->getStackPtrOffsetReg(), RegState::ImplicitDefine
)
3812 .addReg(Info
->getStackPtrOffsetReg(), RegState::Implicit
)
3813 .addReg(Info
->getFrameOffsetReg(), RegState::Implicit
);
3816 case AMDGPU::SI_CALL_ISEL
: {
3817 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3818 const DebugLoc
&DL
= MI
.getDebugLoc();
3820 unsigned ReturnAddrReg
= TII
->getRegisterInfo().getReturnAddressReg(*MF
);
3822 MachineInstrBuilder MIB
;
3823 MIB
= BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::SI_CALL
), ReturnAddrReg
);
3825 for (unsigned I
= 0, E
= MI
.getNumOperands(); I
!= E
; ++I
)
3826 MIB
.add(MI
.getOperand(I
));
3828 MIB
.cloneMemRefs(MI
);
3829 MI
.eraseFromParent();
3832 case AMDGPU::V_ADD_I32_e32
:
3833 case AMDGPU::V_SUB_I32_e32
:
3834 case AMDGPU::V_SUBREV_I32_e32
: {
3835 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
3836 const DebugLoc
&DL
= MI
.getDebugLoc();
3837 unsigned Opc
= MI
.getOpcode();
3839 bool NeedClampOperand
= false;
3840 if (TII
->pseudoToMCOpcode(Opc
) == -1) {
3841 Opc
= AMDGPU::getVOPe64(Opc
);
3842 NeedClampOperand
= true;
3845 auto I
= BuildMI(*BB
, MI
, DL
, TII
->get(Opc
), MI
.getOperand(0).getReg());
3846 if (TII
->isVOP3(*I
)) {
3847 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3848 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3849 I
.addReg(TRI
->getVCC(), RegState::Define
);
3851 I
.add(MI
.getOperand(1))
3852 .add(MI
.getOperand(2));
3853 if (NeedClampOperand
)
3854 I
.addImm(0); // clamp bit for e64 encoding
3856 TII
->legalizeOperands(*I
);
3858 MI
.eraseFromParent();
3861 case AMDGPU::DS_GWS_INIT
:
3862 case AMDGPU::DS_GWS_SEMA_V
:
3863 case AMDGPU::DS_GWS_SEMA_BR
:
3864 case AMDGPU::DS_GWS_SEMA_P
:
3865 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL
:
3866 case AMDGPU::DS_GWS_BARRIER
:
3867 // A s_waitcnt 0 is required to be the instruction immediately following.
3868 if (getSubtarget()->hasGWSAutoReplay()) {
3869 bundleInstWithWaitcnt(MI
);
3873 return emitGWSMemViolTestLoop(MI
, BB
);
3875 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI
, BB
);
3879 bool SITargetLowering::hasBitPreservingFPLogic(EVT VT
) const {
3880 return isTypeLegal(VT
.getScalarType());
3883 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT
) const {
3884 // This currently forces unfolding various combinations of fsub into fma with
3885 // free fneg'd operands. As long as we have fast FMA (controlled by
3886 // isFMAFasterThanFMulAndFAdd), we should perform these.
3888 // When fma is quarter rate, for f64 where add / sub are at best half rate,
3889 // most of these combines appear to be cycle neutral but save on instruction
3890 // count / code size.
3894 EVT
SITargetLowering::getSetCCResultType(const DataLayout
&DL
, LLVMContext
&Ctx
,
3896 if (!VT
.isVector()) {
3899 return EVT::getVectorVT(Ctx
, MVT::i1
, VT
.getVectorNumElements());
3902 MVT
SITargetLowering::getScalarShiftAmountTy(const DataLayout
&, EVT VT
) const {
3903 // TODO: Should i16 be used always if legal? For now it would force VALU
3905 return (VT
== MVT::i16
) ? MVT::i16
: MVT::i32
;
3908 // Answering this is somewhat tricky and depends on the specific device which
3909 // have different rates for fma or all f64 operations.
3911 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3912 // regardless of which device (although the number of cycles differs between
3913 // devices), so it is always profitable for f64.
3915 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3916 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
3917 // which we can always do even without fused FP ops since it returns the same
3918 // result as the separate operations and since it is always full
3919 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3920 // however does not support denormals, so we do report fma as faster if we have
3921 // a fast fma device and require denormals.
3923 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT
) const {
3924 VT
= VT
.getScalarType();
3926 switch (VT
.getSimpleVT().SimpleTy
) {
3928 // This is as fast on some subtargets. However, we always have full rate f32
3929 // mad available which returns the same result as the separate operations
3930 // which we should prefer over fma. We can't use this if we want to support
3931 // denormals, so only report this in these cases.
3932 if (Subtarget
->hasFP32Denormals())
3933 return Subtarget
->hasFastFMAF32() || Subtarget
->hasDLInsts();
3935 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3936 return Subtarget
->hasFastFMAF32() && Subtarget
->hasDLInsts();
3941 return Subtarget
->has16BitInsts() && Subtarget
->hasFP16Denormals();
3949 //===----------------------------------------------------------------------===//
3950 // Custom DAG Lowering Operations
3951 //===----------------------------------------------------------------------===//
3953 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3954 // wider vector type is legal.
3955 SDValue
SITargetLowering::splitUnaryVectorOp(SDValue Op
,
3956 SelectionDAG
&DAG
) const {
3957 unsigned Opc
= Op
.getOpcode();
3958 EVT VT
= Op
.getValueType();
3959 assert(VT
== MVT::v4f16
);
3962 std::tie(Lo
, Hi
) = DAG
.SplitVectorOperand(Op
.getNode(), 0);
3965 SDValue OpLo
= DAG
.getNode(Opc
, SL
, Lo
.getValueType(), Lo
,
3967 SDValue OpHi
= DAG
.getNode(Opc
, SL
, Hi
.getValueType(), Hi
,
3970 return DAG
.getNode(ISD::CONCAT_VECTORS
, SDLoc(Op
), VT
, OpLo
, OpHi
);
3973 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3974 // wider vector type is legal.
3975 SDValue
SITargetLowering::splitBinaryVectorOp(SDValue Op
,
3976 SelectionDAG
&DAG
) const {
3977 unsigned Opc
= Op
.getOpcode();
3978 EVT VT
= Op
.getValueType();
3979 assert(VT
== MVT::v4i16
|| VT
== MVT::v4f16
);
3982 std::tie(Lo0
, Hi0
) = DAG
.SplitVectorOperand(Op
.getNode(), 0);
3984 std::tie(Lo1
, Hi1
) = DAG
.SplitVectorOperand(Op
.getNode(), 1);
3988 SDValue OpLo
= DAG
.getNode(Opc
, SL
, Lo0
.getValueType(), Lo0
, Lo1
,
3990 SDValue OpHi
= DAG
.getNode(Opc
, SL
, Hi0
.getValueType(), Hi0
, Hi1
,
3993 return DAG
.getNode(ISD::CONCAT_VECTORS
, SDLoc(Op
), VT
, OpLo
, OpHi
);
3996 SDValue
SITargetLowering::splitTernaryVectorOp(SDValue Op
,
3997 SelectionDAG
&DAG
) const {
3998 unsigned Opc
= Op
.getOpcode();
3999 EVT VT
= Op
.getValueType();
4000 assert(VT
== MVT::v4i16
|| VT
== MVT::v4f16
);
4003 std::tie(Lo0
, Hi0
) = DAG
.SplitVectorOperand(Op
.getNode(), 0);
4005 std::tie(Lo1
, Hi1
) = DAG
.SplitVectorOperand(Op
.getNode(), 1);
4007 std::tie(Lo2
, Hi2
) = DAG
.SplitVectorOperand(Op
.getNode(), 2);
4011 SDValue OpLo
= DAG
.getNode(Opc
, SL
, Lo0
.getValueType(), Lo0
, Lo1
, Lo2
,
4013 SDValue OpHi
= DAG
.getNode(Opc
, SL
, Hi0
.getValueType(), Hi0
, Hi1
, Hi2
,
4016 return DAG
.getNode(ISD::CONCAT_VECTORS
, SDLoc(Op
), VT
, OpLo
, OpHi
);
4020 SDValue
SITargetLowering::LowerOperation(SDValue Op
, SelectionDAG
&DAG
) const {
4021 switch (Op
.getOpcode()) {
4022 default: return AMDGPUTargetLowering::LowerOperation(Op
, DAG
);
4023 case ISD::BRCOND
: return LowerBRCOND(Op
, DAG
);
4024 case ISD::RETURNADDR
: return LowerRETURNADDR(Op
, DAG
);
4026 SDValue Result
= LowerLOAD(Op
, DAG
);
4027 assert((!Result
.getNode() ||
4028 Result
.getNode()->getNumValues() == 2) &&
4029 "Load should return a value and a chain");
4035 return LowerTrig(Op
, DAG
);
4036 case ISD::SELECT
: return LowerSELECT(Op
, DAG
);
4037 case ISD::FDIV
: return LowerFDIV(Op
, DAG
);
4038 case ISD::ATOMIC_CMP_SWAP
: return LowerATOMIC_CMP_SWAP(Op
, DAG
);
4039 case ISD::STORE
: return LowerSTORE(Op
, DAG
);
4040 case ISD::GlobalAddress
: {
4041 MachineFunction
&MF
= DAG
.getMachineFunction();
4042 SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
4043 return LowerGlobalAddress(MFI
, Op
, DAG
);
4045 case ISD::INTRINSIC_WO_CHAIN
: return LowerINTRINSIC_WO_CHAIN(Op
, DAG
);
4046 case ISD::INTRINSIC_W_CHAIN
: return LowerINTRINSIC_W_CHAIN(Op
, DAG
);
4047 case ISD::INTRINSIC_VOID
: return LowerINTRINSIC_VOID(Op
, DAG
);
4048 case ISD::ADDRSPACECAST
: return lowerADDRSPACECAST(Op
, DAG
);
4049 case ISD::INSERT_SUBVECTOR
:
4050 return lowerINSERT_SUBVECTOR(Op
, DAG
);
4051 case ISD::INSERT_VECTOR_ELT
:
4052 return lowerINSERT_VECTOR_ELT(Op
, DAG
);
4053 case ISD::EXTRACT_VECTOR_ELT
:
4054 return lowerEXTRACT_VECTOR_ELT(Op
, DAG
);
4055 case ISD::VECTOR_SHUFFLE
:
4056 return lowerVECTOR_SHUFFLE(Op
, DAG
);
4057 case ISD::BUILD_VECTOR
:
4058 return lowerBUILD_VECTOR(Op
, DAG
);
4060 return lowerFP_ROUND(Op
, DAG
);
4062 return lowerTRAP(Op
, DAG
);
4063 case ISD::DEBUGTRAP
:
4064 return lowerDEBUGTRAP(Op
, DAG
);
4067 case ISD::FCANONICALIZE
:
4068 return splitUnaryVectorOp(Op
, DAG
);
4071 return lowerFMINNUM_FMAXNUM(Op
, DAG
);
4073 return splitTernaryVectorOp(Op
, DAG
);
4086 case ISD::FMINNUM_IEEE
:
4087 case ISD::FMAXNUM_IEEE
:
4088 return splitBinaryVectorOp(Op
, DAG
);
4093 static SDValue
adjustLoadValueTypeImpl(SDValue Result
, EVT LoadVT
,
4095 SelectionDAG
&DAG
, bool Unpacked
) {
4096 if (!LoadVT
.isVector())
4099 if (Unpacked
) { // From v2i32/v4i32 back to v2f16/v4f16.
4100 // Truncate to v2i16/v4i16.
4101 EVT IntLoadVT
= LoadVT
.changeTypeToInteger();
4103 // Workaround legalizer not scalarizing truncate after vector op
4104 // legalization byt not creating intermediate vector trunc.
4105 SmallVector
<SDValue
, 4> Elts
;
4106 DAG
.ExtractVectorElements(Result
, Elts
);
4107 for (SDValue
&Elt
: Elts
)
4108 Elt
= DAG
.getNode(ISD::TRUNCATE
, DL
, MVT::i16
, Elt
);
4110 Result
= DAG
.getBuildVector(IntLoadVT
, DL
, Elts
);
4112 // Bitcast to original type (v2f16/v4f16).
4113 return DAG
.getNode(ISD::BITCAST
, DL
, LoadVT
, Result
);
4116 // Cast back to the original packed type.
4117 return DAG
.getNode(ISD::BITCAST
, DL
, LoadVT
, Result
);
4120 SDValue
SITargetLowering::adjustLoadValueType(unsigned Opcode
,
4123 ArrayRef
<SDValue
> Ops
,
4124 bool IsIntrinsic
) const {
4127 bool Unpacked
= Subtarget
->hasUnpackedD16VMem();
4128 EVT LoadVT
= M
->getValueType(0);
4130 EVT EquivLoadVT
= LoadVT
;
4131 if (Unpacked
&& LoadVT
.isVector()) {
4132 EquivLoadVT
= LoadVT
.isVector() ?
4133 EVT::getVectorVT(*DAG
.getContext(), MVT::i32
,
4134 LoadVT
.getVectorNumElements()) : LoadVT
;
4137 // Change from v4f16/v2f16 to EquivLoadVT.
4138 SDVTList VTList
= DAG
.getVTList(EquivLoadVT
, MVT::Other
);
4141 = DAG
.getMemIntrinsicNode(
4142 IsIntrinsic
? (unsigned)ISD::INTRINSIC_W_CHAIN
: Opcode
, DL
,
4143 VTList
, Ops
, M
->getMemoryVT(),
4144 M
->getMemOperand());
4145 if (!Unpacked
) // Just adjusted the opcode.
4148 SDValue Adjusted
= adjustLoadValueTypeImpl(Load
, LoadVT
, DL
, DAG
, Unpacked
);
4150 return DAG
.getMergeValues({ Adjusted
, Load
.getValue(1) }, DL
);
4153 SDValue
SITargetLowering::lowerIntrinsicLoad(MemSDNode
*M
, bool IsFormat
,
4155 ArrayRef
<SDValue
> Ops
) const {
4157 EVT LoadVT
= M
->getValueType(0);
4158 EVT EltType
= LoadVT
.getScalarType();
4159 EVT IntVT
= LoadVT
.changeTypeToInteger();
4161 bool IsD16
= IsFormat
&& (EltType
.getSizeInBits() == 16);
4164 IsFormat
? AMDGPUISD::BUFFER_LOAD_FORMAT
: AMDGPUISD::BUFFER_LOAD
;
4167 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16
, M
, DAG
, Ops
);
4170 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
4171 if (!IsD16
&& !LoadVT
.isVector() && EltType
.getSizeInBits() < 32)
4172 return handleByteShortBufferLoads(DAG
, LoadVT
, DL
, Ops
, M
);
4174 if (isTypeLegal(LoadVT
)) {
4175 return getMemIntrinsicNode(Opc
, DL
, M
->getVTList(), Ops
, IntVT
,
4176 M
->getMemOperand(), DAG
);
4179 EVT CastVT
= getEquivalentMemType(*DAG
.getContext(), LoadVT
);
4180 SDVTList VTList
= DAG
.getVTList(CastVT
, MVT::Other
);
4181 SDValue MemNode
= getMemIntrinsicNode(Opc
, DL
, VTList
, Ops
, CastVT
,
4182 M
->getMemOperand(), DAG
);
4183 return DAG
.getMergeValues(
4184 {DAG
.getNode(ISD::BITCAST
, DL
, LoadVT
, MemNode
), MemNode
.getValue(1)},
4188 static SDValue
lowerICMPIntrinsic(const SITargetLowering
&TLI
,
4189 SDNode
*N
, SelectionDAG
&DAG
) {
4190 EVT VT
= N
->getValueType(0);
4191 const auto *CD
= cast
<ConstantSDNode
>(N
->getOperand(3));
4192 int CondCode
= CD
->getSExtValue();
4193 if (CondCode
< ICmpInst::Predicate::FIRST_ICMP_PREDICATE
||
4194 CondCode
> ICmpInst::Predicate::LAST_ICMP_PREDICATE
)
4195 return DAG
.getUNDEF(VT
);
4197 ICmpInst::Predicate IcInput
= static_cast<ICmpInst::Predicate
>(CondCode
);
4199 SDValue LHS
= N
->getOperand(1);
4200 SDValue RHS
= N
->getOperand(2);
4204 EVT CmpVT
= LHS
.getValueType();
4205 if (CmpVT
== MVT::i16
&& !TLI
.isTypeLegal(MVT::i16
)) {
4206 unsigned PromoteOp
= ICmpInst::isSigned(IcInput
) ?
4207 ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND
;
4208 LHS
= DAG
.getNode(PromoteOp
, DL
, MVT::i32
, LHS
);
4209 RHS
= DAG
.getNode(PromoteOp
, DL
, MVT::i32
, RHS
);
4212 ISD::CondCode CCOpcode
= getICmpCondCode(IcInput
);
4214 unsigned WavefrontSize
= TLI
.getSubtarget()->getWavefrontSize();
4215 EVT CCVT
= EVT::getIntegerVT(*DAG
.getContext(), WavefrontSize
);
4217 SDValue SetCC
= DAG
.getNode(AMDGPUISD::SETCC
, DL
, CCVT
, LHS
, RHS
,
4218 DAG
.getCondCode(CCOpcode
));
4219 if (VT
.bitsEq(CCVT
))
4221 return DAG
.getZExtOrTrunc(SetCC
, DL
, VT
);
4224 static SDValue
lowerFCMPIntrinsic(const SITargetLowering
&TLI
,
4225 SDNode
*N
, SelectionDAG
&DAG
) {
4226 EVT VT
= N
->getValueType(0);
4227 const auto *CD
= cast
<ConstantSDNode
>(N
->getOperand(3));
4229 int CondCode
= CD
->getSExtValue();
4230 if (CondCode
< FCmpInst::Predicate::FIRST_FCMP_PREDICATE
||
4231 CondCode
> FCmpInst::Predicate::LAST_FCMP_PREDICATE
) {
4232 return DAG
.getUNDEF(VT
);
4235 SDValue Src0
= N
->getOperand(1);
4236 SDValue Src1
= N
->getOperand(2);
4237 EVT CmpVT
= Src0
.getValueType();
4240 if (CmpVT
== MVT::f16
&& !TLI
.isTypeLegal(CmpVT
)) {
4241 Src0
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, Src0
);
4242 Src1
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, Src1
);
4245 FCmpInst::Predicate IcInput
= static_cast<FCmpInst::Predicate
>(CondCode
);
4246 ISD::CondCode CCOpcode
= getFCmpCondCode(IcInput
);
4247 unsigned WavefrontSize
= TLI
.getSubtarget()->getWavefrontSize();
4248 EVT CCVT
= EVT::getIntegerVT(*DAG
.getContext(), WavefrontSize
);
4249 SDValue SetCC
= DAG
.getNode(AMDGPUISD::SETCC
, SL
, CCVT
, Src0
,
4250 Src1
, DAG
.getCondCode(CCOpcode
));
4251 if (VT
.bitsEq(CCVT
))
4253 return DAG
.getZExtOrTrunc(SetCC
, SL
, VT
);
4256 void SITargetLowering::ReplaceNodeResults(SDNode
*N
,
4257 SmallVectorImpl
<SDValue
> &Results
,
4258 SelectionDAG
&DAG
) const {
4259 switch (N
->getOpcode()) {
4260 case ISD::INSERT_VECTOR_ELT
: {
4261 if (SDValue Res
= lowerINSERT_VECTOR_ELT(SDValue(N
, 0), DAG
))
4262 Results
.push_back(Res
);
4265 case ISD::EXTRACT_VECTOR_ELT
: {
4266 if (SDValue Res
= lowerEXTRACT_VECTOR_ELT(SDValue(N
, 0), DAG
))
4267 Results
.push_back(Res
);
4270 case ISD::INTRINSIC_WO_CHAIN
: {
4271 unsigned IID
= cast
<ConstantSDNode
>(N
->getOperand(0))->getZExtValue();
4273 case Intrinsic::amdgcn_cvt_pkrtz
: {
4274 SDValue Src0
= N
->getOperand(1);
4275 SDValue Src1
= N
->getOperand(2);
4277 SDValue Cvt
= DAG
.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32
, SL
, MVT::i32
,
4279 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2f16
, Cvt
));
4282 case Intrinsic::amdgcn_cvt_pknorm_i16
:
4283 case Intrinsic::amdgcn_cvt_pknorm_u16
:
4284 case Intrinsic::amdgcn_cvt_pk_i16
:
4285 case Intrinsic::amdgcn_cvt_pk_u16
: {
4286 SDValue Src0
= N
->getOperand(1);
4287 SDValue Src1
= N
->getOperand(2);
4291 if (IID
== Intrinsic::amdgcn_cvt_pknorm_i16
)
4292 Opcode
= AMDGPUISD::CVT_PKNORM_I16_F32
;
4293 else if (IID
== Intrinsic::amdgcn_cvt_pknorm_u16
)
4294 Opcode
= AMDGPUISD::CVT_PKNORM_U16_F32
;
4295 else if (IID
== Intrinsic::amdgcn_cvt_pk_i16
)
4296 Opcode
= AMDGPUISD::CVT_PK_I16_I32
;
4298 Opcode
= AMDGPUISD::CVT_PK_U16_U32
;
4300 EVT VT
= N
->getValueType(0);
4301 if (isTypeLegal(VT
))
4302 Results
.push_back(DAG
.getNode(Opcode
, SL
, VT
, Src0
, Src1
));
4304 SDValue Cvt
= DAG
.getNode(Opcode
, SL
, MVT::i32
, Src0
, Src1
);
4305 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i16
, Cvt
));
4312 case ISD::INTRINSIC_W_CHAIN
: {
4313 if (SDValue Res
= LowerINTRINSIC_W_CHAIN(SDValue(N
, 0), DAG
)) {
4314 if (Res
.getOpcode() == ISD::MERGE_VALUES
) {
4316 Results
.push_back(Res
.getOperand(0));
4317 Results
.push_back(Res
.getOperand(1));
4319 Results
.push_back(Res
);
4320 Results
.push_back(Res
.getValue(1));
4329 EVT VT
= N
->getValueType(0);
4330 EVT NewVT
= getEquivalentMemType(*DAG
.getContext(), VT
);
4331 SDValue LHS
= DAG
.getNode(ISD::BITCAST
, SL
, NewVT
, N
->getOperand(1));
4332 SDValue RHS
= DAG
.getNode(ISD::BITCAST
, SL
, NewVT
, N
->getOperand(2));
4334 EVT SelectVT
= NewVT
;
4335 if (NewVT
.bitsLT(MVT::i32
)) {
4336 LHS
= DAG
.getNode(ISD::ANY_EXTEND
, SL
, MVT::i32
, LHS
);
4337 RHS
= DAG
.getNode(ISD::ANY_EXTEND
, SL
, MVT::i32
, RHS
);
4338 SelectVT
= MVT::i32
;
4341 SDValue NewSelect
= DAG
.getNode(ISD::SELECT
, SL
, SelectVT
,
4342 N
->getOperand(0), LHS
, RHS
);
4344 if (NewVT
!= SelectVT
)
4345 NewSelect
= DAG
.getNode(ISD::TRUNCATE
, SL
, NewVT
, NewSelect
);
4346 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, VT
, NewSelect
));
4350 if (N
->getValueType(0) != MVT::v2f16
)
4354 SDValue BC
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, N
->getOperand(0));
4356 SDValue Op
= DAG
.getNode(ISD::XOR
, SL
, MVT::i32
,
4358 DAG
.getConstant(0x80008000, SL
, MVT::i32
));
4359 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2f16
, Op
));
4363 if (N
->getValueType(0) != MVT::v2f16
)
4367 SDValue BC
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, N
->getOperand(0));
4369 SDValue Op
= DAG
.getNode(ISD::AND
, SL
, MVT::i32
,
4371 DAG
.getConstant(0x7fff7fff, SL
, MVT::i32
));
4372 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2f16
, Op
));
4380 /// Helper function for LowerBRCOND
4381 static SDNode
*findUser(SDValue Value
, unsigned Opcode
) {
4383 SDNode
*Parent
= Value
.getNode();
4384 for (SDNode::use_iterator I
= Parent
->use_begin(), E
= Parent
->use_end();
4387 if (I
.getUse().get() != Value
)
4390 if (I
->getOpcode() == Opcode
)
4396 unsigned SITargetLowering::isCFIntrinsic(const SDNode
*Intr
) const {
4397 if (Intr
->getOpcode() == ISD::INTRINSIC_W_CHAIN
) {
4398 switch (cast
<ConstantSDNode
>(Intr
->getOperand(1))->getZExtValue()) {
4399 case Intrinsic::amdgcn_if
:
4400 return AMDGPUISD::IF
;
4401 case Intrinsic::amdgcn_else
:
4402 return AMDGPUISD::ELSE
;
4403 case Intrinsic::amdgcn_loop
:
4404 return AMDGPUISD::LOOP
;
4405 case Intrinsic::amdgcn_end_cf
:
4406 llvm_unreachable("should not occur");
4412 // break, if_break, else_break are all only used as inputs to loop, not
4413 // directly as branch conditions.
4417 bool SITargetLowering::shouldEmitFixup(const GlobalValue
*GV
) const {
4418 const Triple
&TT
= getTargetMachine().getTargetTriple();
4419 return (GV
->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS
||
4420 GV
->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT
) &&
4421 AMDGPU::shouldEmitConstantsToTextSection(TT
);
4424 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue
*GV
) const {
4425 // FIXME: Either avoid relying on address space here or change the default
4426 // address space for functions to avoid the explicit check.
4427 return (GV
->getValueType()->isFunctionTy() ||
4428 GV
->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS
||
4429 GV
->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS
||
4430 GV
->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT
) &&
4431 !shouldEmitFixup(GV
) &&
4432 !getTargetMachine().shouldAssumeDSOLocal(*GV
->getParent(), GV
);
4435 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue
*GV
) const {
4436 return !shouldEmitFixup(GV
) && !shouldEmitGOTReloc(GV
);
4439 /// This transforms the control flow intrinsics to get the branch destination as
4440 /// last parameter, also switches branch target with BR if the need arise
4441 SDValue
SITargetLowering::LowerBRCOND(SDValue BRCOND
,
4442 SelectionDAG
&DAG
) const {
4445 SDNode
*Intr
= BRCOND
.getOperand(1).getNode();
4446 SDValue Target
= BRCOND
.getOperand(2);
4447 SDNode
*BR
= nullptr;
4448 SDNode
*SetCC
= nullptr;
4450 if (Intr
->getOpcode() == ISD::SETCC
) {
4451 // As long as we negate the condition everything is fine
4453 Intr
= SetCC
->getOperand(0).getNode();
4456 // Get the target from BR if we don't negate the condition
4457 BR
= findUser(BRCOND
, ISD::BR
);
4458 Target
= BR
->getOperand(1);
4461 // FIXME: This changes the types of the intrinsics instead of introducing new
4462 // nodes with the correct types.
4463 // e.g. llvm.amdgcn.loop
4465 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
4466 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
4468 unsigned CFNode
= isCFIntrinsic(Intr
);
4470 // This is a uniform branch so we don't need to legalize.
4474 bool HaveChain
= Intr
->getOpcode() == ISD::INTRINSIC_VOID
||
4475 Intr
->getOpcode() == ISD::INTRINSIC_W_CHAIN
;
4478 (SetCC
->getConstantOperandVal(1) == 1 &&
4479 cast
<CondCodeSDNode
>(SetCC
->getOperand(2).getNode())->get() ==
4482 // operands of the new intrinsic call
4483 SmallVector
<SDValue
, 4> Ops
;
4485 Ops
.push_back(BRCOND
.getOperand(0));
4487 Ops
.append(Intr
->op_begin() + (HaveChain
? 2 : 1), Intr
->op_end());
4488 Ops
.push_back(Target
);
4490 ArrayRef
<EVT
> Res(Intr
->value_begin() + 1, Intr
->value_end());
4492 // build the new intrinsic call
4493 SDNode
*Result
= DAG
.getNode(CFNode
, DL
, DAG
.getVTList(Res
), Ops
).getNode();
4498 BRCOND
.getOperand(0)
4501 Result
= DAG
.getMergeValues(Ops
, DL
).getNode();
4505 // Give the branch instruction our target
4508 BRCOND
.getOperand(2)
4510 SDValue NewBR
= DAG
.getNode(ISD::BR
, DL
, BR
->getVTList(), Ops
);
4511 DAG
.ReplaceAllUsesWith(BR
, NewBR
.getNode());
4512 BR
= NewBR
.getNode();
4515 SDValue Chain
= SDValue(Result
, Result
->getNumValues() - 1);
4517 // Copy the intrinsic results to registers
4518 for (unsigned i
= 1, e
= Intr
->getNumValues() - 1; i
!= e
; ++i
) {
4519 SDNode
*CopyToReg
= findUser(SDValue(Intr
, i
), ISD::CopyToReg
);
4523 Chain
= DAG
.getCopyToReg(
4525 CopyToReg
->getOperand(1),
4526 SDValue(Result
, i
- 1),
4529 DAG
.ReplaceAllUsesWith(SDValue(CopyToReg
, 0), CopyToReg
->getOperand(0));
4532 // Remove the old intrinsic from the chain
4533 DAG
.ReplaceAllUsesOfValueWith(
4534 SDValue(Intr
, Intr
->getNumValues() - 1),
4535 Intr
->getOperand(0));
4540 SDValue
SITargetLowering::LowerRETURNADDR(SDValue Op
,
4541 SelectionDAG
&DAG
) const {
4542 MVT VT
= Op
.getSimpleValueType();
4544 // Checking the depth
4545 if (cast
<ConstantSDNode
>(Op
.getOperand(0))->getZExtValue() != 0)
4546 return DAG
.getConstant(0, DL
, VT
);
4548 MachineFunction
&MF
= DAG
.getMachineFunction();
4549 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
4550 // Check for kernel and shader functions
4551 if (Info
->isEntryFunction())
4552 return DAG
.getConstant(0, DL
, VT
);
4554 MachineFrameInfo
&MFI
= MF
.getFrameInfo();
4555 // There is a call to @llvm.returnaddress in this function
4556 MFI
.setReturnAddressIsTaken(true);
4558 const SIRegisterInfo
*TRI
= getSubtarget()->getRegisterInfo();
4559 // Get the return address reg and mark it as an implicit live-in
4560 unsigned Reg
= MF
.addLiveIn(TRI
->getReturnAddressReg(MF
), getRegClassFor(VT
, Op
.getNode()->isDivergent()));
4562 return DAG
.getCopyFromReg(DAG
.getEntryNode(), DL
, Reg
, VT
);
4565 SDValue
SITargetLowering::getFPExtOrFPTrunc(SelectionDAG
&DAG
,
4569 return Op
.getValueType().bitsLE(VT
) ?
4570 DAG
.getNode(ISD::FP_EXTEND
, DL
, VT
, Op
) :
4571 DAG
.getNode(ISD::FTRUNC
, DL
, VT
, Op
);
4574 SDValue
SITargetLowering::lowerFP_ROUND(SDValue Op
, SelectionDAG
&DAG
) const {
4575 assert(Op
.getValueType() == MVT::f16
&&
4576 "Do not know how to custom lower FP_ROUND for non-f16 type");
4578 SDValue Src
= Op
.getOperand(0);
4579 EVT SrcVT
= Src
.getValueType();
4580 if (SrcVT
!= MVT::f64
)
4585 SDValue FpToFp16
= DAG
.getNode(ISD::FP_TO_FP16
, DL
, MVT::i32
, Src
);
4586 SDValue Trunc
= DAG
.getNode(ISD::TRUNCATE
, DL
, MVT::i16
, FpToFp16
);
4587 return DAG
.getNode(ISD::BITCAST
, DL
, MVT::f16
, Trunc
);
4590 SDValue
SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op
,
4591 SelectionDAG
&DAG
) const {
4592 EVT VT
= Op
.getValueType();
4593 const MachineFunction
&MF
= DAG
.getMachineFunction();
4594 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
4595 bool IsIEEEMode
= Info
->getMode().IEEE
;
4597 // FIXME: Assert during eslection that this is only selected for
4598 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
4599 // mode functions, but this happens to be OK since it's only done in cases
4600 // where there is known no sNaN.
4602 return expandFMINNUM_FMAXNUM(Op
.getNode(), DAG
);
4604 if (VT
== MVT::v4f16
)
4605 return splitBinaryVectorOp(Op
, DAG
);
4609 SDValue
SITargetLowering::lowerTRAP(SDValue Op
, SelectionDAG
&DAG
) const {
4611 SDValue Chain
= Op
.getOperand(0);
4613 if (Subtarget
->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa
||
4614 !Subtarget
->isTrapHandlerEnabled())
4615 return DAG
.getNode(AMDGPUISD::ENDPGM
, SL
, MVT::Other
, Chain
);
4617 MachineFunction
&MF
= DAG
.getMachineFunction();
4618 SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
4619 unsigned UserSGPR
= Info
->getQueuePtrUserSGPR();
4620 assert(UserSGPR
!= AMDGPU::NoRegister
);
4621 SDValue QueuePtr
= CreateLiveInRegister(
4622 DAG
, &AMDGPU::SReg_64RegClass
, UserSGPR
, MVT::i64
);
4623 SDValue SGPR01
= DAG
.getRegister(AMDGPU::SGPR0_SGPR1
, MVT::i64
);
4624 SDValue ToReg
= DAG
.getCopyToReg(Chain
, SL
, SGPR01
,
4625 QueuePtr
, SDValue());
4628 DAG
.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap
, SL
, MVT::i16
),
4632 return DAG
.getNode(AMDGPUISD::TRAP
, SL
, MVT::Other
, Ops
);
4635 SDValue
SITargetLowering::lowerDEBUGTRAP(SDValue Op
, SelectionDAG
&DAG
) const {
4637 SDValue Chain
= Op
.getOperand(0);
4638 MachineFunction
&MF
= DAG
.getMachineFunction();
4640 if (Subtarget
->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa
||
4641 !Subtarget
->isTrapHandlerEnabled()) {
4642 DiagnosticInfoUnsupported
NoTrap(MF
.getFunction(),
4643 "debugtrap handler not supported",
4646 LLVMContext
&Ctx
= MF
.getFunction().getContext();
4647 Ctx
.diagnose(NoTrap
);
4653 DAG
.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap
, SL
, MVT::i16
)
4655 return DAG
.getNode(AMDGPUISD::TRAP
, SL
, MVT::Other
, Ops
);
4658 SDValue
SITargetLowering::getSegmentAperture(unsigned AS
, const SDLoc
&DL
,
4659 SelectionDAG
&DAG
) const {
4660 // FIXME: Use inline constants (src_{shared, private}_base) instead.
4661 if (Subtarget
->hasApertureRegs()) {
4662 unsigned Offset
= AS
== AMDGPUAS::LOCAL_ADDRESS
?
4663 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE
:
4664 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE
;
4665 unsigned WidthM1
= AS
== AMDGPUAS::LOCAL_ADDRESS
?
4666 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE
:
4667 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE
;
4669 AMDGPU::Hwreg::ID_MEM_BASES
<< AMDGPU::Hwreg::ID_SHIFT_
|
4670 Offset
<< AMDGPU::Hwreg::OFFSET_SHIFT_
|
4671 WidthM1
<< AMDGPU::Hwreg::WIDTH_M1_SHIFT_
;
4673 SDValue EncodingImm
= DAG
.getTargetConstant(Encoding
, DL
, MVT::i16
);
4674 SDValue ApertureReg
= SDValue(
4675 DAG
.getMachineNode(AMDGPU::S_GETREG_B32
, DL
, MVT::i32
, EncodingImm
), 0);
4676 SDValue ShiftAmount
= DAG
.getTargetConstant(WidthM1
+ 1, DL
, MVT::i32
);
4677 return DAG
.getNode(ISD::SHL
, DL
, MVT::i32
, ApertureReg
, ShiftAmount
);
4680 MachineFunction
&MF
= DAG
.getMachineFunction();
4681 SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
4682 unsigned UserSGPR
= Info
->getQueuePtrUserSGPR();
4683 assert(UserSGPR
!= AMDGPU::NoRegister
);
4685 SDValue QueuePtr
= CreateLiveInRegister(
4686 DAG
, &AMDGPU::SReg_64RegClass
, UserSGPR
, MVT::i64
);
4688 // Offset into amd_queue_t for group_segment_aperture_base_hi /
4689 // private_segment_aperture_base_hi.
4690 uint32_t StructOffset
= (AS
== AMDGPUAS::LOCAL_ADDRESS
) ? 0x40 : 0x44;
4692 SDValue Ptr
= DAG
.getObjectPtrOffset(DL
, QueuePtr
, StructOffset
);
4694 // TODO: Use custom target PseudoSourceValue.
4695 // TODO: We should use the value from the IR intrinsic call, but it might not
4696 // be available and how do we get it?
4697 Value
*V
= UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG
.getContext()),
4698 AMDGPUAS::CONSTANT_ADDRESS
));
4700 MachinePointerInfo
PtrInfo(V
, StructOffset
);
4701 return DAG
.getLoad(MVT::i32
, DL
, QueuePtr
.getValue(1), Ptr
, PtrInfo
,
4702 MinAlign(64, StructOffset
),
4703 MachineMemOperand::MODereferenceable
|
4704 MachineMemOperand::MOInvariant
);
4707 SDValue
SITargetLowering::lowerADDRSPACECAST(SDValue Op
,
4708 SelectionDAG
&DAG
) const {
4710 const AddrSpaceCastSDNode
*ASC
= cast
<AddrSpaceCastSDNode
>(Op
);
4712 SDValue Src
= ASC
->getOperand(0);
4713 SDValue FlatNullPtr
= DAG
.getConstant(0, SL
, MVT::i64
);
4715 const AMDGPUTargetMachine
&TM
=
4716 static_cast<const AMDGPUTargetMachine
&>(getTargetMachine());
4718 // flat -> local/private
4719 if (ASC
->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS
) {
4720 unsigned DestAS
= ASC
->getDestAddressSpace();
4722 if (DestAS
== AMDGPUAS::LOCAL_ADDRESS
||
4723 DestAS
== AMDGPUAS::PRIVATE_ADDRESS
) {
4724 unsigned NullVal
= TM
.getNullPointerValue(DestAS
);
4725 SDValue SegmentNullPtr
= DAG
.getConstant(NullVal
, SL
, MVT::i32
);
4726 SDValue NonNull
= DAG
.getSetCC(SL
, MVT::i1
, Src
, FlatNullPtr
, ISD::SETNE
);
4727 SDValue Ptr
= DAG
.getNode(ISD::TRUNCATE
, SL
, MVT::i32
, Src
);
4729 return DAG
.getNode(ISD::SELECT
, SL
, MVT::i32
,
4730 NonNull
, Ptr
, SegmentNullPtr
);
4734 // local/private -> flat
4735 if (ASC
->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS
) {
4736 unsigned SrcAS
= ASC
->getSrcAddressSpace();
4738 if (SrcAS
== AMDGPUAS::LOCAL_ADDRESS
||
4739 SrcAS
== AMDGPUAS::PRIVATE_ADDRESS
) {
4740 unsigned NullVal
= TM
.getNullPointerValue(SrcAS
);
4741 SDValue SegmentNullPtr
= DAG
.getConstant(NullVal
, SL
, MVT::i32
);
4744 = DAG
.getSetCC(SL
, MVT::i1
, Src
, SegmentNullPtr
, ISD::SETNE
);
4746 SDValue Aperture
= getSegmentAperture(ASC
->getSrcAddressSpace(), SL
, DAG
);
4748 = DAG
.getNode(ISD::BUILD_VECTOR
, SL
, MVT::v2i32
, Src
, Aperture
);
4750 return DAG
.getNode(ISD::SELECT
, SL
, MVT::i64
, NonNull
,
4751 DAG
.getNode(ISD::BITCAST
, SL
, MVT::i64
, CvtPtr
),
4756 // global <-> flat are no-ops and never emitted.
4758 const MachineFunction
&MF
= DAG
.getMachineFunction();
4759 DiagnosticInfoUnsupported
InvalidAddrSpaceCast(
4760 MF
.getFunction(), "invalid addrspacecast", SL
.getDebugLoc());
4761 DAG
.getContext()->diagnose(InvalidAddrSpaceCast
);
4763 return DAG
.getUNDEF(ASC
->getValueType(0));
4766 // This lowers an INSERT_SUBVECTOR by extracting the individual elements from
4767 // the small vector and inserting them into the big vector. That is better than
4768 // the default expansion of doing it via a stack slot. Even though the use of
4769 // the stack slot would be optimized away afterwards, the stack slot itself
4771 SDValue
SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op
,
4772 SelectionDAG
&DAG
) const {
4773 SDValue Vec
= Op
.getOperand(0);
4774 SDValue Ins
= Op
.getOperand(1);
4775 SDValue Idx
= Op
.getOperand(2);
4776 EVT VecVT
= Vec
.getValueType();
4777 EVT InsVT
= Ins
.getValueType();
4778 EVT EltVT
= VecVT
.getVectorElementType();
4779 unsigned InsNumElts
= InsVT
.getVectorNumElements();
4780 unsigned IdxVal
= cast
<ConstantSDNode
>(Idx
)->getZExtValue();
4783 for (unsigned I
= 0; I
!= InsNumElts
; ++I
) {
4784 SDValue Elt
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
, Ins
,
4785 DAG
.getConstant(I
, SL
, MVT::i32
));
4786 Vec
= DAG
.getNode(ISD::INSERT_VECTOR_ELT
, SL
, VecVT
, Vec
, Elt
,
4787 DAG
.getConstant(IdxVal
+ I
, SL
, MVT::i32
));
4792 SDValue
SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op
,
4793 SelectionDAG
&DAG
) const {
4794 SDValue Vec
= Op
.getOperand(0);
4795 SDValue InsVal
= Op
.getOperand(1);
4796 SDValue Idx
= Op
.getOperand(2);
4797 EVT VecVT
= Vec
.getValueType();
4798 EVT EltVT
= VecVT
.getVectorElementType();
4799 unsigned VecSize
= VecVT
.getSizeInBits();
4800 unsigned EltSize
= EltVT
.getSizeInBits();
4803 assert(VecSize
<= 64);
4805 unsigned NumElts
= VecVT
.getVectorNumElements();
4807 auto KIdx
= dyn_cast
<ConstantSDNode
>(Idx
);
4809 if (NumElts
== 4 && EltSize
== 16 && KIdx
) {
4810 SDValue BCVec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i32
, Vec
);
4812 SDValue LoHalf
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, BCVec
,
4813 DAG
.getConstant(0, SL
, MVT::i32
));
4814 SDValue HiHalf
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, BCVec
,
4815 DAG
.getConstant(1, SL
, MVT::i32
));
4817 SDValue LoVec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i16
, LoHalf
);
4818 SDValue HiVec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i16
, HiHalf
);
4820 unsigned Idx
= KIdx
->getZExtValue();
4821 bool InsertLo
= Idx
< 2;
4822 SDValue InsHalf
= DAG
.getNode(ISD::INSERT_VECTOR_ELT
, SL
, MVT::v2i16
,
4823 InsertLo
? LoVec
: HiVec
,
4824 DAG
.getNode(ISD::BITCAST
, SL
, MVT::i16
, InsVal
),
4825 DAG
.getConstant(InsertLo
? Idx
: (Idx
- 2), SL
, MVT::i32
));
4827 InsHalf
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, InsHalf
);
4829 SDValue Concat
= InsertLo
?
4830 DAG
.getBuildVector(MVT::v2i32
, SL
, { InsHalf
, HiHalf
}) :
4831 DAG
.getBuildVector(MVT::v2i32
, SL
, { LoHalf
, InsHalf
});
4833 return DAG
.getNode(ISD::BITCAST
, SL
, VecVT
, Concat
);
4836 if (isa
<ConstantSDNode
>(Idx
))
4839 MVT IntVT
= MVT::getIntegerVT(VecSize
);
4841 // Avoid stack access for dynamic indexing.
4842 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4844 // Create a congruent vector with the target value in each element so that
4845 // the required element can be masked and ORed into the target vector.
4846 SDValue ExtVal
= DAG
.getNode(ISD::BITCAST
, SL
, IntVT
,
4847 DAG
.getSplatBuildVector(VecVT
, SL
, InsVal
));
4849 assert(isPowerOf2_32(EltSize
));
4850 SDValue ScaleFactor
= DAG
.getConstant(Log2_32(EltSize
), SL
, MVT::i32
);
4852 // Convert vector index to bit-index.
4853 SDValue ScaledIdx
= DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, Idx
, ScaleFactor
);
4855 SDValue BCVec
= DAG
.getNode(ISD::BITCAST
, SL
, IntVT
, Vec
);
4856 SDValue BFM
= DAG
.getNode(ISD::SHL
, SL
, IntVT
,
4857 DAG
.getConstant(0xffff, SL
, IntVT
),
4860 SDValue LHS
= DAG
.getNode(ISD::AND
, SL
, IntVT
, BFM
, ExtVal
);
4861 SDValue RHS
= DAG
.getNode(ISD::AND
, SL
, IntVT
,
4862 DAG
.getNOT(SL
, BFM
, IntVT
), BCVec
);
4864 SDValue BFI
= DAG
.getNode(ISD::OR
, SL
, IntVT
, LHS
, RHS
);
4865 return DAG
.getNode(ISD::BITCAST
, SL
, VecVT
, BFI
);
4868 SDValue
SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op
,
4869 SelectionDAG
&DAG
) const {
4872 EVT ResultVT
= Op
.getValueType();
4873 SDValue Vec
= Op
.getOperand(0);
4874 SDValue Idx
= Op
.getOperand(1);
4875 EVT VecVT
= Vec
.getValueType();
4876 unsigned VecSize
= VecVT
.getSizeInBits();
4877 EVT EltVT
= VecVT
.getVectorElementType();
4878 assert(VecSize
<= 64);
4880 DAGCombinerInfo
DCI(DAG
, AfterLegalizeVectorOps
, true, nullptr);
4882 // Make sure we do any optimizations that will make it easier to fold
4883 // source modifiers before obscuring it with bit operations.
4885 // XXX - Why doesn't this get called when vector_shuffle is expanded?
4886 if (SDValue Combined
= performExtractVectorEltCombine(Op
.getNode(), DCI
))
4889 unsigned EltSize
= EltVT
.getSizeInBits();
4890 assert(isPowerOf2_32(EltSize
));
4892 MVT IntVT
= MVT::getIntegerVT(VecSize
);
4893 SDValue ScaleFactor
= DAG
.getConstant(Log2_32(EltSize
), SL
, MVT::i32
);
4895 // Convert vector index to bit-index (* EltSize)
4896 SDValue ScaledIdx
= DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, Idx
, ScaleFactor
);
4898 SDValue BC
= DAG
.getNode(ISD::BITCAST
, SL
, IntVT
, Vec
);
4899 SDValue Elt
= DAG
.getNode(ISD::SRL
, SL
, IntVT
, BC
, ScaledIdx
);
4901 if (ResultVT
== MVT::f16
) {
4902 SDValue Result
= DAG
.getNode(ISD::TRUNCATE
, SL
, MVT::i16
, Elt
);
4903 return DAG
.getNode(ISD::BITCAST
, SL
, ResultVT
, Result
);
4906 return DAG
.getAnyExtOrTrunc(Elt
, SL
, ResultVT
);
static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
  assert(Elt % 2 == 0);
  return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
}

SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT ResultVT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);

  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
  EVT EltVT = PackVT.getVectorElementType();
  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

  // vector_shuffle <0,1,6,7> lhs, rhs
  // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
  //
  // vector_shuffle <6,7,2,3> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
  //
  // vector_shuffle <6,7,0,1> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)

  // Avoid scalarizing when both halves are reading from consecutive elements.
  SmallVector<SDValue, 4> Pieces;
  for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
    if (elementPairIsContiguous(SVN->getMask(), I)) {
      const int Idx = SVN->getMaskElt(I);
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
                                   PackVT, SVN->getOperand(VecIdx),
                                   DAG.getConstant(EltIdx, SL, MVT::i32));
      Pieces.push_back(SubVec);
    } else {
      const int Idx0 = SVN->getMaskElt(I);
      const int Idx1 = SVN->getMaskElt(I + 1);
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

      SDValue Vec0 = SVN->getOperand(VecIdx0);
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));

      SDValue Vec1 = SVN->getOperand(VecIdx1);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
      Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
    }
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
}
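
// Illustrative note (not in the original source): for
// vector_shuffle <0,1,6,7> lhs, rhs with v4i16 sources, both mask pairs are
// contiguous and even-aligned, so the result is built from two
// extract_subvector nodes (lhs at element 0, rhs at element 2) instead of
// four scalar extracts, matching the first example in the comment above.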
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
    EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);

    // Turn into pair of packed build_vectors.
    // TODO: Special case for constants that can be materialized with s_mov_b64.
    SDValue Lo = DAG.getBuildVector(HalfVT, SL,
                                    { Op.getOperand(0), Op.getOperand(1) });
    SDValue Hi = DAG.getBuildVector(HalfVT, SL,
                                    { Op.getOperand(2), Op.getOperand(3) });

    SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
    SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);

    SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
    return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
  }

  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
  assert(!Subtarget->hasVOP3PInsts() && "this should be legal");

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);

  // Avoid adding defined bits with the zero_extend.
  if (Hi.isUndef()) {
    Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
    SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
    return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
  }

  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);

  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
                              DAG.getConstant(16, SL, MVT::i32));
  if (Lo.isUndef())
    return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);

  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);

  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
  return DAG.getNode(ISD::BITCAST, SL, VT, Or);
}
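
// Illustrative note (not in the original source): with both halves defined,
// the packed value is (zext(hi16) << 16) | zext(lo16); e.g. lo = 0x3C00 and
// hi = 0x4000 pack to the i32 0x40003C00, which is then bitcast back to
// v2f16/v2i16. The isUndef() checks above keep an undefined half from
// contributing defined bits through the zero_extend.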
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // We can fold offsets for anything that doesn't require a GOT relocation.
  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         !shouldEmitGOTReloc(GA->getGlobal());
}
static SDValue
buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
                        const SDLoc &DL, unsigned Offset, EVT PtrVT,
                        unsigned GAFlags = SIInstrInfo::MO_NONE) {
  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.
  SDValue PtrLo =
      DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags);
  SDValue PtrHi;
  if (GAFlags == SIInstrInfo::MO_NONE) {
    PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
  } else {
    PtrHi =
        DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags + 1);
  }
  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
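
// Illustrative note (not in the original source): the pc-relative fixup is
// measured from the $symbol literal, which sits 4 bytes into the s_add_u32
// instruction, while s_getpc_b64 returns the instruction's start address.
// Emitting the target global address with Offset + 4 cancels that 4-byte
// bias, so the s_add_u32/s_addc_u32 pair adds the true distance from the
// s_getpc_b64 result to the global.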
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GSD->getGlobal();
  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
       (!GV->hasExternalLinkage() ||
        getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
        getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
      GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

  SDLoc DL(GSD);
  EVT PtrVT = Op.getValueType();

  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
                                            SIInstrInfo::MO_ABS32_LO);
    return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
  }

  if (shouldEmitFixup(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
  else if (shouldEmitPCReloc(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
                                   SIInstrInfo::MO_REL32);

  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
                                            SIInstrInfo::MO_GOTPCREL32);

  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  const DataLayout &DataLayout = DAG.getDataLayout();
  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getGOT(DAG.getMachineFunction());

  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
                                   const SDLoc &DL, SDValue V) const {
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
  // the destination register.
  //
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
  // so we will end up with redundant moves to m0.
  //
  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.

  // A Null SDValue creates a glue result.
  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
                                  V, Chain);
  return SDValue(M0, 0);
}
SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
                                                 SDValue Op,
                                                 MVT VT,
                                                 unsigned Offset) const {
  SDLoc SL(Op);
  SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
                                           DAG.getEntryNode(), Offset, 4, false);
  // The local size values will have the hi 16-bits as zero.
  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                     DAG.getValueType(VT));
}
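
// Illustrative note (not in the original source): the parameter is loaded as
// a full i32, and the AssertZext with an i16 value type records that bits
// [31:16] are already zero, so later combines can drop a redundant
// "and x, 0xffff" instead of re-masking the loaded value.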
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                        EVT VT) {
  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
                                      "non-hsa intrinsic with hsa target",
                                      DL.getDebugLoc());
  DAG.getContext()->diagnose(BadIntrin);
  return DAG.getUNDEF(VT);
}

static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                         EVT VT) {
  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
                                      "intrinsic not supported on subtarget",
                                      DL.getDebugLoc());
  DAG.getContext()->diagnose(BadIntrin);
  return DAG.getUNDEF(VT);
}
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
                                    ArrayRef<SDValue> Elts) {
  assert(!Elts.empty());
  MVT Type;
  unsigned NumElts;

  if (Elts.size() == 1) {
    Type = MVT::f32;
    NumElts = 1;
  } else if (Elts.size() == 2) {
    Type = MVT::v2f32;
    NumElts = 2;
  } else if (Elts.size() <= 4) {
    Type = MVT::v4f32;
    NumElts = 4;
  } else if (Elts.size() <= 8) {
    Type = MVT::v8f32;
    NumElts = 8;
  } else {
    assert(Elts.size() <= 16);
    Type = MVT::v16f32;
    NumElts = 16;
  }

  SmallVector<SDValue, 16> VecElts(NumElts);
  for (unsigned i = 0; i < Elts.size(); ++i) {
    SDValue Elt = Elts[i];
    if (Elt.getValueType() != MVT::f32)
      Elt = DAG.getBitcast(MVT::f32, Elt);
    VecElts[i] = Elt;
  }
  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getUNDEF(MVT::f32);

  if (NumElts == 1)
    return VecElts[0];
  return DAG.getBuildVector(Type, DL, VecElts);
}
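
// Illustrative note (not in the original source): with three address
// components (Elts.size() == 3) each element is bitcast to f32, one undef
// lane is appended, and a single v4f32 build_vector is emitted, so the MIMG
// address operand always has a supported dword count even when the intrinsic
// supplies an odd number of address values.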
static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
                             SDValue *GLC, SDValue *SLC, SDValue *DLC) {
  auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());

  uint64_t Value = CachePolicyConst->getZExtValue();
  SDLoc DL(CachePolicy);
  if (GLC) {
    *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
    Value &= ~(uint64_t)0x1;
  }
  if (SLC) {
    *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
    Value &= ~(uint64_t)0x2;
  }
  if (DLC) {
    *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
    Value &= ~(uint64_t)0x4;
  }

  return Value == 0;
}
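
// Illustrative note (not in the original source): a cachepolicy operand of 5
// decodes as glc (bit 0) and dlc (bit 2) set with slc (bit 1) clear. Each
// recognized bit is cleared from Value, so any leftover bits indicate an
// unsupported cache policy that the caller can reject.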
// Re-construct the required return value for an image load intrinsic.
// This is more complicated due to the optional use of TexFailCtrl, which means
// the required return type is an aggregate.
static SDValue constructRetValue(SelectionDAG &DAG,
                                 MachineSDNode *Result,
                                 ArrayRef<EVT> ResultTypes,
                                 bool IsTexFail, bool Unpacked, bool IsD16,
                                 int DMaskPop, int NumVDataDwords,
                                 const SDLoc &DL, LLVMContext &Context) {
  // Determine the required return type. This is the same regardless of IsTexFail flag
  EVT ReqRetVT = ResultTypes[0];
  EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
  EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
  EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
                                           : AdjEltVT
                       : ReqRetVT;

  // Extract data part of the result
  // Bitcast the result to the same type as the required return type
  int NumElts;
  if (IsD16 && !Unpacked)
    NumElts = NumVDataDwords << 1;
  else
    NumElts = NumVDataDwords;

  EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
                           : AdjEltVT;

  // Special case for v6f16. Rather than add support for this, use v3i32 to
  // extract the data elements
  bool V6F16Special = false;
  if (NumElts == 6) {
    CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
    DMaskPop >>= 1;
    ReqRetNumElts >>= 1;
    V6F16Special = true;
  }

  SDValue N = SDValue(Result, 0);
  SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);

  // Iterate over the result
  SmallVector<SDValue, 4> BVElts;

  if (CastVT.isVector()) {
    DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
  } else {
    BVElts.push_back(CastRes);
  }
  int ExtraElts = ReqRetNumElts - DMaskPop;
  while (ExtraElts--)
    BVElts.push_back(DAG.getUNDEF(AdjEltVT));

  SDValue PreTFCRes;
  if (ReqRetNumElts > 1) {
    SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
    if (IsD16 && Unpacked)
      PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
    else
      PreTFCRes = NewVec;
  } else {
    PreTFCRes = BVElts[0];
  }

  if (V6F16Special)
    PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);

  if (!IsTexFail) {
    if (Result->getNumValues() > 1)
      return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
    return PreTFCRes;
  }

  // Extract the TexFail result and insert into aggregate return
  SmallVector<SDValue, 1> TFCElt;
  DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
  SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
  return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
}
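
// Illustrative note (not in the original source): on a packed-D16 subtarget,
// a load with three dmask lanes uses NumVDataDwords = (3 + 1) / 2 = 2
// returned dwords, which expand here to NumElts = 2 << 1 = 4 half-precision
// lanes; lanes beyond DMaskPop are padded with undef before the final
// build_vector.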
static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
                         SDValue *LWE, bool &IsTexFail) {
  auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());

  uint64_t Value = TexFailCtrlConst->getZExtValue();
  if (Value)
    IsTexFail = true;

  SDLoc DL(TexFailCtrlConst);
  *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
  Value &= ~(uint64_t)0x1;
  *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
  Value &= ~(uint64_t)0x2;

  return Value == 0;
}
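
// Illustrative note (not in the original source): a texfailctrl value of 3
// enables both tfe (bit 0) and lwe (bit 1) and sets IsTexFail, which makes
// the caller widen the return type by one error dword, while a value of 0
// leaves IsTexFail false and the return type unchanged.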
5322 SDValue
SITargetLowering::lowerImage(SDValue Op
,
5323 const AMDGPU::ImageDimIntrinsicInfo
*Intr
,
5324 SelectionDAG
&DAG
) const {
5326 MachineFunction
&MF
= DAG
.getMachineFunction();
5327 const GCNSubtarget
* ST
= &MF
.getSubtarget
<GCNSubtarget
>();
5328 const AMDGPU::MIMGBaseOpcodeInfo
*BaseOpcode
=
5329 AMDGPU::getMIMGBaseOpcodeInfo(Intr
->BaseOpcode
);
5330 const AMDGPU::MIMGDimInfo
*DimInfo
= AMDGPU::getMIMGDimInfo(Intr
->Dim
);
5331 const AMDGPU::MIMGLZMappingInfo
*LZMappingInfo
=
5332 AMDGPU::getMIMGLZMappingInfo(Intr
->BaseOpcode
);
5333 const AMDGPU::MIMGMIPMappingInfo
*MIPMappingInfo
=
5334 AMDGPU::getMIMGMIPMappingInfo(Intr
->BaseOpcode
);
5335 unsigned IntrOpcode
= Intr
->BaseOpcode
;
5336 bool IsGFX10
= Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
;
5338 SmallVector
<EVT
, 3> ResultTypes(Op
->value_begin(), Op
->value_end());
5339 SmallVector
<EVT
, 3> OrigResultTypes(Op
->value_begin(), Op
->value_end());
5344 bool AdjustRetType
= false;
5346 unsigned AddrIdx
; // Index of first address argument
5348 unsigned DMaskLanes
= 0;
5350 if (BaseOpcode
->Atomic
) {
5351 VData
= Op
.getOperand(2);
5353 bool Is64Bit
= VData
.getValueType() == MVT::i64
;
5354 if (BaseOpcode
->AtomicX2
) {
5355 SDValue VData2
= Op
.getOperand(3);
5356 VData
= DAG
.getBuildVector(Is64Bit
? MVT::v2i64
: MVT::v2i32
, DL
,
5359 VData
= DAG
.getBitcast(MVT::v4i32
, VData
);
5361 ResultTypes
[0] = Is64Bit
? MVT::v2i64
: MVT::v2i32
;
5362 DMask
= Is64Bit
? 0xf : 0x3;
5363 NumVDataDwords
= Is64Bit
? 4 : 2;
5366 DMask
= Is64Bit
? 0x3 : 0x1;
5367 NumVDataDwords
= Is64Bit
? 2 : 1;
5371 unsigned DMaskIdx
= BaseOpcode
->Store
? 3 : isa
<MemSDNode
>(Op
) ? 2 : 1;
5372 auto DMaskConst
= cast
<ConstantSDNode
>(Op
.getOperand(DMaskIdx
));
5373 DMask
= DMaskConst
->getZExtValue();
5374 DMaskLanes
= BaseOpcode
->Gather4
? 4 : countPopulation(DMask
);
5376 if (BaseOpcode
->Store
) {
5377 VData
= Op
.getOperand(2);
5379 MVT StoreVT
= VData
.getSimpleValueType();
5380 if (StoreVT
.getScalarType() == MVT::f16
) {
5381 if (!Subtarget
->hasD16Images() || !BaseOpcode
->HasD16
)
5382 return Op
; // D16 is unsupported for this instruction
5385 VData
= handleD16VData(VData
, DAG
);
5388 NumVDataDwords
= (VData
.getValueType().getSizeInBits() + 31) / 32;
5390 // Work out the num dwords based on the dmask popcount and underlying type
5391 // and whether packing is supported.
5392 MVT LoadVT
= ResultTypes
[0].getSimpleVT();
5393 if (LoadVT
.getScalarType() == MVT::f16
) {
5394 if (!Subtarget
->hasD16Images() || !BaseOpcode
->HasD16
)
5395 return Op
; // D16 is unsupported for this instruction
5400 // Confirm that the return type is large enough for the dmask specified
5401 if ((LoadVT
.isVector() && LoadVT
.getVectorNumElements() < DMaskLanes
) ||
5402 (!LoadVT
.isVector() && DMaskLanes
> 1))
5405 if (IsD16
&& !Subtarget
->hasUnpackedD16VMem())
5406 NumVDataDwords
= (DMaskLanes
+ 1) / 2;
5408 NumVDataDwords
= DMaskLanes
;
5410 AdjustRetType
= true;
5413 AddrIdx
= DMaskIdx
+ 1;
5416 unsigned NumGradients
= BaseOpcode
->Gradients
? DimInfo
->NumGradients
: 0;
5417 unsigned NumCoords
= BaseOpcode
->Coordinates
? DimInfo
->NumCoords
: 0;
5418 unsigned NumLCM
= BaseOpcode
->LodOrClampOrMip
? 1 : 0;
5419 unsigned NumVAddrs
= BaseOpcode
->NumExtraArgs
+ NumGradients
+
5421 unsigned NumMIVAddrs
= NumVAddrs
;
5423 SmallVector
<SDValue
, 4> VAddrs
;
5425 // Optimize _L to _LZ when _L is zero
5426 if (LZMappingInfo
) {
5427 if (auto ConstantLod
=
5428 dyn_cast
<ConstantFPSDNode
>(Op
.getOperand(AddrIdx
+NumVAddrs
-1))) {
5429 if (ConstantLod
->isZero() || ConstantLod
->isNegative()) {
5430 IntrOpcode
= LZMappingInfo
->LZ
; // set new opcode to _lz variant of _l
5431 NumMIVAddrs
--; // remove 'lod'
5436 // Optimize _mip away, when 'lod' is zero
5437 if (MIPMappingInfo
) {
5438 if (auto ConstantLod
=
5439 dyn_cast
<ConstantSDNode
>(Op
.getOperand(AddrIdx
+NumVAddrs
-1))) {
5440 if (ConstantLod
->isNullValue()) {
5441 IntrOpcode
= MIPMappingInfo
->NONMIP
; // set new opcode to variant without _mip
5442 NumMIVAddrs
--; // remove 'lod'
5447 // Check for 16 bit addresses and pack if true.
5448 unsigned DimIdx
= AddrIdx
+ BaseOpcode
->NumExtraArgs
;
5449 MVT VAddrVT
= Op
.getOperand(DimIdx
).getSimpleValueType();
5450 const MVT VAddrScalarVT
= VAddrVT
.getScalarType();
5451 if (((VAddrScalarVT
== MVT::f16
) || (VAddrScalarVT
== MVT::i16
)) &&
5452 ST
->hasFeature(AMDGPU::FeatureR128A16
)) {
5454 const MVT VectorVT
= VAddrScalarVT
== MVT::f16
? MVT::v2f16
: MVT::v2i16
;
5455 for (unsigned i
= AddrIdx
; i
< (AddrIdx
+ NumMIVAddrs
); ++i
) {
5456 SDValue AddrLo
, AddrHi
;
5457 // Push back extra arguments.
5459 AddrLo
= Op
.getOperand(i
);
5461 AddrLo
= Op
.getOperand(i
);
5462 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
5463 // in 1D, derivatives dx/dh and dx/dv are packed with undef.
5464 if (((i
+ 1) >= (AddrIdx
+ NumMIVAddrs
)) ||
5465 ((NumGradients
/ 2) % 2 == 1 &&
5466 (i
== DimIdx
+ (NumGradients
/ 2) - 1 ||
5467 i
== DimIdx
+ NumGradients
- 1))) {
5468 AddrHi
= DAG
.getUNDEF(MVT::f16
);
5470 AddrHi
= Op
.getOperand(i
+ 1);
5473 AddrLo
= DAG
.getNode(ISD::SCALAR_TO_VECTOR
, DL
, VectorVT
,
5475 AddrLo
= DAG
.getBitcast(MVT::i32
, AddrLo
);
5477 VAddrs
.push_back(AddrLo
);
5480 for (unsigned i
= 0; i
< NumMIVAddrs
; ++i
)
5481 VAddrs
.push_back(Op
.getOperand(AddrIdx
+ i
));
5484 // If the register allocator cannot place the address registers contiguously
5485 // without introducing moves, then using the non-sequential address encoding
5486 // is always preferable, since it saves VALU instructions and is usually a
5487 // wash in terms of code size or even better.
5489 // However, we currently have no way of hinting to the register allocator that
5490 // MIMG addresses should be placed contiguously when it is possible to do so,
5491 // so force non-NSA for the common 2-address case as a heuristic.
5493 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5494 // allocation when possible.
5496 ST
->hasFeature(AMDGPU::FeatureNSAEncoding
) && VAddrs
.size() >= 3;
5499 VAddr
= getBuildDwordsVector(DAG
, DL
, VAddrs
);
5501 SDValue True
= DAG
.getTargetConstant(1, DL
, MVT::i1
);
5502 SDValue False
= DAG
.getTargetConstant(0, DL
, MVT::i1
);
5503 unsigned CtrlIdx
; // Index of texfailctrl argument
5505 if (!BaseOpcode
->Sampler
) {
5507 CtrlIdx
= AddrIdx
+ NumVAddrs
+ 1;
5510 cast
<ConstantSDNode
>(Op
.getOperand(AddrIdx
+ NumVAddrs
+ 2));
5512 Unorm
= UnormConst
->getZExtValue() ? True
: False
;
5513 CtrlIdx
= AddrIdx
+ NumVAddrs
+ 3;
5518 SDValue TexFail
= Op
.getOperand(CtrlIdx
);
5519 bool IsTexFail
= false;
5520 if (!parseTexFail(TexFail
, DAG
, &TFE
, &LWE
, IsTexFail
))
5525 // Expecting to get an error flag since TFC is on - and dmask is 0
5526 // Force dmask to be at least 1 otherwise the instruction will fail
5531 NumVDataDwords
+= 1;
5532 AdjustRetType
= true;
5535 // Has something earlier tagged that the return type needs adjusting
5536 // This happens if the instruction is a load or has set TexFailCtrl flags
5537 if (AdjustRetType
) {
5538 // NumVDataDwords reflects the true number of dwords required in the return type
5539 if (DMaskLanes
== 0 && !BaseOpcode
->Store
) {
5540 // This is a no-op load. This can be eliminated
5541 SDValue Undef
= DAG
.getUNDEF(Op
.getValueType());
5542 if (isa
<MemSDNode
>(Op
))
5543 return DAG
.getMergeValues({Undef
, Op
.getOperand(0)}, DL
);
5547 EVT NewVT
= NumVDataDwords
> 1 ?
5548 EVT::getVectorVT(*DAG
.getContext(), MVT::f32
, NumVDataDwords
)
5551 ResultTypes
[0] = NewVT
;
5552 if (ResultTypes
.size() == 3) {
5553 // Original result was aggregate type used for TexFailCtrl results
5554 // The actual instruction returns as a vector type which has now been
5555 // created. Remove the aggregate result.
5556 ResultTypes
.erase(&ResultTypes
[1]);
5563 if (BaseOpcode
->Atomic
) {
5564 GLC
= True
; // TODO no-return optimization
5565 if (!parseCachePolicy(Op
.getOperand(CtrlIdx
+ 1), DAG
, nullptr, &SLC
,
5566 IsGFX10
? &DLC
: nullptr))
5569 if (!parseCachePolicy(Op
.getOperand(CtrlIdx
+ 1), DAG
, &GLC
, &SLC
,
5570 IsGFX10
? &DLC
: nullptr))
5574 SmallVector
<SDValue
, 26> Ops
;
5575 if (BaseOpcode
->Store
|| BaseOpcode
->Atomic
)
5576 Ops
.push_back(VData
); // vdata
5578 for (const SDValue
&Addr
: VAddrs
)
5579 Ops
.push_back(Addr
);
5581 Ops
.push_back(VAddr
);
5583 Ops
.push_back(Op
.getOperand(AddrIdx
+ NumVAddrs
)); // rsrc
5584 if (BaseOpcode
->Sampler
)
5585 Ops
.push_back(Op
.getOperand(AddrIdx
+ NumVAddrs
+ 1)); // sampler
5586 Ops
.push_back(DAG
.getTargetConstant(DMask
, DL
, MVT::i32
));
5588 Ops
.push_back(DAG
.getTargetConstant(DimInfo
->Encoding
, DL
, MVT::i32
));
5589 Ops
.push_back(Unorm
);
5594 Ops
.push_back(IsA16
&& // a16 or r128
5595 ST
->hasFeature(AMDGPU::FeatureR128A16
) ? True
: False
);
5596 Ops
.push_back(TFE
); // tfe
5597 Ops
.push_back(LWE
); // lwe
5599 Ops
.push_back(DimInfo
->DA
? True
: False
);
5600 if (BaseOpcode
->HasD16
)
5601 Ops
.push_back(IsD16
? True
: False
);
5602 if (isa
<MemSDNode
>(Op
))
5603 Ops
.push_back(Op
.getOperand(0)); // chain
5605 int NumVAddrDwords
=
5606 UseNSA
? VAddrs
.size() : VAddr
.getValueType().getSizeInBits() / 32;
5610 Opcode
= AMDGPU::getMIMGOpcode(IntrOpcode
,
5611 UseNSA
? AMDGPU::MIMGEncGfx10NSA
5612 : AMDGPU::MIMGEncGfx10Default
,
5613 NumVDataDwords
, NumVAddrDwords
);
5615 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5616 Opcode
= AMDGPU::getMIMGOpcode(IntrOpcode
, AMDGPU::MIMGEncGfx8
,
5617 NumVDataDwords
, NumVAddrDwords
);
5619 Opcode
= AMDGPU::getMIMGOpcode(IntrOpcode
, AMDGPU::MIMGEncGfx6
,
5620 NumVDataDwords
, NumVAddrDwords
);
5622 assert(Opcode
!= -1);
5624 MachineSDNode
*NewNode
= DAG
.getMachineNode(Opcode
, DL
, ResultTypes
, Ops
);
5625 if (auto MemOp
= dyn_cast
<MemSDNode
>(Op
)) {
5626 MachineMemOperand
*MemRef
= MemOp
->getMemOperand();
5627 DAG
.setNodeMemRefs(NewNode
, {MemRef
});
5630 if (BaseOpcode
->AtomicX2
) {
5631 SmallVector
<SDValue
, 1> Elt
;
5632 DAG
.ExtractVectorElements(SDValue(NewNode
, 0), Elt
, 0, 1);
5633 return DAG
.getMergeValues({Elt
[0], SDValue(NewNode
, 1)}, DL
);
5634 } else if (!BaseOpcode
->Store
) {
5635 return constructRetValue(DAG
, NewNode
,
5636 OrigResultTypes
, IsTexFail
,
5637 Subtarget
->hasUnpackedD16VMem(), IsD16
,
5638 DMaskLanes
, NumVDataDwords
, DL
,
5642 return SDValue(NewNode
, 0);
5645 SDValue
SITargetLowering::lowerSBuffer(EVT VT
, SDLoc DL
, SDValue Rsrc
,
5646 SDValue Offset
, SDValue GLC
, SDValue DLC
,
5647 SelectionDAG
&DAG
) const {
5648 MachineFunction
&MF
= DAG
.getMachineFunction();
5649 MachineMemOperand
*MMO
= MF
.getMachineMemOperand(
5650 MachinePointerInfo(),
5651 MachineMemOperand::MOLoad
| MachineMemOperand::MODereferenceable
|
5652 MachineMemOperand::MOInvariant
,
5653 VT
.getStoreSize(), VT
.getStoreSize());
5655 if (!Offset
->isDivergent()) {
5662 return DAG
.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD
, DL
,
5663 DAG
.getVTList(VT
), Ops
, VT
, MMO
);
5666 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
5667 // assume that the buffer is unswizzled.
5668 SmallVector
<SDValue
, 4> Loads
;
5669 unsigned NumLoads
= 1;
5670 MVT LoadVT
= VT
.getSimpleVT();
5671 unsigned NumElts
= LoadVT
.isVector() ? LoadVT
.getVectorNumElements() : 1;
5672 assert((LoadVT
.getScalarType() == MVT::i32
||
5673 LoadVT
.getScalarType() == MVT::f32
) &&
5674 isPowerOf2_32(NumElts
));
5676 if (NumElts
== 8 || NumElts
== 16) {
5677 NumLoads
= NumElts
== 16 ? 4 : 2;
5678 LoadVT
= MVT::v4i32
;
5681 SDVTList VTList
= DAG
.getVTList({LoadVT
, MVT::Glue
});
5682 unsigned CachePolicy
= cast
<ConstantSDNode
>(GLC
)->getZExtValue();
5684 DAG
.getEntryNode(), // Chain
5686 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
5690 DAG
.getTargetConstant(CachePolicy
, DL
, MVT::i32
), // cachepolicy
5691 DAG
.getTargetConstant(0, DL
, MVT::i1
), // idxen
5694 // Use the alignment to ensure that the required offsets will fit into the
5695 // immediate offsets.
5696 setBufferOffsets(Offset
, DAG
, &Ops
[3], NumLoads
> 1 ? 16 * NumLoads
: 4);
5698 uint64_t InstOffset
= cast
<ConstantSDNode
>(Ops
[5])->getZExtValue();
5699 for (unsigned i
= 0; i
< NumLoads
; ++i
) {
5700 Ops
[5] = DAG
.getTargetConstant(InstOffset
+ 16 * i
, DL
, MVT::i32
);
5701 Loads
.push_back(DAG
.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD
, DL
, VTList
,
5705 if (VT
== MVT::v8i32
|| VT
== MVT::v16i32
)
5706 return DAG
.getNode(ISD::CONCAT_VECTORS
, DL
, VT
, Loads
);
5711 SDValue
SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op
,
5712 SelectionDAG
&DAG
) const {
5713 MachineFunction
&MF
= DAG
.getMachineFunction();
5714 auto MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
5716 EVT VT
= Op
.getValueType();
5718 unsigned IntrinsicID
= cast
<ConstantSDNode
>(Op
.getOperand(0))->getZExtValue();
5720 // TODO: Should this propagate fast-math-flags?
5722 switch (IntrinsicID
) {
5723 case Intrinsic::amdgcn_implicit_buffer_ptr
: {
5724 if (getSubtarget()->isAmdHsaOrMesa(MF
.getFunction()))
5725 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5726 return getPreloadedValue(DAG
, *MFI
, VT
,
5727 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR
);
5729 case Intrinsic::amdgcn_dispatch_ptr
:
5730 case Intrinsic::amdgcn_queue_ptr
: {
5731 if (!Subtarget
->isAmdHsaOrMesa(MF
.getFunction())) {
5732 DiagnosticInfoUnsupported
BadIntrin(
5733 MF
.getFunction(), "unsupported hsa intrinsic without hsa target",
5735 DAG
.getContext()->diagnose(BadIntrin
);
5736 return DAG
.getUNDEF(VT
);
5739 auto RegID
= IntrinsicID
== Intrinsic::amdgcn_dispatch_ptr
?
5740 AMDGPUFunctionArgInfo::DISPATCH_PTR
: AMDGPUFunctionArgInfo::QUEUE_PTR
;
5741 return getPreloadedValue(DAG
, *MFI
, VT
, RegID
);
5743 case Intrinsic::amdgcn_implicitarg_ptr
: {
5744 if (MFI
->isEntryFunction())
5745 return getImplicitArgPtr(DAG
, DL
);
5746 return getPreloadedValue(DAG
, *MFI
, VT
,
5747 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
);
5749 case Intrinsic::amdgcn_kernarg_segment_ptr
: {
5750 return getPreloadedValue(DAG
, *MFI
, VT
,
5751 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
);
5753 case Intrinsic::amdgcn_dispatch_id
: {
5754 return getPreloadedValue(DAG
, *MFI
, VT
, AMDGPUFunctionArgInfo::DISPATCH_ID
);
5756 case Intrinsic::amdgcn_rcp
:
5757 return DAG
.getNode(AMDGPUISD::RCP
, DL
, VT
, Op
.getOperand(1));
5758 case Intrinsic::amdgcn_rsq
:
5759 return DAG
.getNode(AMDGPUISD::RSQ
, DL
, VT
, Op
.getOperand(1));
5760 case Intrinsic::amdgcn_rsq_legacy
:
5761 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5762 return emitRemovedIntrinsicError(DAG
, DL
, VT
);
5764 return DAG
.getNode(AMDGPUISD::RSQ_LEGACY
, DL
, VT
, Op
.getOperand(1));
5765 case Intrinsic::amdgcn_rcp_legacy
:
5766 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5767 return emitRemovedIntrinsicError(DAG
, DL
, VT
);
5768 return DAG
.getNode(AMDGPUISD::RCP_LEGACY
, DL
, VT
, Op
.getOperand(1));
5769 case Intrinsic::amdgcn_rsq_clamp
: {
5770 if (Subtarget
->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5771 return DAG
.getNode(AMDGPUISD::RSQ_CLAMP
, DL
, VT
, Op
.getOperand(1));
5773 Type
*Type
= VT
.getTypeForEVT(*DAG
.getContext());
5774 APFloat Max
= APFloat::getLargest(Type
->getFltSemantics());
5775 APFloat Min
= APFloat::getLargest(Type
->getFltSemantics(), true);
5777 SDValue Rsq
= DAG
.getNode(AMDGPUISD::RSQ
, DL
, VT
, Op
.getOperand(1));
5778 SDValue Tmp
= DAG
.getNode(ISD::FMINNUM
, DL
, VT
, Rsq
,
5779 DAG
.getConstantFP(Max
, DL
, VT
));
5780 return DAG
.getNode(ISD::FMAXNUM
, DL
, VT
, Tmp
,
5781 DAG
.getConstantFP(Min
, DL
, VT
));
5783 case Intrinsic::r600_read_ngroups_x
:
5784 if (Subtarget
->isAmdHsaOS())
5785 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5787 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5788 SI::KernelInputOffsets::NGROUPS_X
, 4, false);
5789 case Intrinsic::r600_read_ngroups_y
:
5790 if (Subtarget
->isAmdHsaOS())
5791 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5793 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5794 SI::KernelInputOffsets::NGROUPS_Y
, 4, false);
5795 case Intrinsic::r600_read_ngroups_z
:
5796 if (Subtarget
->isAmdHsaOS())
5797 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5799 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5800 SI::KernelInputOffsets::NGROUPS_Z
, 4, false);
5801 case Intrinsic::r600_read_global_size_x
:
5802 if (Subtarget
->isAmdHsaOS())
5803 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5805 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5806 SI::KernelInputOffsets::GLOBAL_SIZE_X
, 4, false);
5807 case Intrinsic::r600_read_global_size_y
:
5808 if (Subtarget
->isAmdHsaOS())
5809 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5811 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5812 SI::KernelInputOffsets::GLOBAL_SIZE_Y
, 4, false);
5813 case Intrinsic::r600_read_global_size_z
:
5814 if (Subtarget
->isAmdHsaOS())
5815 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5817 return lowerKernargMemParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
5818 SI::KernelInputOffsets::GLOBAL_SIZE_Z
, 4, false);
5819 case Intrinsic::r600_read_local_size_x
:
5820 if (Subtarget
->isAmdHsaOS())
5821 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5823 return lowerImplicitZextParam(DAG
, Op
, MVT::i16
,
5824 SI::KernelInputOffsets::LOCAL_SIZE_X
);
5825 case Intrinsic::r600_read_local_size_y
:
5826 if (Subtarget
->isAmdHsaOS())
5827 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5829 return lowerImplicitZextParam(DAG
, Op
, MVT::i16
,
5830 SI::KernelInputOffsets::LOCAL_SIZE_Y
);
5831 case Intrinsic::r600_read_local_size_z
:
5832 if (Subtarget
->isAmdHsaOS())
5833 return emitNonHSAIntrinsicError(DAG
, DL
, VT
);
5835 return lowerImplicitZextParam(DAG
, Op
, MVT::i16
,
5836 SI::KernelInputOffsets::LOCAL_SIZE_Z
);
5837 case Intrinsic::amdgcn_workgroup_id_x
:
5838 case Intrinsic::r600_read_tgid_x
:
5839 return getPreloadedValue(DAG
, *MFI
, VT
,
5840 AMDGPUFunctionArgInfo::WORKGROUP_ID_X
);
5841 case Intrinsic::amdgcn_workgroup_id_y
:
5842 case Intrinsic::r600_read_tgid_y
:
5843 return getPreloadedValue(DAG
, *MFI
, VT
,
5844 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y
);
5845 case Intrinsic::amdgcn_workgroup_id_z
:
5846 case Intrinsic::r600_read_tgid_z
:
5847 return getPreloadedValue(DAG
, *MFI
, VT
,
5848 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
);
5849 case Intrinsic::amdgcn_workitem_id_x
:
5850 case Intrinsic::r600_read_tidig_x
:
5851 return loadInputValue(DAG
, &AMDGPU::VGPR_32RegClass
, MVT::i32
,
5852 SDLoc(DAG
.getEntryNode()),
5853 MFI
->getArgInfo().WorkItemIDX
);
5854 case Intrinsic::amdgcn_workitem_id_y
:
5855 case Intrinsic::r600_read_tidig_y
:
5856 return loadInputValue(DAG
, &AMDGPU::VGPR_32RegClass
, MVT::i32
,
5857 SDLoc(DAG
.getEntryNode()),
5858 MFI
->getArgInfo().WorkItemIDY
);
5859 case Intrinsic::amdgcn_workitem_id_z
:
5860 case Intrinsic::r600_read_tidig_z
:
5861 return loadInputValue(DAG
, &AMDGPU::VGPR_32RegClass
, MVT::i32
,
5862 SDLoc(DAG
.getEntryNode()),
5863 MFI
->getArgInfo().WorkItemIDZ
);
5864 case Intrinsic::amdgcn_wavefrontsize
:
5865 return DAG
.getConstant(MF
.getSubtarget
<GCNSubtarget
>().getWavefrontSize(),
5866 SDLoc(Op
), MVT::i32
);
5867 case Intrinsic::amdgcn_s_buffer_load
: {
5868 bool IsGFX10
= Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
;
5870 SDValue DLC
= DAG
.getTargetConstant(0, DL
, MVT::i1
);
5871 if (!parseCachePolicy(Op
.getOperand(3), DAG
, &GLC
, nullptr,
5872 IsGFX10
? &DLC
: nullptr))
5874 return lowerSBuffer(VT
, DL
, Op
.getOperand(1), Op
.getOperand(2), GLC
, DLC
,
5877 case Intrinsic::amdgcn_fdiv_fast
:
5878 return lowerFDIV_FAST(Op
, DAG
);
5879 case Intrinsic::amdgcn_interp_p1_f16
: {
5880 SDValue ToM0
= DAG
.getCopyToReg(DAG
.getEntryNode(), DL
, AMDGPU::M0
,
5881 Op
.getOperand(5), SDValue());
5882 if (getSubtarget()->getLDSBankCount() == 16) {
5885 // FIXME: This implicitly will insert a second CopyToReg to M0.
5886 SDValue S
= DAG
.getNode(
5887 ISD::INTRINSIC_WO_CHAIN
, DL
, MVT::f32
,
5888 DAG
.getTargetConstant(Intrinsic::amdgcn_interp_mov
, DL
, MVT::i32
),
5889 DAG
.getConstant(2, DL
, MVT::i32
), // P0
5890 Op
.getOperand(2), // Attrchan
5891 Op
.getOperand(3), // Attr
5892 Op
.getOperand(5)); // m0
5895 Op
.getOperand(1), // Src0
5896 Op
.getOperand(2), // Attrchan
5897 Op
.getOperand(3), // Attr
5898 DAG
.getTargetConstant(0, DL
, MVT::i32
), // $src0_modifiers
5899 S
, // Src2 - holds two f16 values selected by high
5900 DAG
.getTargetConstant(0, DL
, MVT::i32
), // $src2_modifiers
5901 Op
.getOperand(4), // high
5902 DAG
.getTargetConstant(0, DL
, MVT::i1
), // $clamp
5903 DAG
.getTargetConstant(0, DL
, MVT::i32
) // $omod
5905 return DAG
.getNode(AMDGPUISD::INTERP_P1LV_F16
, DL
, MVT::f32
, Ops
);
5909 Op
.getOperand(1), // Src0
5910 Op
.getOperand(2), // Attrchan
5911 Op
.getOperand(3), // Attr
5912 DAG
.getTargetConstant(0, DL
, MVT::i32
), // $src0_modifiers
5913 Op
.getOperand(4), // high
5914 DAG
.getTargetConstant(0, DL
, MVT::i1
), // $clamp
5915 DAG
.getTargetConstant(0, DL
, MVT::i32
), // $omod
5918 return DAG
.getNode(AMDGPUISD::INTERP_P1LL_F16
, DL
, MVT::f32
, Ops
);
5921 case Intrinsic::amdgcn_interp_p2_f16
: {
5922 SDValue ToM0
= DAG
.getCopyToReg(DAG
.getEntryNode(), DL
, AMDGPU::M0
,
5923 Op
.getOperand(6), SDValue());
5925 Op
.getOperand(2), // Src0
5926 Op
.getOperand(3), // Attrchan
5927 Op
.getOperand(4), // Attr
5928 DAG
.getTargetConstant(0, DL
, MVT::i32
), // $src0_modifiers
5929 Op
.getOperand(1), // Src2
5930 DAG
.getTargetConstant(0, DL
, MVT::i32
), // $src2_modifiers
5931 Op
.getOperand(5), // high
5932 DAG
.getTargetConstant(0, DL
, MVT::i1
), // $clamp
5935 return DAG
.getNode(AMDGPUISD::INTERP_P2_F16
, DL
, MVT::f16
, Ops
);
5937 case Intrinsic::amdgcn_sin
:
5938 return DAG
.getNode(AMDGPUISD::SIN_HW
, DL
, VT
, Op
.getOperand(1));
5940 case Intrinsic::amdgcn_cos
:
5941 return DAG
.getNode(AMDGPUISD::COS_HW
, DL
, VT
, Op
.getOperand(1));
5943 case Intrinsic::amdgcn_mul_u24
:
5944 return DAG
.getNode(AMDGPUISD::MUL_U24
, DL
, VT
, Op
.getOperand(1), Op
.getOperand(2));
5945 case Intrinsic::amdgcn_mul_i24
:
5946 return DAG
.getNode(AMDGPUISD::MUL_I24
, DL
, VT
, Op
.getOperand(1), Op
.getOperand(2));
5948 case Intrinsic::amdgcn_log_clamp
: {
5949 if (Subtarget
->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS
)
5952 DiagnosticInfoUnsupported
BadIntrin(
5953 MF
.getFunction(), "intrinsic not supported on subtarget",
5955 DAG
.getContext()->diagnose(BadIntrin
);
5956 return DAG
.getUNDEF(VT
);
5958 case Intrinsic::amdgcn_ldexp
:
5959 return DAG
.getNode(AMDGPUISD::LDEXP
, DL
, VT
,
5960 Op
.getOperand(1), Op
.getOperand(2));
5962 case Intrinsic::amdgcn_fract
:
5963 return DAG
.getNode(AMDGPUISD::FRACT
, DL
, VT
, Op
.getOperand(1));
5965 case Intrinsic::amdgcn_class
:
5966 return DAG
.getNode(AMDGPUISD::FP_CLASS
, DL
, VT
,
5967 Op
.getOperand(1), Op
.getOperand(2));
5968 case Intrinsic::amdgcn_div_fmas
:
5969 return DAG
.getNode(AMDGPUISD::DIV_FMAS
, DL
, VT
,
5970 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3),
5973 case Intrinsic::amdgcn_div_fixup
:
5974 return DAG
.getNode(AMDGPUISD::DIV_FIXUP
, DL
, VT
,
5975 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
5977 case Intrinsic::amdgcn_trig_preop
:
5978 return DAG
.getNode(AMDGPUISD::TRIG_PREOP
, DL
, VT
,
5979 Op
.getOperand(1), Op
.getOperand(2));
5980 case Intrinsic::amdgcn_div_scale
: {
5981 const ConstantSDNode
*Param
= cast
<ConstantSDNode
>(Op
.getOperand(3));
5983 // Translate to the operands expected by the machine instruction. The
5984 // first parameter must be the same as the first instruction.
5985 SDValue Numerator
= Op
.getOperand(1);
5986 SDValue Denominator
= Op
.getOperand(2);
5988 // Note this order is opposite of the machine instruction's operations,
5989 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
5990 // intrinsic has the numerator as the first operand to match a normal
5991 // division operation.
5993 SDValue Src0
= Param
->isAllOnesValue() ? Numerator
: Denominator
;
5995 return DAG
.getNode(AMDGPUISD::DIV_SCALE
, DL
, Op
->getVTList(), Src0
,
5996 Denominator
, Numerator
);
5998 case Intrinsic::amdgcn_icmp
: {
5999 // There is a Pat that handles this variant, so return it as-is.
6000 if (Op
.getOperand(1).getValueType() == MVT::i1
&&
6001 Op
.getConstantOperandVal(2) == 0 &&
6002 Op
.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE
)
6004 return lowerICMPIntrinsic(*this, Op
.getNode(), DAG
);
6006 case Intrinsic::amdgcn_fcmp
: {
6007 return lowerFCMPIntrinsic(*this, Op
.getNode(), DAG
);
6009 case Intrinsic::amdgcn_fmed3
:
6010 return DAG
.getNode(AMDGPUISD::FMED3
, DL
, VT
,
6011 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
6012 case Intrinsic::amdgcn_fdot2
:
6013 return DAG
.getNode(AMDGPUISD::FDOT2
, DL
, VT
,
6014 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3),
6016 case Intrinsic::amdgcn_fmul_legacy
:
6017 return DAG
.getNode(AMDGPUISD::FMUL_LEGACY
, DL
, VT
,
6018 Op
.getOperand(1), Op
.getOperand(2));
6019 case Intrinsic::amdgcn_sffbh
:
6020 return DAG
.getNode(AMDGPUISD::FFBH_I32
, DL
, VT
, Op
.getOperand(1));
6021 case Intrinsic::amdgcn_sbfe
:
6022 return DAG
.getNode(AMDGPUISD::BFE_I32
, DL
, VT
,
6023 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
6024 case Intrinsic::amdgcn_ubfe
:
6025 return DAG
.getNode(AMDGPUISD::BFE_U32
, DL
, VT
,
6026 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
6027 case Intrinsic::amdgcn_cvt_pkrtz
:
6028 case Intrinsic::amdgcn_cvt_pknorm_i16
:
6029 case Intrinsic::amdgcn_cvt_pknorm_u16
:
6030 case Intrinsic::amdgcn_cvt_pk_i16
:
6031 case Intrinsic::amdgcn_cvt_pk_u16
: {
6032 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
6033 EVT VT
= Op
.getValueType();
6036 if (IntrinsicID
== Intrinsic::amdgcn_cvt_pkrtz
)
6037 Opcode
= AMDGPUISD::CVT_PKRTZ_F16_F32
;
6038 else if (IntrinsicID
== Intrinsic::amdgcn_cvt_pknorm_i16
)
6039 Opcode
= AMDGPUISD::CVT_PKNORM_I16_F32
;
6040 else if (IntrinsicID
== Intrinsic::amdgcn_cvt_pknorm_u16
)
6041 Opcode
= AMDGPUISD::CVT_PKNORM_U16_F32
;
6042 else if (IntrinsicID
== Intrinsic::amdgcn_cvt_pk_i16
)
6043 Opcode
= AMDGPUISD::CVT_PK_I16_I32
;
6045 Opcode
= AMDGPUISD::CVT_PK_U16_U32
;
6047 if (isTypeLegal(VT
))
6048 return DAG
.getNode(Opcode
, DL
, VT
, Op
.getOperand(1), Op
.getOperand(2));
6050 SDValue Node
= DAG
.getNode(Opcode
, DL
, MVT::i32
,
6051 Op
.getOperand(1), Op
.getOperand(2));
6052 return DAG
.getNode(ISD::BITCAST
, DL
, VT
, Node
);
6054 case Intrinsic::amdgcn_fmad_ftz
:
6055 return DAG
.getNode(AMDGPUISD::FMAD_FTZ
, DL
, VT
, Op
.getOperand(1),
6056 Op
.getOperand(2), Op
.getOperand(3));
6058 case Intrinsic::amdgcn_if_break
:
6059 return SDValue(DAG
.getMachineNode(AMDGPU::SI_IF_BREAK
, DL
, VT
,
6060 Op
->getOperand(1), Op
->getOperand(2)), 0);
6062 case Intrinsic::amdgcn_groupstaticsize
: {
6063 Triple::OSType OS
= getTargetMachine().getTargetTriple().getOS();
6064 if (OS
== Triple::AMDHSA
|| OS
== Triple::AMDPAL
)
6067 const Module
*M
= MF
.getFunction().getParent();
6068 const GlobalValue
*GV
=
6069 M
->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize
));
6070 SDValue GA
= DAG
.getTargetGlobalAddress(GV
, DL
, MVT::i32
, 0,
6071 SIInstrInfo::MO_ABS32_LO
);
6072 return {DAG
.getMachineNode(AMDGPU::S_MOV_B32
, DL
, MVT::i32
, GA
), 0};
6074 case Intrinsic::amdgcn_is_shared
:
6075 case Intrinsic::amdgcn_is_private
: {
6077 unsigned AS
= (IntrinsicID
== Intrinsic::amdgcn_is_shared
) ?
6078 AMDGPUAS::LOCAL_ADDRESS
: AMDGPUAS::PRIVATE_ADDRESS
;
6079 SDValue Aperture
= getSegmentAperture(AS
, SL
, DAG
);
6080 SDValue SrcVec
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::v2i32
,
6083 SDValue SrcHi
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, SrcVec
,
6084 DAG
.getConstant(1, SL
, MVT::i32
));
6085 return DAG
.getSetCC(SL
, MVT::i1
, SrcHi
, Aperture
, ISD::SETEQ
);
6088 if (const AMDGPU::ImageDimIntrinsicInfo
*ImageDimIntr
=
6089 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID
))
6090 return lowerImage(Op
, ImageDimIntr
, DAG
);
// This function computes an appropriate offset to pass to
// MachineMemOperand::setOffset() based on the offset inputs to
// an intrinsic. If any of the offsets are non-constant or
// if VIndex is non-zero then this function returns 0. Otherwise,
// it returns the sum of VOffset, SOffset, and Offset.
static unsigned getBufferOffsetForMMO(SDValue VOffset,
                                      SDValue SOffset,
                                      SDValue Offset,
                                      SDValue VIndex = SDValue()) {

  if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) ||
      !isa<ConstantSDNode>(Offset))
    return 0;

  if (VIndex) {
    if (!isa<ConstantSDNode>(VIndex) ||
        !cast<ConstantSDNode>(VIndex)->isNullValue())
      return 0;
  }

  return cast<ConstantSDNode>(VOffset)->getSExtValue() +
         cast<ConstantSDNode>(SOffset)->getSExtValue() +
         cast<ConstantSDNode>(Offset)->getSExtValue();
}
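
// Illustrative note (not in the original source): for a raw buffer load with
// voffset = 16, soffset = 4, and an immediate offset of 8, all three inputs
// are constants, so the memory operand is tagged with offset 28; if any
// component were non-constant, or vindex were non-zero, the conservative
// answer 0 would be used instead.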
6120 SDValue
SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op
,
6121 SelectionDAG
&DAG
) const {
6122 unsigned IntrID
= cast
<ConstantSDNode
>(Op
.getOperand(1))->getZExtValue();
6126 case Intrinsic::amdgcn_ds_ordered_add
:
6127 case Intrinsic::amdgcn_ds_ordered_swap
: {
6128 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6129 SDValue Chain
= M
->getOperand(0);
6130 SDValue M0
= M
->getOperand(2);
6131 SDValue Value
= M
->getOperand(3);
6132 unsigned IndexOperand
= M
->getConstantOperandVal(7);
6133 unsigned WaveRelease
= M
->getConstantOperandVal(8);
6134 unsigned WaveDone
= M
->getConstantOperandVal(9);
6135 unsigned ShaderType
;
6136 unsigned Instruction
;
6138 unsigned OrderedCountIndex
= IndexOperand
& 0x3f;
6139 IndexOperand
&= ~0x3f;
6140 unsigned CountDw
= 0;
6142 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
) {
6143 CountDw
= (IndexOperand
>> 24) & 0xf;
6144 IndexOperand
&= ~(0xf << 24);
6146 if (CountDw
< 1 || CountDw
> 4) {
6148 "ds_ordered_count: dword count must be between 1 and 4");
6153 report_fatal_error("ds_ordered_count: bad index operand");
6156 case Intrinsic::amdgcn_ds_ordered_add
:
6159 case Intrinsic::amdgcn_ds_ordered_swap
:
6164 if (WaveDone
&& !WaveRelease
)
6165 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
6167 switch (DAG
.getMachineFunction().getFunction().getCallingConv()) {
6168 case CallingConv::AMDGPU_CS
:
6169 case CallingConv::AMDGPU_KERNEL
:
6172 case CallingConv::AMDGPU_PS
:
6175 case CallingConv::AMDGPU_VS
:
6178 case CallingConv::AMDGPU_GS
:
6182 report_fatal_error("ds_ordered_count unsupported for this calling conv");
6185 unsigned Offset0
= OrderedCountIndex
<< 2;
6186 unsigned Offset1
= WaveRelease
| (WaveDone
<< 1) | (ShaderType
<< 2) |
6189 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
)
6190 Offset1
|= (CountDw
- 1) << 6;
6192 unsigned Offset
= Offset0
| (Offset1
<< 8);
6197 DAG
.getTargetConstant(Offset
, DL
, MVT::i16
),
6198 copyToM0(DAG
, Chain
, DL
, M0
).getValue(1), // Glue
6200 return DAG
.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT
, DL
,
6201 M
->getVTList(), Ops
, M
->getMemoryVT(),
6202 M
->getMemOperand());
6204 case Intrinsic::amdgcn_ds_fadd
: {
6205 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6208 case Intrinsic::amdgcn_ds_fadd
:
6209 Opc
= ISD::ATOMIC_LOAD_FADD
;
6213 return DAG
.getAtomic(Opc
, SDLoc(Op
), M
->getMemoryVT(),
6214 M
->getOperand(0), M
->getOperand(2), M
->getOperand(3),
6215 M
->getMemOperand());
6217 case Intrinsic::amdgcn_atomic_inc
:
6218 case Intrinsic::amdgcn_atomic_dec
:
6219 case Intrinsic::amdgcn_ds_fmin
:
6220 case Intrinsic::amdgcn_ds_fmax
: {
6221 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6224 case Intrinsic::amdgcn_atomic_inc
:
6225 Opc
= AMDGPUISD::ATOMIC_INC
;
6227 case Intrinsic::amdgcn_atomic_dec
:
6228 Opc
= AMDGPUISD::ATOMIC_DEC
;
6230 case Intrinsic::amdgcn_ds_fmin
:
6231 Opc
= AMDGPUISD::ATOMIC_LOAD_FMIN
;
6233 case Intrinsic::amdgcn_ds_fmax
:
6234 Opc
= AMDGPUISD::ATOMIC_LOAD_FMAX
;
6237 llvm_unreachable("Unknown intrinsic!");
6240 M
->getOperand(0), // Chain
6241 M
->getOperand(2), // Ptr
6242 M
->getOperand(3) // Value
6245 return DAG
.getMemIntrinsicNode(Opc
, SDLoc(Op
), M
->getVTList(), Ops
,
6246 M
->getMemoryVT(), M
->getMemOperand());
6248 case Intrinsic::amdgcn_buffer_load
:
6249 case Intrinsic::amdgcn_buffer_load_format
: {
6250 unsigned Glc
= cast
<ConstantSDNode
>(Op
.getOperand(5))->getZExtValue();
6251 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(6))->getZExtValue();
6253 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(3)))
6254 IdxEn
= Idx
->getZExtValue() != 0;
6256 Op
.getOperand(0), // Chain
6257 Op
.getOperand(2), // rsrc
6258 Op
.getOperand(3), // vindex
6259 SDValue(), // voffset -- will be set by setBufferOffsets
6260 SDValue(), // soffset -- will be set by setBufferOffsets
6261 SDValue(), // offset -- will be set by setBufferOffsets
6262 DAG
.getTargetConstant(Glc
| (Slc
<< 1), DL
, MVT::i32
), // cachepolicy
6263 DAG
.getTargetConstant(IdxEn
, DL
, MVT::i1
), // idxen
6266 unsigned Offset
= setBufferOffsets(Op
.getOperand(4), DAG
, &Ops
[3]);
6267 // We don't know the offset if vindex is non-zero, so clear it.
6271 unsigned Opc
= (IntrID
== Intrinsic::amdgcn_buffer_load
) ?
6272 AMDGPUISD::BUFFER_LOAD
: AMDGPUISD::BUFFER_LOAD_FORMAT
;
6274 EVT VT
= Op
.getValueType();
6275 EVT IntVT
= VT
.changeTypeToInteger();
6276 auto *M
= cast
<MemSDNode
>(Op
);
6277 M
->getMemOperand()->setOffset(Offset
);
6278 EVT LoadVT
= Op
.getValueType();
6280 if (LoadVT
.getScalarType() == MVT::f16
)
6281 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16
,
6284 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6285 if (LoadVT
.getScalarType() == MVT::i8
||
6286 LoadVT
.getScalarType() == MVT::i16
)
6287 return handleByteShortBufferLoads(DAG
, LoadVT
, DL
, Ops
, M
);
6289 return getMemIntrinsicNode(Opc
, DL
, Op
->getVTList(), Ops
, IntVT
,
6290 M
->getMemOperand(), DAG
);
6292 case Intrinsic::amdgcn_raw_buffer_load
:
6293 case Intrinsic::amdgcn_raw_buffer_load_format
: {
6294 const bool IsFormat
= IntrID
== Intrinsic::amdgcn_raw_buffer_load_format
;
6296 auto Offsets
= splitBufferOffsets(Op
.getOperand(3), DAG
);
6298 Op
.getOperand(0), // Chain
6299 Op
.getOperand(2), // rsrc
6300 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
6301 Offsets
.first
, // voffset
6302 Op
.getOperand(4), // soffset
6303 Offsets
.second
, // offset
6304 Op
.getOperand(5), // cachepolicy, swizzled buffer
6305 DAG
.getTargetConstant(0, DL
, MVT::i1
), // idxen
6308 auto *M
= cast
<MemSDNode
>(Op
);
6309 M
->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops
[3], Ops
[4], Ops
[5]));
6310 return lowerIntrinsicLoad(M
, IsFormat
, DAG
, Ops
);
6312 case Intrinsic::amdgcn_struct_buffer_load
:
6313 case Intrinsic::amdgcn_struct_buffer_load_format
: {
6314 const bool IsFormat
= IntrID
== Intrinsic::amdgcn_struct_buffer_load_format
;
6316 auto Offsets
= splitBufferOffsets(Op
.getOperand(4), DAG
);
6318 Op
.getOperand(0), // Chain
6319 Op
.getOperand(2), // rsrc
6320 Op
.getOperand(3), // vindex
6321 Offsets
.first
, // voffset
6322 Op
.getOperand(5), // soffset
6323 Offsets
.second
, // offset
6324 Op
.getOperand(6), // cachepolicy, swizzled buffer
6325 DAG
.getTargetConstant(1, DL
, MVT::i1
), // idxen
6328 auto *M
= cast
<MemSDNode
>(Op
);
6329 M
->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops
[3], Ops
[4], Ops
[5],
6331 return lowerIntrinsicLoad(cast
<MemSDNode
>(Op
), IsFormat
, DAG
, Ops
);
6333 case Intrinsic::amdgcn_tbuffer_load
: {
6334 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6335 EVT LoadVT
= Op
.getValueType();
6337 unsigned Dfmt
= cast
<ConstantSDNode
>(Op
.getOperand(7))->getZExtValue();
6338 unsigned Nfmt
= cast
<ConstantSDNode
>(Op
.getOperand(8))->getZExtValue();
6339 unsigned Glc
= cast
<ConstantSDNode
>(Op
.getOperand(9))->getZExtValue();
6340 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(10))->getZExtValue();
6342 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(3)))
6343 IdxEn
= Idx
->getZExtValue() != 0;
6345 Op
.getOperand(0), // Chain
6346 Op
.getOperand(2), // rsrc
6347 Op
.getOperand(3), // vindex
6348 Op
.getOperand(4), // voffset
6349 Op
.getOperand(5), // soffset
6350 Op
.getOperand(6), // offset
6351 DAG
.getTargetConstant(Dfmt
| (Nfmt
<< 4), DL
, MVT::i32
), // format
6352 DAG
.getTargetConstant(Glc
| (Slc
<< 1), DL
, MVT::i32
), // cachepolicy
6353 DAG
.getTargetConstant(IdxEn
, DL
, MVT::i1
) // idxen
6356 if (LoadVT
.getScalarType() == MVT::f16
)
6357 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16
,
6359 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT
, DL
,
6360 Op
->getVTList(), Ops
, LoadVT
, M
->getMemOperand(),
6363 case Intrinsic::amdgcn_raw_tbuffer_load
: {
6364 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6365 EVT LoadVT
= Op
.getValueType();
6366 auto Offsets
= splitBufferOffsets(Op
.getOperand(3), DAG
);
6369 Op
.getOperand(0), // Chain
6370 Op
.getOperand(2), // rsrc
6371 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
6372 Offsets
.first
, // voffset
6373 Op
.getOperand(4), // soffset
6374 Offsets
.second
, // offset
6375 Op
.getOperand(5), // format
6376 Op
.getOperand(6), // cachepolicy, swizzled buffer
6377 DAG
.getTargetConstant(0, DL
, MVT::i1
), // idxen
6380 if (LoadVT
.getScalarType() == MVT::f16
)
6381 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16
,
6383 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT
, DL
,
6384 Op
->getVTList(), Ops
, LoadVT
, M
->getMemOperand(),
6387 case Intrinsic::amdgcn_struct_tbuffer_load
: {
6388 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
6389 EVT LoadVT
= Op
.getValueType();
6390 auto Offsets
= splitBufferOffsets(Op
.getOperand(4), DAG
);
6393 Op
.getOperand(0), // Chain
6394 Op
.getOperand(2), // rsrc
6395 Op
.getOperand(3), // vindex
6396 Offsets
.first
, // voffset
6397 Op
.getOperand(5), // soffset
6398 Offsets
.second
, // offset
6399 Op
.getOperand(6), // format
6400 Op
.getOperand(7), // cachepolicy, swizzled buffer
6401 DAG
.getTargetConstant(1, DL
, MVT::i1
), // idxen
6404 if (LoadVT
.getScalarType() == MVT::f16
)
6405 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16
,
6407 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT
, DL
,
6408 Op
->getVTList(), Ops
, LoadVT
, M
->getMemOperand(),
6411 case Intrinsic::amdgcn_buffer_atomic_swap
:
6412 case Intrinsic::amdgcn_buffer_atomic_add
:
6413 case Intrinsic::amdgcn_buffer_atomic_sub
:
6414 case Intrinsic::amdgcn_buffer_atomic_smin
:
6415 case Intrinsic::amdgcn_buffer_atomic_umin
:
6416 case Intrinsic::amdgcn_buffer_atomic_smax
:
6417 case Intrinsic::amdgcn_buffer_atomic_umax
:
6418 case Intrinsic::amdgcn_buffer_atomic_and
:
6419 case Intrinsic::amdgcn_buffer_atomic_or
:
6420 case Intrinsic::amdgcn_buffer_atomic_xor
: {
6421 unsigned Slc
= cast
<ConstantSDNode
>(Op
.getOperand(6))->getZExtValue();
6423 if (auto Idx
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(4)))
6424 IdxEn
= Idx
->getZExtValue() != 0;
6426 Op
.getOperand(0), // Chain
6427 Op
.getOperand(2), // vdata
6428 Op
.getOperand(3), // rsrc
6429 Op
.getOperand(4), // vindex
6430 SDValue(), // voffset -- will be set by setBufferOffsets
6431 SDValue(), // soffset -- will be set by setBufferOffsets
6432 SDValue(), // offset -- will be set by setBufferOffsets
6433 DAG
.getTargetConstant(Slc
<< 1, DL
, MVT::i32
), // cachepolicy
6434 DAG
.getTargetConstant(IdxEn
, DL
, MVT::i1
), // idxen
6436 unsigned Offset
= setBufferOffsets(Op
.getOperand(5), DAG
, &Ops
[4]);
6437 // We don't know the offset if vindex is non-zero, so clear it.
6440 EVT VT
= Op
.getValueType();
6442 auto *M
= cast
<MemSDNode
>(Op
);
6443 M
->getMemOperand()->setOffset(Offset
);
6444 unsigned Opcode
= 0;
6447 case Intrinsic::amdgcn_buffer_atomic_swap
:
6448 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SWAP
;
6450 case Intrinsic::amdgcn_buffer_atomic_add
:
6451 Opcode
= AMDGPUISD::BUFFER_ATOMIC_ADD
;
6453 case Intrinsic::amdgcn_buffer_atomic_sub
:
6454 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SUB
;
6456 case Intrinsic::amdgcn_buffer_atomic_smin
:
6457 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SMIN
;
6459 case Intrinsic::amdgcn_buffer_atomic_umin
:
6460 Opcode
= AMDGPUISD::BUFFER_ATOMIC_UMIN
;
6462 case Intrinsic::amdgcn_buffer_atomic_smax
:
6463 Opcode
= AMDGPUISD::BUFFER_ATOMIC_SMAX
;
6465 case Intrinsic::amdgcn_buffer_atomic_umax
:
6466 Opcode
= AMDGPUISD::BUFFER_ATOMIC_UMAX
;
6468 case Intrinsic::amdgcn_buffer_atomic_and
:
6469 Opcode
= AMDGPUISD::BUFFER_ATOMIC_AND
;
6471 case Intrinsic::amdgcn_buffer_atomic_or
:
6472 Opcode
= AMDGPUISD::BUFFER_ATOMIC_OR
;
6474 case Intrinsic::amdgcn_buffer_atomic_xor
:
6475 Opcode
= AMDGPUISD::BUFFER_ATOMIC_XOR
;
6478 llvm_unreachable("unhandled atomic opcode");
6481 return DAG
.getMemIntrinsicNode(Opcode
, DL
, Op
->getVTList(), Ops
, VT
,
6482 M
->getMemOperand());
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec: {
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // cachepolicy
      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();

    auto *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
    unsigned Opcode = 0;

    switch (IntrID) {
    case Intrinsic::amdgcn_raw_buffer_atomic_swap:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_add:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_sub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_smin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_umin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_smax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_umax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_and:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_or:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_xor:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_inc:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_dec:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
      break;
    default:
      llvm_unreachable("unhandled atomic opcode");
    }

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec: {
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy
      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();

    auto *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
                                                        Ops[3]));
    unsigned Opcode = 0;

    switch (IntrID) {
    case Intrinsic::amdgcn_struct_buffer_atomic_swap:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_add:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_sub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_smin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_umin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_smax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_umax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_and:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_or:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_xor:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_inc:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_dec:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
      break;
    default:
      llvm_unreachable("unhandled atomic opcode");
    }

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      Op.getOperand(5), // vindex
      SDValue(),        // voffset -- will be set by setBufferOffsets
      SDValue(),        // soffset -- will be set by setBufferOffsets
      SDValue(),        // offset -- will be set by setBufferOffsets
      DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
      DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
    };
    unsigned Offset = setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
    // We don't know the offset if vindex is non-zero, so clear it.
    if (IdxEn)
      Offset = 0;
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(Offset);

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy
      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7]));

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
    auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      Op.getOperand(5), // vindex
      Offsets.first,    // voffset
      Op.getOperand(7), // soffset
      Offsets.second,   // offset
      Op.getOperand(8), // cachepolicy
      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7],
                                                        Ops[4]));

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }
  default:

    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return lowerImage(Op, ImageDimIntr, DAG);

    return SDValue();
  }
}
// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI.
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {
  EVT VT = VTList.VTs[0];
  EVT WidenedVT = VT;
  EVT WidenedMemVT = MemVT;
  if (!Subtarget->hasDwordx3LoadStores() &&
      (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) {
    WidenedVT = EVT::getVectorVT(*DAG.getContext(),
                                 WidenedVT.getVectorElementType(), 4);
    WidenedMemVT = EVT::getVectorVT(*DAG.getContext(),
                                    WidenedMemVT.getVectorElementType(), 4);
    MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16);
  }

  assert(VTList.NumVTs == 2);
  SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);

  auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
                                       WidenedMemVT, MMO);
  if (WidenedVT != VT) {
    auto Extract = DAG.getNode(
        ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
        DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
    NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
  }

  return NewOp;
}
SDValue SITargetLowering::handleD16VData(SDValue VData,
                                         SelectionDAG &DAG) const {
  EVT StoreVT = VData.getValueType();

  // No change for f16 and legal vector D16 types.
  if (!StoreVT.isVector())
    return VData;

  SDLoc DL(VData);
  assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");

  if (Subtarget->hasUnpackedD16VMem()) {
    // We need to unpack the packed data to store.
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                        StoreVT.getVectorNumElements());
    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
    return DAG.UnrollVectorOp(ZExt.getNode());
  }

  assert(isTypeLegal(StoreVT));
  return VData;
}
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  MachineFunction &MF = DAG.getMachineFunction();

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp: {
    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));

    const SDValue Ops[] = {
      Chain,
      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
      Op.getOperand(4), // src0
      Op.getOperand(5), // src1
      Op.getOperand(6), // src2
      Op.getOperand(7), // src3
      DAG.getTargetConstant(0, DL, MVT::i1), // compr
      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    };

    unsigned Opc = Done->isNullValue() ?
      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
  }
  case Intrinsic::amdgcn_exp_compr: {
    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    SDValue Src0 = Op.getOperand(4);
    SDValue Src1 = Op.getOperand(5);
    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));

    SDValue Undef = DAG.getUNDEF(MVT::f32);
    const SDValue Ops[] = {
      Chain,
      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
      Undef, // src2
      Undef, // src3
      DAG.getTargetConstant(1, DL, MVT::i1), // compr
      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    };

    unsigned Opc = Done->isNullValue() ?
      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
  }
  case Intrinsic::amdgcn_s_barrier: {
    if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
      const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
      unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
      if (WGSize <= ST.getWavefrontSize())
        return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
                                          Op.getOperand(0)), 0);
    }
    return SDValue();
  }
  case Intrinsic::amdgcn_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
    unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Chain,
      VData,             // vdata
      Op.getOperand(3),  // rsrc
      Op.getOperand(4),  // vindex
      Op.getOperand(5),  // voffset
      Op.getOperand(6),  // soffset
      Op.getOperand(7),  // offset
      DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
      DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Chain,
      VData,             // vdata
      Op.getOperand(3),  // rsrc
      Op.getOperand(4),  // vindex
      Offsets.first,     // voffset
      Op.getOperand(6),  // soffset
      Offsets.second,    // offset
      Op.getOperand(7),  // format
      Op.getOperand(8),  // cachepolicy, swizzled buffer
      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Chain,
      VData,             // vdata
      Op.getOperand(3),  // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,     // voffset
      Op.getOperand(5),  // soffset
      Offsets.second,    // offset
      Op.getOperand(6),  // format
      Op.getOperand(7),  // cachepolicy, swizzled buffer
      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_store:
  case Intrinsic::amdgcn_buffer_store_format: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Chain,
      VData,
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      SDValue(), // voffset -- will be set by setBufferOffsets
      SDValue(), // soffset -- will be set by setBufferOffsets
      SDValue(), // offset -- will be set by setBufferOffsets
      DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
    };
    unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    // We don't know the offset if vindex is non-zero, so clear it.
    if (IdxEn)
      Offset = 0;
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(Offset);

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    EVT VDataType = VData.getValueType().getScalarType();
    if (VDataType == MVT::i8 || VDataType == MVT::i16)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format;

    SDValue VData = Op.getOperand(2);
    EVT VDataVT = VData.getValueType();
    EVT EltType = VDataVT.getScalarType();
    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);

    if (!isTypeLegal(VDataVT)) {
      VData =
          DAG.getNode(ISD::BITCAST, DL,
                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
    }

    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Chain,
      VData,
      Op.getOperand(3), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // cachepolicy, swizzled buffer
      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc =
        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format;

    SDValue VData = Op.getOperand(2);
    EVT VDataVT = VData.getValueType();
    EVT EltType = VDataVT.getScalarType();
    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);

    if (IsD16)
      VData = handleD16VData(VData, DAG);

    if (!isTypeLegal(VDataVT)) {
      VData =
          DAG.getNode(ISD::BITCAST, DL,
                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
    }

    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Chain,
      VData,
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy, swizzled buffer
      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
                                                        Ops[3]));

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    EVT VDataType = VData.getValueType().getScalarType();
    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_atomic_fadd: {
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Chain,
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      SDValue(), // voffset -- will be set by setBufferOffsets
      SDValue(), // soffset -- will be set by setBufferOffsets
      SDValue(), // offset -- will be set by setBufferOffsets
      DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
      DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
    };
    unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    // We don't know the offset if vindex is non-zero, so clear it.
    if (IdxEn)
      Offset = 0;
    EVT VT = Op.getOperand(2).getValueType();

    auto *M = cast<MemSDNode>(Op);
    M->getMemOperand()->setOffset(Offset);
    unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD
                                    : AMDGPUISD::BUFFER_ATOMIC_FADD;

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_global_atomic_fadd: {
    SDValue Ops[] = {
      Chain,
      Op.getOperand(2), // ptr
      Op.getOperand(3)  // vdata
    };
    EVT VT = Op.getOperand(3).getValueType();

    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = VT.isVector() ? AMDGPUISD::ATOMIC_PK_FADD
                                    : AMDGPUISD::ATOMIC_FADD;

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_end_cf:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                      Op->getOperand(2), Chain), 0);

  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG);

    return Op;
  }
  }
}
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
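// Illustrative example (not from the original source): a combined offset of
// 4100 splits as Overflow = 4096 and ImmOffset = 4, so 4096 is added to the
// voffset operand and 4 goes into the 12-bit immoffset field.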
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
    SDValue Offset, SelectionDAG &DAG) const {
  SDLoc DL(Offset);
  const unsigned MaxImm = 4095;
  SDValue N0 = Offset;
  ConstantSDNode *C1 = nullptr;

  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    N0 = SDValue();
  else if (DAG.isBaseWithConstantOffset(N0)) {
    C1 = cast<ConstantSDNode>(N0.getOperand(1));
    N0 = N0.getOperand(0);
  }

  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, put the value
    // and -4096 into the immoffset field so that the value that is copied/added
    // for the voffset field is a multiple of 4096, and it stands more chance
    // of being CSEd with the copy/add for another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
    if (Overflow) {
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      if (!N0)
        N0 = OverflowVal;
      else {
        SDValue Ops[] = { N0, OverflowVal };
        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
      }
    }
  }
  if (!N0)
    N0 = DAG.getConstant(0, DL, MVT::i32);
  if (!C1)
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
  return {N0, SDValue(C1, 0)};
}
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
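// The returned value is the known constant offset (used to update the MMO)
// when the combined offset folds to a compile-time constant, and 0 otherwise.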
unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                            SelectionDAG &DAG, SDValue *Offsets,
                                            unsigned Align) const {
  SDLoc DL(CombinedOffset);
  if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
      Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return SOffset + ImmOffset;
    }
  }
  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
    SDValue N0 = CombinedOffset.getOperand(0);
    SDValue N1 = CombinedOffset.getOperand(1);
    uint32_t SOffset, ImmOffset;
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
    if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
                                                Subtarget, Align)) {
      Offsets[0] = N0;
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return 0;
    }
  }
  Offsets[0] = CombinedOffset;
  Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
  Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
  return 0;
}
// Handle 8 bit and 16 bit buffer loads
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
                                                     ArrayRef<SDValue> Ops,
                                                     MemSDNode *M) const {
  EVT IntVT = LoadVT.changeTypeToInteger();
  unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
         AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;

  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
                                               Ops, IntVT,
                                               M->getMemOperand());
  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);

  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
}
// Handle 8 bit and 16 bit buffer stores
SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
                                                      EVT VDataType, SDLoc DL,
                                                      SDValue Ops[],
                                                      MemSDNode *M) const {
  if (VDataType == MVT::f16)
    Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);

  SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
  Ops[1] = BufferStoreExt;
  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
                                          AMDGPUISD::BUFFER_STORE_SHORT;
  ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
  return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
                                 M->getMemOperand());
}
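// Produce the correct extension or truncation of Op to VT based on the
// extension kind of the original load (truncating when VT is narrower).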
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
                                 ISD::LoadExtType ExtType, SDValue Op,
                                 const SDLoc &SL, EVT VT) {
  if (VT.bitsLT(Op.getValueType()))
    return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);

  switch (ExtType) {
  case ISD::SEXTLOAD:
    return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
  case ISD::ZEXTLOAD:
    return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
  case ISD::EXTLOAD:
    return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
  case ISD::NON_EXTLOAD:
    return Op;
  }

  llvm_unreachable("invalid ext type");
}
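// Widen a uniform, sufficiently aligned sub-dword load from the constant
// address space (or an invariant global load) into an i32 load, then truncate
// or extend the result back to the original type.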
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if (Ld->getAlignment() < 4 || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // later.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
  SDValue Ptr = Ld->getBasePtr();
  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
                                MVT::i32, SL, Ld->getChain(), Ptr,
                                Ld->getOffset(),
                                Ld->getPointerInfo(), MVT::i32,
                                Ld->getAlignment(),
                                Ld->getMemOperand()->getFlags(),
                                Ld->getAAInfo(),
                                nullptr); // Drop ranges

  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
           "unexpected fp extload");
    TruncVT = MemVT.changeTypeToInteger();
  }

  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
                      DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
  } else {
    assert(Ld->getExtensionType() == ISD::EXTLOAD);
  }

  EVT VT = Ld->getValueType(0);
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());

  DCI.AddToWorklist(Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
  DCI.AddToWorklist(Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);

  return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
}
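// Custom lowering for loads: i1/i8/i16 loads are widened to 32 bits, and
// vector loads are split, widened or scalarized depending on address space,
// alignment and the subtarget's limits.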
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();

  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();
    MachineMemOperand *MMO = Load->getMemOperand();

    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                   BasePtr, RealMemVT, MMO);

    if (!MemVT.isVector()) {
      SDValue Ops[] = {
        DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
        NewLD.getValue(1)
      };

      return DAG.getMergeValues(Ops, DL);
    }

    SmallVector<SDValue, 3> Elts;
    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
      SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
                                DAG.getConstant(I, DL, MVT::i32));

      Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
    }

    SDValue Ops[] = {
      DAG.getBuildVector(MemVT, DL, Elts),
      NewLD.getValue(1)
    };

    return DAG.getMergeValues(Ops, DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      MemVT, *Load->getMemOperand())) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
    return DAG.getMergeValues(Ops, DL);
  }

  unsigned Alignment = Load->getAlignment();
  unsigned AS = Load->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBug() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
    return SplitVectorLoad(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    AS = MFI->hasFlatScratchInit() ?
         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = MemVT.getVectorNumElements();

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
      if (MemVT.isPow2VectorType())
        return SDValue();
      if (NumElements == 3)
        return WidenVectorLoad(Op, DAG);
      return SplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
  }

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS) {
    if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
        !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
        Alignment >= 4 && NumElements < 32) {
      if (MemVT.isPow2VectorType())
        return SDValue();
      if (NumElements == 3)
        return WidenVectorLoad(Op, DAG);
      return SplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
  }
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS ||
      AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v3 loads not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return WidenVectorLoad(Op, DAG);
    // v3 and v4 loads are supported for private and global memory.
    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorLoad(Load, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      // v3 loads not supported on SI.
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        return WidenVectorLoad(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // Use ds_read_b128 if possible.
    if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
        MemVT.getStoreSize() == 16)
      return SDValue();

    if (NumElements > 2)
      return SplitVectorLoad(Op, DAG);

    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
    // address is negative, then the instruction is incorrectly treated as
    // out-of-bounds even if base + offsets is in bounds. Split vectorized
    // loads here to avoid emitting ds_read2_b32. We may re-combine the
    // load later in the SILoadStoreOptimizer.
    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
        NumElements == 2 && MemVT.getStoreSize() == 8 &&
        Load->getAlignment() < 8) {
      return SplitVectorLoad(Op, DAG);
    }
  }
  return SDValue();
}
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.getSizeInBits() == 64);

  SDLoc DL(Op);
  SDValue Cond = Op.getOperand(0);

  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
  SDValue One = DAG.getConstant(1, DL, MVT::i32);

  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));

  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);

  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);

  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);

  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);

  SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}
// Catch division cases where we can use shortcuts with rcp and rsq
// instructions.
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();

  if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
    return SDValue();

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
      if (CLHS->isExactlyValue(1.0)) {
        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation they have a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
        // use them as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // 1.0 / sqrt(x) -> rsq(x)

        // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
        // error seems really high at 2^29 ULP.
        if (RHS.getOpcode() == ISD::FSQRT)
          return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));

        // 1.0 / x -> rcp(x)
        return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
      }

      // Same as for 1.0, but expand the sign out of the constant.
      if (CLHS->isExactlyValue(-1.0)) {
        // -1.0 / x -> rcp (fneg x)
        SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
        return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
      }
    }
  }

  if (Unsafe) {
    // Turn into multiply by the reciprocal.
    // x / y -> x * (1.0 / y)
    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
  }

  return SDValue();
}
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, A, B);
  }

  assert(GlueChain->getNumValues() == 3);

  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
  switch (Opcode) {
  default: llvm_unreachable("no chain equivalent for opcode");
  case ISD::FMUL:
    Opcode = AMDGPUISD::FMUL_W_CHAIN;
    break;
  }

  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
                     GlueChain.getValue(2));
}

static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                           EVT VT, SDValue A, SDValue B, SDValue C,
                           SDValue GlueChain) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, A, B, C);
  }

  assert(GlueChain->getNumValues() == 3);

  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
  switch (Opcode) {
  default: llvm_unreachable("no chain equivalent for opcode");
  case ISD::FMA:
    Opcode = AMDGPUISD::FMA_W_CHAIN;
    break;
  }

  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
                     GlueChain.getValue(2));
}
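// f16 fdiv is lowered by extending both operands to f32, forming an rcp-based
// quotient, rounding back to f16 and correcting the result with DIV_FIXUP.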
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue Src0 = Op.getOperand(0);
  SDValue Src1 = Op.getOperand(1);

  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);

  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);

  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
  SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
}
// Faster 2.5 ULP division that does not support denormals.
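// When |RHS| exceeds 2^96 (0x6f800000) the denominator is pre-scaled by 2^-32
// (0x2f800000) before the rcp, and the quotient is scaled by the same factor
// afterwards, keeping the reciprocal in range.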
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);

  const APFloat K0Val(BitsToFloat(0x6f800000));
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

  const APFloat K1Val(BitsToFloat(0x2f800000));
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  EVT SetCCVT =
    getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);

  // TODO: Should this propagate fast-math-flags?
  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);

  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
}
// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
                                          const SDLoc &SL, const GCNSubtarget *ST) {
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  int DPDenormModeDefault = ST->hasFP64Denormals()
                                ? FP_DENORM_FLUSH_NONE
                                : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  int Mode = SPDenormMode | (DPDenormModeDefault << 2);
  return DAG.getTargetConstant(Mode, SL, MVT::i32);
}
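// f32 fdiv lowering: scale the operands with DIV_SCALE, refine an initial
// reciprocal estimate with a chain of FMAs, temporarily enable FP32 denormals
// via S_DENORM_MODE or SETREG on subtargets that flush them, and finish with
// DIV_FMAS and DIV_FIXUP.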
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
                                          RHS, RHS, LHS);
  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
                                        LHS, RHS, LHS);

  // Denominator is scaled to not be denormal, so using rcp is ok.
  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
                                  DenominatorScaled);
  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
                                     DenominatorScaled);

  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
                               (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                               (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);

  if (!Subtarget->hasFP32Denormals()) {
    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);

    SDValue EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);

      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
                                 DAG.getEntryNode(), EnableDenormValue);
    } else {
      const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
                                                        SL, MVT::i32);
      EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
                                 DAG.getEntryNode(), EnableDenormValue,
                                 BitField);
    }

    SDValue Ops[3] = {
      NegDivScale0,
      EnableDenorm.getValue(0),
      EnableDenorm.getValue(1)
    };

    NegDivScale0 = DAG.getMergeValues(Ops, SL);
  }

  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
                             ApproxRcp, One, NegDivScale0);

  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
                             ApproxRcp, Fma0);

  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
                           Fma1, Fma1);

  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                             NumeratorScaled, Mul);

  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);

  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                             NumeratorScaled, Fma3);

  if (!Subtarget->hasFP32Denormals()) {
    SDValue DisableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue DisableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);

      DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
                                  Fma4.getValue(1), DisableDenormValue,
                                  Fma4.getValue(2));
    } else {
      const SDValue DisableDenormValue =
          DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);

      DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
                                  Fma4.getValue(1), DisableDenormValue,
                                  BitField, Fma4.getValue(2));
    }

    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                      DisableDenorm, DAG.getRoot());
    DAG.setRoot(OutputChain);
  }

  SDValue Scale = NumeratorScaled.getValue(1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
                             Fma4, Fma1, Fma3, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
}
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getTarget().Options.UnsafeFPMath)
    return lowerFastUnsafeFDIV(Op, DAG);

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);

  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);

  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
                             NegDivScale0, Mul, DivScale1);

  SDValue Scale;

  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

    // Figure out which scale to use for div_fmas.
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getValue(1);
  }

  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
                             Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}
SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  if (VT == MVT::f16)
    return LowerFDIV16(Op, DAG);

  llvm_unreachable("Unexpected type for fdiv");
}
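// Custom lowering for stores: i1 stores become truncating i32 stores, and
// vector stores are split or scalarized using the same per-address-space
// rules as LowerLOAD.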
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    return DAG.getTruncStore(Store->getChain(), DL,
       DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
       Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      VT, *Store->getMemOperand())) {
    return expandUnalignedStore(Store, DAG);
  }

  unsigned AS = Store->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBug() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
    return SplitVectorStore(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    AS = MFI->hasFlatScratchInit() ?
         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = VT.getVectorNumElements();
  if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
      AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorStore(Op, DAG);
    // v3 stores not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return SplitVectorStore(Op, DAG);
    return SDValue();
  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4 || NumElements == 3)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // Use ds_write_b128 if possible.
    if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
        VT.getStoreSize() == 16 && NumElements != 3)
      return SDValue();

    if (NumElements > 2)
      return SplitVectorStore(Op, DAG);

    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
    // address is negative, then the instruction is incorrectly treated as
    // out-of-bounds even if base + offsets is in bounds. Split vectorized
    // stores here to avoid emitting ds_write2_b32. We may re-combine the
    // store later in the SILoadStoreOptimizer.
    if (!Subtarget->hasUsableDSOffset() &&
        NumElements == 2 && VT.getStoreSize() == 8 &&
        Store->getAlignment() < 8) {
      return SplitVectorStore(Op, DAG);
    }

    return SDValue();
  } else {
    llvm_unreachable("unhandled address space");
  }
}
SITargetLowering::LowerTrig(SDValue Op
, SelectionDAG
&DAG
) const {
7954 EVT VT
= Op
.getValueType();
7955 SDValue Arg
= Op
.getOperand(0);
7958 // TODO: Should this propagate fast-math-flags?
7960 SDValue OneOver2Pi
= DAG
.getConstantFP(0.5 / M_PI
, DL
, VT
);
7962 if (Subtarget
->hasTrigReducedRange()) {
7963 SDValue MulVal
= DAG
.getNode(ISD::FMUL
, DL
, VT
, Arg
, OneOver2Pi
);
7964 TrigVal
= DAG
.getNode(AMDGPUISD::FRACT
, DL
, VT
, MulVal
);
7966 TrigVal
= DAG
.getNode(ISD::FMUL
, DL
, VT
, Arg
, OneOver2Pi
);
7969 switch (Op
.getOpcode()) {
7971 return DAG
.getNode(AMDGPUISD::COS_HW
, SDLoc(Op
), VT
, TrigVal
);
7973 return DAG
.getNode(AMDGPUISD::SIN_HW
, SDLoc(Op
), VT
, TrigVal
);
7975 llvm_unreachable("Wrong trig opcode");
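// Flat and global atomic cmpxchg is lowered to AMDGPUISD::ATOMIC_CMP_SWAP
// with the new and old values packed into a single vector operand; LDS
// cmpxchg needs no custom lowering.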
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
  if (!isFlatGlobalAddrSpace(AS))
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  SDValue Old = Op.getOperand(2);
  SDValue New = Op.getOperand(3);
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  MVT VecType = MVT::getVectorVT(SimpleVT, 2);

  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
  SDValue Ops[] = { ChainIn, Addr, NewOld };

  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
                                 Ops, VT, AtomicNode->getMemOperand());
}
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
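// Combine a conversion from an i32 whose upper 24 bits are known zero into
// CVT_F32_UBYTE0.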
SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  EVT ScalarVT = VT.getScalarType();
  if (ScalarVT != MVT::f32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
      DCI.AddToWorklist(Cvt.getNode());
      return Cvt;
    }
  }

  return SDValue();
}
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
//
// This is a variant of
// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
//
// The normal DAG combiner will do this, but only if the add has one use since
// that would increase the number of instructions.
//
// This prevents us from seeing a constant offset that can be folded into a
// memory instruction's addressing mode. If we know the resulting add offset of
// a pointer can be folded into an addressing offset, we can replace the pointer
// operand with the add of new constant offset. This eliminates one of the uses,
// and may allow the remaining use to also be simplified.
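// Illustrative example (not from the original source): (shl (add x, 16), 2)
// becomes (add (shl x, 2), 64), so the 64 can be folded into the memory
// instruction's immediate offset when the addressing mode allows it.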
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
                                               unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We only do this to handle cases where it's profitable when there are
  // multiple uses of the add, so defer to the standard combine.
  if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
      N0->hasOneUse())
    return SDValue();

  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
  if (!CN1)
    return SDValue();

  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!CAdd)
    return SDValue();

  // If the resulting offset is too large, we can't fold it into the addressing
  // mode offset.
  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
  Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());

  AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();
  if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
  SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);

  SDNodeFlags Flags;
  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
                          (N0.getOpcode() == ISD::OR ||
                           N0->getFlags().hasNoUnsignedWrap()));

  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue Ptr = N->getBasePtr();
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  // TODO: We could also do this for multiplies.
  if (Ptr.getOpcode() == ISD::SHL) {
    SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                          N->getMemoryVT(), DCI);
    if (NewPtr) {
      SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());

      NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
      return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
    }
  }

  return SDValue();
}
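// Returns true if applying the 32-bit constant Val with the given bitwise op
// is trivially reducible (an identity or a known-constant result), which makes
// splitting a 64-bit bit operation into two 32-bit halves profitable.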
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
         (Opc == ISD::XOR && Val == 0);
}
// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
// integer combine opportunities since most 64-bit operations are decomposed
// this way.  TODO: We won't want this for SALU especially if it is an inline
// immediate.
SDValue SITargetLowering::splitBinaryBitConstantOp(
  DAGCombinerInfo &DCI,
  const SDLoc &SL,
  unsigned Opc, SDValue LHS,
  const ConstantSDNode *CRHS) const {
  uint64_t Val = CRHS->getZExtValue();
  uint32_t ValLo = Lo_32(Val);
  uint32_t ValHi = Hi_32(Val);
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
       bitOpWithConstantIsReducible(Opc, ValHi)) ||
      (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
    // If we need to materialize a 64-bit immediate, it will be split up later
    // anyway. Avoid creating the harder to understand 64-bit immediate
    // materialization.
    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
  }

  return SDValue();
}
// Returns true if argument is a boolean value which is not serialized into
// memory or argument and does not require v_cmdmask_b32 to be deserialized.
static bool isBoolSGPR(SDValue V) {
  if (V.getValueType() != MVT::i1)
    return false;
  switch (V.getOpcode()) {
  default: break;
  case ISD::SETCC:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case AMDGPUISD::FP_CLASS:
    return true;
  }
  return false;
}
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
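// Illustrative example (not from the original source): C = 0x00ff00ff is
// accepted and returned as-is, while C = 0x00f000ff mixes set and clear bits
// within byte 2 and returns 0.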
static uint32_t getConstantPermuteMask(uint32_t C) {
  // 0xff for any zero byte in the mask
  uint32_t ZeroByteMask = 0;
  if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
  if ((NonZeroByteMask & C) != NonZeroByteMask)
    return 0; // Partial bytes selected.
  return C;
}

// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns select mask as in the v_perm_b32
// or -1 if not succeeded.
// Note byte select encoding:
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)
    return ~0;

  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!N1)
    return ~0;

  uint32_t C = N1->getZExtValue();

  switch (V.getOpcode()) {
  default:
    break;
  case ISD::AND:
    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    }
    break;

  case ISD::OR:
    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
      return (0x03020100 & ~ConstMask) | ConstMask;
    }
    break;

  case ISD::SHL:
    if (C % 8)
      return ~0;

    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

  case ISD::SRL:
    if (C % 8)
      return ~0;

    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }

  return ~0;
}
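
// Combines for ISD::AND: splitting 64-bit constants into 32-bit halves,
// forming BFE/SDWA-friendly patterns, folding fcmp pairs into fp_class, and
// forming v_perm_b32 byte permutes.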
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (VT == MVT::i64 && CRHS) {
    if (SDValue Split
        = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
      return Split;
  }

  if (CRHS && VT == MVT::i32) {
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    // nb = number of trailing zeroes in mask
    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
    uint64_t Mask = CRHS->getZExtValue();
    unsigned Bits = countPopulation(Mask);
    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
        unsigned Shift = CShift->getZExtValue();
        unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
        unsigned Offset = NB + Shift;
        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
          SDLoc SL(N);
          SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
                                    LHS->getOperand(0),
                                    DAG.getConstant(Offset, SL, MVT::i32),
                                    DAG.getConstant(Bits, SL, MVT::i32));
          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
                                    DAG.getValueType(NarrowVT));
          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
          return Shl;
        }
      }
    }

    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
        isa<ConstantSDNode>(LHS.getOperand(2))) {
      uint32_t Sel = getConstantPermuteMask(Mask);
      if (!Sel)
        return SDValue();

      // Select 0xc for all zero bytes
      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                         LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();

    SDValue X = LHS.getOperand(0);
    SDValue Y = RHS.getOperand(0);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
      return SDValue();

    if (LCC == ISD::SETO) {
      if (X != LHS.getOperand(1))
        return SDValue();

      if (RCC == ISD::SETUNE) {
        const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        const uint32_t Mask = SIInstrFlags::N_NORMAL |
                              SIInstrFlags::N_SUBNORMAL |
                              SIInstrFlags::N_ZERO |
                              SIInstrFlags::P_ZERO |
                              SIInstrFlags::P_SUBNORMAL |
                              SIInstrFlags::P_NORMAL;

        static_assert(((~(SIInstrFlags::S_NAN |
                          SIInstrFlags::Q_NAN |
                          SIInstrFlags::N_INFINITY |
                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
                      "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                           X, DAG.getConstant(Mask, DL, MVT::i32));
      }
    }
  }

  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
    std::swap(LHS, RHS);

  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      RHS.hasOneUse()) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
    // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
    const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
        (RHS.getOperand(0) == LHS.getOperand(0) &&
         LHS.getOperand(0) == LHS.getOperand(1))) {
      const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
      unsigned NewMask = LCC == ISD::SETO ?
        Mask->getZExtValue() & ~OrdMask :
        Mask->getZExtValue() & OrdMask;

      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
                         DAG.getConstant(NewMask, DL, MVT::i32));
    }
  }

  if (VT == MVT::i32 &&
      (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    // and x, (sext cc from i1) => select cc, x, 0
    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
      std::swap(LHS, RHS);
    if (isBoolSGPR(RHS.getOperand(0)))
      return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
                           LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
  }

  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
    uint32_t LHSMask = getPermuteMask(DAG, LHS);
    uint32_t RHSMask = getPermuteMask(DAG, RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte in each mask is either selector mask 0-3, or has higher
        // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
        // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
        // mask which is not 0xff wins. By anding both masks we have a correct
        // result except that 0x0c shall be corrected to give 0x0c only.
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }

        // Add 4 to each active LHS lane. It will not affect any existing 0xff
        // or 0x0c.
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
                           LHS.getOperand(0), RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  return SDValue();
}
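
// Combines for ISD::OR: merging fp_class masks on the same source, forming
// v_perm_b32 byte permutes, and splitting 64-bit ors into 32-bit halves.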
SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {
    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
      SDValue Src = LHS.getOperand(0);
      if (Src != RHS.getOperand(0))
        return SDValue();

      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
      if (!CLHS || !CRHS)
        return SDValue();

      // Only 10 bits are used.
      static const uint32_t MaxMask = 0x3ff;

      uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                         Src, DAG.getConstant(NewMask, DL, MVT::i32));
    }

    return SDValue();
  }

  // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
  if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
      LHS.getOpcode() == AMDGPUISD::PERM &&
      isa<ConstantSDNode>(LHS.getOperand(2))) {
    uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
    if (!Sel)
      return SDValue();

    Sel |= LHS.getConstantOperandVal(2);
    SDLoc DL(N);
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                       LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
  }

  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
    uint32_t LHSMask = getPermuteMask(DAG, LHS);
    uint32_t RHSMask = getPermuteMask(DAG, RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Kill zero bytes selected by other mask. Zero value is 0xc.
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        // Add 4 to each active LHS lane
        LHSMask |= LHSUsedLanes & 0x04040404;

        uint32_t Sel = LHSMask | RHSMask;
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
                           LHS.getOperand(0), RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  if (VT != MVT::i64)
    return SDValue();

  // TODO: This could be a generic combine with a predicate for extracting the
  // high half of an integer being free.

  // (or i64:x, (zero_extend i32:y)) ->
  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
      RHS.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(LHS, RHS);

  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue ExtSrc = RHS.getOperand(0);
    EVT SrcVT = ExtSrc.getValueType();
    if (SrcVT == MVT::i32) {
      SDLoc SL(N);
      SDValue LowLHS, HiBits;
      std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);

      DCI.AddToWorklist(LowOr.getNode());
      DCI.AddToWorklist(HiBits.getNode());

      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                LowOr, HiBits);
      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    }
  }

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (CRHS) {
    if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
      return Split;
  }

  return SDValue();
}

SDValue SITargetLowering::performXorCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (CRHS) {
    if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
      return Split;
  }

  return SDValue();
}

// Instructions that will be lowered with a final instruction that zeros the
// high result bits.
// XXX - probably only need to list legal operations.
static bool fp16SrcZerosHighBits(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FCANONICALIZE:
  case ISD::FP_ROUND:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LDEXP:
    return true;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}
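
// (zext (bitcast f16)) can be selected as fp16_zext when the f16 producer
// already zeros the high 16 bits (see fp16SrcZerosHighBits above).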
SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  if (!Subtarget->has16BitInsts() ||
      DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i32)
    return SDValue();

  SDValue Src = N->getOperand(0);
  if (Src.getValueType() != MVT::i16)
    return SDValue();

  // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
  // FIXME: It is not universally true that the high bits are zeroed on gfx9.
  if (Src.getOpcode() == ISD::BITCAST) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::f16 &&
        fp16SrcZerosHighBits(BCSrc.getOpcode()))
      return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
  }

  return SDValue();
}
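
// Narrow sign_extend_inreg of unsigned buffer loads into the corresponding
// signed buffer load node.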
SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                        DAGCombinerInfo &DCI)
                                                        const {
  SDValue Src = N->getOperand(0);
  auto *VTSign = cast<VTSDNode>(N->getOperand(1));

  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
      Src.hasOneUse()) {
    auto *M = cast<MemSDNode>(Src);
    SDValue Ops[] = {
      Src.getOperand(0), // Chain
      Src.getOperand(1), // rsrc
      Src.getOperand(2), // vindex
      Src.getOperand(3), // voffset
      Src.getOperand(4), // soffset
      Src.getOperand(5), // offset
      Src.getOperand(6), // cachepolicy
      Src.getOperand(7)  // idxen
    };
    // replace with BUFFER_LOAD_BYTE/SHORT
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
                                         Src.getOperand(0).getValueType());
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
                   AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
                                                            ResList,
                                                            Ops, M->getMemoryVT(),
                                                            M->getMemOperand());
    return DCI.DAG.getMergeValues({BufferLoadSignExt,
                                   BufferLoadSignExt.getValue(1)}, SDLoc(N));
  }
  return SDValue();
}
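
// Simple folds of AMDGPUISD::FP_CLASS with a constant zero mask or an undef
// source operand.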
SDValue SITargetLowering::performClassCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Mask = N->getOperand(1);

  // fp_class x, 0 -> false
  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
    if (CMask->isNullValue())
      return DAG.getConstant(0, SDLoc(N), MVT::i1);
  }

  if (N->getOperand(0).isUndef())
    return DAG.getUNDEF(MVT::i1);

  return SDValue();
}

SDValue SITargetLowering::performRcpCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);

  if (N0.isUndef())
    return N0;

  if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
                         N0.getOpcode() == ISD::SINT_TO_FP)) {
    return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
                           N->getFlags());
  }

  return AMDGPUTargetLowering::performRcpCombine(N, DCI);
}
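
// Conservatively returns true when Op is known to already be in canonical FP
// form (NaNs quieted, denormals flushed where required), so an explicit
// fcanonicalize of it can be dropped.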
bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
                                       unsigned MaxDepth) const {
  unsigned Opcode = Op.getOpcode();
  if (Opcode == ISD::FCANONICALIZE)
    return true;

  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    auto F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
      return false;
    return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
  }

  // If source is a result of another standard FP operation it is already in
  // canonical form.
  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // These will flush denorms if required.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FCEIL:
  case ISD::FFLOOR:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FSQRT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FP_ROUND:
  case ISD::FP_EXTEND:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::TRIG_PREOP:
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::LDEXP:
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return true;

  // It can/will be lowered or combined as a bit operation.
  // Need to check their input recursively to handle.
  case ISD::FNEG:
  case ISD::FABS:
  case ISD::FCOPYSIGN:
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);

  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FSINCOS:
    return Op.getValueType().getScalarType() != MVT::f16;

  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMIN3: {
    // FIXME: Shouldn't treat the generic operations differently based on these.
    // However, we aren't really required to flush the result from
    // minnum/maxnum.

    // snans will be quieted, so we only need to worry about denormals.
    if (Subtarget->supportsMinMaxDenormModes() ||
        denormalsEnabledForType(Op.getValueType()))
      return true;

    // Flushing may be required.
    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
    // targets need to check their input recursively.

    // FIXME: Does this apply with clamp? It's implemented with max.
    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::SELECT: {
    return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
  }
  case ISD::BUILD_VECTOR: {
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      SDValue SrcOp = Op.getOperand(i);
      if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::EXTRACT_VECTOR_ELT:
  case ISD::EXTRACT_SUBVECTOR: {
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
  }
  case ISD::INSERT_VECTOR_ELT: {
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
  }
  case ISD::UNDEF:
    // Could be anything.
    return false;

  case ISD::BITCAST: {
    // Hack round the mess we make when legalizing extract_vector_elt
    SDValue Src = Op.getOperand(0);
    if (Src.getValueType() == MVT::i16 &&
        Src.getOpcode() == ISD::TRUNCATE) {
      SDValue TruncSrc = Src.getOperand(0);
      if (TruncSrc.getValueType() == MVT::i32 &&
          TruncSrc.getOpcode() == ISD::BITCAST &&
          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
      }
    }

    return false;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID
      = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
      return true;
    default:
      break;
    }

    LLVM_FALLTHROUGH;
  }
  default:
    return denormalsEnabledForType(Op.getValueType()) &&
           DAG.isKnownNeverSNaN(Op);
  }

  llvm_unreachable("invalid operation");
}

// Constant fold canonicalize.
SDValue SITargetLowering::getCanonicalConstantFP(
  SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
  // Flush denormals to 0 if not enabled.
  if (C.isDenormal() && !denormalsEnabledForType(VT))
    return DAG.getConstantFP(0.0, SL, VT);

  if (C.isNaN()) {
    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
    if (C.isSignaling()) {
      // Quiet a signaling NaN.
      // FIXME: Is this supposed to preserve payload bits?
      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
    }

    // Make sure it is the canonical NaN bitpattern.
    //
    // TODO: Can we use -1 as the canonical NaN value since it's an inline
    // immediate?
    if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
  }

  // Already canonical.
  return DAG.getConstantFP(C, SL, VT);
}

static bool vectorEltWillFoldAway(SDValue Op) {
  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
}
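
// Fold fcanonicalize of constants, undef and v2f16 build_vectors, and push it
// into min/max operations with a constant operand.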
SDValue SITargetLowering::performFCanonicalizeCombine(
  SDNode *N,
  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fcanonicalize undef -> qnan
  if (N0.isUndef()) {
    APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
    return DAG.getConstantFP(QNaN, SDLoc(N), VT);
  }

  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
  }

  // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
  //                                                   (fcanonicalize k)
  //
  // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0

  // TODO: This could be better with wider vectors that will be split to v2f16,
  // and to consider uses since there aren't that many packed operations.
  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
      isTypeLegal(MVT::v2f16)) {
    SDLoc SL(N);
    SDValue NewElts[2];
    SDValue Lo = N0.getOperand(0);
    SDValue Hi = N0.getOperand(1);
    EVT EltVT = Lo.getValueType();

    if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
      for (unsigned I = 0; I != 2; ++I) {
        SDValue Op = N0.getOperand(I);
        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
          NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
                                              CFP->getValueAPF());
        } else if (Op.isUndef()) {
          // Handled below based on what the other operand is.
          NewElts[I] = Op;
        } else {
          NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
        }
      }

      // If one half is undef, and one is constant, prefer a splat vector rather
      // than the normal qNaN. If it's a register, prefer 0.0 since that's
      // cheaper to use and may be free with a packed operation.
      if (NewElts[0].isUndef()) {
        if (isa<ConstantFPSDNode>(NewElts[1]))
          NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
            NewElts[1] : DAG.getConstantFP(0.0f, SL, EltVT);
      }

      if (NewElts[1].isUndef()) {
        NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
          NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
      }

      return DAG.getBuildVector(VT, SL, NewElts);
    }
  }

  unsigned SrcOpc = N0.getOpcode();

  // If it's free to do so, push canonicalizes further up the source, which may
  // find a canonical source.
  //
  // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
  // sNaNs.
  if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
    auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
    if (CRHS && N0.hasOneUse()) {
      SDLoc SL(N);
      SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
                                   N0.getOperand(0));
      SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
      DCI.AddToWorklist(Canon0.getNode());

      return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
    }
  }

  return isCanonicalized(DAG, N0) ? N0 : SDValue();
}

static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
  switch (Opc) {
  case ISD::FMAXNUM:
  case ISD::FMAXNUM_IEEE:
    return AMDGPUISD::FMAX3;
  case ISD::SMAX:
    return AMDGPUISD::SMAX3;
  case ISD::UMAX:
    return AMDGPUISD::UMAX3;
  case ISD::FMINNUM:
  case ISD::FMINNUM_IEEE:
    return AMDGPUISD::FMIN3;
  case ISD::SMIN:
    return AMDGPUISD::SMIN3;
  case ISD::UMIN:
    return AMDGPUISD::UMIN3;
  default:
    llvm_unreachable("Not a min/max opcode");
  }
}
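
// min(max(x, K0), K1) with K0 <= K1 becomes s_med3/u_med3, widened to 32 bits
// when no 16-bit med3 instruction is available.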
SDValue SITargetLowering::performIntMed3ImmCombine(
  SelectionDAG &DAG, const SDLoc &SL,
  SDValue Op0, SDValue Op1, bool Signed) const {
  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
  if (!K1)
    return SDValue();

  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  if (Signed) {
    if (K0->getAPIntValue().sge(K1->getAPIntValue()))
      return SDValue();
  } else {
    if (K0->getAPIntValue().uge(K1->getAPIntValue()))
      return SDValue();
  }

  EVT VT = K0->getValueType(0);
  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
    return DAG.getNode(Med3Opc, SL, VT,
                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
  }

  // If there isn't a 16-bit med3 operation, convert to 32-bit.
  MVT NVT = MVT::i32;
  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

  SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
  SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
  SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);

  SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
}

static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
    return C;

  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
    if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
      return C;
  }

  return nullptr;
}
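
// Form clamp or fmed3 from fminnum(fmaxnum(x, K0), K1) when the constants
// allow it and x is known not to be a signaling NaN.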
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
                                                  const SDLoc &SL,
                                                  SDValue Op0,
                                                  SDValue Op1) const {
  ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
  if (!K1)
    return SDValue();

  ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  // Ordered >= (although NaN inputs should have folded away by now).
  APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
  if (Cmp == APFloat::cmpGreaterThan)
    return SDValue();

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Check IEEE bit enabled?
  EVT VT = Op0.getValueType();
  if (Info->getMode().DX10Clamp) {
    // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
    // hardware fmed3 behavior converting to a min.
    // FIXME: Should this be allowing -0.0?
    if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
  }

  // med3 for f16 is only available on gfx9+, and not available for v2f16.
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
    // then give the other result, which is different from med3 with a NaN
    // input.
    SDValue Var = Op0.getOperand(0);
    if (!DAG.isKnownNeverSNaN(Var))
      return SDValue();

    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    if ((!K0->hasOneUse() ||
         TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
        (!K1->hasOneUse() ||
         TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
      return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
                         Var, SDValue(K0, 0), SDValue(K1, 0));
    }
  }

  return SDValue();
}
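
// Form min3/max3/med3 nodes from nested min/max operations.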
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Only do this if the inner op has one use since this will just increase
  // register pressure for no benefit.

  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
      !VT.isVector() &&
      (VT == MVT::i32 || VT == MVT::f32 ||
       ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
    // max(max(a, b), c) -> max3(a, b, c)
    // min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
                         DL,
                         N->getValueType(0),
                         Op0.getOperand(0),
                         Op0.getOperand(1),
                         Op1);
    }

    // max(a, max(b, c)) -> max3(a, b, c)
    // min(a, min(b, c)) -> min3(a, b, c)
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
                         DL,
                         N->getValueType(0),
                         Op0,
                         Op1.getOperand(0),
                         Op1.getOperand(1));
    }
  }

  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
      return Med3;
  }
  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
      return Med3;
  }

  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
       (Opc == AMDGPUISD::FMIN_LEGACY &&
        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
      (VT == MVT::f32 || VT == MVT::f64 ||
       (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
       (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
      Op0.hasOneUse()) {
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
      return Res;
  }

  return SDValue();
}

static bool isClampZeroToOne(SDValue A, SDValue B) {
  if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
    if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
      // FIXME: Should this be allowing -0.0?
      return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
             (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
    }
  }

  return false;
}

// FIXME: Should only worry about snans for version with chain.
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
  // NaNs. With a NaN input, the order of the operands may change the result.

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  SDValue Src2 = N->getOperand(2);

  if (isClampZeroToOne(Src0, Src1)) {
    // const_a, const_b, x -> clamp is safe in all cases including signaling
    // nans.
    // FIXME: Should this be allowing -0.0?
    return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
  }

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
  // handling no dx10-clamp?
  if (Info->getMode().DX10Clamp) {
    // If NaN is clamped to 0, we are free to reorder the inputs.

    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);

    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
      std::swap(Src1, Src2);

    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);

    if (isClampZeroToOne(Src1, Src2))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
  }

  return SDValue();
}

SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  if (Src0.isUndef() && Src1.isUndef())
    return DCI.DAG.getUNDEF(N->getValueType(0));
  return SDValue();
}
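
// Combines for extract_vector_elt: sinking fneg/fabs and vector binops to
// scalars, expanding variable indices into cndmask selects, and widening
// sub-dword extracts of loaded vectors to 32-bit elements.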
SDValue SITargetLowering::performExtractVectorEltCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;

  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();

  if ((Vec.getOpcode() == ISD::FNEG ||
       Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
    SDLoc SL(N);
    EVT EltVT = N->getValueType(0);
    SDValue Idx = N->getOperand(1);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                              Vec.getOperand(0), Idx);
    return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
  }

  // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
  //    =>
  // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
  // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
  // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
  if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
    SDLoc SL(N);
    EVT EltVT = N->getValueType(0);
    SDValue Idx = N->getOperand(1);
    unsigned Opc = Vec.getOpcode();

    switch (Opc) {
    default:
      break;
      // TODO: Support other binary operations.
    case ISD::FADD:
    case ISD::FSUB:
    case ISD::FMUL:
    case ISD::ADD:
    case ISD::UMIN:
    case ISD::UMAX:
    case ISD::SMIN:
    case ISD::SMAX:
    case ISD::FMAXNUM:
    case ISD::FMINNUM:
    case ISD::FMAXNUM_IEEE:
    case ISD::FMINNUM_IEEE: {
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec.getOperand(0), Idx);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec.getOperand(1), Idx);

      DCI.AddToWorklist(Elt0.getNode());
      DCI.AddToWorklist(Elt1.getNode());
      return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
    }
    }
  }

  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();

  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
  // This eliminates non-constant index and subsequent movrel or scratch access.
  // Sub-dword vectors of size 2 dword or less have better implementation.
  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
  // instructions.
  if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
      !isa<ConstantSDNode>(N->getOperand(1))) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    EVT IdxVT = Idx.getValueType();
    SDValue V;
    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
      SDValue IC = DAG.getConstant(I, SL, IdxVT);
      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
      if (I == 0)
        V = Elt;
      else
        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
    }
    return V;
  }

  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
  // elements. This exposes more load reduction opportunities by replacing
  // multiple small extract_vector_elements with a single 32-bit extract.
  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (isa<MemSDNode>(Vec) &&
      EltSize <= 16 &&
      EltVT.isByteSized() &&
      VecSize > 32 &&
      VecSize % 32 == 0 &&
      Idx) {
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);

    unsigned BitIndex = Idx->getZExtValue() * EltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    SDLoc SL(N);

    SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
    DCI.AddToWorklist(Cast.getNode());

    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
                              DAG.getConstant(EltIdx, SL, MVT::i32));
    DCI.AddToWorklist(Elt.getNode());
    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
    DCI.AddToWorklist(Srl.getNode());

    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
    DCI.AddToWorklist(Trunc.getNode());
    return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
  }

  return SDValue();
}

SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SDValue Idx = N->getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();

  // INSERT_VECTOR_ELT (<n x e>, var-idx)
  // => BUILD_VECTOR n x select (e, const-idx)
  // This eliminates non-constant index and subsequent movrel or scratch access.
  // Sub-dword vectors of size 2 dword or less have better implementation.
  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
  // instructions.
  if (isa<ConstantSDNode>(Idx) ||
      VecSize > 256 || (VecSize <= 64 && EltSize < 32))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  SDValue Ins = N->getOperand(1);
  EVT IdxVT = Idx.getValueType();

  SmallVector<SDValue, 16> Ops;
  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
    SDValue IC = DAG.getConstant(I, SL, IdxVT);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
    SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
    Ops.push_back(V);
  }

  return DAG.getBuildVector(VecVT, SL, Ops);
}

unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N0,
                                          const SDNode *N1) const {
  EVT VT = N0->getValueType(0);

  // Only do this if we are not trying to support denormals. v_mad_f32 does not
  // support denormals ever.
  if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
       (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
        getSubtarget()->hasMadF16())) &&
      isOperationLegal(ISD::FMAD, VT))
    return ISD::FMAD;

  const TargetOptions &Options = DAG.getTarget().Options;
  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
       (N0->getFlags().hasAllowContract() &&
        N1->getFlags().hasAllowContract())) &&
      isFMAFasterThanFMulAndFAdd(VT)) {
    return ISD::FMA;
  }

  return 0;
}

// For a reassociatable opcode perform:
// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
                                               SelectionDAG &DAG) const {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  if (!(Op0->isDivergent() ^ Op1->isDivergent()))
    return SDValue();

  if (Op0->isDivergent())
    std::swap(Op0, Op1);

  if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
    return SDValue();

  SDValue Op2 = Op1.getOperand(1);
  Op1 = Op1.getOperand(0);
  if (!(Op1->isDivergent() ^ Op2->isDivergent()))
    return SDValue();

  if (Op1->isDivergent())
    std::swap(Op1, Op2);

  // If either operand is constant this will conflict with
  // DAGCombiner::ReassociateOps().
  if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
      DAG.isConstantIntBuildVectorOrConstantInt(Op1))
    return SDValue();

  SDLoc SL(N);
  SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
  return DAG.getNode(Opc, SL, VT, Add1, Op2);
}

static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
                           EVT VT,
                           SDValue N0, SDValue N1, SDValue N2,
                           bool Signed) {
  unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
  SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}
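
// Combines for ISD::ADD: forming mad_i64_i32/mad_u64_u32 from add-of-mul,
// reassociating uniform operands, and folding extended setcc results into
// addcarry/subcarry.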
SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
      && Subtarget->hasMad64_32() &&
      !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
      VT.getScalarSizeInBits() <= 64) {
    if (LHS.getOpcode() != ISD::MUL)
      std::swap(LHS, RHS);

    SDValue MulLHS = LHS.getOperand(0);
    SDValue MulRHS = LHS.getOperand(1);
    SDValue AddRHS = RHS;

    // TODO: Maybe restrict if SGPR inputs.
    if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
        numBitsUnsigned(MulRHS, DAG) <= 32) {
      MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
      MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
      AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
      return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
    }

    if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
      MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
      MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
      AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
      return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
    }

    return SDValue();
  }

  if (SDValue V = reassociateScalarOps(N, DAG)) {
    return V;
  }

  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
    return SDValue();

  // add x, zext (setcc) => addcarry x, 0, setcc
  // add x, sext (setcc) => subcarry x, 0, setcc
  unsigned Opc = LHS.getOpcode();
  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
      Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
    std::swap(RHS, LHS);

  Opc = RHS.getOpcode();
  switch (Opc) {
  default: break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(0);
    if (!isBoolSGPR(Cond))
      break;
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
    return DAG.getNode(Opc, SL, VTList, Args);
  }
  case ISD::ADDCARRY: {
    // add x, (addcarry y, 0, cc) => addcarry x, y, cc
    auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if (!C || C->getZExtValue() != 0) break;
    SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
    return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
  }
  }
  return SDValue();
}

SDValue SITargetLowering::performSubCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (VT != MVT::i32)
    return SDValue();

  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (LHS.getOpcode() == ISD::SUBCARRY) {
    // sub (subcarry x, 0, cc), y => subcarry x, y, cc
    auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    if (!C || !C->isNullValue())
      return SDValue();
    SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
    return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
  }
  return SDValue();
}

SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
                                                         DAGCombinerInfo &DCI) const {

  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C || C->getZExtValue() != 0)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);

  // addcarry (add x, y), 0, cc => addcarry x, y, cc
  // subcarry (sub x, y), 0, cc => subcarry x, y, cc
  unsigned LHSOpc = LHS.getOpcode();
  unsigned Opc = N->getOpcode();
  if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
      (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
    SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
  }
  return SDValue();
}

SDValue SITargetLowering::performFAddCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // These should really be instruction patterns, but writing patterns with
  // source modifiers is a pain.

  // fadd (fadd (a, a), b) -> mad 2.0, a, b
  if (LHS.getOpcode() == ISD::FADD) {
    SDValue A = LHS.getOperand(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
      }
    }
  }

  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
  if (RHS.getOpcode() == ISD::FADD) {
    SDValue A = RHS.getOperand(0);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
      }
    }
  }

  return SDValue();
}

SDValue SITargetLowering::performFSubCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);
  assert(!VT.isVector());

  // Try to get the fneg to fold into the source modifier. This undoes generic
  // DAG combines and folds them into the mad.
  //
  // Only do this if we are not trying to support denormals. v_mad_f32 does
  // not support denormals ever.
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  if (LHS.getOpcode() == ISD::FADD) {
    // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
    SDValue A = LHS.getOperand(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

        return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
      }
    }
  }

  if (RHS.getOpcode() == ISD::FADD) {
    // (fsub c, (fadd a, a)) -> mad -2.0, a, c

    SDValue A = RHS.getOperand(0);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
      }
    }
  }

  return SDValue();
}

SDValue SITargetLowering::performFMACombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);

  if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
    return SDValue();

  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
  //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
  SDValue Op1 = N->getOperand(0);
  SDValue Op2 = N->getOperand(1);
  SDValue FMA = N->getOperand(2);

  if (FMA.getOpcode() != ISD::FMA ||
      Op1.getOpcode() != ISD::FP_EXTEND ||
      Op2.getOpcode() != ISD::FP_EXTEND)
    return SDValue();

  // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
  // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
  // is sufficient to allow generating fdot2.
  const TargetOptions &Options = DAG.getTarget().Options;
  if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
      (N->getFlags().hasAllowContract() &&
       FMA->getFlags().hasAllowContract())) {
    Op1 = Op1.getOperand(0);
    Op2 = Op2.getOperand(0);
    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec1 = Op1.getOperand(0);
    SDValue Idx1 = Op1.getOperand(1);
    SDValue Vec2 = Op2.getOperand(0);

    SDValue FMAOp1 = FMA.getOperand(0);
    SDValue FMAOp2 = FMA.getOperand(1);
    SDValue FMAAcc = FMA.getOperand(2);

    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
        FMAOp2.getOpcode() != ISD::FP_EXTEND)
      return SDValue();

    FMAOp1 = FMAOp1.getOperand(0);
    FMAOp2 = FMAOp2.getOperand(0);
    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec3 = FMAOp1.getOperand(0);
    SDValue Vec4 = FMAOp2.getOperand(0);
    SDValue Idx2 = FMAOp1.getOperand(1);

    if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
        // Idx1 and Idx2 cannot be the same.
        Idx1 == Idx2)
      return SDValue();

    if (Vec1 == Vec2 || Vec3 == Vec4)
      return SDValue();

    if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
      return SDValue();

    if ((Vec1 == Vec3 && Vec2 == Vec4) ||
        (Vec1 == Vec4 && Vec2 == Vec3)) {
      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
                         DAG.getTargetConstant(0, SL, MVT::i1));
    }
  }
  return SDValue();
}
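
// Combines for ISD::SETCC: folding compares of sign-extended i1 values and
// matching isinf/isfinite patterns into fp_class.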
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = LHS.getValueType();
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();

  auto CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (!CRHS) {
    CRHS = dyn_cast<ConstantSDNode>(LHS);
    if (CRHS) {
      std::swap(LHS, RHS);
      CC = getSetCCSwappedOperands(CC);
    }
  }

  if (CRHS) {
    if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
        isBoolSGPR(LHS.getOperand(0))) {
      // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
      // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
      // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
      // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
      if ((CRHS->isAllOnesValue() &&
           (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
          (CRHS->isNullValue() &&
           (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                           DAG.getConstant(-1, SL, MVT::i1));
      if ((CRHS->isAllOnesValue() &&
           (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
          (CRHS->isNullValue() &&
           (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
        return LHS.getOperand(0);
    }

    uint64_t CRHSVal = CRHS->getZExtValue();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        LHS.getOpcode() == ISD::SELECT &&
        isa<ConstantSDNode>(LHS.getOperand(1)) &&
        isa<ConstantSDNode>(LHS.getOperand(2)) &&
        LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
        isBoolSGPR(LHS.getOperand(0))) {
      // Given CT != FT:
      // setcc (select cc, CT, CF), CF, eq => xor cc, -1
      // setcc (select cc, CT, CF), CF, ne => cc
      // setcc (select cc, CT, CF), CT, ne => xor cc, -1
      // setcc (select cc, CT, CF), CT, eq => cc
      uint64_t CT = LHS.getConstantOperandVal(1);
      uint64_t CF = LHS.getConstantOperandVal(2);

      if ((CF == CRHSVal && CC == ISD::SETEQ) ||
          (CT == CRHSVal && CC == ISD::SETNE))
        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                           DAG.getConstant(-1, SL, MVT::i1));
      if ((CF == CRHSVal && CC == ISD::SETNE) ||
          (CT == CRHSVal && CC == ISD::SETEQ))
        return LHS.getOperand(0);
    }
  }

  if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
                                           VT != MVT::f16))
    return SDValue();

  // Match isinf/isfinite pattern
  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
  // (fcmp one (fabs x), inf) -> (fp_class x,
  // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
  if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    if (!CRHS)
      return SDValue();

    const APFloat &APF = CRHS->getValueAPF();
    if (APF.isInfinity() && !APF.isNegative()) {
      const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
                                 SIInstrFlags::N_INFINITY;
      const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
                                    SIInstrFlags::P_ZERO |
                                    SIInstrFlags::N_NORMAL |
                                    SIInstrFlags::P_NORMAL |
                                    SIInstrFlags::N_SUBNORMAL |
                                    SIInstrFlags::P_SUBNORMAL;
      unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
                         DAG.getConstant(Mask, SL, MVT::i32));
    }
  }

  return SDValue();
}

SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

  SDValue Src = N->getOperand(0);
  SDValue Srl = N->getOperand(0);
  if (Srl.getOpcode() == ISD::ZERO_EXTEND)
    Srl = Srl.getOperand(0);

  // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
  if (Srl.getOpcode() == ISD::SRL) {
    // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x

    if (const ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
                               EVT(MVT::i32));

      unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
      if (SrcOffset < 32 && SrcOffset % 8 == 0) {
        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
                           MVT::f32, Srl);
      }
    }
  }

  APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);

  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
    DCI.CommitTargetLoweringOpt(TLO);
  }

  return SDValue();
}

SDValue SITargetLowering::performClampCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
  if (!CSrc)
    return SDValue();

  const MachineFunction &MF = DCI.DAG.getMachineFunction();
  const APFloat &F = CSrc->getValueAPF();
  APFloat Zero = APFloat::getZero(F.getSemantics());
  APFloat::cmpResult Cmp0 = F.compare(Zero);
  if (Cmp0 == APFloat::cmpLessThan ||
      (Cmp0 == APFloat::cmpUnordered &&
       MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
  }

  APFloat One(F.getSemantics(), "1.0");
  APFloat::cmpResult Cmp1 = F.compare(One);
  if (Cmp1 == APFloat::cmpGreaterThan)
    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));

  return SDValue(CSrc, 0);
}
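
// Main DAG combine dispatch for the SI target.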
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
    return SDValue();
  switch (N->getOpcode()) {
  default:
    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  case ISD::ADD:
    return performAddCombine(N, DCI);
  case ISD::SUB:
    return performSubCombine(N, DCI);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:
    return performAddCarrySubCarryCombine(N, DCI);
  case ISD::FADD:
    return performFAddCombine(N, DCI);
  case ISD::FSUB:
    return performFSubCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
    return performMinMaxCombine(N, DCI);
  case ISD::FMA:
    return performFMACombine(N, DCI);
  case ISD::LOAD: {
    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
      return Widened;
    LLVM_FALLTHROUGH;
  }
  case ISD::STORE:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
  case ISD::ATOMIC_CMP_SWAP:
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_LOAD_FADD:
  case AMDGPUISD::ATOMIC_INC:
  case AMDGPUISD::ATOMIC_DEC:
  case AMDGPUISD::ATOMIC_LOAD_FMIN:
  case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
    if (DCI.isBeforeLegalize())
      break;
    return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case ISD::XOR:
    return performXorCombine(N, DCI);
  case ISD::ZERO_EXTEND:
    return performZeroExtendCombine(N, DCI);
  case ISD::SIGN_EXTEND_INREG:
    return performSignExtendInRegCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  case AMDGPUISD::RCP:
    return performRcpCombine(N, DCI);
  case AMDGPUISD::FRACT:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::LDEXP: {
    SDValue Src = N->getOperand(0);
    if (Src.isUndef())
      return Src;
    break;
  }
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performUCharToFloatCombine(N, DCI);
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return performCvtF32UByteNCombine(N, DCI);
  case AMDGPUISD::FMED3:
    return performFMed3Combine(N, DCI);
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
    return performCvtPkRTZCombine(N, DCI);
  case AMDGPUISD::CLAMP:
    return performClampCombine(N, DCI);
  case ISD::SCALAR_TO_VECTOR: {
    SelectionDAG &DAG = DCI.DAG;
    EVT VT = N->getValueType(0);

    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    if (VT == MVT::v2i16 || VT == MVT::v2f16) {
      SDLoc SL(N);
      SDValue Src = N->getOperand(0);
      EVT EltVT = Src.getValueType();
      if (EltVT == MVT::f16)
        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);

      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
      return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
    }

    break;
  }
  case ISD::EXTRACT_VECTOR_ELT:
    return performExtractVectorEltCombine(N, DCI);
  case ISD::INSERT_VECTOR_ELT:
    return performInsertVectorEltCombine(N, DCI);
  }
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
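// Illustrative DAG shapes for the SCALAR_TO_VECTOR combine above (types shown
// only as an example):
//   v2f16 (scalar_to_vector f16:x)
//     -> v2f16 (bitcast (any_extend (bitcast f16:x to i16) to i32))
// Only the low 16 bits of the extended value are meaningful; the high lane of
// the vector is undefined either way, so any_extend is sufficient.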
/// Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  case AMDGPU::sub4: return 4; // Possible with TFE/LWE
  }
}
/// Adjust the writemask of MIMG instructions
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                          SelectionDAG &DAG) const {
  unsigned Opcode = Node->getMachineOpcode();

  // Subtract 1 because the vdata output is not a MachineSDNode operand.
  int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
    return Node; // not implemented for D16

  SDNode *Users[5] = { nullptr };
  unsigned Lane = 0;
  unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;
  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
  bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
                  Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
  unsigned TFCLane = 0;
  bool HasChain = Node->getNumValues() > 1;

  if (OldDmask == 0) {
    // These are folded out, but on the chance it happens don't assert.
    return Node;
  }

  unsigned OldBitsSet = countPopulation(OldDmask);
  // Work out which is the TFE/LWE lane if that is enabled.
  if (UsesTFC) {
    TFCLane = OldBitsSet;
  }

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Don't look at users of the chain.
    if (I.getUse().getResNo() != 0)
      continue;

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return Node;

    // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
    if (UsesTFC && Lane == TFCLane) {
      Users[Lane] = *I;
    } else {
      // Set which texture component corresponds to the lane.
      unsigned Comp;
      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
        Comp = countTrailingZeros(Dmask);
        Dmask &= ~(1 << Comp);
      }

      // Abort if we have more than one user per component.
      if (Users[Lane])
        return Node;

      Users[Lane] = *I;
      NewDmask |= 1 << Comp;
    }
  }

  // Don't allow 0 dmask, as hardware assumes one channel enabled.
  bool NoChannels = !NewDmask;
  if (NoChannels) {
    if (!UsesTFC) {
      // No uses of the result and not using TFC. Then do nothing.
      return Node;
    }
    // If the original dmask has one channel - then nothing to do
    if (OldBitsSet == 1)
      return Node;
    // Use an arbitrary dmask - required for the instruction to work
    NewDmask = 1;
  }
  // Abort if there's no change
  if (NewDmask == OldDmask)
    return Node;

  unsigned BitsSet = countPopulation(NewDmask);

  // Check for TFE or LWE - increase the number of channels by one to account
  // for the extra return value
  // This will need adjustment for D16 if this is also included in
  // adjustWriteMask (this function) but at present D16 are excluded.
  unsigned NewChannels = BitsSet + UsesTFC;

  int NewOpcode =
      AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
  assert(NewOpcode != -1 &&
         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
         "failed to find equivalent MIMG op");

  // Adjust the writemask in the node
  SmallVector<SDValue, 12> Ops;
  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());

  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();

  MVT ResultVT = NewChannels == 1 ?
    SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
                           NewChannels == 5 ? 8 : NewChannels);
  SDVTList NewVTList = HasChain ?
    DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);

  MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
                                              NewVTList, Ops);

  if (HasChain) {
    // Update chain.
    DAG.setNodeMemRefs(NewNode, Node->memoperands());
    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
  }

  if (NewChannels == 1) {
    assert(Node->hasNUsesOfValue(1, 0));
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
                                      SDLoc(Node), Users[Lane]->getValueType(0),
                                      SDValue(NewNode, 0));
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return nullptr;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
    SDNode *User = Users[i];
    if (!User) {
      // Handle the special case of NoChannels. We set NewDmask to 1 above, but
      // Users[0] is still nullptr because channel 0 doesn't really have a use.
      if (i || !NoChannels)
        continue;
    } else {
      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
      DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
    }

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
    }
  }

  DAG.RemoveDeadNode(Node);
  return nullptr;
}
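// Worked example for adjustWritemask (numbers illustrative): an image load
// selected with dmask = 0xf whose only users extract sub0 and sub2 ends up
// with NewDmask = 0x5, BitsSet = 2 and, without TFE/LWE, NewChannels = 2, so
// the node is rewritten to the 2-channel MIMG opcode and the two
// EXTRACT_SUBREG users are renumbered to sub0 and sub1 of the new result.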
static bool isFrameIndexOp(SDValue Op) {
  if (Op.getOpcode() == ISD::AssertZext)
    Op = Op.getOperand(0);

  return isa<FrameIndexSDNode>(Op);
}
/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                        SelectionDAG &DAG) const {
  if (Node->getOpcode() == ISD::CopyToReg) {
    RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
    SDValue SrcVal = Node->getOperand(2);

    // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    // to try understanding copies to physical registers.
    if (SrcVal.getValueType() == MVT::i1 &&
        Register::isPhysicalRegister(DestReg->getReg())) {
      SDLoc SL(Node);
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      SDValue VReg = DAG.getRegister(
        MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);

      SDNode *Glued = Node->getGluedNode();
      SDValue ToVReg
        = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
                           SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
      SDValue ToResultReg
        = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
                           VReg, ToVReg.getValue(1));
      DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
      DAG.RemoveDeadNode(Node);
      return ToResultReg.getNode();
    }
  }

  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                     Node->getOperand(i).getValueType(),
                                     Node->getOperand(i)), 0));
  }

  return DAG.UpdateNodeOperands(Node, Ops);
}
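// For example (operand shapes assumed for illustration): a REG_SEQUENCE that
// received a TargetFrameIndex operand from instruction selection is rewritten
// so the frame index is first materialized with S_MOV_B32 and the
// REG_SEQUENCE then consumes that register instead, matching the assumption
// that inputs to these generic instructions are registers.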
/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  unsigned Opcode = Node->getMachineOpcode();

  if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
      !TII->isGather4(Opcode)) {
    return adjustWritemask(Node, DAG);
  }

  if (Opcode == AMDGPU::INSERT_SUBREG ||
      Opcode == AMDGPU::REG_SEQUENCE) {
    legalizeTargetIndependentNode(Node, DAG);
    return Node;
  }

  switch (Opcode) {
  case AMDGPU::V_DIV_SCALE_F32:
  case AMDGPU::V_DIV_SCALE_F64: {
    // Satisfy the operand register constraint when one of the inputs is
    // undefined. Ordinarily each undef value will have its own implicit_def of
    // a vreg, so force these to use a single register.
    SDValue Src0 = Node->getOperand(0);
    SDValue Src1 = Node->getOperand(1);
    SDValue Src2 = Node->getOperand(2);

    if ((Src0.isMachineOpcode() &&
         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
        (Src0 == Src1 || Src0 == Src2))
      break;

    MVT VT = Src0.getValueType().getSimpleVT();
    const TargetRegisterClass *RC =
        getRegClassFor(VT, Src0.getNode()->isDivergent());

    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);

    SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
                                      UndefReg, Src0, SDValue());

    // src0 must be the same register as src1 or src2, even if the value is
    // undefined, so make sure we don't violate this constraint.
    if (Src0.isMachineOpcode() &&
        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
      if (Src1.isMachineOpcode() &&
          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src1;
      else if (Src2.isMachineOpcode() &&
               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src2;
      else {
        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
        Src0 = UndefReg;
        Src1 = UndefReg;
      }
    } else
      break;

    SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
    for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
      Ops.push_back(Node->getOperand(I));

    Ops.push_back(ImpDef.getValue(1));
    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
  }
  case AMDGPU::V_PERMLANE16_B32:
  case AMDGPU::V_PERMLANEX16_B32: {
    ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0));
    ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2));
    if (!FI->getZExtValue() && !BC->getZExtValue())
      break;
    SDValue VDstIn = Node->getOperand(6);
    if (VDstIn.isMachineOpcode()
        && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF)
      break;
    MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                               SDLoc(Node), MVT::i32);
    SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1),
                                    SDValue(BC, 0), Node->getOperand(3),
                                    Node->getOperand(4), Node->getOperand(5),
                                    SDValue(ImpDef, 0), Node->getOperand(7) };
    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
  }
  default:
    break;
  }

  return Node;
}
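// For example (illustrative): if a V_DIV_SCALE_F32 is selected with src0
// coming from IMPLICIT_DEF while src1 is defined, the code above substitutes
// src1 (or src2, or a shared undef register) for src0 so that src0 matches
// src1 or src2 as the operand constraint requires, rather than leaving a
// separate implicit_def vreg for the undefined input.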
/// Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  if (TII->isVOP3(MI.getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);

    // Prefer VGPRs over AGPRs in mAI instructions where possible.
    // This saves a chain-copy of registers and better balances register
    // use between VGPRs and AGPRs, as AGPR tuples tend to be big.
    if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) {
      unsigned Opc = MI.getOpcode();
      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
                      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
        if (I == -1)
          break;
        MachineOperand &Op = MI.getOperand(I);
        if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
             OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
            !Register::isVirtualRegister(Op.getReg()) ||
            !TRI->isAGPR(MRI, Op.getReg()))
          continue;
        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
        if (!Src || !Src->isCopy() ||
            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
          continue;
        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
        // All uses of agpr64 and agpr32 can also accept vgpr except for
        // v_accvgpr_read, but we do not produce agpr reads during selection,
        // so no use checks are needed.
        MRI.setRegClass(Op.getReg(), NewRC);
      }
    }

    return;
  }

  // Replace unused atomics with the no return version.
  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
  if (NoRetAtomicOp != -1) {
    if (!Node->hasAnyUseOfValue(0)) {
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);
      return;
    }

    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
    // instruction, because the return type of these instructions is a vec2 of
    // the memory type, so it can be tied to the input operand.
    // This means these instructions always have a use, so we need to add a
    // special case to check if the atomic has only one extract_subreg use,
    // which itself has no uses.
    if ((Node->hasNUsesOfValue(1, 0) &&
         Node->use_begin()->isMachineOpcode() &&
         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
         !Node->use_begin()->hasAnyUseOfValue(0))) {
      Register Def = MI.getOperand(0).getReg();

      // Change this into a noret atomic.
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);

      // If we only remove the def operand from the atomic instruction, the
      // extract_subreg will be left with a use of a vreg without a def.
      // So we need to insert an implicit_def to avoid machine verifier
      // errors.
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(AMDGPU::IMPLICIT_DEF), Def);
    }
    return;
  }
}
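// For example (illustrative): if src0 of a VOP3 mAI instruction is an AGPR
// virtual register whose only definition is a COPY from an SGPR, the loop
// above moves that register to the equivalent VGPR class, so the value can be
// consumed from a VGPR instead of going through an extra AGPR copy.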
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
                              uint32_t Val) {
  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}
MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  const SDValue Ops0[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
    buildSMovImm32(DAG, DL, 0),
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
  };

  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::v2i32, Ops0), 0);

  // Combine the constants and the pointer.
  const SDValue Ops1[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
    Ptr,
    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
    SubRegHi,
    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
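// Illustrative layout of the addr64 descriptor built above (it follows
// directly from the REG_SEQUENCE operands): dwords 0-1 hold the incoming
// 64-bit pointer, dword 2 is 0, and dword 3 is the high half of
// getDefaultRsrcDataFormat().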
/// Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
                                           SDValue Ptr, uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
  if (RsrcDword1) {
    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
                                       DAG.getConstant(RsrcDword1, DL, MVT::i32)),
                    0);
  }

  SDValue DataLo = buildSMovImm32(DAG, DL,
                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

  const SDValue Ops[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
    PtrLo,
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    PtrHi,
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
    DataLo,
    DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
    DataHi,
    DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
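// Illustrative layout of the descriptor built above: dword 0 is the low half
// of Ptr, dword 1 is the high half of Ptr OR'd with RsrcDword1 (the
// stride/Add-TID bits), dword 2 is the low half of RsrcDword2And3, and
// dword 3 is its high half.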
//===----------------------------------------------------------------------===//
//                         SI Inline Assembly Support
//===----------------------------------------------------------------------===//

std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                               StringRef Constraint,
                                               MVT VT) const {
  const TargetRegisterClass *RC = nullptr;
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's':
    case 'r':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::SReg_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      case 96:
        RC = &AMDGPU::SReg_96RegClass;
        break;
      case 128:
        RC = &AMDGPU::SGPR_128RegClass;
        break;
      case 160:
        RC = &AMDGPU::SReg_160RegClass;
        break;
      case 256:
        RC = &AMDGPU::SReg_256RegClass;
        break;
      case 512:
        RC = &AMDGPU::SReg_512RegClass;
        break;
      }
      break;
    case 'v':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::VGPR_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::VReg_64RegClass;
        break;
      case 96:
        RC = &AMDGPU::VReg_96RegClass;
        break;
      case 128:
        RC = &AMDGPU::VReg_128RegClass;
        break;
      case 160:
        RC = &AMDGPU::VReg_160RegClass;
        break;
      case 256:
        RC = &AMDGPU::VReg_256RegClass;
        break;
      case 512:
        RC = &AMDGPU::VReg_512RegClass;
        break;
      }
      break;
    case 'a':
      if (!Subtarget->hasMAIInsts())
        break;
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::AGPR_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::AReg_64RegClass;
        break;
      case 128:
        RC = &AMDGPU::AReg_128RegClass;
        break;
      case 512:
        RC = &AMDGPU::AReg_512RegClass;
        break;
      case 1024:
        RC = &AMDGPU::AReg_1024RegClass;
        // v32 types are not legal but we support them here.
        return std::make_pair(0U, RC);
      }
      break;
    }
    // We actually support i128, i16 and f16 as inline parameters
    // even if they are not reported as legal
    if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
               VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
      return std::make_pair(0U, RC);
  }

  if (Constraint.size() > 1) {
    if (Constraint[1] == 'v') {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (Constraint[1] == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    } else if (Constraint[1] == 'a') {
      RC = &AMDGPU::AGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
      if (!Failed && Idx < RC->getNumRegs())
        return std::make_pair(RC->getRegister(Idx), RC);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
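// For example (illustrative): the constraint "s" with a 64-bit operand maps
// to SGPR_64RegClass, "v" with a 32-bit operand maps to VGPR_32RegClass, and
// "a" is only honoured when the subtarget has MAI instructions; anything else
// falls back to the generic TargetLowering handling.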
SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 's':
    case 'v':
    case 'a':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
  }

  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                             Info->getStackPtrOffsetReg()));
  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());

  // We need to worry about replacing the default register with itself in case
  // of MIR testcases missing the MFI.
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());

  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());

  if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
    MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
                       Info->getScratchWaveOffsetReg());
  }

  Info->limitOccupancy(MF);

  if (ST.isWave32() && !MF.empty()) {
    // Add a VCC_HI def because many instructions are marked as implicitly
    // using VCC, while we may only define VCC_LO. If nothing defines VCC_HI
    // we may end up having a use of undef.

    const SIInstrInfo *TII = ST.getInstrInfo();
    DebugLoc DL;

    MachineBasicBlock &MBB = MF.front();
    MachineBasicBlock::iterator I = MBB.getFirstNonDebugInstr();
    BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), AMDGPU::VCC_HI);

    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);
      }
    }
  }

  TargetLoweringBase::finalizeLowering(MF);
}
void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
                                                     KnownBits &Known,
                                                     const APInt &DemandedElts,
                                                     const SelectionDAG &DAG,
                                                     unsigned Depth) const {
  TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
                                                DAG, Depth);

  // Set the high bits to zero based on the maximum allowed scratch size per
  // wave. We can't use vaddr in MUBUF instructions if we don't know the address
  // calculation won't overflow, so assume the sign bit is never set.
  Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
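// For example (limit assumed only for illustration): if the per-wave scratch
// limit guarantees frame offsets below 256 KiB, then bits [31:18] of a frame
// index are known zero, the value can never look negative, and vaddr-based
// MUBUF addressing stays safe.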
Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
  const Align CacheLineAlign = Align(64);

  // Pre-GFX10 targets did not benefit from loop alignment.
  if (!ML || DisableLoopAlignment ||
      (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 I$ is 4 x 64 bytes cache lines.
  // By default prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
  // behind and one ahead.
  // Therefore we can benefit from aligning loop headers if the loop fits in
  // 192 bytes.
  // If the loop fits in 64 bytes it always spans no more than two cache lines
  // and does not need extra alignment.
  // Otherwise, if the loop is at most 128 bytes we do not need to modify the
  // prefetch; if it is at most 192 bytes we need two lines behind.

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If inner loop block is aligned assume in average half of the alignment
    // size to be added as nops.
    if (MBB != Header)
      LoopSize += MBB->getAlignment().value() / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of parent loops is surrounded by prefetch instructions do not
  // insert new for inner loop, which would reset parent's settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  if (Pre && Exit) {
    BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
            TII->get(AMDGPU::S_INST_PREFETCH))
      .addImm(1); // prefetch 2 lines behind PC

    BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
            TII->get(AMDGPU::S_INST_PREFETCH))
      .addImm(2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}
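// Worked example of the thresholds above (sizes illustrative): a 50-byte loop
// keeps PrefAlign, a 100-byte loop gets the 64-byte cache-line alignment with
// the default prefetch, and a 180-byte loop with a preheader and exit block
// additionally gets S_INST_PREFETCH markers around it before returning the
// cache-line alignment.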
LLVM_ATTRIBUTE_UNUSED
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
  assert(N->getOpcode() == ISD::CopyFromReg);
  do {
    // Follow the chain until we find an INLINEASM node.
    N = N->getOperand(0).getNode();
    if (N->getOpcode() == ISD::INLINEASM ||
        N->getOpcode() == ISD::INLINEASM_BR)
      return true;
  } while (N->getOpcode() == ISD::CopyFromReg);
  return false;
}
bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
                                                  FunctionLoweringInfo *FLI,
                                                  LegacyDivergenceAnalysis *KDA) const {
  switch (N->getOpcode()) {
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
    const MachineFunction *MF = FLI->MF;
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const MachineRegisterInfo &MRI = MF->getRegInfo();
    const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
    unsigned Reg = R->getReg();
    if (Register::isPhysicalRegister(Reg))
      return !TRI.isSGPRReg(MRI, Reg);

    if (MRI.isLiveIn(Reg)) {
      // workitem.id.x workitem.id.y workitem.id.z
      // Any VGPR formal argument is also considered divergent.
      if (!TRI.isSGPRReg(MRI, Reg))
        return true;
      // Formal arguments of non-entry functions
      // are conservatively considered divergent.
      else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
        return true;
      return false;
    }
    const Value *V = FLI->getValueFromVirtualReg(Reg);
    if (V)
      return KDA->isDivergent(V);
    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
    return !TRI.isSGPRReg(MRI, Reg);
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  }
  case ISD::CALLSEQ_END:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(
        cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
  case ISD::INTRINSIC_W_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(
        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
  }
  return false;
}
bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
  case MVT::f32:
    return Subtarget->hasFP32Denormals();
  case MVT::f64:
    return Subtarget->hasFP64Denormals();
  case MVT::f16:
    return Subtarget->hasFP16Denormals();
  default:
    return false;
  }
}
bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                    const SelectionDAG &DAG,
                                                    bool SNaN,
                                                    unsigned Depth) const {
  if (Op.getOpcode() == AMDGPUISD::CLAMP) {
    const MachineFunction &MF = DAG.getMachineFunction();
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

    if (Info->getMode().DX10Clamp)
      return true; // Clamped to 0.
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }

  return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
                                                            SNaN, Depth);
}
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::FAdd: {
    Type *Ty = RMW->getType();

    // We don't have a way to support 16-bit atomics now, so just leave them
    // as-is.
    if (Ty->isHalfTy())
      return AtomicExpansionKind::None;

    if (!Ty->isFloatTy())
      return AtomicExpansionKind::CmpXChg;

    // TODO: We do have these for flat; older targets also had them for
    // buffers.
    unsigned AS = RMW->getPointerAddressSpace();
    return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
      AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
  }
  default:
    break;
  }

  return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
}
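// For example (illustrative): an atomicrmw fadd on float in the LDS address
// space stays a native atomic when the subtarget has LDS FP atomics, while
// fadd on double, or float fadd in other address spaces handled here, is
// expanded to a compare-and-swap loop.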
const TargetRegisterClass *
SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
  const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
    return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
                                               : &AMDGPU::SReg_32RegClass;
  if (!TRI->isSGPRClass(RC) && !isDivergent)
    return TRI->getEquivalentSGPRClass(RC);
  else if (TRI->isSGPRClass(RC) && isDivergent)
    return TRI->getEquivalentVGPRClass(RC);

  return RC;
}
static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
  if (!Visited.insert(V).second)
    return false;
  bool Result = false;
  for (auto U : V->users()) {
    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
      if (V == U->getOperand(1)) {
        switch (Intrinsic->getIntrinsicID()) {
        default:
          Result = false;
          break;
        case Intrinsic::amdgcn_if_break:
        case Intrinsic::amdgcn_if:
        case Intrinsic::amdgcn_else:
          Result = true;
          break;
        }
      }
      if (V == U->getOperand(0)) {
        switch (Intrinsic->getIntrinsicID()) {
        default:
          Result = false;
          break;
        case Intrinsic::amdgcn_end_cf:
        case Intrinsic::amdgcn_loop:
          Result = true;
          break;
        }
      }
    } else {
      Result = hasCFUser(U, Visited);
    }
    if (Result)
      break;
  }
  return Result;
}
bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
                                               const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if_break:
      return true;
    }
  }
  if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
    if (const IntrinsicInst *Intrinsic =
            dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
      switch (Intrinsic->getIntrinsicID()) {
      default:
        return false;
      case Intrinsic::amdgcn_if:
      case Intrinsic::amdgcn_else: {
        ArrayRef<unsigned> Indices = ExtValue->getIndices();
        if (Indices.size() == 1 && Indices[0] == 1)
          return true;
        break;
      }
      }
    }
  }
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (isa<InlineAsm>(CI->getCalledValue())) {
      const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
      ImmutableCallSite CS(CI);
      TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
          MF.getDataLayout(), Subtarget->getRegisterInfo(), CS);
      for (auto &TC : TargetConstraints) {
        if (TC.Type == InlineAsm::isOutput) {
          ComputeConstraintToUse(TC, SDValue());
          unsigned AssignedReg;
          const TargetRegisterClass *RC;
          std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
              SIRI, TC.ConstraintCode, TC.ConstraintVT);
          if (RC) {
            MachineRegisterInfo &MRI = MF.getRegInfo();
            if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
              return true;
            else if (SIRI->isSGPRClass(RC))
              return true;
          }
        }
      }
    }
  }
  SmallPtrSet<const Value *, 16> Visited;
  return hasCFUser(V, Visited);
}
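// For example (illustrative): a value passed to llvm.amdgcn.if.break, or an
// inline-asm output whose constraint resolves to an SGPR register or class,
// must be placed in a uniform (scalar) register, so the query above returns
// true for it.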