//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
#define _USE_MATH_DEFINES
#endif
#include "SIISelLowering.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> DisableLoopAlignment(
  "amdgpu-disable-loop-alignment",
  cl::desc("Do not align and prefetch loops"),
  cl::init(false));

static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}
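
// SITargetLowering configures SI/GCN-specific lowering in its constructor:
// it registers the register class used for each MVT, records which generic
// DAG operations are Legal, Custom, Promote, or Expand on this subtarget,
// and registers the target DAG combines. Custom entries and combines are
// serviced later by this target's LowerOperation and PerformDAGCombine
// overrides.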
SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI),
      Subtarget(&STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
  addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);

  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
  addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  if (Subtarget->has16BitInsts()) {
    addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
    // Unless there are also VOP3P operations, no operations on these types
    // are really legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
  }

  if (Subtarget->hasMAIInsts()) {
    addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
    addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());
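
  // Everything below sets per-(opcode, type) legalization actions: Custom
  // entries are routed to LowerOperation(), Promote entries are re-typed via
  // AddPromotedToType(), and Expand entries fall back to generic expansion.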
  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::LOAD, MVT::v32i32, Custom);

  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v3i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setOperationAction(ISD::STORE, MVT::v5i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::v32i32, Custom);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);

  setOperationAction(ISD::UADDO, MVT::i32, Legal);
  setOperationAction(ISD::USUBO, MVT::i32, Legal);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);

  setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
                  MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
                  MVT::v32i32, MVT::v32f32 }) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::SCALAR_TO_VECTOR:
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  // Deal with vec3 vector operations when widened to vec4.
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);

  // Deal with vec5 vector operations when widened to vec8.
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

  if (Subtarget->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // On SI this is s_memtime and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  setOperationAction(ISD::TRAP, MVT::Other, Custom);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::FLOG, MVT::f16, Custom);
    setOperationAction(ISD::FEXP, MVT::f16, Custom);
    setOperationAction(ISD::FLOG10, MVT::f16, Custom);
  }

  // v_mad_f32 does not support denormals according to some sources.
  if (!Subtarget->hasFP32Denormals())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // until necessary.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);
  setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);

  // These are really only legal for ieee_mode functions. We should be avoiding
  // them for functions that don't have ieee_mode enabled, so just say they are
  // legal here.
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
  if (Subtarget->haveRoundOpsF64()) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  } else {
    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
    setOperationAction(ISD::FRINT, MVT::f64, Custom);
    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);
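
  // 16-bit scalar and packed 16-bit operations are only configured when the
  // subtarget actually has 16-bit instructions; otherwise i16/f16 values are
  // handled through the default 32-bit paths.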
  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::Constant, MVT::i16, Legal);

    setOperationAction(ISD::SMIN, MVT::i16, Legal);
    setOperationAction(ISD::SMAX, MVT::i16, Legal);

    setOperationAction(ISD::UMIN, MVT::i16, Legal);
    setOperationAction(ISD::UMAX, MVT::i16, Legal);

    setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction(ISD::ROTR, MVT::i16, Promote);
    setOperationAction(ISD::ROTL, MVT::i16, Promote);

    setOperationAction(ISD::SDIV, MVT::i16, Promote);
    setOperationAction(ISD::UDIV, MVT::i16, Promote);
    setOperationAction(ISD::SREM, MVT::i16, Promote);
    setOperationAction(ISD::UREM, MVT::i16, Promote);

    setOperationAction(ISD::BSWAP, MVT::i16, Promote);
    setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);

    setOperationAction(ISD::CTTZ, MVT::i16, Promote);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTPOP, MVT::i16, Promote);

    setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);

    setOperationAction(ISD::BR_CC, MVT::i16, Expand);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FCOS, MVT::f16, Promote);
    setOperationAction(ISD::FSIN, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::FROUND, MVT::f16, Custom);

    // F16 - VOP2 Actions.
    setOperationAction(ISD::BR_CC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);

    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);
    for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
          break;
        case ISD::CONCAT_VECTORS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }
    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, MVT::v2i16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);

    setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
    setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
    if (!Subtarget->hasVOP3PInsts()) {
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
    }

    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns)
    setOperationAction(ISD::FABS, MVT::v2f16, Legal);

    setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
    setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
  }
  if (Subtarget->hasVOP3PInsts()) {
    setOperationAction(ISD::ADD, MVT::v2i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i16, Legal);
    setOperationAction(ISD::MUL, MVT::v2i16, Legal);
    setOperationAction(ISD::SHL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRA, MVT::v2i16, Legal);
    setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v2i16, Legal);

    setOperationAction(ISD::FADD, MVT::v2f16, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
    setOperationAction(ISD::FMA, MVT::v2f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);

    setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);

    setOperationAction(ISD::SHL, MVT::v4i16, Custom);
    setOperationAction(ISD::SRA, MVT::v4i16, Custom);
    setOperationAction(ISD::SRL, MVT::v4i16, Custom);
    setOperationAction(ISD::ADD, MVT::v4i16, Custom);
    setOperationAction(ISD::SUB, MVT::v4i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i16, Custom);

    setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
    setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::UMAX, MVT::v4i16, Custom);

    setOperationAction(ISD::FADD, MVT::v4f16, Custom);
    setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
    setOperationAction(ISD::FMA, MVT::v4f16, Custom);

    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);

    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
  }

  setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
  setOperationAction(ISD::FABS, MVT::v4f16, Custom);
  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
  } else {
    // Legalization hack.
    setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v2f16, Custom);

    setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
    setOperationAction(ISD::FABS, MVT::v2f16, Custom);
  }

  for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
  }
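
  // Opcodes registered below are offered to this target's PerformDAGCombine
  // hook during DAG combining.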
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ADDCARRY);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::SUBCARRY);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::FMINNUM_IEEE);
  setTargetDAGCombine(ISD::FMAXNUM_IEEE);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::SMIN);
  setTargetDAGCombine(ISD::SMAX);
  setTargetDAGCombine(ISD::UMIN);
  setTargetDAGCombine(ISD::UMAX);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::FCANONICALIZE);
  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);

  setSchedulingPreference(Sched::RegPressure);
}
const GCNSubtarget *SITargetLowering::getSubtarget() const {
  return Subtarget;
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//
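
// The hooks below answer legality and profitability queries from the generic
// SelectionDAG code: addressing modes, misaligned accesses, memory
// intrinsics, calling-convention register types, and similar questions.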
// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case where this is OK to use when denormals are
// enabled, and we don't currently handle it.
bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
         DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
         SrcVT.getScalarType() == MVT::f16;
}
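
// Returning false below steers the legalizer toward scalarizing vector
// shuffles rather than forming shuffle nodes the backend cannot select.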
bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}
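
// For non-kernel calling conventions, vector arguments are passed in 32-bit
// registers, except that vectors of 16-bit elements are packed two elements
// per register when 16-bit instructions are available.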
MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32)
      return ScalarVT.getSimpleVT();

    if (Size > 32)
      return MVT::i32;

    if (Size == 16 && Subtarget->has16BitInsts())
      return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
  } else if (VT.getSizeInBits() > 32)
    return MVT::i32;

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
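
// Companion to getRegisterTypeForCallingConv: report how many registers of
// that type a given argument value type occupies.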
unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    if (Size == 32)
      return NumElts;

    if (Size > 32)
      return NumElts * ((Size + 31) / 32);

    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;
  } else if (VT.getSizeInBits() > 32)
    return (VT.getSizeInBits() + 31) / 32;

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
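
// Breaks a vector argument into the intermediate type, register type, and
// register count used by call lowering; kernel arguments keep the default
// breakdown from TargetLowering.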
unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
  LLVMContext &Context, CallingConv::ID CC,
  EVT VT, EVT &IntermediateVT,
  unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size > 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
    }

    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // broken.
    if (Size == 16 && Subtarget->has16BitInsts()) {
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
    Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
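
// Derives a memory VT for intrinsics that return a struct (a vector or
// scalar first member plus an i32 second member), rounding the element
// count up to a power of two so the result maps onto a simple MVT.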
static MVT memVTFromAggregate(Type *Ty) {
  // Only limited forms of aggregate type currently expected.
  assert(Ty->isStructTy() && "Expected struct type");

  Type *ElementType = nullptr;
  unsigned NumElts;
  if (Ty->getContainedType(0)->isVectorTy()) {
    VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
    ElementType = VecComponent->getElementType();
    NumElts = VecComponent->getNumElements();
  } else {
    ElementType = Ty->getContainedType(0);
    NumElts = 1;
  }

  assert((Ty->getContainedType(1) &&
          Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");

  // Calculate the size of the memVT type from the aggregate
  unsigned Pow2Elts = 0;
  unsigned ElementSize;
  switch (ElementType->getTypeID()) {
  default:
    llvm_unreachable("Unknown type!");
  case Type::IntegerTyID:
    ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
    break;
  case Type::HalfTyID:
    ElementSize = 16;
    break;
  case Type::FloatTyID:
    ElementSize = 32;
    break;
  }
  unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
  Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);

  return MVT::getVectorVT(MVT::getVT(ElementType, false),
                          Pow2Elts);
}
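
// Describes target memory intrinsics to the generic lowering code so they
// receive MachineMemOperands: the IntrinsicInfo filled in here carries the
// opcode, memory VT, pointer value, and access flags for each recognized
// intrinsic.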
895 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo
&Info
,
898 unsigned IntrID
) const {
899 if (const AMDGPU::RsrcIntrinsic
*RsrcIntr
=
900 AMDGPU::lookupRsrcIntrinsic(IntrID
)) {
901 AttributeList Attr
= Intrinsic::getAttributes(CI
.getContext(),
902 (Intrinsic::ID
)IntrID
);
903 if (Attr
.hasFnAttribute(Attribute::ReadNone
))
906 SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
908 if (RsrcIntr
->IsImage
) {
909 Info
.ptrVal
= MFI
->getImagePSV(
910 *MF
.getSubtarget
<GCNSubtarget
>().getInstrInfo(),
911 CI
.getArgOperand(RsrcIntr
->RsrcArg
));
914 Info
.ptrVal
= MFI
->getBufferPSV(
915 *MF
.getSubtarget
<GCNSubtarget
>().getInstrInfo(),
916 CI
.getArgOperand(RsrcIntr
->RsrcArg
));
919 Info
.flags
= MachineMemOperand::MODereferenceable
;
920 if (Attr
.hasFnAttribute(Attribute::ReadOnly
)) {
921 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
922 Info
.memVT
= MVT::getVT(CI
.getType(), true);
923 if (Info
.memVT
== MVT::Other
) {
924 // Some intrinsics return an aggregate type - special case to work out
926 Info
.memVT
= memVTFromAggregate(CI
.getType());
928 Info
.flags
|= MachineMemOperand::MOLoad
;
929 } else if (Attr
.hasFnAttribute(Attribute::WriteOnly
)) {
930 Info
.opc
= ISD::INTRINSIC_VOID
;
931 Info
.memVT
= MVT::getVT(CI
.getArgOperand(0)->getType());
932 Info
.flags
|= MachineMemOperand::MOStore
;
935 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
936 Info
.memVT
= MVT::getVT(CI
.getType());
937 Info
.flags
= MachineMemOperand::MOLoad
|
938 MachineMemOperand::MOStore
|
939 MachineMemOperand::MODereferenceable
;
941 // XXX - Should this be volatile without known ordering?
942 Info
.flags
|= MachineMemOperand::MOVolatile
;
948 case Intrinsic::amdgcn_atomic_inc
:
949 case Intrinsic::amdgcn_atomic_dec
:
950 case Intrinsic::amdgcn_ds_ordered_add
:
951 case Intrinsic::amdgcn_ds_ordered_swap
:
952 case Intrinsic::amdgcn_ds_fadd
:
953 case Intrinsic::amdgcn_ds_fmin
:
954 case Intrinsic::amdgcn_ds_fmax
: {
955 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
956 Info
.memVT
= MVT::getVT(CI
.getType());
957 Info
.ptrVal
= CI
.getOperand(0);
959 Info
.flags
= MachineMemOperand::MOLoad
| MachineMemOperand::MOStore
;
961 const ConstantInt
*Vol
= cast
<ConstantInt
>(CI
.getOperand(4));
963 Info
.flags
|= MachineMemOperand::MOVolatile
;
967 case Intrinsic::amdgcn_buffer_atomic_fadd
: {
968 SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
970 Info
.opc
= ISD::INTRINSIC_VOID
;
971 Info
.memVT
= MVT::getVT(CI
.getOperand(0)->getType());
972 Info
.ptrVal
= MFI
->getBufferPSV(
973 *MF
.getSubtarget
<GCNSubtarget
>().getInstrInfo(),
974 CI
.getArgOperand(1));
976 Info
.flags
= MachineMemOperand::MOLoad
| MachineMemOperand::MOStore
;
978 const ConstantInt
*Vol
= dyn_cast
<ConstantInt
>(CI
.getOperand(4));
979 if (!Vol
|| !Vol
->isZero())
980 Info
.flags
|= MachineMemOperand::MOVolatile
;
984 case Intrinsic::amdgcn_global_atomic_fadd
: {
985 Info
.opc
= ISD::INTRINSIC_VOID
;
986 Info
.memVT
= MVT::getVT(CI
.getOperand(0)->getType()
987 ->getPointerElementType());
988 Info
.ptrVal
= CI
.getOperand(0);
990 Info
.flags
= MachineMemOperand::MOLoad
| MachineMemOperand::MOStore
;
994 case Intrinsic::amdgcn_ds_append
:
995 case Intrinsic::amdgcn_ds_consume
: {
996 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
997 Info
.memVT
= MVT::getVT(CI
.getType());
998 Info
.ptrVal
= CI
.getOperand(0);
1000 Info
.flags
= MachineMemOperand::MOLoad
| MachineMemOperand::MOStore
;
1002 const ConstantInt
*Vol
= cast
<ConstantInt
>(CI
.getOperand(1));
1004 Info
.flags
|= MachineMemOperand::MOVolatile
;
1008 case Intrinsic::amdgcn_ds_gws_init
:
1009 case Intrinsic::amdgcn_ds_gws_barrier
:
1010 case Intrinsic::amdgcn_ds_gws_sema_v
:
1011 case Intrinsic::amdgcn_ds_gws_sema_br
:
1012 case Intrinsic::amdgcn_ds_gws_sema_p
:
1013 case Intrinsic::amdgcn_ds_gws_sema_release_all
: {
1014 Info
.opc
= ISD::INTRINSIC_VOID
;
1016 SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
1018 MFI
->getGWSPSV(*MF
.getSubtarget
<GCNSubtarget
>().getInstrInfo());
1020 // This is an abstract access, but we need to specify a type and size.
1021 Info
.memVT
= MVT::i32
;
1025 Info
.flags
= MachineMemOperand::MOStore
;
1026 if (IntrID
== Intrinsic::amdgcn_ds_gws_barrier
)
1027 Info
.flags
= MachineMemOperand::MOLoad
;
1035 bool SITargetLowering::getAddrModeArguments(IntrinsicInst
*II
,
1036 SmallVectorImpl
<Value
*> &Ops
,
1037 Type
*&AccessTy
) const {
1038 switch (II
->getIntrinsicID()) {
1039 case Intrinsic::amdgcn_atomic_inc
:
1040 case Intrinsic::amdgcn_atomic_dec
:
1041 case Intrinsic::amdgcn_ds_ordered_add
:
1042 case Intrinsic::amdgcn_ds_ordered_swap
:
1043 case Intrinsic::amdgcn_ds_fadd
:
1044 case Intrinsic::amdgcn_ds_fmin
:
1045 case Intrinsic::amdgcn_ds_fmax
: {
1046 Value
*Ptr
= II
->getArgOperand(0);
1047 AccessTy
= II
->getType();
1056 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode
&AM
) const {
1057 if (!Subtarget
->hasFlatInstOffsets()) {
1058 // Flat instructions do not have offsets, and only have the register
1060 return AM
.BaseOffs
== 0 && AM
.Scale
== 0;
1063 // GFX9 added a 13-bit signed offset. When using regular flat instructions,
1064 // the sign bit is ignored and is treated as a 12-bit unsigned offset.
1066 // GFX10 shrinked signed offset to 12 bits. When using regular flat
1067 // instructions, the sign bit is also ignored and is treated as 11-bit
1070 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::GFX10
)
1071 return isUInt
<11>(AM
.BaseOffs
) && AM
.Scale
== 0;
1074 return isUInt
<12>(AM
.BaseOffs
) && AM
.Scale
== 0;
1077 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode
&AM
) const {
1078 if (Subtarget
->hasFlatGlobalInsts())
1079 return isInt
<13>(AM
.BaseOffs
) && AM
.Scale
== 0;
1081 if (!Subtarget
->hasAddr64() || Subtarget
->useFlatForGlobal()) {
1082 // Assume the we will use FLAT for all global memory accesses
1084 // FIXME: This assumption is currently wrong. On VI we still use
1085 // MUBUF instructions for the r + i addressing mode. As currently
1086 // implemented, the MUBUF instructions only work on buffer < 4GB.
1087 // It may be possible to support > 4GB buffers with MUBUF instructions,
1088 // by setting the stride value in the resource descriptor which would
1089 // increase the size limit to (stride * 4GB). However, this is risky,
1090 // because it has never been validated.
1091 return isLegalFlatAddressingMode(AM
);
1094 return isLegalMUBUFAddressingMode(AM
);
1097 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode
&AM
) const {
1098 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1099 // additionally can do r + r + i with addr64. 32-bit has more addressing
1100 // mode options. Depending on the resource constant, it can also do
1101 // (i64 r0) + (i32 r1) * (i14 i).
1103 // Private arrays end up using a scratch buffer most of the time, so also
1104 // assume those use MUBUF instructions. Scratch loads / stores are currently
1105 // implemented as mubuf instructions with offen bit set, so slightly
1106 // different than the normal addr64.
1107 if (!isUInt
<12>(AM
.BaseOffs
))
1110 // FIXME: Since we can split immediate into soffset and immediate offset,
1111 // would it make sense to allow any immediate?
1114 case 0: // r + i or just i, depending on HasBaseReg.
1117 return true; // We have r + r or r + i.
1119 if (AM
.HasBaseReg
) {
1120 // Reject 2 * r + r.
1124 // Allow 2 * r as r + r
1125 // Or 2 * r + i is allowed as r + r + i.
1127 default: // Don't allow n * r
1132 bool SITargetLowering::isLegalAddressingMode(const DataLayout
&DL
,
1133 const AddrMode
&AM
, Type
*Ty
,
1134 unsigned AS
, Instruction
*I
) const {
1135 // No global is ever allowed as a base.
1139 if (AS
== AMDGPUAS::GLOBAL_ADDRESS
)
1140 return isLegalGlobalAddressingMode(AM
);
1142 if (AS
== AMDGPUAS::CONSTANT_ADDRESS
||
1143 AS
== AMDGPUAS::CONSTANT_ADDRESS_32BIT
||
1144 AS
== AMDGPUAS::BUFFER_FAT_POINTER
) {
1145 // If the offset isn't a multiple of 4, it probably isn't going to be
1146 // correctly aligned.
1147 // FIXME: Can we get the real alignment here?
1148 if (AM
.BaseOffs
% 4 != 0)
1149 return isLegalMUBUFAddressingMode(AM
);
1151 // There are no SMRD extloads, so if we have to do a small type access we
1152 // will use a MUBUF load.
1153 // FIXME?: We also need to do this if unaligned, but we don't know the
1155 if (Ty
->isSized() && DL
.getTypeStoreSize(Ty
) < 4)
1156 return isLegalGlobalAddressingMode(AM
);
1158 if (Subtarget
->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS
) {
1159 // SMRD instructions have an 8-bit, dword offset on SI.
1160 if (!isUInt
<8>(AM
.BaseOffs
/ 4))
1162 } else if (Subtarget
->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS
) {
1163 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1164 // in 8-bits, it can use a smaller encoding.
1165 if (!isUInt
<32>(AM
.BaseOffs
/ 4))
1167 } else if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
) {
1168 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1169 if (!isUInt
<20>(AM
.BaseOffs
))
1172 llvm_unreachable("unhandled generation");
1174 if (AM
.Scale
== 0) // r + i or just i, depending on HasBaseReg.
1177 if (AM
.Scale
== 1 && AM
.HasBaseReg
)
1182 } else if (AS
== AMDGPUAS::PRIVATE_ADDRESS
) {
1183 return isLegalMUBUFAddressingMode(AM
);
1184 } else if (AS
== AMDGPUAS::LOCAL_ADDRESS
||
1185 AS
== AMDGPUAS::REGION_ADDRESS
) {
1186 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1188 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1189 // an 8-bit dword offset but we don't know the alignment here.
1190 if (!isUInt
<16>(AM
.BaseOffs
))
1193 if (AM
.Scale
== 0) // r + i or just i, depending on HasBaseReg.
1196 if (AM
.Scale
== 1 && AM
.HasBaseReg
)
1200 } else if (AS
== AMDGPUAS::FLAT_ADDRESS
||
1201 AS
== AMDGPUAS::UNKNOWN_ADDRESS_SPACE
) {
1202 // For an unknown address space, this usually means that this is for some
1203 // reason being used for pure arithmetic, and not based on some addressing
1204 // computation. We don't have instructions that compute pointers with any
1205 // addressing modes, so treat them as having no offset like flat
1207 return isLegalFlatAddressingMode(AM
);
1209 llvm_unreachable("unhandled address space");
1213 bool SITargetLowering::canMergeStoresTo(unsigned AS
, EVT MemVT
,
1214 const SelectionDAG
&DAG
) const {
1215 if (AS
== AMDGPUAS::GLOBAL_ADDRESS
|| AS
== AMDGPUAS::FLAT_ADDRESS
) {
1216 return (MemVT
.getSizeInBits() <= 4 * 32);
1217 } else if (AS
== AMDGPUAS::PRIVATE_ADDRESS
) {
1218 unsigned MaxPrivateBits
= 8 * getSubtarget()->getMaxPrivateElementSize();
1219 return (MemVT
.getSizeInBits() <= MaxPrivateBits
);
1220 } else if (AS
== AMDGPUAS::LOCAL_ADDRESS
|| AS
== AMDGPUAS::REGION_ADDRESS
) {
1221 return (MemVT
.getSizeInBits() <= 2 * 32);
1226 bool SITargetLowering::allowsMisalignedMemoryAccesses(
1227 EVT VT
, unsigned AddrSpace
, unsigned Align
, MachineMemOperand::Flags Flags
,
1228 bool *IsFast
) const {
1232 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1233 // which isn't a simple VT.
1234 // Until MVT is extended to handle this, simply check for the size and
1235 // rely on the condition below: allow accesses if the size is a multiple of 4.
1236 if (VT
== MVT::Other
|| (VT
!= MVT::Other
&& VT
.getSizeInBits() > 1024 &&
1237 VT
.getStoreSize() > 16)) {
1241 if (AddrSpace
== AMDGPUAS::LOCAL_ADDRESS
||
1242 AddrSpace
== AMDGPUAS::REGION_ADDRESS
) {
1243 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1244 // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1245 // with adjacent offsets.
1246 bool AlignedBy4
= (Align
% 4 == 0);
1248 *IsFast
= AlignedBy4
;
1253 // FIXME: We have to be conservative here and assume that flat operations
1254 // will access scratch. If we had access to the IR function, then we
1255 // could determine if any private memory was used in the function.
1256 if (!Subtarget
->hasUnalignedScratchAccess() &&
1257 (AddrSpace
== AMDGPUAS::PRIVATE_ADDRESS
||
1258 AddrSpace
== AMDGPUAS::FLAT_ADDRESS
)) {
1259 bool AlignedBy4
= Align
>= 4;
1261 *IsFast
= AlignedBy4
;
1266 if (Subtarget
->hasUnalignedBufferAccess()) {
1267 // If we have an uniform constant load, it still requires using a slow
1268 // buffer instruction if unaligned.
1270 *IsFast
= (AddrSpace
== AMDGPUAS::CONSTANT_ADDRESS
||
1271 AddrSpace
== AMDGPUAS::CONSTANT_ADDRESS_32BIT
) ?
1272 (Align
% 4 == 0) : true;
1278 // Smaller than dword value must be aligned.
1279 if (VT
.bitsLT(MVT::i32
))
1282 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1283 // byte-address are ignored, thus forcing Dword alignment.
1284 // This applies to private, global, and constant memory.
1288 return VT
.bitsGT(MVT::i32
) && Align
% 4 == 0;
1291 EVT
SITargetLowering::getOptimalMemOpType(
1292 uint64_t Size
, unsigned DstAlign
, unsigned SrcAlign
, bool IsMemset
,
1293 bool ZeroMemset
, bool MemcpyStrSrc
,
1294 const AttributeList
&FuncAttributes
) const {
1295 // FIXME: Should account for address space here.
1297 // The default fallback uses the private pointer size as a guess for a type to
1298 // use. Make sure we switch these to 64-bit accesses.
1300 if (Size
>= 16 && DstAlign
>= 4) // XXX: Should only do for global
1303 if (Size
>= 8 && DstAlign
>= 4)
1310 static bool isFlatGlobalAddrSpace(unsigned AS
) {
1311 return AS
== AMDGPUAS::GLOBAL_ADDRESS
||
1312 AS
== AMDGPUAS::FLAT_ADDRESS
||
1313 AS
== AMDGPUAS::CONSTANT_ADDRESS
||
1314 AS
> AMDGPUAS::MAX_AMDGPU_ADDRESS
;
1317 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS
,
1318 unsigned DestAS
) const {
1319 return isFlatGlobalAddrSpace(SrcAS
) && isFlatGlobalAddrSpace(DestAS
);
1322 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode
*N
) const {
1323 const MemSDNode
*MemNode
= cast
<MemSDNode
>(N
);
1324 const Value
*Ptr
= MemNode
->getMemOperand()->getValue();
1325 const Instruction
*I
= dyn_cast_or_null
<Instruction
>(Ptr
);
1326 return I
&& I
->getMetadata("amdgpu.noclobber");
1329 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS
,
1330 unsigned DestAS
) const {
1331 // Flat -> private/local is a simple truncate.
1332 // Flat -> global is no-op
1333 if (SrcAS
== AMDGPUAS::FLAT_ADDRESS
)
1336 return isNoopAddrSpaceCast(SrcAS
, DestAS
);
1339 bool SITargetLowering::isMemOpUniform(const SDNode
*N
) const {
1340 const MemSDNode
*MemNode
= cast
<MemSDNode
>(N
);
1342 return AMDGPUInstrInfo::isUniformMMO(MemNode
->getMemOperand());
1345 TargetLoweringBase::LegalizeTypeAction
1346 SITargetLowering::getPreferredVectorAction(MVT VT
) const {
1347 if (VT
.getVectorNumElements() != 1 && VT
.getScalarType().bitsLE(MVT::i16
))
1348 return TypeSplitVector
;
1350 return TargetLoweringBase::getPreferredVectorAction(VT
);
1353 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt
&Imm
,
1355 // FIXME: Could be smarter if called for vector constants.
1359 bool SITargetLowering::isTypeDesirableForOp(unsigned Op
, EVT VT
) const {
1360 if (Subtarget
->has16BitInsts() && VT
== MVT::i16
) {
1365 // These operations are done with 32-bit instructions anyway.
1370 // TODO: Extensions?
1377 // SimplifySetCC uses this function to determine whether or not it should
1378 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1379 if (VT
== MVT::i1
&& Op
== ISD::SETCC
)
1382 return TargetLowering::isTypeDesirableForOp(Op
, VT
);
1385 SDValue
SITargetLowering::lowerKernArgParameterPtr(SelectionDAG
&DAG
,
1388 uint64_t Offset
) const {
1389 const DataLayout
&DL
= DAG
.getDataLayout();
1390 MachineFunction
&MF
= DAG
.getMachineFunction();
1391 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
1393 const ArgDescriptor
*InputPtrReg
;
1394 const TargetRegisterClass
*RC
;
1396 std::tie(InputPtrReg
, RC
)
1397 = Info
->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
);
1399 MachineRegisterInfo
&MRI
= DAG
.getMachineFunction().getRegInfo();
1400 MVT PtrVT
= getPointerTy(DL
, AMDGPUAS::CONSTANT_ADDRESS
);
1401 SDValue BasePtr
= DAG
.getCopyFromReg(Chain
, SL
,
1402 MRI
.getLiveInVirtReg(InputPtrReg
->getRegister()), PtrVT
);
1404 return DAG
.getObjectPtrOffset(SL
, BasePtr
, Offset
);
1407 SDValue
SITargetLowering::getImplicitArgPtr(SelectionDAG
&DAG
,
1408 const SDLoc
&SL
) const {
1409 uint64_t Offset
= getImplicitParameterOffset(DAG
.getMachineFunction(),
1411 return lowerKernArgParameterPtr(DAG
, SL
, DAG
.getEntryNode(), Offset
);
1414 SDValue
SITargetLowering::convertArgType(SelectionDAG
&DAG
, EVT VT
, EVT MemVT
,
1415 const SDLoc
&SL
, SDValue Val
,
1417 const ISD::InputArg
*Arg
) const {
1418 // First, if it is a widened vector, narrow it.
1419 if (VT
.isVector() &&
1420 VT
.getVectorNumElements() != MemVT
.getVectorNumElements()) {
1422 EVT::getVectorVT(*DAG
.getContext(), MemVT
.getVectorElementType(),
1423 VT
.getVectorNumElements());
1424 Val
= DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, SL
, NarrowedVT
, Val
,
1425 DAG
.getConstant(0, SL
, MVT::i32
));
1428 // Then convert the vector elements or scalar value.
1429 if (Arg
&& (Arg
->Flags
.isSExt() || Arg
->Flags
.isZExt()) &&
1431 unsigned Opc
= Arg
->Flags
.isZExt() ? ISD::AssertZext
: ISD::AssertSext
;
1432 Val
= DAG
.getNode(Opc
, SL
, MemVT
, Val
, DAG
.getValueType(VT
));
1435 if (MemVT
.isFloatingPoint())
1436 Val
= getFPExtOrFPTrunc(DAG
, Val
, SL
, VT
);
1438 Val
= DAG
.getSExtOrTrunc(Val
, SL
, VT
);
1440 Val
= DAG
.getZExtOrTrunc(Val
, SL
, VT
);
1445 SDValue
SITargetLowering::lowerKernargMemParameter(
1446 SelectionDAG
&DAG
, EVT VT
, EVT MemVT
,
1447 const SDLoc
&SL
, SDValue Chain
,
1448 uint64_t Offset
, unsigned Align
, bool Signed
,
1449 const ISD::InputArg
*Arg
) const {
1450 Type
*Ty
= MemVT
.getTypeForEVT(*DAG
.getContext());
1451 PointerType
*PtrTy
= PointerType::get(Ty
, AMDGPUAS::CONSTANT_ADDRESS
);
1452 MachinePointerInfo
PtrInfo(UndefValue::get(PtrTy
));
1454 // Try to avoid using an extload by loading earlier than the argument address,
1455 // and extracting the relevant bits. The load should hopefully be merged with
1456 // the previous argument.
1457 if (MemVT
.getStoreSize() < 4 && Align
< 4) {
1458 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1459 int64_t AlignDownOffset
= alignDown(Offset
, 4);
1460 int64_t OffsetDiff
= Offset
- AlignDownOffset
;
1462 EVT IntVT
= MemVT
.changeTypeToInteger();
1464 // TODO: If we passed in the base kernel offset we could have a better
1465 // alignment than 4, but we don't really need it.
1466 SDValue Ptr
= lowerKernArgParameterPtr(DAG
, SL
, Chain
, AlignDownOffset
);
1467 SDValue Load
= DAG
.getLoad(MVT::i32
, SL
, Chain
, Ptr
, PtrInfo
, 4,
1468 MachineMemOperand::MODereferenceable
|
1469 MachineMemOperand::MOInvariant
);
1471 SDValue ShiftAmt
= DAG
.getConstant(OffsetDiff
* 8, SL
, MVT::i32
);
1472 SDValue Extract
= DAG
.getNode(ISD::SRL
, SL
, MVT::i32
, Load
, ShiftAmt
);
1474 SDValue ArgVal
= DAG
.getNode(ISD::TRUNCATE
, SL
, IntVT
, Extract
);
1475 ArgVal
= DAG
.getNode(ISD::BITCAST
, SL
, MemVT
, ArgVal
);
1476 ArgVal
= convertArgType(DAG
, VT
, MemVT
, SL
, ArgVal
, Signed
, Arg
);
1479 return DAG
.getMergeValues({ ArgVal
, Load
.getValue(1) }, SL
);
1482 SDValue Ptr
= lowerKernArgParameterPtr(DAG
, SL
, Chain
, Offset
);
1483 SDValue Load
= DAG
.getLoad(MemVT
, SL
, Chain
, Ptr
, PtrInfo
, Align
,
1484 MachineMemOperand::MODereferenceable
|
1485 MachineMemOperand::MOInvariant
);
1487 SDValue Val
= convertArgType(DAG
, VT
, MemVT
, SL
, Load
, Signed
, Arg
);
1488 return DAG
.getMergeValues({ Val
, Load
.getValue(1) }, SL
);
1491 SDValue
SITargetLowering::lowerStackParameter(SelectionDAG
&DAG
, CCValAssign
&VA
,
1492 const SDLoc
&SL
, SDValue Chain
,
1493 const ISD::InputArg
&Arg
) const {
1494 MachineFunction
&MF
= DAG
.getMachineFunction();
1495 MachineFrameInfo
&MFI
= MF
.getFrameInfo();
1497 if (Arg
.Flags
.isByVal()) {
1498 unsigned Size
= Arg
.Flags
.getByValSize();
1499 int FrameIdx
= MFI
.CreateFixedObject(Size
, VA
.getLocMemOffset(), false);
1500 return DAG
.getFrameIndex(FrameIdx
, MVT::i32
);
1503 unsigned ArgOffset
= VA
.getLocMemOffset();
1504 unsigned ArgSize
= VA
.getValVT().getStoreSize();
1506 int FI
= MFI
.CreateFixedObject(ArgSize
, ArgOffset
, true);
1508 // Create load nodes to retrieve arguments from the stack.
1509 SDValue FIN
= DAG
.getFrameIndex(FI
, MVT::i32
);
1512 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
1513 ISD::LoadExtType ExtType
= ISD::NON_EXTLOAD
;
1514 MVT MemVT
= VA
.getValVT();
1516 switch (VA
.getLocInfo()) {
1519 case CCValAssign::BCvt
:
1520 MemVT
= VA
.getLocVT();
1522 case CCValAssign::SExt
:
1523 ExtType
= ISD::SEXTLOAD
;
1525 case CCValAssign::ZExt
:
1526 ExtType
= ISD::ZEXTLOAD
;
1528 case CCValAssign::AExt
:
1529 ExtType
= ISD::EXTLOAD
;
1533 ArgValue
= DAG
.getExtLoad(
1534 ExtType
, SL
, VA
.getLocVT(), Chain
, FIN
,
1535 MachinePointerInfo::getFixedStack(DAG
.getMachineFunction(), FI
),
1540 SDValue
SITargetLowering::getPreloadedValue(SelectionDAG
&DAG
,
1541 const SIMachineFunctionInfo
&MFI
,
1543 AMDGPUFunctionArgInfo::PreloadedValue PVID
) const {
1544 const ArgDescriptor
*Reg
;
1545 const TargetRegisterClass
*RC
;
1547 std::tie(Reg
, RC
) = MFI
.getPreloadedValue(PVID
);
1548 return CreateLiveInRegister(DAG
, RC
, Reg
->getRegister(), VT
);
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
                                   CallingConv::ID CallConv,
                                   ArrayRef<ISD::InputArg> Ins,
                                   BitVector &Skipped,
                                   FunctionType *FType,
                                   SIMachineFunctionInfo *Info) {
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
    const ISD::InputArg *Arg = &Ins[I];

    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
           "vector type argument should have been split");

    // First check if it's a PS input addr.
    if (CallConv == CallingConv::AMDGPU_PS &&
        !Arg->Flags.isInReg() && PSInputNum <= 15) {
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

      // Inconveniently only the first part of the split is marked as isSplit,
      // so skip to the end. We only want to increment PSInputNum once for the
      // entire split argument.
      if (Arg->Flags.isSplit()) {
        while (!Arg->Flags.isSplitEnd()) {
          assert((!Arg->VT.isVector() ||
                  Arg->VT.getScalarSizeInBits() == 16) &&
                 "unexpected vector split in ps argument type");
          if (!SkipArg)
            Splits.push_back(*Arg);
          Arg = &Ins[++I];
        }
      }

      if (SkipArg) {
        // We can safely skip PS inputs.
        Skipped.set(Arg->getOrigArgIndex());
        ++PSInputNum;
        continue;
      }

      Info->markPSInputAllocated(PSInputNum);
      if (Arg->Used)
        Info->markPSInputEnabled(PSInputNum);

      ++PSInputNum;
    }

    Splits.push_back(*Arg);
  }
}
// Allocate special inputs passed in VGPRs.
void SITargetLowering::allocateSpecialEntryInputVGPRs(
    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
    SIMachineFunctionInfo &Info) const {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (Info.hasWorkItemIDX()) {
    Register Reg = AMDGPU::VGPR0;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
  }

  if (Info.hasWorkItemIDY()) {
    Register Reg = AMDGPU::VGPR1;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
  }

  if (Info.hasWorkItemIDZ()) {
    Register Reg = AMDGPU::VGPR2;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
  }
}
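// Note (illustrative, not from the original source): for kernels the three
// workitem IDs arrive unpacked in separate VGPRs (v0 = x, v1 = y, v2 = z),
// in contrast to the packed single-VGPR form used for callable functions in
// allocateSpecialInputVGPRs below.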
// Try to allocate a VGPR at the end of the argument list, or if no argument
// VGPRs are left allocating a stack slot.
// If \p Mask is given it indicates bitfield position in the register.
// If \p Arg is given use it with new \p Mask instead of allocating new.
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
                                         ArgDescriptor Arg = ArgDescriptor()) {
  if (Arg.isSet())
    return ArgDescriptor::createArg(Arg, Mask);

  ArrayRef<MCPhysReg> ArgVGPRs
    = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
  if (RegIdx == ArgVGPRs.size()) {
    // Spill to stack required.
    int64_t Offset = CCInfo.AllocateStack(4, 4);

    return ArgDescriptor::createStack(Offset, Mask);
  }

  unsigned Reg = ArgVGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  MachineFunction &MF = CCInfo.getMachineFunction();
  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
  return ArgDescriptor::createRegister(Reg, Mask);
}
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
                                             const TargetRegisterClass *RC,
                                             unsigned NumArgRegs) {
  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
  if (RegIdx == ArgSGPRs.size())
    report_fatal_error("ran out of SGPRs for arguments");

  unsigned Reg = ArgSGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  MachineFunction &MF = CCInfo.getMachineFunction();
  MF.addLiveIn(Reg, RC);
  return ArgDescriptor::createRegister(Reg);
}

static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
}

static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
}
void SITargetLowering::allocateSpecialInputVGPRs(
    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
    SIMachineFunctionInfo &Info) const {
  const unsigned Mask = 0x3ff;
  ArgDescriptor Arg;

  if (Info.hasWorkItemIDX()) {
    Arg = allocateVGPR32Input(CCInfo, Mask);
    Info.setWorkItemIDX(Arg);
  }

  if (Info.hasWorkItemIDY()) {
    Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
    Info.setWorkItemIDY(Arg);
  }

  if (Info.hasWorkItemIDZ())
    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
}
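// Illustration (inferred from the 0x3ff masks above, not from the original
// source): for callable functions the three workitem IDs share one VGPR:
//   bits [9:0]   workitem id X
//   bits [19:10] workitem id Y
//   bits [29:20] workitem id Z
// so a callee can recover, e.g., Y as (Reg >> 10) & 0x3ff.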
void SITargetLowering::allocateSpecialInputSGPRs(
  CCState &CCInfo,
  MachineFunction &MF,
  const SIRegisterInfo &TRI,
  SIMachineFunctionInfo &Info) const {
  auto &ArgInfo = Info.getArgInfo();

  // TODO: Unify handling with private memory pointers.

  if (Info.hasDispatchPtr())
    ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);

  if (Info.hasQueuePtr())
    ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);

  if (Info.hasKernargSegmentPtr())
    ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);

  if (Info.hasDispatchID())
    ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);

  // flat_scratch_init is not applicable for non-kernel functions.

  if (Info.hasWorkGroupIDX())
    ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);

  if (Info.hasWorkGroupIDY())
    ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);

  if (Info.hasWorkGroupIDZ())
    ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);

  if (Info.hasImplicitArgPtr())
    ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
}
// Allocate special inputs passed in user SGPRs.
void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
                                            MachineFunction &MF,
                                            const SIRegisterInfo &TRI,
                                            SIMachineFunctionInfo &Info) const {
  if (Info.hasImplicitBufferPtr()) {
    unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    CCInfo.AllocateReg(InputPtrReg);

    Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
    MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  }

  if (Info.hasDispatchID()) {
    unsigned DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}
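// Rough layout sketch (an assumption for illustration only; the actual
// assignment depends on which inputs are enabled): with everything enabled,
// the user SGPRs are handed out in the order above, e.g.
//   s[0:3] private segment buffer, s[4:5] dispatch ptr,
//   s[6:7] queue ptr, s[8:9] kernarg segment ptr, ...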
// Allocate special input registers that are initialized per-wave.
void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
                                           MachineFunction &MF,
                                           SIMachineFunctionInfo &Info,
                                           CallingConv::ID CallConv,
                                           bool IsShader) const {
  if (Info.hasWorkGroupIDX()) {
    unsigned Reg = Info.addWorkGroupIDX();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDY()) {
    unsigned Reg = Info.addWorkGroupIDY();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDZ()) {
    unsigned Reg = Info.addWorkGroupIDZ();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupInfo()) {
    unsigned Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (IsShader) {
      PrivateSegmentWaveByteOffsetReg =
        Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      // This is true if the scratch wave byte offset doesn't have a fixed
      // location.
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      }
    } else {
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
    }

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }
}
static void reservePrivateMemoryRegs(const TargetMachine &TM,
                                     MachineFunction &MF,
                                     const SIRegisterInfo &TRI,
                                     SIMachineFunctionInfo &Info) {
  // Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool HasStackObjects = MFI.hasStackObjects();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

  // Everything live out of a block is spilled with fast regalloc, so it's
  // almost certain that spilling will be required.
  if (TM.getOptLevel() == CodeGenOpt::None)
    HasStackObjects = true;

  // For now assume stack access is needed in any callee functions, so we need
  // the scratch registers to pass in.
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
    // If we have stack objects, we unquestionably need the private buffer
    // resource. For the Code Object V2 ABI, this will be the first 4 user
    // SGPR inputs. We can reserve those and use them directly.

    unsigned PrivateSegmentBufferReg =
      Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    Info.setScratchRSrcReg(PrivateSegmentBufferReg);
  } else {
    unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
    // We tentatively reserve the last registers (skipping the last registers
    // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
    // we'll replace these with the ones immediately after those which were
    // really allocated. In the prologue copies will be inserted from the
    // argument to these reserved registers.

    // Without HSA, relocations are used for the scratch pointer and the
    // buffer resource setup is always inserted in the prologue. Scratch wave
    // offset is still in an input SGPR.
    Info.setScratchRSrcReg(ReservedBufferReg);
  }

  // hasFP should be accurate for kernels even before the frame is finalized.
  if (ST.getFrameLowering()->hasFP(MF)) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    // Try to use s32 as the SP, but move it if it would interfere with input
    // arguments. This won't work with calls though.
    //
    // FIXME: Move SP to avoid any possible inputs, or find a way to spill
    // input registers.
    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
    } else {
      assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));

      if (MFI.hasCalls())
        report_fatal_error("call in graphics shader with too many input SGPRs");

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);
          break;
        }
      }

      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
        report_fatal_error("failed to find register for SP");
    }

    if (MFI.hasCalls()) {
      Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
      Info.setFrameOffsetReg(AMDGPU::SGPR33);
    } else {
      unsigned ReservedOffsetReg =
        TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
      Info.setFrameOffsetReg(ReservedOffsetReg);
    }
  } else if (RequiresStackAccess) {
    assert(!MFI.hasCalls());
    // We know there are accesses and they will be done relative to SP, so
    // just pin it to the input.
    //
    // FIXME: Should not do this if inline asm is reading/writing these
    // registers.
    unsigned PreloadedSP = Info.getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

    Info.setStackPtrOffsetReg(PreloadedSP);
    Info.setScratchWaveOffsetReg(PreloadedSP);
    Info.setFrameOffsetReg(PreloadedSP);
  } else {
    assert(!MFI.hasCalls());

    // There may not be stack access at all. There may still be spills, or
    // access of a constant pointer (in which cases an extra copy will be
    // emitted in the prolog).
    unsigned ReservedOffsetReg
      = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    Info.setStackPtrOffsetReg(ReservedOffsetReg);
    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    Info.setFrameOffsetReg(ReservedOffsetReg);
  }
}
bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  return !Info->isEntryFunction();
}
void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
}
void SITargetLowering::insertCopiesSplitCSR(
  MachineBasicBlock *Entry,
  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
      .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
        .addReg(NewVR);
  }
}
2004 SDValue
SITargetLowering::LowerFormalArguments(
2005 SDValue Chain
, CallingConv::ID CallConv
, bool isVarArg
,
2006 const SmallVectorImpl
<ISD::InputArg
> &Ins
, const SDLoc
&DL
,
2007 SelectionDAG
&DAG
, SmallVectorImpl
<SDValue
> &InVals
) const {
2008 const SIRegisterInfo
*TRI
= getSubtarget()->getRegisterInfo();
2010 MachineFunction
&MF
= DAG
.getMachineFunction();
2011 const Function
&Fn
= MF
.getFunction();
2012 FunctionType
*FType
= MF
.getFunction().getFunctionType();
2013 SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
2015 if (Subtarget
->isAmdHsaOS() && AMDGPU::isShader(CallConv
)) {
2016 DiagnosticInfoUnsupported
NoGraphicsHSA(
2017 Fn
, "unsupported non-compute shaders with HSA", DL
.getDebugLoc());
2018 DAG
.getContext()->diagnose(NoGraphicsHSA
);
2019 return DAG
.getEntryNode();
2022 SmallVector
<ISD::InputArg
, 16> Splits
;
2023 SmallVector
<CCValAssign
, 16> ArgLocs
;
2024 BitVector
Skipped(Ins
.size());
2025 CCState
CCInfo(CallConv
, isVarArg
, DAG
.getMachineFunction(), ArgLocs
,
2028 bool IsShader
= AMDGPU::isShader(CallConv
);
2029 bool IsKernel
= AMDGPU::isKernel(CallConv
);
2030 bool IsEntryFunc
= AMDGPU::isEntryFunctionCC(CallConv
);
2033 processShaderInputArgs(Splits
, CallConv
, Ins
, Skipped
, FType
, Info
);
  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  //   - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  //   - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //     enabled too.
2048 if (CallConv
== CallingConv::AMDGPU_PS
) {
2049 if ((Info
->getPSInputAddr() & 0x7F) == 0 ||
2050 ((Info
->getPSInputAddr() & 0xF) == 0 &&
2051 Info
->isPSInputAllocated(11))) {
2052 CCInfo
.AllocateReg(AMDGPU::VGPR0
);
2053 CCInfo
.AllocateReg(AMDGPU::VGPR1
);
2054 Info
->markPSInputAllocated(0);
2055 Info
->markPSInputEnabled(0);
2057 if (Subtarget
->isAmdPalOS()) {
2058 // For isAmdPalOS, the user does not enable some bits after compilation
2059 // based on run-time states; the register values being generated here are
2060 // the final ones set in hardware. Therefore we need to apply the
2061 // workaround to PSInputAddr and PSInputEnable together. (The case where
2062 // a bit is set in PSInputAddr but not PSInputEnable is where the
2063 // frontend set up an input arg for a particular interpolation mode, but
2064 // nothing uses that input arg. Really we should have an earlier pass
2065 // that removes such an arg.)
2066 unsigned PsInputBits
= Info
->getPSInputAddr() & Info
->getPSInputEnable();
2067 if ((PsInputBits
& 0x7F) == 0 ||
2068 ((PsInputBits
& 0xF) == 0 &&
2069 (PsInputBits
>> 11 & 1)))
2070 Info
->markPSInputEnabled(
2071 countTrailingZeros(Info
->getPSInputAddr(), ZB_Undefined
));
2075 assert(!Info
->hasDispatchPtr() &&
2076 !Info
->hasKernargSegmentPtr() && !Info
->hasFlatScratchInit() &&
2077 !Info
->hasWorkGroupIDX() && !Info
->hasWorkGroupIDY() &&
2078 !Info
->hasWorkGroupIDZ() && !Info
->hasWorkGroupInfo() &&
2079 !Info
->hasWorkItemIDX() && !Info
->hasWorkItemIDY() &&
2080 !Info
->hasWorkItemIDZ());
2081 } else if (IsKernel
) {
2082 assert(Info
->hasWorkGroupIDX() && Info
->hasWorkItemIDX());
2084 Splits
.append(Ins
.begin(), Ins
.end());
2088 allocateSpecialEntryInputVGPRs(CCInfo
, MF
, *TRI
, *Info
);
2089 allocateHSAUserSGPRs(CCInfo
, MF
, *TRI
, *Info
);
2093 analyzeFormalArgumentsCompute(CCInfo
, Ins
);
2095 CCAssignFn
*AssignFn
= CCAssignFnForCall(CallConv
, isVarArg
);
2096 CCInfo
.AnalyzeFormalArguments(Splits
, AssignFn
);
2099 SmallVector
<SDValue
, 16> Chains
;
2101 // FIXME: This is the minimum kernel argument alignment. We should improve
2102 // this to the maximum alignment of the arguments.
2104 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2106 const unsigned KernelArgBaseAlign
= 16;
2108 for (unsigned i
= 0, e
= Ins
.size(), ArgIdx
= 0; i
!= e
; ++i
) {
2109 const ISD::InputArg
&Arg
= Ins
[i
];
2110 if (Arg
.isOrigArg() && Skipped
[Arg
.getOrigArgIndex()]) {
2111 InVals
.push_back(DAG
.getUNDEF(Arg
.VT
));
2115 CCValAssign
&VA
= ArgLocs
[ArgIdx
++];
2116 MVT VT
= VA
.getLocVT();
2118 if (IsEntryFunc
&& VA
.isMemLoc()) {
2120 EVT MemVT
= VA
.getLocVT();
2122 const uint64_t Offset
= VA
.getLocMemOffset();
2123 unsigned Align
= MinAlign(KernelArgBaseAlign
, Offset
);
2125 SDValue Arg
= lowerKernargMemParameter(
2126 DAG
, VT
, MemVT
, DL
, Chain
, Offset
, Align
, Ins
[i
].Flags
.isSExt(), &Ins
[i
]);
2127 Chains
.push_back(Arg
.getValue(1));
2130 dyn_cast
<PointerType
>(FType
->getParamType(Ins
[i
].getOrigArgIndex()));
2131 if (Subtarget
->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS
&&
2132 ParamTy
&& (ParamTy
->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS
||
2133 ParamTy
->getAddressSpace() == AMDGPUAS::REGION_ADDRESS
)) {
2134 // On SI local pointers are just offsets into LDS, so they are always
2135 // less than 16-bits. On CI and newer they could potentially be
2136 // real pointers, so we can't guarantee their size.
2137 Arg
= DAG
.getNode(ISD::AssertZext
, DL
, Arg
.getValueType(), Arg
,
2138 DAG
.getValueType(MVT::i16
));
2141 InVals
.push_back(Arg
);
2143 } else if (!IsEntryFunc
&& VA
.isMemLoc()) {
2144 SDValue Val
= lowerStackParameter(DAG
, VA
, DL
, Chain
, Arg
);
2145 InVals
.push_back(Val
);
2146 if (!Arg
.Flags
.isByVal())
2147 Chains
.push_back(Val
.getValue(1));
2151 assert(VA
.isRegLoc() && "Parameter must be in a register!");
2153 unsigned Reg
= VA
.getLocReg();
2154 const TargetRegisterClass
*RC
= TRI
->getMinimalPhysRegClass(Reg
, VT
);
2155 EVT ValVT
= VA
.getValVT();
2157 Reg
= MF
.addLiveIn(Reg
, RC
);
2158 SDValue Val
= DAG
.getCopyFromReg(Chain
, DL
, Reg
, VT
);
2160 if (Arg
.Flags
.isSRet()) {
2161 // The return object should be reasonably addressable.
2163 // FIXME: This helps when the return is a real sret. If it is a
2164 // automatically inserted sret (i.e. CanLowerReturn returns false), an
2165 // extra copy is inserted in SelectionDAGBuilder which obscures this.
2167 = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
2168 Val
= DAG
.getNode(ISD::AssertZext
, DL
, VT
, Val
,
2169 DAG
.getValueType(EVT::getIntegerVT(*DAG
.getContext(), NumBits
)));
2172 // If this is an 8 or 16-bit value, it is really passed promoted
2173 // to 32 bits. Insert an assert[sz]ext to capture this, then
2174 // truncate to the right size.
2175 switch (VA
.getLocInfo()) {
2176 case CCValAssign::Full
:
2178 case CCValAssign::BCvt
:
2179 Val
= DAG
.getNode(ISD::BITCAST
, DL
, ValVT
, Val
);
2181 case CCValAssign::SExt
:
2182 Val
= DAG
.getNode(ISD::AssertSext
, DL
, VT
, Val
,
2183 DAG
.getValueType(ValVT
));
2184 Val
= DAG
.getNode(ISD::TRUNCATE
, DL
, ValVT
, Val
);
2186 case CCValAssign::ZExt
:
2187 Val
= DAG
.getNode(ISD::AssertZext
, DL
, VT
, Val
,
2188 DAG
.getValueType(ValVT
));
2189 Val
= DAG
.getNode(ISD::TRUNCATE
, DL
, ValVT
, Val
);
2191 case CCValAssign::AExt
:
2192 Val
= DAG
.getNode(ISD::TRUNCATE
, DL
, ValVT
, Val
);
2195 llvm_unreachable("Unknown loc info!");
2198 InVals
.push_back(Val
);
2202 // Special inputs come after user arguments.
2203 allocateSpecialInputVGPRs(CCInfo
, MF
, *TRI
, *Info
);
2206 // Start adding system SGPRs.
2208 allocateSystemSGPRs(CCInfo
, MF
, *Info
, CallConv
, IsShader
);
2210 CCInfo
.AllocateReg(Info
->getScratchRSrcReg());
2211 CCInfo
.AllocateReg(Info
->getScratchWaveOffsetReg());
2212 CCInfo
.AllocateReg(Info
->getFrameOffsetReg());
2213 allocateSpecialInputSGPRs(CCInfo
, MF
, *TRI
, *Info
);
2216 auto &ArgUsageInfo
=
2217 DAG
.getPass()->getAnalysis
<AMDGPUArgumentUsageInfo
>();
2218 ArgUsageInfo
.setFuncArgInfo(Fn
, Info
->getArgInfo());
2220 unsigned StackArgSize
= CCInfo
.getNextStackOffset();
2221 Info
->setBytesInStackArgArea(StackArgSize
);
2223 return Chains
.empty() ? Chain
:
2224 DAG
.getNode(ISD::TokenFactor
, DL
, MVT::Other
, Chains
);
// TODO: If return values can't fit in registers, we should return as many as
// possible in registers before passing on stack.
bool SITargetLowering::CanLowerReturn(
  CallingConv::ID CallConv,
  MachineFunction &MF, bool IsVarArg,
  const SmallVectorImpl<ISD::OutputArg> &Outs,
  LLVMContext &Context) const {
  // Replacing returns with sret/stack usage doesn't make sense for shaders.
  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
  // for shaders. Vector types should be explicitly handled by CC.
  if (AMDGPU::isEntryFunctionCC(CallConv))
    return true;

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
}
2246 SITargetLowering::LowerReturn(SDValue Chain
, CallingConv::ID CallConv
,
2248 const SmallVectorImpl
<ISD::OutputArg
> &Outs
,
2249 const SmallVectorImpl
<SDValue
> &OutVals
,
2250 const SDLoc
&DL
, SelectionDAG
&DAG
) const {
2251 MachineFunction
&MF
= DAG
.getMachineFunction();
2252 SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
2254 if (AMDGPU::isKernel(CallConv
)) {
2255 return AMDGPUTargetLowering::LowerReturn(Chain
, CallConv
, isVarArg
, Outs
,
2259 bool IsShader
= AMDGPU::isShader(CallConv
);
2261 Info
->setIfReturnsVoid(Outs
.empty());
2262 bool IsWaveEnd
= Info
->returnsVoid() && IsShader
;
2264 // CCValAssign - represent the assignment of the return value to a location.
2265 SmallVector
<CCValAssign
, 48> RVLocs
;
2266 SmallVector
<ISD::OutputArg
, 48> Splits
;
2268 // CCState - Info about the registers and stack slots.
2269 CCState
CCInfo(CallConv
, isVarArg
, DAG
.getMachineFunction(), RVLocs
,
2272 // Analyze outgoing return values.
2273 CCInfo
.AnalyzeReturn(Outs
, CCAssignFnForReturn(CallConv
, isVarArg
));
2276 SmallVector
<SDValue
, 48> RetOps
;
2277 RetOps
.push_back(Chain
); // Operand #0 = Chain (updated below)
2279 // Add return address for callable functions.
2280 if (!Info
->isEntryFunction()) {
2281 const SIRegisterInfo
*TRI
= getSubtarget()->getRegisterInfo();
2282 SDValue ReturnAddrReg
= CreateLiveInRegister(
2283 DAG
, &AMDGPU::SReg_64RegClass
, TRI
->getReturnAddressReg(MF
), MVT::i64
);
2285 SDValue ReturnAddrVirtualReg
= DAG
.getRegister(
2286 MF
.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass
),
2289 DAG
.getCopyToReg(Chain
, DL
, ReturnAddrVirtualReg
, ReturnAddrReg
, Flag
);
2290 Flag
= Chain
.getValue(1);
2291 RetOps
.push_back(ReturnAddrVirtualReg
);
2294 // Copy the result values into the output registers.
2295 for (unsigned I
= 0, RealRVLocIdx
= 0, E
= RVLocs
.size(); I
!= E
;
2296 ++I
, ++RealRVLocIdx
) {
2297 CCValAssign
&VA
= RVLocs
[I
];
2298 assert(VA
.isRegLoc() && "Can only return in registers!");
2299 // TODO: Partially return in registers if return values don't fit.
2300 SDValue Arg
= OutVals
[RealRVLocIdx
];
2302 // Copied from other backends.
2303 switch (VA
.getLocInfo()) {
2304 case CCValAssign::Full
:
2306 case CCValAssign::BCvt
:
2307 Arg
= DAG
.getNode(ISD::BITCAST
, DL
, VA
.getLocVT(), Arg
);
2309 case CCValAssign::SExt
:
2310 Arg
= DAG
.getNode(ISD::SIGN_EXTEND
, DL
, VA
.getLocVT(), Arg
);
2312 case CCValAssign::ZExt
:
2313 Arg
= DAG
.getNode(ISD::ZERO_EXTEND
, DL
, VA
.getLocVT(), Arg
);
2315 case CCValAssign::AExt
:
2316 Arg
= DAG
.getNode(ISD::ANY_EXTEND
, DL
, VA
.getLocVT(), Arg
);
2319 llvm_unreachable("Unknown loc info!");
2322 Chain
= DAG
.getCopyToReg(Chain
, DL
, VA
.getLocReg(), Arg
, Flag
);
2323 Flag
= Chain
.getValue(1);
2324 RetOps
.push_back(DAG
.getRegister(VA
.getLocReg(), VA
.getLocVT()));
2327 // FIXME: Does sret work properly?
2328 if (!Info
->isEntryFunction()) {
2329 const SIRegisterInfo
*TRI
= Subtarget
->getRegisterInfo();
2330 const MCPhysReg
*I
=
2331 TRI
->getCalleeSavedRegsViaCopy(&DAG
.getMachineFunction());
2334 if (AMDGPU::SReg_64RegClass
.contains(*I
))
2335 RetOps
.push_back(DAG
.getRegister(*I
, MVT::i64
));
2336 else if (AMDGPU::SReg_32RegClass
.contains(*I
))
2337 RetOps
.push_back(DAG
.getRegister(*I
, MVT::i32
));
2339 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2344 // Update chain and glue.
2347 RetOps
.push_back(Flag
);
2349 unsigned Opc
= AMDGPUISD::ENDPGM
;
2351 Opc
= IsShader
? AMDGPUISD::RETURN_TO_EPILOG
: AMDGPUISD::RET_FLAG
;
2352 return DAG
.getNode(Opc
, DL
, MVT::Other
, RetOps
);
2355 SDValue
SITargetLowering::LowerCallResult(
2356 SDValue Chain
, SDValue InFlag
, CallingConv::ID CallConv
, bool IsVarArg
,
2357 const SmallVectorImpl
<ISD::InputArg
> &Ins
, const SDLoc
&DL
,
2358 SelectionDAG
&DAG
, SmallVectorImpl
<SDValue
> &InVals
, bool IsThisReturn
,
2359 SDValue ThisVal
) const {
2360 CCAssignFn
*RetCC
= CCAssignFnForReturn(CallConv
, IsVarArg
);
2362 // Assign locations to each value returned by this call.
2363 SmallVector
<CCValAssign
, 16> RVLocs
;
2364 CCState
CCInfo(CallConv
, IsVarArg
, DAG
.getMachineFunction(), RVLocs
,
2366 CCInfo
.AnalyzeCallResult(Ins
, RetCC
);
2368 // Copy all of the result registers out of their specified physreg.
2369 for (unsigned i
= 0; i
!= RVLocs
.size(); ++i
) {
2370 CCValAssign VA
= RVLocs
[i
];
2373 if (VA
.isRegLoc()) {
2374 Val
= DAG
.getCopyFromReg(Chain
, DL
, VA
.getLocReg(), VA
.getLocVT(), InFlag
);
2375 Chain
= Val
.getValue(1);
2376 InFlag
= Val
.getValue(2);
2377 } else if (VA
.isMemLoc()) {
2378 report_fatal_error("TODO: return values in memory");
2380 llvm_unreachable("unknown argument location type");
2382 switch (VA
.getLocInfo()) {
2383 case CCValAssign::Full
:
2385 case CCValAssign::BCvt
:
2386 Val
= DAG
.getNode(ISD::BITCAST
, DL
, VA
.getValVT(), Val
);
2388 case CCValAssign::ZExt
:
2389 Val
= DAG
.getNode(ISD::AssertZext
, DL
, VA
.getLocVT(), Val
,
2390 DAG
.getValueType(VA
.getValVT()));
2391 Val
= DAG
.getNode(ISD::TRUNCATE
, DL
, VA
.getValVT(), Val
);
2393 case CCValAssign::SExt
:
2394 Val
= DAG
.getNode(ISD::AssertSext
, DL
, VA
.getLocVT(), Val
,
2395 DAG
.getValueType(VA
.getValVT()));
2396 Val
= DAG
.getNode(ISD::TRUNCATE
, DL
, VA
.getValVT(), Val
);
2398 case CCValAssign::AExt
:
2399 Val
= DAG
.getNode(ISD::TRUNCATE
, DL
, VA
.getValVT(), Val
);
2402 llvm_unreachable("Unknown loc info!");
2405 InVals
.push_back(Val
);
2411 // Add code to pass special inputs required depending on used features separate
2412 // from the explicit user arguments present in the IR.
2413 void SITargetLowering::passSpecialInputs(
2414 CallLoweringInfo
&CLI
,
2416 const SIMachineFunctionInfo
&Info
,
2417 SmallVectorImpl
<std::pair
<unsigned, SDValue
>> &RegsToPass
,
2418 SmallVectorImpl
<SDValue
> &MemOpChains
,
2419 SDValue Chain
) const {
2420 // If we don't have a call site, this was a call inserted by
2421 // legalization. These can never use special inputs.
2425 const Function
*CalleeFunc
= CLI
.CS
.getCalledFunction();
2428 SelectionDAG
&DAG
= CLI
.DAG
;
2429 const SDLoc
&DL
= CLI
.DL
;
2431 const SIRegisterInfo
*TRI
= Subtarget
->getRegisterInfo();
2433 auto &ArgUsageInfo
=
2434 DAG
.getPass()->getAnalysis
<AMDGPUArgumentUsageInfo
>();
2435 const AMDGPUFunctionArgInfo
&CalleeArgInfo
2436 = ArgUsageInfo
.lookupFuncArgInfo(*CalleeFunc
);
2438 const AMDGPUFunctionArgInfo
&CallerArgInfo
= Info
.getArgInfo();
2440 // TODO: Unify with private memory register handling. This is complicated by
2441 // the fact that at least in kernels, the input argument is not necessarily
2442 // in the same location as the input.
2443 AMDGPUFunctionArgInfo::PreloadedValue InputRegs
[] = {
2444 AMDGPUFunctionArgInfo::DISPATCH_PTR
,
2445 AMDGPUFunctionArgInfo::QUEUE_PTR
,
2446 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
,
2447 AMDGPUFunctionArgInfo::DISPATCH_ID
,
2448 AMDGPUFunctionArgInfo::WORKGROUP_ID_X
,
2449 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y
,
2450 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
,
2451 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
2454 for (auto InputID
: InputRegs
) {
2455 const ArgDescriptor
*OutgoingArg
;
2456 const TargetRegisterClass
*ArgRC
;
2458 std::tie(OutgoingArg
, ArgRC
) = CalleeArgInfo
.getPreloadedValue(InputID
);
2462 const ArgDescriptor
*IncomingArg
;
2463 const TargetRegisterClass
*IncomingArgRC
;
2464 std::tie(IncomingArg
, IncomingArgRC
)
2465 = CallerArgInfo
.getPreloadedValue(InputID
);
2466 assert(IncomingArgRC
== ArgRC
);
2468 // All special arguments are ints for now.
2469 EVT ArgVT
= TRI
->getSpillSize(*ArgRC
) == 8 ? MVT::i64
: MVT::i32
;
2473 InputReg
= loadInputValue(DAG
, ArgRC
, ArgVT
, DL
, *IncomingArg
);
2475 // The implicit arg ptr is special because it doesn't have a corresponding
2476 // input for kernels, and is computed from the kernarg segment pointer.
2477 assert(InputID
== AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
);
2478 InputReg
= getImplicitArgPtr(DAG
, DL
);
2481 if (OutgoingArg
->isRegister()) {
2482 RegsToPass
.emplace_back(OutgoingArg
->getRegister(), InputReg
);
2484 unsigned SpecialArgOffset
= CCInfo
.AllocateStack(ArgVT
.getStoreSize(), 4);
2485 SDValue ArgStore
= storeStackInputValue(DAG
, DL
, Chain
, InputReg
,
2487 MemOpChains
.push_back(ArgStore
);
  // Pack workitem IDs into a single register or pass it as is if already
  // packed.
2493 const ArgDescriptor
*OutgoingArg
;
2494 const TargetRegisterClass
*ArgRC
;
2496 std::tie(OutgoingArg
, ArgRC
) =
2497 CalleeArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X
);
2499 std::tie(OutgoingArg
, ArgRC
) =
2500 CalleeArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y
);
2502 std::tie(OutgoingArg
, ArgRC
) =
2503 CalleeArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z
);
2507 const ArgDescriptor
*IncomingArgX
2508 = CallerArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X
).first
;
2509 const ArgDescriptor
*IncomingArgY
2510 = CallerArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y
).first
;
2511 const ArgDescriptor
*IncomingArgZ
2512 = CallerArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z
).first
;
2517 // If incoming ids are not packed we need to pack them.
2518 if (IncomingArgX
&& !IncomingArgX
->isMasked() && CalleeArgInfo
.WorkItemIDX
)
2519 InputReg
= loadInputValue(DAG
, ArgRC
, MVT::i32
, DL
, *IncomingArgX
);
2521 if (IncomingArgY
&& !IncomingArgY
->isMasked() && CalleeArgInfo
.WorkItemIDY
) {
2522 SDValue Y
= loadInputValue(DAG
, ArgRC
, MVT::i32
, DL
, *IncomingArgY
);
2523 Y
= DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, Y
,
2524 DAG
.getShiftAmountConstant(10, MVT::i32
, SL
));
2525 InputReg
= InputReg
.getNode() ?
2526 DAG
.getNode(ISD::OR
, SL
, MVT::i32
, InputReg
, Y
) : Y
;
2529 if (IncomingArgZ
&& !IncomingArgZ
->isMasked() && CalleeArgInfo
.WorkItemIDZ
) {
2530 SDValue Z
= loadInputValue(DAG
, ArgRC
, MVT::i32
, DL
, *IncomingArgZ
);
2531 Z
= DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, Z
,
2532 DAG
.getShiftAmountConstant(20, MVT::i32
, SL
));
2533 InputReg
= InputReg
.getNode() ?
2534 DAG
.getNode(ISD::OR
, SL
, MVT::i32
, InputReg
, Z
) : Z
;
2537 if (!InputReg
.getNode()) {
2538 // Workitem ids are already packed, any of present incoming arguments
2539 // will carry all required fields.
2540 ArgDescriptor IncomingArg
= ArgDescriptor::createArg(
2541 IncomingArgX
? *IncomingArgX
:
2542 IncomingArgY
? *IncomingArgY
:
2543 *IncomingArgZ
, ~0u);
2544 InputReg
= loadInputValue(DAG
, ArgRC
, MVT::i32
, DL
, IncomingArg
);
2547 if (OutgoingArg
->isRegister()) {
2548 RegsToPass
.emplace_back(OutgoingArg
->getRegister(), InputReg
);
2550 unsigned SpecialArgOffset
= CCInfo
.AllocateStack(4, 4);
2551 SDValue ArgStore
= storeStackInputValue(DAG
, DL
, Chain
, InputReg
,
2553 MemOpChains
.push_back(ArgStore
);
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return CC == CallingConv::Fast;
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::C:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}
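// For example (a reading of the code above, not additional policy): a fastcc
// callee can be tail-called even under GuaranteedTailCallOpt, while a plain
// C-convention callee is only a candidate for the opportunistic sibling-call
// handling in isEligibleForTailCallOptimization below.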
2571 bool SITargetLowering::isEligibleForTailCallOptimization(
2572 SDValue Callee
, CallingConv::ID CalleeCC
, bool IsVarArg
,
2573 const SmallVectorImpl
<ISD::OutputArg
> &Outs
,
2574 const SmallVectorImpl
<SDValue
> &OutVals
,
2575 const SmallVectorImpl
<ISD::InputArg
> &Ins
, SelectionDAG
&DAG
) const {
2576 if (!mayTailCallThisCC(CalleeCC
))
2579 MachineFunction
&MF
= DAG
.getMachineFunction();
2580 const Function
&CallerF
= MF
.getFunction();
2581 CallingConv::ID CallerCC
= CallerF
.getCallingConv();
2582 const SIRegisterInfo
*TRI
= getSubtarget()->getRegisterInfo();
2583 const uint32_t *CallerPreserved
= TRI
->getCallPreservedMask(MF
, CallerCC
);
2585 // Kernels aren't callable, and don't have a live in return address so it
2586 // doesn't make sense to do a tail call with entry functions.
2587 if (!CallerPreserved
)
2590 bool CCMatch
= CallerCC
== CalleeCC
;
2592 if (DAG
.getTarget().Options
.GuaranteedTailCallOpt
) {
2593 if (canGuaranteeTCO(CalleeCC
) && CCMatch
)
2598 // TODO: Can we handle var args?
2602 for (const Argument
&Arg
: CallerF
.args()) {
2603 if (Arg
.hasByValAttr())
2607 LLVMContext
&Ctx
= *DAG
.getContext();
2609 // Check that the call results are passed in the same way.
2610 if (!CCState::resultsCompatible(CalleeCC
, CallerCC
, MF
, Ctx
, Ins
,
2611 CCAssignFnForCall(CalleeCC
, IsVarArg
),
2612 CCAssignFnForCall(CallerCC
, IsVarArg
)))
2615 // The callee has to preserve all registers the caller needs to preserve.
2617 const uint32_t *CalleePreserved
= TRI
->getCallPreservedMask(MF
, CalleeCC
);
2618 if (!TRI
->regmaskSubsetEqual(CallerPreserved
, CalleePreserved
))
2622 // Nothing more to check if the callee is taking no arguments.
2626 SmallVector
<CCValAssign
, 16> ArgLocs
;
2627 CCState
CCInfo(CalleeCC
, IsVarArg
, MF
, ArgLocs
, Ctx
);
2629 CCInfo
.AnalyzeCallOperands(Outs
, CCAssignFnForCall(CalleeCC
, IsVarArg
));
2631 const SIMachineFunctionInfo
*FuncInfo
= MF
.getInfo
<SIMachineFunctionInfo
>();
2632 // If the stack arguments for this call do not fit into our own save area then
2633 // the call cannot be made tail.
2634 // TODO: Is this really necessary?
2635 if (CCInfo
.getNextStackOffset() > FuncInfo
->getBytesInStackArgArea())
2638 const MachineRegisterInfo
&MRI
= MF
.getRegInfo();
2639 return parametersInCSRMatch(MRI
, CallerPreserved
, ArgLocs
, OutVals
);
bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!CI->isTailCall())
    return false;

  const Function *ParentFn = CI->getParent()->getParent();
  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
    return false;

  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
  return (Attr.getValueAsString() != "true");
}
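// Illustrative IR (not from this file): a call site is only considered here
// when it is marked `tail` and the caller is not an entry function, e.g.
//   define void @caller() #0 {
//     tail call void @callee()
//     ret void
//   }
//   attributes #0 = { "disable-tail-calls"="false" }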
2654 // The wave scratch offset register is used as the global base pointer.
2655 SDValue
SITargetLowering::LowerCall(CallLoweringInfo
&CLI
,
2656 SmallVectorImpl
<SDValue
> &InVals
) const {
2657 SelectionDAG
&DAG
= CLI
.DAG
;
2658 const SDLoc
&DL
= CLI
.DL
;
2659 SmallVector
<ISD::OutputArg
, 32> &Outs
= CLI
.Outs
;
2660 SmallVector
<SDValue
, 32> &OutVals
= CLI
.OutVals
;
2661 SmallVector
<ISD::InputArg
, 32> &Ins
= CLI
.Ins
;
2662 SDValue Chain
= CLI
.Chain
;
2663 SDValue Callee
= CLI
.Callee
;
2664 bool &IsTailCall
= CLI
.IsTailCall
;
2665 CallingConv::ID CallConv
= CLI
.CallConv
;
2666 bool IsVarArg
= CLI
.IsVarArg
;
2667 bool IsSibCall
= false;
2668 bool IsThisReturn
= false;
2669 MachineFunction
&MF
= DAG
.getMachineFunction();
2672 return lowerUnhandledCall(CLI
, InVals
,
2673 "unsupported call to variadic function ");
2676 if (!CLI
.CS
.getInstruction())
2677 report_fatal_error("unsupported libcall legalization");
2679 if (!CLI
.CS
.getCalledFunction()) {
2680 return lowerUnhandledCall(CLI
, InVals
,
2681 "unsupported indirect call to function ");
2684 if (IsTailCall
&& MF
.getTarget().Options
.GuaranteedTailCallOpt
) {
2685 return lowerUnhandledCall(CLI
, InVals
,
2686 "unsupported required tail call to function ");
2689 if (AMDGPU::isShader(MF
.getFunction().getCallingConv())) {
2690 // Note the issue is with the CC of the calling function, not of the call
2692 return lowerUnhandledCall(CLI
, InVals
,
2693 "unsupported call from graphics shader of function ");
2697 IsTailCall
= isEligibleForTailCallOptimization(
2698 Callee
, CallConv
, IsVarArg
, Outs
, OutVals
, Ins
, DAG
);
2699 if (!IsTailCall
&& CLI
.CS
&& CLI
.CS
.isMustTailCall()) {
2700 report_fatal_error("failed to perform tail call elimination on a call "
2701 "site marked musttail");
2704 bool TailCallOpt
= MF
.getTarget().Options
.GuaranteedTailCallOpt
;
2706 // A sibling call is one where we're under the usual C ABI and not planning
2707 // to change that but can still do a tail call:
2708 if (!TailCallOpt
&& IsTailCall
)
2715 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
2717 // Analyze operands of the call, assigning locations to each operand.
2718 SmallVector
<CCValAssign
, 16> ArgLocs
;
2719 CCState
CCInfo(CallConv
, IsVarArg
, MF
, ArgLocs
, *DAG
.getContext());
2720 CCAssignFn
*AssignFn
= CCAssignFnForCall(CallConv
, IsVarArg
);
2722 CCInfo
.AnalyzeCallOperands(Outs
, AssignFn
);
2724 // Get a count of how many bytes are to be pushed on the stack.
2725 unsigned NumBytes
= CCInfo
.getNextStackOffset();
2728 // Since we're not changing the ABI to make this a tail call, the memory
2729 // operands are already available in the caller's incoming argument space.
2733 // FPDiff is the byte offset of the call's argument area from the callee's.
2734 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2735 // by this amount for a tail call. In a sibling call it must be 0 because the
2736 // caller will deallocate the entire stack and the callee still expects its
2737 // arguments to begin at SP+0. Completely unused for non-tail calls.
2739 MachineFrameInfo
&MFI
= MF
.getFrameInfo();
2740 SmallVector
<std::pair
<unsigned, SDValue
>, 8> RegsToPass
;
2742 // Adjust the stack pointer for the new arguments...
2743 // These operations are automatically eliminated by the prolog/epilog pass
2745 Chain
= DAG
.getCALLSEQ_START(Chain
, 0, 0, DL
);
2747 SmallVector
<SDValue
, 4> CopyFromChains
;
2749 // In the HSA case, this should be an identity copy.
2750 SDValue ScratchRSrcReg
2751 = DAG
.getCopyFromReg(Chain
, DL
, Info
->getScratchRSrcReg(), MVT::v4i32
);
2752 RegsToPass
.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3
, ScratchRSrcReg
);
2753 CopyFromChains
.push_back(ScratchRSrcReg
.getValue(1));
2754 Chain
= DAG
.getTokenFactor(DL
, CopyFromChains
);
2757 SmallVector
<SDValue
, 8> MemOpChains
;
2758 MVT PtrVT
= MVT::i32
;
2760 // Walk the register/memloc assignments, inserting copies/loads.
2761 for (unsigned i
= 0, realArgIdx
= 0, e
= ArgLocs
.size(); i
!= e
;
2762 ++i
, ++realArgIdx
) {
2763 CCValAssign
&VA
= ArgLocs
[i
];
2764 SDValue Arg
= OutVals
[realArgIdx
];
2766 // Promote the value if needed.
2767 switch (VA
.getLocInfo()) {
2768 case CCValAssign::Full
:
2770 case CCValAssign::BCvt
:
2771 Arg
= DAG
.getNode(ISD::BITCAST
, DL
, VA
.getLocVT(), Arg
);
2773 case CCValAssign::ZExt
:
2774 Arg
= DAG
.getNode(ISD::ZERO_EXTEND
, DL
, VA
.getLocVT(), Arg
);
2776 case CCValAssign::SExt
:
2777 Arg
= DAG
.getNode(ISD::SIGN_EXTEND
, DL
, VA
.getLocVT(), Arg
);
2779 case CCValAssign::AExt
:
2780 Arg
= DAG
.getNode(ISD::ANY_EXTEND
, DL
, VA
.getLocVT(), Arg
);
2782 case CCValAssign::FPExt
:
2783 Arg
= DAG
.getNode(ISD::FP_EXTEND
, DL
, VA
.getLocVT(), Arg
);
2786 llvm_unreachable("Unknown loc info!");
2789 if (VA
.isRegLoc()) {
2790 RegsToPass
.push_back(std::make_pair(VA
.getLocReg(), Arg
));
2792 assert(VA
.isMemLoc());
2795 MachinePointerInfo DstInfo
;
2797 unsigned LocMemOffset
= VA
.getLocMemOffset();
2798 int32_t Offset
= LocMemOffset
;
2800 SDValue PtrOff
= DAG
.getConstant(Offset
, DL
, PtrVT
);
2804 ISD::ArgFlagsTy Flags
= Outs
[realArgIdx
].Flags
;
2805 unsigned OpSize
= Flags
.isByVal() ?
2806 Flags
.getByValSize() : VA
.getValVT().getStoreSize();
2808 // FIXME: We can have better than the minimum byval required alignment.
2809 Align
= Flags
.isByVal() ? Flags
.getByValAlign() :
2810 MinAlign(Subtarget
->getStackAlignment(), Offset
);
2812 Offset
= Offset
+ FPDiff
;
2813 int FI
= MFI
.CreateFixedObject(OpSize
, Offset
, true);
2815 DstAddr
= DAG
.getFrameIndex(FI
, PtrVT
);
2816 DstInfo
= MachinePointerInfo::getFixedStack(MF
, FI
);
2818 // Make sure any stack arguments overlapping with where we're storing
2819 // are loaded before this eventual operation. Otherwise they'll be
2822 // FIXME: Why is this really necessary? This seems to just result in a
2823 // lot of code to copy the stack and write them back to the same
2824 // locations, which are supposed to be immutable?
2825 Chain
= addTokenForArgument(Chain
, DAG
, MFI
, FI
);
2828 DstInfo
= MachinePointerInfo::getStack(MF
, LocMemOffset
);
2829 Align
= MinAlign(Subtarget
->getStackAlignment(), LocMemOffset
);
2832 if (Outs
[i
].Flags
.isByVal()) {
2834 DAG
.getConstant(Outs
[i
].Flags
.getByValSize(), DL
, MVT::i32
);
2835 SDValue Cpy
= DAG
.getMemcpy(
2836 Chain
, DL
, DstAddr
, Arg
, SizeNode
, Outs
[i
].Flags
.getByValAlign(),
2837 /*isVol = */ false, /*AlwaysInline = */ true,
2838 /*isTailCall = */ false, DstInfo
,
2839 MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
2840 *DAG
.getContext(), AMDGPUAS::PRIVATE_ADDRESS
))));
2842 MemOpChains
.push_back(Cpy
);
2844 SDValue Store
= DAG
.getStore(Chain
, DL
, Arg
, DstAddr
, DstInfo
, Align
);
2845 MemOpChains
.push_back(Store
);
2850 // Copy special input registers after user input arguments.
2851 passSpecialInputs(CLI
, CCInfo
, *Info
, RegsToPass
, MemOpChains
, Chain
);
2853 if (!MemOpChains
.empty())
2854 Chain
= DAG
.getNode(ISD::TokenFactor
, DL
, MVT::Other
, MemOpChains
);
2856 // Build a sequence of copy-to-reg nodes chained together with token chain
2857 // and flag operands which copy the outgoing args into the appropriate regs.
2859 for (auto &RegToPass
: RegsToPass
) {
2860 Chain
= DAG
.getCopyToReg(Chain
, DL
, RegToPass
.first
,
2861 RegToPass
.second
, InFlag
);
2862 InFlag
= Chain
.getValue(1);
2866 SDValue PhysReturnAddrReg
;
2868 // Since the return is being combined with the call, we need to pass on the
2871 const SIRegisterInfo
*TRI
= getSubtarget()->getRegisterInfo();
2872 SDValue ReturnAddrReg
= CreateLiveInRegister(
2873 DAG
, &AMDGPU::SReg_64RegClass
, TRI
->getReturnAddressReg(MF
), MVT::i64
);
2875 PhysReturnAddrReg
= DAG
.getRegister(TRI
->getReturnAddressReg(MF
),
2877 Chain
= DAG
.getCopyToReg(Chain
, DL
, PhysReturnAddrReg
, ReturnAddrReg
, InFlag
);
2878 InFlag
= Chain
.getValue(1);
2881 // We don't usually want to end the call-sequence here because we would tidy
2882 // the frame up *after* the call, however in the ABI-changing tail-call case
2883 // we've carefully laid out the parameters so that when sp is reset they'll be
2884 // in the correct location.
2885 if (IsTailCall
&& !IsSibCall
) {
2886 Chain
= DAG
.getCALLSEQ_END(Chain
,
2887 DAG
.getTargetConstant(NumBytes
, DL
, MVT::i32
),
2888 DAG
.getTargetConstant(0, DL
, MVT::i32
),
2890 InFlag
= Chain
.getValue(1);
2893 std::vector
<SDValue
> Ops
;
2894 Ops
.push_back(Chain
);
2895 Ops
.push_back(Callee
);
2896 // Add a redundant copy of the callee global which will not be legalized, as
2897 // we need direct access to the callee later.
2898 GlobalAddressSDNode
*GSD
= cast
<GlobalAddressSDNode
>(Callee
);
2899 const GlobalValue
*GV
= GSD
->getGlobal();
2900 Ops
.push_back(DAG
.getTargetGlobalAddress(GV
, DL
, MVT::i64
));
2903 // Each tail call may have to adjust the stack by a different amount, so
2904 // this information must travel along with the operation for eventual
2905 // consumption by emitEpilogue.
2906 Ops
.push_back(DAG
.getTargetConstant(FPDiff
, DL
, MVT::i32
));
2908 Ops
.push_back(PhysReturnAddrReg
);
2911 // Add argument registers to the end of the list so that they are known live
2913 for (auto &RegToPass
: RegsToPass
) {
2914 Ops
.push_back(DAG
.getRegister(RegToPass
.first
,
2915 RegToPass
.second
.getValueType()));
2918 // Add a register mask operand representing the call-preserved registers.
2920 auto *TRI
= static_cast<const SIRegisterInfo
*>(Subtarget
->getRegisterInfo());
2921 const uint32_t *Mask
= TRI
->getCallPreservedMask(MF
, CallConv
);
2922 assert(Mask
&& "Missing call preserved mask for calling convention");
2923 Ops
.push_back(DAG
.getRegisterMask(Mask
));
2925 if (InFlag
.getNode())
2926 Ops
.push_back(InFlag
);
2928 SDVTList NodeTys
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
2933 MFI
.setHasTailCall();
2934 return DAG
.getNode(AMDGPUISD::TC_RETURN
, DL
, NodeTys
, Ops
);
2937 // Returns a chain and a flag for retval copy to use.
2938 SDValue Call
= DAG
.getNode(AMDGPUISD::CALL
, DL
, NodeTys
, Ops
);
2939 Chain
= Call
.getValue(0);
2940 InFlag
= Call
.getValue(1);
2942 uint64_t CalleePopBytes
= NumBytes
;
2943 Chain
= DAG
.getCALLSEQ_END(Chain
, DAG
.getTargetConstant(0, DL
, MVT::i32
),
2944 DAG
.getTargetConstant(CalleePopBytes
, DL
, MVT::i32
),
2947 InFlag
= Chain
.getValue(1);
2949 // Handle result values, copying them out of physregs into vregs that we
2951 return LowerCallResult(Chain
, InFlag
, CallConv
, IsVarArg
, Ins
, DL
, DAG
,
2952 InVals
, IsThisReturn
,
2953 IsThisReturn
? OutVals
[0] : SDValue());
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                             SelectionDAG &DAG) const {
  unsigned Reg = StringSwitch<unsigned>(RegName)
    .Case("m0", AMDGPU::M0)
    .Case("exec", AMDGPU::EXEC)
    .Case("exec_lo", AMDGPU::EXEC_LO)
    .Case("exec_hi", AMDGPU::EXEC_HI)
    .Case("flat_scratch", AMDGPU::FLAT_SCR)
    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    .Default(AMDGPU::NoRegister);

  if (Reg == AMDGPU::NoRegister) {
    report_fatal_error(Twine("invalid register name \""
                             + StringRef(RegName) + "\"."));
  }

  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    report_fatal_error(Twine("invalid register \""
                             + StringRef(RegName) + "\" for subtarget."));
  }

  switch (Reg) {
  case AMDGPU::M0:
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
    if (VT.getSizeInBits() == 32)
      return Reg;
    break;
  case AMDGPU::EXEC:
  case AMDGPU::FLAT_SCR:
    if (VT.getSizeInBits() == 64)
      return Reg;
    break;
  default:
    llvm_unreachable("missing register type checking");
  }

  report_fatal_error(Twine("invalid type for register \""
                           + StringRef(RegName) + "\"."));
}
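// Illustrative IR (not from this file) that reaches this hook through the
// named-register read intrinsics:
//   %exec_lo = call i32 @llvm.read_register.i32(metadata !0)
//   !0 = !{!"exec_lo"}
// An unknown name, or a width that does not match the register, is reported
// as a fatal error above.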
3002 // If kill is not the last instruction, split the block so kill is always a
3003 // proper terminator.
3004 MachineBasicBlock
*SITargetLowering::splitKillBlock(MachineInstr
&MI
,
3005 MachineBasicBlock
*BB
) const {
3006 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3008 MachineBasicBlock::iterator
SplitPoint(&MI
);
3011 if (SplitPoint
== BB
->end()) {
3012 // Don't bother with a new block.
3013 MI
.setDesc(TII
->getKillTerminatorFromPseudo(MI
.getOpcode()));
3017 MachineFunction
*MF
= BB
->getParent();
3018 MachineBasicBlock
*SplitBB
3019 = MF
->CreateMachineBasicBlock(BB
->getBasicBlock());
3021 MF
->insert(++MachineFunction::iterator(BB
), SplitBB
);
3022 SplitBB
->splice(SplitBB
->begin(), BB
, SplitPoint
, BB
->end());
3024 SplitBB
->transferSuccessorsAndUpdatePHIs(BB
);
3025 BB
->addSuccessor(SplitBB
);
3027 MI
.setDesc(TII
->getKillTerminatorFromPseudo(MI
.getOpcode()));
3031 // Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
3032 // \p MI will be the only instruction in the loop body block. Otherwise, it will
3033 // be the first instruction in the remainder block.
3035 /// \returns { LoopBody, Remainder }
3036 static std::pair
<MachineBasicBlock
*, MachineBasicBlock
*>
3037 splitBlockForLoop(MachineInstr
&MI
, MachineBasicBlock
&MBB
, bool InstInLoop
) {
3038 MachineFunction
*MF
= MBB
.getParent();
3039 MachineBasicBlock::iterator
I(&MI
);
3041 // To insert the loop we need to split the block. Move everything after this
3042 // point to a new block, and insert a new empty block between the two.
3043 MachineBasicBlock
*LoopBB
= MF
->CreateMachineBasicBlock();
3044 MachineBasicBlock
*RemainderBB
= MF
->CreateMachineBasicBlock();
3045 MachineFunction::iterator
MBBI(MBB
);
3048 MF
->insert(MBBI
, LoopBB
);
3049 MF
->insert(MBBI
, RemainderBB
);
3051 LoopBB
->addSuccessor(LoopBB
);
3052 LoopBB
->addSuccessor(RemainderBB
);
3054 // Move the rest of the block into a new block.
3055 RemainderBB
->transferSuccessorsAndUpdatePHIs(&MBB
);
3058 auto Next
= std::next(I
);
3060 // Move instruction to loop body.
3061 LoopBB
->splice(LoopBB
->begin(), &MBB
, I
, Next
);
3063 // Move the rest of the block.
3064 RemainderBB
->splice(RemainderBB
->begin(), &MBB
, Next
, MBB
.end());
3066 RemainderBB
->splice(RemainderBB
->begin(), &MBB
, I
, MBB
.end());
3069 MBB
.addSuccessor(LoopBB
);
3071 return std::make_pair(LoopBB
, RemainderBB
);
3074 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
3075 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr
&MI
) const {
3076 MachineBasicBlock
*MBB
= MI
.getParent();
3077 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3078 auto I
= MI
.getIterator();
3079 auto E
= std::next(I
);
3081 BuildMI(*MBB
, E
, MI
.getDebugLoc(), TII
->get(AMDGPU::S_WAITCNT
))
3084 MIBundleBuilder
Bundler(*MBB
, I
, E
);
3085 finalizeBundle(*MBB
, Bundler
.begin());
3089 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr
&MI
,
3090 MachineBasicBlock
*BB
) const {
3091 const DebugLoc
&DL
= MI
.getDebugLoc();
3093 MachineRegisterInfo
&MRI
= BB
->getParent()->getRegInfo();
3095 MachineBasicBlock
*LoopBB
;
3096 MachineBasicBlock
*RemainderBB
;
3097 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3099 // Apparently kill flags are only valid if the def is in the same block?
3100 if (MachineOperand
*Src
= TII
->getNamedOperand(MI
, AMDGPU::OpName::data0
))
3101 Src
->setIsKill(false);
3103 std::tie(LoopBB
, RemainderBB
) = splitBlockForLoop(MI
, *BB
, true);
3105 MachineBasicBlock::iterator I
= LoopBB
->end();
3107 const unsigned EncodedReg
= AMDGPU::Hwreg::encodeHwreg(
3108 AMDGPU::Hwreg::ID_TRAPSTS
, AMDGPU::Hwreg::OFFSET_MEM_VIOL
, 1);
3110 // Clear TRAP_STS.MEM_VIOL
3111 BuildMI(*LoopBB
, LoopBB
->begin(), DL
, TII
->get(AMDGPU::S_SETREG_IMM32_B32
))
3113 .addImm(EncodedReg
);
3115 bundleInstWithWaitcnt(MI
);
3117 unsigned Reg
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
3119 // Load and check TRAP_STS.MEM_VIOL
3120 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::S_GETREG_B32
), Reg
)
3121 .addImm(EncodedReg
);
3123 // FIXME: Do we need to use an isel pseudo that may clobber scc?
3124 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::S_CMP_LG_U32
))
3125 .addReg(Reg
, RegState::Kill
)
3127 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::S_CBRANCH_SCC1
))
// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
// wavefront. If the value is uniform and just happens to be in a VGPR, this
// will only do one iteration. In the worst case, this will loop 64 times.
//
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
3138 static MachineBasicBlock::iterator
emitLoadM0FromVGPRLoop(
3139 const SIInstrInfo
*TII
,
3140 MachineRegisterInfo
&MRI
,
3141 MachineBasicBlock
&OrigBB
,
3142 MachineBasicBlock
&LoopBB
,
3144 const MachineOperand
&IdxReg
,
3148 unsigned InitSaveExecReg
,
3151 bool IsIndirectSrc
) {
3152 MachineFunction
*MF
= OrigBB
.getParent();
3153 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3154 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3155 MachineBasicBlock::iterator I
= LoopBB
.begin();
3157 const TargetRegisterClass
*BoolRC
= TRI
->getBoolRC();
3158 unsigned PhiExec
= MRI
.createVirtualRegister(BoolRC
);
3159 unsigned NewExec
= MRI
.createVirtualRegister(BoolRC
);
3160 unsigned CurrentIdxReg
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
3161 unsigned CondReg
= MRI
.createVirtualRegister(BoolRC
);
3163 BuildMI(LoopBB
, I
, DL
, TII
->get(TargetOpcode::PHI
), PhiReg
)
3169 BuildMI(LoopBB
, I
, DL
, TII
->get(TargetOpcode::PHI
), PhiExec
)
3170 .addReg(InitSaveExecReg
)
3175 // Read the next variant <- also loop target.
3176 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
), CurrentIdxReg
)
3177 .addReg(IdxReg
.getReg(), getUndefRegState(IdxReg
.isUndef()));
3179 // Compare the just read M0 value to all possible Idx values.
3180 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::V_CMP_EQ_U32_e64
), CondReg
)
3181 .addReg(CurrentIdxReg
)
3182 .addReg(IdxReg
.getReg(), 0, IdxReg
.getSubReg());
3184 // Update EXEC, save the original EXEC value to VCC.
3185 BuildMI(LoopBB
, I
, DL
, TII
->get(ST
.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
3186 : AMDGPU::S_AND_SAVEEXEC_B64
),
3188 .addReg(CondReg
, RegState::Kill
);
3190 MRI
.setSimpleHint(NewExec
, CondReg
);
3192 if (UseGPRIdxMode
) {
3195 IdxReg
= CurrentIdxReg
;
3197 IdxReg
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
3198 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::S_ADD_I32
), IdxReg
)
3199 .addReg(CurrentIdxReg
, RegState::Kill
)
3202 unsigned IdxMode
= IsIndirectSrc
?
3203 AMDGPU::VGPRIndexMode::SRC0_ENABLE
: AMDGPU::VGPRIndexMode::DST_ENABLE
;
3204 MachineInstr
*SetOn
=
3205 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_ON
))
3206 .addReg(IdxReg
, RegState::Kill
)
3208 SetOn
->getOperand(3).setIsUndef();
3210 // Move index from VCC into M0
3212 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::S_MOV_B32
), AMDGPU::M0
)
3213 .addReg(CurrentIdxReg
, RegState::Kill
);
3215 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::S_ADD_I32
), AMDGPU::M0
)
3216 .addReg(CurrentIdxReg
, RegState::Kill
)
3221 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3222 unsigned Exec
= ST
.isWave32() ? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
3223 MachineInstr
*InsertPt
=
3224 BuildMI(LoopBB
, I
, DL
, TII
->get(ST
.isWave32() ? AMDGPU::S_XOR_B32_term
3225 : AMDGPU::S_XOR_B64_term
), Exec
)
3229 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
3232 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3233 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::S_CBRANCH_EXECNZ
))
3236 return InsertPt
->getIterator();
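// A sketch of the loop built above, assuming wave64, Offset == 0 and the
// movrel (non-GPR-index-mode) path; PHIs are omitted and register names are
// symbolic:
//
//   loop:
//     v_readfirstlane_b32 s_idx, v_idx
//     v_cmp_eq_u32_e64 s[cond:cond+1], s_idx, v_idx
//     s_and_saveexec_b64 s[rem:rem+1], s[cond:cond+1]  ; run only matching lanes
//     s_mov_b32 m0, s_idx
//     <v_movrels_b32 / v_movreld_b32 inserted by the caller at InsertPt>
//     s_xor_b64 exec, exec, s[rem:rem+1]               ; drop the lanes just done
//     s_cbranch_execnz loop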
// This has slightly sub-optimal regalloc when the source vector is killed by
// the read. The register allocator does not understand that the kill is
// per-workitem, so it is kept alive for the whole loop and we end up not
// re-using a subregister from it, using 1 more VGPR than necessary. This was
// saved when this was expanded after register allocation.
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
                                                  MachineBasicBlock &MBB,
                                                  MachineInstr &MI,
                                                  unsigned InitResultReg,
                                                  unsigned PhiReg,
                                                  int Offset,
                                                  bool UseGPRIdxMode,
                                                  bool IsIndirectSrc) {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  unsigned DstReg = MI.getOperand(0).getReg();
  unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
    .addReg(Exec);

  MachineBasicBlock *LoopBB;
  MachineBasicBlock *RemainderBB;
  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, IsIndirectSrc);

  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII->get(MovExecOpc), Exec)
    .addReg(SaveExec);

  return InsPt;
}
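// The resulting control flow is, roughly:
//
//   entry:     implicit_def, save EXEC, fall through into the loop
//   loop:      filled in by emitLoadM0FromVGPRLoop; the caller inserts the
//              actual indirect move at the returned iterator, just before the
//              terminators
//   remainder: restore the saved EXEC mask and continue with the code that
//              followed MI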
// Returns subreg index, offset
static std::pair<unsigned, int>
computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
                            const TargetRegisterClass *SuperRC,
                            unsigned VecReg,
                            int Offset) {
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (Offset >= NumElts || Offset < 0)
    return std::make_pair(AMDGPU::sub0, Offset);

  return std::make_pair(AMDGPU::sub0 + Offset, 0);
}
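// For example, with a 128-bit (4 x 32-bit element) super-register class:
//   Offset  2  -> (AMDGPU::sub2, 0)   ; in-bounds index folded into the subreg
//   Offset  5  -> (AMDGPU::sub0, 5)   ; out of bounds, keep the dynamic offset
//   Offset -1  -> (AMDGPU::sub0, -1)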
// Return true if the index is an SGPR and was set.
static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
                                 MachineRegisterInfo &MRI,
                                 MachineInstr &MI,
                                 int Offset,
                                 bool UseGPRIdxMode,
                                 bool IsIndirectSrc) {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());

  assert(Idx->getReg() != AMDGPU::NoRegister);

  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
    return false;

  if (UseGPRIdxMode) {
    unsigned IdxMode = IsIndirectSrc ?
      AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
    if (Offset == 0) {
      MachineInstr *SetOn =
          BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
        .add(*Idx)
        .addImm(IdxMode);

      SetOn->getOperand(3).setIsUndef();
    } else {
      unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
        .add(*Idx)
        .addImm(Offset);
      MachineInstr *SetOn =
          BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
        .addReg(Tmp, RegState::Kill)
        .addImm(IdxMode);

      SetOn->getOperand(3).setIsUndef();
    }

    return true;
  }

  if (Offset == 0) {
    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .add(*Idx);
  } else {
    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
      .add(*Idx)
      .addImm(Offset);
  }

  return true;
}
3362 // Control flow needs to be inserted if indexing with a VGPR.
3363 static MachineBasicBlock
*emitIndirectSrc(MachineInstr
&MI
,
3364 MachineBasicBlock
&MBB
,
3365 const GCNSubtarget
&ST
) {
3366 const SIInstrInfo
*TII
= ST
.getInstrInfo();
3367 const SIRegisterInfo
&TRI
= TII
->getRegisterInfo();
3368 MachineFunction
*MF
= MBB
.getParent();
3369 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3371 unsigned Dst
= MI
.getOperand(0).getReg();
3372 unsigned SrcReg
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src
)->getReg();
3373 int Offset
= TII
->getNamedOperand(MI
, AMDGPU::OpName::offset
)->getImm();
3375 const TargetRegisterClass
*VecRC
= MRI
.getRegClass(SrcReg
);
3378 std::tie(SubReg
, Offset
)
3379 = computeIndirectRegAndOffset(TRI
, VecRC
, SrcReg
, Offset
);
3381 bool UseGPRIdxMode
= ST
.useVGPRIndexMode(EnableVGPRIndexMode
);
3383 if (setM0ToIndexFromSGPR(TII
, MRI
, MI
, Offset
, UseGPRIdxMode
, true)) {
3384 MachineBasicBlock::iterator
I(&MI
);
3385 const DebugLoc
&DL
= MI
.getDebugLoc();
3387 if (UseGPRIdxMode
) {
3388 // TODO: Look at the uses to avoid the copy. This may require rescheduling
3389 // to avoid interfering with other uses, so probably requires a new
3390 // optimization pass.
3391 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::V_MOV_B32_e32
), Dst
)
3392 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3393 .addReg(SrcReg
, RegState::Implicit
)
3394 .addReg(AMDGPU::M0
, RegState::Implicit
);
3395 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3397 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::V_MOVRELS_B32_e32
), Dst
)
3398 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3399 .addReg(SrcReg
, RegState::Implicit
);
3402 MI
.eraseFromParent();
3407 const DebugLoc
&DL
= MI
.getDebugLoc();
3408 MachineBasicBlock::iterator
I(&MI
);
3410 unsigned PhiReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3411 unsigned InitReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3413 BuildMI(MBB
, I
, DL
, TII
->get(TargetOpcode::IMPLICIT_DEF
), InitReg
);
3415 auto InsPt
= loadM0FromVGPR(TII
, MBB
, MI
, InitReg
, PhiReg
,
3416 Offset
, UseGPRIdxMode
, true);
3417 MachineBasicBlock
*LoopBB
= InsPt
->getParent();
3419 if (UseGPRIdxMode
) {
3420 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::V_MOV_B32_e32
), Dst
)
3421 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3422 .addReg(SrcReg
, RegState::Implicit
)
3423 .addReg(AMDGPU::M0
, RegState::Implicit
);
3424 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3426 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::V_MOVRELS_B32_e32
), Dst
)
3427 .addReg(SrcReg
, RegState::Undef
, SubReg
)
3428 .addReg(SrcReg
, RegState::Implicit
);
3431 MI
.eraseFromParent();

  return LoopBB;
}

static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
                                 const TargetRegisterClass *VecRC) {
  switch (TRI.getRegSizeInBits(*VecRC)) {
  case 32: // 4 bytes
    return AMDGPU::V_MOVRELD_B32_V1;
  case 64: // 8 bytes
    return AMDGPU::V_MOVRELD_B32_V2;
  case 128: // 16 bytes
    return AMDGPU::V_MOVRELD_B32_V4;
  case 256: // 32 bytes
    return AMDGPU::V_MOVRELD_B32_V8;
  case 512: // 64 bytes
    return AMDGPU::V_MOVRELD_B32_V16;
  default:
    llvm_unreachable("unsupported size for MOVRELD pseudos");
  }
}
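// E.g. a 128-bit vector register class (4 x 32-bit lanes) maps to
// V_MOVRELD_B32_V4, while a 64-bit class maps to V_MOVRELD_B32_V2.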
3454 static MachineBasicBlock
*emitIndirectDst(MachineInstr
&MI
,
3455 MachineBasicBlock
&MBB
,
3456 const GCNSubtarget
&ST
) {
3457 const SIInstrInfo
*TII
= ST
.getInstrInfo();
3458 const SIRegisterInfo
&TRI
= TII
->getRegisterInfo();
3459 MachineFunction
*MF
= MBB
.getParent();
3460 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3462 unsigned Dst
= MI
.getOperand(0).getReg();
3463 const MachineOperand
*SrcVec
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src
);
3464 const MachineOperand
*Idx
= TII
->getNamedOperand(MI
, AMDGPU::OpName::idx
);
3465 const MachineOperand
*Val
= TII
->getNamedOperand(MI
, AMDGPU::OpName::val
);
3466 int Offset
= TII
->getNamedOperand(MI
, AMDGPU::OpName::offset
)->getImm();
3467 const TargetRegisterClass
*VecRC
= MRI
.getRegClass(SrcVec
->getReg());
3469 // This can be an immediate, but will be folded later.
3470 assert(Val
->getReg());
3473 std::tie(SubReg
, Offset
) = computeIndirectRegAndOffset(TRI
, VecRC
,
3476 bool UseGPRIdxMode
= ST
.useVGPRIndexMode(EnableVGPRIndexMode
);
3478 if (Idx
->getReg() == AMDGPU::NoRegister
) {
3479 MachineBasicBlock::iterator
I(&MI
);
3480 const DebugLoc
&DL
= MI
.getDebugLoc();
3482 assert(Offset
== 0);
3484 BuildMI(MBB
, I
, DL
, TII
->get(TargetOpcode::INSERT_SUBREG
), Dst
)
3489 MI
.eraseFromParent();
3493 if (setM0ToIndexFromSGPR(TII
, MRI
, MI
, Offset
, UseGPRIdxMode
, false)) {
3494 MachineBasicBlock::iterator
I(&MI
);
3495 const DebugLoc
&DL
= MI
.getDebugLoc();
3497 if (UseGPRIdxMode
) {
3498 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::V_MOV_B32_indirect
))
3499 .addReg(SrcVec
->getReg(), RegState::Undef
, SubReg
) // vdst
3501 .addReg(Dst
, RegState::ImplicitDefine
)
3502 .addReg(SrcVec
->getReg(), RegState::Implicit
)
3503 .addReg(AMDGPU::M0
, RegState::Implicit
);
3505 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3507 const MCInstrDesc
&MovRelDesc
= TII
->get(getMOVRELDPseudo(TRI
, VecRC
));
3509 BuildMI(MBB
, I
, DL
, MovRelDesc
)
3510 .addReg(Dst
, RegState::Define
)
3511 .addReg(SrcVec
->getReg())
3513 .addImm(SubReg
- AMDGPU::sub0
);
3516 MI
.eraseFromParent();
3521 MRI
.clearKillFlags(Val
->getReg());
3523 const DebugLoc
&DL
= MI
.getDebugLoc();
3525 unsigned PhiReg
= MRI
.createVirtualRegister(VecRC
);
3527 auto InsPt
= loadM0FromVGPR(TII
, MBB
, MI
, SrcVec
->getReg(), PhiReg
,
3528 Offset
, UseGPRIdxMode
, false);
3529 MachineBasicBlock
*LoopBB
= InsPt
->getParent();
3531 if (UseGPRIdxMode
) {
3532 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::V_MOV_B32_indirect
))
3533 .addReg(PhiReg
, RegState::Undef
, SubReg
) // vdst
3535 .addReg(Dst
, RegState::ImplicitDefine
)
3536 .addReg(PhiReg
, RegState::Implicit
)
3537 .addReg(AMDGPU::M0
, RegState::Implicit
);
3538 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::S_SET_GPR_IDX_OFF
));
3540 const MCInstrDesc
&MovRelDesc
= TII
->get(getMOVRELDPseudo(TRI
, VecRC
));
3542 BuildMI(*LoopBB
, InsPt
, DL
, MovRelDesc
)
3543 .addReg(Dst
, RegState::Define
)
3546 .addImm(SubReg
- AMDGPU::sub0
);
3549 MI
.eraseFromParent();
3554 MachineBasicBlock
*SITargetLowering::EmitInstrWithCustomInserter(
3555 MachineInstr
&MI
, MachineBasicBlock
*BB
) const {
3557 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3558 MachineFunction
*MF
= BB
->getParent();
3559 SIMachineFunctionInfo
*MFI
= MF
->getInfo
<SIMachineFunctionInfo
>();
3561 if (TII
->isMIMG(MI
)) {
3562 if (MI
.memoperands_empty() && MI
.mayLoadOrStore()) {
3563 report_fatal_error("missing mem operand from MIMG instruction");
    // Add a memoperand for mimg instructions so that they aren't assumed to
    // be ordered memory instructions.
3571 switch (MI
.getOpcode()) {
3572 case AMDGPU::S_ADD_U64_PSEUDO
:
3573 case AMDGPU::S_SUB_U64_PSEUDO
: {
3574 MachineRegisterInfo
&MRI
= BB
->getParent()->getRegInfo();
3575 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3576 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3577 const TargetRegisterClass
*BoolRC
= TRI
->getBoolRC();
3578 const DebugLoc
&DL
= MI
.getDebugLoc();
3580 MachineOperand
&Dest
= MI
.getOperand(0);
3581 MachineOperand
&Src0
= MI
.getOperand(1);
3582 MachineOperand
&Src1
= MI
.getOperand(2);
3584 unsigned DestSub0
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
3585 unsigned DestSub1
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
3587 MachineOperand Src0Sub0
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3588 Src0
, BoolRC
, AMDGPU::sub0
,
3589 &AMDGPU::SReg_32_XM0RegClass
);
3590 MachineOperand Src0Sub1
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3591 Src0
, BoolRC
, AMDGPU::sub1
,
3592 &AMDGPU::SReg_32_XM0RegClass
);
3594 MachineOperand Src1Sub0
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3595 Src1
, BoolRC
, AMDGPU::sub0
,
3596 &AMDGPU::SReg_32_XM0RegClass
);
3597 MachineOperand Src1Sub1
= TII
->buildExtractSubRegOrImm(MI
, MRI
,
3598 Src1
, BoolRC
, AMDGPU::sub1
,
3599 &AMDGPU::SReg_32_XM0RegClass
);
3601 bool IsAdd
= (MI
.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO
);
3603 unsigned LoOpc
= IsAdd
? AMDGPU::S_ADD_U32
: AMDGPU::S_SUB_U32
;
3604 unsigned HiOpc
= IsAdd
? AMDGPU::S_ADDC_U32
: AMDGPU::S_SUBB_U32
;
3605 BuildMI(*BB
, MI
, DL
, TII
->get(LoOpc
), DestSub0
)
3608 BuildMI(*BB
, MI
, DL
, TII
->get(HiOpc
), DestSub1
)
3611 BuildMI(*BB
, MI
, DL
, TII
->get(TargetOpcode::REG_SEQUENCE
), Dest
.getReg())
3613 .addImm(AMDGPU::sub0
)
3615 .addImm(AMDGPU::sub1
);
3616 MI
.eraseFromParent();
3619 case AMDGPU::SI_INIT_M0
: {
3620 BuildMI(*BB
, MI
.getIterator(), MI
.getDebugLoc(),
3621 TII
->get(AMDGPU::S_MOV_B32
), AMDGPU::M0
)
3622 .add(MI
.getOperand(0));
3623 MI
.eraseFromParent();
3626 case AMDGPU::SI_INIT_EXEC
:
3627 // This should be before all vector instructions.
3628 BuildMI(*BB
, &*BB
->begin(), MI
.getDebugLoc(), TII
->get(AMDGPU::S_MOV_B64
),
3630 .addImm(MI
.getOperand(0).getImm());
3631 MI
.eraseFromParent();
3634 case AMDGPU::SI_INIT_EXEC_LO
:
3635 // This should be before all vector instructions.
3636 BuildMI(*BB
, &*BB
->begin(), MI
.getDebugLoc(), TII
->get(AMDGPU::S_MOV_B32
),
3638 .addImm(MI
.getOperand(0).getImm());
3639 MI
.eraseFromParent();
3642 case AMDGPU::SI_INIT_EXEC_FROM_INPUT
: {
3643 // Extract the thread count from an SGPR input and set EXEC accordingly.
3644 // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3646 // S_BFE_U32 count, input, {shift, 7}
3647 // S_BFM_B64 exec, count, 0
3648 // S_CMP_EQ_U32 count, 64
3649 // S_CMOV_B64 exec, -1
3650 MachineInstr
*FirstMI
= &*BB
->begin();
3651 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
3652 unsigned InputReg
= MI
.getOperand(0).getReg();
3653 unsigned CountReg
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
3656 // Move the COPY of the input reg to the beginning, so that we can use it.
3657 for (auto I
= BB
->begin(); I
!= &MI
; I
++) {
3658 if (I
->getOpcode() != TargetOpcode::COPY
||
3659 I
->getOperand(0).getReg() != InputReg
)
3663 FirstMI
= &*++BB
->begin();
3665 I
->removeFromParent();
3666 BB
->insert(FirstMI
, &*I
);
3674 // This should be before all vector instructions.
3675 unsigned Mask
= (getSubtarget()->getWavefrontSize() << 1) - 1;
3676 bool isWave32
= getSubtarget()->isWave32();
3677 unsigned Exec
= isWave32
? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
3678 BuildMI(*BB
, FirstMI
, DebugLoc(), TII
->get(AMDGPU::S_BFE_U32
), CountReg
)
3680 .addImm((MI
.getOperand(1).getImm() & Mask
) | 0x70000);
3681 BuildMI(*BB
, FirstMI
, DebugLoc(),
3682 TII
->get(isWave32
? AMDGPU::S_BFM_B32
: AMDGPU::S_BFM_B64
),
3686 BuildMI(*BB
, FirstMI
, DebugLoc(), TII
->get(AMDGPU::S_CMP_EQ_U32
))
3687 .addReg(CountReg
, RegState::Kill
)
3688 .addImm(getSubtarget()->getWavefrontSize());
3689 BuildMI(*BB
, FirstMI
, DebugLoc(),
3690 TII
->get(isWave32
? AMDGPU::S_CMOV_B32
: AMDGPU::S_CMOV_B64
),
3693 MI
.eraseFromParent();
3697 case AMDGPU::GET_GROUPSTATICSIZE
: {
3698 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA
||
3699 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL
);
3700 DebugLoc DL
= MI
.getDebugLoc();
3701 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::S_MOV_B32
))
3702 .add(MI
.getOperand(0))
3703 .addImm(MFI
->getLDSSize());
3704 MI
.eraseFromParent();
3707 case AMDGPU::SI_INDIRECT_SRC_V1
:
3708 case AMDGPU::SI_INDIRECT_SRC_V2
:
3709 case AMDGPU::SI_INDIRECT_SRC_V4
:
3710 case AMDGPU::SI_INDIRECT_SRC_V8
:
3711 case AMDGPU::SI_INDIRECT_SRC_V16
:
3712 return emitIndirectSrc(MI
, *BB
, *getSubtarget());
3713 case AMDGPU::SI_INDIRECT_DST_V1
:
3714 case AMDGPU::SI_INDIRECT_DST_V2
:
3715 case AMDGPU::SI_INDIRECT_DST_V4
:
3716 case AMDGPU::SI_INDIRECT_DST_V8
:
3717 case AMDGPU::SI_INDIRECT_DST_V16
:
3718 return emitIndirectDst(MI
, *BB
, *getSubtarget());
3719 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO
:
3720 case AMDGPU::SI_KILL_I1_PSEUDO
:
3721 return splitKillBlock(MI
, BB
);
3722 case AMDGPU::V_CNDMASK_B64_PSEUDO
: {
3723 MachineRegisterInfo
&MRI
= BB
->getParent()->getRegInfo();
3724 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3725 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3727 unsigned Dst
= MI
.getOperand(0).getReg();
3728 unsigned Src0
= MI
.getOperand(1).getReg();
3729 unsigned Src1
= MI
.getOperand(2).getReg();
3730 const DebugLoc
&DL
= MI
.getDebugLoc();
3731 unsigned SrcCond
= MI
.getOperand(3).getReg();
3733 unsigned DstLo
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3734 unsigned DstHi
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
3735 const auto *CondRC
= TRI
->getRegClass(AMDGPU::SReg_1_XEXECRegClassID
);
3736 unsigned SrcCondCopy
= MRI
.createVirtualRegister(CondRC
);
3738 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::COPY
), SrcCondCopy
)
3740 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::V_CNDMASK_B32_e64
), DstLo
)
3742 .addReg(Src0
, 0, AMDGPU::sub0
)
3744 .addReg(Src1
, 0, AMDGPU::sub0
)
3745 .addReg(SrcCondCopy
);
3746 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::V_CNDMASK_B32_e64
), DstHi
)
3748 .addReg(Src0
, 0, AMDGPU::sub1
)
3750 .addReg(Src1
, 0, AMDGPU::sub1
)
3751 .addReg(SrcCondCopy
);
3753 BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::REG_SEQUENCE
), Dst
)
3755 .addImm(AMDGPU::sub0
)
3757 .addImm(AMDGPU::sub1
);
3758 MI
.eraseFromParent();
3761 case AMDGPU::SI_BR_UNDEF
: {
3762 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3763 const DebugLoc
&DL
= MI
.getDebugLoc();
3764 MachineInstr
*Br
= BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::S_CBRANCH_SCC1
))
3765 .add(MI
.getOperand(0));
3766 Br
->getOperand(1).setIsUndef(true); // read undef SCC
3767 MI
.eraseFromParent();
3770 case AMDGPU::ADJCALLSTACKUP
:
3771 case AMDGPU::ADJCALLSTACKDOWN
: {
3772 const SIMachineFunctionInfo
*Info
= MF
->getInfo
<SIMachineFunctionInfo
>();
3773 MachineInstrBuilder
MIB(*MF
, &MI
);
    // Add an implicit use of the frame offset reg to prevent the restore copy
    // inserted after the call from being reordered after stack operations in
    // the caller's frame.
3778 MIB
.addReg(Info
->getStackPtrOffsetReg(), RegState::ImplicitDefine
)
3779 .addReg(Info
->getStackPtrOffsetReg(), RegState::Implicit
)
3780 .addReg(Info
->getFrameOffsetReg(), RegState::Implicit
);
3783 case AMDGPU::SI_CALL_ISEL
: {
3784 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
3785 const DebugLoc
&DL
= MI
.getDebugLoc();
3787 unsigned ReturnAddrReg
= TII
->getRegisterInfo().getReturnAddressReg(*MF
);
3789 MachineInstrBuilder MIB
;
3790 MIB
= BuildMI(*BB
, MI
, DL
, TII
->get(AMDGPU::SI_CALL
), ReturnAddrReg
);
3792 for (unsigned I
= 0, E
= MI
.getNumOperands(); I
!= E
; ++I
)
3793 MIB
.add(MI
.getOperand(I
));
3795 MIB
.cloneMemRefs(MI
);
3796 MI
.eraseFromParent();
3799 case AMDGPU::V_ADD_I32_e32
:
3800 case AMDGPU::V_SUB_I32_e32
:
3801 case AMDGPU::V_SUBREV_I32_e32
: {
3802 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
3803 const DebugLoc
&DL
= MI
.getDebugLoc();
3804 unsigned Opc
= MI
.getOpcode();
3806 bool NeedClampOperand
= false;
3807 if (TII
->pseudoToMCOpcode(Opc
) == -1) {
3808 Opc
= AMDGPU::getVOPe64(Opc
);
3809 NeedClampOperand
= true;
3812 auto I
= BuildMI(*BB
, MI
, DL
, TII
->get(Opc
), MI
.getOperand(0).getReg());
3813 if (TII
->isVOP3(*I
)) {
3814 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
3815 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
3816 I
.addReg(TRI
->getVCC(), RegState::Define
);
3818 I
.add(MI
.getOperand(1))
3819 .add(MI
.getOperand(2));
3820 if (NeedClampOperand
)
3821 I
.addImm(0); // clamp bit for e64 encoding
3823 TII
->legalizeOperands(*I
);
3825 MI
.eraseFromParent();
3828 case AMDGPU::DS_GWS_INIT
:
3829 case AMDGPU::DS_GWS_SEMA_V
:
3830 case AMDGPU::DS_GWS_SEMA_BR
:
3831 case AMDGPU::DS_GWS_SEMA_P
:
3832 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL
:
3833 case AMDGPU::DS_GWS_BARRIER
:
3834 // A s_waitcnt 0 is required to be the instruction immediately following.
3835 if (getSubtarget()->hasGWSAutoReplay()) {
3836 bundleInstWithWaitcnt(MI
);
3840 return emitGWSMemViolTestLoop(MI
, BB
);
3842 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI
, BB
);
}

bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
  return isTypeLegal(VT.getScalarType());
}

bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  // This currently forces unfolding various combinations of fsub into fma with
  // free fneg'd operands. As long as we have fast FMA (controlled by
  // isFMAFasterThanFMulAndFAdd), we should perform these.

  // When fma is quarter rate, for f64 where add / sub are at best half rate,
  // most of these combines appear to be cycle neutral but save on instruction
  // count / code size.
  return true;
}

EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                                         EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
  // TODO: Should i16 be used always if legal? For now it would force VALU
  // shifts.
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
}

// Answering this is somewhat tricky and depends on the specific device, which
// may have different rates for fma or all f64 operations.
//
// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
// regardless of which device (although the number of cycles differs between
// devices), so it is always profitable for f64.
//
// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
// only on full rate devices. Normally, we should prefer selecting v_mad_f32
// which we can always do even without fused FP ops since it returns the same
// result as the separate operations and since it is always full
// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
// however does not support denormals, so we do report fma as faster if we have
// a fast fma device and require denormals.
//
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32: {
    // This is as fast on some subtargets. However, we always have full rate f32
    // mad available which returns the same result as the separate operations
    // which we should prefer over fma. We can't use this if we want to support
    // denormals, so only report this in these cases.
    if (Subtarget->hasFP32Denormals())
      return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();

    // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
    return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
  }
  case MVT::f64:
    return true;
  case MVT::f16:
    return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
  default:
    break;
  }

  return false;
}
//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
                                             SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4f16);

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);

  SDLoc SL(Op);
  SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
                             Op->getFlags());
  SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
                             Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
                                              SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16);

  SDValue Lo0, Hi0;
  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
  SDValue Lo1, Hi1;
  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);

  SDLoc SL(Op);

  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
                             Op->getFlags());
  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
                             Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}

SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
                                               SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16);

  SDValue Lo0, Hi0;
  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
  SDValue Lo1, Hi1;
  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
  SDValue Lo2, Hi2;
  std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);

  SDLoc SL(Op);

  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2,
                             Op->getFlags());
  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2,
                             Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}
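// As an illustration of the splitting above, a v4f16 fadd routed through
// splitBinaryVectorOp becomes something like:
//
//   (v4f16 (concat_vectors
//     (v2f16 (fadd (extract_subvector %a, 0), (extract_subvector %b, 0))),
//     (v2f16 (fadd (extract_subvector %a, 2), (extract_subvector %b, 2)))))
//
// so each half can then be selected as a single packed (VOP3P) instruction.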
3987 SDValue
SITargetLowering::LowerOperation(SDValue Op
, SelectionDAG
&DAG
) const {
3988 switch (Op
.getOpcode()) {
3989 default: return AMDGPUTargetLowering::LowerOperation(Op
, DAG
);
3990 case ISD::BRCOND
: return LowerBRCOND(Op
, DAG
);
3991 case ISD::RETURNADDR
: return LowerRETURNADDR(Op
, DAG
);
3993 SDValue Result
= LowerLOAD(Op
, DAG
);
3994 assert((!Result
.getNode() ||
3995 Result
.getNode()->getNumValues() == 2) &&
3996 "Load should return a value and a chain");
4002 return LowerTrig(Op
, DAG
);
4003 case ISD::SELECT
: return LowerSELECT(Op
, DAG
);
4004 case ISD::FDIV
: return LowerFDIV(Op
, DAG
);
4005 case ISD::ATOMIC_CMP_SWAP
: return LowerATOMIC_CMP_SWAP(Op
, DAG
);
4006 case ISD::STORE
: return LowerSTORE(Op
, DAG
);
4007 case ISD::GlobalAddress
: {
4008 MachineFunction
&MF
= DAG
.getMachineFunction();
4009 SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
4010 return LowerGlobalAddress(MFI
, Op
, DAG
);
4012 case ISD::INTRINSIC_WO_CHAIN
: return LowerINTRINSIC_WO_CHAIN(Op
, DAG
);
4013 case ISD::INTRINSIC_W_CHAIN
: return LowerINTRINSIC_W_CHAIN(Op
, DAG
);
4014 case ISD::INTRINSIC_VOID
: return LowerINTRINSIC_VOID(Op
, DAG
);
4015 case ISD::ADDRSPACECAST
: return lowerADDRSPACECAST(Op
, DAG
);
4016 case ISD::INSERT_SUBVECTOR
:
4017 return lowerINSERT_SUBVECTOR(Op
, DAG
);
4018 case ISD::INSERT_VECTOR_ELT
:
4019 return lowerINSERT_VECTOR_ELT(Op
, DAG
);
4020 case ISD::EXTRACT_VECTOR_ELT
:
4021 return lowerEXTRACT_VECTOR_ELT(Op
, DAG
);
4022 case ISD::VECTOR_SHUFFLE
:
4023 return lowerVECTOR_SHUFFLE(Op
, DAG
);
4024 case ISD::BUILD_VECTOR
:
4025 return lowerBUILD_VECTOR(Op
, DAG
);
4027 return lowerFP_ROUND(Op
, DAG
);
4029 return lowerTRAP(Op
, DAG
);
4030 case ISD::DEBUGTRAP
:
4031 return lowerDEBUGTRAP(Op
, DAG
);
4034 case ISD::FCANONICALIZE
:
4035 return splitUnaryVectorOp(Op
, DAG
);
4038 return lowerFMINNUM_FMAXNUM(Op
, DAG
);
4040 return splitTernaryVectorOp(Op
, DAG
);
4053 case ISD::FMINNUM_IEEE
:
4054 case ISD::FMAXNUM_IEEE
:
4055 return splitBinaryVectorOp(Op
, DAG
);
4060 static SDValue
adjustLoadValueTypeImpl(SDValue Result
, EVT LoadVT
,
4062 SelectionDAG
&DAG
, bool Unpacked
) {
4063 if (!LoadVT
.isVector())
4066 if (Unpacked
) { // From v2i32/v4i32 back to v2f16/v4f16.
4067 // Truncate to v2i16/v4i16.
4068 EVT IntLoadVT
= LoadVT
.changeTypeToInteger();
    // Work around the legalizer not scalarizing truncate after vector op
    // legalization by not creating an intermediate vector trunc.
4072 SmallVector
<SDValue
, 4> Elts
;
4073 DAG
.ExtractVectorElements(Result
, Elts
);
4074 for (SDValue
&Elt
: Elts
)
4075 Elt
= DAG
.getNode(ISD::TRUNCATE
, DL
, MVT::i16
, Elt
);
4077 Result
= DAG
.getBuildVector(IntLoadVT
, DL
, Elts
);
4079 // Bitcast to original type (v2f16/v4f16).
4080 return DAG
.getNode(ISD::BITCAST
, DL
, LoadVT
, Result
);
4083 // Cast back to the original packed type.
4084 return DAG
.getNode(ISD::BITCAST
, DL
, LoadVT
, Result
);
4087 SDValue
SITargetLowering::adjustLoadValueType(unsigned Opcode
,
4090 ArrayRef
<SDValue
> Ops
,
4091 bool IsIntrinsic
) const {
4094 bool Unpacked
= Subtarget
->hasUnpackedD16VMem();
4095 EVT LoadVT
= M
->getValueType(0);
4097 EVT EquivLoadVT
= LoadVT
;
4098 if (Unpacked
&& LoadVT
.isVector()) {
4099 EquivLoadVT
= LoadVT
.isVector() ?
4100 EVT::getVectorVT(*DAG
.getContext(), MVT::i32
,
4101 LoadVT
.getVectorNumElements()) : LoadVT
;
4104 // Change from v4f16/v2f16 to EquivLoadVT.
4105 SDVTList VTList
= DAG
.getVTList(EquivLoadVT
, MVT::Other
);
4108 = DAG
.getMemIntrinsicNode(
4109 IsIntrinsic
? (unsigned)ISD::INTRINSIC_W_CHAIN
: Opcode
, DL
,
4110 VTList
, Ops
, M
->getMemoryVT(),
4111 M
->getMemOperand());
4112 if (!Unpacked
) // Just adjusted the opcode.
4115 SDValue Adjusted
= adjustLoadValueTypeImpl(Load
, LoadVT
, DL
, DAG
, Unpacked
);
4117 return DAG
.getMergeValues({ Adjusted
, Load
.getValue(1) }, DL
);
4120 static SDValue
lowerICMPIntrinsic(const SITargetLowering
&TLI
,
4121 SDNode
*N
, SelectionDAG
&DAG
) {
4122 EVT VT
= N
->getValueType(0);
4123 const auto *CD
= cast
<ConstantSDNode
>(N
->getOperand(3));
4124 int CondCode
= CD
->getSExtValue();
4125 if (CondCode
< ICmpInst::Predicate::FIRST_ICMP_PREDICATE
||
4126 CondCode
> ICmpInst::Predicate::LAST_ICMP_PREDICATE
)
4127 return DAG
.getUNDEF(VT
);
4129 ICmpInst::Predicate IcInput
= static_cast<ICmpInst::Predicate
>(CondCode
);
4131 SDValue LHS
= N
->getOperand(1);
4132 SDValue RHS
= N
->getOperand(2);
4136 EVT CmpVT
= LHS
.getValueType();
4137 if (CmpVT
== MVT::i16
&& !TLI
.isTypeLegal(MVT::i16
)) {
4138 unsigned PromoteOp
= ICmpInst::isSigned(IcInput
) ?
4139 ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND
;
4140 LHS
= DAG
.getNode(PromoteOp
, DL
, MVT::i32
, LHS
);
4141 RHS
= DAG
.getNode(PromoteOp
, DL
, MVT::i32
, RHS
);
4144 ISD::CondCode CCOpcode
= getICmpCondCode(IcInput
);
4146 unsigned WavefrontSize
= TLI
.getSubtarget()->getWavefrontSize();
4147 EVT CCVT
= EVT::getIntegerVT(*DAG
.getContext(), WavefrontSize
);
4149 SDValue SetCC
= DAG
.getNode(AMDGPUISD::SETCC
, DL
, CCVT
, LHS
, RHS
,
4150 DAG
.getCondCode(CCOpcode
));
4151 if (VT
.bitsEq(CCVT
))
4153 return DAG
.getZExtOrTrunc(SetCC
, DL
, VT
);
4156 static SDValue
lowerFCMPIntrinsic(const SITargetLowering
&TLI
,
4157 SDNode
*N
, SelectionDAG
&DAG
) {
4158 EVT VT
= N
->getValueType(0);
4159 const auto *CD
= cast
<ConstantSDNode
>(N
->getOperand(3));
4161 int CondCode
= CD
->getSExtValue();
4162 if (CondCode
< FCmpInst::Predicate::FIRST_FCMP_PREDICATE
||
4163 CondCode
> FCmpInst::Predicate::LAST_FCMP_PREDICATE
) {
4164 return DAG
.getUNDEF(VT
);
4167 SDValue Src0
= N
->getOperand(1);
4168 SDValue Src1
= N
->getOperand(2);
4169 EVT CmpVT
= Src0
.getValueType();
4172 if (CmpVT
== MVT::f16
&& !TLI
.isTypeLegal(CmpVT
)) {
4173 Src0
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, Src0
);
4174 Src1
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, Src1
);
4177 FCmpInst::Predicate IcInput
= static_cast<FCmpInst::Predicate
>(CondCode
);
4178 ISD::CondCode CCOpcode
= getFCmpCondCode(IcInput
);
4179 unsigned WavefrontSize
= TLI
.getSubtarget()->getWavefrontSize();
4180 EVT CCVT
= EVT::getIntegerVT(*DAG
.getContext(), WavefrontSize
);
4181 SDValue SetCC
= DAG
.getNode(AMDGPUISD::SETCC
, SL
, CCVT
, Src0
,
4182 Src1
, DAG
.getCondCode(CCOpcode
));
4183 if (VT
.bitsEq(CCVT
))
4185 return DAG
.getZExtOrTrunc(SetCC
, SL
, VT
);
4188 void SITargetLowering::ReplaceNodeResults(SDNode
*N
,
4189 SmallVectorImpl
<SDValue
> &Results
,
4190 SelectionDAG
&DAG
) const {
4191 switch (N
->getOpcode()) {
4192 case ISD::INSERT_VECTOR_ELT
: {
4193 if (SDValue Res
= lowerINSERT_VECTOR_ELT(SDValue(N
, 0), DAG
))
4194 Results
.push_back(Res
);
4197 case ISD::EXTRACT_VECTOR_ELT
: {
4198 if (SDValue Res
= lowerEXTRACT_VECTOR_ELT(SDValue(N
, 0), DAG
))
4199 Results
.push_back(Res
);
4202 case ISD::INTRINSIC_WO_CHAIN
: {
4203 unsigned IID
= cast
<ConstantSDNode
>(N
->getOperand(0))->getZExtValue();
4205 case Intrinsic::amdgcn_cvt_pkrtz
: {
4206 SDValue Src0
= N
->getOperand(1);
4207 SDValue Src1
= N
->getOperand(2);
4209 SDValue Cvt
= DAG
.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32
, SL
, MVT::i32
,
4211 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2f16
, Cvt
));
4214 case Intrinsic::amdgcn_cvt_pknorm_i16
:
4215 case Intrinsic::amdgcn_cvt_pknorm_u16
:
4216 case Intrinsic::amdgcn_cvt_pk_i16
:
4217 case Intrinsic::amdgcn_cvt_pk_u16
: {
4218 SDValue Src0
= N
->getOperand(1);
4219 SDValue Src1
= N
->getOperand(2);
4223 if (IID
== Intrinsic::amdgcn_cvt_pknorm_i16
)
4224 Opcode
= AMDGPUISD::CVT_PKNORM_I16_F32
;
4225 else if (IID
== Intrinsic::amdgcn_cvt_pknorm_u16
)
4226 Opcode
= AMDGPUISD::CVT_PKNORM_U16_F32
;
4227 else if (IID
== Intrinsic::amdgcn_cvt_pk_i16
)
4228 Opcode
= AMDGPUISD::CVT_PK_I16_I32
;
4230 Opcode
= AMDGPUISD::CVT_PK_U16_U32
;
4232 EVT VT
= N
->getValueType(0);
4233 if (isTypeLegal(VT
))
4234 Results
.push_back(DAG
.getNode(Opcode
, SL
, VT
, Src0
, Src1
));
4236 SDValue Cvt
= DAG
.getNode(Opcode
, SL
, MVT::i32
, Src0
, Src1
);
4237 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i16
, Cvt
));
4244 case ISD::INTRINSIC_W_CHAIN
: {
4245 if (SDValue Res
= LowerINTRINSIC_W_CHAIN(SDValue(N
, 0), DAG
)) {
4246 Results
.push_back(Res
);
4247 Results
.push_back(Res
.getValue(1));
4255 EVT VT
= N
->getValueType(0);
4256 EVT NewVT
= getEquivalentMemType(*DAG
.getContext(), VT
);
4257 SDValue LHS
= DAG
.getNode(ISD::BITCAST
, SL
, NewVT
, N
->getOperand(1));
4258 SDValue RHS
= DAG
.getNode(ISD::BITCAST
, SL
, NewVT
, N
->getOperand(2));
4260 EVT SelectVT
= NewVT
;
4261 if (NewVT
.bitsLT(MVT::i32
)) {
4262 LHS
= DAG
.getNode(ISD::ANY_EXTEND
, SL
, MVT::i32
, LHS
);
4263 RHS
= DAG
.getNode(ISD::ANY_EXTEND
, SL
, MVT::i32
, RHS
);
4264 SelectVT
= MVT::i32
;
4267 SDValue NewSelect
= DAG
.getNode(ISD::SELECT
, SL
, SelectVT
,
4268 N
->getOperand(0), LHS
, RHS
);
4270 if (NewVT
!= SelectVT
)
4271 NewSelect
= DAG
.getNode(ISD::TRUNCATE
, SL
, NewVT
, NewSelect
);
4272 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, VT
, NewSelect
));
4276 if (N
->getValueType(0) != MVT::v2f16
)
4280 SDValue BC
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, N
->getOperand(0));
4282 SDValue Op
= DAG
.getNode(ISD::XOR
, SL
, MVT::i32
,
4284 DAG
.getConstant(0x80008000, SL
, MVT::i32
));
4285 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2f16
, Op
));
4289 if (N
->getValueType(0) != MVT::v2f16
)
4293 SDValue BC
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, N
->getOperand(0));
4295 SDValue Op
= DAG
.getNode(ISD::AND
, SL
, MVT::i32
,
4297 DAG
.getConstant(0x7fff7fff, SL
, MVT::i32
));
4298 Results
.push_back(DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2f16
, Op
));
/// Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {

  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return nullptr;
}

unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
    case Intrinsic::amdgcn_if:
      return AMDGPUISD::IF;
    case Intrinsic::amdgcn_else:
      return AMDGPUISD::ELSE;
    case Intrinsic::amdgcn_loop:
      return AMDGPUISD::LOOP;
    case Intrinsic::amdgcn_end_cf:
      llvm_unreachable("should not occur");
    default:
      return 0;
    }
  }

  // break, if_break, else_break are all only used as inputs to loop, not
  // directly as branch conditions.
  return 0;
}

bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
  const Triple &TT = getTargetMachine().getTargetTriple();
  return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         AMDGPU::shouldEmitConstantsToTextSection(TT);
}

bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
  // FIXME: Either avoid relying on address space here or change the default
  // address space for functions to avoid the explicit check.
  return (GV->getValueType()->isFunctionTy() ||
          GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         !shouldEmitFixup(GV) &&
         !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}

bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
}
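// Taken together, these three predicates pick one of three ways to address a
// global: an absolute fixup when constants are emitted into the text section,
// a load through the GOT when the global is not known to be DSO-local, and a
// plain PC-relative offset otherwise.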
/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arises.
4367 SDValue
SITargetLowering::LowerBRCOND(SDValue BRCOND
,
4368 SelectionDAG
&DAG
) const {
4371 SDNode
*Intr
= BRCOND
.getOperand(1).getNode();
4372 SDValue Target
= BRCOND
.getOperand(2);
4373 SDNode
*BR
= nullptr;
4374 SDNode
*SetCC
= nullptr;
4376 if (Intr
->getOpcode() == ISD::SETCC
) {
4377 // As long as we negate the condition everything is fine
4379 Intr
= SetCC
->getOperand(0).getNode();
4382 // Get the target from BR if we don't negate the condition
4383 BR
= findUser(BRCOND
, ISD::BR
);
4384 Target
= BR
->getOperand(1);
4387 // FIXME: This changes the types of the intrinsics instead of introducing new
4388 // nodes with the correct types.
4389 // e.g. llvm.amdgcn.loop
4391 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
4392 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
4394 unsigned CFNode
= isCFIntrinsic(Intr
);
4396 // This is a uniform branch so we don't need to legalize.
4400 bool HaveChain
= Intr
->getOpcode() == ISD::INTRINSIC_VOID
||
4401 Intr
->getOpcode() == ISD::INTRINSIC_W_CHAIN
;
4404 (SetCC
->getConstantOperandVal(1) == 1 &&
4405 cast
<CondCodeSDNode
>(SetCC
->getOperand(2).getNode())->get() ==
4408 // operands of the new intrinsic call
4409 SmallVector
<SDValue
, 4> Ops
;
4411 Ops
.push_back(BRCOND
.getOperand(0));
4413 Ops
.append(Intr
->op_begin() + (HaveChain
? 2 : 1), Intr
->op_end());
4414 Ops
.push_back(Target
);
4416 ArrayRef
<EVT
> Res(Intr
->value_begin() + 1, Intr
->value_end());
4418 // build the new intrinsic call
4419 SDNode
*Result
= DAG
.getNode(CFNode
, DL
, DAG
.getVTList(Res
), Ops
).getNode();
4424 BRCOND
.getOperand(0)
4427 Result
= DAG
.getMergeValues(Ops
, DL
).getNode();
4431 // Give the branch instruction our target
4434 BRCOND
.getOperand(2)
4436 SDValue NewBR
= DAG
.getNode(ISD::BR
, DL
, BR
->getVTList(), Ops
);
4437 DAG
.ReplaceAllUsesWith(BR
, NewBR
.getNode());
4438 BR
= NewBR
.getNode();
4441 SDValue Chain
= SDValue(Result
, Result
->getNumValues() - 1);
4443 // Copy the intrinsic results to registers
4444 for (unsigned i
= 1, e
= Intr
->getNumValues() - 1; i
!= e
; ++i
) {
4445 SDNode
*CopyToReg
= findUser(SDValue(Intr
, i
), ISD::CopyToReg
);
4449 Chain
= DAG
.getCopyToReg(
4451 CopyToReg
->getOperand(1),
4452 SDValue(Result
, i
- 1),
4455 DAG
.ReplaceAllUsesWith(SDValue(CopyToReg
, 0), CopyToReg
->getOperand(0));
4458 // Remove the old intrinsic from the chain
4459 DAG
.ReplaceAllUsesOfValueWith(
4460 SDValue(Intr
, Intr
->getNumValues() - 1),
4461 Intr
->getOperand(0));

  return Chain;
}

SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
                                          SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  // Checking the depth
  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0)
    return DAG.getConstant(0, DL, VT);

  MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // Check for kernel and shader functions
  if (Info->isEntryFunction())
    return DAG.getConstant(0, DL, VT);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  // Get the return address reg and mark it as an implicit live-in
  unsigned Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                              getRegClassFor(VT, Op.getNode()->isDivergent()));

  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}

SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
                                            SDValue Op,
                                            const SDLoc &DL,
                                            EVT VT) const {
  return Op.getValueType().bitsLE(VT) ?
      DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
      DAG.getNode(ISD::FTRUNC, DL, VT, Op);
}

SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");

  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)
    return Op;

  SDLoc DL(Op);

  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
}
4516 SDValue
SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op
,
4517 SelectionDAG
&DAG
) const {
4518 EVT VT
= Op
.getValueType();
4519 const MachineFunction
&MF
= DAG
.getMachineFunction();
4520 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
4521 bool IsIEEEMode
= Info
->getMode().IEEE
;
  // FIXME: Assert during selection that this is only selected for
  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
  // mode functions, but this happens to be OK since it's only done in cases
  // where there is known to be no sNaN.
4528 return expandFMINNUM_FMAXNUM(Op
.getNode(), DAG
);
4530 if (VT
== MVT::v4f16
)
4531 return splitBinaryVectorOp(Op
, DAG
);
4535 SDValue
SITargetLowering::lowerTRAP(SDValue Op
, SelectionDAG
&DAG
) const {
4537 SDValue Chain
= Op
.getOperand(0);
4539 if (Subtarget
->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa
||
4540 !Subtarget
->isTrapHandlerEnabled())
4541 return DAG
.getNode(AMDGPUISD::ENDPGM
, SL
, MVT::Other
, Chain
);
4543 MachineFunction
&MF
= DAG
.getMachineFunction();
4544 SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
4545 unsigned UserSGPR
= Info
->getQueuePtrUserSGPR();
4546 assert(UserSGPR
!= AMDGPU::NoRegister
);
4547 SDValue QueuePtr
= CreateLiveInRegister(
4548 DAG
, &AMDGPU::SReg_64RegClass
, UserSGPR
, MVT::i64
);
4549 SDValue SGPR01
= DAG
.getRegister(AMDGPU::SGPR0_SGPR1
, MVT::i64
);
4550 SDValue ToReg
= DAG
.getCopyToReg(Chain
, SL
, SGPR01
,
4551 QueuePtr
, SDValue());
4554 DAG
.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap
, SL
, MVT::i16
),
4558 return DAG
.getNode(AMDGPUISD::TRAP
, SL
, MVT::Other
, Ops
);
4561 SDValue
SITargetLowering::lowerDEBUGTRAP(SDValue Op
, SelectionDAG
&DAG
) const {
4563 SDValue Chain
= Op
.getOperand(0);
4564 MachineFunction
&MF
= DAG
.getMachineFunction();
4566 if (Subtarget
->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa
||
4567 !Subtarget
->isTrapHandlerEnabled()) {
4568 DiagnosticInfoUnsupported
NoTrap(MF
.getFunction(),
4569 "debugtrap handler not supported",
4572 LLVMContext
&Ctx
= MF
.getFunction().getContext();
4573 Ctx
.diagnose(NoTrap
);
4579 DAG
.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap
, SL
, MVT::i16
)
4581 return DAG
.getNode(AMDGPUISD::TRAP
, SL
, MVT::Other
, Ops
);
}

SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {
  // FIXME: Use inline constants (src_{shared, private}_base) instead.
  if (Subtarget->hasApertureRegs()) {
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
    SDValue ApertureReg = SDValue(
        DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
    SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
    return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
  assert(UserSGPR != AMDGPU::NoRegister);

  SDValue QueuePtr = CreateLiveInRegister(
    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
                                              AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
                     MinAlign(64, StructOffset),
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
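// In other words: on subtargets with aperture registers the aperture is read
// with s_getreg_b32 from the shared/private base hwreg field and shifted left
// by the field width to reconstruct the high 32 bits of the segment base;
// otherwise it is loaded from the amd_queue_t structure reached through the
// queue-pointer user SGPR.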
4633 SDValue
SITargetLowering::lowerADDRSPACECAST(SDValue Op
,
4634 SelectionDAG
&DAG
) const {
4636 const AddrSpaceCastSDNode
*ASC
= cast
<AddrSpaceCastSDNode
>(Op
);
4638 SDValue Src
= ASC
->getOperand(0);
4639 SDValue FlatNullPtr
= DAG
.getConstant(0, SL
, MVT::i64
);
4641 const AMDGPUTargetMachine
&TM
=
4642 static_cast<const AMDGPUTargetMachine
&>(getTargetMachine());
4644 // flat -> local/private
4645 if (ASC
->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS
) {
4646 unsigned DestAS
= ASC
->getDestAddressSpace();
4648 if (DestAS
== AMDGPUAS::LOCAL_ADDRESS
||
4649 DestAS
== AMDGPUAS::PRIVATE_ADDRESS
) {
4650 unsigned NullVal
= TM
.getNullPointerValue(DestAS
);
4651 SDValue SegmentNullPtr
= DAG
.getConstant(NullVal
, SL
, MVT::i32
);
4652 SDValue NonNull
= DAG
.getSetCC(SL
, MVT::i1
, Src
, FlatNullPtr
, ISD::SETNE
);
4653 SDValue Ptr
= DAG
.getNode(ISD::TRUNCATE
, SL
, MVT::i32
, Src
);
4655 return DAG
.getNode(ISD::SELECT
, SL
, MVT::i32
,
4656 NonNull
, Ptr
, SegmentNullPtr
);
4660 // local/private -> flat
4661 if (ASC
->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS
) {
4662 unsigned SrcAS
= ASC
->getSrcAddressSpace();
4664 if (SrcAS
== AMDGPUAS::LOCAL_ADDRESS
||
4665 SrcAS
== AMDGPUAS::PRIVATE_ADDRESS
) {
4666 unsigned NullVal
= TM
.getNullPointerValue(SrcAS
);
4667 SDValue SegmentNullPtr
= DAG
.getConstant(NullVal
, SL
, MVT::i32
);
4670 = DAG
.getSetCC(SL
, MVT::i1
, Src
, SegmentNullPtr
, ISD::SETNE
);
4672 SDValue Aperture
= getSegmentAperture(ASC
->getSrcAddressSpace(), SL
, DAG
);
4674 = DAG
.getNode(ISD::BUILD_VECTOR
, SL
, MVT::v2i32
, Src
, Aperture
);
4676 return DAG
.getNode(ISD::SELECT
, SL
, MVT::i64
, NonNull
,
4677 DAG
.getNode(ISD::BITCAST
, SL
, MVT::i64
, CvtPtr
),
4682 // global <-> flat are no-ops and never emitted.
4684 const MachineFunction
&MF
= DAG
.getMachineFunction();
4685 DiagnosticInfoUnsupported
InvalidAddrSpaceCast(
4686 MF
.getFunction(), "invalid addrspacecast", SL
.getDebugLoc());
4687 DAG
.getContext()->diagnose(InvalidAddrSpaceCast
);
4689 return DAG
.getUNDEF(ASC
->getValueType(0));
}

// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
// the small vector and inserting them into the big vector. That is better than
// the default expansion of doing it via a stack slot. Even though the use of
// the stack slot would be optimized away afterwards, the stack slot itself
// remains.
SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDValue Ins = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT InsVT = Ins.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned InsNumElts = InsVT.getVectorNumElements();
  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  SDLoc SL(Op);

  for (unsigned I = 0; I != InsNumElts; ++I) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
                              DAG.getConstant(I, SL, MVT::i32));
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
                      DAG.getConstant(IdxVal + I, SL, MVT::i32));
  }

  return Vec;
}
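// For example, inserting a v2i16 %ins into a v4i16 %vec at index 2 becomes:
//   %e0  = extract_vector_elt %ins, 0
//   %v1  = insert_vector_elt %vec, %e0, 2
//   %e1  = extract_vector_elt %ins, 1
//   %res = insert_vector_elt %v1, %e1, 3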
4718 SDValue
SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op
,
4719 SelectionDAG
&DAG
) const {
4720 SDValue Vec
= Op
.getOperand(0);
4721 SDValue InsVal
= Op
.getOperand(1);
4722 SDValue Idx
= Op
.getOperand(2);
4723 EVT VecVT
= Vec
.getValueType();
4724 EVT EltVT
= VecVT
.getVectorElementType();
4725 unsigned VecSize
= VecVT
.getSizeInBits();
4726 unsigned EltSize
= EltVT
.getSizeInBits();
4729 assert(VecSize
<= 64);
4731 unsigned NumElts
= VecVT
.getVectorNumElements();
4733 auto KIdx
= dyn_cast
<ConstantSDNode
>(Idx
);
4735 if (NumElts
== 4 && EltSize
== 16 && KIdx
) {
4736 SDValue BCVec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i32
, Vec
);
4738 SDValue LoHalf
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, BCVec
,
4739 DAG
.getConstant(0, SL
, MVT::i32
));
4740 SDValue HiHalf
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, BCVec
,
4741 DAG
.getConstant(1, SL
, MVT::i32
));
4743 SDValue LoVec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i16
, LoHalf
);
4744 SDValue HiVec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i16
, HiHalf
);
4746 unsigned Idx
= KIdx
->getZExtValue();
4747 bool InsertLo
= Idx
< 2;
4748 SDValue InsHalf
= DAG
.getNode(ISD::INSERT_VECTOR_ELT
, SL
, MVT::v2i16
,
4749 InsertLo
? LoVec
: HiVec
,
4750 DAG
.getNode(ISD::BITCAST
, SL
, MVT::i16
, InsVal
),
4751 DAG
.getConstant(InsertLo
? Idx
: (Idx
- 2), SL
, MVT::i32
));
4753 InsHalf
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, InsHalf
);
4755 SDValue Concat
= InsertLo
?
4756 DAG
.getBuildVector(MVT::v2i32
, SL
, { InsHalf
, HiHalf
}) :
4757 DAG
.getBuildVector(MVT::v2i32
, SL
, { LoHalf
, InsHalf
});
4759 return DAG
.getNode(ISD::BITCAST
, SL
, VecVT
, Concat
);
4762 if (isa
<ConstantSDNode
>(Idx
))
4765 MVT IntVT
= MVT::getIntegerVT(VecSize
);
4767 // Avoid stack access for dynamic indexing.
4768 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4770 // Create a congruent vector with the target value in each element so that
4771 // the required element can be masked and ORed into the target vector.
4772 SDValue ExtVal
= DAG
.getNode(ISD::BITCAST
, SL
, IntVT
,
4773 DAG
.getSplatBuildVector(VecVT
, SL
, InsVal
));
4775 assert(isPowerOf2_32(EltSize
));
4776 SDValue ScaleFactor
= DAG
.getConstant(Log2_32(EltSize
), SL
, MVT::i32
);
4778 // Convert vector index to bit-index.
4779 SDValue ScaledIdx
= DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, Idx
, ScaleFactor
);
4781 SDValue BCVec
= DAG
.getNode(ISD::BITCAST
, SL
, IntVT
, Vec
);
4782 SDValue BFM
= DAG
.getNode(ISD::SHL
, SL
, IntVT
,
4783 DAG
.getConstant(0xffff, SL
, IntVT
),
4786 SDValue LHS
= DAG
.getNode(ISD::AND
, SL
, IntVT
, BFM
, ExtVal
);
4787 SDValue RHS
= DAG
.getNode(ISD::AND
, SL
, IntVT
,
4788 DAG
.getNOT(SL
, BFM
, IntVT
), BCVec
);
4790 SDValue BFI
= DAG
.getNode(ISD::OR
, SL
, IntVT
, LHS
, RHS
);
4791 return DAG
.getNode(ISD::BITCAST
, SL
, VecVT
, BFI
);
}

SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);

  EVT ResultVT = Op.getValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  EVT VecVT = Vec.getValueType();
  unsigned VecSize = VecVT.getSizeInBits();
  EVT EltVT = VecVT.getVectorElementType();
  assert(VecSize <= 64);

  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.

  // XXX - Why doesn't this get called when vector_shuffle is expanded?
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    return Combined;

  unsigned EltSize = EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltSize));

  MVT IntVT = MVT::getIntegerVT(VecSize);
  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);

  // Convert vector index to bit-index (* EltSize)
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);

  if (ResultVT == MVT::f16) {
    SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
  }

  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}

static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
  assert(Elt % 2 == 0);
  return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
}
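// E.g. for the mask <4, 5, 2, 3> both pairs are contiguous and even-aligned
// (elements 0-1 read rhs[0:1], elements 2-3 read lhs[2:3]), so each half of
// the shuffle lowering below can use a single 32-bit subvector extract.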
4840 SDValue
SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op
,
4841 SelectionDAG
&DAG
) const {
4843 EVT ResultVT
= Op
.getValueType();
4844 ShuffleVectorSDNode
*SVN
= cast
<ShuffleVectorSDNode
>(Op
);
4846 EVT PackVT
= ResultVT
.isInteger() ? MVT::v2i16
: MVT::v2f16
;
4847 EVT EltVT
= PackVT
.getVectorElementType();
4848 int SrcNumElts
= Op
.getOperand(0).getValueType().getVectorNumElements();
4850 // vector_shuffle <0,1,6,7> lhs, rhs
4851 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
4853 // vector_shuffle <6,7,2,3> lhs, rhs
4854 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
4856 // vector_shuffle <6,7,0,1> lhs, rhs
4857 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
4859 // Avoid scalarizing when both halves are reading from consecutive elements.
4860 SmallVector
<SDValue
, 4> Pieces
;
4861 for (int I
= 0, N
= ResultVT
.getVectorNumElements(); I
!= N
; I
+= 2) {
4862 if (elementPairIsContiguous(SVN
->getMask(), I
)) {
4863 const int Idx
= SVN
->getMaskElt(I
);
4864 int VecIdx
= Idx
< SrcNumElts
? 0 : 1;
4865 int EltIdx
= Idx
< SrcNumElts
? Idx
: Idx
- SrcNumElts
;
4866 SDValue SubVec
= DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, SL
,
4867 PackVT
, SVN
->getOperand(VecIdx
),
4868 DAG
.getConstant(EltIdx
, SL
, MVT::i32
));
4869 Pieces
.push_back(SubVec
);
4871 const int Idx0
= SVN
->getMaskElt(I
);
4872 const int Idx1
= SVN
->getMaskElt(I
+ 1);
4873 int VecIdx0
= Idx0
< SrcNumElts
? 0 : 1;
4874 int VecIdx1
= Idx1
< SrcNumElts
? 0 : 1;
4875 int EltIdx0
= Idx0
< SrcNumElts
? Idx0
: Idx0
- SrcNumElts
;
4876 int EltIdx1
= Idx1
< SrcNumElts
? Idx1
: Idx1
- SrcNumElts
;
4878 SDValue Vec0
= SVN
->getOperand(VecIdx0
);
4879 SDValue Elt0
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
,
4880 Vec0
, DAG
.getConstant(EltIdx0
, SL
, MVT::i32
));
4882 SDValue Vec1
= SVN
->getOperand(VecIdx1
);
4883 SDValue Elt1
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
,
4884 Vec1
, DAG
.getConstant(EltIdx1
, SL
, MVT::i32
));
4885 Pieces
.push_back(DAG
.getBuildVector(PackVT
, SL
, { Elt0
, Elt1
}));
4889 return DAG
.getNode(ISD::CONCAT_VECTORS
, SL
, ResultVT
, Pieces
);
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
    EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);

    // Turn into pair of packed build_vectors.
    // TODO: Special case for constants that can be materialized with s_mov_b64.
    SDValue Lo = DAG.getBuildVector(HalfVT, SL,
                                    { Op.getOperand(0), Op.getOperand(1) });
    SDValue Hi = DAG.getBuildVector(HalfVT, SL,
                                    { Op.getOperand(2), Op.getOperand(3) });

    SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
    SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);

    SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
    return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
  }

  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
  assert(!Subtarget->hasVOP3PInsts() && "this should be legal");

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);

  // Avoid adding defined bits with the zero_extend.
  if (Hi.isUndef()) {
    Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
    SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
    return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
  }

  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);

  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
                              DAG.getConstant(16, SL, MVT::i32));

  if (Lo.isUndef())
    return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);

  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);

  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
  return DAG.getNode(ISD::BITCAST, SL, VT, Or);
}
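// Illustrative example (added): building a v2i16 from two defined i16 values
// x and y on a subtarget without VOP3P instructions becomes
//   Or     = (zext x to i32) | ((zext y to i32) << 16)
//   result = bitcast Or to v2i16
// matching the SHL/OR sequence above; if either half is undef one of the
// earlier returns fires and the OR is skipped.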
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // We can fold offsets for anything that doesn't require a GOT relocation.
  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         !shouldEmitGOTReloc(GA->getGlobal());
}
static SDValue
buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
                        const SDLoc &DL, unsigned Offset, EVT PtrVT,
                        unsigned GAFlags = SIInstrInfo::MO_NONE) {
  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.
  unsigned LoFlags = GAFlags;
  if (LoFlags == SIInstrInfo::MO_NONE)
    LoFlags = SIInstrInfo::MO_REL32;
  SDValue PtrLo =
      DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, LoFlags);
  SDValue PtrHi;
  if (GAFlags == SIInstrInfo::MO_NONE) {
    PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
  } else {
    PtrHi =
        DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags + 1);
  }
  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
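// Usage sketch (illustrative): for a GOT-relative access the caller passes
// GAFlags = SIInstrInfo::MO_GOTPCREL32, so PtrLo and PtrHi become target
// global addresses carrying the @gotpcrel32@lo / @gotpcrel32@hi flags, and
// the PC_ADD_REL_OFFSET node later expands to the s_getpc_b64 / s_add_u32 /
// s_addc_u32 sequence described in the comment above.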
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GSD->getGlobal();
  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
       (!GV->hasExternalLinkage() ||
        getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
        getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
      GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

  SDLoc DL(GSD);
  EVT PtrVT = Op.getValueType();

  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
                                            SIInstrInfo::MO_ABS32_LO);
    return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
  }

  if (shouldEmitFixup(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
  else if (shouldEmitPCReloc(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
                                   SIInstrInfo::MO_REL32);

  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
                                            SIInstrInfo::MO_GOTPCREL32);

  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  const DataLayout &DataLayout = DAG.getDataLayout();
  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getGOT(DAG.getMachineFunction());

  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
                                   const SDLoc &DL, SDValue V) const {
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
  // the destination register.
  //
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
  // so we will end up with redundant moves to m0.
  //
  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.

  // A Null SDValue creates a glue result.
  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
                                  V, Chain);
  return SDValue(M0, 0);
}
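// Usage sketch (illustrative): lowerings below thread the glue result
// through the consuming node, e.g.
//   SDValue M0   = copyToM0(DAG, DAG.getEntryNode(), DL, M0Value);
//   SDValue Glue = M0.getValue(1);
// so the interpolation or ds_ordered_count node stays adjacent to the m0
// initialization. M0Value is a placeholder operand name, not from the source.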
SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
                                                 SDValue Op,
                                                 MVT VT,
                                                 unsigned Offset) const {
  SDLoc SL(Op);
  SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
                                           DAG.getEntryNode(), Offset, 4, false);
  // The local size values will have the hi 16-bits as zero.
  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                     DAG.getValueType(VT));
}
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                        EVT VT) {
  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
                                      "non-hsa intrinsic with hsa target",
                                      DL.getDebugLoc());
  DAG.getContext()->diagnose(BadIntrin);
  return DAG.getUNDEF(VT);
}
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                         EVT VT) {
  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
                                      "intrinsic not supported on subtarget",
                                      DL.getDebugLoc());
  DAG.getContext()->diagnose(BadIntrin);
  return DAG.getUNDEF(VT);
}
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
                                    ArrayRef<SDValue> Elts) {
  assert(!Elts.empty());
  MVT Type;
  unsigned NumElts;

  if (Elts.size() == 1) {
    Type = MVT::f32;
    NumElts = 1;
  } else if (Elts.size() == 2) {
    Type = MVT::v2f32;
    NumElts = 2;
  } else if (Elts.size() <= 4) {
    Type = MVT::v4f32;
    NumElts = 4;
  } else if (Elts.size() <= 8) {
    Type = MVT::v8f32;
    NumElts = 8;
  } else {
    assert(Elts.size() <= 16);
    Type = MVT::v16f32;
    NumElts = 16;
  }

  SmallVector<SDValue, 16> VecElts(NumElts);
  for (unsigned i = 0; i < Elts.size(); ++i) {
    SDValue Elt = Elts[i];
    if (Elt.getValueType() != MVT::f32)
      Elt = DAG.getBitcast(MVT::f32, Elt);
    VecElts[i] = Elt;
  }
  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getUNDEF(MVT::f32);

  if (NumElts == 1)
    return VecElts[0];
  return DAG.getBuildVector(Type, DL, VecElts);
}
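// Illustrative note (added, assuming the 1/2/4/8/16 size buckets
// reconstructed above): three address dwords are rounded up to a v4f32
// build_vector with the trailing lane filled by the undef loop, and any
// non-f32 input is bitcast to f32 first.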
static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
                             SDValue *GLC, SDValue *SLC, SDValue *DLC) {
  auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());

  uint64_t Value = CachePolicyConst->getZExtValue();
  SDLoc DL(CachePolicy);
  if (GLC) {
    *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
    Value &= ~(uint64_t)0x1;
  }
  if (SLC) {
    *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
    Value &= ~(uint64_t)0x2;
  }
  if (DLC) {
    *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
    Value &= ~(uint64_t)0x4;
  }

  return Value == 0;
}
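// Worked example (illustrative, assuming the unhandled-bit check above): a
// cachepolicy immediate of 3 (glc | slc) yields GLC = 1 and SLC = 1 with all
// bits consumed; an immediate with an unsupported bit set (e.g. 8) leaves
// bits behind, and the callers treat that as a malformed intrinsic and
// decline to custom-lower it.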
// Re-construct the required return value for a image load intrinsic.
// This is more complicated due to the optional use TexFailCtrl which means the required
// return type is an aggregate
static SDValue constructRetValue(SelectionDAG &DAG,
                                 MachineSDNode *Result,
                                 ArrayRef<EVT> ResultTypes,
                                 bool IsTexFail, bool Unpacked, bool IsD16,
                                 int DMaskPop, int NumVDataDwords,
                                 const SDLoc &DL, LLVMContext &Context) {
  // Determine the required return type. This is the same regardless of IsTexFail flag
  EVT ReqRetVT = ResultTypes[0];
  EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
  EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
  EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
                                           : AdjEltVT
                       : ReqRetVT;

  // Extract data part of the result
  // Bitcast the result to the same type as the required return type
  int NumElts;
  if (IsD16 && !Unpacked)
    NumElts = NumVDataDwords << 1;
  else
    NumElts = NumVDataDwords;

  EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
                           : AdjEltVT;

  // Special case for v6f16. Rather than add support for this, use v3i32 to
  // extract the data elements
  bool V6F16Special = false;
  if (NumElts == 6) {
    CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
    DMaskPop >>= 1;
    ReqRetNumElts >>= 1;
    V6F16Special = true;
  }

  SDValue N = SDValue(Result, 0);
  SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);

  // Iterate over the result
  SmallVector<SDValue, 4> BVElts;

  if (CastVT.isVector()) {
    DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
  } else {
    BVElts.push_back(CastRes);
  }
  int ExtraElts = ReqRetNumElts - DMaskPop;
  while (ExtraElts--)
    BVElts.push_back(DAG.getUNDEF(AdjEltVT));

  SDValue PreTFCRes;
  if (ReqRetNumElts > 1) {
    SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
    if (IsD16 && Unpacked)
      PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
    else
      PreTFCRes = NewVec;
  } else {
    PreTFCRes = BVElts[0];
  }

  if (V6F16Special)
    PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);

  if (!IsTexFail) {
    if (Result->getNumValues() > 1)
      return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
    return PreTFCRes;
  }

  // Extract the TexFail result and insert into aggregate return
  SmallVector<SDValue, 1> TFCElt;
  DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
  SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
  return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
}
static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
                         SDValue *LWE, bool &IsTexFail) {
  auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());

  uint64_t Value = TexFailCtrlConst->getZExtValue();
  if (Value)
    IsTexFail = true;

  SDLoc DL(TexFailCtrlConst);
  *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
  Value &= ~(uint64_t)0x1;
  *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
  Value &= ~(uint64_t)0x2;

  return Value == 0;
}
SDValue SITargetLowering::lowerImage(SDValue Op,
                                     const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                     SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MachineFunction &MF = DAG.getMachineFunction();
  const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
      AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
      AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
  unsigned IntrOpcode = Intr->BaseOpcode;
  bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;

  SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
  SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
  bool IsD16 = false;
  bool IsA16 = false;
  SDValue VData;
  int NumVDataDwords;
  bool AdjustRetType = false;

  unsigned AddrIdx; // Index of first address argument
  unsigned DMask;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VData = Op.getOperand(2);

    bool Is64Bit = VData.getValueType() == MVT::i64;
    if (BaseOpcode->AtomicX2) {
      SDValue VData2 = Op.getOperand(3);
      VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
                                 {VData, VData2});
      if (Is64Bit)
        VData = DAG.getBitcast(MVT::v4i32, VData);

      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
      AddrIdx = 4;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
      AddrIdx = 3;
    }
  } else {
    unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
    auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
    DMask = DMaskConst->getZExtValue();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);

    if (BaseOpcode->Store) {
      VData = Op.getOperand(2);

      MVT StoreVT = VData.getSimpleValueType();
      if (StoreVT.getScalarType() == MVT::f16) {
        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction

        IsD16 = true;
        VData = handleD16VData(VData, DAG);
      }

      NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
    } else {
      // Work out the num dwords based on the dmask popcount and underlying type
      // and whether packing is supported.
      MVT LoadVT = ResultTypes[0].getSimpleVT();
      if (LoadVT.getScalarType() == MVT::f16) {
        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction

        IsD16 = true;
      }

      // Confirm that the return type is large enough for the dmask specified
      if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
          (!LoadVT.isVector() && DMaskLanes > 1))
        return Op;

      if (IsD16 && !Subtarget->hasUnpackedD16VMem())
        NumVDataDwords = (DMaskLanes + 1) / 2;
      else
        NumVDataDwords = DMaskLanes;

      AdjustRetType = true;
    }

    AddrIdx = DMaskIdx + 1;
  }

  unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
  unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
  unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
  unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
                       NumCoords + NumLCM;
  unsigned NumMIVAddrs = NumVAddrs;

  SmallVector<SDValue, 4> VAddrs;

  // Optimize _L to _LZ when _L is zero
  if (LZMappingInfo) {
    if (auto ConstantLod =
         dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
        NumMIVAddrs--;                   // remove 'lod'
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (MIPMappingInfo) {
    if (auto ConstantLod =
         dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
      if (ConstantLod->isNullValue()) {
        IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
        NumMIVAddrs--;                        // remove 'lod'
      }
    }
  }

  // Check for 16 bit addresses and pack if true.
  unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
  const MVT VAddrScalarVT = VAddrVT.getScalarType();
  if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
      ST->hasFeature(AMDGPU::FeatureR128A16)) {
    IsA16 = true;
    const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
    for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
      SDValue AddrLo, AddrHi;
      // Push back extra arguments.
      if (i < DimIdx) {
        AddrLo = Op.getOperand(i);
      } else {
        AddrLo = Op.getOperand(i);
        // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
        // in 1D, derivatives dx/dh and dx/dv are packed with undef.
        if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
            ((NumGradients / 2) % 2 == 1 &&
             (i == DimIdx + (NumGradients / 2) - 1 ||
              i == DimIdx + NumGradients - 1))) {
          AddrHi = DAG.getUNDEF(MVT::f16);
        } else {
          AddrHi = Op.getOperand(i + 1);
          ++i;
        }
        AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
                             {AddrLo, AddrHi});
        AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
      }
      VAddrs.push_back(AddrLo);
    }
  } else {
    for (unsigned i = 0; i < NumMIVAddrs; ++i)
      VAddrs.push_back(Op.getOperand(AddrIdx + i));
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator that
  // MIMG addresses should be placed contiguously when it is possible to do so,
  // so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  bool UseNSA =
      ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;

  SDValue VAddr;
  if (!UseNSA)
    VAddr = getBuildDwordsVector(DAG, DL, VAddrs);

  SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
  SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
  unsigned CtrlIdx; // Index of texfailctrl argument
  SDValue Unorm;
  if (!BaseOpcode->Sampler) {
    Unorm = True;
    CtrlIdx = AddrIdx + NumVAddrs + 1;
  } else {
    auto UnormConst =
        cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));

    Unorm = UnormConst->getZExtValue() ? True : False;
    CtrlIdx = AddrIdx + NumVAddrs + 3;
  }

  SDValue TFE;
  SDValue LWE;
  SDValue TexFail = Op.getOperand(CtrlIdx);
  bool IsTexFail = false;
  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
    return Op;

  if (IsTexFail) {
    if (!DMaskLanes) {
      // Expecting to get an error flag since TFC is on - and dmask is 0
      // Force dmask to be at least 1 otherwise the instruction will fail
      DMask = 0x1;
      DMaskLanes = 1;
      NumVDataDwords = 1;
    }
    NumVDataDwords += 1;
    AdjustRetType = true;
  }

  // Has something earlier tagged that the return type needs adjusting
  // This happens if the instruction is a load or has set TexFailCtrl flags
  if (AdjustRetType) {
    // NumVDataDwords reflects the true number of dwords required in the return type
    if (DMaskLanes == 0 && !BaseOpcode->Store) {
      // This is a no-op load. This can be eliminated
      SDValue Undef = DAG.getUNDEF(Op.getValueType());
      if (isa<MemSDNode>(Op))
        return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
      return Undef;
    }

    EVT NewVT = NumVDataDwords > 1 ?
                  EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
                : MVT::f32;

    ResultTypes[0] = NewVT;
    if (ResultTypes.size() == 3) {
      // Original result was aggregate type used for TexFailCtrl results
      // The actual instruction returns as a vector type which has now been
      // created. Remove the aggregate result.
      ResultTypes.erase(&ResultTypes[1]);
    }
  }

  SDValue GLC;
  SDValue SLC;
  SDValue DLC;
  if (BaseOpcode->Atomic) {
    GLC = True; // TODO no-return optimization
    if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC,
                          IsGFX10 ? &DLC : nullptr))
      return Op;
  } else {
    if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC,
                          IsGFX10 ? &DLC : nullptr))
      return Op;
  }

  SmallVector<SDValue, 26> Ops;
  if (BaseOpcode->Store || BaseOpcode->Atomic)
    Ops.push_back(VData); // vdata
  if (UseNSA) {
    for (const SDValue &Addr : VAddrs)
      Ops.push_back(Addr);
  } else {
    Ops.push_back(VAddr);
  }
  Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
  if (BaseOpcode->Sampler)
    Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
  Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
  if (IsGFX10)
    Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
  Ops.push_back(Unorm);
  if (IsGFX10)
    Ops.push_back(DLC);
  Ops.push_back(GLC);
  Ops.push_back(SLC);
  Ops.push_back(IsA16 &&  // a16 or r128
                ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
  Ops.push_back(TFE); // tfe
  Ops.push_back(LWE); // lwe
  if (!IsGFX10)
    Ops.push_back(DimInfo->DA ? True : False);
  if (BaseOpcode->HasD16)
    Ops.push_back(IsD16 ? True : False);
  if (isa<MemSDNode>(Op))
    Ops.push_back(Op.getOperand(0)); // chain

  int NumVAddrDwords =
      UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
  int Opcode = -1;

  if (IsGFX10) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
  if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
    MachineMemOperand *MemRef = MemOp->getMemOperand();
    DAG.setNodeMemRefs(NewNode, {MemRef});
  }

  if (BaseOpcode->AtomicX2) {
    SmallVector<SDValue, 1> Elt;
    DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
    return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
  } else if (!BaseOpcode->Store) {
    return constructRetValue(DAG, NewNode,
                             OrigResultTypes, IsTexFail,
                             Subtarget->hasUnpackedD16VMem(), IsD16,
                             DMaskLanes, NumVDataDwords, DL,
                             *DAG.getContext());
  }

  return SDValue(NewNode, 0);
}
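// Illustrative note (added): for a d16 image load with dmask 0xf on a
// subtarget with packed d16 memory instructions, DMaskLanes is 4 and
// NumVDataDwords becomes (4 + 1) / 2 = 2, so a 2-dword MIMG variant is
// selected; when TFE/LWE is requested, NumVDataDwords is bumped by one to
// make room for the error-flag dword.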
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
                                       SDValue Offset, SDValue GLC, SDValue DLC,
                                       SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      VT.getStoreSize(), VT.getStoreSize());

  if (!Offset->isDivergent()) {
    SDValue Ops[] = {
        Rsrc,
        Offset, // Offset
        GLC,
        DLC,
    };
    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
                                   DAG.getVTList(VT), Ops, VT, MMO);
  }

  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.
  SmallVector<SDValue, 4> Loads;
  unsigned NumLoads = 1;
  MVT LoadVT = VT.getSimpleVT();
  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
  assert((LoadVT.getScalarType() == MVT::i32 ||
          LoadVT.getScalarType() == MVT::f32) &&
         isPowerOf2_32(NumElts));

  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts == 16 ? 4 : 2;
    LoadVT = MVT::v4i32;
  }

  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
  unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
  SDValue Ops[] = {
      DAG.getEntryNode(),                         // Chain
      Rsrc,                                       // rsrc
      DAG.getConstant(0, DL, MVT::i32),           // vindex
      SDValue(),                                  // voffset
      SDValue(),                                  // soffset
      SDValue(),                                  // offset
      DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1),            // idxen
  };

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);

  uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
  for (unsigned i = 0; i < NumLoads; ++i) {
    Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
    Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
                                            Ops, LoadVT, MMO));
  }

  if (VT == MVT::v8i32 || VT == MVT::v16i32)
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);

  return Loads[0];
}
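// Worked example (illustrative): an s_buffer_load of v8i32 with a divergent
// offset is split into NumLoads = 2 MUBUF loads of v4i32 at immediate
// offsets InstOffset and InstOffset + 16, and the pieces are stitched back
// together with CONCAT_VECTORS as above.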
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto MFI = MF.getInfo<SIMachineFunctionInfo>();

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  // TODO: Should this propagate fast-math-flags?

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
    if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
      return emitNonHSAIntrinsicError(DAG, DL, VT);
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  }
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
    if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
      DiagnosticInfoUnsupported BadIntrin(
          MF.getFunction(), "unsupported hsa intrinsic without hsa target",
          DL.getDebugLoc());
      DAG.getContext()->diagnose(BadIntrin);
      return DAG.getUNDEF(VT);
    }

    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
      AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
    return getPreloadedValue(DAG, *MFI, VT, RegID);
  }
  case Intrinsic::amdgcn_implicitarg_ptr: {
    if (MFI->isEntryFunction())
      return getImplicitArgPtr(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  }
  case Intrinsic::amdgcn_dispatch_id: {
    return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
  }
  case Intrinsic::amdgcn_rcp:
    return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq:
    return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);

    return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rcp_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);
    return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));

    Type *Type = VT.getTypeForEVT(*DAG.getContext());
    APFloat Max = APFloat::getLargest(Type->getFltSemantics());
    APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);

    SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
    SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
                              DAG.getConstantFP(Max, DL, VT));
    return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
                       DAG.getConstantFP(Min, DL, VT));
  }
  case Intrinsic::r600_read_ngroups_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_X, 4, false);
  case Intrinsic::r600_read_ngroups_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_Y, 4, false);
  case Intrinsic::r600_read_ngroups_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_Z, 4, false);
  case Intrinsic::r600_read_global_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
  case Intrinsic::r600_read_global_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
  case Intrinsic::r600_read_global_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
  case Intrinsic::r600_read_local_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
  case Intrinsic::r600_read_tgid_x:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::r600_read_tidig_x:
    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                          SDLoc(DAG.getEntryNode()),
                          MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                          SDLoc(DAG.getEntryNode()),
                          MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                          SDLoc(DAG.getEntryNode()),
                          MFI->getArgInfo().WorkItemIDZ);
  case Intrinsic::amdgcn_wavefrontsize:
    return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
                           SDLoc(Op), MVT::i32);
  case Intrinsic::amdgcn_s_buffer_load: {
    bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
    SDValue GLC;
    SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1);
    if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr,
                          IsGFX10 ? &DLC : nullptr))
      return Op;
    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), GLC, DLC,
                        DAG);
  }
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_interp_mov: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    SDValue Glue = M0.getValue(1);
    return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Glue);
  }
  case Intrinsic::amdgcn_interp_p1: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    SDValue Glue = M0.getValue(1);
    return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Glue);
  }
  case Intrinsic::amdgcn_interp_p2: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
    SDValue Glue = SDValue(M0.getNode(), 1);
    return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
                       Glue);
  }
  case Intrinsic::amdgcn_interp_p1_f16: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
    SDValue Glue = M0.getValue(1);
    if (getSubtarget()->getLDSBankCount() == 16) {
      // 16 bank LDS
      SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
                              DAG.getConstant(2, DL, MVT::i32), // P0
                              Op.getOperand(2),                 // Attrchan
                              Op.getOperand(3),                 // Attr
                              Glue);
      SDValue Ops[] = {
        Op.getOperand(1), // Src0
        Op.getOperand(2), // Attrchan
        Op.getOperand(3), // Attr
        DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
        S, // Src2 - holds two f16 values selected by high
        DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
        Op.getOperand(4), // high
        DAG.getConstant(0, DL, MVT::i1), // $clamp
        DAG.getConstant(0, DL, MVT::i32) // $omod
      };
      return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
    } else {
      // 32 bank LDS
      SDValue Ops[] = {
        Op.getOperand(1), // Src0
        Op.getOperand(2), // Attrchan
        Op.getOperand(3), // Attr
        DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
        Op.getOperand(4), // high
        DAG.getConstant(0, DL, MVT::i1), // $clamp
        DAG.getConstant(0, DL, MVT::i32), // $omod
        Glue
      };
      return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
    }
  }
  case Intrinsic::amdgcn_interp_p2_f16: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
    SDValue Glue = SDValue(M0.getNode(), 1);
    SDValue Ops[] = {
      Op.getOperand(2), // Src0
      Op.getOperand(3), // Attrchan
      Op.getOperand(4), // Attr
      DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
      Op.getOperand(1), // Src2
      DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
      Op.getOperand(5), // high
      DAG.getConstant(0, DL, MVT::i1), // $clamp
      Glue
    };
    return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
  }
  case Intrinsic::amdgcn_sin:
    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_cos:
    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_mul_u24:
    return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_mul_i24:
    return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));

  case Intrinsic::amdgcn_log_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return SDValue();

    DiagnosticInfoUnsupported BadIntrin(
        MF.getFunction(), "intrinsic not supported on subtarget",
        DL.getDebugLoc());
    DAG.getContext()->diagnose(BadIntrin);
    return DAG.getUNDEF(VT);
  }
  case Intrinsic::amdgcn_ldexp:
    return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));

  case Intrinsic::amdgcn_fract:
    return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_class:
    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_div_fmas:
    return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(4));

  case Intrinsic::amdgcn_div_fixup:
    return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

  case Intrinsic::amdgcn_trig_preop:
    return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_div_scale: {
    const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));

    // Translate to the operands expected by the machine instruction. The
    // first parameter must be the same as the first instruction.
    SDValue Numerator = Op.getOperand(1);
    SDValue Denominator = Op.getOperand(2);

    // Note this order is opposite of the machine instruction's operations,
    // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    // intrinsic has the numerator as the first operand to match a normal
    // division operation.

    SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;

    return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
                       Denominator, Numerator);
  }
  case Intrinsic::amdgcn_icmp: {
    // There is a Pat that handles this variant, so return it as-is.
    if (Op.getOperand(1).getValueType() == MVT::i1 &&
        Op.getConstantOperandVal(2) == 0 &&
        Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
      return Op;
    return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_fcmp: {
    return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_fmed3:
    return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_fdot2:
    return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(4));
  case Intrinsic::amdgcn_fmul_legacy:
    return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_sffbh:
    return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_sbfe:
    return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_ubfe:
    return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    // FIXME: Stop adding cast if v2f16/v2i16 are legal.
    EVT VT = Op.getValueType();
    unsigned Opcode;

    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
    else
      Opcode = AMDGPUISD::CVT_PK_U16_U32;

    if (isTypeLegal(VT))
      return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));

    SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
                               Op.getOperand(1), Op.getOperand(2));
    return DAG.getNode(ISD::BITCAST, DL, VT, Node);
  }
  case Intrinsic::amdgcn_fmad_ftz:
    return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));

  case Intrinsic::amdgcn_if_break:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
                                      Op->getOperand(1), Op->getOperand(2)), 0);

  case Intrinsic::amdgcn_groupstaticsize: {
    Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return Op;

    const Module *M = MF.getFunction().getParent();
    const GlobalValue *GV =
        M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
                                            SIInstrInfo::MO_ABS32_LO);
    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
  }
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG);

    return Op;
  }
}
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                 SelectionDAG &DAG) const {
  unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  SDLoc DL(Op);

  switch (IntrID) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    MemSDNode *M = cast<MemSDNode>(Op);
    SDValue Chain = M->getOperand(0);
    SDValue M0 = M->getOperand(2);
    SDValue Value = M->getOperand(3);
    unsigned IndexOperand = M->getConstantOperandVal(7);
    unsigned WaveRelease = M->getConstantOperandVal(8);
    unsigned WaveDone = M->getConstantOperandVal(9);
    unsigned ShaderType;
    unsigned Instruction;

    unsigned OrderedCountIndex = IndexOperand & 0x3f;
    IndexOperand &= ~0x3f;
    unsigned CountDw = 0;

    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
      CountDw = (IndexOperand >> 24) & 0xf;
      IndexOperand &= ~(0xf << 24);

      if (CountDw < 1 || CountDw > 4) {
        report_fatal_error(
            "ds_ordered_count: dword count must be between 1 and 4");
      }
    }

    if (IndexOperand)
      report_fatal_error("ds_ordered_count: bad index operand");

    switch (IntrID) {
    case Intrinsic::amdgcn_ds_ordered_add:
      Instruction = 0;
      break;
    case Intrinsic::amdgcn_ds_ordered_swap:
      Instruction = 1;
      break;
    }

    if (WaveDone && !WaveRelease)
      report_fatal_error("ds_ordered_count: wave_done requires wave_release");

    switch (DAG.getMachineFunction().getFunction().getCallingConv()) {
    case CallingConv::AMDGPU_CS:
    case CallingConv::AMDGPU_KERNEL:
      ShaderType = 0;
      break;
    case CallingConv::AMDGPU_PS:
      ShaderType = 1;
      break;
    case CallingConv::AMDGPU_VS:
      ShaderType = 2;
      break;
    case CallingConv::AMDGPU_GS:
      ShaderType = 3;
      break;
    default:
      report_fatal_error("ds_ordered_count unsupported for this calling conv");
    }

    unsigned Offset0 = OrderedCountIndex << 2;
    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
                       (Instruction << 4);

    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
      Offset1 |= (CountDw - 1) << 6;

    unsigned Offset = Offset0 | (Offset1 << 8);

    SDValue Ops[] = {
      Chain,
      Value,
      DAG.getTargetConstant(Offset, DL, MVT::i16),
      copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
    };
    return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
                                   M->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  }
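  // Worked example (illustrative, assuming the Instruction/ShaderType
  // encodings reconstructed above): ds_ordered_add from a compute shader with
  // wave_release = 1, wave_done = 1, index 0 and a gfx10 dword count of 1 gives
  //   Offset0 = 0 << 2           = 0x0
  //   Offset1 = 1 | (1 << 1)     = 0x3
  //   Offset  = 0x0 | (0x3 << 8) = 0x300
  // which is the 16-bit immediate placed on the DS_ORDERED_COUNT node.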
  case Intrinsic::amdgcn_ds_fadd: {
    MemSDNode *M = cast<MemSDNode>(Op);
    unsigned Opc;
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_fadd:
      Opc = ISD::ATOMIC_LOAD_FADD;
      break;
    }

    return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
                         M->getOperand(0), M->getOperand(2), M->getOperand(3),
                         M->getMemOperand());
  }
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    MemSDNode *M = cast<MemSDNode>(Op);
    unsigned Opc;
    switch (IntrID) {
    case Intrinsic::amdgcn_atomic_inc:
      Opc = AMDGPUISD::ATOMIC_INC;
      break;
    case Intrinsic::amdgcn_atomic_dec:
      Opc = AMDGPUISD::ATOMIC_DEC;
      break;
    case Intrinsic::amdgcn_ds_fmin:
      Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
      break;
    case Intrinsic::amdgcn_ds_fmax:
      Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
      break;
    default:
      llvm_unreachable("Unknown intrinsic!");
    }
    SDValue Ops[] = {
      M->getOperand(0), // Chain
      M->getOperand(2), // Ptr
      M->getOperand(3)  // Value
    };

    return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format: {
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // rsrc
      Op.getOperand(3), // vindex
      SDValue(),        // voffset -- will be set by setBufferOffsets
      SDValue(),        // soffset -- will be set by setBufferOffsets
      SDValue(),        // offset -- will be set by setBufferOffsets
      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };

    setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
    unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;

    EVT VT = Op.getValueType();
    EVT IntVT = VT.changeTypeToInteger();
    auto *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                 M, DAG, Ops);

    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
    if (LoadVT.getScalarType() == MVT::i8 ||
        LoadVT.getScalarType() == MVT::i16)
      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);

    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);
  }
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format: {
    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(4), // soffset
      Offsets.second,   // offset
      Op.getOperand(5), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };

    unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;

    EVT VT = Op.getValueType();
    EVT IntVT = VT.changeTypeToInteger();
    auto *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                 M, DAG, Ops);

    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
    if (LoadVT.getScalarType() == MVT::i8 ||
        LoadVT.getScalarType() == MVT::i16)
      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);

    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);
  }
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format: {
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // rsrc
      Op.getOperand(3), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };

    unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;

    EVT VT = Op.getValueType();
    EVT IntVT = VT.changeTypeToInteger();
    auto *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                 M, DAG, Ops);

    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
    if (LoadVT.getScalarType() == MVT::i8 ||
        LoadVT.getScalarType() == MVT::i16)
      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);

    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);
  }
  case Intrinsic::amdgcn_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();

    unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
    unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // rsrc
      Op.getOperand(3), // vindex
      Op.getOperand(4), // voffset
      Op.getOperand(5), // soffset
      Op.getOperand(6), // offset
      DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
                                 M, DAG, Ops);
    return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                               Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
                               DAG);
  }
  case Intrinsic::amdgcn_raw_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();
    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);

    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(4), // soffset
      Offsets.second,   // offset
      Op.getOperand(5), // format
      Op.getOperand(6), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
                                 M, DAG, Ops);
    return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                               Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
                               DAG);
  }
  case Intrinsic::amdgcn_struct_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);

    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // rsrc
      Op.getOperand(3), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // format
      Op.getOperand(7), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
                                 M, DAG, Ops);
    return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                               Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
                               DAG);
  }
  case Intrinsic::amdgcn_buffer_atomic_swap:
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_buffer_atomic_xor: {
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      SDValue(),        // voffset -- will be set by setBufferOffsets
      SDValue(),        // soffset -- will be set by setBufferOffsets
      SDValue(),        // offset -- will be set by setBufferOffsets
      DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };
    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    EVT VT = Op.getValueType();

    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = 0;

    switch (IntrID) {
    case Intrinsic::amdgcn_buffer_atomic_swap:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
      break;
    case Intrinsic::amdgcn_buffer_atomic_add:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
      break;
    case Intrinsic::amdgcn_buffer_atomic_sub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
      break;
    case Intrinsic::amdgcn_buffer_atomic_smin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
      break;
    case Intrinsic::amdgcn_buffer_atomic_umin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
      break;
    case Intrinsic::amdgcn_buffer_atomic_smax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
      break;
    case Intrinsic::amdgcn_buffer_atomic_umax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
      break;
    case Intrinsic::amdgcn_buffer_atomic_and:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
      break;
    case Intrinsic::amdgcn_buffer_atomic_or:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
      break;
    case Intrinsic::amdgcn_buffer_atomic_xor:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
      break;
    default:
      llvm_unreachable("unhandled atomic opcode");
    }

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec: {
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();

    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = 0;

    switch (IntrID) {
    case Intrinsic::amdgcn_raw_buffer_atomic_swap:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_add:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_sub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_smin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_umin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_smax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_umax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_and:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_or:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_xor:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_inc:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
      break;
    case Intrinsic::amdgcn_raw_buffer_atomic_dec:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
      break;
    default:
      llvm_unreachable("unhandled atomic opcode");
    }

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec: {
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();

    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = 0;

    switch (IntrID) {
    case Intrinsic::amdgcn_struct_buffer_atomic_swap:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_add:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_sub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_smin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_umin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_smax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_umax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_and:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_or:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_xor:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_inc:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
      break;
    case Intrinsic::amdgcn_struct_buffer_atomic_dec:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
      break;
    default:
      llvm_unreachable("unhandled atomic opcode");
    }

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      Op.getOperand(5), // vindex
      SDValue(),        // voffset -- will be set by setBufferOffsets
      SDValue(),        // soffset -- will be set by setBufferOffsets
      SDValue(),        // offset -- will be set by setBufferOffsets
      DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };
    setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
    auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // src
      Op.getOperand(3), // cmp
      Op.getOperand(4), // rsrc
      Op.getOperand(5), // vindex
      Offsets.first,    // voffset
      Op.getOperand(7), // soffset
      Offsets.second,   // offset
      Op.getOperand(8), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT, M->getMemOperand());
  }
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return lowerImage(Op, ImageDimIntr, DAG);

    return SDValue();
  }
}
// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI.
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {
  EVT VT = VTList.VTs[0];
  EVT WidenedVT = VT;
  EVT WidenedMemVT = MemVT;
  if (!Subtarget->hasDwordx3LoadStores() &&
      (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) {
    WidenedVT = EVT::getVectorVT(*DAG.getContext(),
                                 WidenedVT.getVectorElementType(), 4);
    WidenedMemVT = EVT::getVectorVT(*DAG.getContext(),
                                    WidenedMemVT.getVectorElementType(), 4);
    MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16);
  }

  assert(VTList.NumVTs == 2);
  SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);

  auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
                                       WidenedMemVT, MMO);
  if (WidenedVT != VT) {
    auto Extract = DAG.getNode(
        ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
        DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
    NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
  }

  return NewOp;
}
SDValue SITargetLowering::handleD16VData(SDValue VData,
                                         SelectionDAG &DAG) const {
  EVT StoreVT = VData.getValueType();

  // No change for f16 and legal vector D16 types.
  if (!StoreVT.isVector())
    return VData;

  SDLoc DL(VData);
  assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");

  if (Subtarget->hasUnpackedD16VMem()) {
    // We need to unpack the packed data to store.
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                        StoreVT.getVectorNumElements());
    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
    return DAG.UnrollVectorOp(ZExt.getNode());
  }

  assert(isTypeLegal(StoreVT));
  return VData;
}
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  MachineFunction &MF = DAG.getMachineFunction();

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp: {
    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));

    const SDValue Ops[] = {
      Chain,
      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
      Op.getOperand(4), // src0
      Op.getOperand(5), // src1
      Op.getOperand(6), // src2
      Op.getOperand(7), // src3
      DAG.getTargetConstant(0, DL, MVT::i1), // compr
      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    };

    unsigned Opc = Done->isNullValue() ?
      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
  }
  case Intrinsic::amdgcn_exp_compr: {
    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    SDValue Src0 = Op.getOperand(4);
    SDValue Src1 = Op.getOperand(5);
    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));

    SDValue Undef = DAG.getUNDEF(MVT::f32);
    const SDValue Ops[] = {
      Chain,
      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
      Undef, // src2
      Undef, // src3
      DAG.getTargetConstant(1, DL, MVT::i1), // compr
      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    };

    unsigned Opc = Done->isNullValue() ?
      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
  }
  case Intrinsic::amdgcn_init_exec: {
    return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
                       Op.getOperand(2));
  }
  case Intrinsic::amdgcn_init_exec_from_input: {
    return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
                       Op.getOperand(2), Op.getOperand(3));
  }
  case Intrinsic::amdgcn_s_barrier: {
    if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
      const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
      unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
      if (WGSize <= ST.getWavefrontSize())
        return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
                                          Op.getOperand(0)), 0);
    }
    return SDValue();
  }
  case Intrinsic::amdgcn_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
    unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Chain,
      VData,             // vdata
      Op.getOperand(3),  // rsrc
      Op.getOperand(4),  // vindex
      Op.getOperand(5),  // voffset
      Op.getOperand(6),  // soffset
      Op.getOperand(7),  // offset
      DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Chain,
      VData,             // vdata
      Op.getOperand(3),  // rsrc
      Op.getOperand(4),  // vindex
      Offsets.first,     // voffset
      Op.getOperand(6),  // soffset
      Offsets.second,    // offset
      Op.getOperand(7),  // format
      Op.getOperand(8),  // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Chain,
      VData,             // vdata
      Op.getOperand(3),  // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,     // voffset
      Op.getOperand(5),  // soffset
      Offsets.second,    // offset
      Op.getOperand(6),  // format
      Op.getOperand(7),  // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
                           AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_store:
  case Intrinsic::amdgcn_buffer_store_format: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Chain,
      VData,
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      SDValue(), // voffset -- will be set by setBufferOffsets
      SDValue(), // soffset -- will be set by setBufferOffsets
      SDValue(), // offset -- will be set by setBufferOffsets
      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };
    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    EVT VDataType = VData.getValueType().getScalarType();
    if (VDataType == MVT::i8 || VDataType == MVT::i16)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    SDValue Ops[] = {
      Chain,
      VData,
      Op.getOperand(3), // rsrc
      DAG.getConstant(0, DL, MVT::i32), // vindex
      Offsets.first,    // voffset
      Op.getOperand(5), // soffset
      Offsets.second,   // offset
      Op.getOperand(6), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    EVT VDataType = VData.getValueType().getScalarType();
    if (VDataType == MVT::i8 || VDataType == MVT::i16)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    SDValue Ops[] = {
      Chain,
      VData,
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      Offsets.first,    // voffset
      Op.getOperand(6), // soffset
      Offsets.second,   // offset
      Op.getOperand(7), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    EVT VDataType = VData.getValueType().getScalarType();
    if (VDataType == MVT::i8 || VDataType == MVT::i16)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_atomic_fadd: {
    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
    unsigned IdxEn = 1;
    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
      IdxEn = Idx->getZExtValue() != 0;
    SDValue Ops[] = {
      Chain,
      Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
      Op.getOperand(4), // vindex
      SDValue(), // voffset -- will be set by setBufferOffsets
      SDValue(), // soffset -- will be set by setBufferOffsets
      SDValue(), // offset -- will be set by setBufferOffsets
      DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };
    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    EVT VT = Op.getOperand(2).getValueType();

    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD
                                    : AMDGPUISD::BUFFER_ATOMIC_FADD;

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_global_atomic_fadd: {
    SDValue Ops[] = {
      Chain,
      Op.getOperand(2), // ptr
      Op.getOperand(3)  // vdata
    };
    EVT VT = Op.getOperand(3).getValueType();

    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = VT.isVector() ? AMDGPUISD::ATOMIC_PK_FADD
                                    : AMDGPUISD::ATOMIC_FADD;

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_end_cf:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                      Op->getOperand(2), Chain), 0);

  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG);

    return Op;
  }
  }
}
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
    SDValue Offset, SelectionDAG &DAG) const {
  SDLoc DL(Offset);
  const unsigned MaxImm = 4095;
  SDValue N0 = Offset;
  ConstantSDNode *C1 = nullptr;

  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    N0 = SDValue();
  else if (DAG.isBaseWithConstantOffset(N0)) {
    C1 = cast<ConstantSDNode>(N0.getOperand(1));
    N0 = N0.getOperand(0);
  }

  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, put the value
    // and -4096 into the immoffset field so that the value that is copied/added
    // for the voffset field is a multiple of 4096, and it stands more chance
    // of being CSEd with the copy/add for another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
    if (Overflow) {
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      if (!N0)
        N0 = OverflowVal;
      else {
        SDValue Ops[] = { N0, OverflowVal };
        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
      }
    }
  }
  if (!N0)
    N0 = DAG.getConstant(0, DL, MVT::i32);
  if (!C1)
    C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
  return {N0, SDValue(C1, 0)};
}
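// Illustrative note on the split performed by splitBufferOffsets above (the
// numbers follow directly from the code): a combined constant offset of 5000
// has Overflow = 5000 & ~4095 = 4096 and ImmOffset = 904, so an add of 4096
// (a multiple of 4096 that can be CSEd across similar accesses) feeds the
// voffset operand while 904 lands in the 12-bit immoffset field.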
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        SelectionDAG &DAG, SDValue *Offsets,
                                        unsigned Align) const {
  SDLoc DL(CombinedOffset);
  if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
      Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }
  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
    SDValue N0 = CombinedOffset.getOperand(0);
    SDValue N1 = CombinedOffset.getOperand(1);
    uint32_t SOffset, ImmOffset;
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
    if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
                                                Subtarget, Align)) {
      Offsets[0] = N0;
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }
  Offsets[0] = CombinedOffset;
  Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
  Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
}
// Handle 8 bit and 16 bit buffer loads
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
                                                     ArrayRef<SDValue> Ops,
                                                     MemSDNode *M) const {
  EVT IntVT = LoadVT.changeTypeToInteger();
  unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
         AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;

  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
                                               Ops, IntVT,
                                               M->getMemOperand());
  SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
                                        LoadVT.getScalarType(), BufferLoad);
  return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
}
// Handle 8 bit and 16 bit buffer stores
SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
                                                      EVT VDataType, SDLoc DL,
                                                      SDValue Ops[],
                                                      MemSDNode *M) const {
  SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
  Ops[1] = BufferStoreExt;
  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
                                          AMDGPUISD::BUFFER_STORE_SHORT;
  ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
  return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
                                 M->getMemOperand());
}
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
                                 ISD::LoadExtType ExtType, SDValue Op,
                                 const SDLoc &SL, EVT VT) {
  if (VT.bitsLT(Op.getValueType()))
    return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);

  switch (ExtType) {
  case ISD::SEXTLOAD:
    return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
  case ISD::ZEXTLOAD:
    return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
  case ISD::EXTLOAD:
    return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
  case ISD::NON_EXTLOAD:
    return Op;
  }

  llvm_unreachable("invalid ext type");
}
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if (Ld->getAlignment() < 4 || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // later.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
  SDValue Ptr = Ld->getBasePtr();
  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
                                MVT::i32, SL, Ld->getChain(), Ptr,
                                Ld->getOffset(),
                                Ld->getPointerInfo(), MVT::i32,
                                Ld->getAlignment(),
                                Ld->getMemOperand()->getFlags(),
                                Ld->getAAInfo(),
                                nullptr); // Drop ranges

  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
           "unexpected fp extload");
    TruncVT = MemVT.changeTypeToInteger();
  }

  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
                      DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
  } else {
    assert(Ld->getExtensionType() == ISD::EXTLOAD);
  }

  EVT VT = Ld->getValueType(0);
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());

  DCI.AddToWorklist(Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
  DCI.AddToWorklist(Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);

  return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
}
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();

  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();
    MachineMemOperand *MMO = Load->getMemOperand();

    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                   BasePtr, RealMemVT, MMO);

    if (!MemVT.isVector()) {
      SDValue Ops[] = {
        DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
        NewLD.getValue(1)
      };

      return DAG.getMergeValues(Ops, DL);
    }

    SmallVector<SDValue, 3> Elts;
    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
      SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
                                DAG.getConstant(I, DL, MVT::i32));

      Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
    }

    SDValue Ops[] = {
      DAG.getBuildVector(MemVT, DL, Elts),
      NewLD.getValue(1)
    };

    return DAG.getMergeValues(Ops, DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
                          *Load->getMemOperand())) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
    return DAG.getMergeValues(Ops, DL);
  }

  unsigned Alignment = Load->getAlignment();
  unsigned AS = Load->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBug() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
    return SplitVectorLoad(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    AS = MFI->hasFlatScratchInit() ?
         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = MemVT.getVectorNumElements();

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
      if (MemVT.isPow2VectorType())
        return SDValue();
      if (NumElements == 3)
        return WidenVectorLoad(Op, DAG);
      return SplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
  }

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS) {
    if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
        !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
        Alignment >= 4 && NumElements < 32) {
      if (MemVT.isPow2VectorType())
        return SDValue();
      if (NumElements == 3)
        return WidenVectorLoad(Op, DAG);
      return SplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
  }

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS ||
      AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v3 loads not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return WidenVectorLoad(Op, DAG);
    // v3 and v4 loads are supported for private and global memory.
    return SDValue();
  }

  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorLoad(Load, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      // v3 loads not supported on SI.
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        return WidenVectorLoad(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // Use ds_read_b128 if possible.
    if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
        MemVT.getStoreSize() == 16)
      return SDValue();

    if (NumElements > 2)
      return SplitVectorLoad(Op, DAG);

    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
    // address is negative, then the instruction is incorrectly treated as
    // out-of-bounds even if base + offsets is in bounds. Split vectorized
    // loads here to avoid emitting ds_read2_b32. We may re-combine the
    // load later in the SILoadStoreOptimizer.
    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
        NumElements == 2 && MemVT.getStoreSize() == 8 &&
        Load->getAlignment() < 8) {
      return SplitVectorLoad(Op, DAG);
    }
  }
  return SDValue();
}
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.getSizeInBits() == 64);

  SDLoc DL(Op);
  SDValue Cond = Op.getOperand(0);

  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
  SDValue One = DAG.getConstant(1, DL, MVT::i32);

  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));

  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);

  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);

  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);

  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);

  SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}
// Catch division cases where we can use shortcuts with rcp and rsq
// instructions.
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();

  if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
    return SDValue();

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
      if (CLHS->isExactlyValue(1.0)) {
        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
        // use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // 1.0 / sqrt(x) -> rsq(x)

        // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
        // error seems really high at 2^29 ULP.
        if (RHS.getOpcode() == ISD::FSQRT)
          return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));

        // 1.0 / x -> rcp(x)
        return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
      }

      // Same as for 1.0, but expand the sign out of the constant.
      if (CLHS->isExactlyValue(-1.0)) {
        // -1.0 / x -> rcp (fneg x)
        SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
        return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
      }
    }
  }

  if (Unsafe) {
    // Turn into multiply by the reciprocal.
    // x / y -> x * (1.0 / y)
    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
  }

  return SDValue();
}
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, A, B);
  }

  assert(GlueChain->getNumValues() == 3);

  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
  switch (Opcode) {
  default: llvm_unreachable("no chain equivalent for opcode");
  case ISD::FMUL:
    Opcode = AMDGPUISD::FMUL_W_CHAIN;
    break;
  }

  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
                     GlueChain.getValue(2));
}

static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                           EVT VT, SDValue A, SDValue B, SDValue C,
                           SDValue GlueChain) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, A, B, C);
  }

  assert(GlueChain->getNumValues() == 3);

  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
  switch (Opcode) {
  default: llvm_unreachable("no chain equivalent for opcode");
  case ISD::FMA:
    Opcode = AMDGPUISD::FMA_W_CHAIN;
    break;
  }

  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
                     GlueChain.getValue(2));
}
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue Src0 = Op.getOperand(0);
  SDValue Src1 = Op.getOperand(1);

  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);

  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);

  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
  SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
}
// Faster 2.5 ULP division that does not support denormals.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);

  const APFloat K0Val(BitsToFloat(0x6f800000));
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

  const APFloat K1Val(BitsToFloat(0x2f800000));
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  EVT SetCCVT =
    getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);

  // TODO: Should this propagate fast-math-flags?
  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);

  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
}
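// Note on the constants in lowerFDIV_FAST above: 0x6f800000 and 0x2f800000 are
// the IEEE-754 single-precision bit patterns for 2^96 and 2^-32. When |RHS| is
// larger than 2^96, the denominator is pre-scaled by 2^-32 so that rcp stays in
// range, and the final multiply by r3 (the same scale factor) restores the
// correct quotient.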
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
                                          RHS, RHS, LHS);
  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
                                        LHS, RHS, LHS);

  // Denominator is scaled to not be denormal, so using rcp is ok.
  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
                                  DenominatorScaled);
  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
                                     DenominatorScaled);

  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
                               (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                               (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);

  if (!Subtarget->hasFP32Denormals()) {
    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
                                                      SL, MVT::i32);
    SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
                                       DAG.getEntryNode(),
                                       EnableDenormValue, BitField);
    SDValue Ops[3] = {
      NegDivScale0,
      EnableDenorm.getValue(0),
      EnableDenorm.getValue(1)
    };

    NegDivScale0 = DAG.getMergeValues(Ops, SL);
  }

  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
                             ApproxRcp, One, NegDivScale0);

  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
                             ApproxRcp, Fma0);

  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
                           Fma1, Fma1);

  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                             NumeratorScaled, Mul);

  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);

  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                             NumeratorScaled, Fma3);

  if (!Subtarget->hasFP32Denormals()) {
    const SDValue DisableDenormValue =
        DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
    SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
                                        Fma4.getValue(1),
                                        DisableDenormValue,
                                        BitField,
                                        Fma4.getValue(2));

    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                      DisableDenorm, DAG.getRoot());
    DAG.setRoot(OutputChain);
  }

  SDValue Scale = NumeratorScaled.getValue(1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
                             Fma4, Fma1, Fma3, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
}
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getTarget().Options.UnsafeFPMath)
    return lowerFastUnsafeFDIV(Op, DAG);

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);

  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);

  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
                             NegDivScale0, Mul, DivScale1);

  SDValue Scale;

  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

    // Figure out which scale to use for div_fmas.
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getValue(1);
  }

  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
                             Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}
SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  if (VT == MVT::f16)
    return LowerFDIV16(Op, DAG);

  llvm_unreachable("Unexpected type for fdiv");
}
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    return DAG.getTruncStore(Store->getChain(), DL,
       DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
       Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                          *Store->getMemOperand())) {
    return expandUnalignedStore(Store, DAG);
  }

  unsigned AS = Store->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBug() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
    return SplitVectorStore(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    AS = MFI->hasFlatScratchInit() ?
         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = VT.getVectorNumElements();
  if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
      AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorStore(Op, DAG);
    // v3 stores not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return SplitVectorStore(Op, DAG);
    return SDValue();
  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4 || NumElements == 3)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // Use ds_write_b128 if possible.
    if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
        VT.getStoreSize() == 16 && NumElements != 3)
      return SDValue();

    if (NumElements > 2)
      return SplitVectorStore(Op, DAG);

    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
    // address is negative, then the instruction is incorrectly treated as
    // out-of-bounds even if base + offsets is in bounds. Split vectorized
    // stores here to avoid emitting ds_write2_b32. We may re-combine the
    // store later in the SILoadStoreOptimizer.
    if (!Subtarget->hasUsableDSOffset() &&
        NumElements == 2 && VT.getStoreSize() == 8 &&
        Store->getAlignment() < 8) {
      return SplitVectorStore(Op, DAG);
    }

    return SDValue();
  } else {
    llvm_unreachable("unhandled address space");
  }
}
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue TrigVal;

  // TODO: Should this propagate fast-math-flags?

  SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);

  if (Subtarget->hasTrigReducedRange()) {
    SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
    TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
  } else {
    TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
  }

  switch (Op.getOpcode()) {
  case ISD::FCOS:
    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
  case ISD::FSIN:
    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
  default:
    llvm_unreachable("Wrong trig opcode");
  }
}
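// Note on LowerTrig above: the multiply by 0.5 / M_PI is needed because the
// hardware sin/cos operations take their operand in units of full revolutions
// rather than radians; on subtargets with a reduced valid input range the
// fractional part is taken first via AMDGPUISD::FRACT.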
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
  if (!isFlatGlobalAddrSpace(AS))
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  SDValue Old = Op.getOperand(2);
  SDValue New = Op.getOperand(3);
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  MVT VecType = MVT::getVectorVT(SimpleVT, 2);

  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
  SDValue Ops[] = { ChainIn, Addr, NewOld };

  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
                                 Ops, VT, AtomicNode->getMemOperand());
}
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  EVT ScalarVT = VT.getScalarType();
  if (ScalarVT != MVT::f32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
      DCI.AddToWorklist(Cvt.getNode());
      return Cvt;
    }
  }

  return SDValue();
}
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
//
// This is a variant of
// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
//
// The normal DAG combiner will do this, but only if the add has one use since
// that would increase the number of instructions.
//
// This prevents us from seeing a constant offset that can be folded into a
// memory instruction's addressing mode. If we know the resulting add offset of
// a pointer can be folded into an addressing offset, we can replace the pointer
// operand with the add of new constant offset. This eliminates one of the uses,
// and may allow the remaining use to also be simplified.
//
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
                                               unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We only do this to handle cases where it's profitable when there are
  // multiple uses of the add, so defer to the standard combine.
  if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
      N0->hasOneUse())
    return SDValue();

  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
  if (!CN1)
    return SDValue();

  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!CAdd)
    return SDValue();

  // If the resulting offset is too large, we can't fold it into the addressing
  // mode offset.
  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
  Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());

  AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();
  if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
  SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);

  SDNodeFlags Flags;
  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
                          (N0.getOpcode() == ISD::OR ||
                           N0->getFlags().hasNoUnsignedWrap()));

  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue Ptr = N->getBasePtr();
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  // TODO: We could also do this for multiplies.
  if (Ptr.getOpcode() == ISD::SHL) {
    SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                          N->getMemoryVT(), DCI);
    if (NewPtr) {
      SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());

      NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
      return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
    }
  }

  return SDValue();
}
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
         (Opc == ISD::XOR && Val == 0);
}

// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
// integer combine opportunities since most 64-bit operations are decomposed
// this way. TODO: We won't want this for SALU especially if it is an inline
// immediate.
SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI,
    const SDLoc &SL,
    unsigned Opc, SDValue LHS,
    const ConstantSDNode *CRHS) const {
  uint64_t Val = CRHS->getZExtValue();
  uint32_t ValLo = Lo_32(Val);
  uint32_t ValHi = Hi_32(Val);
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
       bitOpWithConstantIsReducible(Opc, ValHi)) ||
      (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
    // If we need to materialize a 64-bit immediate, it will be split up later
    // anyway. Avoid creating the harder to understand 64-bit immediate
    // materialization.
    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
  }

  return SDValue();
}
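// Example for splitBinaryBitConstantOp above: (and x, 0xffffffff00000000)
// splits into an AND of the low half with 0 (which folds to 0) and an AND of
// the high half with 0xffffffff (which folds to the high half itself), so no
// 64-bit immediate has to be materialized.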
// Returns true if argument is a boolean value which is not serialized into
// memory or argument and does not require v_cndmask_b32 to be deserialized.
static bool isBoolSGPR(SDValue V) {
  if (V.getValueType() != MVT::i1)
    return false;
  switch (V.getOpcode()) {
  default: break;
  case ISD::SETCC:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case AMDGPUISD::FP_CLASS:
    return true;
  }
  return false;
}

// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  // 0xff for any zero byte in the mask
  uint32_t ZeroByteMask = 0;
  if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
  if ((NonZeroByteMask & C) != NonZeroByteMask)
    return 0; // Partial bytes selected.
  return C;
}

// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns select mask as in the v_perm_b32
// or -1 if not succeeded.
// Note byte select encoding:
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)
    return ~0;

  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!N1)
    return ~0;

  uint32_t C = N1->getZExtValue();

  switch (V.getOpcode()) {
  default:
    break;
  case ISD::AND:
    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    }
    break;

  case ISD::OR:
    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
      return (0x03020100 & ~ConstMask) | ConstMask;
    }
    break;

  case ISD::SHL:
    if (C % 8)
      return ~0;

    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

  case ISD::SRL:
    if (C % 8)
      return ~0;

    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }

  return ~0;
}
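// Example for getPermuteMask above: (and x, 0x00ff0000) keeps only byte 2 of
// x, so the returned select mask is 0x0c020c0c -- byte 2 selects source byte 2
// (0x02) and every other byte selects zero (0x0c).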
8084 SDValue
SITargetLowering::performAndCombine(SDNode
*N
,
8085 DAGCombinerInfo
&DCI
) const {
8086 if (DCI
.isBeforeLegalize())
8089 SelectionDAG
&DAG
= DCI
.DAG
;
8090 EVT VT
= N
->getValueType(0);
8091 SDValue LHS
= N
->getOperand(0);
8092 SDValue RHS
= N
->getOperand(1);
8095 const ConstantSDNode
*CRHS
= dyn_cast
<ConstantSDNode
>(RHS
);
8096 if (VT
== MVT::i64
&& CRHS
) {
8098 = splitBinaryBitConstantOp(DCI
, SDLoc(N
), ISD::AND
, LHS
, CRHS
))
8102 if (CRHS
&& VT
== MVT::i32
) {
8103 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
8104 // nb = number of trailing zeroes in mask
8105 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
8106 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
8107 uint64_t Mask
= CRHS
->getZExtValue();
8108 unsigned Bits
= countPopulation(Mask
);
8109 if (getSubtarget()->hasSDWA() && LHS
->getOpcode() == ISD::SRL
&&
8110 (Bits
== 8 || Bits
== 16) && isShiftedMask_64(Mask
) && !(Mask
& 1)) {
8111 if (auto *CShift
= dyn_cast
<ConstantSDNode
>(LHS
->getOperand(1))) {
8112 unsigned Shift
= CShift
->getZExtValue();
8113 unsigned NB
= CRHS
->getAPIntValue().countTrailingZeros();
8114 unsigned Offset
= NB
+ Shift
;
8115 if ((Offset
& (Bits
- 1)) == 0) { // Starts at a byte or word boundary.
8117 SDValue BFE
= DAG
.getNode(AMDGPUISD::BFE_U32
, SL
, MVT::i32
,
8119 DAG
.getConstant(Offset
, SL
, MVT::i32
),
8120 DAG
.getConstant(Bits
, SL
, MVT::i32
));
8121 EVT NarrowVT
= EVT::getIntegerVT(*DAG
.getContext(), Bits
);
8122 SDValue Ext
= DAG
.getNode(ISD::AssertZext
, SL
, VT
, BFE
,
8123 DAG
.getValueType(NarrowVT
));
8124 SDValue Shl
= DAG
.getNode(ISD::SHL
, SDLoc(LHS
), VT
, Ext
,
8125 DAG
.getConstant(NB
, SDLoc(CRHS
), MVT::i32
));
8131 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
8132 if (LHS
.hasOneUse() && LHS
.getOpcode() == AMDGPUISD::PERM
&&
8133 isa
<ConstantSDNode
>(LHS
.getOperand(2))) {
8134 uint32_t Sel
= getConstantPermuteMask(Mask
);
8138 // Select 0xc for all zero bytes
8139 Sel
= (LHS
.getConstantOperandVal(2) & Sel
) | (~Sel
& 0x0c0c0c0c);
8141 return DAG
.getNode(AMDGPUISD::PERM
, DL
, MVT::i32
, LHS
.getOperand(0),
8142 LHS
.getOperand(1), DAG
.getConstant(Sel
, DL
, MVT::i32
));
  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();

    SDValue X = LHS.getOperand(0);
    SDValue Y = RHS.getOperand(0);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
      return SDValue();

    if (LCC == ISD::SETO) {
      if (X != LHS.getOperand(1))
        return SDValue();

      if (RCC == ISD::SETUNE) {
        const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        const uint32_t Mask = SIInstrFlags::N_NORMAL |
                              SIInstrFlags::N_SUBNORMAL |
                              SIInstrFlags::N_ZERO |
                              SIInstrFlags::P_ZERO |
                              SIInstrFlags::P_SUBNORMAL |
                              SIInstrFlags::P_NORMAL;

        static_assert(((~(SIInstrFlags::S_NAN |
                          SIInstrFlags::Q_NAN |
                          SIInstrFlags::N_INFINITY |
                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
                      "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                           X, DAG.getConstant(Mask, DL, MVT::i32));
      }
    }
  }

  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
    std::swap(LHS, RHS);

  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      RHS.hasOneUse()) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
    // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
    const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
        (RHS.getOperand(0) == LHS.getOperand(0) &&
         LHS.getOperand(0) == LHS.getOperand(1))) {
      const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
      unsigned NewMask = LCC == ISD::SETO ?
        Mask->getZExtValue() & ~OrdMask :
        Mask->getZExtValue() & OrdMask;

      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
                         DAG.getConstant(NewMask, DL, MVT::i32));
    }
  }

  if (VT == MVT::i32 &&
      (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    // and x, (sext cc from i1) => select cc, x, 0
    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
      std::swap(LHS, RHS);
    if (isBoolSGPR(RHS.getOperand(0)))
      return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
                           LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
  }

  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
    uint32_t LHSMask = getPermuteMask(DAG, LHS);
    uint32_t RHSMask = getPermuteMask(DAG, RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from the source operand. A zero byte has
      // a 0xc mask, an all-ones byte has 0xff, and actual lanes are in the
      // 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select the high and low word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte in each mask is either a selector in the 0-3 range, or has
        // higher bits set in either of the masks: 0xff for an all-ones byte or
        // 0x0c for a zero byte. If 0x0c is in either mask it shall always be
        // 0x0c. Otherwise the mask which is not 0xff wins. By anding both masks
        // we have a correct result except that 0x0c shall be corrected to give
        // 0x0c only.
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }

        // Add 4 to each active LHS lane. It will not affect any existing 0xff
        // or 0x0c.
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
                           LHS.getOperand(0), RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  return SDValue();
}

SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {
    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
      SDValue Src = LHS.getOperand(0);
      if (Src != RHS.getOperand(0))
        return SDValue();

      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
      if (!CLHS || !CRHS)
        return SDValue();

      // Only 10 bits are used.
      static const uint32_t MaxMask = 0x3ff;

      uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                         Src, DAG.getConstant(NewMask, DL, MVT::i32));
    }

    return SDValue();
  }

  // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
  if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
      LHS.getOpcode() == AMDGPUISD::PERM &&
      isa<ConstantSDNode>(LHS.getOperand(2))) {
    uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
    if (!Sel)
      return SDValue();

    Sel |= LHS.getConstantOperandVal(2);
    SDLoc DL(N);
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                       LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
  }

  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
    uint32_t LHSMask = getPermuteMask(DAG, LHS);
    uint32_t RHSMask = getPermuteMask(DAG, RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from the source operand. A zero byte has
      // a 0xc mask, an all-ones byte has 0xff, and actual lanes are in the
      // 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select the high and low word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Kill zero bytes selected by the other mask. The zero value is 0xc.
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        // Add 4 to each active LHS lane
        LHSMask |= LHSUsedLanes & 0x04040404;

        uint32_t Sel = LHSMask | RHSMask;
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
                           LHS.getOperand(0), RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  if (VT != MVT::i64)
    return SDValue();

  // TODO: This could be a generic combine with a predicate for extracting the
  // high half of an integer being free.

  // (or i64:x, (zero_extend i32:y)) ->
  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
      RHS.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(LHS, RHS);

  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue ExtSrc = RHS.getOperand(0);
    EVT SrcVT = ExtSrc.getValueType();
    if (SrcVT == MVT::i32) {
      SDLoc SL(N);
      SDValue LowLHS, HiBits;
      std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);

      DCI.AddToWorklist(LowOr.getNode());
      DCI.AddToWorklist(HiBits.getNode());

      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                LowOr, HiBits);
      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    }
  }

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (CRHS) {
    if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
      return Split;
  }

  return SDValue();
}

SDValue SITargetLowering::performXorCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (CRHS) {
    if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
      return Split;
  }

  return SDValue();
}

// Instructions that will be lowered with a final instruction that zeros the
// high result bits.
// XXX - probably only need to list legal operations.
static bool fp16SrcZerosHighBits(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FCANONICALIZE:
  case ISD::FP_ROUND:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LDEXP:
    return true;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}

SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  if (!Subtarget->has16BitInsts() ||
      DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i32)
    return SDValue();

  SDValue Src = N->getOperand(0);
  if (Src.getValueType() != MVT::i16)
    return SDValue();

  // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
  // FIXME: It is not universally true that the high bits are zeroed on gfx9.
  if (Src.getOpcode() == ISD::BITCAST) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::f16 &&
        fp16SrcZerosHighBits(BCSrc.getOpcode()))
      return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
  }

  return SDValue();
}

SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                        DAGCombinerInfo &DCI)
                                                        const {
  SDValue Src = N->getOperand(0);
  auto *VTSign = cast<VTSDNode>(N->getOperand(1));

  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
      Src.hasOneUse()) {
    auto *M = cast<MemSDNode>(Src);
    SDValue Ops[] = {
      Src.getOperand(0), // Chain
      Src.getOperand(1), // rsrc
      Src.getOperand(2), // vindex
      Src.getOperand(3), // voffset
      Src.getOperand(4), // soffset
      Src.getOperand(5), // offset
      Src.getOperand(6), // cachepolicy
      Src.getOperand(7)  // idxen
    };
    // replace with BUFFER_LOAD_BYTE/SHORT
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
                                         Src.getOperand(0).getValueType());
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
                   AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
                                                            ResList,
                                                            Ops, M->getMemoryVT(),
                                                            M->getMemOperand());
    return DCI.DAG.getMergeValues({BufferLoadSignExt,
                                   BufferLoadSignExt.getValue(1)}, SDLoc(N));
  }
  return SDValue();
}

SDValue SITargetLowering::performClassCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Mask = N->getOperand(1);

  // fp_class x, 0 -> false
  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
    if (CMask->isNullValue())
      return DAG.getConstant(0, SDLoc(N), MVT::i1);
  }

  if (N->getOperand(0).isUndef())
    return DAG.getUNDEF(MVT::i1);

  return SDValue();
}

SDValue SITargetLowering::performRcpCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);

  if (N0.isUndef())
    return N0;

  if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
                         N0.getOpcode() == ISD::SINT_TO_FP)) {
    return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
                           N->getFlags());
  }

  return AMDGPUTargetLowering::performRcpCombine(N, DCI);
}

bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
                                       unsigned MaxDepth) const {
  unsigned Opcode = Op.getOpcode();
  if (Opcode == ISD::FCANONICALIZE)
    return true;

  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    auto F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
      return false;
    return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
  }

  // If the source is the result of another standard FP operation it is already
  // in canonical form.
  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // These will flush denorms if required.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FCEIL:
  case ISD::FFLOOR:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FSQRT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FP_ROUND:
  case ISD::FP_EXTEND:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::TRIG_PREOP:
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::LDEXP:
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return true;

  // It can/will be lowered or combined as a bit operation.
  // Need to check their input recursively to handle.
  case ISD::FNEG:
  case ISD::FABS:
  case ISD::FCOPYSIGN:
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);

  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FSINCOS:
    return Op.getValueType().getScalarType() != MVT::f16;

  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMIN3: {
    // FIXME: Shouldn't treat the generic operations differently based on
    // these. However, we aren't really required to flush the result from
    // minnum/maxnum.

    // snans will be quieted, so we only need to worry about denormals.
    if (Subtarget->supportsMinMaxDenormModes() ||
        denormalsEnabledForType(Op.getValueType()))
      return true;

    // Flushing may be required.
    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
    // targets we need to check their inputs recursively.

    // FIXME: Does this apply with clamp? It's implemented with max.
    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::SELECT: {
    return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
  }
  case ISD::BUILD_VECTOR: {
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      SDValue SrcOp = Op.getOperand(i);
      if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::EXTRACT_VECTOR_ELT:
  case ISD::EXTRACT_SUBVECTOR: {
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
  }
  case ISD::INSERT_VECTOR_ELT: {
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
  }
  case ISD::UNDEF:
    // Could be anything.
    return false;

  case ISD::BITCAST: {
    // Hack around the mess we make when legalizing extract_vector_elt
    SDValue Src = Op.getOperand(0);
    if (Src.getValueType() == MVT::i16 &&
        Src.getOpcode() == ISD::TRUNCATE) {
      SDValue TruncSrc = Src.getOperand(0);
      if (TruncSrc.getValueType() == MVT::i32 &&
          TruncSrc.getOpcode() == ISD::BITCAST &&
          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
      }
    }

    return false;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID
      = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
      return true;
    default:
      break;
    }

    LLVM_FALLTHROUGH;
  }
  default:
    return denormalsEnabledForType(Op.getValueType()) &&
           DAG.isKnownNeverSNaN(Op);
  }

  llvm_unreachable("invalid operation");
}

// Constant fold canonicalize.
SDValue SITargetLowering::getCanonicalConstantFP(
  SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
  // Flush denormals to 0 if not enabled.
  if (C.isDenormal() && !denormalsEnabledForType(VT))
    return DAG.getConstantFP(0.0, SL, VT);

  if (C.isNaN()) {
    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
    if (C.isSignaling()) {
      // Quiet a signaling NaN.
      // FIXME: Is this supposed to preserve payload bits?
      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
    }

    // Make sure it is the canonical NaN bitpattern.
    //
    // TODO: Can we use -1 as the canonical NaN value since it's an inline
    // immediate?
    if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
  }

  // Already canonical.
  return DAG.getConstantFP(C, SL, VT);
}

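// Illustrative behavior of the helper above (a sketch, not an exhaustive
// spec): with f32 denormals disabled, a denormal constant folds to +0.0, a
// signaling NaN folds to the canonical quiet NaN, and an ordinary value such
// as 1.5 is returned unchanged.
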
static bool vectorEltWillFoldAway(SDValue Op) {
  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
}

SDValue SITargetLowering::performFCanonicalizeCombine(
  SDNode *N,
  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fcanonicalize undef -> qnan
  if (N0.isUndef()) {
    APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
    return DAG.getConstantFP(QNaN, SDLoc(N), VT);
  }

  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
  }

  // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
  //                                                   (fcanonicalize k)
  //
  // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
  //
  // TODO: This could be better with wider vectors that will be split to v2f16,
  // and to consider uses since there aren't that many packed operations.
  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
      isTypeLegal(MVT::v2f16)) {
    SDLoc SL(N);
    SDValue NewElts[2];
    SDValue Lo = N0.getOperand(0);
    SDValue Hi = N0.getOperand(1);
    EVT EltVT = Lo.getValueType();

    if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
      for (unsigned I = 0; I != 2; ++I) {
        SDValue Op = N0.getOperand(I);
        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
          NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
                                              CFP->getValueAPF());
        } else if (Op.isUndef()) {
          // Handled below based on what the other operand is.
          NewElts[I] = Op;
        } else {
          NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
        }
      }

      // If one half is undef, and one is constant, prefer a splat vector
      // rather than the normal qNaN. If it's a register, prefer 0.0 since
      // that's cheaper to use and may be free with a packed operation.
      if (NewElts[0].isUndef()) {
        if (isa<ConstantFPSDNode>(NewElts[1]))
          NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
            NewElts[1] : DAG.getConstantFP(0.0f, SL, EltVT);
      }

      if (NewElts[1].isUndef()) {
        NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
          NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
      }

      return DAG.getBuildVector(VT, SL, NewElts);
    }
  }

  unsigned SrcOpc = N0.getOpcode();

  // If it's free to do so, push canonicalizes further up the source, which may
  // find a canonical source.
  //
  // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
  // sNaNs.
  if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
    auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
    if (CRHS && N0.hasOneUse()) {
      SDLoc SL(N);
      SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
                                   N0.getOperand(0));
      SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
      DCI.AddToWorklist(Canon0.getNode());

      return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
    }
  }

  return isCanonicalized(DAG, N0) ? N0 : SDValue();
}

static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
  switch (Opc) {
  case ISD::FMAXNUM:
  case ISD::FMAXNUM_IEEE:
    return AMDGPUISD::FMAX3;
  case ISD::SMAX:
    return AMDGPUISD::SMAX3;
  case ISD::UMAX:
    return AMDGPUISD::UMAX3;
  case ISD::FMINNUM:
  case ISD::FMINNUM_IEEE:
    return AMDGPUISD::FMIN3;
  case ISD::SMIN:
    return AMDGPUISD::SMIN3;
  case ISD::UMIN:
    return AMDGPUISD::UMIN3;
  default:
    llvm_unreachable("Not a min/max opcode");
  }
}

SDValue SITargetLowering::performIntMed3ImmCombine(
  SelectionDAG &DAG, const SDLoc &SL,
  SDValue Op0, SDValue Op1, bool Signed) const {
  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
  if (!K1)
    return SDValue();

  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  if (Signed) {
    if (K0->getAPIntValue().sge(K1->getAPIntValue()))
      return SDValue();
  } else {
    if (K0->getAPIntValue().uge(K1->getAPIntValue()))
      return SDValue();
  }

  EVT VT = K0->getValueType(0);
  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
    return DAG.getNode(Med3Opc, SL, VT,
                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
  }

  // If there isn't a 16-bit med3 operation, convert to 32-bit.
  MVT NVT = MVT::i32;
  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

  SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
  SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
  SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);

  SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
}

static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
    return C;

  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
    if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
      return C;
  }

  return nullptr;
}

SITargetLowering::performFPMed3ImmCombine(SelectionDAG
&DAG
,
8920 SDValue Op1
) const {
8921 ConstantFPSDNode
*K1
= getSplatConstantFP(Op1
);
8925 ConstantFPSDNode
*K0
= getSplatConstantFP(Op0
.getOperand(1));
8929 // Ordered >= (although NaN inputs should have folded away by now).
8930 APFloat::cmpResult Cmp
= K0
->getValueAPF().compare(K1
->getValueAPF());
8931 if (Cmp
== APFloat::cmpGreaterThan
)
8934 const MachineFunction
&MF
= DAG
.getMachineFunction();
8935 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
8937 // TODO: Check IEEE bit enabled?
8938 EVT VT
= Op0
.getValueType();
8939 if (Info
->getMode().DX10Clamp
) {
8940 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
8941 // hardware fmed3 behavior converting to a min.
8942 // FIXME: Should this be allowing -0.0?
8943 if (K1
->isExactlyValue(1.0) && K0
->isExactlyValue(0.0))
8944 return DAG
.getNode(AMDGPUISD::CLAMP
, SL
, VT
, Op0
.getOperand(0));
8947 // med3 for f16 is only available on gfx9+, and not available for v2f16.
8948 if (VT
== MVT::f32
|| (VT
== MVT::f16
&& Subtarget
->hasMed3_16())) {
8949 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
8950 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
8951 // then give the other result, which is different from med3 with a NaN
8953 SDValue Var
= Op0
.getOperand(0);
8954 if (!DAG
.isKnownNeverSNaN(Var
))
8957 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
8959 if ((!K0
->hasOneUse() ||
8960 TII
->isInlineConstant(K0
->getValueAPF().bitcastToAPInt())) &&
8961 (!K1
->hasOneUse() ||
8962 TII
->isInlineConstant(K1
->getValueAPF().bitcastToAPInt()))) {
8963 return DAG
.getNode(AMDGPUISD::FMED3
, SL
, K0
->getValueType(0),
8964 Var
, SDValue(K0
, 0), SDValue(K1
, 0));
8971 SDValue
SITargetLowering::performMinMaxCombine(SDNode
*N
,
8972 DAGCombinerInfo
&DCI
) const {
8973 SelectionDAG
&DAG
= DCI
.DAG
;
8975 EVT VT
= N
->getValueType(0);
8976 unsigned Opc
= N
->getOpcode();
8977 SDValue Op0
= N
->getOperand(0);
8978 SDValue Op1
= N
->getOperand(1);
8980 // Only do this if the inner op has one use since this will just increases
8981 // register pressure for no benefit.
8983 if (Opc
!= AMDGPUISD::FMIN_LEGACY
&& Opc
!= AMDGPUISD::FMAX_LEGACY
&&
8985 (VT
== MVT::i32
|| VT
== MVT::f32
||
8986 ((VT
== MVT::f16
|| VT
== MVT::i16
) && Subtarget
->hasMin3Max3_16()))) {
8987 // max(max(a, b), c) -> max3(a, b, c)
8988 // min(min(a, b), c) -> min3(a, b, c)
8989 if (Op0
.getOpcode() == Opc
&& Op0
.hasOneUse()) {
8991 return DAG
.getNode(minMaxOpcToMin3Max3Opc(Opc
),
9000 // max(a, max(b, c)) -> max3(a, b, c)
9001 // min(a, min(b, c)) -> min3(a, b, c)
9002 if (Op1
.getOpcode() == Opc
&& Op1
.hasOneUse()) {
9004 return DAG
.getNode(minMaxOpcToMin3Max3Opc(Opc
),
9013 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
9014 if (Opc
== ISD::SMIN
&& Op0
.getOpcode() == ISD::SMAX
&& Op0
.hasOneUse()) {
9015 if (SDValue Med3
= performIntMed3ImmCombine(DAG
, SDLoc(N
), Op0
, Op1
, true))
9019 if (Opc
== ISD::UMIN
&& Op0
.getOpcode() == ISD::UMAX
&& Op0
.hasOneUse()) {
9020 if (SDValue Med3
= performIntMed3ImmCombine(DAG
, SDLoc(N
), Op0
, Op1
, false))
9024 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
9025 if (((Opc
== ISD::FMINNUM
&& Op0
.getOpcode() == ISD::FMAXNUM
) ||
9026 (Opc
== ISD::FMINNUM_IEEE
&& Op0
.getOpcode() == ISD::FMAXNUM_IEEE
) ||
9027 (Opc
== AMDGPUISD::FMIN_LEGACY
&&
9028 Op0
.getOpcode() == AMDGPUISD::FMAX_LEGACY
)) &&
9029 (VT
== MVT::f32
|| VT
== MVT::f64
||
9030 (VT
== MVT::f16
&& Subtarget
->has16BitInsts()) ||
9031 (VT
== MVT::v2f16
&& Subtarget
->hasVOP3PInsts())) &&
9033 if (SDValue Res
= performFPMed3ImmCombine(DAG
, SDLoc(N
), Op0
, Op1
))
static bool isClampZeroToOne(SDValue A, SDValue B) {
  if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
    if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
      // FIXME: Should this be allowing -0.0?
      return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
             (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
    }
  }

  return false;
}

9053 SDValue
SITargetLowering::performFMed3Combine(SDNode
*N
,
9054 DAGCombinerInfo
&DCI
) const {
9055 EVT VT
= N
->getValueType(0);
9056 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
9057 // NaNs. With a NaN input, the order of the operands may change the result.
9059 SelectionDAG
&DAG
= DCI
.DAG
;
9062 SDValue Src0
= N
->getOperand(0);
9063 SDValue Src1
= N
->getOperand(1);
9064 SDValue Src2
= N
->getOperand(2);
9066 if (isClampZeroToOne(Src0
, Src1
)) {
9067 // const_a, const_b, x -> clamp is safe in all cases including signaling
9069 // FIXME: Should this be allowing -0.0?
9070 return DAG
.getNode(AMDGPUISD::CLAMP
, SL
, VT
, Src2
);
9073 const MachineFunction
&MF
= DAG
.getMachineFunction();
9074 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
9076 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
9077 // handling no dx10-clamp?
9078 if (Info
->getMode().DX10Clamp
) {
9079 // If NaNs is clamped to 0, we are free to reorder the inputs.
9081 if (isa
<ConstantFPSDNode
>(Src0
) && !isa
<ConstantFPSDNode
>(Src1
))
9082 std::swap(Src0
, Src1
);
9084 if (isa
<ConstantFPSDNode
>(Src1
) && !isa
<ConstantFPSDNode
>(Src2
))
9085 std::swap(Src1
, Src2
);
9087 if (isa
<ConstantFPSDNode
>(Src0
) && !isa
<ConstantFPSDNode
>(Src1
))
9088 std::swap(Src0
, Src1
);
9090 if (isClampZeroToOne(Src1
, Src2
))
9091 return DAG
.getNode(AMDGPUISD::CLAMP
, SL
, VT
, Src0
);
SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  if (Src0.isUndef() && Src1.isUndef())
    return DCI.DAG.getUNDEF(N->getValueType(0));

  return SDValue();
}

SDValue SITargetLowering::performExtractVectorEltCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;

  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();

  if ((Vec.getOpcode() == ISD::FNEG ||
       Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
    SDLoc SL(N);
    EVT EltVT = N->getValueType(0);
    SDValue Idx = N->getOperand(1);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                              Vec.getOperand(0), Idx);
    return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
  }

  // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
  //    =>
  // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
  // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
  // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
  if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
    SDLoc SL(N);
    EVT EltVT = N->getValueType(0);
    SDValue Idx = N->getOperand(1);
    unsigned Opc = Vec.getOpcode();

    switch(Opc) {
    default:
      break;
      // TODO: Support other binary operations.
    case ISD::FADD:
    case ISD::FSUB:
    case ISD::FMUL:
    case ISD::ADD:
    case ISD::UMIN:
    case ISD::UMAX:
    case ISD::SMIN:
    case ISD::SMAX:
    case ISD::FMAXNUM:
    case ISD::FMINNUM:
    case ISD::FMAXNUM_IEEE:
    case ISD::FMINNUM_IEEE: {
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec.getOperand(0), Idx);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec.getOperand(1), Idx);

      DCI.AddToWorklist(Elt0.getNode());
      DCI.AddToWorklist(Elt1.getNode());
      return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
    }
    }
  }

  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();

  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
  // This eliminates the non-constant index and the subsequent movrel or
  // scratch access. Sub-dword vectors of size 2 dwords or less have a better
  // implementation. Vectors of size bigger than 8 dwords would yield too many
  // v_cndmask_b32 instructions.
  if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
      !isa<ConstantSDNode>(N->getOperand(1))) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    EVT IdxVT = Idx.getValueType();
    SDValue V;
    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
      SDValue IC = DAG.getConstant(I, SL, IdxVT);
      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
      if (I == 0)
        V = Elt;
      else
        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
    }

    return V;
  }

  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
  // elements. This exposes more load reduction opportunities by replacing
  // multiple small extract_vector_elements with a single 32-bit extract.
  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (isa<MemSDNode>(Vec) &&
      EltSize <= 16 &&
      EltVT.isByteSized() &&
      VecSize > 32 &&
      VecSize % 32 == 0 &&
      Idx) {
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);

    unsigned BitIndex = Idx->getZExtValue() * EltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    SDLoc SL(N);

    SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
    DCI.AddToWorklist(Cast.getNode());

    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
                              DAG.getConstant(EltIdx, SL, MVT::i32));
    DCI.AddToWorklist(Elt.getNode());
    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
    DCI.AddToWorklist(Srl.getNode());

    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
    DCI.AddToWorklist(Trunc.getNode());
    return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
  }

  return SDValue();
}

SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SDValue Idx = N->getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();

  // INSERT_VECTOR_ELT (<n x e>, var-idx)
  // => BUILD_VECTOR n x select (e, const-idx)
  // This eliminates the non-constant index and the subsequent movrel or
  // scratch access. Sub-dword vectors of size 2 dwords or less have a better
  // implementation. Vectors of size bigger than 8 dwords would yield too many
  // v_cndmask_b32 instructions.
  if (isa<ConstantSDNode>(Idx) ||
      VecSize > 256 || (VecSize <= 64 && EltSize < 32))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  SDValue Ins = N->getOperand(1);
  EVT IdxVT = Idx.getValueType();

  SmallVector<SDValue, 16> Ops;
  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
    SDValue IC = DAG.getConstant(I, SL, IdxVT);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
    SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
    Ops.push_back(V);
  }

  return DAG.getBuildVector(VecVT, SL, Ops);
}

unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N0,
                                          const SDNode *N1) const {
  EVT VT = N0->getValueType(0);

  // Only do this if we are not trying to support denormals. v_mad_f32 does not
  // support denormals ever.
  if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
       (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
        getSubtarget()->hasMadF16())) &&
       isOperationLegal(ISD::FMAD, VT))
    return ISD::FMAD;

  const TargetOptions &Options = DAG.getTarget().Options;
  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
       (N0->getFlags().hasAllowContract() &&
        N1->getFlags().hasAllowContract())) &&
      isFMAFasterThanFMulAndFAdd(VT)) {
    return ISD::FMA;
  }

  return 0;
}

// For a reassociatable opcode perform:
// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
                                               SelectionDAG &DAG) const {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  if (!(Op0->isDivergent() ^ Op1->isDivergent()))
    return SDValue();

  if (Op0->isDivergent())
    std::swap(Op0, Op1);

  if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
    return SDValue();

  SDValue Op2 = Op1.getOperand(1);
  Op1 = Op1.getOperand(0);
  if (!(Op1->isDivergent() ^ Op2->isDivergent()))
    return SDValue();

  if (Op1->isDivergent())
    std::swap(Op1, Op2);

  // If either operand is constant this will conflict with
  // DAGCombiner::ReassociateOps().
  if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
      DAG.isConstantIntBuildVectorOrConstantInt(Op1))
    return SDValue();

  SDLoc SL(N);
  SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
  return DAG.getNode(Opc, SL, VT, Add1, Op2);
}

static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
                           EVT VT,
                           SDValue N0, SDValue N1, SDValue N2,
                           bool Signed) {
  unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
  SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}

SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
      && Subtarget->hasMad64_32() &&
      !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
      VT.getScalarSizeInBits() <= 64) {
    if (LHS.getOpcode() != ISD::MUL)
      std::swap(LHS, RHS);

    SDValue MulLHS = LHS.getOperand(0);
    SDValue MulRHS = LHS.getOperand(1);
    SDValue AddRHS = RHS;

    // TODO: Maybe restrict if SGPR inputs.
    if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
        numBitsUnsigned(MulRHS, DAG) <= 32) {
      MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
      MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
      AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
      return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
    }

    if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
      MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
      MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
      AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
      return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
    }

    return SDValue();
  }

  if (SDValue V = reassociateScalarOps(N, DAG)) {
    return V;
  }

  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
    return SDValue();

  // add x, zext (setcc) => addcarry x, 0, setcc
  // add x, sext (setcc) => subcarry x, 0, setcc
  unsigned Opc = LHS.getOpcode();
  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
      Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
    std::swap(RHS, LHS);

  Opc = RHS.getOpcode();
  switch (Opc) {
  default: break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(0);
    if (!isBoolSGPR(Cond))
      break;
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
    return DAG.getNode(Opc, SL, VTList, Args);
  }
  case ISD::ADDCARRY: {
    // add x, (addcarry y, 0, cc) => addcarry x, y, cc
    auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if (!C || C->getZExtValue() != 0) break;
    SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
    return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
  }
  }
  return SDValue();
}

SDValue SITargetLowering::performSubCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (VT != MVT::i32)
    return SDValue();

  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (LHS.getOpcode() == ISD::SUBCARRY) {
    // sub (subcarry x, 0, cc), y => subcarry x, y, cc
    auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    if (!C || !C->isNullValue())
      return SDValue();
    SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
    return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
  }
  return SDValue();
}

SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
                                                         DAGCombinerInfo &DCI) const {

  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C || C->getZExtValue() != 0)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);

  // addcarry (add x, y), 0, cc => addcarry x, y, cc
  // subcarry (sub x, y), 0, cc => subcarry x, y, cc
  unsigned LHSOpc = LHS.getOpcode();
  unsigned Opc = N->getOpcode();
  if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
      (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
    SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
  }
  return SDValue();
}

SDValue SITargetLowering::performFAddCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // These should really be instruction patterns, but writing patterns with
  // source modifiers is a pain.

  // fadd (fadd (a, a), b) -> mad 2.0, a, b
  if (LHS.getOpcode() == ISD::FADD) {
    SDValue A = LHS.getOperand(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
      }
    }
  }

  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
  if (RHS.getOpcode() == ISD::FADD) {
    SDValue A = RHS.getOperand(0);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
      }
    }
  }

  return SDValue();
}

SDValue SITargetLowering::performFSubCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);
  assert(!VT.isVector());

  // Try to get the fneg to fold into the source modifier. This undoes generic
  // DAG combines and folds them into the mad.
  //
  // Only do this if we are not trying to support denormals. v_mad_f32 does
  // not support denormals ever.
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  if (LHS.getOpcode() == ISD::FADD) {
    // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
    SDValue A = LHS.getOperand(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

        return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
      }
    }
  }

  if (RHS.getOpcode() == ISD::FADD) {
    // (fsub c, (fadd a, a)) -> mad -2.0, a, c

    SDValue A = RHS.getOperand(0);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
      }
    }
  }

  return SDValue();
}

SDValue SITargetLowering::performFMACombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);

  if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
    return SDValue();

  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
  //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
  SDValue Op1 = N->getOperand(0);
  SDValue Op2 = N->getOperand(1);
  SDValue FMA = N->getOperand(2);

  if (FMA.getOpcode() != ISD::FMA ||
      Op1.getOpcode() != ISD::FP_EXTEND ||
      Op2.getOpcode() != ISD::FP_EXTEND)
    return SDValue();

  // fdot2_f32_f16 always flushes fp32 denormal operands and output to zero,
  // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
  // is sufficient to allow generating fdot2.
  const TargetOptions &Options = DAG.getTarget().Options;
  if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
      (N->getFlags().hasAllowContract() &&
       FMA->getFlags().hasAllowContract())) {
    Op1 = Op1.getOperand(0);
    Op2 = Op2.getOperand(0);
    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec1 = Op1.getOperand(0);
    SDValue Idx1 = Op1.getOperand(1);
    SDValue Vec2 = Op2.getOperand(0);

    SDValue FMAOp1 = FMA.getOperand(0);
    SDValue FMAOp2 = FMA.getOperand(1);
    SDValue FMAAcc = FMA.getOperand(2);

    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
        FMAOp2.getOpcode() != ISD::FP_EXTEND)
      return SDValue();

    FMAOp1 = FMAOp1.getOperand(0);
    FMAOp2 = FMAOp2.getOperand(0);
    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec3 = FMAOp1.getOperand(0);
    SDValue Vec4 = FMAOp2.getOperand(0);
    SDValue Idx2 = FMAOp1.getOperand(1);

    if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
        // Idx1 and Idx2 cannot be the same.
        Idx1 == Idx2)
      return SDValue();

    if (Vec1 == Vec2 || Vec3 == Vec4)
      return SDValue();

    if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
      return SDValue();

    if ((Vec1 == Vec3 && Vec2 == Vec4) ||
        (Vec1 == Vec4 && Vec2 == Vec3)) {
      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
                         DAG.getTargetConstant(0, SL, MVT::i1));
    }
  }
  return SDValue();
}

SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = LHS.getValueType();
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();

  auto CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (!CRHS) {
    CRHS = dyn_cast<ConstantSDNode>(LHS);
    if (CRHS) {
      std::swap(LHS, RHS);
      CC = getSetCCSwappedOperands(CC);
    }
  }

  if (CRHS) {
    if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
        isBoolSGPR(LHS.getOperand(0))) {
      // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
      // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
      // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
      // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
      if ((CRHS->isAllOnesValue() &&
           (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
          (CRHS->isNullValue() &&
           (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                           DAG.getConstant(-1, SL, MVT::i1));
      if ((CRHS->isAllOnesValue() &&
           (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
          (CRHS->isNullValue() &&
           (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
        return LHS.getOperand(0);
    }

    uint64_t CRHSVal = CRHS->getZExtValue();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        LHS.getOpcode() == ISD::SELECT &&
        isa<ConstantSDNode>(LHS.getOperand(1)) &&
        isa<ConstantSDNode>(LHS.getOperand(2)) &&
        LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
        isBoolSGPR(LHS.getOperand(0))) {
      // Given CT != CF:
      // setcc (select cc, CT, CF), CF, eq => xor cc, -1
      // setcc (select cc, CT, CF), CF, ne => cc
      // setcc (select cc, CT, CF), CT, ne => xor cc, -1
      // setcc (select cc, CT, CF), CT, eq => cc
      uint64_t CT = LHS.getConstantOperandVal(1);
      uint64_t CF = LHS.getConstantOperandVal(2);

      if ((CF == CRHSVal && CC == ISD::SETEQ) ||
          (CT == CRHSVal && CC == ISD::SETNE))
        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                           DAG.getConstant(-1, SL, MVT::i1));
      if ((CF == CRHSVal && CC == ISD::SETNE) ||
          (CT == CRHSVal && CC == ISD::SETEQ))
        return LHS.getOperand(0);
    }
  }

  if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
                                           VT != MVT::f16))
    return SDValue();

  // Match isinf/isfinite pattern
  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
  // (fcmp one (fabs x), inf) -> (fp_class x,
  // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
  if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    if (!CRHS)
      return SDValue();

    const APFloat &APF = CRHS->getValueAPF();
    if (APF.isInfinity() && !APF.isNegative()) {
      const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
                                 SIInstrFlags::N_INFINITY;
      const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
                                    SIInstrFlags::P_ZERO |
                                    SIInstrFlags::N_NORMAL |
                                    SIInstrFlags::P_NORMAL |
                                    SIInstrFlags::N_SUBNORMAL |
                                    SIInstrFlags::P_SUBNORMAL;
      unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
                         DAG.getConstant(Mask, SL, MVT::i32));
    }
  }

  return SDValue();
}

SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

  SDValue Src = N->getOperand(0);
  SDValue Srl = N->getOperand(0);
  if (Srl.getOpcode() == ISD::ZERO_EXTEND)
    Srl = Srl.getOperand(0);

  // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
  if (Srl.getOpcode() == ISD::SRL) {
    // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x

    if (const ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
                               EVT(MVT::i32));

      unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
      if (SrcOffset < 32 && SrcOffset % 8 == 0) {
        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
                           MVT::f32, Srl);
      }
    }
  }

  APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);

  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
    DCI.CommitTargetLoweringOpt(TLO);
  }

  return SDValue();
}

SDValue SITargetLowering::performClampCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
  if (!CSrc)
    return SDValue();

  const MachineFunction &MF = DCI.DAG.getMachineFunction();
  const APFloat &F = CSrc->getValueAPF();
  APFloat Zero = APFloat::getZero(F.getSemantics());
  APFloat::cmpResult Cmp0 = F.compare(Zero);
  if (Cmp0 == APFloat::cmpLessThan ||
      (Cmp0 == APFloat::cmpUnordered &&
       MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
  }

  APFloat One(F.getSemantics(), "1.0");
  APFloat::cmpResult Cmp1 = F.compare(One);
  if (Cmp1 == APFloat::cmpGreaterThan)
    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));

  return SDValue(CSrc, 0);
}

SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
    return SDValue();
  switch (N->getOpcode()) {
  default:
    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  case ISD::ADD:
    return performAddCombine(N, DCI);
  case ISD::SUB:
    return performSubCombine(N, DCI);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:
    return performAddCarrySubCarryCombine(N, DCI);
  case ISD::FADD:
    return performFAddCombine(N, DCI);
  case ISD::FSUB:
    return performFSubCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
    return performMinMaxCombine(N, DCI);
  case ISD::FMA:
    return performFMACombine(N, DCI);
  case ISD::LOAD: {
    if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
      return Widended;
    LLVM_FALLTHROUGH;
  }
  case ISD::STORE:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
  case ISD::ATOMIC_CMP_SWAP:
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_LOAD_FADD:
  case AMDGPUISD::ATOMIC_INC:
  case AMDGPUISD::ATOMIC_DEC:
  case AMDGPUISD::ATOMIC_LOAD_FMIN:
  case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
    if (DCI.isBeforeLegalize())
      break;
    return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case ISD::XOR:
    return performXorCombine(N, DCI);
  case ISD::ZERO_EXTEND:
    return performZeroExtendCombine(N, DCI);
  case ISD::SIGN_EXTEND_INREG:
    return performSignExtendInRegCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  case AMDGPUISD::RCP:
    return performRcpCombine(N, DCI);
  case AMDGPUISD::FRACT:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::LDEXP: {
    SDValue Src = N->getOperand(0);
    if (Src.isUndef())
      return Src;
    break;
  }
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performUCharToFloatCombine(N, DCI);
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return performCvtF32UByteNCombine(N, DCI);
  case AMDGPUISD::FMED3:
    return performFMed3Combine(N, DCI);
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
    return performCvtPkRTZCombine(N, DCI);
  case AMDGPUISD::CLAMP:
    return performClampCombine(N, DCI);
  case ISD::SCALAR_TO_VECTOR: {
    SelectionDAG &DAG = DCI.DAG;
    EVT VT = N->getValueType(0);

    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    if (VT == MVT::v2i16 || VT == MVT::v2f16) {
      SDLoc SL(N);
      SDValue Src = N->getOperand(0);
      EVT EltVT = Src.getValueType();
      if (EltVT == MVT::f16)
        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);

      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
      return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
    }

    break;
  }
  case ISD::EXTRACT_VECTOR_ELT:
    return performExtractVectorEltCombine(N, DCI);
  case ISD::INSERT_VECTOR_ELT:
    return performInsertVectorEltCombine(N, DCI);
  }
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

/// Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  case AMDGPU::sub4: return 4; // Possible with TFE/LWE
  }
}

/// Adjust the writemask of MIMG instructions
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                          SelectionDAG &DAG) const {
  unsigned Opcode = Node->getMachineOpcode();

  // Subtract 1 because the vdata output is not a MachineSDNode operand.
  int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
    return Node; // not implemented for D16

  SDNode *Users[5] = { nullptr };
  unsigned Lane = 0;
  unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;
  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
  bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
                  Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
  unsigned TFCLane = 0;
  bool HasChain = Node->getNumValues() > 1;

  if (OldDmask == 0) {
    // These are folded out, but on the chance it happens don't assert.
    return Node;
  }

  unsigned OldBitsSet = countPopulation(OldDmask);
  // Work out which is the TFE/LWE lane if that is enabled.
  if (UsesTFC) {
    TFCLane = OldBitsSet;
  }

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Don't look at users of the chain.
    if (I.getUse().getResNo() != 0)
      continue;

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return Node;

    // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
    if (UsesTFC && Lane == TFCLane) {
      Users[Lane] = *I;
    } else {
      // Set which texture component corresponds to the lane.
      unsigned Comp;
      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
        Comp = countTrailingZeros(Dmask);
        Dmask &= ~(1 << Comp);
      }

      // Abort if we have more than one user per component.
      if (Users[Lane])
        return Node;

      Users[Lane] = *I;
      NewDmask |= 1 << Comp;
    }
  }

  // Don't allow 0 dmask, as hardware assumes one channel enabled.
  bool NoChannels = !NewDmask;
  if (NoChannels) {
    if (!UsesTFC) {
      // No uses of the result and not using TFC. Then do nothing.
      return Node;
    }
    // If the original dmask has one channel - then nothing to do
    if (OldBitsSet == 1)
      return Node;
    // Use an arbitrary dmask - required for the instruction to work
    NewDmask = 1;
  }
  // Abort if there's no change
  if (NewDmask == OldDmask)
    return Node;

  unsigned BitsSet = countPopulation(NewDmask);

  // Check for TFE or LWE - increase the number of channels by one to account
  // for the extra return value
  // This will need adjustment for D16 if this is also included in
  // adjustWriteMask (this function) but at present D16 are excluded.
  unsigned NewChannels = BitsSet + UsesTFC;

  int NewOpcode =
      AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
  assert(NewOpcode != -1 &&
         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
         "failed to find equivalent MIMG op");

  // Adjust the writemask in the node
  SmallVector<SDValue, 12> Ops;
  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());

  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();

  MVT ResultVT = NewChannels == 1 ?
    SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
                           NewChannels == 5 ? 8 : NewChannels);
  SDVTList NewVTList = HasChain ?
    DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);

  MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
                                              NewVTList, Ops);

  if (HasChain) {
    // Update chain.
    DAG.setNodeMemRefs(NewNode, Node->memoperands());
    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
  }

  if (NewChannels == 1) {
    assert(Node->hasNUsesOfValue(1, 0));
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
                                      SDLoc(Node), Users[Lane]->getValueType(0),
                                      SDValue(NewNode, 0));
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return nullptr;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
    SDNode *User = Users[i];
    if (!User) {
      // Handle the special case of NoChannels. We set NewDmask to 1 above, but
      // Users[0] is still nullptr because channel 0 doesn't really have a use.
      if (i || !NoChannels)
        continue;
    } else {
      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
      DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
    }

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
    }
  }

  DAG.RemoveDeadNode(Node);
  return nullptr;
}


static bool isFrameIndexOp(SDValue Op) {
  if (Op.getOpcode() == ISD::AssertZext)
    Op = Op.getOperand(0);

  return isa<FrameIndexSDNode>(Op);
}

/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                        SelectionDAG &DAG) const {
  if (Node->getOpcode() == ISD::CopyToReg) {
    RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
    SDValue SrcVal = Node->getOperand(2);

    // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    // to try understanding copies to physical registers.
    if (SrcVal.getValueType() == MVT::i1 &&
        Register::isPhysicalRegister(DestReg->getReg())) {
      SDLoc SL(Node);
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      SDValue VReg = DAG.getRegister(
        MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);

      SDNode *Glued = Node->getGluedNode();
      SDValue ToVReg
        = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
                      SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
      SDValue ToResultReg
        = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
                           VReg, ToVReg.getValue(1));
      DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
      DAG.RemoveDeadNode(Node);
      return ToResultReg.getNode();
    }
  }

  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                     Node->getOperand(i).getValueType(),
                                     Node->getOperand(i)), 0));
  }

  return DAG.UpdateNodeOperands(Node, Ops);
}

/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  unsigned Opcode = Node->getMachineOpcode();

  if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
      !TII->isGather4(Opcode)) {
    return adjustWritemask(Node, DAG);
  }

  if (Opcode == AMDGPU::INSERT_SUBREG ||
      Opcode == AMDGPU::REG_SEQUENCE) {
    legalizeTargetIndependentNode(Node, DAG);
    return Node;
  }

  switch (Opcode) {
  case AMDGPU::V_DIV_SCALE_F32:
  case AMDGPU::V_DIV_SCALE_F64: {
    // Satisfy the operand register constraint when one of the inputs is
    // undefined. Ordinarily each undef value will have its own implicit_def of
    // a vreg, so force these to use a single register.
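    // For instance, if src0 is an IMPLICIT_DEF while src1 holds a real value,
    // src0 is rewritten below to reuse src1 (or a shared undef register when
    // both src1 and src2 are undefined) so the tying constraint still holds.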
    SDValue Src0 = Node->getOperand(0);
    SDValue Src1 = Node->getOperand(1);
    SDValue Src2 = Node->getOperand(2);

    if ((Src0.isMachineOpcode() &&
         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
        (Src0 == Src1 || Src0 == Src2))
      break;

    MVT VT = Src0.getValueType().getSimpleVT();
    const TargetRegisterClass *RC =
        getRegClassFor(VT, Src0.getNode()->isDivergent());

    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);

    SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
                                      UndefReg, Src0, SDValue());

    // src0 must be the same register as src1 or src2, even if the value is
    // undefined, so make sure we don't violate this constraint.
    if (Src0.isMachineOpcode() &&
        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
      if (Src1.isMachineOpcode() &&
          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src1;
      else if (Src2.isMachineOpcode() &&
               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src2;
      else {
        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
        Src0 = UndefReg;
        Src1 = UndefReg;
      }
    } else
      break;

    SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
    for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
      Ops.push_back(Node->getOperand(I));

    Ops.push_back(ImpDef.getValue(1));
    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
  }
  case AMDGPU::V_PERMLANE16_B32:
  case AMDGPU::V_PERMLANEX16_B32: {
    ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0));
    ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2));
    if (!FI->getZExtValue() && !BC->getZExtValue())
      break;
    SDValue VDstIn = Node->getOperand(6);
    if (VDstIn.isMachineOpcode()
        && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF)
      break;
    MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                               SDLoc(Node), MVT::i32);
    SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1),
                                    SDValue(BC, 0), Node->getOperand(3),
                                    Node->getOperand(4), Node->getOperand(5),
                                    SDValue(ImpDef, 0), Node->getOperand(7) };
    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
  }
  default:
    break;
  }

  return Node;
}

/// Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  if (TII->isVOP3(MI.getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);

    // Prefer VGPRs over AGPRs in mAI instructions where possible.
    // This saves a chain-copy of registers and better balances register
    // use between VGPRs and AGPRs, as AGPR tuples tend to be big.
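    // Concretely, the loop below re-classes src0/src1 operands that are
    // constrained to AV_32/AV_64, currently live in an AGPR, and are defined
    // only by a copy from an SGPR; such values can equally well live in the
    // equivalent VGPR class.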
    if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) {
      unsigned Opc = MI.getOpcode();
      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
                      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
        if (I == -1)
          break;
        MachineOperand &Op = MI.getOperand(I);
        if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
             OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
            !Register::isVirtualRegister(Op.getReg()) ||
            !TRI->isAGPR(MRI, Op.getReg()))
          continue;
        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
        if (!Src || !Src->isCopy() ||
            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
          continue;
        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
        // All uses of agpr64 and agpr32 can also accept vgpr except for
        // v_accvgpr_read, but we do not produce agpr reads during selection,
        // so no use checks are needed.
        MRI.setRegClass(Op.getReg(), NewRC);
      }
    }

    return;
  }

  // Replace unused atomics with the no return version.
  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
  if (NoRetAtomicOp != -1) {
    if (!Node->hasAnyUseOfValue(0)) {
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);
      return;
    }

    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
    // instruction, because the return type of these instructions is a vec2 of
    // the memory type, so it can be tied to the input operand.
    // This means these instructions always have a use, so we need to add a
    // special case to check if the atomic has only one extract_subreg use,
    // which itself has no uses.
    if ((Node->hasNUsesOfValue(1, 0) &&
         Node->use_begin()->isMachineOpcode() &&
         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
         !Node->use_begin()->hasAnyUseOfValue(0))) {
      unsigned Def = MI.getOperand(0).getReg();

      // Change this into a noret atomic.
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);

      // If we only remove the def operand from the atomic instruction, the
      // extract_subreg will be left with a use of a vreg without a def.
      // So we need to insert an implicit_def to avoid machine verifier
      // errors.
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(AMDGPU::IMPLICIT_DEF), Def);
    }

    return;
  }
}

static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
                              uint64_t Val) {
  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}

MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
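  // In the descriptor built below, dwords 0-1 hold the 64-bit pointer, dword 2
  // holds 0, and dword 3 holds the upper half of getDefaultRsrcDataFormat().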
  const SDValue Ops0[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
    buildSMovImm32(DAG, DL, 0),
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
  };

  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::v2i32, Ops0), 0);

  // Combine the constants and the pointer.
  const SDValue Ops1[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    Ptr,
    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
    SubRegHi,
    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}

/// Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
                                           SDValue Ptr, uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
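  // Descriptor layout produced here: dwords 0-1 are the (possibly modified)
  // pointer, dwords 2-3 come from RsrcDword2And3, and a non-zero RsrcDword1
  // is OR'ed into the high pointer dword.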
  if (RsrcDword1) {
    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
                                       DAG.getConstant(RsrcDword1, DL, MVT::i32)),
                    0);
  }

  SDValue DataLo = buildSMovImm32(DAG, DL,
                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

  const SDValue Ops[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    PtrLo,
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    PtrHi,
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
    DataLo,
    DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
    DataHi,
    DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}

//===----------------------------------------------------------------------===//
// SI Inline Assembly Support
//===----------------------------------------------------------------------===//

std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                               StringRef Constraint,
                                               MVT VT) const {
  const TargetRegisterClass *RC = nullptr;
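  // Single-letter constraints pick an SGPR, VGPR or AGPR class whose width
  // matches VT; longer constraints that name a specific register are handled
  // further down and return that exact register.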
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's':
    case 'r':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::SReg_32_XM0RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      case 96:
        RC = &AMDGPU::SReg_96RegClass;
        break;
      case 128:
        RC = &AMDGPU::SReg_128RegClass;
        break;
      case 160:
        RC = &AMDGPU::SReg_160RegClass;
        break;
      case 256:
        RC = &AMDGPU::SReg_256RegClass;
        break;
      case 512:
        RC = &AMDGPU::SReg_512RegClass;
        break;
      }
      break;
    case 'v':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::VGPR_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::VReg_64RegClass;
        break;
      case 96:
        RC = &AMDGPU::VReg_96RegClass;
        break;
      case 128:
        RC = &AMDGPU::VReg_128RegClass;
        break;
      case 160:
        RC = &AMDGPU::VReg_160RegClass;
        break;
      case 256:
        RC = &AMDGPU::VReg_256RegClass;
        break;
      case 512:
        RC = &AMDGPU::VReg_512RegClass;
        break;
      }
      break;
    case 'a':
      if (!Subtarget->hasMAIInsts())
        break;
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::AGPR_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::AReg_64RegClass;
        break;
      case 128:
        RC = &AMDGPU::AReg_128RegClass;
        break;
      case 512:
        RC = &AMDGPU::AReg_512RegClass;
        break;
      case 1024:
        RC = &AMDGPU::AReg_1024RegClass;
        // v32 types are not legal but we support them here.
        return std::make_pair(0U, RC);
      }
      break;
    }
    // We actually support i128, i16 and f16 as inline parameters
    // even if they are not reported as legal.
    if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
               VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
      return std::make_pair(0U, RC);
  }

  if (Constraint.size() > 1) {
    if (Constraint[1] == 'v') {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (Constraint[1] == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    } else if (Constraint[1] == 'a') {
      RC = &AMDGPU::AGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
      if (!Failed && Idx < RC->getNumRegs())
        return std::make_pair(RC->getRegister(Idx), RC);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 's':
    case 'v':
    case 'a':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
  }

  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                             Info->getStackPtrOffsetReg()));
  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());

  // We need to worry about replacing the default register with itself in case
  // of MIR testcases missing the MFI.
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());

  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());

  if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
    MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
                       Info->getScratchWaveOffsetReg());
  }

  Info->limitOccupancy(MF);

  if (ST.isWave32() && !MF.empty()) {
    // Add a VCC_HI def because many instructions are marked as implicitly
    // using VCC, while we may only define VCC_LO. If nothing defines VCC_HI
    // we may end up with a use of undef.

    const SIInstrInfo *TII = ST.getInstrInfo();
    DebugLoc DL;

    MachineBasicBlock &MBB = MF.front();
    MachineBasicBlock::iterator I = MBB.getFirstNonDebugInstr();
    BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), AMDGPU::VCC_HI);

    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);
      }
    }
  }

  TargetLoweringBase::finalizeLowering(MF);
}

void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
                                                     KnownBits &Known,
                                                     const APInt &DemandedElts,
                                                     const SelectionDAG &DAG,
                                                     unsigned Depth) const {
  TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
                                                DAG, Depth);

  // Set the high bits to zero based on the maximum allowed scratch size per
  // wave. We can't use vaddr in MUBUF instructions if we don't know the
  // address calculation won't overflow, so assume the sign bit is never set.
  Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}

unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
  const unsigned CacheLineAlign = 6; // log2(64)

  // Pre-GFX10 targets did not benefit from loop alignment.
  if (!ML || DisableLoopAlignment ||
      (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 the I$ has 4 x 64-byte cache lines.
  // By default the prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH for larger loops to keep two lines
  // behind and one ahead.
  // Therefore we can benefit from aligning loop headers if the loop fits in
  // 192 bytes.
  // If the loop fits in 64 bytes it always spans no more than two cache lines
  // and does not need an alignment.
  // Else, if the loop is no larger than 128 bytes, we do not need to modify
  // the prefetch settings.
  // Else, if the loop is no larger than 192 bytes, we need two lines behind.
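  // In short: loops larger than 192 bytes or no larger than 64 bytes keep the
  // default alignment; loops of 65-192 bytes are aligned to a 64-byte cache
  // line, and the 129-192 byte range additionally gets S_INST_PREFETCH
  // bracketing (see below).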

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If an inner loop block is aligned, assume on average half of the
    // alignment size to be added as nops.
    if (MBB != Header)
      LoopSize += (1 << MBB->getAlignment()) / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of the parent loops is surrounded by prefetch instructions do not
  // insert new ones for the inner loop, which would reset the parent's
  // settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  if (Pre && Exit) {
    BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
            TII->get(AMDGPU::S_INST_PREFETCH))
      .addImm(1); // prefetch 2 lines behind PC

    BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
            TII->get(AMDGPU::S_INST_PREFETCH))
      .addImm(2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}

LLVM_ATTRIBUTE_UNUSED
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
  assert(N->getOpcode() == ISD::CopyFromReg);
  do {
    // Follow the chain until we find an INLINEASM node.
    N = N->getOperand(0).getNode();
    if (N->getOpcode() == ISD::INLINEASM ||
        N->getOpcode() == ISD::INLINEASM_BR)
      return true;
  } while (N->getOpcode() == ISD::CopyFromReg);
  return false;
}

bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
    FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *KDA) const {
  switch (N->getOpcode()) {
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
    const MachineFunction *MF = FLI->MF;
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const MachineRegisterInfo &MRI = MF->getRegInfo();
    const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
    unsigned Reg = R->getReg();
    if (Register::isPhysicalRegister(Reg))
      return !TRI.isSGPRReg(MRI, Reg);

    if (MRI.isLiveIn(Reg)) {
      // workitem.id.x workitem.id.y workitem.id.z
      // Any VGPR formal argument is also considered divergent
      if (!TRI.isSGPRReg(MRI, Reg))
        return true;
      // Formal arguments of non-entry functions
      // are conservatively considered divergent
      else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
        return true;
      return false;
    }
    const Value *V = FLI->getValueFromVirtualReg(Reg);
    if (V)
      return KDA->isDivergent(V);
    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
    return !TRI.isSGPRReg(MRI, Reg);
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
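    // Private accesses are inherently per-lane, and a flat access cannot be
    // proven here not to touch private memory, so both are conservatively
    // treated as divergent.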
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  }
  case ISD::CALLSEQ_END:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(
        cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
  case ISD::INTRINSIC_W_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(
        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
  // In some cases intrinsics that are a source of divergence have been
  // lowered to AMDGPUISD, so we also need to check those.
  case AMDGPUISD::INTERP_MOV:
  case AMDGPUISD::INTERP_P1:
  case AMDGPUISD::INTERP_P2:
    return true;
  }
  return false;
}

bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
  case MVT::f32:
    return Subtarget->hasFP32Denormals();
  case MVT::f64:
    return Subtarget->hasFP64Denormals();
  case MVT::f16:
    return Subtarget->hasFP16Denormals();
  default:
    return false;
  }
}

bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                    const SelectionDAG &DAG,
                                                    bool SNaN,
                                                    unsigned Depth) const {
  if (Op.getOpcode() == AMDGPUISD::CLAMP) {
    const MachineFunction &MF = DAG.getMachineFunction();
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

    if (Info->getMode().DX10Clamp)
      return true; // Clamped to 0.
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }

  return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
                                                            SNaN, Depth);
}

TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::FAdd: {
    Type *Ty = RMW->getType();

    // We don't have a way to support 16-bit atomics now, so just leave them
    // as-is.
    if (Ty->isHalfTy())
      return AtomicExpansionKind::None;

    if (!Ty->isFloatTy())
      return AtomicExpansionKind::CmpXChg;

    // TODO: Do have these for flat. Older targets also had them for buffers.
    unsigned AS = RMW->getPointerAddressSpace();
    return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
      AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
  }
  default:
    break;
  }

  return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);