//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//
#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Transforms/Utils/LowerAtomic.h"
using namespace llvm;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> DisableLoopAlignment("amdgpu-disable-loop-alignment",
                                          cl::desc("Do not align and prefetch loops"),
                                          cl::init(false));

static cl::opt<bool> UseDivergentRegisterIndexing(
    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
    cl::init(false));
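
// The helpers below check whether the function's FP denormal mode flushes
// denormals (preserve-sign); some of the mad/fma mix folds later in this file
// depend on that.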
static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
}
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
}
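
// Return the first SGPR0-based register that CCInfo has not allocated yet.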
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}
SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);

  const SIRegisterInfo *TRI = STI.getRegisterInfo();
  const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();

  addRegisterClass(MVT::f64, V64RegClass);
  addRegisterClass(MVT::v2f32, V64RegClass);
  addRegisterClass(MVT::Untyped, V64RegClass);

  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
  addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));

  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));

  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
  addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));

  addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
  addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));

  addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
  addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));

  addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
  addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));

  addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
  addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));

  addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
  addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));

  addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
  addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));

  addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
  addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));

  addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
  addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));

  addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
  addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));

  addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
  addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));

  addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
  addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));

  addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
  addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {
      addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
      addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
      addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
    } else {
      addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
      addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
      addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
    }

    // Unless there are also VOP3P operations, no operations are really legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
    addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
    addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
  }

  addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
  addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // The boolean content concept here is too inflexible. Compares only ever
  // really produce a 1-bit result. Any copy/extend from these will turn into a
  // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
  // it's what most targets use.
  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrOneBooleanContent);
  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     Custom);

  setOperationAction(ISD::STORE,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     Custom);
  if (isTypeLegal(MVT::bf16)) {
    for (unsigned Opc :
         {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
          ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
          ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
          ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
          ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
          ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
          ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
          ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE}) {
      // FIXME: The promoted to type shouldn't need to be explicit
      setOperationAction(Opc, MVT::bf16, Promote);
      AddPromotedToType(Opc, MVT::bf16, MVT::f32);
    }
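
    // In effect, bf16 arithmetic is performed in f32 and the result is then
    // converted back to bf16.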
    setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand);

    setOperationAction(ISD::SELECT, MVT::bf16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);

    setOperationAction(ISD::FABS, MVT::bf16, Legal);
    setOperationAction(ISD::FNEG, MVT::bf16, Legal);
    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Legal);

    // We only need to custom lower because we can't specify an action for bf16
    // sources.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }
  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
  setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::SELECT_CC,
                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
  setOperationAction(ISD::TRUNCATE,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                     Expand);
  setOperationAction(ISD::FP_ROUND,
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
                     Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG,
                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},
                     Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC,
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

  setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);

  setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);

  setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
                     Custom);

  setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT :
       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::UNDEF:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::SCALAR_TO_VECTOR:
      case ISD::IS_FPCLASS:
        break;
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::INSERT_SUBVECTOR:
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }
  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
  }

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
  }

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
  }

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
  }
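
  // After these promotions, a 64-bit element access is carried out as a pair
  // of 32-bit accesses on the bitcast i32 vector.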
  setOperationAction(ISD::VECTOR_SHUFFLE,
                     {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
                     Expand);

  setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                     Custom);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                     Custom);
  // Deal with vec3 vector operations when widened to vec4.
  setOperationAction(ISD::INSERT_SUBVECTOR,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

  // Deal with vec5/6/7 vector operations when widened to vec8.
  setOperationAction(ISD::INSERT_SUBVECTOR,
                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
                     Expand);

  setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);

  setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
  // FIXME: This should be narrowed to i32, but that only happens if i64 is
  // illegal.
  // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
  setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);

  // On SI this is s_memtime and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (Subtarget->hasSMemRealTime() ||
      Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
    setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
  setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
  if (Subtarget->has16BitInsts()) {
    setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
    setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);

    setOperationAction(ISD::FSQRT, MVT::f16, Custom);
  }

  if (Subtarget->hasMadMacF32Insts())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  if (!Subtarget->hasBFI())
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // for now.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);

  // Clamp modifier on add/sub
  if (Subtarget->hasIntClamp())
    setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);

  if (Subtarget->hasAddNoCarry())
    setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
                       Legal);

  setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
                     Custom);

  // These are really only legal for ieee_mode functions. We should be avoiding
  // them for functions that don't have ieee_mode enabled, so just say they are
  // legal.
  setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
                     {MVT::f32, MVT::f64}, Legal);
  if (Subtarget->haveRoundOpsF64())
    setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
                       Legal);
  else
    setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
                       MVT::f64, Custom);

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
  setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
                     Custom);
  setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
  setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);

  // Custom lower these because we can't specify a rule based on an illegal
  // source bf16.
  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
  if (Subtarget->has16BitInsts()) {
    setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
                        ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
                       MVT::i16, Legal);

    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
                       MVT::i16, Expand);

    setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
                        ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
                        ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
                        ISD::CTPOP},
                       MVT::i16, Promote);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);

    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom);
    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // BF16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
                        ISD::FSIN, ISD::FROUND},
                       MVT::f16, Custom);

    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);

    // F16 - VOP2 Actions.
    setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
                       Expand);
    setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
    setOperationAction(ISD::FFREXP, MVT::f16, Custom);
    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (STI.hasMadF16())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);
    for (MVT VT :
         {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
          MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
          MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::UNDEF:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
        case ISD::IS_FPCLASS:
          break;
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::CONCAT_VECTORS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }
    // v_perm_b32 can handle either of these.
    setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
    setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);

    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);

    setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
                       Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);

    setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
    setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
    setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v8i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
    setOperationAction(ISD::STORE, MVT::v8f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
    setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);

    setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
    setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
    setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);

    setOperationAction(ISD::STORE, MVT::v16i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
    setOperationAction(ISD::STORE, MVT::v16f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
    setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);

    setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
    setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
    setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);

    setOperationAction(ISD::STORE, MVT::v32i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
    setOperationAction(ISD::STORE, MVT::v32f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
    setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v4i32, Expand);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v8i32, Expand);

    setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
                       Subtarget->hasVOP3PInsts() ? Legal : Custom);

    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns)
    setOperationAction(ISD::FABS, MVT::v2f16, Legal);

    setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
    setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
    setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
                        ISD::FMAXIMUMNUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Custom);

    setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Expand);

    for (MVT Vec16 :
         {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
          MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
      setOperationAction(
          {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
          Vec16, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
    }
  }
  if (Subtarget->hasVOP3PInsts()) {
    setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
                        ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
                        ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
                       MVT::v2i16, Legal);

    setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
                        ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
                       MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE,
                       {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
                        MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
                       Custom);

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
      // Split vector operations.
      setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
                          ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
                          ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
                          ISD::SSUBSAT},
                         VT, Custom);

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
      // Split vector operations.
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
                         VT, Custom);

    setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
                       Custom);

    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
    setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                       Custom);

    if (Subtarget->hasPackedFP32Ops()) {
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
                         MVT::v2f32, Legal);
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
                         {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
                         Custom);
    }
  }

  setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
  } else {
    // Legalization hack.
    setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);

    setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
  }

  setOperationAction(ISD::SELECT,
                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},
                     Custom);

  setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
  if (Subtarget->hasScalarSMulU64())
    setOperationAction(ISD::MUL, MVT::i64, Custom);

  if (Subtarget->hasMad64_32())
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);

  if (Subtarget->hasPrefetch())
    setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  if (Subtarget->hasIEEEMinMax()) {
    setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
    setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Custom);
  }
  setOperationAction(ISD::INTRINSIC_WO_CHAIN,
                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
                      MVT::i8},
                     Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN,
                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     Custom);

  setOperationAction(ISD::INTRINSIC_VOID,
                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     Custom);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
  setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
  setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);

  // TODO: Could move this to custom lowering, could benefit from combines on
  // extract of relevant bits.
  setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
  setOperationAction(ISD::MUL, MVT::i1, Promote);

  if (Subtarget->hasBF16ConversionInsts()) {
    setOperationAction(ISD::FP_ROUND, MVT::v2bf16, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::bf16, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
  }
  setTargetDAGCombine({ISD::ADD,
                       ISD::SCALAR_TO_VECTOR,
                       ISD::SIGN_EXTEND_INREG,
                       ISD::EXTRACT_VECTOR_ELT,
                       ISD::INSERT_VECTOR_ELT});

  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
    setTargetDAGCombine(ISD::FP_ROUND);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine({ISD::LOAD,
                       ISD::STORE,
                       ISD::ATOMIC_LOAD,
                       ISD::ATOMIC_STORE,
                       ISD::ATOMIC_CMP_SWAP,
                       ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
                       ISD::ATOMIC_SWAP,
                       ISD::ATOMIC_LOAD_ADD,
                       ISD::ATOMIC_LOAD_SUB,
                       ISD::ATOMIC_LOAD_AND,
                       ISD::ATOMIC_LOAD_OR,
                       ISD::ATOMIC_LOAD_XOR,
                       ISD::ATOMIC_LOAD_NAND,
                       ISD::ATOMIC_LOAD_MIN,
                       ISD::ATOMIC_LOAD_MAX,
                       ISD::ATOMIC_LOAD_UMIN,
                       ISD::ATOMIC_LOAD_UMAX,
                       ISD::ATOMIC_LOAD_FADD,
                       ISD::ATOMIC_LOAD_FMIN,
                       ISD::ATOMIC_LOAD_FMAX,
                       ISD::ATOMIC_LOAD_UINC_WRAP,
                       ISD::ATOMIC_LOAD_UDEC_WRAP,
                       ISD::INTRINSIC_VOID,
                       ISD::INTRINSIC_W_CHAIN});

  // FIXME: In other contexts we pretend this is a per-function property.
  setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
  setSchedulingPreference(Sched::RegPressure);
}

const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
  return RCRegs;
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//
// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case when denormals are enabled that we don't
// currently handle, where this is OK to use.
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
         DestVT.getScalarType() == MVT::f32 &&
         SrcVT.getScalarType() == MVT::f16 &&
         // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(DAG.getMachineFunction());
}

bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
                                       LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         DestTy.getScalarSizeInBits() == 32 &&
         SrcTy.getScalarSizeInBits() == 16 &&
         // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(*MI.getMF());
}

bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}
MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    if (Size == 16) {
      if (Subtarget->has16BitInsts()) {
        if (VT.isInteger())
          return MVT::v2i16;
        return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
      }
      return VT.isInteger() ? MVT::i32 : MVT::f32;
    }

    if (Size < 16)
      return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
    return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
  }

  if (VT.getSizeInBits() > 32)
    return MVT::i32;

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
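
// For example, with 16-bit instructions available a <3 x half> argument uses
// the v2f16 register type above and is counted as (3 + 1) / 2 = 2 registers
// below.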
unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    // FIXME: Should probably promote 8-bit vectors to i16.
    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;

    if (Size <= 32)
      return NumElts;

    return NumElts * ((Size + 31) / 32);
  } else if (VT.getSizeInBits() > 32)
    return (VT.getSizeInBits() + 31) / 32;

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // different.
    if (Size == 16 && Subtarget->has16BitInsts()) {
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
      } else {
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      }
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }

    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size < 16 && Subtarget->has16BitInsts()) {
      // FIXME: Should probably form v2i16 pieces
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size != 16 && Size <= 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    // Size > 32: split each element into 32-bit pieces.
    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
                                 const DataLayout &DL, Type *Ty,
                                 unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

  LLVMContext &Ctx = Ty->getContext();
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
    return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
                            NumElts);
  }

  return TLI.getValueType(DL, Ty);
}
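
// For a TFE load the IR return type is an aggregate such as {<4 x float>, i32};
// only the data member is relevant for the memory VT, the trailing i32 being
// the status word.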
// Peek through TFE struct returns to only use the data size.
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
                                   const DataLayout &DL, Type *Ty,
                                   unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);
  if (!ST)
    return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);

  // TFE intrinsics return an aggregate type.
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));
  return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
}
/// Map address space 7 to MVT::v5i32 because that's its in-memory
/// representation. This return value is vector-typed because there is no
/// MVT::i160 and it is not clear if one can be added. While this could
/// cause issues during codegen, these address space 7 pointers will be
/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
/// modeling, to work.
MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
  if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
    return MVT::v5i32;
  if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::v6i32;
  return AMDGPUTargetLowering::getPointerTy(DL, AS);
}

/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
/// v8i32 when padding is added.
/// The in-memory representation of a p9 is {p8, i32, i32}, which is
/// also v8i32 with padding.
MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
  if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
       DL.getPointerSizeInBits(AS) == 160) ||
      (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
       DL.getPointerSizeInBits(AS) == 192))
    return MVT::v8i32;
  return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {
  Info.flags = MachineMemOperand::MONone;
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
    Info.flags |= MachineMemOperand::MOInvariant;

  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
          AMDGPU::lookupRsrcIntrinsic(IntrID)) {
    AttributeList Attr =
        Intrinsic::getAttributes(CI.getContext(), (Intrinsic::ID)IntrID);
    MemoryEffects ME = Attr.getMemoryEffects();
    if (ME.doesNotAccessMemory())
      return false;

    // TODO: Should images get their own address space?
    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
    if (RsrcIntr->IsImage) {
      const AMDGPU::ImageDimIntrinsicInfo *Intr =
          AMDGPU::getImageDimIntrinsicInfo(IntrID);
      BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
    }

    Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
      if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
        // We conservatively set the memory operand of a buffer intrinsic to the
        // base resource pointer, so that we can access alias information about
        // those pointers. Cases like "this points at the same value
        // but with a different offset" are handled in
        // areMemAccessesTriviallyDisjoint.
        Info.ptrVal = RsrcArg;
    }

    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
    if (!IsSPrefetch) {
      auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
      if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
        Info.flags |= MachineMemOperand::MOVolatile;
    }

    Info.flags |= MachineMemOperand::MODereferenceable;
    if (ME.onlyReadsMemory()) {
      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;

        if (!BaseOpcode->Gather4) {
          // If this isn't a gather, we may have excess loaded elements in the
          // IR type. Check the dmask for the real number of elements loaded.
          unsigned DMask =
              cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
          MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
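          // For example, a dmask of 0b0101 loads two components, so the
          // memory VT below becomes a two-element vector even if the IR
          // return type has four elements.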
        }

        Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
                                             CI.getType(), MaxNumLanes);
      } else {
        Info.memVT =
            memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
                                    std::numeric_limits<unsigned>::max());
      }

      // FIXME: What does alignment mean for an image?
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.flags |= MachineMemOperand::MOLoad;
    } else if (ME.onlyWritesMemory()) {
      Info.opc = ISD::INTRINSIC_VOID;

      Type *DataTy = CI.getArgOperand(0)->getType();
      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
        unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
        Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
                                           DMaskLanes);
      } else {
        Info.memVT = getValueType(MF.getDataLayout(), DataTy);
      }

      Info.flags |= MachineMemOperand::MOStore;
    } else {
      // Atomic, NoReturn Sampler or prefetch
      Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
                                          : ISD::INTRINSIC_W_CHAIN;
      Info.flags |=
          MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;

      if (!IsSPrefetch)
        Info.flags |= MachineMemOperand::MOStore;

      switch (IntrID) {
      default:
        if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
          // Fake memory access type for no return sampler intrinsics
          Info.memVT = MVT::i32;
        } else {
          // XXX - Should this be volatile without known ordering?
          Info.flags |= MachineMemOperand::MOVolatile;
          Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
        }
        break;
      case Intrinsic::amdgcn_raw_buffer_load_lds:
      case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
      case Intrinsic::amdgcn_struct_buffer_load_lds:
      case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
        unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
        Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
        Info.ptrVal = CI.getArgOperand(1);
        return true;
      }
      case Intrinsic::amdgcn_raw_atomic_buffer_load:
      case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
      case Intrinsic::amdgcn_struct_atomic_buffer_load:
      case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
        Info.memVT =
            memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
                                    std::numeric_limits<unsigned>::max());
        Info.flags &= ~MachineMemOperand::MOStore;
        return true;
      }
      }
    }

    return true;
  }

  switch (IntrID) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
    Info.ptrVal = nullptr;
    Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_global_atomic_csub: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
                  MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?

    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
    Info.flags |=
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
    return true;
  }
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
                  MachineMemOperand::MODereferenceable |
                  MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.opc = ISD::INTRINSIC_VOID;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTargetMachine());

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Info.ptrVal = MFI->getGWSPSV(TM);

    // This is an abstract access, but we need to specify a type and size.
    Info.memVT = MVT::i32;
    Info.align = Align(4);

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
      Info.flags |= MachineMemOperand::MOLoad;
    else
      Info.flags |= MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::amdgcn_global_load_lds: {
    Info.opc = ISD::INTRINSIC_VOID;
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
    Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
    Info.ptrVal = CI.getArgOperand(1);
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTargetMachine());

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Info.ptrVal = MFI->getGWSPSV(TM);

    // This is an abstract access, but we need to specify a type and size.
    Info.memVT = MVT::i32;
    Info.align = Align(4);

    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::amdgcn_s_prefetch_data: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
    Info.ptrVal = CI.getArgOperand(0);
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  default:
    return false;
  }
}
void SITargetLowering::CollectTargetIntrinsicOperands(
    const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
  switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    // The DAG's ValueType loses the addrspaces.
    // Add them as 2 extra Constant operands "from" and "to".
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
    Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
    Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
    break;
  }
  default:
    break;
  }
}
bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                            SmallVectorImpl<Value *> &Ops,
                                            Type *&AccessTy) const {
  Value *Ptr = nullptr;
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
    Ptr = II->getArgOperand(0);
    break;
  case Intrinsic::amdgcn_global_load_lds:
    Ptr = II->getArgOperand(1);
    break;
  default:
    return false;
  }
  AccessTy = II->getType();
  Ops.push_back(Ptr);
  return true;
}
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
                                                 unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {
    // Flat instructions do not have offsets, and only have the register
    // address.
    return AM.BaseOffs == 0 && AM.Scale == 0;
  }

  decltype(SIInstrFlags::FLAT) FlatVariant =
      AddrSpace == AMDGPUAS::GLOBAL_ADDRESS    ? SIInstrFlags::FlatGlobal
      : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
                                               : SIInstrFlags::FLAT;

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));
}
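
// With flat-global instructions, the global case below reduces to the FLAT
// rules above: the scale must be 0 and any immediate offset must be accepted
// by isLegalFLATOffset for the FlatGlobal variant.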
bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
  if (Subtarget->hasFlatGlobalInsts())
    return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS);

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
    // Assume that we will use FLAT for all global memory accesses
    // on VI.
    // FIXME: This assumption is currently wrong. On VI we still use
    // MUBUF instructions for the r + i addressing mode. As currently
    // implemented, the MUBUF instructions only work on buffer < 4GB.
    // It may be possible to support > 4GB buffers with MUBUF instructions,
    // by setting the stride value in the resource descriptor which would
    // increase the size limit to (stride * 4GB). However, this is risky,
    // because it has never been validated.
    return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
  }

  return isLegalMUBUFAddressingMode(AM);
}
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
  // additionally can do r + r + i with addr64. 32-bit has more addressing
  // mode options. Depending on the resource constant, it can also do
  // (i64 r0) + (i32 r1) * (i14 i).
  //
  // Private arrays end up using a scratch buffer most of the time, so also
  // assume those use MUBUF instructions. Scratch loads / stores are currently
  // implemented as mubuf instructions with offen bit set, so slightly
  // different than the normal addr64.
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
    return false;

  // FIXME: Since we can split immediate into soffset and immediate offset,
  // would it make sense to allow any immediate?

  switch (AM.Scale) {
  case 0: // r + i or just i, depending on HasBaseReg.
    return true;
  case 1:
    return true; // We have r + r or r + i.
  case 2:
    if (AM.HasBaseReg) {
      // Reject 2 * r + r.
      return false;
    }

    // Allow 2 * r as r + r
    // Or 2 * r + i is allowed as r + r + i.
    return true;
  default: // Don't allow n * r
    return false;
  }
}
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             Instruction *I) const {
  // No global is ever allowed as a base.

  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return isLegalGlobalAddressingMode(AM);

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
      AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
    // If the offset isn't a multiple of 4, it probably isn't going to be
    // correctly aligned.
    // FIXME: Can we get the real alignment here?
    if (AM.BaseOffs % 4 != 0)
      return isLegalMUBUFAddressingMode(AM);

    if (!Subtarget->hasScalarSubwordLoads()) {
      // There are no SMRD extloads, so if we have to do a small type access we
      // will use a MUBUF load.
      // FIXME?: We also need to do this if unaligned, but we don't know the
      if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
        return isLegalGlobalAddressingMode(AM);

    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
      if (!isUInt<8>(AM.BaseOffs / 4))
    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
      // in 8-bits, it can use a smaller encoding.
      if (!isUInt<32>(AM.BaseOffs / 4))
    } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
      if (!isUInt<20>(AM.BaseOffs))
    } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
      // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
      // for S_BUFFER_* instructions).
      if (!isInt<21>(AM.BaseOffs))
      // On GFX12, all offsets are signed 24-bit in bytes.
      if (!isInt<24>(AM.BaseOffs))

    if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
         AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      // Scalar (non-buffer) loads can only use a negative offset if
      // soffset+offset is non-negative. Since the compiler can only prove that
      // in a few special cases, it is safer to claim that negative offsets are

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.

    if (AM.Scale == 1 && AM.HasBaseReg)

  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return Subtarget->enableFlatScratch()
               ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS)
               : isLegalMUBUFAddressingMode(AM);

  if (AS == AMDGPUAS::LOCAL_ADDRESS ||
      (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
    // an 8-bit dword offset but we don't know the alignment here.
    if (!isUInt<16>(AM.BaseOffs))

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.

    if (AM.Scale == 1 && AM.HasBaseReg)

  if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
    // For an unknown address space, this usually means that this is for some
    // reason being used for pure arithmetic, and not based on some addressing
    // computation. We don't have instructions that compute pointers with any
    // addressing modes, so treat them as having no offset like flat
    return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);

  // Assume a user alias of global for unknown address spaces.
  return isLegalGlobalAddressingMode(AM);
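// Limit the width of merged stores based on the address space: global/flat
// stores merge up to 128 bits, private up to the subtarget's maximum private
// element size, and LDS/GDS up to 64 bits.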
bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
                                        const MachineFunction &MF) const {
  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
    return (MemVT.getSizeInBits() <= 4 * 32);
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
    return (MemVT.getSizeInBits() <= MaxPrivateBits);
  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
    return (MemVT.getSizeInBits() <= 2 * 32);
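// Decide whether an access of the given size, address space, and alignment is
// allowed, and report a relative "speed rank" through *IsFast (see the
// comments below on how those values compare).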
bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
    unsigned Size, unsigned AddrSpace, Align Alignment,
    MachineMemOperand::Flags Flags, unsigned *IsFast) const {
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
    // Check if alignment requirements for ds_read/write instructions are
    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))

    Align RequiredAlignment(
        PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
    if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
        Alignment < RequiredAlignment)

    // Either, the alignment requirements are "enabled", or there is an
    // unaligned LDS access related hardware bug though alignment requirements
    // are "disabled". In either case, we need to check for proper alignment
      // SI has a hardware bug in the LDS / GDS bounds checking: if the base
      // address is negative, then the instruction is incorrectly treated as
      // out-of-bounds even if base + offsets is in bounds. Split vectorized
      // loads here to avoid emitting ds_read2_b32. We may re-combine the
      // load later in the SILoadStoreOptimizer.
      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))

      // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
      // can do a 4 byte aligned, 8 byte access in a single operation using
      // ds_read2/write2_b32 with adjacent offsets.
      RequiredAlignment = Align(4);

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
        // ds_write2_b32 depending on the alignment. In either case with either
        // alignment there is no faster way of doing this.
        //
        // The numbers returned here and below are not additive, it is a 'speed
        // rank'. They are just meant to be compared to decide if a certain way
        // of lowering an operation is faster than another. For that purpose
        // naturally aligned operation gets its bitsize to indicate that "it
        // operates with a speed comparable to N-bit wide load". With the full
        // alignment ds128 is slower than ds96 for example. If underaligned it
        // is comparable to a speed of a single dword access, which would then
        // mean 32 < 128 and it is faster to issue a wide load regardless.
        // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
        // wider load which will not be aligned anymore the latter is slower.
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128())

      // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        // Naturally aligned access is fastest. However, also report it is Fast
        // if memory is aligned less than DWORD. A narrow load or store will
        // be equally slow as a single ds_read_b96/ds_write_b96, but there will
        // be more of them, so overall we will pay less penalty issuing a single
        // See comment on the values above.
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())

      // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
      // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
      // single operation using ds_read2/write2_b64.
      RequiredAlignment = Align(8);

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        // Naturally aligned access is fastest. However, also report it is Fast
        // if memory is aligned less than DWORD. A narrow load or store will
        // be equally slow as a single ds_read_b128/ds_write_b128, but there
        // will be more of them, so overall we will pay less penalty issuing a
        // single instruction.
        //
        // See comment on the values above.
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4))         ? 32

      // See comment on the values above.
      // Note that we have a single-dword or sub-dword here, so if underaligned
      // it is a slowest possible access, hence returned value is 0.
      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();

  // FIXME: We have to be conservative here and assume that flat operations
  // will access scratch. If we had access to the IR function, then we
  // could determine if any private memory was used in the function.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
      AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();

  // So long as they are correct, wide global memory operations perform better
  // than multiple smaller memory ops -- even when misaligned
  if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();

  // Smaller than dword value must be aligned.

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.

  return Size >= 32 && Alignment >= Align(4);
bool SITargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
    unsigned *IsFast) const {
  return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
                                            Alignment, Flags, IsFast);

EVT SITargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  // FIXME: Should account for address space here.

  // The default fallback uses the private pointer size as a guess for a type to
  // use. Make sure we switch these to 64-bit accesses.

  if (Op.size() >= 16 &&
      Op.isDstAligned(Align(4))) // XXX: Should only do for global

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);
  return MemNode->getMemOperand()->getFlags() & MONoClobber;

bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
         AS == AMDGPUAS::PRIVATE_ADDRESS;

bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
  // Flat -> private/local is a simple truncate.
  // Flat -> global is no-op
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)

  const GCNTargetMachine &TM =
      static_cast<const GCNTargetMachine &>(getTargetMachine());
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(MVT VT) const {
  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
      VT.getScalarType().bitsLE(MVT::i16))
    return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
  return TargetLoweringBase::getPreferredVectorAction(VT);

bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
  // FIXME: Could be smarter if called for vector constants.

bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                               unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))

  // TODO: Add more cases that are cheap.

bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
  if (Subtarget->has16BitInsts() && VT == MVT::i16) {

  // SimplifySetCC uses this function to determine whether or not it should
  // create setcc with i1 operands. We don't have instructions for i1 setcc.
  if (VT == MVT::i1 && Op == ISD::SETCC)

  return TargetLowering::isTypeDesirableForOp(Op, VT);
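// Build a pointer into the kernel argument segment at the given byte offset,
// based on the preloaded kernarg segment pointer (or a plain constant when no
// kernarg segment argument is available).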
SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
                                                   uint64_t Offset) const {
  const DataLayout &DL = DAG.getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);

  auto [InputPtrReg, RC, ArgTy] =
      Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);

  // We may not have the kernarg segment argument if we have no kernel
    return DAG.getConstant(Offset, SL, PtrVT);

  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  SDValue BasePtr = DAG.getCopyFromReg(
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

  return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));

SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
                                            const SDLoc &SL) const {
      getImplicitParameterOffset(DAG.getMachineFunction(), FIRST_IMPLICIT);
  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);

SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
                                         const SDLoc &SL) const {
  Function &F = DAG.getMachineFunction().getFunction();
  std::optional<uint32_t> KnownSize =
      AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
  if (KnownSize.has_value())
    return DAG.getConstant(*KnownSize, SL, MVT::i32);
SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Val,
                                         const ISD::InputArg *Arg) const {
  // First, if it is a widened vector, narrow it.
  if (VT.isVector() &&
      VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
        EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
                         VT.getVectorNumElements());
    Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
                      DAG.getConstant(0, SL, MVT::i32));

  // Then convert the vector elements or scalar value.
  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
    unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
    Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));

  if (MemVT.isFloatingPoint())
    Val = getFPExtOrFPRound(DAG, Val, SL, VT);
    Val = DAG.getSExtOrTrunc(Val, SL, VT);
    Val = DAG.getZExtOrTrunc(Val, SL, VT);
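// Load a kernel argument from the kernarg segment at a constant offset. Small,
// under-aligned arguments are loaded as part of an aligned dword and the
// relevant bits are extracted, so the load can merge with its neighbours.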
SDValue SITargetLowering::lowerKernargMemParameter(
    SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
    uint64_t Offset, Align Alignment, bool Signed,
    const ISD::InputArg *Arg) const {
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);

  // Try to avoid using an extload by loading earlier than the argument address,
  // and extracting the relevant bits. The load should hopefully be merged with
  // the previous argument.
  if (MemVT.getStoreSize() < 4 && Alignment < 4) {
    // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
    int64_t AlignDownOffset = alignDown(Offset, 4);
    int64_t OffsetDiff = Offset - AlignDownOffset;

    EVT IntVT = MemVT.changeTypeToInteger();

    // TODO: If we passed in the base kernel offset we could have a better
    // alignment than 4, but we don't really need it.
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
                               MachineMemOperand::MODereferenceable |
                                   MachineMemOperand::MOInvariant);

    SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
    SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);

    SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

    return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);

  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
                             MachineMemOperand::MODereferenceable |
                                 MachineMemOperand::MOInvariant);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
  return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
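// Lower a formal argument that was assigned to a stack location: byval
// arguments become fixed frame indexes, everything else becomes a (possibly
// extending) load from a fixed stack object.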
SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
                                              CCValAssign &VA, const SDLoc &SL,
                                              const ISD::InputArg &Arg) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  if (Arg.Flags.isByVal()) {
    unsigned Size = Arg.Flags.getByValSize();
    int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
    return DAG.getFrameIndex(FrameIdx, MVT::i32);

  unsigned ArgOffset = VA.getLocMemOffset();
  unsigned ArgSize = VA.getValVT().getStoreSize();

  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);

  // Create load nodes to retrieve arguments from the stack.
  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);

  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
  MVT MemVT = VA.getValVT();

  switch (VA.getLocInfo()) {
  case CCValAssign::BCvt:
    MemVT = VA.getLocVT();
  case CCValAssign::SExt:
    ExtType = ISD::SEXTLOAD;
  case CCValAssign::ZExt:
    ExtType = ISD::ZEXTLOAD;
  case CCValAssign::AExt:
    ExtType = ISD::EXTLOAD;

  ArgValue = DAG.getExtLoad(
      ExtType, SL, VA.getLocVT(), Chain, FIN,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT);
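// Materialize a preloaded function input (e.g. a workgroup ID or the kernarg
// segment pointer) as an SDValue, preferring architected SGPRs such as
// TTMP7/TTMP9 when the subtarget provides them.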
SDValue SITargetLowering::getPreloadedValue(
    SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
    AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;

  CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
  const ArgDescriptor WorkGroupIDX =
      ArgDescriptor::createRegister(AMDGPU::TTMP9);
  // If GridZ is not programmed in an entry function then the hardware will set
  // it to all zeros, so there is no need to mask the GridY value in the low
  const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
      AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
  const ArgDescriptor WorkGroupIDZ =
      ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
  if (Subtarget->hasArchitectedSGPRs() &&
      (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
      Reg = &WorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(32);
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
      Reg = &WorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(32);
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
      Reg = &WorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(32);

  std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
    if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
      // It's possible for a kernarg intrinsic call to appear in a kernel with
      // no allocated segment, in which case we do not add the user sgpr
      // argument, so just return null.
      return DAG.getConstant(0, SDLoc(), VT);

    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    return DAG.getUNDEF(VT);

  return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
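// Walk the incoming pixel-shader arguments, record which PS input slots are
// allocated/enabled, and collect the arguments that still need registers into
// Splits (skipping unused, unallocated inputs).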
static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
                               CallingConv::ID CallConv,
                               ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
                               FunctionType *FType,
                               SIMachineFunctionInfo *Info) {
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
    const ISD::InputArg *Arg = &Ins[I];

    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
           "vector type argument should have been split");

    // First check if it's a PS input addr.
    if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

      // Inconveniently only the first part of the split is marked as isSplit,
      // so skip to the end. We only want to increment PSInputNum once for the
      // entire split argument.
      if (Arg->Flags.isSplit()) {
        while (!Arg->Flags.isSplitEnd()) {
          assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
                 "unexpected vector split in ps argument type");
            Splits.push_back(*Arg);

        // We can safely skip PS inputs.
        Skipped.set(Arg->getOrigArgIndex());

      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);

    Splits.push_back(*Arg);
// Allocate special inputs passed in VGPRs.
void SITargetLowering::allocateSpecialEntryInputVGPRs(
    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
    SIMachineFunctionInfo &Info) const {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (Info.hasWorkItemIDX()) {
    Register Reg = AMDGPU::VGPR0;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));

  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDY(
          ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
      unsigned Reg = AMDGPU::VGPR1;
      MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

      CCInfo.AllocateReg(Reg);
      Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));

  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDZ(
          ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
      unsigned Reg = AMDGPU::VGPR2;
      MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

      CCInfo.AllocateReg(Reg);
      Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
// Try to allocate a VGPR at the end of the argument list, or if no argument
// VGPRs are left allocating a stack slot.
// If \p Mask is given it indicates bitfield position in the register.
// If \p Arg is given use it with new \p Mask instead of allocating new.
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
                                         ArgDescriptor Arg = ArgDescriptor()) {
    return ArgDescriptor::createArg(Arg, Mask);

  ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
  if (RegIdx == ArgVGPRs.size()) {
    // Spill to stack required.
    int64_t Offset = CCInfo.AllocateStack(4, Align(4));

    return ArgDescriptor::createStack(Offset, Mask);

  unsigned Reg = ArgVGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  MachineFunction &MF = CCInfo.getMachineFunction();
  Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
  MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
  return ArgDescriptor::createRegister(Reg, Mask);
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
                                             const TargetRegisterClass *RC,
                                             unsigned NumArgRegs) {
  ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
  if (RegIdx == ArgSGPRs.size())
    report_fatal_error("ran out of SGPRs for arguments");

  unsigned Reg = ArgSGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  MachineFunction &MF = CCInfo.getMachineFunction();
  MF.addLiveIn(Reg, RC);
  return ArgDescriptor::createRegister(Reg);

// If this has a fixed position, we still should allocate the register in the
// CCInfo state. Technically we could get away with this for values passed
// outside of the normal argument range.
static void allocateFixedSGPRInputImpl(CCState &CCInfo,
                                       const TargetRegisterClass *RC,
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);
  MachineFunction &MF = CCInfo.getMachineFunction();
  MF.addLiveIn(Reg, RC);

static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
    allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
    Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);

static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
    allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
    Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
/// Allocate implicit function VGPR arguments at the end of allocated user
void SITargetLowering::allocateSpecialInputVGPRs(
    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
    SIMachineFunctionInfo &Info) const {
  const unsigned Mask = 0x3ff;

  if (Info.hasWorkItemIDX()) {
    Arg = allocateVGPR32Input(CCInfo, Mask);
    Info.setWorkItemIDX(Arg);

  if (Info.hasWorkItemIDY()) {
    Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
    Info.setWorkItemIDY(Arg);

  if (Info.hasWorkItemIDZ())
    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));

/// Allocate implicit function VGPR arguments in fixed registers.
void SITargetLowering::allocateSpecialInputVGPRsFixed(
    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
    SIMachineFunctionInfo &Info) const {
  Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
    report_fatal_error("failed to allocate VGPR for implicit arguments");

  const unsigned Mask = 0x3ff;
  Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
  Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
  Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
void SITargetLowering::allocateSpecialInputSGPRs(
    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
    SIMachineFunctionInfo &Info) const {
  auto &ArgInfo = Info.getArgInfo();
  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();

  // TODO: Unify handling with private memory pointers.
  if (UserSGPRInfo.hasDispatchPtr())
    allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);

  if (UserSGPRInfo.hasQueuePtr())
    allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);

  // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
  // constant offset from the kernarg segment.
  if (Info.hasImplicitArgPtr())
    allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);

  if (UserSGPRInfo.hasDispatchID())
    allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);

  // flat_scratch_init is not applicable for non-kernel functions.

  if (Info.hasWorkGroupIDX())
    allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);

  if (Info.hasWorkGroupIDY())
    allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);

  if (Info.hasWorkGroupIDZ())
    allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);

  if (Info.hasLDSKernelId())
    allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
// Allocate special inputs passed in user SGPRs.
void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
                                            MachineFunction &MF,
                                            const SIRegisterInfo &TRI,
                                            SIMachineFunctionInfo &Info) const {
  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
  if (UserSGPRInfo.hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);

  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);

  if (UserSGPRInfo.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);

  if (UserSGPRInfo.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);

  if (UserSGPRInfo.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    CCInfo.AllocateReg(InputPtrReg);

    Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
    MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (UserSGPRInfo.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);

  if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);

  if (UserSGPRInfo.hasPrivateSegmentSize()) {
    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentSizeReg);

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
// sequential starting from the first argument.
void SITargetLowering::allocatePreloadKernArgSGPRs(
    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
    const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
    const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
  Function &F = MF.getFunction();
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
  GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();
    // Don't preload non-original args or parts not in the current preload
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      const Align KernelArgBaseAlign = Align(16);
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      // Fix alignment for hidden arguments.
      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      // Arg is preloaded into the previous SGPR.
      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
      // Check for free user SGPRs for preloading.
      if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
          SGPRInfo.getNumFreeUserSGPRs()) {
        InPreloadSequence = false;

      // Preload this argument.
      const TargetRegisterClass *RC =
          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
      SmallVectorImpl<MCRegister> *PreloadRegs =
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {
        MF.addLiveIn(Reg, RC);
        CCInfo.AllocateReg(Reg);

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
                                           const SIRegisterInfo &TRI,
                                           SIMachineFunctionInfo &Info) const {
  // Always allocate this last since it is a synthetic preload.
  if (Info.hasLDSKernelId()) {
    Register Reg = Info.addLDSKernelId();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
// Allocate special input registers that are initialized per-wave.
void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
                                           SIMachineFunctionInfo &Info,
                                           CallingConv::ID CallConv,
                                           bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
    // Note: user SGPRs are handled by the front-end for graphics shaders
    // Pad up the used user SGPRs with dead inputs.

    // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
    // before enabling architected SGPRs for workgroup IDs.
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
    // rely on it to reach 16 since if we end up having no stack usage, it will
    // not really be added.
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);

    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);

    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(Reg);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      // This is true if the scratch wave byte offset doesn't have a fixed
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
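// Decide which physical SGPRs will hold the scratch resource descriptor, stack
// pointer, and frame pointer for an entry function, based on whether stack
// access is required and whether flat scratch is enabled.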
static void reservePrivateMemoryRegs(const TargetMachine &TM,
                                     MachineFunction &MF,
                                     const SIRegisterInfo &TRI,
                                     SIMachineFunctionInfo &Info) {
  // Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool HasStackObjects = MFI.hasStackObjects();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

  // Everything live out of a block is spilled with fast regalloc, so it's
  // almost certain that spilling will be required.
  if (TM.getOptLevel() == CodeGenOptLevel::None)
    HasStackObjects = true;

  // For now assume stack access is needed in any callee functions, so we need
  // the scratch registers to pass in.
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      // If we have stack objects, we unquestionably need the private buffer
      // resource. For the Code Object V2 ABI, this will be the first 4 user
      // SGPR inputs. We can reserve those and use them directly.

      Register PrivateSegmentBufferReg =
          Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      // We tentatively reserve the last registers (skipping the last registers
      // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
      // we'll replace these with the ones immediately after those which were
      // really allocated. In the prologue copies will be inserted from the
      // argument to these reserved registers.

      // Without HSA, relocations are used for the scratch pointer and the
      // buffer resource setup is always inserted in the prologue. Scratch wave
      // offset is still in an input SGPR.
      Info.setScratchRSrcReg(ReservedBufferReg);

  MachineRegisterInfo &MRI = MF.getRegInfo();

  // For entry functions we have to set up the stack pointer if we use it,
  // whereas non-entry functions get this "for free". This means there is no
  // intrinsic advantage to using S32 over S34 in cases where we do not have
  // calls but do need a frame pointer (i.e. if we are requested to have one
  // because frame pointer elimination is disabled). To keep things simple we
  // only ever use S32 as the call ABI stack pointer, and so using it does not
  // imply we need a separate frame pointer.
  //
  // Try to use s32 as the SP, but move it if it would interfere with input
  // arguments. This won't work with calls though.
  //
  // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
    assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
      report_fatal_error("call in graphics shader with too many input SGPRs");
    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
      if (!MRI.isLiveIn(Reg)) {
        Info.setStackPtrOffsetReg(Reg);

    if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
      report_fatal_error("failed to find register for SP");

  // hasFP should be accurate for entry functions even before the frame is
  // finalized, because it does not rely on the known stack size, only
  // properties like whether variable sized objects are present.
  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  return !Info->isEntryFunction();

void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {}

void SITargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
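// Lower the incoming arguments of a function: kernel arguments are loaded from
// the kernarg segment (or taken from preloaded SGPRs), shader inputs come in
// VGPRs/SGPRs per the PS input allocation, and ordinary call arguments follow
// the calling-convention register/stack assignments.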
SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &Fn = MF.getFunction();
  FunctionType *FType = MF.getFunction().getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
    DiagnosticInfoUnsupported NoGraphicsHSA(
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    DAG.getContext()->diagnose(NoGraphicsHSA);
    return DAG.getEntryNode();

  SmallVector<ISD::InputArg, 16> Splits;
  SmallVector<CCValAssign, 16> ArgLocs;
  BitVector Skipped(Ins.size());
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,

  bool IsGraphics = AMDGPU::isGraphics(CallConv);
  bool IsKernel = AMDGPU::isKernel(CallConv);
  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);

    const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
    assert(!UserSGPRInfo.hasDispatchPtr() &&
           !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

    if (!Subtarget->enableFlatScratch())
      assert(!UserSGPRInfo.hasFlatScratchInit());
    if ((CallConv != CallingConv::AMDGPU_CS &&
         CallConv != CallingConv::AMDGPU_Gfx) ||
        !Subtarget->hasArchitectedSGPRs())
      assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
             !Info->hasWorkGroupIDZ());
  if (CallConv == CallingConv::AMDGPU_PS) {
    processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);

    // At least one interpolation mode must be enabled or else the GPU will
    // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
    // set PSInputAddr, the user wants to enable some bits after the compilation
    // based on run-time states. Since we can't know what the final PSInputEna
    // will look like, we shouldn't do anything here and the user should take
    // responsibility for the correct programming.
    //
    // Otherwise, the following restrictions apply:
    // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
    // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    if (Subtarget->isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the
      // frontend set up an input arg for a particular interpolation mode, but
      // nothing uses that input arg. Really we should have an earlier pass
      // that removes such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
    Splits.append(Ins.begin(), Ins.end());
    analyzeFormalArgumentsCompute(CCInfo, Ins);

    allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
    allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
    if (IsKernel && Subtarget->hasKernargPreload())
      allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);

    allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
  } else if (!IsGraphics) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);

    // FIXME: Sink this into allocateSpecialInputSGPRs
    if (!Subtarget->enableFlatScratch())
      CCInfo.AllocateReg(Info->getScratchRSrcReg());

    allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);

    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
    CCInfo.AnalyzeFormalArguments(Splits, AssignFn);

  SmallVector<SDValue, 16> Chains;

  // FIXME: This is the minimum kernel argument alignment. We should improve
  // this to the maximum alignment of the arguments.
  //
  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
  const Align KernelArgBaseAlign = Align(16);
  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];
    if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));

    CCValAssign &VA = ArgLocs[ArgIdx++];
    MVT VT = VA.getLocVT();

    if (IsEntryFunc && VA.isMemLoc()) {
      EVT MemVT = VA.getLocVT();

      const uint64_t Offset = VA.getLocMemOffset();
      Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);

      if (Arg.Flags.isByRef()) {
        SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);

        const GCNTargetMachine &TM =
            static_cast<const GCNTargetMachine &>(getTargetMachine());
        if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
                                    Arg.Flags.getPointerAddrSpace())) {
          Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
                                     Arg.Flags.getPointerAddrSpace());

        InVals.push_back(Ptr);

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
        if (MemVT.getStoreSize() < 4 && Alignment < 4) {
          // In this case the argument is packed into the previous preload SGPR.
          int64_t AlignDownOffset = alignDown(Offset, 4);
          int64_t OffsetDiff = Offset - AlignDownOffset;
          EVT IntVT = MemVT.changeTypeToInteger();

          const SIMachineFunctionInfo *Info =
              MF.getInfo<SIMachineFunctionInfo>();
          MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          Register VReg = MRI.getLiveInVirtReg(Reg);
          SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);

          SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
          SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);

          SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
          ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
          NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
          const SIMachineFunctionInfo *Info =
              MF.getInfo<SIMachineFunctionInfo>();
          MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
          const SmallVectorImpl<MCRegister> &PreloadRegs =
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
            const TargetRegisterClass *RC = MRI.getRegClass(VReg);
            NewArg = DAG.getCopyFromReg(
                EVT::getIntegerVT(*DAG.getContext(),
                                  TRI->getRegSizeInBits(*RC)));
            // If the kernarg alignment does not match the alignment of the SGPR
            // tuple RC that can accommodate this argument, it will be built up
            // via copies from the individual SGPRs that the argument was
            SmallVector<SDValue, 4> Elts;
            for (auto Reg : PreloadRegs) {
              Register VReg = MRI.getLiveInVirtReg(Reg);
              Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
              Elts.push_back(Copy);
                DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                                    PreloadRegs.size()),

          // If the argument was preloaded to multiple consecutive 32-bit
          // registers because of misalignment between addressable SGPR tuples
          // and the argument size, we can still assume that because of kernarg
          // segment alignment restrictions that NewArg's size is the same as
          // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
          // truncate since we cannot preload to less than a single SGPR and the
          // MemVT may be smaller.
              EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
          if (MemVT.bitsLT(NewArg.getSimpleValueType()))
            NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
          NewArg = DAG.getBitcast(MemVT, NewArg);
          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
          NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
          lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                   Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
      Chains.push_back(NewArg.getValue(1));
          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
      if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
          (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
        // On SI local pointers are just offsets into LDS, so they are always
        // less than 16-bits. On CI and newer they could potentially be
        // real pointers, so we can't guarantee their size.
        NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
                             DAG.getValueType(MVT::i16));

      InVals.push_back(NewArg);
    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
      InVals.push_back(Val);
      if (!Arg.Flags.isByVal())
        Chains.push_back(Val.getValue(1));

    assert(VA.isRegLoc() && "Parameter must be in a register!");

    Register Reg = VA.getLocReg();
    const TargetRegisterClass *RC = nullptr;
    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;
      llvm_unreachable("Unexpected register class in LowerFormalArguments!");
    EVT ValVT = VA.getValVT();

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.Flags.isSRet()) {
      // The return object should be reasonably addressable.

      // FIXME: This helps when the return is a real sret. If it is an
      // automatically inserted sret (i.e. CanLowerReturn returns false), an
      // extra copy is inserted in SelectionDAGBuilder which obscures this.
          32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
          ISD::AssertZext, DL, VT, Val,
          DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));

    // If this is an 8 or 16-bit value, it is really passed promoted
    // to 32 bits. Insert an assert[sz]ext to capture this, then
    // truncate to the right size.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      llvm_unreachable("Unknown loc info!");

    InVals.push_back(Val);

  // Start adding system SGPRs.
    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);

  // DAG.getPass() returns nullptr when using new pass manager.
  // TODO: Use DAG.getMFAM() to access analysis result.
  if (DAG.getPass()) {
    auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
    ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());

  unsigned StackArgSize = CCInfo.getStackSize();
  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
                        : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
// TODO: If return values can't fit in registers, we should return as many as
// possible in registers before passing on stack.
bool SITargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  // Replacing returns with sret/stack usage doesn't make sense for shaders.
  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
  // for shaders. Vector types should be explicitly handled by CC.
  if (AMDGPU::isEntryFunctionCC(CallConv))
    return true;

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
  if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
    return false;

  // We must use the stack if return would require unavailable registers.
  unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
      return false;

  return true;
}
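
// Note: when CanLowerReturn() above returns false, SelectionDAGBuilder demotes
// the return value to an sret-style out-parameter, which is the "automatically
// inserted sret" case mentioned in the FIXME in LowerFormalArguments.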
3138 SITargetLowering::LowerReturn(SDValue Chain
, CallingConv::ID CallConv
,
3140 const SmallVectorImpl
<ISD::OutputArg
> &Outs
,
3141 const SmallVectorImpl
<SDValue
> &OutVals
,
3142 const SDLoc
&DL
, SelectionDAG
&DAG
) const {
3143 MachineFunction
&MF
= DAG
.getMachineFunction();
3144 SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
3146 if (AMDGPU::isKernel(CallConv
)) {
3147 return AMDGPUTargetLowering::LowerReturn(Chain
, CallConv
, isVarArg
, Outs
,
3151 bool IsShader
= AMDGPU::isShader(CallConv
);
3153 Info
->setIfReturnsVoid(Outs
.empty());
3154 bool IsWaveEnd
= Info
->returnsVoid() && IsShader
;
3156 // CCValAssign - represent the assignment of the return value to a location.
3157 SmallVector
<CCValAssign
, 48> RVLocs
;
3158 SmallVector
<ISD::OutputArg
, 48> Splits
;
3160 // CCState - Info about the registers and stack slots.
3161 CCState
CCInfo(CallConv
, isVarArg
, DAG
.getMachineFunction(), RVLocs
,
3164 // Analyze outgoing return values.
3165 CCInfo
.AnalyzeReturn(Outs
, CCAssignFnForReturn(CallConv
, isVarArg
));
3168 SmallVector
<SDValue
, 48> RetOps
;
3169 RetOps
.push_back(Chain
); // Operand #0 = Chain (updated below)
3171 // Copy the result values into the output registers.
3172 for (unsigned I
= 0, RealRVLocIdx
= 0, E
= RVLocs
.size(); I
!= E
;
3173 ++I
, ++RealRVLocIdx
) {
3174 CCValAssign
&VA
= RVLocs
[I
];
3175 assert(VA
.isRegLoc() && "Can only return in registers!");
3176 // TODO: Partially return in registers if return values don't fit.
3177 SDValue Arg
= OutVals
[RealRVLocIdx
];
3179 // Copied from other backends.
3180 switch (VA
.getLocInfo()) {
3181 case CCValAssign::Full
:
3183 case CCValAssign::BCvt
:
3184 Arg
= DAG
.getNode(ISD::BITCAST
, DL
, VA
.getLocVT(), Arg
);
3186 case CCValAssign::SExt
:
3187 Arg
= DAG
.getNode(ISD::SIGN_EXTEND
, DL
, VA
.getLocVT(), Arg
);
3189 case CCValAssign::ZExt
:
3190 Arg
= DAG
.getNode(ISD::ZERO_EXTEND
, DL
, VA
.getLocVT(), Arg
);
3192 case CCValAssign::AExt
:
3193 Arg
= DAG
.getNode(ISD::ANY_EXTEND
, DL
, VA
.getLocVT(), Arg
);
3196 llvm_unreachable("Unknown loc info!");
3199 Chain
= DAG
.getCopyToReg(Chain
, DL
, VA
.getLocReg(), Arg
, Glue
);
3200 Glue
= Chain
.getValue(1);
3201 RetOps
.push_back(DAG
.getRegister(VA
.getLocReg(), VA
.getLocVT()));
3204 // FIXME: Does sret work properly?
3205 if (!Info
->isEntryFunction()) {
3206 const SIRegisterInfo
*TRI
= Subtarget
->getRegisterInfo();
3207 const MCPhysReg
*I
=
3208 TRI
->getCalleeSavedRegsViaCopy(&DAG
.getMachineFunction());
3211 if (AMDGPU::SReg_64RegClass
.contains(*I
))
3212 RetOps
.push_back(DAG
.getRegister(*I
, MVT::i64
));
3213 else if (AMDGPU::SReg_32RegClass
.contains(*I
))
3214 RetOps
.push_back(DAG
.getRegister(*I
, MVT::i32
));
3216 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3221 // Update chain and glue.
3224 RetOps
.push_back(Glue
);
3226 unsigned Opc
= AMDGPUISD::ENDPGM
;
3228 Opc
= IsShader
? AMDGPUISD::RETURN_TO_EPILOG
: AMDGPUISD::RET_GLUE
;
3229 return DAG
.getNode(Opc
, DL
, MVT::Other
, RetOps
);
SDValue SITargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    SDValue ThisVal) const {
  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC);

  // Copy all of the result registers out of their specified physreg.
  for (CCValAssign VA : RVLocs) {
    SDValue Val;

    if (VA.isRegLoc()) {
      Val =
          DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
      Chain = Val.getValue(1);
      InGlue = Val.getValue(2);
    } else if (VA.isMemLoc()) {
      report_fatal_error("TODO: return values in memory");
    } else
      llvm_unreachable("unknown argument location type");

    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Val);
  }

  return Chain;
}
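
// Note: the AssertZext/AssertSext nodes built in LowerCallResult above generate
// no code; they only record the known extension of the physreg copy so that
// later DAG combines can fold the following TRUNCATE/extend pairs away.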
3288 // Add code to pass special inputs required depending on used features separate
3289 // from the explicit user arguments present in the IR.
3290 void SITargetLowering::passSpecialInputs(
3291 CallLoweringInfo
&CLI
, CCState
&CCInfo
, const SIMachineFunctionInfo
&Info
,
3292 SmallVectorImpl
<std::pair
<unsigned, SDValue
>> &RegsToPass
,
3293 SmallVectorImpl
<SDValue
> &MemOpChains
, SDValue Chain
) const {
3294 // If we don't have a call site, this was a call inserted by
3295 // legalization. These can never use special inputs.
3299 SelectionDAG
&DAG
= CLI
.DAG
;
3300 const SDLoc
&DL
= CLI
.DL
;
3301 const Function
&F
= DAG
.getMachineFunction().getFunction();
3303 const SIRegisterInfo
*TRI
= Subtarget
->getRegisterInfo();
3304 const AMDGPUFunctionArgInfo
&CallerArgInfo
= Info
.getArgInfo();
3306 const AMDGPUFunctionArgInfo
*CalleeArgInfo
=
3307 &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo
;
3308 if (const Function
*CalleeFunc
= CLI
.CB
->getCalledFunction()) {
3309 // DAG.getPass() returns nullptr when using new pass manager.
3310 // TODO: Use DAG.getMFAM() to access analysis result.
3311 if (DAG
.getPass()) {
3312 auto &ArgUsageInfo
=
3313 DAG
.getPass()->getAnalysis
<AMDGPUArgumentUsageInfo
>();
3314 CalleeArgInfo
= &ArgUsageInfo
.lookupFuncArgInfo(*CalleeFunc
);
3318 // TODO: Unify with private memory register handling. This is complicated by
3319 // the fact that at least in kernels, the input argument is not necessarily
3320 // in the same location as the input.
3322 static constexpr std::pair
<AMDGPUFunctionArgInfo::PreloadedValue
,
3323 StringLiteral
> ImplicitAttrs
[] = {
3324 {AMDGPUFunctionArgInfo::DISPATCH_PTR
, "amdgpu-no-dispatch-ptr"},
3325 {AMDGPUFunctionArgInfo::QUEUE_PTR
, "amdgpu-no-queue-ptr" },
3326 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
, "amdgpu-no-implicitarg-ptr"},
3327 {AMDGPUFunctionArgInfo::DISPATCH_ID
, "amdgpu-no-dispatch-id"},
3328 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X
, "amdgpu-no-workgroup-id-x"},
3329 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y
,"amdgpu-no-workgroup-id-y"},
3330 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
,"amdgpu-no-workgroup-id-z"},
3331 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID
,"amdgpu-no-lds-kernel-id"},
3335 for (auto [InputID
, Attr
] : ImplicitAttrs
) {
3336 // If the callee does not use the attribute value, skip copying the value.
3337 if (CLI
.CB
->hasFnAttr(Attr
))
3340 const auto [OutgoingArg
, ArgRC
, ArgTy
] =
3341 CalleeArgInfo
->getPreloadedValue(InputID
);
3345 const auto [IncomingArg
, IncomingArgRC
, Ty
] =
3346 CallerArgInfo
.getPreloadedValue(InputID
);
3347 assert(IncomingArgRC
== ArgRC
);
3349 // All special arguments are ints for now.
3350 EVT ArgVT
= TRI
->getSpillSize(*ArgRC
) == 8 ? MVT::i64
: MVT::i32
;
3354 InputReg
= loadInputValue(DAG
, ArgRC
, ArgVT
, DL
, *IncomingArg
);
3355 } else if (InputID
== AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
) {
3356 // The implicit arg ptr is special because it doesn't have a corresponding
3357 // input for kernels, and is computed from the kernarg segment pointer.
3358 InputReg
= getImplicitArgPtr(DAG
, DL
);
3359 } else if (InputID
== AMDGPUFunctionArgInfo::LDS_KERNEL_ID
) {
3360 std::optional
<uint32_t> Id
=
3361 AMDGPUMachineFunction::getLDSKernelIdMetadata(F
);
3362 if (Id
.has_value()) {
3363 InputReg
= DAG
.getConstant(*Id
, DL
, ArgVT
);
3365 InputReg
= DAG
.getUNDEF(ArgVT
);
3368 // We may have proven the input wasn't needed, although the ABI is
3369 // requiring it. We just need to allocate the register appropriately.
3370 InputReg
= DAG
.getUNDEF(ArgVT
);
3373 if (OutgoingArg
->isRegister()) {
3374 RegsToPass
.emplace_back(OutgoingArg
->getRegister(), InputReg
);
3375 if (!CCInfo
.AllocateReg(OutgoingArg
->getRegister()))
3376 report_fatal_error("failed to allocate implicit input argument");
3378 unsigned SpecialArgOffset
=
3379 CCInfo
.AllocateStack(ArgVT
.getStoreSize(), Align(4));
3381 storeStackInputValue(DAG
, DL
, Chain
, InputReg
, SpecialArgOffset
);
3382 MemOpChains
.push_back(ArgStore
);
3386 // Pack workitem IDs into a single register or pass it as is if already
3389 auto [OutgoingArg
, ArgRC
, Ty
] =
3390 CalleeArgInfo
->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X
);
3392 std::tie(OutgoingArg
, ArgRC
, Ty
) =
3393 CalleeArgInfo
->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y
);
3395 std::tie(OutgoingArg
, ArgRC
, Ty
) =
3396 CalleeArgInfo
->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z
);
3400 const ArgDescriptor
*IncomingArgX
= std::get
<0>(
3401 CallerArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X
));
3402 const ArgDescriptor
*IncomingArgY
= std::get
<0>(
3403 CallerArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y
));
3404 const ArgDescriptor
*IncomingArgZ
= std::get
<0>(
3405 CallerArgInfo
.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z
));
3410 const bool NeedWorkItemIDX
= !CLI
.CB
->hasFnAttr("amdgpu-no-workitem-id-x");
3411 const bool NeedWorkItemIDY
= !CLI
.CB
->hasFnAttr("amdgpu-no-workitem-id-y");
3412 const bool NeedWorkItemIDZ
= !CLI
.CB
->hasFnAttr("amdgpu-no-workitem-id-z");
3414 // If incoming ids are not packed we need to pack them.
3415 if (IncomingArgX
&& !IncomingArgX
->isMasked() && CalleeArgInfo
->WorkItemIDX
&&
3417 if (Subtarget
->getMaxWorkitemID(F
, 0) != 0) {
3418 InputReg
= loadInputValue(DAG
, ArgRC
, MVT::i32
, DL
, *IncomingArgX
);
3420 InputReg
= DAG
.getConstant(0, DL
, MVT::i32
);
3424 if (IncomingArgY
&& !IncomingArgY
->isMasked() && CalleeArgInfo
->WorkItemIDY
&&
3425 NeedWorkItemIDY
&& Subtarget
->getMaxWorkitemID(F
, 1) != 0) {
3426 SDValue Y
= loadInputValue(DAG
, ArgRC
, MVT::i32
, DL
, *IncomingArgY
);
3427 Y
= DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, Y
,
3428 DAG
.getShiftAmountConstant(10, MVT::i32
, SL
));
3429 InputReg
= InputReg
.getNode()
3430 ? DAG
.getNode(ISD::OR
, SL
, MVT::i32
, InputReg
, Y
)
3434 if (IncomingArgZ
&& !IncomingArgZ
->isMasked() && CalleeArgInfo
->WorkItemIDZ
&&
3435 NeedWorkItemIDZ
&& Subtarget
->getMaxWorkitemID(F
, 2) != 0) {
3436 SDValue Z
= loadInputValue(DAG
, ArgRC
, MVT::i32
, DL
, *IncomingArgZ
);
3437 Z
= DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, Z
,
3438 DAG
.getShiftAmountConstant(20, MVT::i32
, SL
));
3439 InputReg
= InputReg
.getNode()
3440 ? DAG
.getNode(ISD::OR
, SL
, MVT::i32
, InputReg
, Z
)
3444 if (!InputReg
&& (NeedWorkItemIDX
|| NeedWorkItemIDY
|| NeedWorkItemIDZ
)) {
3445 if (!IncomingArgX
&& !IncomingArgY
&& !IncomingArgZ
) {
3446 // We're in a situation where the outgoing function requires the workitem
3447 // ID, but the calling function does not have it (e.g a graphics function
3448 // calling a C calling convention function). This is illegal, but we need
3449 // to produce something.
3450 InputReg
= DAG
.getUNDEF(MVT::i32
);
3452 // Workitem ids are already packed, any of present incoming arguments
3453 // will carry all required fields.
3454 ArgDescriptor IncomingArg
=
3455 ArgDescriptor::createArg(IncomingArgX
? *IncomingArgX
3456 : IncomingArgY
? *IncomingArgY
3459 InputReg
= loadInputValue(DAG
, ArgRC
, MVT::i32
, DL
, IncomingArg
);
3463 if (OutgoingArg
->isRegister()) {
3465 RegsToPass
.emplace_back(OutgoingArg
->getRegister(), InputReg
);
3467 CCInfo
.AllocateReg(OutgoingArg
->getRegister());
3469 unsigned SpecialArgOffset
= CCInfo
.AllocateStack(4, Align(4));
3472 storeStackInputValue(DAG
, DL
, Chain
, InputReg
, SpecialArgOffset
);
3473 MemOpChains
.push_back(ArgStore
);
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return CC == CallingConv::Fast;
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::AMDGPU_Gfx:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}
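
// Note: per the helpers above, C and amdgpu_gfx callees may be tail-called
// opportunistically (as sibling calls), while CallingConv::Fast additionally
// supports guaranteed TCO when GuaranteedTailCallOpt is enabled.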
3493 bool SITargetLowering::isEligibleForTailCallOptimization(
3494 SDValue Callee
, CallingConv::ID CalleeCC
, bool IsVarArg
,
3495 const SmallVectorImpl
<ISD::OutputArg
> &Outs
,
3496 const SmallVectorImpl
<SDValue
> &OutVals
,
3497 const SmallVectorImpl
<ISD::InputArg
> &Ins
, SelectionDAG
&DAG
) const {
3498 if (AMDGPU::isChainCC(CalleeCC
))
3501 if (!mayTailCallThisCC(CalleeCC
))
3504 // For a divergent call target, we need to do a waterfall loop over the
3505 // possible callees which precludes us from using a simple jump.
3506 if (Callee
->isDivergent())
3509 MachineFunction
&MF
= DAG
.getMachineFunction();
3510 const Function
&CallerF
= MF
.getFunction();
3511 CallingConv::ID CallerCC
= CallerF
.getCallingConv();
3512 const SIRegisterInfo
*TRI
= getSubtarget()->getRegisterInfo();
3513 const uint32_t *CallerPreserved
= TRI
->getCallPreservedMask(MF
, CallerCC
);
3515 // Kernels aren't callable, and don't have a live in return address so it
3516 // doesn't make sense to do a tail call with entry functions.
3517 if (!CallerPreserved
)
3520 bool CCMatch
= CallerCC
== CalleeCC
;
3522 if (DAG
.getTarget().Options
.GuaranteedTailCallOpt
) {
3523 if (canGuaranteeTCO(CalleeCC
) && CCMatch
)
3528 // TODO: Can we handle var args?
3532 for (const Argument
&Arg
: CallerF
.args()) {
3533 if (Arg
.hasByValAttr())
3537 LLVMContext
&Ctx
= *DAG
.getContext();
3539 // Check that the call results are passed in the same way.
3540 if (!CCState::resultsCompatible(CalleeCC
, CallerCC
, MF
, Ctx
, Ins
,
3541 CCAssignFnForCall(CalleeCC
, IsVarArg
),
3542 CCAssignFnForCall(CallerCC
, IsVarArg
)))
3545 // The callee has to preserve all registers the caller needs to preserve.
3547 const uint32_t *CalleePreserved
= TRI
->getCallPreservedMask(MF
, CalleeCC
);
3548 if (!TRI
->regmaskSubsetEqual(CallerPreserved
, CalleePreserved
))
3552 // Nothing more to check if the callee is taking no arguments.
3556 SmallVector
<CCValAssign
, 16> ArgLocs
;
3557 CCState
CCInfo(CalleeCC
, IsVarArg
, MF
, ArgLocs
, Ctx
);
3559 // FIXME: We are not allocating special input registers, so we will be
3560 // deciding based on incorrect register assignments.
3561 CCInfo
.AnalyzeCallOperands(Outs
, CCAssignFnForCall(CalleeCC
, IsVarArg
));
3563 const SIMachineFunctionInfo
*FuncInfo
= MF
.getInfo
<SIMachineFunctionInfo
>();
3564 // If the stack arguments for this call do not fit into our own save area then
3565 // the call cannot be made tail.
3566 // TODO: Is this really necessary?
3567 if (CCInfo
.getStackSize() > FuncInfo
->getBytesInStackArgArea())
3570 for (const auto &[CCVA
, ArgVal
] : zip_equal(ArgLocs
, OutVals
)) {
3571 // FIXME: What about inreg arguments that end up passed in memory?
3572 if (!CCVA
.isRegLoc())
3575 // If we are passing an argument in an SGPR, and the value is divergent,
3576 // this call requires a waterfall loop.
3577 if (ArgVal
->isDivergent() && TRI
->isSGPRPhysReg(CCVA
.getLocReg())) {
3579 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3580 << printReg(CCVA
.getLocReg(), TRI
) << '\n');
3585 const MachineRegisterInfo
&MRI
= MF
.getRegInfo();
3586 return parametersInCSRMatch(MRI
, CallerPreserved
, ArgLocs
, OutVals
);
bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!CI->isTailCall())
    return false;

  const Function *ParentFn = CI->getParent()->getParent();
  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
    return false;
  return true;
}
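
// Note: kernels (entry functions) have no live-in return address, so a call
// inside one is never emitted as a tail call; this mirrors the CallerPreserved
// check in isEligibleForTailCallOptimization above.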
3599 // The wave scratch offset register is used as the global base pointer.
3600 SDValue
SITargetLowering::LowerCall(CallLoweringInfo
&CLI
,
3601 SmallVectorImpl
<SDValue
> &InVals
) const {
3602 CallingConv::ID CallConv
= CLI
.CallConv
;
3603 bool IsChainCallConv
= AMDGPU::isChainCC(CallConv
);
3605 SelectionDAG
&DAG
= CLI
.DAG
;
3607 TargetLowering::ArgListEntry RequestedExec
;
3608 if (IsChainCallConv
) {
3609 // The last argument should be the value that we need to put in EXEC.
3610 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3611 // don't treat it like the rest of the arguments.
3612 RequestedExec
= CLI
.Args
.back();
3613 assert(RequestedExec
.Node
&& "No node for EXEC");
3615 if (!RequestedExec
.Ty
->isIntegerTy(Subtarget
->getWavefrontSize()))
3616 return lowerUnhandledCall(CLI
, InVals
, "Invalid value for EXEC");
3618 assert(CLI
.Outs
.back().OrigArgIndex
== 2 && "Unexpected last arg");
3619 CLI
.Outs
.pop_back();
3620 CLI
.OutVals
.pop_back();
3622 if (RequestedExec
.Ty
->isIntegerTy(64)) {
3623 assert(CLI
.Outs
.back().OrigArgIndex
== 2 && "Exec wasn't split up");
3624 CLI
.Outs
.pop_back();
3625 CLI
.OutVals
.pop_back();
3628 assert(CLI
.Outs
.back().OrigArgIndex
!= 2 &&
3629 "Haven't popped all the pieces of the EXEC mask");
3632 const SDLoc
&DL
= CLI
.DL
;
3633 SmallVector
<ISD::OutputArg
, 32> &Outs
= CLI
.Outs
;
3634 SmallVector
<SDValue
, 32> &OutVals
= CLI
.OutVals
;
3635 SmallVector
<ISD::InputArg
, 32> &Ins
= CLI
.Ins
;
3636 SDValue Chain
= CLI
.Chain
;
3637 SDValue Callee
= CLI
.Callee
;
3638 bool &IsTailCall
= CLI
.IsTailCall
;
3639 bool IsVarArg
= CLI
.IsVarArg
;
3640 bool IsSibCall
= false;
3641 MachineFunction
&MF
= DAG
.getMachineFunction();
3643 if (Callee
.isUndef() || isNullConstant(Callee
)) {
3644 if (!CLI
.IsTailCall
) {
3645 for (ISD::InputArg
&Arg
: CLI
.Ins
)
3646 InVals
.push_back(DAG
.getUNDEF(Arg
.VT
));
3653 return lowerUnhandledCall(CLI
, InVals
,
3654 "unsupported call to variadic function ");
3658 report_fatal_error("unsupported libcall legalization");
3660 if (IsTailCall
&& MF
.getTarget().Options
.GuaranteedTailCallOpt
) {
3661 return lowerUnhandledCall(CLI
, InVals
,
3662 "unsupported required tail call to function ");
3666 IsTailCall
= isEligibleForTailCallOptimization(Callee
, CallConv
, IsVarArg
,
3667 Outs
, OutVals
, Ins
, DAG
);
3669 ((CLI
.CB
&& CLI
.CB
->isMustTailCall()) || IsChainCallConv
)) {
3670 report_fatal_error("failed to perform tail call elimination on a call "
3671 "site marked musttail or on llvm.amdgcn.cs.chain");
3674 bool TailCallOpt
= MF
.getTarget().Options
.GuaranteedTailCallOpt
;
3676 // A sibling call is one where we're under the usual C ABI and not planning
3677 // to change that but can still do a tail call:
3678 if (!TailCallOpt
&& IsTailCall
)
3685 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
3686 SmallVector
<std::pair
<unsigned, SDValue
>, 8> RegsToPass
;
3687 SmallVector
<SDValue
, 8> MemOpChains
;
3689 // Analyze operands of the call, assigning locations to each operand.
3690 SmallVector
<CCValAssign
, 16> ArgLocs
;
3691 CCState
CCInfo(CallConv
, IsVarArg
, MF
, ArgLocs
, *DAG
.getContext());
3692 CCAssignFn
*AssignFn
= CCAssignFnForCall(CallConv
, IsVarArg
);
3694 if (CallConv
!= CallingConv::AMDGPU_Gfx
&& !AMDGPU::isChainCC(CallConv
)) {
3695 // With a fixed ABI, allocate fixed registers before user arguments.
3696 passSpecialInputs(CLI
, CCInfo
, *Info
, RegsToPass
, MemOpChains
, Chain
);
3699 CCInfo
.AnalyzeCallOperands(Outs
, AssignFn
);
3701 // Get a count of how many bytes are to be pushed on the stack.
3702 unsigned NumBytes
= CCInfo
.getStackSize();
3705 // Since we're not changing the ABI to make this a tail call, the memory
3706 // operands are already available in the caller's incoming argument space.
3710 // FPDiff is the byte offset of the call's argument area from the callee's.
3711 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3712 // by this amount for a tail call. In a sibling call it must be 0 because the
3713 // caller will deallocate the entire stack and the callee still expects its
3714 // arguments to begin at SP+0. Completely unused for non-tail calls.
3716 MachineFrameInfo
&MFI
= MF
.getFrameInfo();
3717 auto *TRI
= static_cast<const SIRegisterInfo
*>(Subtarget
->getRegisterInfo());
3719 // Adjust the stack pointer for the new arguments...
3720 // These operations are automatically eliminated by the prolog/epilog pass
3722 Chain
= DAG
.getCALLSEQ_START(Chain
, 0, 0, DL
);
3724 if (!IsSibCall
|| IsChainCallConv
) {
3725 if (!Subtarget
->enableFlatScratch()) {
3726 SmallVector
<SDValue
, 4> CopyFromChains
;
3728 // In the HSA case, this should be an identity copy.
3729 SDValue ScratchRSrcReg
=
3730 DAG
.getCopyFromReg(Chain
, DL
, Info
->getScratchRSrcReg(), MVT::v4i32
);
3731 RegsToPass
.emplace_back(IsChainCallConv
3732 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3733 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3
,
3735 CopyFromChains
.push_back(ScratchRSrcReg
.getValue(1));
3736 Chain
= DAG
.getTokenFactor(DL
, CopyFromChains
);
3740 const unsigned NumSpecialInputs
= RegsToPass
.size();
3742 MVT PtrVT
= MVT::i32
;
3744 // Walk the register/memloc assignments, inserting copies/loads.
3745 for (unsigned i
= 0, e
= ArgLocs
.size(); i
!= e
; ++i
) {
3746 CCValAssign
&VA
= ArgLocs
[i
];
3747 SDValue Arg
= OutVals
[i
];
3749 // Promote the value if needed.
3750 switch (VA
.getLocInfo()) {
3751 case CCValAssign::Full
:
3753 case CCValAssign::BCvt
:
3754 Arg
= DAG
.getNode(ISD::BITCAST
, DL
, VA
.getLocVT(), Arg
);
3756 case CCValAssign::ZExt
:
3757 Arg
= DAG
.getNode(ISD::ZERO_EXTEND
, DL
, VA
.getLocVT(), Arg
);
3759 case CCValAssign::SExt
:
3760 Arg
= DAG
.getNode(ISD::SIGN_EXTEND
, DL
, VA
.getLocVT(), Arg
);
3762 case CCValAssign::AExt
:
3763 Arg
= DAG
.getNode(ISD::ANY_EXTEND
, DL
, VA
.getLocVT(), Arg
);
3765 case CCValAssign::FPExt
:
3766 Arg
= DAG
.getNode(ISD::FP_EXTEND
, DL
, VA
.getLocVT(), Arg
);
3769 llvm_unreachable("Unknown loc info!");
3772 if (VA
.isRegLoc()) {
3773 RegsToPass
.push_back(std::pair(VA
.getLocReg(), Arg
));
3775 assert(VA
.isMemLoc());
3778 MachinePointerInfo DstInfo
;
3780 unsigned LocMemOffset
= VA
.getLocMemOffset();
3781 int32_t Offset
= LocMemOffset
;
3783 SDValue PtrOff
= DAG
.getConstant(Offset
, DL
, PtrVT
);
3784 MaybeAlign Alignment
;
3787 ISD::ArgFlagsTy Flags
= Outs
[i
].Flags
;
3788 unsigned OpSize
= Flags
.isByVal() ? Flags
.getByValSize()
3789 : VA
.getValVT().getStoreSize();
3791 // FIXME: We can have better than the minimum byval required alignment.
3794 ? Flags
.getNonZeroByValAlign()
3795 : commonAlignment(Subtarget
->getStackAlignment(), Offset
);
3797 Offset
= Offset
+ FPDiff
;
3798 int FI
= MFI
.CreateFixedObject(OpSize
, Offset
, true);
3800 DstAddr
= DAG
.getFrameIndex(FI
, PtrVT
);
3801 DstInfo
= MachinePointerInfo::getFixedStack(MF
, FI
);
3803 // Make sure any stack arguments overlapping with where we're storing
3804 // are loaded before this eventual operation. Otherwise they'll be
3807 // FIXME: Why is this really necessary? This seems to just result in a
3808 // lot of code to copy the stack and write them back to the same
3809 // locations, which are supposed to be immutable?
3810 Chain
= addTokenForArgument(Chain
, DAG
, MFI
, FI
);
3812 // Stores to the argument stack area are relative to the stack pointer.
3813 SDValue SP
= DAG
.getCopyFromReg(Chain
, DL
, Info
->getStackPtrOffsetReg(),
3815 DstAddr
= DAG
.getNode(ISD::ADD
, DL
, MVT::i32
, SP
, PtrOff
);
3816 DstInfo
= MachinePointerInfo::getStack(MF
, LocMemOffset
);
3818 commonAlignment(Subtarget
->getStackAlignment(), LocMemOffset
);
3821 if (Outs
[i
].Flags
.isByVal()) {
3823 DAG
.getConstant(Outs
[i
].Flags
.getByValSize(), DL
, MVT::i32
);
3825 DAG
.getMemcpy(Chain
, DL
, DstAddr
, Arg
, SizeNode
,
3826 Outs
[i
].Flags
.getNonZeroByValAlign(),
3827 /*isVol = */ false, /*AlwaysInline = */ true,
3828 /*CI=*/nullptr, std::nullopt
, DstInfo
,
3829 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS
));
3831 MemOpChains
.push_back(Cpy
);
3834 DAG
.getStore(Chain
, DL
, Arg
, DstAddr
, DstInfo
, Alignment
);
3835 MemOpChains
.push_back(Store
);
3840 if (!MemOpChains
.empty())
3841 Chain
= DAG
.getNode(ISD::TokenFactor
, DL
, MVT::Other
, MemOpChains
);
3843 SDValue ReadFirstLaneID
=
3844 DAG
.getTargetConstant(Intrinsic::amdgcn_readfirstlane
, DL
, MVT::i32
);
3847 if (CLI
.ConvergenceControlToken
) {
3848 TokenGlue
= DAG
.getNode(ISD::CONVERGENCECTRL_GLUE
, DL
, MVT::Glue
,
3849 CLI
.ConvergenceControlToken
);
3852 // Build a sequence of copy-to-reg nodes chained together with token chain
3853 // and flag operands which copy the outgoing args into the appropriate regs.
3856 unsigned ArgIdx
= 0;
3857 for (auto [Reg
, Val
] : RegsToPass
) {
3858 if (ArgIdx
++ >= NumSpecialInputs
&&
3859 (IsChainCallConv
|| !Val
->isDivergent()) && TRI
->isSGPRPhysReg(Reg
)) {
3860 // For chain calls, the inreg arguments are required to be
3861 // uniform. Speculatively Insert a readfirstlane in case we cannot prove
3862 // they are uniform.
3864 // For other calls, if an inreg arguments is known to be uniform,
3865 // speculatively insert a readfirstlane in case it is in a VGPR.
3867 // FIXME: We need to execute this in a waterfall loop if it is a divergent
3868 // value, so let that continue to produce invalid code.
3870 SmallVector
<SDValue
, 3> ReadfirstlaneArgs({ReadFirstLaneID
, Val
});
3872 ReadfirstlaneArgs
.push_back(TokenGlue
);
3873 Val
= DAG
.getNode(ISD::INTRINSIC_WO_CHAIN
, DL
, Val
.getValueType(),
3877 Chain
= DAG
.getCopyToReg(Chain
, DL
, Reg
, Val
, InGlue
);
3878 InGlue
= Chain
.getValue(1);
3881 // We don't usually want to end the call-sequence here because we would tidy
3882 // the frame up *after* the call, however in the ABI-changing tail-call case
3883 // we've carefully laid out the parameters so that when sp is reset they'll be
3884 // in the correct location.
3885 if (IsTailCall
&& !IsSibCall
) {
3886 Chain
= DAG
.getCALLSEQ_END(Chain
, NumBytes
, 0, InGlue
, DL
);
3887 InGlue
= Chain
.getValue(1);
3890 std::vector
<SDValue
> Ops({Chain
});
3892 // Add a redundant copy of the callee global which will not be legalized, as
3893 // we need direct access to the callee later.
3894 if (GlobalAddressSDNode
*GSD
= dyn_cast
<GlobalAddressSDNode
>(Callee
)) {
3895 const GlobalValue
*GV
= GSD
->getGlobal();
3896 Ops
.push_back(Callee
);
3897 Ops
.push_back(DAG
.getTargetGlobalAddress(GV
, DL
, MVT::i64
));
3900 // isEligibleForTailCallOptimization considered whether the call target is
3901 // divergent, but we may still end up with a uniform value in a VGPR.
3902 // Insert a readfirstlane just in case.
3903 SDValue ReadFirstLaneID
=
3904 DAG
.getTargetConstant(Intrinsic::amdgcn_readfirstlane
, DL
, MVT::i32
);
3906 SmallVector
<SDValue
, 3> ReadfirstlaneArgs({ReadFirstLaneID
, Callee
});
3908 ReadfirstlaneArgs
.push_back(TokenGlue
); // Wire up convergence token.
3909 Callee
= DAG
.getNode(ISD::INTRINSIC_WO_CHAIN
, DL
, Callee
.getValueType(),
3913 Ops
.push_back(Callee
);
3914 Ops
.push_back(DAG
.getTargetConstant(0, DL
, MVT::i64
));
3918 // Each tail call may have to adjust the stack by a different amount, so
3919 // this information must travel along with the operation for eventual
3920 // consumption by emitEpilogue.
3921 Ops
.push_back(DAG
.getTargetConstant(FPDiff
, DL
, MVT::i32
));
3924 if (IsChainCallConv
)
3925 Ops
.push_back(RequestedExec
.Node
);
3927 // Add argument registers to the end of the list so that they are known live
3929 for (auto &[Reg
, Val
] : RegsToPass
)
3930 Ops
.push_back(DAG
.getRegister(Reg
, Val
.getValueType()));
3932 // Add a register mask operand representing the call-preserved registers.
3933 const uint32_t *Mask
= TRI
->getCallPreservedMask(MF
, CallConv
);
3934 assert(Mask
&& "Missing call preserved mask for calling convention");
3935 Ops
.push_back(DAG
.getRegisterMask(Mask
));
3937 if (SDValue Token
= CLI
.ConvergenceControlToken
) {
3938 SmallVector
<SDValue
, 2> GlueOps
;
3939 GlueOps
.push_back(Token
);
3941 GlueOps
.push_back(InGlue
);
3943 InGlue
= SDValue(DAG
.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE
, DL
,
3944 MVT::Glue
, GlueOps
),
3949 Ops
.push_back(InGlue
);
3951 // If we're doing a tall call, use a TC_RETURN here rather than an
3952 // actual call instruction.
3954 MFI
.setHasTailCall();
3955 unsigned OPC
= AMDGPUISD::TC_RETURN
;
3957 case CallingConv::AMDGPU_Gfx
:
3958 OPC
= AMDGPUISD::TC_RETURN_GFX
;
3960 case CallingConv::AMDGPU_CS_Chain
:
3961 case CallingConv::AMDGPU_CS_ChainPreserve
:
3962 OPC
= AMDGPUISD::TC_RETURN_CHAIN
;
3966 return DAG
.getNode(OPC
, DL
, MVT::Other
, Ops
);
3969 // Returns a chain and a flag for retval copy to use.
3970 SDValue Call
= DAG
.getNode(AMDGPUISD::CALL
, DL
, {MVT::Other
, MVT::Glue
}, Ops
);
3971 Chain
= Call
.getValue(0);
3972 InGlue
= Call
.getValue(1);
3974 uint64_t CalleePopBytes
= NumBytes
;
3975 Chain
= DAG
.getCALLSEQ_END(Chain
, 0, CalleePopBytes
, InGlue
, DL
);
3977 InGlue
= Chain
.getValue(1);
3979 // Handle result values, copying them out of physregs into vregs that we
3981 return LowerCallResult(Chain
, InGlue
, CallConv
, IsVarArg
, Ins
, DL
, DAG
,
3982 InVals
, /*IsThisReturn=*/false, SDValue());
// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
// except for applying the wave size scale to the increment amount.
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  SDValue Tmp1 = Op;
  SDValue Tmp2 = Op.getValue(1);
  SDValue Tmp3 = Op.getOperand(2);
  SDValue Chain = Tmp1.getOperand(0);

  Register SPReg = Info->getStackPtrOffsetReg();

  // Chain the dynamic stack allocation so that it doesn't modify the stack
  // pointer when other instructions are using the stack.
  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

  SDValue Size = Tmp2.getOperand(1);
  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
  Chain = SP.getValue(1);
  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
  const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
  unsigned Opc =
      TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp
          ? ISD::ADD
          : ISD::SUB;

  SDValue ScaledSize = DAG.getNode(
      ISD::SHL, dl, VT, Size,
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

  Align StackAlign = TFL->getStackAlign();
  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
  if (Alignment && *Alignment > StackAlign) {
    Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
                       DAG.getConstant(-(uint64_t)Alignment->value()
                                           << Subtarget->getWavefrontSizeLog2(),
                                       dl, VT));
  }

  Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
  Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);

  return DAG.getMergeValues({Tmp1, Tmp2}, dl);
}

SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // We only handle constant sizes here to allow non-entry block, static sized
  // allocas. A truly dynamic value is more difficult to support because we
  // don't know if the size value is uniform or not. If the size isn't uniform,
  // we would need to do a wave reduction to get the maximum size to know how
  // much to increment the uniform stack pointer.
  SDValue Size = Op.getOperand(1);
  if (isa<ConstantSDNode>(Size))
    return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.

  return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
}
SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() != MVT::i32)
    return Op; // Defer to cannot select error.

  Register SP = getStackPointerRegisterToSaveRestore();
  SDLoc SL(Op);
  SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);

  // Convert from wave uniform to swizzled vector address. This should protect
  // from any edge cases where the stacksave result isn't directly used with
  // stackrestore.
  SDValue VectorAddress =
      DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
  return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
}
SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  assert(Op.getValueType() == MVT::i32);

  uint32_t BothRoundHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
  SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);

  SDValue IntrinID =
      DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
  SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
                               Op.getOperand(0), IntrinID, GetRoundBothImm);

  // There are two rounding modes, one for f32 and one for f64/f16. We only
  // report in the standard value range if both are the same.
  //
  // The raw values also differ from the expected FLT_ROUNDS values. Nearest
  // ties away from zero is not supported, and the other values are rotated by
  // 1.
  //
  // If the two rounding modes are not the same, report a target defined value.
  //
  // Mode register rounding mode fields:
  //
  // [1:0] Single-precision round mode.
  // [3:2] Double/Half-precision round mode.
  //
  // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
  //
  //             Hardware   Spec
  //  Toward-0        3        0
  //  Nearest Even    0        1
  //  +Inf            1        2
  //  -Inf            2        3
  //  NearestAway0   N/A       4
  //
  // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
  // table we can index by the raw hardware mode.
  //
  // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf

  SDValue BitTable =
      DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);

  SDValue Two = DAG.getConstant(2, SL, MVT::i32);
  SDValue RoundModeTimesNumBits =
      DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);

  // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
  // knew only one mode was demanded.
  SDValue TableValue =
      DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
  SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);

  SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
  SDValue TableEntry =
      DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);

  // There's a gap in the 4-bit encoded table and actual enum values, so offset
  // if it's an extended value.
  SDValue Four = DAG.getConstant(4, SL, MVT::i32);
  SDValue IsStandardValue =
      DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
  SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
  SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
                               TableEntry, EnumOffset);

  return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
}
4136 SDValue
SITargetLowering::lowerSET_ROUNDING(SDValue Op
,
4137 SelectionDAG
&DAG
) const {
4140 SDValue NewMode
= Op
.getOperand(1);
4141 assert(NewMode
.getValueType() == MVT::i32
);
4143 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4144 // hardware MODE.fp_round values.
4145 if (auto *ConstMode
= dyn_cast
<ConstantSDNode
>(NewMode
)) {
4146 uint32_t ClampedVal
= std::min(
4147 static_cast<uint32_t>(ConstMode
->getZExtValue()),
4148 static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64
));
4149 NewMode
= DAG
.getConstant(
4150 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal
), SL
, MVT::i32
);
4152 // If we know the input can only be one of the supported standard modes in
4153 // the range 0-3, we can use a simplified mapping to hardware values.
4154 KnownBits KB
= DAG
.computeKnownBits(NewMode
);
4155 const bool UseReducedTable
= KB
.countMinLeadingZeros() >= 30;
4156 // The supported standard values are 0-3. The extended values start at 8. We
4157 // need to offset by 4 if the value is in the extended range.
4159 if (UseReducedTable
) {
4160 // Truncate to the low 32-bits.
4161 SDValue BitTable
= DAG
.getConstant(
4162 AMDGPU::FltRoundToHWConversionTable
& 0xffff, SL
, MVT::i32
);
4164 SDValue Two
= DAG
.getConstant(2, SL
, MVT::i32
);
4165 SDValue RoundModeTimesNumBits
=
4166 DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, NewMode
, Two
);
4169 DAG
.getNode(ISD::SRL
, SL
, MVT::i32
, BitTable
, RoundModeTimesNumBits
);
4171 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4172 // the table extracted bits into inline immediates.
4174 // table_index = umin(value, value - 4)
4175 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4177 DAG
.getConstant(AMDGPU::FltRoundToHWConversionTable
, SL
, MVT::i64
);
4179 SDValue Four
= DAG
.getConstant(4, SL
, MVT::i32
);
4180 SDValue OffsetEnum
= DAG
.getNode(ISD::SUB
, SL
, MVT::i32
, NewMode
, Four
);
4182 DAG
.getNode(ISD::UMIN
, SL
, MVT::i32
, NewMode
, OffsetEnum
);
4184 SDValue Two
= DAG
.getConstant(2, SL
, MVT::i32
);
4185 SDValue RoundModeTimesNumBits
=
4186 DAG
.getNode(ISD::SHL
, SL
, MVT::i32
, IndexVal
, Two
);
4188 SDValue TableValue
=
4189 DAG
.getNode(ISD::SRL
, SL
, MVT::i64
, BitTable
, RoundModeTimesNumBits
);
4190 SDValue TruncTable
= DAG
.getNode(ISD::TRUNCATE
, SL
, MVT::i32
, TableValue
);
4192 // No need to mask out the high bits since the setreg will ignore them
4194 NewMode
= TruncTable
;
4197 // Insert a readfirstlane in case the value is a VGPR. We could do this
4198 // earlier and keep more operations scalar, but that interferes with
4199 // combining the source.
4200 SDValue ReadFirstLaneID
=
4201 DAG
.getTargetConstant(Intrinsic::amdgcn_readfirstlane
, SL
, MVT::i32
);
4202 NewMode
= DAG
.getNode(ISD::INTRINSIC_WO_CHAIN
, SL
, MVT::i32
,
4203 ReadFirstLaneID
, NewMode
);
4206 // N.B. The setreg will be later folded into s_round_mode on supported
4209 DAG
.getTargetConstant(Intrinsic::amdgcn_s_setreg
, SL
, MVT::i32
);
4210 uint32_t BothRoundHwReg
=
4211 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE
, 0, 4);
4212 SDValue RoundBothImm
= DAG
.getTargetConstant(BothRoundHwReg
, SL
, MVT::i32
);
4215 DAG
.getNode(ISD::INTRINSIC_VOID
, SL
, Op
->getVTList(), Op
.getOperand(0),
4216 IntrinID
, RoundBothImm
, NewMode
);
SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
  if (Op->isDivergent())
    return SDValue();

  switch (cast<MemSDNode>(Op)->getAddressSpace()) {
  case AMDGPUAS::FLAT_ADDRESS:
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    break;
  default:
    return SDValue();
  }

  return Op;
}

// Work around DAG legality rules only based on the result type.
SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  if (SrcVT.getScalarType() != MVT::bf16)
    return Op;

  SDLoc SL(Op);
  SDValue BitCast =
      DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);

  EVT DstVT = Op.getValueType();
  if (IsStrict)
    llvm_unreachable("Need STRICT_BF16_TO_FP");

  return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
}
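
// Note: in lowerFP_EXTEND above, bf16 is the high 16 bits of an f32 with the
// same exponent layout, so the BF16_TO_FP node can later be expanded as a
// 16-bit left shift of the integer bits followed by a bitcast to f32.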
4258 SDValue
SITargetLowering::lowerGET_FPENV(SDValue Op
, SelectionDAG
&DAG
) const {
4260 if (Op
.getValueType() != MVT::i64
)
4263 uint32_t ModeHwReg
=
4264 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE
, 0, 23);
4265 SDValue ModeHwRegImm
= DAG
.getTargetConstant(ModeHwReg
, SL
, MVT::i32
);
4266 uint32_t TrapHwReg
=
4267 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS
, 0, 5);
4268 SDValue TrapHwRegImm
= DAG
.getTargetConstant(TrapHwReg
, SL
, MVT::i32
);
4270 SDVTList VTList
= DAG
.getVTList(MVT::i32
, MVT::Other
);
4272 DAG
.getTargetConstant(Intrinsic::amdgcn_s_getreg
, SL
, MVT::i32
);
4273 SDValue GetModeReg
= DAG
.getNode(ISD::INTRINSIC_W_CHAIN
, SL
, VTList
,
4274 Op
.getOperand(0), IntrinID
, ModeHwRegImm
);
4275 SDValue GetTrapReg
= DAG
.getNode(ISD::INTRINSIC_W_CHAIN
, SL
, VTList
,
4276 Op
.getOperand(0), IntrinID
, TrapHwRegImm
);
4278 DAG
.getNode(ISD::TokenFactor
, SL
, MVT::Other
, GetModeReg
.getValue(1),
4279 GetTrapReg
.getValue(1));
4282 DAG
.getNode(ISD::BUILD_VECTOR
, SL
, MVT::v2i32
, GetModeReg
, GetTrapReg
);
4283 SDValue Result
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i64
, CvtPtr
);
4285 return DAG
.getMergeValues({Result
, TokenReg
}, SL
);
4288 SDValue
SITargetLowering::lowerSET_FPENV(SDValue Op
, SelectionDAG
&DAG
) const {
4290 if (Op
.getOperand(1).getValueType() != MVT::i64
)
4293 SDValue Input
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i32
, Op
.getOperand(1));
4294 SDValue NewModeReg
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, Input
,
4295 DAG
.getConstant(0, SL
, MVT::i32
));
4296 SDValue NewTrapReg
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, Input
,
4297 DAG
.getConstant(1, SL
, MVT::i32
));
4299 SDValue ReadFirstLaneID
=
4300 DAG
.getTargetConstant(Intrinsic::amdgcn_readfirstlane
, SL
, MVT::i32
);
4301 NewModeReg
= DAG
.getNode(ISD::INTRINSIC_WO_CHAIN
, SL
, MVT::i32
,
4302 ReadFirstLaneID
, NewModeReg
);
4303 NewTrapReg
= DAG
.getNode(ISD::INTRINSIC_WO_CHAIN
, SL
, MVT::i32
,
4304 ReadFirstLaneID
, NewTrapReg
);
4306 unsigned ModeHwReg
=
4307 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE
, 0, 23);
4308 SDValue ModeHwRegImm
= DAG
.getTargetConstant(ModeHwReg
, SL
, MVT::i32
);
4309 unsigned TrapHwReg
=
4310 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS
, 0, 5);
4311 SDValue TrapHwRegImm
= DAG
.getTargetConstant(TrapHwReg
, SL
, MVT::i32
);
4314 DAG
.getTargetConstant(Intrinsic::amdgcn_s_setreg
, SL
, MVT::i32
);
4315 SDValue SetModeReg
=
4316 DAG
.getNode(ISD::INTRINSIC_VOID
, SL
, MVT::Other
, Op
.getOperand(0),
4317 IntrinID
, ModeHwRegImm
, NewModeReg
);
4318 SDValue SetTrapReg
=
4319 DAG
.getNode(ISD::INTRINSIC_VOID
, SL
, MVT::Other
, Op
.getOperand(0),
4320 IntrinID
, TrapHwRegImm
, NewTrapReg
);
4321 return DAG
.getNode(ISD::TokenFactor
, SL
, MVT::Other
, SetTrapReg
, SetModeReg
);
Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
                                             const MachineFunction &MF) const {
  Register Reg = StringSwitch<Register>(RegName)
                     .Case("m0", AMDGPU::M0)
                     .Case("exec", AMDGPU::EXEC)
                     .Case("exec_lo", AMDGPU::EXEC_LO)
                     .Case("exec_hi", AMDGPU::EXEC_HI)
                     .Case("flat_scratch", AMDGPU::FLAT_SCR)
                     .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
                     .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
                     .Default(Register());

  if (Reg == AMDGPU::NoRegister) {
    report_fatal_error(
        Twine("invalid register name \"" + StringRef(RegName) + "\"."));
  }

  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    report_fatal_error(Twine("invalid register \"" + StringRef(RegName) +
                             "\" for subtarget."));
  }

  switch (Reg) {
  case AMDGPU::M0:
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
    if (VT.getSizeInBits() == 32)
      return Reg;
    break;
  case AMDGPU::EXEC:
  case AMDGPU::FLAT_SCR:
    if (VT.getSizeInBits() == 64)
      return Reg;
    break;
  default:
    llvm_unreachable("missing register type checking");
  }

  report_fatal_error(
      Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
}
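
// Usage sketch for getRegisterByName() above (hypothetical IR): it is reached
// from the llvm.read_register / llvm.write_register intrinsics, e.g.
//   %exec = call i64 @llvm.read_register.i64(metadata !0)
//   !0 = !{!"exec"}
// Only the named special registers accepted above are valid here.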
// If kill is not the last instruction, split the block so kill is always a
// proper terminator.
MachineBasicBlock *
SITargetLowering::splitKillBlock(MachineInstr &MI,
                                 MachineBasicBlock *BB) const {
  MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
  return SplitBB;
}
// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is
// true, \p MI will be the only instruction in the loop body block. Otherwise,
// it will be the first instruction in the remainder block.
//
/// \returns { LoopBody, Remainder }
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
  MachineFunction *MF = MBB.getParent();
  MachineBasicBlock::iterator I(&MI);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);

  if (InstInLoop) {
    auto Next = std::next(I);

    // Move instruction to loop body.
    LoopBB->splice(LoopBB->begin(), &MBB, I, Next);

    // Move the rest of the block.
    RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
  } else {
    RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
  }

  MBB.addSuccessor(LoopBB);

  return std::pair(LoopBB, RemainderBB);
}
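
// Resulting CFG for splitBlockForLoop above:
//
//   MBB ---> LoopBB ---> RemainderBB (inherits MBB's old successors)
//              ^  |
//              +--+   LoopBB branches back to itself until the loop is done.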
/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
  MachineBasicBlock *MBB = MI.getParent();
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  auto I = MI.getIterator();
  auto E = std::next(I);

  BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
      .addImm(0);

  MIBundleBuilder Bundler(*MBB, I, E);
  finalizeBundle(*MBB, Bundler.begin());
}
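
// Note: the bundling above keeps the s_waitcnt 0 glued immediately after the
// instruction so later scheduling cannot separate them; emitGWSMemViolTestLoop
// below relies on this when it re-checks TRAP_STS.MEM_VIOL right after the GWS
// operation.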
4440 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr
&MI
,
4441 MachineBasicBlock
*BB
) const {
4442 const DebugLoc
&DL
= MI
.getDebugLoc();
4444 MachineRegisterInfo
&MRI
= BB
->getParent()->getRegInfo();
4446 const SIInstrInfo
*TII
= getSubtarget()->getInstrInfo();
4448 // Apparently kill flags are only valid if the def is in the same block?
4449 if (MachineOperand
*Src
= TII
->getNamedOperand(MI
, AMDGPU::OpName::data0
))
4450 Src
->setIsKill(false);
4452 auto [LoopBB
, RemainderBB
] = splitBlockForLoop(MI
, *BB
, true);
4454 MachineBasicBlock::iterator I
= LoopBB
->end();
4456 const unsigned EncodedReg
= AMDGPU::Hwreg::HwregEncoding::encode(
4457 AMDGPU::Hwreg::ID_TRAPSTS
, AMDGPU::Hwreg::OFFSET_MEM_VIOL
, 1);
4459 // Clear TRAP_STS.MEM_VIOL
4460 BuildMI(*LoopBB
, LoopBB
->begin(), DL
, TII
->get(AMDGPU::S_SETREG_IMM32_B32
))
4462 .addImm(EncodedReg
);
4464 bundleInstWithWaitcnt(MI
);
4466 Register Reg
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
4468 // Load and check TRAP_STS.MEM_VIOL
4469 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::S_GETREG_B32
), Reg
)
4470 .addImm(EncodedReg
);
4472 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4473 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::S_CMP_LG_U32
))
4474 .addReg(Reg
, RegState::Kill
)
4477 BuildMI(*LoopBB
, I
, DL
, TII
->get(AMDGPU::S_CBRANCH_SCC1
))
4484 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4485 // wavefront. If the value is uniform and just happens to be in a VGPR, this
4486 // will only do one iteration. In the worst case, this will loop 64 times.
4488 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
4489 static MachineBasicBlock::iterator
4490 emitLoadM0FromVGPRLoop(const SIInstrInfo
*TII
, MachineRegisterInfo
&MRI
,
4491 MachineBasicBlock
&OrigBB
, MachineBasicBlock
&LoopBB
,
4492 const DebugLoc
&DL
, const MachineOperand
&Idx
,
4493 unsigned InitReg
, unsigned ResultReg
, unsigned PhiReg
,
4494 unsigned InitSaveExecReg
, int Offset
, bool UseGPRIdxMode
,
4495 Register
&SGPRIdxReg
) {
4497 MachineFunction
*MF
= OrigBB
.getParent();
4498 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
4499 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
4500 MachineBasicBlock::iterator I
= LoopBB
.begin();
4502 const TargetRegisterClass
*BoolRC
= TRI
->getBoolRC();
4503 Register PhiExec
= MRI
.createVirtualRegister(BoolRC
);
4504 Register NewExec
= MRI
.createVirtualRegister(BoolRC
);
4505 Register CurrentIdxReg
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
4506 Register CondReg
= MRI
.createVirtualRegister(BoolRC
);
4508 BuildMI(LoopBB
, I
, DL
, TII
->get(TargetOpcode::PHI
), PhiReg
)
4514 BuildMI(LoopBB
, I
, DL
, TII
->get(TargetOpcode::PHI
), PhiExec
)
4515 .addReg(InitSaveExecReg
)
4520 // Read the next variant <- also loop target.
4521 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::V_READFIRSTLANE_B32
), CurrentIdxReg
)
4522 .addReg(Idx
.getReg(), getUndefRegState(Idx
.isUndef()));
4524 // Compare the just read M0 value to all possible Idx values.
4525 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::V_CMP_EQ_U32_e64
), CondReg
)
4526 .addReg(CurrentIdxReg
)
4527 .addReg(Idx
.getReg(), 0, Idx
.getSubReg());
4529 // Update EXEC, save the original EXEC value to VCC.
4530 BuildMI(LoopBB
, I
, DL
,
4531 TII
->get(ST
.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4532 : AMDGPU::S_AND_SAVEEXEC_B64
),
4534 .addReg(CondReg
, RegState::Kill
);
4536 MRI
.setSimpleHint(NewExec
, CondReg
);
4538 if (UseGPRIdxMode
) {
4540 SGPRIdxReg
= CurrentIdxReg
;
4542 SGPRIdxReg
= MRI
.createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
4543 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::S_ADD_I32
), SGPRIdxReg
)
4544 .addReg(CurrentIdxReg
, RegState::Kill
)
4548 // Move index from VCC into M0
4550 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::S_MOV_B32
), AMDGPU::M0
)
4551 .addReg(CurrentIdxReg
, RegState::Kill
);
4553 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::S_ADD_I32
), AMDGPU::M0
)
4554 .addReg(CurrentIdxReg
, RegState::Kill
)
4559 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4560 unsigned Exec
= ST
.isWave32() ? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
4561 MachineInstr
*InsertPt
=
4562 BuildMI(LoopBB
, I
, DL
,
4563 TII
->get(ST
.isWave32() ? AMDGPU::S_XOR_B32_term
4564 : AMDGPU::S_XOR_B64_term
),
4569 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4572 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4574 BuildMI(LoopBB
, I
, DL
, TII
->get(AMDGPU::S_CBRANCH_EXECNZ
))
4578 return InsertPt
->getIterator();
4581 // This has slightly sub-optimal regalloc when the source vector is killed by
4582 // the read. The register allocator does not understand that the kill is
4583 // per-workitem, so is kept alive for the whole loop so we end up not re-using a
4584 // subregister from it, using 1 more VGPR than necessary. This was saved when
4585 // this was expanded after register allocation.
4586 static MachineBasicBlock::iterator
4587 loadM0FromVGPR(const SIInstrInfo
*TII
, MachineBasicBlock
&MBB
, MachineInstr
&MI
,
4588 unsigned InitResultReg
, unsigned PhiReg
, int Offset
,
4589 bool UseGPRIdxMode
, Register
&SGPRIdxReg
) {
4590 MachineFunction
*MF
= MBB
.getParent();
4591 const GCNSubtarget
&ST
= MF
->getSubtarget
<GCNSubtarget
>();
4592 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
4593 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
4594 const DebugLoc
&DL
= MI
.getDebugLoc();
4595 MachineBasicBlock::iterator
I(&MI
);
4597 const auto *BoolXExecRC
= TRI
->getWaveMaskRegClass();
4598 Register DstReg
= MI
.getOperand(0).getReg();
4599 Register SaveExec
= MRI
.createVirtualRegister(BoolXExecRC
);
4600 Register TmpExec
= MRI
.createVirtualRegister(BoolXExecRC
);
4601 unsigned Exec
= ST
.isWave32() ? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
4602 unsigned MovExecOpc
= ST
.isWave32() ? AMDGPU::S_MOV_B32
: AMDGPU::S_MOV_B64
;
4604 BuildMI(MBB
, I
, DL
, TII
->get(TargetOpcode::IMPLICIT_DEF
), TmpExec
);
4606 // Save the EXEC mask
4608 BuildMI(MBB
, I
, DL
, TII
->get(MovExecOpc
), SaveExec
)
4612 auto [LoopBB
, RemainderBB
] = splitBlockForLoop(MI
, MBB
, false);
4614 const MachineOperand
*Idx
= TII
->getNamedOperand(MI
, AMDGPU::OpName::idx
);
4616 auto InsPt
= emitLoadM0FromVGPRLoop(TII
, MRI
, MBB
, *LoopBB
, DL
, *Idx
,
4617 InitResultReg
, DstReg
, PhiReg
, TmpExec
,
4618 Offset
, UseGPRIdxMode
, SGPRIdxReg
);
4620 MachineBasicBlock
*LandingPad
= MF
->CreateMachineBasicBlock();
4621 MachineFunction::iterator
MBBI(LoopBB
);
4623 MF
->insert(MBBI
, LandingPad
);
4624 LoopBB
->removeSuccessor(RemainderBB
);
4625 LandingPad
->addSuccessor(RemainderBB
);
4626 LoopBB
->addSuccessor(LandingPad
);
4627 MachineBasicBlock::iterator First
= LandingPad
->begin();
4629 BuildMI(*LandingPad
, First
, DL
, TII
->get(MovExecOpc
), Exec
)
// Returns subreg index, offset
static std::pair<unsigned, int>
computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
                            const TargetRegisterClass *SuperRC, unsigned VecReg,
                            int Offset) {
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (Offset >= NumElts || Offset < 0)
    return std::pair(AMDGPU::sub0, Offset);

  return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
}
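
// Example for computeIndirectRegAndOffset above: for a 128-bit (4 x 32-bit)
// super-register NumElts is 4, so a constant offset of 2 folds to (sub2, 0),
// while an out-of-bounds offset such as 5 is returned as (sub0, 5) and left to
// the dynamic-index path.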
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
                                 MachineRegisterInfo &MRI, MachineInstr &MI,
                                 int Offset) {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  assert(Idx->getReg() != AMDGPU::NoRegister);

  if (Offset == 0) {
    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .add(*Idx);
  } else {
    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .add(*Idx)
        .addImm(Offset);
  }
}
getIndirectSGPRIdx(const SIInstrInfo
*TII
,
4675 MachineRegisterInfo
&MRI
, MachineInstr
&MI
,
4677 MachineBasicBlock
*MBB
= MI
.getParent();
4678 const DebugLoc
&DL
= MI
.getDebugLoc();
4679 MachineBasicBlock::iterator
I(&MI
);
4681 const MachineOperand
*Idx
= TII
->getNamedOperand(MI
, AMDGPU::OpName::idx
);
4684 return Idx
->getReg();
4686 Register Tmp
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
4687 BuildMI(*MBB
, I
, DL
, TII
->get(AMDGPU::S_ADD_I32
), Tmp
)
4693 static MachineBasicBlock
*emitIndirectSrc(MachineInstr
&MI
,
4694 MachineBasicBlock
&MBB
,
4695 const GCNSubtarget
&ST
) {
4696 const SIInstrInfo
*TII
= ST
.getInstrInfo();
4697 const SIRegisterInfo
&TRI
= TII
->getRegisterInfo();
4698 MachineFunction
*MF
= MBB
.getParent();
4699 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
4701 Register Dst
= MI
.getOperand(0).getReg();
4702 const MachineOperand
*Idx
= TII
->getNamedOperand(MI
, AMDGPU::OpName::idx
);
4703 Register SrcReg
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src
)->getReg();
4704 int Offset
= TII
->getNamedOperand(MI
, AMDGPU::OpName::offset
)->getImm();
4706 const TargetRegisterClass
*VecRC
= MRI
.getRegClass(SrcReg
);
4707 const TargetRegisterClass
*IdxRC
= MRI
.getRegClass(Idx
->getReg());
4710 std::tie(SubReg
, Offset
) =
4711 computeIndirectRegAndOffset(TRI
, VecRC
, SrcReg
, Offset
);
4713 const bool UseGPRIdxMode
= ST
.useVGPRIndexMode();
4715 // Check for a SGPR index.
4716 if (TII
->getRegisterInfo().isSGPRClass(IdxRC
)) {
4717 MachineBasicBlock::iterator
I(&MI
);
4718 const DebugLoc
&DL
= MI
.getDebugLoc();
4720 if (UseGPRIdxMode
) {
4721 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4722 // to avoid interfering with other uses, so probably requires a new
4723 // optimization pass.
4724 Register Idx
= getIndirectSGPRIdx(TII
, MRI
, MI
, Offset
);
4726 const MCInstrDesc
&GPRIDXDesc
=
4727 TII
->getIndirectGPRIDXPseudo(TRI
.getRegSizeInBits(*VecRC
), true);
4728 BuildMI(MBB
, I
, DL
, GPRIDXDesc
, Dst
)
4733 setM0ToIndexFromSGPR(TII
, MRI
, MI
, Offset
);
4735 BuildMI(MBB
, I
, DL
, TII
->get(AMDGPU::V_MOVRELS_B32_e32
), Dst
)
4736 .addReg(SrcReg
, 0, SubReg
)
4737 .addReg(SrcReg
, RegState::Implicit
);
4740 MI
.eraseFromParent();
4745 // Control flow needs to be inserted if indexing with a VGPR.
4746 const DebugLoc
&DL
= MI
.getDebugLoc();
4747 MachineBasicBlock::iterator
I(&MI
);
4749 Register PhiReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
4750 Register InitReg
= MRI
.createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
4752 BuildMI(MBB
, I
, DL
, TII
->get(TargetOpcode::IMPLICIT_DEF
), InitReg
);
4754 Register SGPRIdxReg
;
4755 auto InsPt
= loadM0FromVGPR(TII
, MBB
, MI
, InitReg
, PhiReg
, Offset
,
4756 UseGPRIdxMode
, SGPRIdxReg
);
4758 MachineBasicBlock
*LoopBB
= InsPt
->getParent();
4760 if (UseGPRIdxMode
) {
4761 const MCInstrDesc
&GPRIDXDesc
=
4762 TII
->getIndirectGPRIDXPseudo(TRI
.getRegSizeInBits(*VecRC
), true);
4764 BuildMI(*LoopBB
, InsPt
, DL
, GPRIDXDesc
, Dst
)
4769 BuildMI(*LoopBB
, InsPt
, DL
, TII
->get(AMDGPU::V_MOVRELS_B32_e32
), Dst
)
4770 .addReg(SrcReg
, 0, SubReg
)
4771 .addReg(SrcReg
, RegState::Implicit
);
4774 MI
.eraseFromParent();
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const GCNSubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  Register Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());

  // This can be an immediate, but will be folded later.
  assert(Val->getReg());

  std::tie(SubReg, Offset) =
      computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    assert(Offset == 0);

    BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)

    MI.eraseFromParent();

  // Check for a SGPR index.
  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);

      const MCInstrDesc &GPRIDXDesc =
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
      BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
          .addReg(SrcVec->getReg())
      setM0ToIndexFromSGPR(TII, MRI, MI, Offset);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);
      BuildMI(MBB, I, DL, MovRelDesc, Dst)
          .addReg(SrcVec->getReg())

    MI.eraseFromParent();

  // Control flow needs to be inserted if indexing with a VGPR.
  MRI.clearKillFlags(Val->getReg());

  const DebugLoc &DL = MI.getDebugLoc();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

  Register SGPRIdxReg;
  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
                              UseGPRIdxMode, SGPRIdxReg);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  if (UseGPRIdxMode) {
    const MCInstrDesc &GPRIDXDesc =
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                                          MachineBasicBlock &BB,
                                          const GCNSubtarget &ST,
                                          unsigned Opc) {
  MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Reduction operations depend on whether the input operand is SGPR or VGPR.
  Register SrcReg = MI.getOperand(1).getReg();
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
  Register DstReg = MI.getOperand(0).getReg();
  MachineBasicBlock *RetBB = nullptr;

  // These operations are idempotent for a uniform value, i.e. an SGPR input:
  // the reduced value is the same as the given SGPR.
  BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)

  // TODO: Implement the DPP strategy and switch based on the immediate
  // strategy operand. For now, for all the cases (default, Iterative and DPP)
  // we use the iterative approach by default.

  // To reduce the VGPR using the iterative approach, we need to iterate over
  // all the active lanes. Lowering consists of ComputeLoop, which iterates
  // over only the active lanes. We use a copy of the EXEC register as the
  // induction variable, and every active lane clears its bit using bitset0 so
  // that the next iteration sees the next active lane.
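  //
  // Schematically (for the wave32 case; wave64 uses the *_B64 forms), the
  // emitted loop looks roughly like:
  //
  //   ActiveBits  = copy of EXEC
  //   Accumulator = identity value (UINT32_MAX for umin, 0 for umax)
  // ComputeLoop:
  //   FF1         = s_ff1_i32_b32 ActiveBits    ; index of next active lane
  //   LaneValue   = v_readlane_b32 Src, FF1     ; read that lane's value
  //   Accumulator = s_min_u32/s_max_u32 Accumulator, LaneValue
  //   ActiveBits  = s_bitset0_b32 ActiveBits, FF1
  //   s_cmp_lg_u32 ActiveBits, 0
  //   s_cbranch_scc1 ComputeLoop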
  MachineBasicBlock::iterator I = BB.end();
  Register SrcReg = MI.getOperand(1).getReg();

  // Create control flow for the loop:
  // split MI's machine basic block into a for loop.
  auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);

  // Create virtual registers required for lowering.
  const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
  const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
  Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
  Register InitalValReg = MRI.createVirtualRegister(DstRegClass);

  Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
  Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);

  Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
  Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

  bool IsWave32 = ST.isWave32();
  unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  // Create initial values of the induction variable from EXEC and the
  // accumulator, and insert a branch to the newly created ComputeLoop block.
  uint32_t InitalValue =
      (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;

  BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
  BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
      .addImm(InitalValue);

  BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
      .addMBB(ComputeLoop);

  // Start constructing ComputeLoop.
  I = ComputeLoop->end();
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
      .addReg(InitalValReg)
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
      .addReg(TmpSReg->getOperand(0).getReg())

  // Perform the computations.
  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
  auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                 .addReg(ActiveBits->getOperand(0).getReg());
  auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                           TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                       .addReg(FF1->getOperand(0).getReg());
  auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
                            .addReg(Accumulator->getOperand(0).getReg())
                            .addReg(LaneValue->getOperand(0).getReg());

  // Manipulate the iterator to get the next active lane.
  unsigned BITSETOpc =
      IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
  auto NewActiveBits =
      BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
          .addReg(FF1->getOperand(0).getReg())
          .addReg(ActiveBits->getOperand(0).getReg());

  Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
      .addMBB(ComputeLoop);
  ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
      .addMBB(ComputeLoop);

  // Create the branching.
  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
  BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
      .addReg(NewActiveBits->getOperand(0).getReg())
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
      .addMBB(ComputeLoop);

  MI.eraseFromParent();
SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                              MachineBasicBlock *BB) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  MachineFunction *MF = BB->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    const DebugLoc &DL = MI.getDebugLoc();
    MachineOperand &Dest0 = MI.getOperand(0);
    MachineOperand &Dest1 = MI.getOperand(1);
    MachineOperand &Src0 = MI.getOperand(2);
    MachineOperand &Src1 = MI.getOperand(3);

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       ? AMDGPU::S_ADD_I32
                       : AMDGPU::S_SUB_I32;

    BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())

    MI.eraseFromParent();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
    // For targets older than GFX12, we emit a sequence of 32-bit operations.
    // For GFX12, we emit s_add_u64 and s_sub_u64.
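    //
    // Schematically, the pre-GFX12 expansion below is roughly:
    //   lo   = s_add_u32  lo(Src0), lo(Src1)   ; or s_sub_u32
    //   hi   = s_addc_u32 hi(Src0), hi(Src1)   ; or s_subb_u32, consuming SCC
    //   Dest = REG_SEQUENCE lo, sub0, hi, sub1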
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    const DebugLoc &DL = MI.getDebugLoc();
    MachineOperand &Dest = MI.getOperand(0);
    MachineOperand &Src0 = MI.getOperand(1);
    MachineOperand &Src1 = MI.getOperand(2);
    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
    if (Subtarget->hasScalarAddSub64()) {
      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
      BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const TargetRegisterClass *BoolRC = TRI->getBoolRC();

      Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

      MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
          MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
      MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
          MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

      MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
          MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
      MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
          MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

      unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
      unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
      BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
      BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
          .addImm(AMDGPU::sub0)
          .addImm(AMDGPU::sub1);

    MI.eraseFromParent();
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const DebugLoc &DL = MI.getDebugLoc();

    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    MachineOperand &Dest = MI.getOperand(0);
    MachineOperand &Src0 = MI.getOperand(1);
    MachineOperand &Src1 = MI.getOperand(2);

    if (IsAdd && ST.hasLshlAddB64()) {
      auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getWaveMaskRegClass();

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

    const TargetRegisterClass *Src0RC = Src0.isReg()
                                            ? MRI.getRegClass(Src0.getReg())
                                            : &AMDGPU::VReg_64RegClass;
    const TargetRegisterClass *Src1RC = Src1.isReg()
                                            ? MRI.getRegClass(Src1.getReg())
                                            : &AMDGPU::VReg_64RegClass;

    const TargetRegisterClass *Src0SubRC =
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
    const TargetRegisterClass *Src1SubRC =
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

    MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
    MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);

    MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
    MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    unsigned LoOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
                               .addReg(CarryReg, RegState::Define)
                               .addImm(0); // clamp bit

    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    MachineInstr *HiHalf =
        BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
            .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
            .addReg(CarryReg, RegState::Kill)
            .addImm(0); // clamp bit

    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
        .addImm(AMDGPU::sub0)
        .addImm(AMDGPU::sub1);
    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    // This pseudo can only be selected from a uniform add/subcarry node.
    // All the VGPR operands are therefore assumed to be splat vectors.
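    //
    // Roughly: any VGPR operand is first made uniform with
    // v_readfirstlane_b32, the carry-in (Src2) is compared against zero to
    // materialize SCC, the scalar s_addc_u32/s_subb_u32 is emitted, and the
    // carry-out is produced with an s_cselect into CarryDest.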
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    MachineBasicBlock::iterator MII = MI;
    const DebugLoc &DL = MI.getDebugLoc();
    MachineOperand &Dest = MI.getOperand(0);
    MachineOperand &CarryDest = MI.getOperand(1);
    MachineOperand &Src0 = MI.getOperand(2);
    MachineOperand &Src1 = MI.getOperand(3);
    MachineOperand &Src2 = MI.getOperand(4);
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
    if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
          .addReg(Src0.getReg());
      Src0.setReg(RegOp0);

    if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
          .addReg(Src1.getReg());
      Src1.setReg(RegOp1);

    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    if (TRI->isVectorRegister(MRI, Src2.getReg())) {
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
          .addReg(Src2.getReg());
      Src2.setReg(RegOp2);

    const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {
        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
            .addReg(Src2.getReg())
        const TargetRegisterClass *SubRC =
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
        MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
        MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
            .addReg(Src2_32, RegState::Kill)
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
          .addReg(Src2.getReg())

    BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())

    unsigned SelOpc =
        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;

    BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())

    MI.eraseFromParent();
  case AMDGPU::SI_INIT_M0: {
    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .add(MI.getOperand(0));
    MI.eraseFromParent();
  case AMDGPU::GET_GROUPSTATICSIZE: {
    assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
           getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
    DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
        .add(MI.getOperand(0))
        .addImm(MFI->getLDSSize());
    MI.eraseFromParent();
  case AMDGPU::GET_SHADERCYCLESHILO: {
    assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
    MachineRegisterInfo &MRI = MF->getRegInfo();
    const DebugLoc &DL = MI.getDebugLoc();
    // The algorithm is:
    // hi1 = getreg(SHADER_CYCLES_HI)
    // lo1 = getreg(SHADER_CYCLES_LO)
    // hi2 = getreg(SHADER_CYCLES_HI)
    // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
    // Otherwise there was overflow and the result is hi2:0. In both cases the
    // result should represent the actual time at some point during the
    // sequence of three getregs.
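    //
    // For example, if the low counter wraps between the reads, hi1 != hi2 and
    // hi2:0 is returned; that value still corresponds to a moment within the
    // window of the three reads, so it is a valid (if coarse) timestamp.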
    using namespace AMDGPU::Hwreg;
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
        .add(MI.getOperand(0))
        .addImm(AMDGPU::sub0)
        .addImm(AMDGPU::sub1);
    MI.eraseFromParent();
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
    return emitIndirectSrc(MI, *BB, *getSubtarget());
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
    return emitIndirectDst(MI, *BB, *getSubtarget());
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
    return splitKillBlock(MI, BB);
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const SIRegisterInfo *TRI = ST.getRegisterInfo();

    Register Dst = MI.getOperand(0).getReg();
    const MachineOperand &Src0 = MI.getOperand(1);
    const MachineOperand &Src1 = MI.getOperand(2);
    const DebugLoc &DL = MI.getDebugLoc();
    Register SrcCond = MI.getOperand(3).getReg();

    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);

    const TargetRegisterClass *Src0RC = Src0.isReg()
                                            ? MRI.getRegClass(Src0.getReg())
                                            : &AMDGPU::VReg_64RegClass;
    const TargetRegisterClass *Src1RC = Src1.isReg()
                                            ? MRI.getRegClass(Src1.getReg())
                                            : &AMDGPU::VReg_64RegClass;

    const TargetRegisterClass *Src0SubRC =
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
    const TargetRegisterClass *Src1SubRC =
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);

    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
        .addReg(SrcCondCopy);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
        .addReg(SrcCondCopy);

    BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
        .addImm(AMDGPU::sub0)
        .addImm(AMDGPU::sub1);
    MI.eraseFromParent();
  case AMDGPU::SI_BR_UNDEF: {
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    const DebugLoc &DL = MI.getDebugLoc();
    MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
                           .add(MI.getOperand(0));
    Br->getOperand(1).setIsUndef(); // read undef SCC
    MI.eraseFromParent();
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
    const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, &MI);
    MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
        .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
  case AMDGPU::SI_CALL_ISEL: {
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    const DebugLoc &DL = MI.getDebugLoc();

    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);

    MachineInstrBuilder MIB;
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);

    for (const MachineOperand &MO : MI.operands())
      MIB.add(MO);

    MIB.cloneMemRefs(MI);
    MI.eraseFromParent();
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    // TODO: Define distinct V_*_I32_Pseudo instructions instead.
    const DebugLoc &DL = MI.getDebugLoc();
    unsigned Opc = MI.getOpcode();

    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      Opc = AMDGPU::getVOPe64(Opc);
      NeedClampOperand = true;

    auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
    if (TII->isVOP3(*I)) {
      const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      I.addReg(TRI->getVCC(), RegState::Define);

    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)
      I.addImm(0); // clamp bit for e64 encoding

    TII->legalizeOperands(*I);

    MI.eraseFromParent();
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    // These instructions have an implicit use of vcc which counts towards the
    // constant bus limit.
    TII->legalizeOperands(MI);
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
    // An s_waitcnt 0 is required to be the instruction immediately following.
    if (getSubtarget()->hasGWSAutoReplay()) {
      bundleInstWithWaitcnt(MI);

    return emitGWSMemViolTestLoop(MI, BB);
  case AMDGPU::S_SETREG_B32: {
    // Try to optimize cases that only set the denormal mode or rounding mode.
    //
    // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
    // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
    //
    // FIXME: This could be predicated on the immediate, but tablegen doesn't
    // allow you to have a no-side-effect instruction in the output of a
    // side-effecting pattern.
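    //
    // For example (assuming the usual MODE register layout with the rounding
    // mode in the low four bits and the denormal mode in the next four), a
    // constant setreg of just the rounding field can become a single
    // s_round_mode, and one of just the denormal field can become a single
    // s_denorm_mode, as the code below does.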
    auto [ID, Offset, Width] =
        AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
    if (ID != AMDGPU::Hwreg::ID_MODE)

    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
    const unsigned SetMask = WidthMask << Offset;

    if (getSubtarget()->hasDenormModeInst()) {
      unsigned SetDenormOp = 0;
      unsigned SetRoundOp = 0;

      // The dedicated instructions can only set the whole denorm or round
      // mode at once, not a subset of bits in either.
      if (SetMask ==
          (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
        // If this fully sets both the round and denorm mode, emit the two
        // dedicated instructions for these.
        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;
      } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
        SetRoundOp = AMDGPU::S_ROUND_MODE;
      } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
        SetDenormOp = AMDGPU::S_DENORM_MODE;

      if (SetRoundOp || SetDenormOp) {
        MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
        MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
        if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
          unsigned ImmVal = Def->getOperand(1).getImm();
          BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
              .addImm(ImmVal & 0xf);

          // If we also have the denorm mode, get just the denorm mode bits.

          BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
              .addImm(ImmVal & 0xf);

          MI.eraseFromParent();

    // If only the FP bits are touched, use the no-side-effects pseudo.
    if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
                    AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
      MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));

  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane
    // if necessary. After that they are equivalent to a COPY.
    MI.setDesc(TII->get(AMDGPU::COPY));
  case AMDGPU::ENDPGM_TRAP: {
    const DebugLoc &DL = MI.getDebugLoc();
    if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
      MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
      MI.addOperand(MachineOperand::CreateImm(0));

    // We need a block split to make the real endpgm a terminator. We also
    // don't want to break phis in successor blocks, so we can't just delete
    // to the end of the block.
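    //
    // The resulting control flow is roughly:
    //   BB:      ...  s_cbranch_execnz TrapBB   ; fall through to SplitBB
    //   TrapBB:  s_endpgm
    //   SplitBB: remainder of the original block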
    MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
    MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
    MF->push_back(TrapBB);
    BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
    BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))

    BB->addSuccessor(TrapBB);
    MI.eraseFromParent();
  case AMDGPU::SIMULATED_TRAP: {
    assert(Subtarget->hasPrivEnabledTrap2NopBug());
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    MachineBasicBlock *SplitBB =
        TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();

  if (TII->isImage(MI) || TII->isMUBUF(MI)) {

  return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);

bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  // This currently forces unfolding various combinations of fsub into fma
  // with free fneg'd operands. As long as we have fast FMA (controlled by
  // isFMAFasterThanFMulAndFAdd), we should perform these.

  // When fma is quarter rate, for f64 where add / sub are at best half rate,
  // most of these combines appear to be cycle neutral but save on instruction
  // count / code size.

bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }

EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                                         EVT VT) const {
  if (!VT.isVector()) {

  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());

MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
  // TODO: Should i16 be used always if legal? For now it would force VALU
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;

LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
  return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
             ? Ty.changeElementSize(16)
             : Ty.changeElementSize(32);

// Answering this is somewhat tricky and depends on the specific device, since
// different devices have different rates for fma and for f64 operations in
// general.
//
// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
// regardless of which device (although the number of cycles differs between
// devices), so it is always profitable for f64.
//
// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
// only on full rate devices. Normally, we should prefer selecting v_mad_f32
// which we can always do even without fused FP ops since it returns the same
// result as the separate operations and since it is always full
// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
// however does not support denormals, so we do report fma as faster if we
// have a fast fma device and require denormals.
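//
// In short: the hook below reports fma as faster for f32 mainly when the
// device has fast FMA and/or DL instructions, and it is more permissive when
// f32 denormals are required (not flushed), since v_mad_f32 cannot handle
// them.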
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                  EVT VT) const {
  VT = VT.getScalarType();

  switch (VT.getSimpleVT().SimpleTy) {
    // If mad is not available this depends only on if f32 fma is full rate.
    if (!Subtarget->hasMadMacF32Insts())
      return Subtarget->hasFastFMAF32();

    // Otherwise f32 mad is always full rate and returns the same result as
    // the separate operations, so it should be preferred over fma.
    // However, it does not support denormals.
    if (!denormalModeIsFlushAllF32(MF))
      return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();

    // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
    return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();

    return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);

bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                  LLT Ty) const {
  switch (Ty.getScalarSizeInBits()) {
    return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
    return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
    return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);

bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
  if (Ty.getScalarSizeInBits() == 16)
    return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
  if (Ty.getScalarSizeInBits() == 32)
    return Subtarget->hasMadMacF32Insts() &&
           denormalModeIsFlushAllF32(*MI.getMF());

bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
                                   const SDNode *N) const {
  // TODO: Check future ftz flag
  // v_mad_f32/v_mac_f32 do not support denormals.
  EVT VT = N->getValueType(0);
    return Subtarget->hasMadMacF32Insts() &&
           denormalModeIsFlushAllF32(DAG.getMachineFunction());
  if (VT == MVT::f16) {
    return Subtarget->hasMadF16() &&
           denormalModeIsFlushAllF64F16(DAG.getMachineFunction());

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
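//
// For example, a v8f16 operation handled here is split into two v4f16
// operations on the low and high halves and then recombined with
// CONCAT_VECTORS, rather than being scalarized into eight f16 operations.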
SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
                                             SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);

  SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
  SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
                                              SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
  auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);

  SDValue OpLo =
      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
  SDValue OpHi =
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);

SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
                                               SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);

  SDValue Op0 = Op.getOperand(0);
  auto [Lo0, Hi0] = Op0.getValueType().isVector()
                        ? DAG.SplitVectorOperand(Op.getNode(), 0)
                        : std::pair(Op0, Op0);

  auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
  auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);

  auto ResVT = DAG.GetSplitDestVTs(VT);

  SDValue OpLo =
      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
  SDValue OpHi =
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);

SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    return LowerBRCOND(Op, DAG);
  case ISD::RETURNADDR:
    return LowerRETURNADDR(Op, DAG);
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: {
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN:
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN:
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID:
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST:
    return lowerADDRSPACECAST(Op, DAG);
  case ISD::INSERT_SUBVECTOR:
    return lowerINSERT_SUBVECTOR(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return lowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
    return lowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:
    return lowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::BUILD_VECTOR:
    return lowerBUILD_VECTOR(Op, DAG);
  case ISD::STRICT_FP_ROUND:
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
  case ISD::FCANONICALIZE:
    return splitUnaryVectorOp(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
  case ISD::STRICT_FLDEXP:
    return lowerFLDEXP(Op, DAG);
    return splitTernaryVectorOp(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return splitBinaryVectorOp(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return lowerXMUL_LOHI(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::STACKSAVE:
    return LowerSTACKSAVE(Op, DAG);
  case ISD::GET_ROUNDING:
    return lowerGET_ROUNDING(Op, DAG);
  case ISD::SET_ROUNDING:
    return lowerSET_ROUNDING(Op, DAG);
    return lowerPREFETCH(Op, DAG);
  case ISD::FP_EXTEND:
  case ISD::STRICT_FP_EXTEND:
    return lowerFP_EXTEND(Op, DAG);
  case ISD::GET_FPENV:
    return lowerGET_FPENV(Op, DAG);
  case ISD::SET_FPENV:
    return lowerSET_FPENV(Op, DAG);

// Used for D16: Casts the result of an instruction into the right vector,
// packs values if loads return unpacked values.
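//
// For example, on a subtarget with unpacked D16 memory instructions a v4f16
// load comes back as v4i32; the helper below truncates each element to i16,
// rebuilds a v4i16 (padding odd element counts to the next even width), and
// bitcasts back to the original v4f16 type.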
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
                                       const SDLoc &DL, SelectionDAG &DAG,
                                       bool Unpacked) {
  if (!LoadVT.isVector())

  // Cast back to the original packed type or to a larger type that is a
  // multiple of 32 bits for D16. Widening the return type is required for
  EVT FittingLoadVT = LoadVT;
  if ((LoadVT.getVectorNumElements() % 2) == 1) {
    FittingLoadVT =
        EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
                         LoadVT.getVectorNumElements() + 1);

  if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
    // Truncate to v2i16/v4i16.
    EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();

    // Work around the legalizer not scalarizing the truncate after vector op
    // legalization but not creating an intermediate vector trunc.
    SmallVector<SDValue, 4> Elts;
    DAG.ExtractVectorElements(Result, Elts);
    for (SDValue &Elt : Elts)
      Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);

    // Pad illegal v1i16/v3f16 to v4i16
    if ((LoadVT.getVectorNumElements() % 2) == 1)
      Elts.push_back(DAG.getUNDEF(MVT::i16));

    Result = DAG.getBuildVector(IntLoadVT, DL, Elts);

    // Bitcast to original type (v2f16/v4f16).
    return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);

  // Cast back to the original packed type.
  return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);

SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              ArrayRef<SDValue> Ops,
                                              bool IsIntrinsic) const {
  bool Unpacked = Subtarget->hasUnpackedD16VMem();
  EVT LoadVT = M->getValueType(0);

  EVT EquivLoadVT = LoadVT;
  if (LoadVT.isVector()) {
      EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                     LoadVT.getVectorNumElements());
    } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
      // Widen v3f16 to legal type
      EquivLoadVT =
          EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
                           LoadVT.getVectorNumElements() + 1);

  // Change from v4f16/v2f16 to EquivLoadVT.
  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);

  SDValue Load = DAG.getMemIntrinsicNode(
      IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
      M->getMemoryVT(), M->getMemOperand());

  SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);

  return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);

SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
                                             ArrayRef<SDValue> Ops) const {
  EVT LoadVT = M->getValueType(0);
  EVT EltType = LoadVT.getScalarType();
  EVT IntVT = LoadVT.changeTypeToInteger();

  bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);

  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;

  unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
                                   : AMDGPUISD::BUFFER_LOAD_FORMAT)
                          : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
                                  : AMDGPUISD::BUFFER_LOAD;

    return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);

  // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
  if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),

  if (isTypeLegal(LoadVT)) {
    return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);

  EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
  SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
                                        M->getMemOperand(), DAG);
  return DAG.getMergeValues(
      {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},

static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
                                  SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
    return DAG.getUNDEF(VT);

  ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);

  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);

  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp =
        ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
    RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);

  ISD::CondCode CCOpcode = getICmpCondCode(IcInput);

  unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
  EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);

  SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
                              DAG.getCondCode(CCOpcode));
  if (VT.bitsEq(CCVT))

  return DAG.getZExtOrTrunc(SetCC, DL, VT);

static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
                                  SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  unsigned CondCode = N->getConstantOperandVal(3);
  if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
    return DAG.getUNDEF(VT);

  SDValue Src0 = N->getOperand(1);
  SDValue Src1 = N->getOperand(2);
  EVT CmpVT = Src0.getValueType();

  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
    Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
    Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);

  FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
  ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
  unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
  EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
  SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
                              DAG.getCondCode(CCOpcode));
  if (VT.bitsEq(CCVT))

  return DAG.getZExtOrTrunc(SetCC, SL, VT);

static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
                                    SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(1);

  if (Src.getOpcode() == ISD::SETCC) {
    // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
    return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
                       Src.getOperand(1), Src.getOperand(2));

  if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
      return DAG.getConstant(0, SL, VT);

    // (ballot 1) -> EXEC/EXEC_LO
    if (VT.getScalarSizeInBits() == 32)
      Exec = AMDGPU::EXEC_LO;
    else if (VT.getScalarSizeInBits() == 64)
      Exec = AMDGPU::EXEC;

    return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);

  // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
  return DAG.getNode(
      AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
      DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));

static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
                           SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  unsigned ValSize = VT.getSizeInBits();
  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;

  MVT IntVT = MVT::getIntegerVT(ValSize);
  const GCNSubtarget *ST = TLI.getSubtarget();
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&
      AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))

  auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
                                          SDValue Src2, MVT ValT) -> SDValue {
    SmallVector<SDValue, 8> Operands;
    switch (IID) {
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16:
    case Intrinsic::amdgcn_update_dpp:
      Operands.push_back(N->getOperand(6));
      Operands.push_back(N->getOperand(5));
      Operands.push_back(N->getOperand(4));
    case Intrinsic::amdgcn_writelane:
      Operands.push_back(Src2);
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
    case Intrinsic::amdgcn_mov_dpp8:
      Operands.push_back(Src1);
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      Operands.push_back(Src0);
      llvm_unreachable("unhandled lane op");

    Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
    std::reverse(Operands.begin(), Operands.end());

    if (SDNode *GL = N->getGluedNode()) {
      assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
      GL = GL->getOperand(0).getNode();
      Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,

    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);

  SDValue Src0 = N->getOperand(1);
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);

  if (ValSize == SplitSize) {
    bool IsFloat = VT.isFloatingPoint();
    Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
      Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,

    if (IID == Intrinsic::amdgcn_writelane) {
      Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,

    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;

  if (ValSize % SplitSize != 0)

  auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
    EVT VT = N->getValueType(0);
    unsigned NE = VT.getVectorNumElements();
    EVT EltVT = VT.getVectorElementType();
    SmallVector<SDValue, 8> Scalars;
    unsigned NumOperands = N->getNumOperands();
    SmallVector<SDValue, 4> Operands(NumOperands);
    SDNode *GL = N->getGluedNode();

    // Only handle convergencectrl_glue.
    assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);

    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
        SDValue Operand = N->getOperand(j);
        EVT OperandVT = Operand.getValueType();
        if (OperandVT.isVector()) {
          // A vector operand; extract a single element.
          EVT OperandEltVT = OperandVT.getVectorElementType();
          Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
                                    Operand, DAG.getVectorIdxConstant(i, SL));
          // A scalar operand; just use it as is.
          Operands[j] = Operand;

        Operands[NumOperands - 1] =
            DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
                        SDValue(GL->getOperand(0).getNode(), 0));

      Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));

    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
    return DAG.getBuildVector(VecVT, SL, Scalars);

  if (VT.isVector()) {
    switch (MVT::SimpleValueType EltTy =
                VT.getVectorElementType().getSimpleVT().SimpleTy) {
      if (SplitSize == 32) {
        SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
        return unrollLaneOp(LaneOp.getNode());

      unsigned SubVecNumElt =
          SplitSize / VT.getVectorElementType().getSizeInBits();
      MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
      SmallVector<SDValue, 4> Pieces;
      SDValue Src0SubVec, Src1SubVec, Src2SubVec;
      for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
        Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
                                 DAG.getConstant(EltIdx, SL, MVT::i32));

        if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
          Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
                                   DAG.getConstant(EltIdx, SL, MVT::i32));

        if (IID == Intrinsic::amdgcn_writelane)
          Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
                                   DAG.getConstant(EltIdx, SL, MVT::i32));

        Pieces.push_back(
            IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
                ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
                : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
        EltIdx += SubVecNumElt;

      return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);

  // Handle all other cases by bitcasting to i32 vectors.
  MVT VecVT =
      MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
  Src0 = DAG.getBitcast(VecVT, Src0);

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1 = DAG.getBitcast(VecVT, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2 = DAG.getBitcast(VecVT, Src2);

  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
  SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
  return DAG.getBitcast(VT, UnrolledLaneOp);

void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                          SmallVectorImpl<SDValue> &Results,
                                          SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::INSERT_VECTOR_ELT: {
    if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
      Results.push_back(Res);
  case ISD::EXTRACT_VECTOR_ELT: {
    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
      Results.push_back(Res);
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = N->getConstantOperandVal(0);
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
    case Intrinsic::amdgcn_cvt_pkrtz: {
      SDValue Src0 = N->getOperand(1);
      SDValue Src1 = N->getOperand(2);
      SDValue Cvt =
          DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      SDValue Src0 = N->getOperand(1);
      SDValue Src1 = N->getOperand(2);
      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
        Opcode = AMDGPUISD::CVT_PK_I16_I32;
        Opcode = AMDGPUISD::CVT_PK_U16_U32;

      EVT VT = N->getValueType(0);
      if (isTypeLegal(VT))
        Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
        SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
        Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
    case Intrinsic::amdgcn_s_buffer_load: {
      // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we
      // generate s_buffer_load_u8 for signed and unsigned load instructions.
      // Next, the DAG combiner tries to merge the s_buffer_load_u8 with a
      // sext instruction (performSignExtendInRegCombine()) and replaces
      // s_buffer_load_u8 with s_buffer_load_i8.
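      //
      // Roughly: with a uniform offset this becomes an SBUFFER_LOAD_UBYTE
      // returning i32, which is truncated back to i8; with a divergent offset
      // it falls back to a VMEM buffer load via handleByteShortBufferLoads.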
      if (!Subtarget->hasScalarSubwordLoads())
        return;
      SDValue Op = SDValue(N, 0);
      SDValue Rsrc = Op.getOperand(1);
      SDValue Offset = Op.getOperand(2);
      SDValue CachePolicy = Op.getOperand(3);
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
      SDLoc DL(Op);
      MachineFunction &MF = DAG.getMachineFunction();
      const DataLayout &DataLayout = DAG.getDataLayout();
      Align Alignment =
          DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
      MachineMemOperand *MMO = MF.getMachineMemOperand(
          MachinePointerInfo(),
          MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
              MachineMemOperand::MOInvariant,
          VT.getStoreSize(), Alignment);
      SDValue LoadVal;
      if (!Offset->isDivergent()) {
        SDValue Ops[] = {Rsrc, // source register
                         Offset, CachePolicy};
        SDValue BufferLoad =
            DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
                                    DAG.getVTList(MVT::i32), Ops, VT, MMO);
        LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
      } else {
        SDValue Ops[] = {
            DAG.getEntryNode(),                    // Chain
            Rsrc,                                  // rsrc
            DAG.getConstant(0, DL, MVT::i32),      // vindex
            {},                                    // voffset
            {},                                    // soffset
            {},                                    // offset
            CachePolicy,                           // cachepolicy
            DAG.getTargetConstant(0, DL, MVT::i1), // idxen
        };
        setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
        LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
      }
      Results.push_back(LoadVal);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
      if (Res.getOpcode() == ISD::MERGE_VALUES) {
        // Split up the merged result into its legal parts.
        for (unsigned I = 0; I < Res.getNumOperands(); I++) {
          Results.push_back(Res.getOperand(I));
        }
      } else {
        Results.push_back(Res);
        Results.push_back(Res.getValue(1));
      }
      return;
    }

    break;
  }
  case ISD::SELECT: {
    SDLoc SL(N);
    EVT VT = N->getValueType(0);
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
    SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));

    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
      RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
      SelectVT = MVT::i32;
    }

    SDValue NewSelect =
        DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);

    if (NewVT != SelectVT)
      NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
    return;
  }
  case ISD::FNEG: {
    if (N->getValueType(0) != MVT::v2f16)
      break;

    SDLoc SL(N);
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

    SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
                             DAG.getConstant(0x80008000, SL, MVT::i32));
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    return;
  }
  case ISD::FABS: {
    if (N->getValueType(0) != MVT::v2f16)
      break;

    SDLoc SL(N);
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

    SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
                             DAG.getConstant(0x7fff7fff, SL, MVT::i32));
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    return;
  }
  case ISD::FSQRT: {
    if (N->getValueType(0) != MVT::f16)
      break;
    Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
    break;
  }
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    break;
  }
}
/// Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {
  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return nullptr;
}

unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    switch (Intr->getConstantOperandVal(1)) {
    case Intrinsic::amdgcn_if:
      return AMDGPUISD::IF;
    case Intrinsic::amdgcn_else:
      return AMDGPUISD::ELSE;
    case Intrinsic::amdgcn_loop:
      return AMDGPUISD::LOOP;
    case Intrinsic::amdgcn_end_cf:
      llvm_unreachable("should not occur");
    default:
      break;
    }
  }

  // break, if_break, else_break are all only used as inputs to loop, not
  // directly as branch conditions.
  return 0;
}

bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
  const Triple &TT = getTargetMachine().getTargetTriple();
  return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         AMDGPU::shouldEmitConstantsToTextSection(TT);
}

bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
    return false;

  // FIXME: Either avoid relying on address space here or change the default
  // address space for functions to avoid the explicit check.
  return (GV->getValueType()->isFunctionTy() ||
          !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
         !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
}

bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
}

bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
  if (!GV->hasExternalLinkage())
    return true;

  const auto OS = getTargetMachine().getTargetTriple().getOS();
  return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
}
/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arise
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = nullptr;
  SDNode *SetCC = nullptr;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    SetCC = Intr;
    Intr = SetCC->getOperand(0).getNode();

  } else {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    assert(BR && "brcond missing unconditional branch user");
    Target = BR->getOperand(1);
  }

  unsigned CFNode = isCFIntrinsic(Intr);
  if (CFNode == 0) {
    // This is a uniform branch so we don't need to legalize.
    return BRCOND;
  }

  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;

  assert(!SetCC ||
         (SetCC->getConstantOperandVal(1) == 1 &&
          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
              ISD::SETNE));

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  if (HaveChain)
    Ops.push_back(BRCOND.getOperand(0));

  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
  Ops.push_back(Target);

  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();

  if (!HaveChain) {
    SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};

    Result = DAG.getMergeValues(Ops, DL).getNode();
  }

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
                             SDValue(Result, i - 1), SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
                                Intr->getOperand(0));

  return Chain;
}
SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  // Checking the depth
  if (Op.getConstantOperandVal(0) != 0)
    return DAG.getConstant(0, DL, VT);

  MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // Check for kernel and shader functions
  if (Info->isEntryFunction())
    return DAG.getConstant(0, DL, VT);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  // Get the return address reg and mark it as an implicit live-in
  Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                              getRegClassFor(VT, Op.getNode()->isDivergent()));

  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}

SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
                                            const SDLoc &DL, EVT VT) const {
  return Op.getValueType().bitsLE(VT)
             ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
             : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
                           DAG.getTargetConstant(0, DL, MVT::i32));
}

SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");

  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)
    return Op;

  // TODO: Handle strictfp
  if (Op.getOpcode() != ISD::FP_ROUND)
    return Op;

  SDLoc DL(Op);

  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
}

SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
                                               SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;

  // FIXME: Assert during selection that this is only selected for
  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
  // mode functions, but this happens to be OK since it's only done in cases
  // where there is known no sNaN.
  if (IsIEEEMode)
    return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);

  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
      VT == MVT::v16bf16)
    return splitBinaryVectorOp(Op, DAG);

  return Op;
}
SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
  EVT VT = Op.getValueType();
  assert(VT == MVT::f16);

  SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)
    return Op;

  SDLoc DL(Op);

  // Correct the exponent type for f16 to i16.
  // Clamp the range of the exponent to the instruction's range.

  // TODO: This should be a generic narrowing legalization, and can easily be
  // for GlobalISel.
  SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
  SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);

  SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
  SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);

  SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);

  if (IsStrict) {
    return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
                       {Op.getOperand(0), Op.getOperand(1), TruncExp});
  }

  return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
}

static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
  switch (Op->getOpcode()) {
  case ISD::SRA:
  case ISD::SMIN:
  case ISD::SMAX:
    return ISD::SIGN_EXTEND;
  case ISD::SRL:
  case ISD::UMIN:
  case ISD::UMAX:
    return ISD::ZERO_EXTEND;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SELECT:
  case ISD::MUL:
    // operation result won't be influenced by garbage high bits.
    // TODO: are all of those cases correct, and are there more?
    return ISD::ANY_EXTEND;
  case ISD::SETCC: {
    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
    return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
  }
  default:
    llvm_unreachable("unexpected opcode!");
  }
}
SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
                                                DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();
  assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
         Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
         Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
         Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
         Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);

  EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
                                 : Op->getOperand(0).getValueType();
  auto ExtTy = OpTy.changeElementType(MVT::i32);

  if (DCI.isBeforeLegalizeOps() ||
      isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
    return SDValue();

  auto &DAG = DCI.DAG;

  SDLoc DL(Op);
  SDValue LHS;
  SDValue RHS;
  if (Opc == ISD::SELECT) {
    LHS = Op->getOperand(1);
    RHS = Op->getOperand(2);
  } else {
    LHS = Op->getOperand(0);
    RHS = Op->getOperand(1);
  }

  const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
  LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});

  // Special case: for shifts, the RHS always needs a zext.
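  // (Clarifying note, not in the original comment: the shift amount must keep
  // its numeric value in the wider type; an any-extend could leave garbage in
  // the high bits and make the amount appear out of range for the 32-bit op.)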
  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
    RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
  else
    RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});

  // setcc always return i1/i1 vec so no need to truncate after.
  if (Opc == ISD::SETCC) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
    return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
  }

  // For other ops, we extend the operation's return type as well so we need to
  // truncate back to the original type.
  SDValue NewVal;
  if (Opc == ISD::SELECT)
    NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
  else
    NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});

  return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
}
// Custom lowering for vector multiplications and s_mul_u64.
SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // Split vector operands.
  if (VT.isVector())
    return splitBinaryVectorOp(Op, DAG);

  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");

  // There are four ways to lower s_mul_u64:
  //
  // 1. If all the operands are uniform, then we lower it as it is.
  //
  // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
  //    multiplications because there is not a vector equivalent of s_mul_u64.
  //
  // 3. If the cost model decides that it is more efficient to use vector
  //    registers, then we have to split s_mul_u64 in 32-bit multiplications.
  //    This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
  //
  // 4. If the cost model decides to use vector registers and both of the
  //    operands are zero-extended/sign-extended from 32-bits, then we split the
  //    s_mul_u64 in two 32-bit multiplications. The problem is that it is not
  //    possible to check if the operands are zero-extended or sign-extended in
  //    SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
  //    s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
  //    s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
  //    If the cost model decides that we have to use vector registers, then
  //    splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
  //    s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
  //    decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
  //    s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
  //    SIInstrInfo.cpp .
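  //
  // (Illustrative note added here, not part of the original comment: the code
  // below treats an operand as "zero-extended from 32 bits" when known-bits
  // analysis proves at least 32 leading zero bits, and as "sign-extended"
  // when ComputeNumSignBits reports 33 or more sign bits, i.e. the value
  // fits in a signed 32-bit integer.)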
  if (Op->isDivergent())
    return SDValue();

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
  // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
  // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
  KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
  unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
  KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
  unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
  SDLoc SL(Op);
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
  unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
  // If all the operands are uniform, then we lower s_mul_u64 as it is.
  return Op;
}
SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  bool isSigned = Op.getOpcode() == ISD::SMULO;

  if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
    const APInt &C = RHSC->getAPIntValue();
    // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
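    // (Illustrative example, not part of the original comment: for i8
    // umulo(200, 4), the shifted result is 0x20 after wrapping, and
    // 0x20 >> 2 = 8 != 200, so the overflow flag is set.)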
    if (C.isPowerOf2()) {
      // smulo(x, signed_min) is same as umulo(x, signed_min).
      bool UseArithShift = isSigned && !C.isMinSignedValue();
      SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
      SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
      SDValue Overflow =
          DAG.getSetCC(SL, MVT::i1,
                       DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
                                   Result, ShiftAmt),
                       LHS, ISD::SETNE);
      return DAG.getMergeValues({Result, Overflow}, SL);
    }
  }

  SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
  SDValue Top =
      DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);

  SDValue Sign = isSigned
                     ? DAG.getNode(ISD::SRA, SL, VT, Result,
                                   DAG.getConstant(VT.getScalarSizeInBits() - 1,
                                                   SL, MVT::i32))
                     : DAG.getConstant(0, SL, VT);
  SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);

  return DAG.getMergeValues({Result, Overflow}, SL);
}

SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
  if (Op->isDivergent()) {
    // Select to V_MAD_[IU]64_[IU]32.
    return Op;
  }
  if (Subtarget->hasSMulHi()) {
    // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
    return SDValue();
  }
  // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
  // calculate the high part, so we might as well do the whole thing with
  // V_MAD_[IU]64_[IU]32.
  return Op;
}
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
  if (!Subtarget->isTrapHandlerEnabled() ||
      Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
    return lowerTrapEndpgm(Op, DAG);

  return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
                                            : lowerTrapHsaQueuePtr(Op, DAG);
}

SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);
  return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
}

SDValue
SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             const SDLoc &DL, Align Alignment,
                                             ImplicitParameter Param) const {
  MachineFunction &MF = DAG.getMachineFunction();
  uint64_t Offset = getImplicitParameterOffset(MF, Param);
  SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}

SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);

  SDValue QueuePtr;
  // For code object version 5, QueuePtr is passed through implicit kernarg.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
    QueuePtr =
        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
  } else {
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    Register UserSGPR = Info->getQueuePtrUserSGPR();

    if (UserSGPR == AMDGPU::NoRegister) {
      // We probably are in a function incorrectly marked with
      // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
      // trap, so just use a null pointer.
      QueuePtr = DAG.getConstant(0, SL, MVT::i64);
    } else {
      QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
                                      MVT::i64);
    }
  }

  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());

  uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
  SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
                   ToReg.getValue(1)};
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}

SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);

  // We need to simulate the 's_trap 2' instruction on targets that run in
  // PRIV=1 (where it is treated as a nop).
  if (Subtarget->hasPrivEnabledTrap2NopBug())
    return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);

  uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
  SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}

SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);
  MachineFunction &MF = DAG.getMachineFunction();

  if (!Subtarget->isTrapHandlerEnabled() ||
      Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
    DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
                                     "debugtrap handler not supported",
                                     Op.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = MF.getFunction().getContext();
    Ctx.diagnose(NoTrap);
    return Chain;
  }

  uint64_t TrapID =
      static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
  SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {
  if (Subtarget->hasApertureRegs()) {
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    // Note: this feature (register) is broken. When used as a 32-bit operand,
    // it returns a wrong value (all zeroes?). The real value is in the upper 32
    // bits.
    //
    // To work around the issue, directly emit a 64 bit mov from this register
    // then extract the high bits. Note that this shouldn't even result in a
    // shift being emitted and simply become a pair of registers (e.g.):
    //    s_mov_b64 s[6:7], src_shared_base
    //    v_mov_b32_e32 v1, s7
    //
    // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
    // coalescing would kick in and it would think it's okay to use the "HI"
    // subregister directly (instead of extracting the HI 32 bits) which is an
    // artificial (unusable) register.
    //  Register TableGen definitions would need an overhaul to get rid of the
    //  artificial "HI" aperture registers and prevent this kind of issue from
    //  happening.
    SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
                                     DAG.getRegister(ApertureRegNo, MVT::i64));
    return DAG.getNode(
        ISD::TRUNCATE, DL, MVT::i32,
        DAG.getNode(ISD::SRL, DL, MVT::i64,
                    {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
  }

  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
    ImplicitParameter Param =
        (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register UserSGPR = Info->getQueuePtrUserSGPR();
  if (UserSGPR == AMDGPU::NoRegister) {
    // We probably are in a function incorrectly marked with
    // amdgpu-no-queue-ptr. This is undefined.
    return DAG.getUNDEF(MVT::i32);
  }

  SDValue QueuePtr =
      CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr =
      DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
                     commonAlignment(Align(64), StructOffset),
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
/// Return true if the value is a known valid address, such that a null check is
/// not necessary.
static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
                           const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
  if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
      isa<BasicBlockSDNode>(Val))
    return true;

  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);

  // TODO: Search through arithmetic, handle arguments and loads
  return false;
}

SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);

  const AMDGPUTargetMachine &TM =
      static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

  unsigned DestAS, SrcAS;
  SDValue Src;
  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
  } else {
    assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
    IsNonNull = true;
  }

  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

  // flat -> local/private
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
        return Ptr;

      unsigned NullVal = TM.getNullPointerValue(DestAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);

      return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
                         SegmentNullPtr);
    }
  }

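  // (Illustrative note, not from the original comments: the select above
  // preserves the null pointer across the cast, e.g. casting a flat 0 to the
  // private address space yields the segment's null value from
  // getNullPointerValue() rather than a plain truncated 0, unless the operand
  // is already known to be non-null.)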
  // local/private -> flat
  if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
    if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {

      SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
      SDValue CvtPtr =
          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
      CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);

      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
        return CvtPtr;

      unsigned NullVal = TM.getNullPointerValue(SrcAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);

      SDValue NonNull =
          DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
                         FlatNullPtr);
    }
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Op.getValueType() == MVT::i64) {
    const SIMachineFunctionInfo *Info =
        DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
    SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
    SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Src.getValueType() == MVT::i64)
    return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

  // global <-> flat are no-ops and never emitted.

  const MachineFunction &MF = DAG.getMachineFunction();
  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
      MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
  DAG.getContext()->diagnose(InvalidAddrSpaceCast);

  return DAG.getUNDEF(Op->getValueType(0));
}
// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
// the small vector and inserting them into the big vector. That is better than
// the default expansion of doing it via a stack slot. Even though the use of
// the stack slot would be optimized away afterwards, the stack slot itself
// remains.
SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDValue Ins = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT InsVT = Ins.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned InsNumElts = InsVT.getVectorNumElements();
  unsigned IdxVal = Idx->getAsZExtVal();
  SDLoc SL(Op);

  if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
    // Insert 32-bit registers at a time.
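    // (Illustrative note, not in the original comment: e.g. inserting a v2i16
    // subvector at index 2 of a v4i16 vector is rewritten as inserting a
    // single i32 element at index 1 of the v2i32-bitcast vector.)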
    assert(InsNumElts % 2 == 0 && "expect legal vector types");

    unsigned VecNumElts = VecVT.getVectorNumElements();
    EVT NewVecVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                   : EVT::getVectorVT(*DAG.getContext(),
                                                      MVT::i32, InsNumElts / 2);

    Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
    Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);

    for (unsigned I = 0; I != InsNumElts / 2; ++I) {
      SDValue Elt;
      if (InsNumElts == 2) {
        Elt = Ins;
      } else {
        Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
                          DAG.getConstant(I, SL, MVT::i32));
      }
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
                        DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
    }

    return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
  }

  for (unsigned I = 0; I != InsNumElts; ++I) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
                              DAG.getConstant(I, SL, MVT::i32));
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
                      DAG.getConstant(IdxVal + I, SL, MVT::i32));
  }

  return Vec;
}
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDValue InsVal = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();
  SDLoc SL(Op);

  // Specially handle the case of v4i16 with static indexing.
  unsigned NumElts = VecVT.getVectorNumElements();
  auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {
    SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);

    SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(0, SL, MVT::i32));
    SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(1, SL, MVT::i32));

    SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
    SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
    SDValue InsHalf = DAG.getNode(
        ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
        DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
        DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));

    InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);

    SDValue Concat =
        InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
                 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});

    return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
  }

  // Static indexing does not lower to stack access, and hence there is no need
  // for special custom lowering to avoid stack access.
  if (isa<ConstantSDNode>(Idx))
    return SDValue();

  // Avoid stack access for dynamic indexing by custom lowering to
  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
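  //
  // (Illustrative sketch added alongside the original comment: for a v4i16
  // vector and a runtime index Idx, the element mask is 0xffff << (Idx * 16);
  // the new value is splatted, ANDed with that mask, and ORed with the old
  // vector ANDed with the inverted mask.)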
  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // Convert vector index to bit-index and get the required bit mask.
  assert(isPowerOf2_32(EltSize));
  const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
                            DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);

  // 1. Create a congruent vector with the target value in each element.
  SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
                               DAG.getSplatBuildVector(VecVT, SL, InsVal));

  // 2. Mask off all other indices except the required index within (1).
  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);

  // 3. Mask off the required index within the target vector.
  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue RHS =
      DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);

  // 4. Get (2) and (3) ORed into the target vector.
  SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);

  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);

  EVT ResultVT = Op.getValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  EVT VecVT = Vec.getValueType();
  unsigned VecSize = VecVT.getSizeInBits();
  EVT EltVT = VecVT.getVectorElementType();

  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.

  // XXX - Why doesn't this get called when vector_shuffle is expanded?
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    return Combined;

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    SDValue Lo, Hi;
    auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);

    if (VecSize == 128) {
      SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(0, SL, MVT::i32)));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(1, SL, MVT::i32)));
    } else if (VecSize == 256) {
      SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
      SDValue Parts[4];
      for (unsigned P = 0; P < 4; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[0], Parts[1]));
      Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[2], Parts[3]));
    } else {
      assert(VecSize == 512);

      SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
      SDValue Parts[8];
      for (unsigned P = 0; P < 8; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[0], Parts[1], Parts[2], Parts[3]));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[4], Parts[5], Parts[6], Parts[7]));
    }

    EVT IdxVT = Idx.getValueType();
    unsigned NElem = VecVT.getVectorNumElements();
    assert(isPowerOf2_32(NElem));
    SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
    SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
    SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
  }

  assert(VecSize <= 64);

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
  SDValue VecBC = peekThroughBitcasts(Vec);
  if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    SDValue Src = VecBC.getOperand(0);
    Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
    Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
  }

  unsigned EltSize = EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltSize));

  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);

  // Convert vector index to bit-index (* EltSize)
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
  }

  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}
static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
  assert(Elt % 2 == 0);
  return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
}

SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT ResultVT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);

  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
  EVT EltVT = PackVT.getVectorElementType();
  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

  // vector_shuffle <0,1,6,7> lhs, rhs
  // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
  //
  // vector_shuffle <6,7,2,3> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
  //
  // vector_shuffle <6,7,0,1> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)

  // Avoid scalarizing when both halves are reading from consecutive elements.
  SmallVector<SDValue, 4> Pieces;
  for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
    if (elementPairIsContiguous(SVN->getMask(), I)) {
      const int Idx = SVN->getMaskElt(I);
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
                                   SVN->getOperand(VecIdx),
                                   DAG.getConstant(EltIdx, SL, MVT::i32));
      Pieces.push_back(SubVec);
    } else {
      const int Idx0 = SVN->getMaskElt(I);
      const int Idx1 = SVN->getMaskElt(I + 1);
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

      SDValue Vec0 = SVN->getOperand(VecIdx0);
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
                                 DAG.getConstant(EltIdx0, SL, MVT::i32));

      SDValue Vec1 = SVN->getOperand(VecIdx1);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
                                 DAG.getConstant(EltIdx1, SL, MVT::i32));
      Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
    }
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
}
SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue SVal = Op.getOperand(0);
  EVT ResultVT = Op.getValueType();
  EVT SValVT = SVal.getValueType();
  SDValue UndefVal = DAG.getUNDEF(SValVT);
  SDLoc SL(Op);

  SmallVector<SDValue, 8> VElts;
  VElts.push_back(SVal);
  for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
    VElts.push_back(UndefVal);

  return DAG.getBuildVector(ResultVT, SL, VElts);
}

SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
    assert(!Subtarget->hasVOP3PInsts() && "this should be legal");

    SDValue Lo = Op.getOperand(0);
    SDValue Hi = Op.getOperand(1);

    // Avoid adding defined bits with the zero_extend.
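    // (Illustrative note, not part of the original comment: on subtargets
    // without packed 16-bit instructions, build_vector(Lo, Hi) is materialized
    // as bitcast(or(zext(Lo), shl(zext(Hi), 16))); when one half is undef the
    // corresponding extend below is skipped so its bits are not forced to 0.)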
    if (Hi.isUndef()) {
      Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
      SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
      return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
    }

    Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
    Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);

    SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
                                DAG.getConstant(16, SL, MVT::i32));
    if (Lo.isUndef())
      return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);

    Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
    Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);

    SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
    return DAG.getNode(ISD::BITCAST, SL, VT, Or);
  }

  // Split into 2-element chunks.
  const unsigned NumParts = VT.getVectorNumElements() / 2;
  EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
  MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());

  SmallVector<SDValue> Casts;
  for (unsigned P = 0; P < NumParts; ++P) {
    SDValue Vec = DAG.getBuildVector(
        PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
    Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
  }

  SDValue Blend =
      DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
bool SITargetLowering::isOffsetFoldingLegal(
    const GlobalAddressSDNode *GA) const {
  // OSes that use ELF REL relocations (instead of RELA) can only store a
  // 32-bit addend in the instruction, so it is not safe to allow offset folding
  // which can create arbitrary 64-bit addends. (This is only a problem for
  // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
  // the high 32 bits of the addend.)
  //
  // This should be kept in sync with how HasRelocationAddend is initialized in
  // the constructor of ELFAMDGPUAsmBackend.
  if (!Subtarget->isAmdHsaOS())
    return false;

  // We can fold offsets for anything that doesn't require a GOT relocation.
  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         !shouldEmitGOTReloc(GA->getGlobal());
}

static SDValue
buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
                        const SDLoc &DL, int64_t Offset, EVT PtrVT,
                        unsigned GAFlags = SIInstrInfo::MO_NONE) {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
  SDValue PtrHi;
  if (GAFlags == SIInstrInfo::MO_NONE)
    PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
  else
    PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
  SDLoc DL(GSD);
  EVT PtrVT = Op.getValueType();

  const GlobalValue *GV = GSD->getGlobal();
  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
       shouldUseLDSConstAddress(GV)) ||
      GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
    if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
        GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or similar
      // zero-sized type in other languages to declare the dynamic shared
      // memory which size is not known at the compile time. They will be
      // allocated by the runtime and placed directly after the static
      // allocated ones. They all share the same offset.
      if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
        // Adjust alignment for that dynamic shared memory array.
        Function &F = DAG.getMachineFunction().getFunction();
        MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
        MFI->setUsesDynamicLDS(true);
        return SDValue(
            DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
      }
    }
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
  }

  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
                                            SIInstrInfo::MO_ABS32_LO);
    return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
  }

  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
    SDValue AddrLo = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
    AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};

    SDValue AddrHi = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
    AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};

    return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
  }

  if (shouldEmitFixup(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);

  if (shouldEmitPCReloc(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
                                   SIInstrInfo::MO_REL32);

  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
                                            SIInstrInfo::MO_GOTPCREL32);

  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  const DataLayout &DataLayout = DAG.getDataLayout();
  Align Alignment = DataLayout.getABITypeAlign(PtrTy);
  MachinePointerInfo PtrInfo =
      MachinePointerInfo::getGOT(DAG.getMachineFunction());

  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
                                   const SDLoc &DL, SDValue V) const {
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
  // the destination register.
  //
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
  // so we will end up with redundant moves to m0.
  //
  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.

  // A Null SDValue creates a glue result.
  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
                                  V, Chain);
  return SDValue(M0, 0);
}

SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
                                                 MVT VT,
                                                 unsigned Offset) const {
  SDLoc SL(Op);
  SDValue Param = lowerKernargMemParameter(
      DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
  // The local size values will have the hi 16-bits as zero.
  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                     DAG.getValueType(VT));
}

static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                        EVT VT) {
  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
                                      "non-hsa intrinsic with hsa target",
                                      DL.getDebugLoc());
  DAG.getContext()->diagnose(BadIntrin);
  return DAG.getUNDEF(VT);
}

static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                         EVT VT) {
  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
                                      "intrinsic not supported on subtarget",
                                      DL.getDebugLoc());
  DAG.getContext()->diagnose(BadIntrin);
  return DAG.getUNDEF(VT);
}
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
                                    ArrayRef<SDValue> Elts) {
  assert(!Elts.empty());
  MVT Type;
  unsigned NumElts = Elts.size();

  if (NumElts <= 12) {
    Type = MVT::getVectorVT(MVT::f32, NumElts);
  } else {
    assert(Elts.size() <= 16);
    Type = MVT::v16f32;
    NumElts = 16;
  }

  SmallVector<SDValue, 16> VecElts(NumElts);
  for (unsigned i = 0; i < Elts.size(); ++i) {
    SDValue Elt = Elts[i];
    if (Elt.getValueType() != MVT::f32)
      Elt = DAG.getBitcast(MVT::f32, Elt);
    VecElts[i] = Elt;
  }
  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getUNDEF(MVT::f32);

  if (NumElts == 1)
    return VecElts[0];
  return DAG.getBuildVector(Type, DL, VecElts);
}

static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
                              SDValue Src, int ExtraElts) {
  EVT SrcVT = Src.getValueType();

  SmallVector<SDValue, 8> Elts;

  if (SrcVT.isVector())
    DAG.ExtractVectorElements(Src, Elts);
  else
    Elts.push_back(Src);

  SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
  while (ExtraElts--)
    Elts.push_back(Undef);

  return DAG.getBuildVector(CastVT, DL, Elts);
}
// Re-construct the required return value for a image load intrinsic.
// This is more complicated due to the optional use TexFailCtrl which means the
// required return type is an aggregate
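// (Clarifying note, not part of the original comment: when TFE/LWE is
// requested the hardware writes one extra status dword after the image data,
// so the merged return value below becomes {data, texfail status, chain}
// instead of just {data, chain}.)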
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
                                 ArrayRef<EVT> ResultTypes, bool IsTexFail,
                                 bool Unpacked, bool IsD16, int DMaskPop,
                                 int NumVDataDwords, bool IsAtomicPacked16Bit,
                                 const SDLoc &DL) {
  // Determine the required return type. This is the same regardless of
  // IsTexFail flag
  EVT ReqRetVT = ResultTypes[0];
  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2
                          : ReqRetNumElts;

  int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;

  MVT DataDwordVT =
      NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);

  MVT MaskPopVT =
      MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);

  SDValue Data(Result, 0);
  SDValue TexFail;

  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
    SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
    if (MaskPopVT.isVector()) {
      Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
                         SDValue(Result, 0), ZeroIdx);
    } else {
      Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
                         SDValue(Result, 0), ZeroIdx);
    }
  }

  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
    Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
                          NumDataDwords - MaskPopDwords);

  if (IsD16)
    Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);

  EVT LegalReqRetVT = ReqRetVT;
  if (!ReqRetVT.isVector()) {
    if (!Data.getValueType().isInteger())
      Data = DAG.getNode(ISD::BITCAST, DL,
                         Data.getValueType().changeTypeToInteger(), Data);
    Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
  } else {
    // We need to widen the return vector to a legal type
    if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
        ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
      LegalReqRetVT =
          EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
                           ReqRetVT.getVectorNumElements() + 1);
    }
  }
  Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);

  if (IsTexFail) {
    TexFail =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
                    DAG.getConstant(MaskPopDwords, DL, MVT::i32));

    return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
  }

  if (Result->getNumValues() == 1)
    return Data;

  return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
}
static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
                         SDValue *LWE, bool &IsTexFail) {
  auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());

  uint64_t Value = TexFailCtrlConst->getZExtValue();
  if (Value) {
    IsTexFail = true;
  }

  SDLoc DL(TexFailCtrlConst);
  *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
  Value &= ~(uint64_t)0x1;
  *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
  Value &= ~(uint64_t)0x2;

  return Value == 0;
}

static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
                                      MVT PackVectorVT,
                                      SmallVectorImpl<SDValue> &PackedAddrs,
                                      unsigned DimIdx, unsigned EndIdx,
                                      unsigned NumGradients) {
  SDLoc DL(Op);
  for (unsigned I = DimIdx; I < EndIdx; I++) {
    SDValue Addr = Op.getOperand(I);

    // Gradients are packed with undef for each coordinate.
    // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
    // 1D: undef,dx/dh; undef,dx/dv
    // 2D: dy/dh,dx/dh; dy/dv,dx/dv
    // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
    if (((I + 1) >= EndIdx) ||
        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
                                         I == DimIdx + NumGradients - 1))) {
      if (Addr.getValueType() != MVT::i16)
        Addr = DAG.getBitcast(MVT::i16, Addr);
      Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
    } else {
      Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
      I++;
    }
    Addr = DAG.getBitcast(MVT::f32, Addr);
    PackedAddrs.push_back(Addr);
  }
}
7968 SDValue
SITargetLowering::lowerImage(SDValue Op
,
7969 const AMDGPU::ImageDimIntrinsicInfo
*Intr
,
7970 SelectionDAG
&DAG
, bool WithChain
) const {
7972 MachineFunction
&MF
= DAG
.getMachineFunction();
7973 const GCNSubtarget
*ST
= &MF
.getSubtarget
<GCNSubtarget
>();
7974 const AMDGPU::MIMGBaseOpcodeInfo
*BaseOpcode
=
7975 AMDGPU::getMIMGBaseOpcodeInfo(Intr
->BaseOpcode
);
7976 const AMDGPU::MIMGDimInfo
*DimInfo
= AMDGPU::getMIMGDimInfo(Intr
->Dim
);
7977 unsigned IntrOpcode
= Intr
->BaseOpcode
;
7978 bool IsGFX10Plus
= AMDGPU::isGFX10Plus(*Subtarget
);
7979 bool IsGFX11Plus
= AMDGPU::isGFX11Plus(*Subtarget
);
7980 bool IsGFX12Plus
= AMDGPU::isGFX12Plus(*Subtarget
);
7982 SmallVector
<EVT
, 3> ResultTypes(Op
->values());
7983 SmallVector
<EVT
, 3> OrigResultTypes(Op
->values());
7988 int NumVDataDwords
= 0;
7989 bool AdjustRetType
= false;
7990 bool IsAtomicPacked16Bit
= false;
7992 // Offset of intrinsic arguments
7993 const unsigned ArgOffset
= WithChain
? 2 : 1;
7996 unsigned DMaskLanes
= 0;
7998 if (BaseOpcode
->Atomic
) {
7999 VData
= Op
.getOperand(2);
8001 IsAtomicPacked16Bit
=
8002 (Intr
->BaseOpcode
== AMDGPU::IMAGE_ATOMIC_PK_ADD_F16
||
8003 Intr
->BaseOpcode
== AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16
);
8005 bool Is64Bit
= VData
.getValueSizeInBits() == 64;
8006 if (BaseOpcode
->AtomicX2
) {
8007 SDValue VData2
= Op
.getOperand(3);
8008 VData
= DAG
.getBuildVector(Is64Bit
? MVT::v2i64
: MVT::v2i32
, DL
,
8011 VData
= DAG
.getBitcast(MVT::v4i32
, VData
);
8013 ResultTypes
[0] = Is64Bit
? MVT::v2i64
: MVT::v2i32
;
8014 DMask
= Is64Bit
? 0xf : 0x3;
8015 NumVDataDwords
= Is64Bit
? 4 : 2;
8017 DMask
= Is64Bit
? 0x3 : 0x1;
8018 NumVDataDwords
= Is64Bit
? 2 : 1;
8021 DMask
= Op
->getConstantOperandVal(ArgOffset
+ Intr
->DMaskIndex
);
8022 DMaskLanes
= BaseOpcode
->Gather4
? 4 : llvm::popcount(DMask
);
8024 if (BaseOpcode
->Store
) {
8025 VData
= Op
.getOperand(2);
8027 MVT StoreVT
= VData
.getSimpleValueType();
8028 if (StoreVT
.getScalarType() == MVT::f16
) {
8029 if (!Subtarget
->hasD16Images() || !BaseOpcode
->HasD16
)
8030 return Op
; // D16 is unsupported for this instruction
8033 VData
= handleD16VData(VData
, DAG
, true);
8036 NumVDataDwords
= (VData
.getValueType().getSizeInBits() + 31) / 32;
8037 } else if (!BaseOpcode
->NoReturn
) {
8038 // Work out the num dwords based on the dmask popcount and underlying type
8039 // and whether packing is supported.
8040 MVT LoadVT
= ResultTypes
[0].getSimpleVT();
8041 if (LoadVT
.getScalarType() == MVT::f16
) {
8042 if (!Subtarget
->hasD16Images() || !BaseOpcode
->HasD16
)
8043 return Op
; // D16 is unsupported for this instruction
8048 // Confirm that the return type is large enough for the dmask specified
8049 if ((LoadVT
.isVector() && LoadVT
.getVectorNumElements() < DMaskLanes
) ||
8050 (!LoadVT
.isVector() && DMaskLanes
> 1))
8053 // The sq block of gfx8 and gfx9 do not estimate register use correctly
8054 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8056 if (IsD16
&& !Subtarget
->hasUnpackedD16VMem() &&
8057 !(BaseOpcode
->Gather4
&& Subtarget
->hasImageGather4D16Bug()))
8058 NumVDataDwords
= (DMaskLanes
+ 1) / 2;
8060 NumVDataDwords
= DMaskLanes
;
8062 AdjustRetType
= true;

  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
  SmallVector<SDValue, 4> VAddrs;

  // Check for 16 bit addresses or derivatives and pack if true.
  MVT VAddrVT =
      Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
  MVT VAddrScalarVT = VAddrVT.getScalarType();
  MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

  VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
  VAddrScalarVT = VAddrVT.getScalarType();
  MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

  // Push back extra arguments.
  for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
      assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
      // Special handling of bias when A16 is on. Bias is of type half but
      // occupies full 32-bit.
      SDValue Bias = DAG.getBuildVector(
          MVT::v2f16, DL,
          {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
      VAddrs.push_back(Bias);
    } else {
      assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
             "Bias needs to be converted to 16 bit in A16 mode");
      VAddrs.push_back(Op.getOperand(ArgOffset + I));
    }
  }

  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    LLVM_DEBUG(
        dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
                  "require 16 bit args for both gradients and addresses");
    return Op;
  }

  if (IsA16) {
    if (!ST->hasA16()) {
      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                           "support 16 bit addresses\n");
      return Op;
    }
  }

  // We've dealt with incorrect input so we know that if IsA16, IsG16
  // are set then we have to compress/pack operands (either address,
  // gradient or both)
  // In the case where a16 and gradients are tied (no G16 support) then we
  // have already verified that both IsA16 and IsG16 are true
  if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
    // Activate g16
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
    IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
  }

  // Add gradients (packed or unpacked)
  if (IsG16) {
    // Pack the gradients
    // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
    packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
                              ArgOffset + Intr->GradientStart,
                              ArgOffset + Intr->CoordStart, Intr->NumGradients);
  } else {
    for (unsigned I = ArgOffset + Intr->GradientStart;
         I < ArgOffset + Intr->CoordStart; I++)
      VAddrs.push_back(Op.getOperand(I));
  }

  // Add addresses (packed or unpacked)
  if (IsA16) {
    packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
                              ArgOffset + Intr->CoordStart, VAddrEnd,
                              0 /* No gradients */);
  } else {
    // Add uncompressed address
    for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
      VAddrs.push_back(Op.getOperand(I));
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator that
  // MIMG addresses should be placed contiguously when it is possible to do so,
  // so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  //
  // Partial NSA is allowed on GFX11+ where the final register is a contiguous
  // set of the remaining addresses.
  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
  const bool UseNSA = ST->hasNSAEncoding() &&
                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
  const bool UsePartialNSA =
      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
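  // Illustrative example: with NSAMaxSize = 5 and partial NSA available, a
  // request with 7 address dwords keeps the first NSAMaxSize - 1 = 4 operands
  // as separate NSA registers and merges the remaining 3 into the single
  // contiguous VAddr vector built below.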

  SDValue VAddr;
  if (UsePartialNSA) {
    VAddr = getBuildDwordsVector(DAG, DL,
                                 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
  } else if (!UseNSA) {
    VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
  }

  SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
  SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);

  SDValue Unorm;
  if (!BaseOpcode->Sampler) {
    Unorm = True;
  } else {
    uint64_t UnormConst =
        Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);

    Unorm = UnormConst ? True : False;
  }

  SDValue TFE;
  SDValue LWE;
  SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
  bool IsTexFail = false;
  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
    return Op;

  if (IsTexFail) {
    if (!DMaskLanes) {
      // Expecting to get an error flag since TFC is on - and dmask is 0
      // Force dmask to be at least 1 otherwise the instruction will fail
      DMask = 0x1;
      DMaskLanes = 1;
      NumVDataDwords = 1;
    }
    NumVDataDwords += 1;
    AdjustRetType = true;
  }

  // Has something earlier tagged that the return type needs adjusting
  // This happens if the instruction is a load or has set TexFailCtrl flags
  if (AdjustRetType) {
    // NumVDataDwords reflects the true number of dwords required in the return
    // type
    if (DMaskLanes == 0 && !BaseOpcode->Store) {
      // This is a no-op load. This can be eliminated
      SDValue Undef = DAG.getUNDEF(Op.getValueType());
      if (isa<MemSDNode>(Op))
        return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
      return Undef;
    }

    EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
                                                      MVT::i32, NumVDataDwords)
                                   : MVT::i32;

    ResultTypes[0] = NewVT;
    if (ResultTypes.size() == 3) {
      // Original result was aggregate type used for TexFailCtrl results
      // The actual instruction returns as a vector type which has now been
      // created. Remove the aggregate result.
      ResultTypes.erase(&ResultTypes[1]);
    }
  }

  unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
  if (BaseOpcode->Atomic)
    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
               AMDGPU::CPol::VOLATILE))
    return Op;

  SmallVector<SDValue, 26> Ops;
  if (BaseOpcode->Store || BaseOpcode->Atomic)
    Ops.push_back(VData); // vdata
  if (UsePartialNSA) {
    append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
    Ops.push_back(VAddr);
  } else if (UseNSA) {
    append_range(Ops, VAddrs);
  } else {
    Ops.push_back(VAddr);
  }
  SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
  EVT RsrcVT = Rsrc.getValueType();
  if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
    return Op;
  Ops.push_back(Rsrc);
  if (BaseOpcode->Sampler) {
    SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
    if (Samp.getValueType() != MVT::v4i32)
      return Op;
    Ops.push_back(Samp);
  }
  Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
  if (IsGFX10Plus)
    Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(Unorm);
  Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
  Ops.push_back(IsA16 && // r128, a16 for gfx9
                        ST->hasFeature(AMDGPU::FeatureR128A16)
                    ? True
                    : False);
  if (IsGFX10Plus)
    Ops.push_back(IsA16 ? True : False);
  if (!Subtarget->hasGFX90AInsts()) {
    Ops.push_back(TFE); // tfe
  } else if (TFE->getAsZExtVal()) {
    report_fatal_error("TFE is not supported on this GPU");
  }
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(LWE); // lwe
  if (!IsGFX10Plus)
    Ops.push_back(DimInfo->DA ? True : False);
  if (BaseOpcode->HasD16)
    Ops.push_back(IsD16 ? True : False);
  if (isa<MemSDNode>(Op))
    Ops.push_back(Op.getOperand(0)); // chain

  int NumVAddrDwords =
      UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
  int Opcode = -1;

  if (IsGFX12Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (Subtarget->hasGFX90AInsts()) {
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
                                     NumVDataDwords, NumVAddrDwords);
      if (Opcode == -1)
        report_fatal_error(
            "requested image instruction is not supported on this GPU");
    }
    if (Opcode == -1 &&
        Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  if (Opcode == -1)
    return Op;

  MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
  if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
    MachineMemOperand *MemRef = MemOp->getMemOperand();
    DAG.setNodeMemRefs(NewNode, {MemRef});
  }

  if (BaseOpcode->AtomicX2) {
    SmallVector<SDValue, 1> Elt;
    DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
    return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
  }
  if (BaseOpcode->NoReturn)
    return SDValue(NewNode, 0);
  return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
                           Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
                           NumVDataDwords, IsAtomicPacked16Bit, DL);
}

SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
                                       SDValue Offset, SDValue CachePolicy,
                                       SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  const DataLayout &DataLayout = DAG.getDataLayout();
  Align Alignment =
      DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      VT.getStoreSize(), Alignment);

  if (!Offset->isDivergent()) {
    SDValue Ops[] = {Rsrc, Offset, CachePolicy};

    // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
    // s_buffer_load_u16 instruction is emitted for both signed and unsigned
    // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
    // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
    if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
      SDValue BufferLoad =
          DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
                                  DAG.getVTList(MVT::i32), Ops, VT, MMO);
      return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
    }

    // Widen vec3 load to vec4.
    if (VT.isVector() && VT.getVectorNumElements() == 3 &&
        !Subtarget->hasScalarDwordx3Loads()) {
      EVT WidenedVT =
          EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
      auto WidenedOp = DAG.getMemIntrinsicNode(
          AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
          MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
      auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
                                   DAG.getVectorIdxConstant(0, DL));
      return Subvector;
    }

    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
                                   DAG.getVTList(VT), Ops, VT, MMO);
  }

  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.
  SDValue Ops[] = {
      DAG.getEntryNode(),                    // Chain
      Rsrc,                                  // rsrc
      DAG.getConstant(0, DL, MVT::i32),      // vindex
      {},                                    // voffset
      {},                                    // soffset
      {},                                    // offset
      CachePolicy,                           // cachepolicy
      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
  };
  if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
    setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  }

  SmallVector<SDValue, 4> Loads;
  unsigned NumLoads = 1;
  MVT LoadVT = VT.getSimpleVT();
  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
  assert((LoadVT.getScalarType() == MVT::i32 ||
          LoadVT.getScalarType() == MVT::f32));

  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
    LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
  }
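  // Illustrative example: an s.buffer.load of v8f32 with a divergent offset is
  // lowered as NumLoads = 2 BUFFER_LOAD nodes of v4f32 issued 16 bytes apart
  // and concatenated back together at the end of this function.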

  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  setBufferOffsets(Offset, DAG, &Ops[3],
                   NumLoads > 1 ? Align(16 * NumLoads) : Align(4));

  uint64_t InstOffset = Ops[5]->getAsZExtVal();
  for (unsigned i = 0; i < NumLoads; ++i) {
    Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
    Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
                                        LoadVT, MMO, DAG));
  }

  if (NumElts == 8 || NumElts == 16)
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);

  return Loads[0];
}

SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
  // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
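  // The BFE_U32 below extracts that field: shift right by 25 and keep 5 bits,
  // so the result is the wave index within the workgroup in the range [0, 31].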
  if (!Subtarget->hasArchitectedSGPRs())
    return {};
  SDLoc SL(Op);
  MVT VT = MVT::i32;
  SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
  return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
                     DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
}

SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
                                          unsigned Dim,
                                          const ArgDescriptor &Arg) const {
  SDLoc SL(Op);
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
  if (MaxID == 0)
    return DAG.getConstant(0, SL, MVT::i32);

  SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                               SDLoc(DAG.getEntryNode()), Arg);

  // Don't bother inserting AssertZext for packed IDs since we're emitting the
  // masking operations anyway.
  //
  // TODO: We could assert the top bit is 0 for the source copy.
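  // Illustrative example: if the maximum workitem ID for this dimension is
  // known to be 63, llvm::bit_width(63) = 6 and the AssertZext emitted below
  // records that only the low 6 bits of the copied value can be nonzero.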
  if (Arg.isMasked())
    return Val;

  // Preserve the known bits after expansion to a copy.
  EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
                     DAG.getValueType(SmallVT));
}

SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto *MFI = MF.getInfo<SIMachineFunctionInfo>();

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned IntrinsicID = Op.getConstantOperandVal(0);

  // TODO: Should this propagate fast-math-flags?

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
    if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
      return emitNonHSAIntrinsicError(DAG, DL, VT);
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  }
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
    if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
      DiagnosticInfoUnsupported BadIntrin(
          MF.getFunction(), "unsupported hsa intrinsic without hsa target",
          DL.getDebugLoc());
      DAG.getContext()->diagnose(BadIntrin);
      return DAG.getUNDEF(VT);
    }

    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
                     ? AMDGPUFunctionArgInfo::DISPATCH_PTR
                     : AMDGPUFunctionArgInfo::QUEUE_PTR;
    return getPreloadedValue(DAG, *MFI, VT, RegID);
  }
  case Intrinsic::amdgcn_implicitarg_ptr: {
    if (MFI->isEntryFunction())
      return getImplicitArgPtr(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
    if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      return DAG.getConstant(0, DL, VT);
    }

    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  }
  case Intrinsic::amdgcn_dispatch_id: {
    return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
  }
  case Intrinsic::amdgcn_rcp:
    return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq:
    return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);
    return SDValue();
  case Intrinsic::amdgcn_rcp_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);
    return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));

    Type *Type = VT.getTypeForEVT(*DAG.getContext());
    APFloat Max = APFloat::getLargest(Type->getFltSemantics());
    APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);

    SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
    SDValue Tmp =
        DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
    return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
                       DAG.getConstantFP(Min, DL, VT));
  }
  case Intrinsic::r600_read_ngroups_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_X, Align(4),
                                    false);
  case Intrinsic::r600_read_ngroups_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_Y, Align(4),
                                    false);
  case Intrinsic::r600_read_ngroups_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_Z, Align(4),
                                    false);
  case Intrinsic::r600_read_global_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_X,
                                    Align(4), false);
  case Intrinsic::r600_read_global_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y,
                                    Align(4), false);
  case Intrinsic::r600_read_global_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z,
                                    Align(4), false);
  case Intrinsic::r600_read_local_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_wave_id:
    return lowerWaveID(DAG, Op);
  case Intrinsic::amdgcn_lds_kernel_id: {
    if (MFI->isEntryFunction())
      return getLDSKernelId(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  }
  case Intrinsic::amdgcn_workitem_id_x:
    return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
    return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
    return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
  case Intrinsic::amdgcn_wavefrontsize:
    return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
                           SDLoc(Op), MVT::i32);
  case Intrinsic::amdgcn_s_buffer_load: {
    unsigned CPol = Op.getConstantOperandVal(3);
    // s_buffer_load, because of how it's optimized, can't be volatile
    // so reject ones with the volatile bit set.
    if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
                     ? AMDGPU::CPol::ALL
                     : AMDGPU::CPol::ALL_pregfx12))
      return Op;
    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), DAG);
  }
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_sin:
    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_cos:
    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_mul_u24:
    return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_mul_i24:
    return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));

  case Intrinsic::amdgcn_log_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return SDValue();

    return emitRemovedIntrinsicError(DAG, DL, VT);
  }
  case Intrinsic::amdgcn_fract:
    return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_class:
    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_div_fmas:
    return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));

  case Intrinsic::amdgcn_div_fixup:
    return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));

  case Intrinsic::amdgcn_div_scale: {
    const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));

    // Translate to the operands expected by the machine instruction. The
    // first parameter must be the same as the first instruction.
    SDValue Numerator = Op.getOperand(1);
    SDValue Denominator = Op.getOperand(2);

    // Note this order is opposite of the machine instruction's operations,
    // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    // intrinsic has the numerator as the first operand to match a normal
    // division operation.

    SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;

    return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
                       Denominator, Numerator);
  }
  case Intrinsic::amdgcn_icmp: {
    // There is a Pat that handles this variant, so return it as-is.
    if (Op.getOperand(1).getValueType() == MVT::i1 &&
        Op.getConstantOperandVal(2) == 0 &&
        Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
      return Op;
    return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_fcmp: {
    return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_ballot:
    return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
  case Intrinsic::amdgcn_fmed3:
    return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_fdot2:
    return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
  case Intrinsic::amdgcn_fmul_legacy:
    return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_sffbh:
    return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_sbfe:
    return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_ubfe:
    return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    // FIXME: Stop adding cast if v2f16/v2i16 are legal.
    EVT VT = Op.getValueType();
    unsigned Opcode;

    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
    else
      Opcode = AMDGPUISD::CVT_PK_U16_U32;

    if (isTypeLegal(VT))
      return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));

    SDValue Node =
        DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
    return DAG.getNode(ISD::BITCAST, DL, VT, Node);
  }
  case Intrinsic::amdgcn_fmad_ftz:
    return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));

  case Intrinsic::amdgcn_if_break:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
                                      Op->getOperand(1), Op->getOperand(2)),
                   0);

  case Intrinsic::amdgcn_groupstaticsize: {
    Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return Op;

    const Module *M = MF.getFunction().getParent();
    const GlobalValue *GV =
        Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
                                            SIInstrInfo::MO_ABS32_LO);
    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    SDLoc SL(Op);
    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
                      ? AMDGPUAS::LOCAL_ADDRESS
                      : AMDGPUAS::PRIVATE_ADDRESS;
    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
    SDValue SrcVec =
        DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));

    SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
                                DAG.getConstant(1, SL, MVT::i32));
    return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
  }
  case Intrinsic::amdgcn_perm:
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_reloc_constant: {
    Module *M = const_cast<Module *>(MF.getFunction().getParent());
    const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
    auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
    auto *RelocSymbol = cast<GlobalVariable>(
        M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
    SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
                                            SIInstrInfo::MO_ABS32_LO);
    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
  }
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    if (Op.getOperand(4).getValueType() == MVT::i32)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
                       Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                       Op.getOperand(3), IndexKeyi32);
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    if (Op.getOperand(6).getValueType() == MVT::i32)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
                       {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
                        IndexKeyi32, Op.getOperand(7)});
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return lowerADDRSPACECAST(Op, DAG);
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
    return lowerLaneOp(*this, Op.getNode(), DAG);
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG, false);

    return Op;
  }
}

// On targets not supporting constant in soffset field, turn zero to
// SGPR_NULL to avoid generating an extra s_mov with zero.
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
                             const GCNSubtarget *Subtarget) {
  if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
    return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
  return SOffset;
}

SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
                                                     SelectionDAG &DAG,
                                                     unsigned NewOpcode) const {
  SDLoc DL(Op);

  SDValue VData = Op.getOperand(2);
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
  auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
  SDValue Ops[] = {
      Op.getOperand(0),                      // Chain
      VData,                                 // vdata
      Rsrc,                                  // rsrc
      DAG.getConstant(0, DL, MVT::i32),      // vindex
      VOffset,                               // voffset
      SOffset,                               // soffset
      Offset,                                // offset
      Op.getOperand(6),                      // cachepolicy
      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
  };

  auto *M = cast<MemSDNode>(Op);

  EVT MemVT = VData.getValueType();
  return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
                                 M->getMemOperand());
}

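// Note (added commentary): the struct-buffer variant below differs from the
// raw-buffer lowering above only in its operand layout; it forwards an
// explicit vindex operand from the intrinsic and sets idxen to 1, whereas the
// raw form passes a zero vindex with idxen = 0.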
SDValue
SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
                                                unsigned NewOpcode) const {
  SDLoc DL(Op);

  SDValue VData = Op.getOperand(2);
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
  auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
  SDValue Ops[] = {
      Op.getOperand(0),                      // Chain
      VData,                                 // vdata
      Rsrc,                                  // rsrc
      Op.getOperand(4),                      // vindex
      VOffset,                               // voffset
      SOffset,                               // soffset
      Offset,                                // offset
      Op.getOperand(7),                      // cachepolicy
      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
  };

  auto *M = cast<MemSDNode>(Op);

  EVT MemVT = VData.getValueType();
  return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
                                 M->getMemOperand());
}

SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                 SelectionDAG &DAG) const {
  unsigned IntrID = Op.getConstantOperandVal(1);
  SDLoc DL(Op);

  switch (IntrID) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    MemSDNode *M = cast<MemSDNode>(Op);
    SDValue Chain = M->getOperand(0);
    SDValue M0 = M->getOperand(2);
    SDValue Value = M->getOperand(3);
    unsigned IndexOperand = M->getConstantOperandVal(7);
    unsigned WaveRelease = M->getConstantOperandVal(8);
    unsigned WaveDone = M->getConstantOperandVal(9);

    unsigned OrderedCountIndex = IndexOperand & 0x3f;
    IndexOperand &= ~0x3f;
    unsigned CountDw = 0;

    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
      CountDw = (IndexOperand >> 24) & 0xf;
      IndexOperand &= ~(0xf << 24);

      if (CountDw < 1 || CountDw > 4) {
        report_fatal_error(
            "ds_ordered_count: dword count must be between 1 and 4");
      }
    }

    if (IndexOperand)
      report_fatal_error("ds_ordered_count: bad index operand");

    if (WaveDone && !WaveRelease)
      report_fatal_error("ds_ordered_count: wave_done requires wave_release");

    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
    unsigned ShaderType =
        SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
    unsigned Offset0 = OrderedCountIndex << 2;
    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);

    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
      Offset1 |= (CountDw - 1) << 6;

    if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
      Offset1 |= ShaderType << 2;

    unsigned Offset = Offset0 | (Offset1 << 8);
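    // Field layout implied by the shifts above (worked example, added for
    // clarity): bits [7:0] hold Offset0, i.e. the ordered-count index shifted
    // left by 2; bits [15:8] hold Offset1 with wave_release at bit 0,
    // wave_done at bit 1, the shader type at bits [3:2] (pre-GFX11), the
    // add/swap selector at bit 4, and CountDw - 1 at bits [7:6] on GFX10+.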

    SDValue Ops[] = {
        Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
        copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
    };
    return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
                                   M->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
    auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
    SDValue Ops[] = {
        Op.getOperand(0),                      // Chain
        Rsrc,                                  // rsrc
        DAG.getConstant(0, DL, MVT::i32),      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(5),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };

    auto *M = cast<MemSDNode>(Op);
    return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
  }
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
    SDValue Ops[] = {
        Op.getOperand(0),                      // Chain
        Rsrc,                                  // rsrc
        Op.getOperand(3),                      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(6),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };

    return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
  }
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
    auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);

    SDValue Ops[] = {
        Op.getOperand(0),                      // Chain
        Rsrc,                                  // rsrc
        DAG.getConstant(0, DL, MVT::i32),      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(5),                      // format
        Op.getOperand(6),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
                                 Ops);
    return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                               Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
                               DAG);
  }
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);

    SDValue Ops[] = {
        Op.getOperand(0),                      // Chain
        Rsrc,                                  // rsrc
        Op.getOperand(3),                      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(6),                      // format
        Op.getOperand(7),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
                                 Ops);
    return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                               Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
                               DAG);
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_FADD);
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_FMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_FMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG,
                                      AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9197 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap
: {
9198 SDValue Rsrc
= bufferRsrcPtrToVector(Op
.getOperand(4), DAG
);
9199 auto [VOffset
, Offset
] = splitBufferOffsets(Op
.getOperand(5), DAG
);
9200 auto SOffset
= selectSOffset(Op
.getOperand(6), DAG
, Subtarget
);
9202 Op
.getOperand(0), // Chain
9203 Op
.getOperand(2), // src
9204 Op
.getOperand(3), // cmp
9206 DAG
.getConstant(0, DL
, MVT::i32
), // vindex
9210 Op
.getOperand(7), // cachepolicy
9211 DAG
.getTargetConstant(0, DL
, MVT::i1
), // idxen
9213 EVT VT
= Op
.getValueType();
9214 auto *M
= cast
<MemSDNode
>(Op
);
9216 return DAG
.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP
, DL
,
9217 Op
->getVTList(), Ops
, VT
,
9218 M
->getMemOperand());
9220 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap
:
9221 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap
: {
9222 SDValue Rsrc
= bufferRsrcPtrToVector(Op
->getOperand(4), DAG
);
9223 auto [VOffset
, Offset
] = splitBufferOffsets(Op
.getOperand(6), DAG
);
9224 auto SOffset
= selectSOffset(Op
.getOperand(7), DAG
, Subtarget
);
9226 Op
.getOperand(0), // Chain
9227 Op
.getOperand(2), // src
9228 Op
.getOperand(3), // cmp
9230 Op
.getOperand(5), // vindex
9234 Op
.getOperand(8), // cachepolicy
9235 DAG
.getTargetConstant(1, DL
, MVT::i1
), // idxen
9237 EVT VT
= Op
.getValueType();
9238 auto *M
= cast
<MemSDNode
>(Op
);
9240 return DAG
.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP
, DL
,
9241 Op
->getVTList(), Ops
, VT
,
9242 M
->getMemOperand());
9244 case Intrinsic::amdgcn_image_bvh_intersect_ray
: {
9245 MemSDNode
*M
= cast
<MemSDNode
>(Op
);
9246 SDValue NodePtr
= M
->getOperand(2);
9247 SDValue RayExtent
= M
->getOperand(3);
9248 SDValue RayOrigin
= M
->getOperand(4);
9249 SDValue RayDir
= M
->getOperand(5);
9250 SDValue RayInvDir
= M
->getOperand(6);
9251 SDValue TDescr
= M
->getOperand(7);
9253 assert(NodePtr
.getValueType() == MVT::i32
||
9254 NodePtr
.getValueType() == MVT::i64
);
9255 assert(RayDir
.getValueType() == MVT::v3f16
||
9256 RayDir
.getValueType() == MVT::v3f32
);
9258 if (!Subtarget
->hasGFX10_AEncoding()) {
9259 emitRemovedIntrinsicError(DAG
, DL
, Op
.getValueType());
9263 const bool IsGFX11
= AMDGPU::isGFX11(*Subtarget
);
9264 const bool IsGFX11Plus
= AMDGPU::isGFX11Plus(*Subtarget
);
9265 const bool IsGFX12Plus
= AMDGPU::isGFX12Plus(*Subtarget
);
9266 const bool IsA16
= RayDir
.getValueType().getVectorElementType() == MVT::f16
;
9267 const bool Is64
= NodePtr
.getValueType() == MVT::i64
;
9268 const unsigned NumVDataDwords
= 4;
9269 const unsigned NumVAddrDwords
= IsA16
? (Is64
? 9 : 8) : (Is64
? 12 : 11);
9270 const unsigned NumVAddrs
= IsGFX11Plus
? (IsA16
? 4 : 5) : NumVAddrDwords
;
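    // Dword-count breakdown (illustrative): for a 64-bit node pointer with
    // full-precision rays the address list is 2 (node pointer) + 1 (extent) +
    // 3 (origin) + 3 (direction) + 3 (inverse direction) = 12 dwords; the a16
    // forms keep the f32 origin but pack the six f16 direction and inverse
    // direction components into 3 dwords, giving 9 (or 8 with a 32-bit
    // pointer).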
    const bool UseNSA = (Subtarget->hasNSAEncoding() &&
                         NumVAddrs <= Subtarget->getNSAMaxSize()) ||
                        IsGFX12Plus;
    const unsigned BaseOpcodes[2][2] = {
        {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
        {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
    int Opcode;
    if (UseNSA) {
      Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                     IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                     : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                                 : AMDGPU::MIMGEncGfx10NSA,
                                     NumVDataDwords, NumVAddrDwords);
    } else {
      assert(!IsGFX12Plus);
      Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                     IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                             : AMDGPU::MIMGEncGfx10Default,
                                     NumVDataDwords, NumVAddrDwords);
    }
    assert(Opcode != -1);

    SmallVector<SDValue, 16> Ops;

    auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
      SmallVector<SDValue, 3> Lanes;
      DAG.ExtractVectorElements(Op, Lanes, 0, 3);
      if (Lanes[0].getValueSizeInBits() == 32) {
        for (unsigned I = 0; I < 3; ++I)
          Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
      } else {
        if (IsAligned) {
          Ops.push_back(DAG.getBitcast(
              MVT::i32,
              DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
          Ops.push_back(Lanes[2]);
        } else {
          SDValue Elt0 = Ops.pop_back_val();
          Ops.push_back(DAG.getBitcast(
              MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
          Ops.push_back(DAG.getBitcast(
              MVT::i32,
              DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
        }
      }
    };

    if (UseNSA && IsGFX11Plus) {
      Ops.push_back(NodePtr);
      Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
      Ops.push_back(RayOrigin);
      if (IsA16) {
        SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
        DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
        DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
        for (unsigned I = 0; I < 3; ++I) {
          MergedLanes.push_back(DAG.getBitcast(
              MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
                                           {DirLanes[I], InvDirLanes[I]})));
        }
        Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
      } else {
        Ops.push_back(RayDir);
        Ops.push_back(RayInvDir);
      }
    } else {
      if (Is64)
        DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
                                  2);
      else
        Ops.push_back(NodePtr);

      Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
      packLanes(RayOrigin, true);
      packLanes(RayDir, true);
      packLanes(RayInvDir, false);

      if (!UseNSA) {
        // Build a single vector containing all the operands so far prepared.
        if (NumVAddrDwords > 12) {
          SDValue Undef = DAG.getUNDEF(MVT::i32);
          Ops.append(16 - Ops.size(), Undef);
        }
        assert(Ops.size() >= 8 && Ops.size() <= 12);
        SDValue MergedOps =
            DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
        Ops.clear();
        Ops.push_back(MergedOps);
      }
    }

    Ops.push_back(TDescr);
    Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
    Ops.push_back(M->getChain());

    auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
    MachineMemOperand *MemRef = M->getMemOperand();
    DAG.setNodeMemRefs(NewNode, {MemRef});
    return SDValue(NewNode, 0);
  }
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    MemSDNode *M = cast<MemSDNode>(Op);
    SDValue Ops[] = {
        M->getOperand(0), // Chain
        M->getOperand(2), // Ptr
        M->getOperand(3)  // Value
    };
    unsigned Opcode = 0;
    switch (IntrID) {
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
      Opcode = ISD::ATOMIC_LOAD_FMIN;
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
      Opcode = ISD::ATOMIC_LOAD_FMAX;
      break;
    }
    default:
      llvm_unreachable("unhandled atomic opcode");
    }
    return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
                         Ops, M->getMemOperand());
  }
  case Intrinsic::amdgcn_s_get_barrier_state:
  case Intrinsic::amdgcn_s_get_named_barrier_state: {
    SDValue Chain = Op->getOperand(0);
    SmallVector<SDValue, 2> Ops;
    unsigned Opc;

    if (isa<ConstantSDNode>(Op->getOperand(2))) {
      uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
        BarID = (BarID >> 4) & 0x3F;
      Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
      SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
      Ops.push_back(K);
      Ops.push_back(Chain);
    } else {
      Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
        SDValue M0Val;
        M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
                            DAG.getShiftAmountConstant(4, MVT::i32, DL));
        M0Val = SDValue(
            DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
                               DAG.getTargetConstant(0x3F, DL, MVT::i32)),
            0);
        Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
      } else
        Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
    }

    auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
    return SDValue(NewMI, 0);
  }
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return lowerImage(Op, ImageDimIntr, DAG, true);

    return SDValue();
  }
}

// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI and handle TFE loads.
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {
  LLVMContext &C = *DAG.getContext();
  MachineFunction &MF = DAG.getMachineFunction();
  EVT VT = VTList.VTs[0];

  assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
  bool IsTFE = VTList.NumVTs == 3;
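  // Example of the TFE path below (illustrative): a TFE load of v4f32 carries
  // VTList = {v4f32, i32, Other}; the machine node is instead given a v5i32
  // result, and dword 4 of that vector is the status value that is split back
  // out before the merge_values.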
  if (IsTFE) {
    unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
    unsigned NumOpDWords = NumValueDWords + 1;
    EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
    SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
    MachineMemOperand *OpDWordsMMO =
        MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);
    SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                                 DAG.getVectorIdxConstant(NumValueDWords, DL));
    SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
    SDValue ValueDWords =
        NumValueDWords == 1
            ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
            : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
                          EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
                          ZeroIdx);
    SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
    return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
  }

  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
    EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
    MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
    SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
    SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
                                         WidenedMemVT, WidenedMMO);
    SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
                                DAG.getVectorIdxConstant(0, DL));
    return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
  }

  return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
}

SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
                                         bool ImageStore) const {
  EVT StoreVT = VData.getValueType();

  // No change for f16 and legal vector D16 types.
  if (!StoreVT.isVector())
    return VData;

  SDLoc DL(VData);
  unsigned NumElements = StoreVT.getVectorNumElements();

  if (Subtarget->hasUnpackedD16VMem()) {
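    // Illustrative example: a v4f16 store value is bitcast to v4i16 and each
    // element is zero-extended to i32, so the unpacked-d16 path hands the
    // instruction four dwords with the data in the low 16 bits of each.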
9505 // We need to unpack the packed data to store.
9506 EVT IntStoreVT
= StoreVT
.changeTypeToInteger();
9507 SDValue IntVData
= DAG
.getNode(ISD::BITCAST
, DL
, IntStoreVT
, VData
);
9510 EVT::getVectorVT(*DAG
.getContext(), MVT::i32
, NumElements
);
9511 SDValue ZExt
= DAG
.getNode(ISD::ZERO_EXTEND
, DL
, EquivStoreVT
, IntVData
);
9512 return DAG
.UnrollVectorOp(ZExt
.getNode());
9515 // The sq block of gfx8.1 does not estimate register use correctly for d16
9516 // image store instructions. The data operand is computed as if it were not a
9517 // d16 image instruction.
9518 if (ImageStore
&& Subtarget
->hasImageStoreD16Bug()) {
9520 EVT IntStoreVT
= StoreVT
.changeTypeToInteger();
9521 SDValue IntVData
= DAG
.getNode(ISD::BITCAST
, DL
, IntStoreVT
, VData
);
9523 // Decompose into scalars
9524 SmallVector
<SDValue
, 4> Elts
;
9525 DAG
.ExtractVectorElements(IntVData
, Elts
);
9527 // Group pairs of i16 into v2i16 and bitcast to i32
9528 SmallVector
<SDValue
, 4> PackedElts
;
9529 for (unsigned I
= 0; I
< Elts
.size() / 2; I
+= 1) {
9531 DAG
.getBuildVector(MVT::v2i16
, DL
, {Elts
[I
* 2], Elts
[I
* 2 + 1]});
9532 SDValue IntPair
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::i32
, Pair
);
9533 PackedElts
.push_back(IntPair
);
9535 if ((NumElements
% 2) == 1) {
9537 unsigned I
= Elts
.size() / 2;
9538 SDValue Pair
= DAG
.getBuildVector(MVT::v2i16
, DL
,
9539 {Elts
[I
* 2], DAG
.getUNDEF(MVT::i16
)});
9540 SDValue IntPair
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::i32
, Pair
);
9541 PackedElts
.push_back(IntPair
);
9545 PackedElts
.resize(Elts
.size(), DAG
.getUNDEF(MVT::i32
));
9547 // Build final vector
9549 EVT::getVectorVT(*DAG
.getContext(), MVT::i32
, PackedElts
.size());
9550 return DAG
.getBuildVector(VecVT
, DL
, PackedElts
);
9553 if (NumElements
== 3) {
9555 EVT::getIntegerVT(*DAG
.getContext(), StoreVT
.getStoreSizeInBits());
9556 SDValue IntVData
= DAG
.getNode(ISD::BITCAST
, DL
, IntStoreVT
, VData
);
9558 EVT WidenedStoreVT
= EVT::getVectorVT(
9559 *DAG
.getContext(), StoreVT
.getVectorElementType(), NumElements
+ 1);
9560 EVT WidenedIntVT
= EVT::getIntegerVT(*DAG
.getContext(),
9561 WidenedStoreVT
.getStoreSizeInBits());
9562 SDValue ZExt
= DAG
.getNode(ISD::ZERO_EXTEND
, DL
, WidenedIntVT
, IntVData
);
9563 return DAG
.getNode(ISD::BITCAST
, DL
, WidenedStoreVT
, ZExt
);
9566 assert(isTypeLegal(StoreVT
));
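// Example for the hasImageStoreD16Bug() path above: a v4f16 store value is
// bitcast to v4i16, regrouped into two v2i16 pairs that are each bitcast to
// i32, and then padded with undef i32 elements back to the original element
// count, so the data operand is sized as if it were not a d16 instruction.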
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  unsigned IntrinsicID = Op.getConstantOperandVal(1);
  MachineFunction &MF = DAG.getMachineFunction();

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {
    if (!Subtarget->hasCompressedExport()) {
      DiagnosticInfoUnsupported BadIntrin(
          DAG.getMachineFunction().getFunction(),
          "intrinsic not supported on subtarget", DL.getDebugLoc());
      DAG.getContext()->diagnose(BadIntrin);
    }
    SDValue Src0 = Op.getOperand(4);
    SDValue Src1 = Op.getOperand(5);
    // Hack around illegal type on SI by directly selecting it.
    if (isTypeLegal(Src0.getValueType()))
      return SDValue();

    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
    SDValue Undef = DAG.getUNDEF(MVT::f32);
    const SDValue Ops[] = {
        Op.getOperand(2),                              // tgt
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
        Undef,                                         // src2
        Undef,                                         // src3
        Op.getOperand(7),                              // vm
        DAG.getTargetConstant(1, DL, MVT::i1),         // compr
        Op.getOperand(3),                              // en
        Op.getOperand(0)                               // Chain
    };

    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
    return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
  }
  case Intrinsic::amdgcn_s_barrier: {
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
      unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
      if (WGSize <= ST.getWavefrontSize())
        // A workgroup that fits in a single wave only needs the scheduling
        // barrier, so a wave_barrier is sufficient.
        return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
                                          Op.getOperand(0)),
                       0);
    }

    // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
    if (ST.hasSplitBarriers()) {
      SDValue K =
          DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
      SDValue BarSignal =
          SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
                                     MVT::Other, K, Op.getOperand(0)),
                  0);
      SDValue BarWait =
          SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
                                     BarSignal.getValue(0)),
                  0);
      return BarWait;
    }

    return SDValue();
  }
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
    SDValue Ops[] = {
        Chain,
        VData,                                 // vdata
        Rsrc,                                  // rsrc
        Op.getOperand(4),                      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(7),                      // format
        Op.getOperand(8),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
                         : AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
    SDValue Ops[] = {
        Chain,
        VData,                                 // vdata
        Rsrc,                                  // rsrc
        DAG.getConstant(0, DL, MVT::i32),      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(6),                      // format
        Op.getOperand(7),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
                         : AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;

    SDValue VData = Op.getOperand(2);
    EVT VDataVT = VData.getValueType();
    EVT EltType = VDataVT.getScalarType();
    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
    if (IsD16) {
      VData = handleD16VData(VData, DAG);
      VDataVT = VData.getValueType();
    }

    if (!isTypeLegal(VDataVT)) {
      VData =
          DAG.getNode(ISD::BITCAST, DL,
                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
    }

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
    SDValue Ops[] = {
        Chain,
        VData,                                 // vdata
        Rsrc,                                  // rsrc
        DAG.getConstant(0, DL, MVT::i32),      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(6),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc =
        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;

    SDValue VData = Op.getOperand(2);
    EVT VDataVT = VData.getValueType();
    EVT EltType = VDataVT.getScalarType();
    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);

    if (IsD16) {
      VData = handleD16VData(VData, DAG);
      VDataVT = VData.getValueType();
    }

    if (!isTypeLegal(VDataVT)) {
      VData =
          DAG.getNode(ISD::BITCAST, DL,
                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
    }

    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
    SDValue Ops[] = {
        Chain,
        VData,                                 // vdata
        Rsrc,                                  // rsrc
        Op.getOperand(4),                      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(7),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc =
        !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    EVT VDataType = VData.getValueType().getScalarType();
    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    assert(!AMDGPU::isGFX12Plus(*Subtarget));
    unsigned Opc;
    bool HasVIndex =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    bool HasVOffset = !isNullConstant(VOffset);
    unsigned Size = Op->getConstantOperandVal(4);

    switch (Size) {
    default:
      return SDValue();
    case 1:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      break;
    case 2:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      break;
    case 4:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      break;
    case 12:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
      break;
    case 16:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
      break;
    }

    SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));

    SmallVector<SDValue, 8> Ops;

    if (HasVIndex && HasVOffset)
      Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
                                       {Op.getOperand(5), // VIndex
                                        VOffset}));
    else if (HasVIndex)
      Ops.push_back(Op.getOperand(5));
    else if (HasVOffset)
      Ops.push_back(VOffset);

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    Ops.push_back(Rsrc);
    Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
    Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
    bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    Ops.push_back(DAG.getTargetConstant(
        Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
        DL, MVT::i8)); // cpol
    Ops.push_back(DAG.getTargetConstant(
        Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
            ? 1
            : 0,
        DL, MVT::i8));                    // swz
    Ops.push_back(M0Val.getValue(0));     // Chain
    Ops.push_back(M0Val.getValue(1));     // Glue

    auto *M = cast<MemSDNode>(Op);
    MachineMemOperand *LoadMMO = M->getMemOperand();
    // Don't set the offset value here because the pointer points to the base
    // of the buffer.
    MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();

    MachinePointerInfo StorePtrI = LoadPtrI;
    LoadPtrI.V = PoisonValue::get(
        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
    LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
    StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;

    auto F = LoadMMO->getFlags() &
             ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
    LoadMMO =
        MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
                                LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());

    MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
        StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
        LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());

    auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
    DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});

    return SDValue(Load, 0);
  }
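  // Note for the buffer-to-LDS cases above: the DMA is modelled with two
  // memory operands, a Size-byte load MMO in the global address space and a
  // 4-byte store MMO in the local address space, so alias analysis can see
  // both sides of the transfer.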
  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Opc;
    unsigned Size = Op->getConstantOperandVal(4);
    switch (Size) {
    default:
      return SDValue();
    case 1:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
      break;
    case 4:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
      break;
    case 12:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
      break;
    case 16:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
      break;
    }

    auto *M = cast<MemSDNode>(Op);
    SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));

    SmallVector<SDValue, 6> Ops;

    SDValue Addr = Op.getOperand(2); // Global ptr
    SDValue VOffset;
    // Try to split SAddr and VOffset. Global and LDS pointers share the same
    // immediate offset, so we cannot use a regular SelectGlobalSAddr().
    if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
      SDValue LHS = Addr.getOperand(0);
      SDValue RHS = Addr.getOperand(1);

      if (LHS->isDivergent())
        std::swap(LHS, RHS);

      if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
          RHS.getOperand(0).getValueType() == MVT::i32) {
        // add (i64 sgpr), (zero_extend (i32 vgpr))
        Addr = LHS;
        VOffset = RHS.getOperand(0);
      }
    }

    Ops.push_back(Addr);
    if (!Addr->isDivergent()) {
      Opc = AMDGPU::getGlobalSaddrOp(Opc);
      if (!VOffset)
        VOffset =
            SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
      Ops.push_back(VOffset);
    }

    Ops.push_back(Op.getOperand(5));  // Offset
    Ops.push_back(Op.getOperand(6));  // CPol
    Ops.push_back(M0Val.getValue(0)); // Chain
    Ops.push_back(M0Val.getValue(1)); // Glue

    MachineMemOperand *LoadMMO = M->getMemOperand();
    MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
    LoadPtrI.Offset = Op->getConstantOperandVal(5);
    MachinePointerInfo StorePtrI = LoadPtrI;
    LoadPtrI.V = PoisonValue::get(
        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
    LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
    StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
    auto F = LoadMMO->getFlags() &
             ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
    LoadMMO =
        MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
                                LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
    MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
        StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
        LoadMMO->getAAInfo());

    auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
    DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});

    return SDValue(Load, 0);
  }
  case Intrinsic::amdgcn_end_cf:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                      Op->getOperand(2), Chain),
                   0);
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var: {
    // these two intrinsics have two operands: barrier pointer and member count
    SDValue Chain = Op->getOperand(0);
    SmallVector<SDValue, 2> Ops;
    SDValue BarOp = Op->getOperand(2);
    SDValue CntOp = Op->getOperand(3);
    SDValue M0Val;
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
                       ? AMDGPU::S_BARRIER_INIT_M0
                       : AMDGPU::S_BARRIER_SIGNAL_M0;
    // extract the BarrierID from bits 4-9 of BarOp
    SDValue BarID;
    BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
                        DAG.getShiftAmountConstant(4, MVT::i32, DL));
    BarID =
        SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
                                   DAG.getTargetConstant(0x3F, DL, MVT::i32)),
                0);
    // Member count should be put into M0[ShAmt:+6]
    // Barrier ID should be put into M0[5:0]
    CntOp =
        SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
                                   DAG.getTargetConstant(0x3F, DL, MVT::i32)),
                0);
    constexpr unsigned ShAmt = 16;
    M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
                        DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));

    M0Val = SDValue(
        DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);

    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));

    auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
    return SDValue(NewMI, 0);
  }
  case Intrinsic::amdgcn_s_barrier_join:
  case Intrinsic::amdgcn_s_wakeup_barrier: {
    // these three intrinsics have one operand: barrier pointer
    SDValue Chain = Op->getOperand(0);
    SmallVector<SDValue, 2> Ops;
    SDValue BarOp = Op->getOperand(2);
    unsigned Opc;

    if (isa<ConstantSDNode>(BarOp)) {
      uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
      switch (IntrinsicID) {
      default:
        return SDValue();
      case Intrinsic::amdgcn_s_barrier_join:
        Opc = AMDGPU::S_BARRIER_JOIN_IMM;
        break;
      case Intrinsic::amdgcn_s_wakeup_barrier:
        Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
        break;
      }
      // extract the BarrierID from bits 4-9 of the immediate
      unsigned BarID = (BarVal >> 4) & 0x3F;
      SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
      Ops.push_back(K);
      Ops.push_back(Chain);
    } else {
      switch (IntrinsicID) {
      default:
        return SDValue();
      case Intrinsic::amdgcn_s_barrier_join:
        Opc = AMDGPU::S_BARRIER_JOIN_M0;
        break;
      case Intrinsic::amdgcn_s_wakeup_barrier:
        Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
        break;
      }
      // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
      SDValue M0Val;
      M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
                          DAG.getShiftAmountConstant(4, MVT::i32, DL));
      M0Val =
          SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
                                     DAG.getTargetConstant(0x3F, DL, MVT::i32)),
                  0);
      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
    }

    auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
    return SDValue(NewMI, 0);
  }
  case Intrinsic::amdgcn_s_prefetch_data: {
    // For non-global address space preserve the chain and remove the call.
    if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
      return Op.getOperand(0);
    return Op;
  }
  case Intrinsic::amdgcn_s_buffer_prefetch_data: {
    SDValue Ops[] = {
        Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
        Op.getOperand(3), // offset
        Op.getOperand(4), // length
    };

    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
                                   Op->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  }
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG, true);

    return Op;
  }
}
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::pair<SDValue, SDValue>
SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
  SDLoc DL(Offset);
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
  SDValue N0 = Offset;
  ConstantSDNode *C1 = nullptr;

  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    N0 = SDValue();
  else if (DAG.isBaseWithConstantOffset(N0)) {
    C1 = cast<ConstantSDNode>(N0.getOperand(1));
    N0 = N0.getOperand(0);
  }

  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, put only bits
    // that would normally fit in the immoffset field. The remaining value that
    // is copied/added for the voffset field is a large power of 2, and it
    // stands more chance of being CSEd with the copy/add for another similar
    // load/store.
    // However, do not do that rounding down if that is a negative
    // number, as it appears to be illegal to have a negative offset in the
    // vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
    if (Overflow) {
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      if (!N0)
        N0 = OverflowVal;
      else {
        SDValue Ops[] = {N0, OverflowVal};
        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
      }
    }
  }
  if (!N0)
    N0 = DAG.getConstant(0, DL, MVT::i32);
  if (!C1)
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
  return {N0, SDValue(C1, 0)};
}
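// Example for splitBufferOffsets above: assuming a maximum immediate offset of
// 4095 (the pre-GFX12 MUBUF limit), a combined offset of 4100 is split into an
// immoffset of 4 plus a voffset add of 4096; the power-of-two overflow is the
// part that stands a chance of being CSEd across similar accesses.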
// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
// the three offsets (voffset, soffset and instoffset) into the SDValue[3]
// array pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        SelectionDAG &DAG, SDValue *Offsets,
                                        Align Alignment) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  SDLoc DL(CombinedOffset);
  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
      Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }
  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
    SDValue N0 = CombinedOffset.getOperand(0);
    SDValue N1 = CombinedOffset.getOperand(1);
    uint32_t SOffset, ImmOffset;
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
    if (Offset >= 0 &&
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
      Offsets[0] = N0;
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }

  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
                            ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                            : DAG.getConstant(0, DL, MVT::i32);

  Offsets[0] = CombinedOffset;
  Offsets[1] = SOffsetZero;
  Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
}
SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
                                                SelectionDAG &DAG) const {
  if (!MaybePointer.getValueType().isScalarInteger())
    return MaybePointer;

  SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
  return Rsrc;
}
// Wrap a global or flat pointer into a buffer intrinsic using the flags
// specified in the intrinsic.
SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
                                                   SelectionDAG &DAG) const {
  SDLoc Loc(Op);

  SDValue Pointer = Op->getOperand(1);
  SDValue Stride = Op->getOperand(2);
  SDValue NumRecords = Op->getOperand(3);
  SDValue Flags = Op->getOperand(4);

  auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
  SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
  SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
  std::optional<uint32_t> ConstStride = std::nullopt;
  if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
    ConstStride = ConstNode->getZExtValue();

  SDValue NewHighHalf = Masked;
  if (!ConstStride || *ConstStride != 0) {
    SDValue ShiftedStride;
    if (ConstStride) {
      ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
    } else {
      SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
      ShiftedStride =
          DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
                      DAG.getShiftAmountConstant(16, MVT::i32, Loc));
    }
    NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
  }

  SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
                             NewHighHalf, NumRecords, Flags);
  SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
  return RsrcPtr;
}
// Handle 8 bit and 16 bit buffer loads
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
                                                     ArrayRef<SDValue> Ops,
                                                     MachineMemOperand *MMO,
                                                     bool IsTFE) const {
  EVT IntVT = LoadVT.changeTypeToInteger();

  if (IsTFE) {
    unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
                       ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
                       : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
    MachineFunction &MF = DAG.getMachineFunction();
    MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
    SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
    SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
    SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                                 DAG.getConstant(1, DL, MVT::i32));
    SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                               DAG.getConstant(0, DL, MVT::i32));
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
    SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
    return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
  }

  unsigned Opc = LoadVT.getScalarType() == MVT::i8
                     ? AMDGPUISD::BUFFER_LOAD_UBYTE
                     : AMDGPUISD::BUFFER_LOAD_USHORT;

  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue BufferLoad =
      DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);

  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
}
// Handle 8 bit and 16 bit buffer stores
SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
                                                      EVT VDataType, SDLoc DL,
                                                      SDValue Ops[],
                                                      MemSDNode *M) const {
  if (VDataType == MVT::f16 || VDataType == MVT::bf16)
    Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);

  SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
  Ops[1] = BufferStoreExt;
  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
                                        : AMDGPUISD::BUFFER_STORE_SHORT;
  ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
  return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
                                 M->getMemOperand());
}
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
                                 SDValue Op, const SDLoc &SL, EVT VT) {
  if (VT.bitsLT(Op.getValueType()))
    return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);

  switch (ExtType) {
  case ISD::SEXTLOAD:
    return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
  case ISD::ZEXTLOAD:
    return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
  case ISD::EXTLOAD:
    return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
  case ISD::NON_EXTLOAD:
    return Op;
  }

  llvm_unreachable("invalid ext type");
}
// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
                                    DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if (Ld->getAlign() < Align(4) || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // pre-legalize.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
  SDValue Ptr = Ld->getBasePtr();
  SDValue NewLoad = DAG.getLoad(
      ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
      Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
      Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
      nullptr); // Drop ranges

  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
           "unexpected fp extload");
    TruncVT = MemVT.changeTypeToInteger();
  }

  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
                      DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
  } else {
    assert(Ld->getExtensionType() == ISD::EXTLOAD);
  }

  EVT VT = Ld->getValueType(0);
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());

  DCI.AddToWorklist(Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
  DCI.AddToWorklist(Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);

  return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
}
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
                                          const SIMachineFunctionInfo &Info) {
  // TODO: Should check if the address can definitely not access stack.
  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();
  return true;
}
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();
  MachineMemOperand *MMO = Load->getMemOperand();

  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();

    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
                                   RealMemVT, MMO);

    if (!MemVT.isVector()) {
      SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
                       NewLD.getValue(1)};

      return DAG.getMergeValues(Ops, DL);
    }

    SmallVector<SDValue, 3> Elts;
    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
      SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
                                DAG.getConstant(I, DL, MVT::i32));

      Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
    }

    SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};

    return DAG.getMergeValues(Ops, DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  Align Alignment = Load->getAlign();
  unsigned AS = Load->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
      Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
    return SplitVectorLoad(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
             ? AMDGPUAS::PRIVATE_ADDRESS
             : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = MemVT.getVectorNumElements();

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      (AS == AMDGPUAS::GLOBAL_ADDRESS &&
       Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
       isMemOpHasNoClobberedMemOperand(Load))) {
    if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) &&
        Alignment >= Align(4) && NumElements < 32) {
      if (MemVT.isPow2VectorType() ||
          (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
        return SDValue();
      return WidenOrSplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
  }

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v3 loads not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return WidenOrSplitVectorLoad(Op, DAG);

    // v3 and v4 loads are supported for private and global memory.
    return SDValue();
  }

  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4: {
      auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
      return DAG.getMergeValues({Op0, Op1}, DL);
    }
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      // v3 loads not supported on SI.
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        return WidenOrSplitVectorLoad(Op, DAG);

      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    unsigned Fast = 0;
    auto Flags = Load->getMemOperand()->getFlags();
    if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
                                           Load->getAlign(), Flags, &Fast) &&
        Fast > 1)
      return SDValue();

    if (MemVT.isVector())
      return SplitVectorLoad(Op, DAG);
  }

  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      MemVT, *Load->getMemOperand())) {
    auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
    return DAG.getMergeValues({Op0, Op1}, DL);
  }

  return SDValue();
}
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
      VT.getSizeInBits() == 512)
    return splitTernaryVectorOp(Op, DAG);

  assert(VT.getSizeInBits() == 64);

  SDLoc DL(Op);
  SDValue Cond = Op.getOperand(0);

  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
  SDValue One = DAG.getConstant(1, DL, MVT::i32);

  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));

  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);

  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);

  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);

  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);

  SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}
// Catch division cases where we can use shortcuts with rcp and rsq
// instructions.
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

  bool AllowInaccurateRcp =
      Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    // Without !fpmath accuracy information, we can't do more because we don't
    // know exactly whether rcp is accurate enough to meet !fpmath requirement.
    // f16 is always accurate enough
    if (!AllowInaccurateRcp && VT != MVT::f16)
      return SDValue();

    if (CLHS->isExactlyValue(1.0)) {
      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
      // the CI documentation has a worst case error of 1 ulp.
      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
      // use it as long as we aren't trying to use denormals.
      //
      // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.

      // 1.0 / sqrt(x) -> rsq(x)

      // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
      // error seems really high at 2^29 ULP.
      // 1.0 / x -> rcp(x)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    }

    // Same as for 1.0, but expand the sign out of the constant.
    if (CLHS->isExactlyValue(-1.0)) {
      // -1.0 / x -> rcp (fneg x)
      SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
    }
  }

  // For f16 require afn or arcp.
  // For f32 require afn.
  if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
    return SDValue();

  // Turn into multiply by the reciprocal.
  // x / y -> x * (1.0 / y)
  SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
  return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
}
SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

  bool AllowInaccurateDiv =
      Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
  if (!AllowInaccurateDiv)
    return SDValue();

  SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
  SDValue One = DAG.getConstantFP(1.0, SL, VT);

  SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
  SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);

  R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
  SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
  R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
  SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
  SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
  return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
}
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain,
                          SDNodeFlags Flags) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, A, B, Flags);
  }

  assert(GlueChain->getNumValues() == 3);

  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
  switch (Opcode) {
  default:
    llvm_unreachable("no chain equivalent for opcode");
  case ISD::FMUL:
    Opcode = AMDGPUISD::FMUL_W_CHAIN;
    break;
  }

  return DAG.getNode(Opcode, SL, VTList,
                     {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
                     Flags);
}

static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                           EVT VT, SDValue A, SDValue B, SDValue C,
                           SDValue GlueChain, SDNodeFlags Flags) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
  }

  assert(GlueChain->getNumValues() == 3);

  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
  switch (Opcode) {
  default:
    llvm_unreachable("no chain equivalent for opcode");
  case ISD::FMA:
    Opcode = AMDGPUISD::FMA_W_CHAIN;
    break;
  }

  return DAG.getNode(Opcode, SL, VTList,
                     {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
SITargetLowering::LowerFDIV16(SDValue Op
, SelectionDAG
&DAG
) const {
10707 if (SDValue FastLowered
= lowerFastUnsafeFDIV(Op
, DAG
))
10708 return FastLowered
;
10711 SDValue LHS
= Op
.getOperand(0);
10712 SDValue RHS
= Op
.getOperand(1);
10714 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
10715 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
10716 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
10717 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
10718 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10719 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
10720 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10721 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
10722 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
10723 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
10724 // q16.u = opx(V_CVT_F16_F32, q32.u);
10725 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
10727 // We will use ISD::FMA on targets that don't support ISD::FMAD.
10728 unsigned FMADOpCode
=
10729 isOperationLegal(ISD::FMAD
, MVT::f32
) ? ISD::FMAD
: ISD::FMA
;
10731 SDValue LHSExt
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, LHS
);
10732 SDValue RHSExt
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, RHS
);
10733 SDValue NegRHSExt
= DAG
.getNode(ISD::FNEG
, SL
, MVT::f32
, RHSExt
);
10735 DAG
.getNode(AMDGPUISD::RCP
, SL
, MVT::f32
, RHSExt
, Op
->getFlags());
10737 DAG
.getNode(ISD::FMUL
, SL
, MVT::f32
, LHSExt
, Rcp
, Op
->getFlags());
10738 SDValue Err
= DAG
.getNode(FMADOpCode
, SL
, MVT::f32
, NegRHSExt
, Quot
, LHSExt
,
10740 Quot
= DAG
.getNode(FMADOpCode
, SL
, MVT::f32
, Err
, Rcp
, Quot
, Op
->getFlags());
10741 Err
= DAG
.getNode(FMADOpCode
, SL
, MVT::f32
, NegRHSExt
, Quot
, LHSExt
,
10743 SDValue Tmp
= DAG
.getNode(ISD::FMUL
, SL
, MVT::f32
, Err
, Rcp
, Op
->getFlags());
10744 SDValue TmpCast
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, Tmp
);
10745 TmpCast
= DAG
.getNode(ISD::AND
, SL
, MVT::i32
, TmpCast
,
10746 DAG
.getConstant(0xff800000, SL
, MVT::i32
));
10747 Tmp
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::f32
, TmpCast
);
10748 Quot
= DAG
.getNode(ISD::FADD
, SL
, MVT::f32
, Tmp
, Quot
, Op
->getFlags());
10749 SDValue RDst
= DAG
.getNode(ISD::FP_ROUND
, SL
, MVT::f16
, Quot
,
10750 DAG
.getConstant(0, SL
, MVT::i32
));
10751 return DAG
.getNode(AMDGPUISD::DIV_FIXUP
, SL
, MVT::f16
, RDst
, RHS
, LHS
,
// Faster 2.5 ULP division that does not support denormals.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDNodeFlags Flags = Op->getFlags();
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);

  const APFloat K0Val(0x1p+96f);
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

  const APFloat K1Val(0x1p-32f);
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);

  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);

  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
}
// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
                                    const SIMachineFunctionInfo *Info,
                                    const GCNSubtarget *ST) {
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
  return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
}
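// The returned immediate packs the requested FP32 denormal mode into bits
// [1:0] and keeps the function's existing FP64/FP16 mode in bits [3:2], which
// is the field layout S_DENORM_MODE writes into the MODE register.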
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  // The selection matcher assumes anything with a chain selecting to a
  // mayRaiseFPException machine instruction. Since we're introducing a chain
  // here, we need to explicitly report nofpexcept for the regular fdiv
  // lowering.
  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue DenominatorScaled =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
  SDValue NumeratorScaled =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);

  // Denominator is scaled to not be denormal, so using rcp is ok.
  SDValue ApproxRcp =
      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
  SDValue NegDivScale0 =
      DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);

  using namespace AMDGPU::Hwreg;
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const DenormalMode DenormMode = Info->getMode().FP32Denormals;

  const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
  const bool HasDynamicDenormals =
      (DenormMode.Input == DenormalMode::Dynamic) ||
      (DenormMode.Output == DenormalMode::Dynamic);

  SDValue SavedDenormMode;

  if (!PreservesDenormals) {
    // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
    // lowering. The chain dependence is insufficient, and we need glue. We do
    // not need the glue variants in a strictfp function.

    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);

    SDValue Glue = DAG.getEntryNode();
    if (HasDynamicDenormals) {
      SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
                                          DAG.getVTList(MVT::i32, MVT::Glue),
                                          {BitField, Glue});
      SavedDenormMode = SDValue(GetReg, 0);

      Glue = DAG.getMergeValues(
          {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
    }

    SDNode *EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);

      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
                                 EnableDenormValue)
                         .getNode();
    } else {
      const SDValue EnableDenormValue =
          DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                        {EnableDenormValue, BitField, Glue});
    }

    SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
                      SDValue(EnableDenorm, 1)};

    NegDivScale0 = DAG.getMergeValues(Ops, SL);
  }

  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
                             ApproxRcp, One, NegDivScale0, Flags);

  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
                             ApproxRcp, Fma0, Flags);

  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
                           Fma1, Flags);

  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                             NumeratorScaled, Mul, Flags);

  SDValue Fma3 =
      getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);

  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                             NumeratorScaled, Fma3, Flags);

  if (!PreservesDenormals) {
    SDNode *DisableDenorm;
    if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
      const SDValue DisableDenormValue = getSPDenormModeValue(
          FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);

      DisableDenorm =
          DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1),
                      DisableDenormValue, Fma4.getValue(2))
              .getNode();
    } else {
      assert(HasDynamicDenormals == (bool)SavedDenormMode);
      const SDValue DisableDenormValue =
          HasDynamicDenormals
              ? SavedDenormMode
              : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);

      DisableDenorm = DAG.getMachineNode(
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
          {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
    }

    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                      SDValue(DisableDenorm, 0), DAG.getRoot());
    DAG.setRoot(OutputChain);
  }

  SDValue Scale = NumeratorScaled.getValue(1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
                             {Fma4, Fma1, Fma3, Scale}, Flags);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
}
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);

  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);

  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

  SDValue Fma4 =
      DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);

  SDValue Scale;

  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from
    // div_scale is not usable.

    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

    // Figure out which scale to use for div_fmas.
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getValue(1);
  }

  SDValue Fmas =
      DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}
SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  if (VT == MVT::f16)
    return LowerFDIV16(Op, DAG);

  llvm_unreachable("Unexpected type for fdiv");
}
SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Val = Op.getOperand(0);
  EVT VT = Val.getValueType();
  EVT ResultExpVT = Op->getValueType(1);
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;

  SDValue Mant = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, VT,
      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);

  SDValue Exp = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);

  if (Subtarget->hasFractBug()) {
    SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
    SDValue Inf =
        DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);

    SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
    SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
    Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
    Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
  }

  SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
  return DAG.getMergeValues({Mant, CastExp}, dl);
}
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    return DAG.getTruncStore(
        Store->getChain(), DL,
        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
      Store->getAlign().value() < VT.getStoreSize() &&
      VT.getSizeInBits() > 32) {
    return SplitVectorStore(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
             ? AMDGPUAS::PRIVATE_ADDRESS
             : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = VT.getVectorNumElements();
  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorStore(Op, DAG);
    // v3 stores not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return SplitVectorStore(Op, DAG);

    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        VT, *Store->getMemOperand()))
      return expandUnalignedStore(Store, DAG);

    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->enableFlatScratch()))
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    unsigned Fast = 0;
    auto Flags = Store->getMemOperand()->getFlags();
    if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
                                           Store->getAlign(), Flags, &Fast) &&
        Fast > 1)
      return SDValue();

    if (VT.isVector())
      return SplitVectorStore(Op, DAG);

    return expandUnalignedStore(Store, DAG);
  }

  // Probably an invalid store. If so we'll end up emitting a selection error.
  return SDValue();
}
// Avoid the full correct expansion for f32 sqrt when promoting from f16.
SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  assert(!Subtarget->has16BitInsts());
  SDNodeFlags Flags = Op->getFlags();
  SDValue Ext =
      DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);

  SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
  SDValue Sqrt =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);

  return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
                     DAG.getTargetConstant(0, SL, MVT::i32), Flags);
}
SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  const SDValue X = Op.getOperand(0);

  if (allowApproxFunc(DAG, Flags)) {
    // Instruction is 1ulp but ignores denormals.
    return DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, VT,
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
  }

  SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
  SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);

  SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);

  SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);

  SDValue SqrtX =
      DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);

  SDValue SqrtS;
  if (needsDenormHandlingF32(DAG, X, Flags)) {
    SDValue SqrtID =
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
    SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);

    SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
    SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                                           DAG.getConstant(-1, DL, MVT::i32));
    SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);

    SDValue NegSqrtSNextDown =
        DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);

    SDValue SqrtVP =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                                         DAG.getConstant(1, DL, MVT::i32));
    SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);

    SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
    SDValue SqrtVS =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
    SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);

    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
                        Flags);

    SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
                        Flags);
  } else {
    SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);

    SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);

    SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
    SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
    SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);

    SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);

    SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
    SDValue SqrtD =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
  }

  SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);

  SDValue ScaledDown =
      DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);

  SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
  SDValue IsZeroOrInf =
      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));

  return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
}
SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  //   y0 = rsq(x)
  //   g0 = x * y0
  //   h0 = 0.5 * y0
  //
  //   r0 = 0.5 - h0 * g0
  //   g1 = g0 * r0 + g0
  //   h1 = h0 * r0 + h0
  //
  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
  //   h2 = h1 * r1 + h1
  //
  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
  //
  //   sqrt(x) = g3
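  //
  // As a reading aid, a scalar sketch of one refinement step (illustrative
  // only, not the lowering itself): starting from y0 ~= 1/sqrt(x),
  //   g = x * y0;  h = 0.5 * y0;
  //   r = 0.5 - h * g;  g = g * r + g;  h = h * r + h;
  //   d = x - g * g;    g = d * h + g;
  // each such step roughly doubles the number of correct bits in
  // g ~= sqrt(x).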
  SDNodeFlags Flags = Op->getFlags();

  SDLoc DL(Op);

  SDValue X = Op.getOperand(0);
  SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);

  SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);

  SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);

  // Scale up input if it is too small.
  SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
  SDValue ScaleUp =
      DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
  SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);

  SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);

  SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);

  SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
  SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);

  SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
  SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);

  SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);

  SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);

  SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
  SDValue SqrtD0 =
      DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);

  SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);

  SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
  SDValue SqrtD1 =
      DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);

  SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);

  SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
  SDValue ScaleDown =
      DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);

  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
  // with finite only or nsz because rsq(+/-0) = +/-inf

  // TODO: Check for DAZ and expand to subnormals
  SDValue IsZeroOrInf =
      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));

  // If x is +INF, +0, or -0, use its original value
  return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
                     Flags);
}
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue TrigVal;

  // Propagate fast-math flags so that the multiply we introduce can be folded
  // if Arg is already the result of a multiply by constant.
  auto Flags = Op->getFlags();

  SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);

  if (Subtarget->hasTrigReducedRange()) {
    SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
    TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
  } else {
    TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
  }

  switch (Op.getOpcode()) {
  case ISD::FCOS:
    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
  case ISD::FSIN:
    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
  default:
    llvm_unreachable("Wrong trig opcode");
  }
}
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                               SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
  if (!AMDGPU::isFlatGlobalAddrSpace(AS))
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  SDValue Old = Op.getOperand(2);
  SDValue New = Op.getOperand(3);
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  MVT VecType = MVT::getVectorVT(SimpleVT, 2);

  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
  SDValue Ops[] = {ChainIn, Addr, NewOld};

  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
                                 Op->getVTList(), Ops, VT,
                                 AtomicNode->getMemOperand());
}
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

SDValue
SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  EVT ScalarVT = VT.getScalarType();
  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;

  SDLoc DL(N);
  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
      DCI.AddToWorklist(Cvt.getNode());

      // For the f16 case, fold to a cast to f32 and then cast back to f16.
      if (ScalarVT != MVT::f32) {
        Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
                          DAG.getTargetConstant(0, DL, MVT::i32));
      }
      return Cvt;
    }
  }

  return SDValue();
}
SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue MagnitudeOp = N->getOperand(0);
  SDValue SignOp = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // f64 fcopysign is really an f32 copysign on the high bits, so replace the
  // lower half with a copy.
  // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
  if (MagnitudeOp.getValueType() == MVT::f64) {
    SDValue MagAsVector =
        DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
    SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
                                MagAsVector, DAG.getConstant(0, DL, MVT::i32));
    SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
                                MagAsVector, DAG.getConstant(1, DL, MVT::i32));

    SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);

    SDValue Vector =
        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);

    return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
  }

  if (SignOp.getValueType() != MVT::f64)
    return SDValue();

  // Reduce width of sign operand, we only need the highest bit.
  //
  // fcopysign f64:x, f64:y ->
  //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
  // TODO: In some cases it might make sense to go all the way to f16.
  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
  SDValue SignAsF32 =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
                  DAG.getConstant(1, DL, MVT::i32));

  return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
                     SignAsF32);
}
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
// common bits.
//
// This is a variant of
// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
//
// The normal DAG combiner will do this, but only if the add has one use since
// that would increase the number of instructions.
//
// This prevents us from seeing a constant offset that can be folded into a
// memory instruction's addressing mode. If we know the resulting add offset of
// a pointer can be folded into an addressing offset, we can replace the pointer
// operand with the add of new constant offset. This eliminates one of the uses,
// and may allow the remaining use to also be simplified.
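//
// For example (illustrative, with assumed constants):
//   (shl (add %ptr, 16), 2)  -->  (add (shl %ptr, 2), 64)
// so the +64 can later be folded into the memory instruction's immediate
// offset while the original (add %ptr, 16) keeps serving its other uses.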
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We only do this to handle cases where it's profitable when there are
  // multiple uses of the add, so defer to the standard combine.
  if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
      N0->hasOneUse())
    return SDValue();

  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
  if (!CN1)
    return SDValue();

  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!CAdd)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;

  if (N0->getOpcode() == ISD::OR &&
      !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
    return SDValue();

  // If the resulting offset is too large, we can't fold it into the
  // addressing mode offset.
  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
  Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());

  AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();
  if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
    return SDValue();

  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
  SDValue COffset = DAG.getConstant(Offset, SL, VT);

  SDNodeFlags Flags;
  Flags.setNoUnsignedWrap(
      N->getFlags().hasNoUnsignedWrap() &&
      (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));

  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
/// by the chain and intrinsic ID. Theoretically we would also need to check the
/// specific intrinsic, but they all place the pointer operand first.
static unsigned getBasePtrIndex(const MemSDNode *N) {
  switch (N->getOpcode()) {
  case ISD::STORE:
  case ISD::INTRINSIC_W_CHAIN:
  case ISD::INTRINSIC_VOID:
    return 2;
  default:
    return 1;
  }
}

SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  unsigned PtrIdx = getBasePtrIndex(N);
  SDValue Ptr = N->getOperand(PtrIdx);

  // TODO: We could also do this for multiplies.
  if (Ptr.getOpcode() == ISD::SHL) {
    SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                          N->getMemoryVT(), DCI);
    if (NewPtr) {
      SmallVector<SDValue, 8> NewOps(N->ops());

      NewOps[PtrIdx] = NewPtr;
      return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
    }
  }

  return SDValue();
}
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
         (Opc == ISD::XOR && Val == 0);
}
// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
// integer combine opportunities since most 64-bit operations are decomposed
// this way.  TODO: We won't want this for SALU especially if it is an inline
// immediate.
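//
// For example (illustrative, with an assumed constant): for
// (and i64:x, 0x00000000ffffffff) the low half is an AND with all ones (a
// copy of lo_32(x)) and the high half is an AND with zero (the constant 0),
// so no 64-bit AND needs to be materialized.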
SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
    const ConstantSDNode *CRHS) const {
  uint64_t Val = CRHS->getZExtValue();
  uint32_t ValLo = Lo_32(Val);
  uint32_t ValHi = Hi_32(Val);
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
       bitOpWithConstantIsReducible(Opc, ValHi)) ||
      (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
    // If we need to materialize a 64-bit immediate, it will be split up later
    // anyway. Avoid creating the harder to understand 64-bit immediate
    // materialization.
    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
  }

  return SDValue();
}
bool llvm::isBoolSGPR(SDValue V) {
  if (V.getValueType() != MVT::i1)
    return false;
  switch (V.getOpcode()) {
  default:
    break;
  case ISD::SETCC:
  case AMDGPUISD::FP_CLASS:
    return true;
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
  }
  return false;
}
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
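// For example (illustrative): 0x00ff00ff is returned unchanged since every
// byte is either 0x00 or 0xff, while 0x00f000ff returns 0 because byte 1 is
// only partially selected.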
static uint32_t getConstantPermuteMask(uint32_t C) {
  // 0xff for any zero byte in the mask
  uint32_t ZeroByteMask = 0;
  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
  if ((NonZeroByteMask & C) != NonZeroByteMask)
    return 0; // Partial bytes selected.
  return C;
}
// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns select mask as in the v_perm_b32
// or -1 if not succeeded.
// Note byte select encoding:
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
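// For example (illustrative, with an assumed constant): (and x, 0x0000ffff)
// yields the select mask 0x0c0c0100, i.e. bytes 0-1 are taken from x and
// bytes 2-3 are forced to zero.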
static uint32_t getPermuteMask(SDValue V) {
  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)
    return ~0;

  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!N1)
    return ~0;

  uint32_t C = N1->getZExtValue();

  switch (V.getOpcode()) {
  default:
    break;
  case ISD::AND:
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    break;

  case ISD::OR:
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ~ConstMask) | ConstMask;
    break;

  case ISD::SHL:
    if (C % 8)
      return ~0;

    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

  case ISD::SRL:
    if (C % 8)
      return ~0;

    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }

  return ~0;
}
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (VT == MVT::i64 && CRHS) {
    if (SDValue Split =
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
      return Split;
  }

  if (CRHS && VT == MVT::i32) {
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    // nb = number of trailing zeroes in mask
    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
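    // For example (illustrative, with assumed constants):
    //   (and (srl x, 8), 0xff00) -> (shl (bfe x, /*offset=*/16, /*width=*/8), 8)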
    uint64_t Mask = CRHS->getZExtValue();
    unsigned Bits = llvm::popcount(Mask);
    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
        unsigned Shift = CShift->getZExtValue();
        unsigned NB = CRHS->getAPIntValue().countr_zero();
        unsigned Offset = NB + Shift;
        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
          SDLoc SL(N);
          SDValue BFE =
              DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
                          DAG.getConstant(Offset, SL, MVT::i32),
                          DAG.getConstant(Bits, SL, MVT::i32));
          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
                                    DAG.getValueType(NarrowVT));
          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
          return Shl;
        }
      }
    }

    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
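    // For example (illustrative, with assumed constants):
    //   (and (perm x, y, 0x07060504), 0x0000ffff) -> perm x, y, 0x0c0c0504
    // where the masked-off high bytes become the zero selector 0x0c.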
    if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
        isa<ConstantSDNode>(LHS.getOperand(2))) {
      uint32_t Sel = getConstantPermuteMask(Mask);
      if (!Sel)
        return SDValue();

      // Select 0xc for all zero bytes
      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                         LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();

    SDValue X = LHS.getOperand(0);
    SDValue Y = RHS.getOperand(0);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
        !isTypeLegal(X.getValueType()))
      return SDValue();

    if (LCC == ISD::SETO) {
      if (X != LHS.getOperand(1))
        return SDValue();

      if (RCC == ISD::SETUNE) {
        const ConstantFPSDNode *C1 =
            dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        const uint32_t Mask = SIInstrFlags::N_NORMAL |
                              SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
                              SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
                              SIInstrFlags::P_NORMAL;

        static_assert(
            ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
                SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
             0x3ff) == Mask,
            "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
                           DAG.getConstant(Mask, DL, MVT::i32));
      }
    }
  }

  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
    std::swap(LHS, RHS);

  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      RHS.hasOneUse()) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
    // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
    // | n_nan)
    const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
        (RHS.getOperand(0) == LHS.getOperand(0) &&
         LHS.getOperand(0) == LHS.getOperand(1))) {
      const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
      unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
                                          : Mask->getZExtValue() & OrdMask;

      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
                         DAG.getConstant(NewMask, DL, MVT::i32));
    }
  }

  if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
                         LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    // and x, (sext cc from i1) => select cc, x, 0
    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
      std::swap(LHS, RHS);
    if (isBoolSGPR(RHS.getOperand(0)))
      return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
                           DAG.getConstant(0, SDLoc(N), MVT::i32));
  }

  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    uint32_t LHSMask = getPermuteMask(LHS);
    uint32_t RHSMask = getPermuteMask(RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check of we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte in each mask is either selector mask 0-3, or has higher
        // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
        // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
        // mask which is not 0xff wins. By anding both masks we have a correct
        // result except that 0x0c shall be corrected to give 0x0c only.
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }

        // Add 4 to each active LHS lane. It will not affect any existing 0xff
        // or 0x0c.
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                           RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  return SDValue();
}
// A key component of v_perm is a mapping between byte position of the src
// operands, and the byte position of the dest. To provide such, we need: 1. the
// node that provides x byte of the dest of the OR, and 2. the byte of the node
// used to provide that x byte. calculateByteProvider finds which node provides
// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
// and finds an ultimate src and byte position For example: The supported
// LoadCombine pattern for vector loads is as follows (tree abbreviated to its
// surviving leaf levels):
//
//      t9      t10     t11     t12     t13     t14
//    trunc*    8     trunc*    8      and     and
//      t15     t16     t17     t18     t19     t20
//                    trunc*   255     srl    -256
//
// *In this example, the truncs are from i32->i16
//
// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
// respectively. calculateSrcByte would find (given node) -> ultimate src &
// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
// After finding the mapping, we can combine the tree into a vperm of t15, t16
// with the corresponding select mask.
// Find the source and byte position from a node.
// \p DestByte is the byte position of the dest of the or that the src
// ultimately provides. \p SrcIndex is the byte of the src that maps to this
// dest of the or byte. \p Depth tracks how many recursive iterations we have
// performed.
static const std::optional<ByteProvider<SDValue>>
calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
                 unsigned Depth = 0) {
  // We may need to recursively traverse a series of SRLs
  if (Depth >= 6)
    return std::nullopt;

  if (Op.getValueSizeInBits() < 8)
    return std::nullopt;

  if (Op.getValueType().isVector())
    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);

  switch (Op->getOpcode()) {
  case ISD::TRUNCATE: {
    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }

  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND_INREG: {
    SDValue NarrowOp = Op->getOperand(0);
    auto NarrowVT = NarrowOp.getValueType();
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
      auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
      NarrowVT = VTSign->getVT();
    }
    if (!NarrowVT.isByteSized())
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowVT.getStoreSize();

    if (SrcIndex >= NarrowByteWidth)
      return std::nullopt;
    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }

  case ISD::SRA:
  case ISD::SRL: {
    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();

    if (BitShift % 8 != 0)
      return std::nullopt;

    SrcIndex += BitShift / 8;

    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }

  default: {
    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
  }
  }
  llvm_unreachable("fully handled switch");
}
// For a byte position in the result of an Or, traverse the tree and find the
// node (and the byte of the node) which ultimately provides this {Or,
// BytePosition}. \p Op is the operand we are currently examining. \p Index is
// the byte position of the Op that corresponds with the originally requested
// byte of the Or \p Depth tracks how many recursive iterations we have
// performed. \p StartingIndex is the originally requested byte of the Or
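// For example (illustrative): in (or (shl x, 16), (and y, 0xffff)), byte 0 of
// the result is provided by byte 0 of y and byte 2 by byte 0 of x; for each of
// those positions the other operand contributes a constant zero.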
static const std::optional<ByteProvider<SDValue>>
calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
                      unsigned StartingIndex = 0) {
  // Finding Src tree of RHS of or typically requires at least 1 additional
  // depth
  if (Depth > 6)
    return std::nullopt;

  unsigned BitWidth = Op.getScalarValueSizeInBits();
  if (BitWidth % 8 != 0)
    return std::nullopt;
  if (Index > BitWidth / 8 - 1)
    return std::nullopt;

  bool IsVec = Op.getValueType().isVector();
  switch (Op.getOpcode()) {
  case ISD::OR: {
    if (IsVec)
      return std::nullopt;

    auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
                                     StartingIndex);
    if (!RHS)
      return std::nullopt;
    auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
                                     StartingIndex);
    if (!LHS)
      return std::nullopt;
    // A well formed Or will have two ByteProviders for each byte, one of which
    // is constant zero
    if (!LHS->isConstantZero() && !RHS->isConstantZero())
      return std::nullopt;
    if (!LHS || LHS->isConstantZero())
      return RHS;
    if (!RHS || RHS->isConstantZero())
      return LHS;
    return std::nullopt;
  }

  case ISD::AND: {
    if (IsVec)
      return std::nullopt;

    auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!BitMaskOp)
      return std::nullopt;

    uint32_t BitMask = BitMaskOp->getZExtValue();
    // Bits we expect for our StartingIndex
    uint32_t IndexMask = 0xFF << (Index * 8);

    if ((IndexMask & BitMask) != IndexMask) {
      // If the result of the and partially provides the byte, then it
      // is not well formatted
      if (IndexMask & BitMask)
        return std::nullopt;
      return ByteProvider<SDValue>::getConstantZero();
    }

    return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
  }

  case ISD::FSHR: {
    if (IsVec)
      return std::nullopt;

    // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
    if (!ShiftOp || Op.getValueType().isVector())
      return std::nullopt;

    uint64_t BitsProvided = Op.getValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
    if (BitShift % 8)
      return std::nullopt;

    uint64_t ConcatSizeInBytes = BitsProvided / 4;
    uint64_t ByteShift = BitShift / 8;

    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
    uint64_t BytesProvided = BitsProvided / 8;
    SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
    NewIndex %= BytesProvided;
    return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
  }

  case ISD::SRA:
  case ISD::SRL: {
    if (IsVec)
      return std::nullopt;

    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8)
      return std::nullopt;

    auto BitsProvided = Op.getScalarValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BytesProvided = BitsProvided / 8;
    uint64_t ByteShift = BitShift / 8;
    // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
    // If the byte we are trying to provide (as tracked by index) falls in this
    // range, then the SRL provides the byte. The byte of interest of the src of
    // the SRL is Index + ByteShift
    return BytesProvided - ByteShift > Index
               ? calculateSrcByte(Op->getOperand(0), StartingIndex,
                                  Index + ByteShift)
               : ByteProvider<SDValue>::getConstantZero();
  }

  case ISD::SHL: {
    if (IsVec)
      return std::nullopt;

    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    uint64_t ByteShift = BitShift / 8;

    // If we are shifting by an amount greater than (or equal to)
    // the index we are trying to provide, then it provides 0s. If not,
    // then this bytes are not definitively 0s, and the corresponding byte
    // of interest is Index - ByteShift of the src
    return Index < ByteShift
               ? ByteProvider<SDValue>::getConstantZero()
               : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
                                       Depth + 1, StartingIndex);
  }
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND_INREG:
  case ISD::AssertZext:
  case ISD::AssertSext: {
    if (IsVec)
      return std::nullopt;

    SDValue NarrowOp = Op->getOperand(0);
    unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
        Op->getOpcode() == ISD::AssertZext ||
        Op->getOpcode() == ISD::AssertSext) {
      auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
      NarrowBitWidth = VTSign->getVT().getSizeInBits();
    }
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    if (Index >= NarrowByteWidth)
      return Op.getOpcode() == ISD::ZERO_EXTEND
                 ? std::optional<ByteProvider<SDValue>>(
                       ByteProvider<SDValue>::getConstantZero())
                 : std::nullopt;
    return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
  }

  case ISD::TRUNCATE: {
    if (IsVec)
      return std::nullopt;

    uint64_t NarrowByteWidth = BitWidth / 8;

    if (NarrowByteWidth >= Index) {
      return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
                                   StartingIndex);
    }

    return std::nullopt;
  }

  case ISD::CopyFromReg: {
    if (BitWidth / 8 > Index)
      return calculateSrcByte(Op, StartingIndex, Index);

    return std::nullopt;
  }

  case ISD::LOAD: {
    auto *L = cast<LoadSDNode>(Op.getNode());

    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    // If the width of the load does not reach byte we are trying to provide for
    // and it is not a ZEXTLOAD, then the load does not provide for the byte in
    // question
    if (Index >= NarrowByteWidth) {
      return L->getExtensionType() == ISD::ZEXTLOAD
                 ? std::optional<ByteProvider<SDValue>>(
                       ByteProvider<SDValue>::getConstantZero())
                 : std::nullopt;
    }

    if (NarrowByteWidth > Index) {
      return calculateSrcByte(Op, StartingIndex, Index);
    }

    return std::nullopt;
  }

  case ISD::BSWAP: {
    if (IsVec)
      return std::nullopt;

    return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
                                 Depth + 1, StartingIndex);
  }

  case ISD::EXTRACT_VECTOR_ELT: {
    auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!IdxOp)
      return std::nullopt;
    auto VecIdx = IdxOp->getZExtValue();
    auto ScalarSize = Op.getScalarValueSizeInBits();
    if (ScalarSize < 32)
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
    return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
                            StartingIndex, Index);
  }

  case AMDGPUISD::PERM: {
    if (IsVec)
      return std::nullopt;

    auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
    if (!PermMask)
      return std::nullopt;

    auto IdxMask =
        (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
    if (IdxMask > 0x07 && IdxMask != 0x0c)
      return std::nullopt;

    auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
    auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;

    return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
                           : ByteProvider<SDValue>(
                                 ByteProvider<SDValue>::getConstantZero());
  }

  default: {
    return std::nullopt;
  }
  }

  llvm_unreachable("fully handled switch");
}
// Returns true if the Operand is a scalar and is 16 bits
static bool isExtendedFrom16Bits(SDValue &Operand) {

  switch (Operand.getOpcode()) {
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND: {
    auto OpVT = Operand.getOperand(0).getValueType();
    return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
  }
  case ISD::LOAD: {
    LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
    auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
    if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
        ExtType == ISD::EXTLOAD) {
      auto MemVT = L->getMemoryVT();
      return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
    }
    return L->getMemoryVT().getSizeInBits() == 16;
  }
  default:
    return false;
  }
}
// Returns true if the mask matches consecutive bytes, and the first byte
// begins at a power of 2 byte offset from 0th byte
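// For example (illustrative): a mask half of 0x0504 addresses a whole,
// 16-bit-aligned operand (bytes 4 and 5), whereas 0x0403 selects consecutive
// but misaligned bytes and still needs a byte-wise v_perm.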
static bool addresses16Bits(int Mask) {
  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;

  assert(Low8 < 8 && Hi8 < 8);
  // Are the bytes contiguous in the order of increasing addresses.
  bool IsConsecutive = (Hi8 - Low8 == 1);
  // Is the first byte at location that is aligned for 16 bit instructions.
  // A counter example is taking 2 consecutive bytes starting at the 8th bit.
  // In this case, we still need code to extract the 16 bit operand, so it
  // is better to use i8 v_perm
  bool Is16Aligned = !(Low8 % 2);

  return IsConsecutive && Is16Aligned;
}
// Do not lower into v_perm if the operands are actually 16 bit
// and the selected bits (based on PermMask) correspond with two
// easily addressable 16 bit operands.
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
                                SDValue &OtherOp) {
  int Low16 = PermMask & 0xffff;
  int Hi16 = (PermMask & 0xffff0000) >> 16;

  auto TempOp = peekThroughBitcasts(Op);
  auto TempOtherOp = peekThroughBitcasts(OtherOp);

  auto OpIs16Bit =
      TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
  if (!OpIs16Bit)
    return true;

  auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
                        isExtendedFrom16Bits(TempOtherOp);
  if (!OtherOpIs16Bit)
    return true;

  // Do we cleanly address both
  return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
}
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
                                  unsigned DWordOffset) {
  SDValue Ret;

  auto TypeSize = Src.getValueSizeInBits().getFixedValue();
  // ByteProvider must be at least 8 bits
  assert(Src.getValueSizeInBits().isKnownMultipleOf(8));

  if (TypeSize <= 32)
    return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);

  if (Src.getValueType().isVector()) {
    auto ScalarTySize = Src.getScalarValueSizeInBits();
    auto ScalarTy = Src.getValueType().getScalarType();
    if (ScalarTySize == 32) {
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
                         DAG.getConstant(DWordOffset, SL, MVT::i32));
    }
    if (ScalarTySize > 32) {
      Ret = DAG.getNode(
          ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
          DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
      auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
      if (ShiftVal)
        Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
                          DAG.getConstant(ShiftVal, SL, MVT::i32));
      return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
    }

    assert(ScalarTySize < 32);
    auto NumElements = TypeSize / ScalarTySize;
    auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
    auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
    auto NumElementsIn32 = 32 / ScalarTySize;
    auto NumAvailElements = DWordOffset < Trunc32Elements
                                ? NumElementsIn32
                                : NumElements - NormalizedTrunc;

    SmallVector<SDValue, 4> VecSrcs;
    DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
                              NumAvailElements);

    Ret = DAG.getBuildVector(
        MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
        VecSrcs);
    return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
  }

  auto ShiftVal = 32 * DWordOffset;
  Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
                    DAG.getConstant(ShiftVal, SL, MVT::i32));
  return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
}
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  [[maybe_unused]] EVT VT = N->getValueType(0);
  SmallVector<ByteProvider<SDValue>, 8> PermNodes;

  // VT is known to be MVT::i32, so we need to provide 4 bytes.
  assert(VT == MVT::i32);
  for (int i = 0; i < 4; i++) {
    // Find the ByteProvider that provides the ith byte of the result of OR
    std::optional<ByteProvider<SDValue>> P =
        calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
    // TODO support constantZero
    if (!P || P->isConstantZero())
      return SDValue();

    PermNodes.push_back(*P);
  }
  if (PermNodes.size() != 4)
    return SDValue();

  std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
  std::optional<std::pair<unsigned, unsigned>> SecondSrc;
  uint64_t PermMask = 0x00000000;
  for (size_t i = 0; i < PermNodes.size(); i++) {
    auto PermOp = PermNodes[i];
    // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
    // by sizeof(Src2) = 4
    int SrcByteAdjust = 4;

    // If the Src uses a byte from a different DWORD, then it corresponds
    // with a difference source
    if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
        ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
      if (SecondSrc)
        if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
            ((PermOp.SrcOffset / 4) != SecondSrc->second))
          return SDValue();

      // Set the index of the second distinct Src node
      SecondSrc = {i, PermNodes[i].SrcOffset / 4};
      assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
      SrcByteAdjust = 0;
    }
    assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
    assert(!DAG.getDataLayout().isBigEndian());
    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
  }
  SDLoc DL(N);
  SDValue Op = *PermNodes[FirstSrc.first].Src;
  Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
  assert(Op.getValueSizeInBits() == 32);

  // Check that we are not just extracting the bytes in order from an op
  if (!SecondSrc) {
    int Low16 = PermMask & 0xffff;
    int Hi16 = (PermMask & 0xffff0000) >> 16;

    bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
    bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);

    // The perm op would really just produce Op. So combine into Op
    if (WellFormedLow && WellFormedHi)
      return DAG.getBitcast(MVT::getIntegerVT(32), Op);
  }

  SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;

  if (SecondSrc) {
    OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
    assert(OtherOp.getValueSizeInBits() == 32);
  }

  if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {

    assert(Op.getValueType().isByteSized() &&
           OtherOp.getValueType().isByteSized());

    // If the ultimate src is less than 32 bits, then we will only be
    // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
    // CalculateByteProvider would not have returned Op as source if we
    // used a byte that is outside its ValueType. Thus, we are free to
    // ANY_EXTEND as the extended bits are dont-cares.
    Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
    OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);

    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
                       DAG.getConstant(PermMask, DL, MVT::i32));
  }

  return SDValue();
}
SITargetLowering::performOrCombine(SDNode
*N
,
12445 DAGCombinerInfo
&DCI
) const {
12446 SelectionDAG
&DAG
= DCI
.DAG
;
12447 SDValue LHS
= N
->getOperand(0);
12448 SDValue RHS
= N
->getOperand(1);
12450 EVT VT
= N
->getValueType(0);
12451 if (VT
== MVT::i1
) {
12452 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12453 if (LHS
.getOpcode() == AMDGPUISD::FP_CLASS
&&
12454 RHS
.getOpcode() == AMDGPUISD::FP_CLASS
) {
12455 SDValue Src
= LHS
.getOperand(0);
12456 if (Src
!= RHS
.getOperand(0))
12459 const ConstantSDNode
*CLHS
= dyn_cast
<ConstantSDNode
>(LHS
.getOperand(1));
12460 const ConstantSDNode
*CRHS
= dyn_cast
<ConstantSDNode
>(RHS
.getOperand(1));
12461 if (!CLHS
|| !CRHS
)
12464 // Only 10 bits are used.
12465 static const uint32_t MaxMask
= 0x3ff;
12468 (CLHS
->getZExtValue() | CRHS
->getZExtValue()) & MaxMask
;
12470 return DAG
.getNode(AMDGPUISD::FP_CLASS
, DL
, MVT::i1
, Src
,
12471 DAG
.getConstant(NewMask
, DL
, MVT::i32
));
12477 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
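  // For example (illustrative, with assumed constants):
  //   (or (perm x, y, 0x07060504), 0x0000ff00) -> perm x, y, 0x0706ff04
  // where the constant's 0xff byte becomes the 0xff selector in the mask.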
  if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
      LHS.getOpcode() == AMDGPUISD::PERM &&
      isa<ConstantSDNode>(LHS.getOperand(2))) {
    uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
    if (!Sel)
      return SDValue();

    Sel |= LHS.getConstantOperandVal(2);
    SDLoc DL(N);
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                       LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
  }

  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {

    // If all the uses of an or need to extract the individual elements, do not
    // attempt to lower into v_perm
    auto usesCombinedOperand = [](SDNode *OrUse) {
      // If we have any non-vectorized use, then it is a candidate for v_perm
      if (OrUse->getOpcode() != ISD::BITCAST ||
          !OrUse->getValueType(0).isVector())
        return true;

      // If we have any non-vectorized use, then it is a candidate for v_perm
      for (auto *VUse : OrUse->uses()) {
        if (!VUse->getValueType(0).isVector())
          return true;

        // If the use of a vector is a store, then combining via a v_perm
        // is beneficial.
        // TODO -- whitelist more uses
        for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
          if (VUse->getOpcode() == VectorwiseOp)
            return true;
      }
      return false;
    };

    if (!any_of(N->uses(), usesCombinedOperand))
      return SDValue();

    uint32_t LHSMask = getPermuteMask(LHS);
    uint32_t RHSMask = getPermuteMask(RHS);

    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check of we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Kill zero bytes selected by other mask. Zero value is 0xc.
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        // Add 4 to each active LHS lane
        LHSMask |= LHSUsedLanes & 0x04040404;

        uint32_t Sel = LHSMask | RHSMask;
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                           RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
    if (LHSMask == ~0u || RHSMask == ~0u) {
      if (SDValue Perm = matchPERM(N, DCI))
        return Perm;
    }
  }

  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
    return SDValue();

  // TODO: This could be a generic combine with a predicate for extracting the
  // high half of an integer being free.

  // (or i64:x, (zero_extend i32:y)) ->
  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
      RHS.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(LHS, RHS);

  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue ExtSrc = RHS.getOperand(0);
    EVT SrcVT = ExtSrc.getValueType();
    if (SrcVT == MVT::i32) {
      SDLoc SL(N);
      auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);

      DCI.AddToWorklist(LowOr.getNode());
      DCI.AddToWorklist(HiBits.getNode());

      SDValue Vec =
          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    }
  }

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (CRHS) {
    if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
                                                 N->getOperand(0), CRHS))
      return Split;
  }

  return SDValue();
}
SITargetLowering::performXorCombine(SDNode
*N
,
12603 DAGCombinerInfo
&DCI
) const {
12604 if (SDValue RV
= reassociateScalarOps(N
, DCI
.DAG
))
12607 SDValue LHS
= N
->getOperand(0);
12608 SDValue RHS
= N
->getOperand(1);
12610 const ConstantSDNode
*CRHS
= dyn_cast
<ConstantSDNode
>(RHS
);
12611 SelectionDAG
&DAG
= DCI
.DAG
;
12613 EVT VT
= N
->getValueType(0);
12614 if (CRHS
&& VT
== MVT::i64
) {
12615 if (SDValue Split
=
12616 splitBinaryBitConstantOp(DCI
, SDLoc(N
), ISD::XOR
, LHS
, CRHS
))
12620 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12621 // fneg-like xors into 64-bit select.
12622 if (LHS
.getOpcode() == ISD::SELECT
&& VT
== MVT::i32
) {
12623 // This looks like an fneg, try to fold as a source modifier.
12624 if (CRHS
&& CRHS
->getAPIntValue().isSignMask() &&
12625 shouldFoldFNegIntoSrc(N
, LHS
)) {
12626 // xor (select c, a, b), 0x80000000 ->
12627 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12630 DAG
.getNode(ISD::BITCAST
, DL
, MVT::f32
, LHS
->getOperand(1));
12632 DAG
.getNode(ISD::BITCAST
, DL
, MVT::f32
, LHS
->getOperand(2));
12633 SDValue FNegLHS
= DAG
.getNode(ISD::FNEG
, DL
, MVT::f32
, CastLHS
);
12634 SDValue FNegRHS
= DAG
.getNode(ISD::FNEG
, DL
, MVT::f32
, CastRHS
);
12635 SDValue NewSelect
= DAG
.getNode(ISD::SELECT
, DL
, MVT::f32
,
12636 LHS
->getOperand(0), FNegLHS
, FNegRHS
);
12637 return DAG
.getNode(ISD::BITCAST
, DL
, VT
, NewSelect
);
12644 SDValue
SITargetLowering::performZeroExtendCombine(SDNode
*N
,
12645 DAGCombinerInfo
&DCI
) const {
12646 if (!Subtarget
->has16BitInsts() ||
12647 DCI
.getDAGCombineLevel() < AfterLegalizeDAG
)
12650 EVT VT
= N
->getValueType(0);
12651 if (VT
!= MVT::i32
)
12654 SDValue Src
= N
->getOperand(0);
12655 if (Src
.getValueType() != MVT::i16
)
SDValue
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Src = N->getOperand(0);
  auto *VTSign = cast<VTSDNode>(N->getOperand(1));

  // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
  // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
  if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16))) {
    assert(Subtarget->hasScalarSubwordLoads() &&
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::SBUFFER_LOAD_BYTE
                       : AMDGPUISD::SBUFFER_LOAD_SHORT;
    SDLoc DL(N);
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
    SDValue Ops[] = {
        Src.getOperand(0), // source register
        Src.getOperand(1), // offset
        Src.getOperand(2)  // cachePolicy
    };
    auto *M = cast<MemSDNode>(Src);
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
    return LoadVal;
  }
  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
      Src.hasOneUse()) {
    auto *M = cast<MemSDNode>(Src);
    SDValue Ops[] = {Src.getOperand(0), // Chain
                     Src.getOperand(1), // rsrc
                     Src.getOperand(2), // vindex
                     Src.getOperand(3), // voffset
                     Src.getOperand(4), // soffset
                     Src.getOperand(5), // offset
                     Src.getOperand(6), Src.getOperand(7)};
    // replace with BUFFER_LOAD_BYTE/SHORT
    SDVTList ResList =
        DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::BUFFER_LOAD_BYTE
                       : AMDGPUISD::BUFFER_LOAD_SHORT;
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    return DCI.DAG.getMergeValues(
        {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
  }
  return SDValue();
}
SDValue SITargetLowering::performClassCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Mask = N->getOperand(1);

  // fp_class x, 0 -> false
  if (isNullConstant(Mask))
    return DAG.getConstant(0, SDLoc(N), MVT::i1);

  if (N->getOperand(0).isUndef())
    return DAG.getUNDEF(MVT::i1);

  return SDValue();
}
SDValue SITargetLowering::performRcpCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);

  if (N0.isUndef()) {
    return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
                                 SDLoc(N), VT);
  }

  if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
                         N0.getOpcode() == ISD::SINT_TO_FP)) {
    return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
                           N->getFlags());
  }

  // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
  if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
      N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
    return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
                           N->getFlags());
  }

  return AMDGPUTargetLowering::performRcpCombine(N, DCI);
}
bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
                                       unsigned MaxDepth) const {
  unsigned Opcode = Op.getOpcode();
  if (Opcode == ISD::FCANONICALIZE)
    return true;

  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    const auto &F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
      return false;
    if (!F.isDenormal())
      return true;

    DenormalMode Mode =
        DAG.getMachineFunction().getDenormalMode(F.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  // If source is a result of another standard FP operation it is already in
  // canonical form.
  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // These will flush denorms if required.
  case ISD::FP_ROUND:
  case ISD::FP_EXTEND:
  case ISD::FP16_TO_FP:
  case ISD::FP_TO_FP16:
  case ISD::BF16_TO_FP:
  case ISD::FP_TO_BF16:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LOG:
  case AMDGPUISD::EXP:
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
  case AMDGPUISD::FP_TO_FP16:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW:
    return true;

  // It can/will be lowered or combined as a bit operation.
  // Need to check their input recursively to handle.
  case ISD::FCOPYSIGN:
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);

  case ISD::AND:
    if (Op.getValueType() == MVT::i32) {
      // Be careful as we only know it is a bitcast floating point type. It
      // could be f32, v2f16, we have no way of knowing. Luckily the constant
      // value that we optimize for, which comes up in fp32 to bf16 conversions,
      // is valid to optimize for all types.
      if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
        if (RHS->getZExtValue() == 0xffff0000) {
          return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
        }
      }
    }
    break;

  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FSINCOS:
    return Op.getValueType().getScalarType() != MVT::f16;

  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMINIMUM3: {
    // FIXME: Shouldn't treat the generic operations differently based on these.
    // However, we aren't really required to flush the result from
    // minnum/maxnum.

    // snans will be quieted, so we only need to worry about denormals.
    if (Subtarget->supportsMinMaxDenormModes() ||
        // FIXME: denormalsEnabledForType is broken for dynamic
        denormalsEnabledForType(DAG, Op.getValueType()))
      return true;

    // Flushing may be required.
    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
    // targets need to check their input recursively.

    // FIXME: Does this apply with clamp? It's implemented with max.
    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::SELECT: {
    return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
  }
  case ISD::BUILD_VECTOR: {
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      SDValue SrcOp = Op.getOperand(i);
      if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::EXTRACT_VECTOR_ELT:
  case ISD::EXTRACT_SUBVECTOR: {
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
  }
  case ISD::INSERT_VECTOR_ELT: {
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
  }
  case ISD::UNDEF:
    // Could be anything.
    return false;

  case ISD::BITCAST:
    // TODO: This is incorrect as it loses track of the operand's type. We may
    // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
    // same bits that are canonicalized in one type need not be in the other.
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
  case ISD::TRUNCATE: {
    // Hack round the mess we make when legalizing extract_vector_elt
    if (Op.getValueType() == MVT::i16) {
      SDValue TruncSrc = Op.getOperand(0);
      if (TruncSrc.getValueType() == MVT::i32 &&
          TruncSrc.getOpcode() == ISD::BITCAST &&
          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
      }
    }
    return false;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
      return true;
    default:
      break;
    }

    break;
  }
  default:
    break;
  }

  // FIXME: denormalsEnabledForType is broken for dynamic
  return denormalsEnabledForType(DAG, Op.getValueType()) &&
         DAG.isKnownNeverSNaN(Op);
}
bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
                                       unsigned MaxDepth) const {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineInstr *MI = MRI.getVRegDef(Reg);
  unsigned Opcode = MI->getOpcode();

  if (Opcode == AMDGPU::G_FCANONICALIZE)
    return true;

  std::optional<FPValueAndVReg> FCR;
  // Constant splat (can be padded with undef) or scalar constant.
  if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
    if (FCR->Value.isSignaling())
      return false;
    if (!FCR->Value.isDenormal())
      return true;

    DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case AMDGPU::G_FPOW:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FLOG:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FLOG10:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
    return true;
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FCOPYSIGN:
    return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM: {
    if (Subtarget->supportsMinMaxDenormModes() ||
        // FIXME: denormalsEnabledForType is broken for dynamic
        denormalsEnabledForType(MRI.getType(Reg), MF))
      return true;

    [[fallthrough]];
  }
  case AMDGPU::G_BUILD_VECTOR:
    for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
      if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
        return false;
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_div_scale:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_trig_preop:
      return true;
    default:
      break;
    }

    [[fallthrough]];
  default:
    // FIXME: denormalsEnabledForType is broken for dynamic
    return denormalsEnabledForType(MRI.getType(Reg), MF) &&
           isKnownNeverSNaN(Reg, MRI);
  }

  llvm_unreachable("invalid operation");
}
// Constant fold canonicalize.
SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
                                                 const SDLoc &SL, EVT VT,
                                                 const APFloat &C) const {
  // Flush denormals to 0 if not enabled.
  if (C.isDenormal()) {
    DenormalMode Mode =
        DAG.getMachineFunction().getDenormalMode(C.getSemantics());
    if (Mode == DenormalMode::getPreserveSign()) {
      return DAG.getConstantFP(
          APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
    }

    if (Mode != DenormalMode::getIEEE())
      return SDValue();
  }

  if (C.isNaN()) {
    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
    if (C.isSignaling()) {
      // Quiet a signaling NaN.
      // FIXME: Is this supposed to preserve payload bits?
      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
    }

    // Make sure it is the canonical NaN bitpattern.
    //
    // TODO: Can we use -1 as the canonical NaN value since it's an inline
    // immediate?
    if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
  }

  // Already canonical.
  return DAG.getConstantFP(C, SL, VT);
}

static bool vectorEltWillFoldAway(SDValue Op) {
  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
}
SDValue
SITargetLowering::performFCanonicalizeCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fcanonicalize undef -> qnan
  if (N0.isUndef()) {
    APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
    return DAG.getConstantFP(QNaN, SDLoc(N), VT);
  }

  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
  }

  // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
  //                                                   (fcanonicalize k)
  //
  // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
  //
  // TODO: This could be better with wider vectors that will be split to v2f16,
  // and to consider uses since there aren't that many packed operations.
  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
      isTypeLegal(MVT::v2f16)) {
    SDLoc SL(N);
    SDValue NewElts[2];
    SDValue Lo = N0.getOperand(0);
    SDValue Hi = N0.getOperand(1);
    EVT EltVT = Lo.getValueType();

    if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
      for (unsigned I = 0; I != 2; ++I) {
        SDValue Op = N0.getOperand(I);
        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
          NewElts[I] =
              getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
        } else if (Op.isUndef()) {
          // Handled below based on what the other operand is.
          NewElts[I] = Op;
        } else {
          NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
        }
      }

      // If one half is undef, and one is constant, prefer a splat vector rather
      // than the normal qNaN. If it's a register, prefer 0.0 since that's
      // cheaper to use and may be free with a packed operation.
      if (NewElts[0].isUndef()) {
        if (isa<ConstantFPSDNode>(NewElts[1]))
          NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
                           ? NewElts[1]
                           : DAG.getConstantFP(0.0f, SL, EltVT);
      }

      if (NewElts[1].isUndef()) {
        NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
                         ? NewElts[0]
                         : DAG.getConstantFP(0.0f, SL, EltVT);
      }

      return DAG.getBuildVector(VT, SL, NewElts);
    }
  }

  return SDValue();
}
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
  switch (Opc) {
  case ISD::FMAXNUM:
  case ISD::FMAXNUM_IEEE:
    return AMDGPUISD::FMAX3;
  case ISD::FMAXIMUM:
    return AMDGPUISD::FMAXIMUM3;
  case ISD::SMAX:
    return AMDGPUISD::SMAX3;
  case ISD::UMAX:
    return AMDGPUISD::UMAX3;
  case ISD::FMINNUM:
  case ISD::FMINNUM_IEEE:
    return AMDGPUISD::FMIN3;
  case ISD::FMINIMUM:
    return AMDGPUISD::FMINIMUM3;
  case ISD::SMIN:
    return AMDGPUISD::SMIN3;
  case ISD::UMIN:
    return AMDGPUISD::UMIN3;
  default:
    llvm_unreachable("Not a min/max opcode");
  }
}
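// Explanatory example for the combine below: performIntMed3ImmCombine folds a
// clamp-style min/max pair with constant bounds into a single med3, e.g. for
// the signed case
//   smin(smax(x, -3), 7)  -->  v_med3_i32 x, -3, 7
// which is only valid when the inner bound is strictly less than the outer
// one (-3 < 7 here); the ordering checks below reject the degenerate cases.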
SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
                                                   const SDLoc &SL, SDValue Src,
                                                   SDValue MinVal,
                                                   SDValue MaxVal,
                                                   bool Signed) const {
  // med3 is for the following patterns:
  //   min(max(x, K0), K1), K0 < K1
  //   max(min(x, K0), K1), K1 < K0
  //
  // "MinVal" and "MaxVal" respectively refer to the rhs of the
  // min/max op.
  ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
  ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);

  if (!MinK || !MaxK)
    return SDValue();

  if (Signed) {
    if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
      return SDValue();
  } else {
    if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
      return SDValue();
  }

  EVT VT = MinK->getValueType(0);
  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
    return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);

  // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
  // not available, but this is unlikely to be profitable as constants
  // will often need to be materialized & extended, especially on
  // pre-GFX10 where VOP3 instructions couldn't take literal operands.
  return SDValue();
}
static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
    return C;

  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
    if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
      return C;
  }

  return nullptr;
}
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
                                                  const SDLoc &SL, SDValue Op0,
                                                  SDValue Op1) const {
  ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
  if (!K1)
    return SDValue();

  ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  // Ordered >= (although NaN inputs should have folded away by now).
  if (K0->getValueAPF() > K1->getValueAPF())
    return SDValue();

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Check IEEE bit enabled?
  EVT VT = Op0.getValueType();
  if (Info->getMode().DX10Clamp) {
    // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
    // hardware fmed3 behavior converting to a min.
    // FIXME: Should this be allowing -0.0?
    if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
  }

  // med3 for f16 is only available on gfx9+, and not available for v2f16.
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
    // then give the other result, which is different from med3 with a NaN
    // input.
    SDValue Var = Op0.getOperand(0);
    if (!DAG.isKnownNeverSNaN(Var))
      return SDValue();

    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
        (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
      return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
                         SDValue(K0, 0), SDValue(K1, 0));
    }
  }

  return SDValue();
}
/// \return true if the subtarget supports minimum3 and maximum3 with the given
/// base min/max opcode \p Opc for type \p VT.
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
                             EVT VT) {
  switch (Opc) {
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
    return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
    return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
    return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
  default:
    break;
  }

  llvm_unreachable("not a min/max opcode");
}
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Only do this if the inner op has one use since this will just increase
  // register pressure for no benefit.
  if (supportsMin3Max3(*Subtarget, Opc, VT)) {
    // max(max(a, b), c) -> max3(a, b, c)
    // min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
                         Op0.getOperand(0), Op0.getOperand(1), Op1);
    }

    // max(a, max(b, c)) -> max3(a, b, c)
    // min(a, min(b, c)) -> min3(a, b, c)
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
                         Op0, Op1.getOperand(0), Op1.getOperand(1));
    }
  }

  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
  // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
      return Med3;
  }
  if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
      return Med3;
  }

  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
      return Med3;
  }
  if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
      return Med3;
  }

  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
       (Opc == AMDGPUISD::FMIN_LEGACY &&
        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
      (VT == MVT::f32 || VT == MVT::f64 ||
       (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
       (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
      Op0.hasOneUse()) {
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
      return Res;
  }

  return SDValue();
}
static bool isClampZeroToOne(SDValue A, SDValue B) {
  if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
    if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
      // FIXME: Should this be allowing -0.0?
      return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
             (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
    }
  }

  return false;
}

// FIXME: Should only worry about snans for version with chain.
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
  // NaNs. With a NaN input, the order of the operands may change the result.

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  SDValue Src2 = N->getOperand(2);

  if (isClampZeroToOne(Src0, Src1)) {
    // const_a, const_b, x -> clamp is safe in all cases including signaling
    // NaNs.
    // FIXME: Should this be allowing -0.0?
    return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
  }

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
  // handling no dx10-clamp?
  if (Info->getMode().DX10Clamp) {
    // If NaNs are clamped to 0, we are free to reorder the inputs.

    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);

    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
      std::swap(Src1, Src2);

    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);

    if (isClampZeroToOne(Src1, Src2))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
  }

  return SDValue();
}
SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  if (Src0.isUndef() && Src1.isUndef())
    return DCI.DAG.getUNDEF(N->getValueType(0));

  return SDValue();
}

// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
// expanded into a set of cmp/select instructions.
bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
                                                unsigned NumElem,
                                                bool IsDivergentIdx,
                                                const GCNSubtarget *Subtarget) {
  if (UseDivergentRegisterIndexing)
    return false;

  unsigned VecSize = EltSize * NumElem;

  // Sub-dword vectors of size 2 dword or less have better implementation.
  if (VecSize <= 64 && EltSize < 32)
    return false;

  // Always expand the rest of sub-dword instructions, otherwise it will be
  // lowered via memory.
  if (EltSize < 32)
    return true;

  // Always do this if var-idx is divergent, otherwise it will become a loop.
  if (IsDivergentIdx)
    return true;

  // Large vectors would yield too many compares and v_cndmask_b32 instructions.
  unsigned NumInsts = NumElem /* Number of compares */ +
                      ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;

  // On some architectures (GFX9) movrel is not available and it's better
  // to expand.
  if (Subtarget->useVGPRIndexMode())
    return NumInsts <= 16;

  // If movrel is available, use it instead of expanding for vector of 8
  // elements.
  if (Subtarget->hasMovrel())
    return NumInsts <= 15;

  return true;
}
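// Worked example for the cost check above: extracting from a v8i32 with a
// variable but uniform index gives NumInsts = 8 compares + 8 cndmasks = 16,
// so the expansion is taken when VGPR indexing would otherwise be needed
// (16 <= 16) but not when movrel is available (16 > 15).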
bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
  if (isa<ConstantSDNode>(Idx))
    return false;

  SDValue Vec = N->getOperand(0);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned EltSize = EltVT.getSizeInBits();
  unsigned NumElem = VecVT.getVectorNumElements();

  return SITargetLowering::shouldExpandVectorDynExt(
      EltSize, NumElem, Idx->isDivergent(), getSubtarget());
}
SDValue
SITargetLowering::performExtractVectorEltCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;

  EVT VecVT = Vec.getValueType();
  EVT VecEltVT = VecVT.getVectorElementType();
  EVT ResVT = N->getValueType(0);

  unsigned VecSize = VecVT.getSizeInBits();
  unsigned VecEltSize = VecEltVT.getSizeInBits();

  if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
      allUsesHaveSourceMods(N)) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    SDValue Elt =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
    return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
  }

  // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
  //    =>
  // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
  // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
  // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
  if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    unsigned Opc = Vec.getOpcode();

    switch (Opc) {
    default:
      break;
    // TODO: Support other binary operations.
    case ISD::FMAXNUM_IEEE:
    case ISD::FMINNUM_IEEE:
    case ISD::FMAXIMUM:
    case ISD::FMINIMUM: {
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
                                 Vec.getOperand(0), Idx);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
                                 Vec.getOperand(1), Idx);

      DCI.AddToWorklist(Elt0.getNode());
      DCI.AddToWorklist(Elt1.getNode());
      return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
    }
    }
  }

  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
  if (shouldExpandVectorDynExt(N)) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    SDValue V;
    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
      SDValue IC = DAG.getVectorIdxConstant(I, SL);
      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
      if (I == 0)
        V = Elt;
      else
        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
    }
    return V;
  }

  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
  // elements. This exposes more load reduction opportunities by replacing
  // multiple small extract_vector_elements with a single 32-bit extract.
  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
      VecSize > 32 && VecSize % 32 == 0 && Idx) {
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);

    unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    SDLoc SL(N);

    SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
    DCI.AddToWorklist(Cast.getNode());

    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
                              DAG.getConstant(EltIdx, SL, MVT::i32));
    DCI.AddToWorklist(Elt.getNode());
    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
    DCI.AddToWorklist(Srl.getNode());

    EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
    DCI.AddToWorklist(Trunc.getNode());

    if (VecEltVT == ResVT) {
      return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
    }

    assert(ResVT.isScalarInteger());
    return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
  }

  return SDValue();
}
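// The dynamic-index INSERT_VECTOR_ELT expansion below mirrors the extract
// case handled above: lane I of the resulting BUILD_VECTOR becomes
// select(Idx == I, Ins, Vec[I]), so only the selected lane changes.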
SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SDValue Idx = N->getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();

  // INSERT_VECTOR_ELT (<n x e>, var-idx)
  // => BUILD_VECTOR n x select (e, const-idx)
  if (!shouldExpandVectorDynExt(N))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  SDValue Ins = N->getOperand(1);
  EVT IdxVT = Idx.getValueType();

  SmallVector<SDValue, 16> Ops;
  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
    SDValue IC = DAG.getConstant(I, SL, IdxVT);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
    SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
    Ops.push_back(V);
  }

  return DAG.getBuildVector(VecVT, SL, Ops);
}

/// Return the source of an fp_extend from f16 to f32, or a converted FP
/// constant.
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
  if (Src.getOpcode() == ISD::FP_EXTEND &&
      Src.getOperand(0).getValueType() == MVT::f16) {
    return Src.getOperand(0);
  }

  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
    APFloat Val = CFP->getValueAPF();
    bool LosesInfo = true;
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    if (!LosesInfo)
      return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
  }

  return SDValue();
}
SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
         "combine only useful on gfx8");

  SDValue TruncSrc = N->getOperand(0);
  EVT VT = N->getValueType(0);
  if (VT != MVT::f16)
    return SDValue();

  if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
      TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
  // and expanding it with min/max saves 1 instruction vs. casting to f32 and
  // back.
  //
  // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
  // fmin(fmax(a, b), fmax(fmin(a, b), c))
  SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
  if (!A)
    return SDValue();

  SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
  if (!B)
    return SDValue();

  SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
  if (!C)
    return SDValue();

  // This changes signaling nan behavior. If an input is a signaling nan, it
  // would have been quieted by the fpext originally. We don't care because
  // these are unconstrained ops. If we needed to insert quieting canonicalizes
  // we would be worse off than just doing the promotion.
  SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
  SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
  SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
  return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
}
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N0,
                                          const SDNode *N1) const {
  EVT VT = N0->getValueType(0);

  // Only do this if we are not trying to support denormals. v_mad_f32 does not
  // support denormals ever.
  if (((VT == MVT::f32 &&
        denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
       (VT == MVT::f16 && Subtarget->hasMadF16() &&
        denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
      isOperationLegal(ISD::FMAD, VT))
    return ISD::FMAD;

  const TargetOptions &Options = DAG.getTarget().Options;
  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
       (N0->getFlags().hasAllowContract() &&
        N1->getFlags().hasAllowContract())) &&
      isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
    return ISD::FMA;
  }

  return 0;
}

// For a reassociatable opcode perform:
// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
                                               SelectionDAG &DAG) const {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
    return SDValue();

  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  if (!(Op0->isDivergent() ^ Op1->isDivergent()))
    return SDValue();

  if (Op0->isDivergent())
    std::swap(Op0, Op1);

  if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
    return SDValue();

  SDValue Op2 = Op1.getOperand(1);
  Op1 = Op1.getOperand(0);
  if (!(Op1->isDivergent() ^ Op2->isDivergent()))
    return SDValue();

  if (Op1->isDivergent())
    std::swap(Op1, Op2);

  SDLoc SL(N);
  SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
  return DAG.getNode(Opc, SL, VT, Add1, Op2);
}
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
                           SDValue N0, SDValue N1, SDValue N2, bool Signed) {
  unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
  SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}
// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
// multiplies, if any.
//
// Full 64-bit multiplies that feed into an addition are lowered here instead
// of using the generic expansion. The generic expansion ends up with
// a tree of ADD nodes that prevents us from using the "add" part of the
// MAD instruction. The expansion produced here results in a chain of ADDs
// instead of a tree.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::ADD);

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (VT.isVector())
    return SDValue();

  // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
  // result in scalar registers for uniform values.
  if (!N->isDivergent() && Subtarget->hasSMulHi())
    return SDValue();

  unsigned NumBits = VT.getScalarSizeInBits();
  if (NumBits <= 32 || NumBits > 64)
    return SDValue();

  if (LHS.getOpcode() != ISD::MUL) {
    assert(RHS.getOpcode() == ISD::MUL);
    std::swap(LHS, RHS);
  }

  // Avoid the fold if it would unduly increase the number of multiplies due to
  // multiple uses, except on hardware with full-rate multiply-add (which is
  // part of full-rate 64-bit ops).
  if (!Subtarget->hasFullRate64Ops()) {
    unsigned NumUsers = 0;
    for (SDNode *Use : LHS->uses()) {
      // There is a use that does not feed into addition, so the multiply can't
      // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
      if (Use->getOpcode() != ISD::ADD)
        return SDValue();

      // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
      // MUL + 3xADD + 3xADDC over 3xMAD.
      ++NumUsers;
      if (NumUsers >= 3)
        return SDValue();
    }
  }

  SDValue MulLHS = LHS.getOperand(0);
  SDValue MulRHS = LHS.getOperand(1);
  SDValue AddRHS = RHS;

  // Always check whether operands are small unsigned values, since that
  // knowledge is useful in more cases. Check for small signed values only if
  // doing so can unlock a shorter code sequence.
  bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
  bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;

  bool MulSignedLo = false;
  if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
    MulSignedLo =
        numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
  }

  // The operands and final result all have the same number of bits. If
  // operands need to be extended, they can be extended with garbage. The
  // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
  // truncated away in the end.
  if (VT != MVT::i64) {
    MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
    MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
    AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
  }

  // The basic code generated is conceptually straightforward. Pseudo code:
  //
  //   accum = mad_64_32 lhs.lo, rhs.lo, accum
  //   accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
  //   accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
  //
  // The second and third lines are optional, depending on whether the factors
  // are {sign,zero}-extended or not.
  //
  // The actual DAG is noisier than the pseudo code, but only due to
  // instructions that disassemble values into low and high parts, and
  // assemble the final result.
  SDValue One = DAG.getConstant(1, SL, MVT::i32);

  auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
  auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
  SDValue Accum =
      getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);

  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
    auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);

    if (!MulLHSUnsigned32) {
      auto MulLHSHi =
          DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
      SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
      AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
    }

    if (!MulRHSUnsigned32) {
      auto MulRHSHi =
          DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
      SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
      AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
    }

    Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
    Accum = DAG.getBitcast(MVT::i64, Accum);
  }

  if (VT != MVT::i64)
    Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
  return Accum;
}
// Collect the ultimate src of each of the mul node's operands, and confirm
// each operand is 8 bytes.
static std::optional<ByteProvider<SDValue>>
handleMulOperand(const SDValue &MulOperand) {
  auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
  if (!Byte0 || Byte0->isConstantZero()) {
    return std::nullopt;
  }
  auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
  if (Byte1 && !Byte1->isConstantZero()) {
    return std::nullopt;
  }

  return Byte0;
}

static unsigned addPermMasks(unsigned First, unsigned Second) {
  unsigned FirstCs = First & 0x0c0c0c0c;
  unsigned SecondCs = Second & 0x0c0c0c0c;
  unsigned FirstNoCs = First & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;

  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));

  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}

struct DotSrc {
  SDValue SrcOp;
  int64_t PermMask;
  int64_t DWordOffset;
};
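// A note on the perm masks built below: each byte of a mask is a V_PERM_B32
// byte select, where values 0-7 pick a byte from one of the two source
// registers and 0x0c produces a constant 0x00 byte, which is why 0x0c0c0c0c
// serves as the "select nothing" / zero mask. addPermMasks merges two masks
// that populate disjoint byte lanes; an illustrative example:
//   addPermMasks(0x0c0c0c02, 0x0c0c010c) == 0x0c0c0102
// i.e. the real selects are OR'd together and 0x0c survives only where both
// inputs had it.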
static void placeSources(ByteProvider<SDValue> &Src0,
                         ByteProvider<SDValue> &Src1,
                         SmallVectorImpl<DotSrc> &Src0s,
                         SmallVectorImpl<DotSrc> &Src1s, int Step) {

  assert(Src0.Src.has_value() && Src1.Src.has_value());
  // Src0s and Src1s are empty, just place arbitrarily.
  if (Step == 0) {
    Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
                     Src0.SrcOffset / 4});
    Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
                     Src1.SrcOffset / 4});
    return;
  }

  for (int BPI = 0; BPI < 2; BPI++) {
    std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
    if (BPI == 1) {
      BPP = {Src1, Src0};
    }
    unsigned ZeroMask = 0x0c0c0c0c;
    unsigned FMask = 0xFF << (8 * (3 - Step));

    unsigned FirstMask =
        (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    unsigned SecondMask =
        (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    // Attempt to find Src vector which contains our SDValue, if so, add our
    // perm mask to the existing one. If we are unable to find a match for the
    // first SDValue, attempt to find match for the second.
    int FirstGroup = -1;
    for (int I = 0; I < 2; I++) {
      SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
      auto MatchesFirst = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.first.Src &&
               (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
      };

      auto *Match = llvm::find_if(Srcs, MatchesFirst);
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
        FirstGroup = I;
        break;
      }
    }
    if (FirstGroup != -1) {
      SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
      auto MatchesSecond = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.second.Src &&
               (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
      };
      auto *Match = llvm::find_if(Srcs, MatchesSecond);
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
      } else
        Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
      return;
    }
  }

  // If we have made it here, then we could not find a match in Src0s or Src1s
  // for either Src0 or Src1, so just place them arbitrarily.

  unsigned ZeroMask = 0x0c0c0c0c;
  unsigned FMask = 0xFF << (8 * (3 - Step));

  Src0s.push_back(
      {*Src0.Src,
       ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       Src0.SrcOffset / 4});
  Src1s.push_back(
      {*Src1.Src,
       ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       Src1.SrcOffset / 4});
}
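// Mask construction example for the code above: at Step == 1 the byte being
// placed is byte 2 of the eventual dword (FMask == 0x00FF0000), so a source
// byte offset of 2 within its dword yields
//   (2 << 16) | (0x0c0c0c0c & ~0x00FF0000) == 0x0c020c0c
// i.e. lane 2 selects source byte 2 and every other lane stays zeroed.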
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
                              SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
                              bool IsAny) {

  // If we just have one source, just permute it accordingly.
  if (Srcs.size() == 1) {
    auto *Elt = Srcs.begin();
    auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);

    // v_perm will produce the original value
    if (Elt->PermMask == 0x3020100)
      return EltOp;

    return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
                       DAG.getConstant(Elt->PermMask, SL, MVT::i32));
  }

  auto *FirstElt = Srcs.begin();
  auto *SecondElt = std::next(FirstElt);

  SmallVector<SDValue, 2> Perms;

  // If we have multiple sources in the chain, combine them via perms (using
  // calculated perm mask) and Ors.
  while (true) {
    auto FirstMask = FirstElt->PermMask;
    auto SecondMask = SecondElt->PermMask;

    unsigned FirstCs = FirstMask & 0x0c0c0c0c;
    unsigned FirstPlusFour = FirstMask | 0x04040404;
    // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
    // original 0x0c.
    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;

    auto PermMask = addPermMasks(FirstMask, SecondMask);
    auto FirstVal =
        getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
    auto SecondVal =
        getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);

    Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
                                SecondVal,
                                DAG.getConstant(PermMask, SL, MVT::i32)));

    FirstElt = std::next(SecondElt);
    if (FirstElt == Srcs.end())
      break;

    SecondElt = std::next(FirstElt);
    // If we only have a FirstElt, then just combine that into the cumulative
    // source node.
    if (SecondElt == Srcs.end()) {
      auto EltOp =
          getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);

      Perms.push_back(
          DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
                      DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
      break;
    }
  }

  assert(Perms.size() == 1 || Perms.size() == 2);
  return Perms.size() == 2
             ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
             : Perms[0];
}
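// Note on the FirstPlusFour adjustment above: byte selects 0-3 and 4-7 of
// V_PERM_B32 address the two different register operands, so OR-ing 0x04
// into FirstElt's real selects re-points them at the other operand slot of
// the combined perm, while the constant-zero selects (0x0c) are preserved
// via FirstCs.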
static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
  for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
    EntryMask = EntryMask >> ((4 - ChainLength) * 8);
    auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
    EntryMask += ZeroMask;
  }
}

static bool isMul(const SDValue Op) {
  auto Opcode = Op.getOpcode();

  return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
          Opcode == AMDGPUISD::MUL_I24);
}
static std::optional<bool>
checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
                       ByteProvider<SDValue> &Src1, const SDValue &S0Op,
                       const SDValue &S1Op, const SelectionDAG &DAG) {
  // If both ops are i8s (pre legalize-dag), then the signedness semantics
  // of the dot4 is irrelevant.
  if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
    return false;

  auto Known0 = DAG.computeKnownBits(S0Op, 0);
  bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
  auto Known1 = DAG.computeKnownBits(S1Op, 0);
  bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;

  assert(!(S0IsUnsigned && S0IsSigned));
  assert(!(S1IsUnsigned && S1IsSigned));

  // There are 9 possible permutations of
  // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}

  // In two permutations, the sign bits are known to be the same for both Ops,
  // so simply return Signed / Unsigned corresponding to the MSB

  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
    return S0IsSigned;

  // In another two permutations, the sign bits are known to be opposite. In
  // this case return std::nullopt to indicate a bad match.

  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
    return std::nullopt;

  // In the remaining five permutations, we don't know the value of the sign
  // bit for at least one Op. Since we have a valid ByteProvider, we know that
  // the upper bits must be extension bits. Thus, the only ways for the sign
  // bit to be unknown is if it was sign extended from unknown value, or if it
  // was any extended. In either case, it is correct to use the signed
  // version of the signedness semantics of dot4.

  // In two of these permutations, we know the sign bit is set for
  // one op, and the other is unknown. It is okay to use the signed version of
  // dot4.
  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
    return true;

  // In one such permutation, we don't know either of the sign bits. It is okay
  // to use the signed version of dot4.
  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
    return true;

  // In two of these permutations, we know the sign bit is unset for
  // one op, and the other is unknown. Return std::nullopt to indicate a
  // bad match.
  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
    return std::nullopt;

  llvm_unreachable("Fully covered condition");
}
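// In summary (where "unknown" means neither sign nor zero bits were proven):
//   both known unsigned                    -> false  (use udot4)
//   both known signed                      -> true   (use sdot4)
//   one known signed, other unknown        -> true
//   both unknown                           -> true
//   one known unsigned, other signed/unknown -> std::nullopt (no match)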
SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
    if (Subtarget->hasMad64_32()) {
      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
        return Folded;
    }
  }

  if (SDValue V = reassociateScalarOps(N, DAG)) {
    return V;
  }

  if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
    SDValue TempNode(N, 0);
    std::optional<bool> IsSigned;
    SmallVector<DotSrc, 4> Src0s;
    SmallVector<DotSrc, 4> Src1s;
    SmallVector<SDValue, 4> Src2s;

    // Match the v_dot4 tree, while collecting src nodes.
    int ChainLength = 0;
    for (int I = 0; I < 4; I++) {
      auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
      if (MulIdx == -1)
        break;
      auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
      if (!Src0)
        break;
      auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
      if (!Src1)
        break;

      auto IterIsSigned = checkDot4MulSignedness(
          TempNode->getOperand(MulIdx), *Src0, *Src1,
          TempNode->getOperand(MulIdx)->getOperand(0),
          TempNode->getOperand(MulIdx)->getOperand(1), DAG);
      if (!IterIsSigned)
        break;
      if (!IsSigned)
        IsSigned = *IterIsSigned;
      if (*IterIsSigned != *IsSigned)
        break;
      placeSources(*Src0, *Src1, Src0s, Src1s, I);
      auto AddIdx = 1 - MulIdx;
      // Allow the special case where add (add (mul24, 0), mul24) became ->
      // add (mul24, mul24).
      if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
        Src2s.push_back(TempNode->getOperand(AddIdx));
        auto Src0 =
            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
        if (!Src0)
          break;
        auto Src1 =
            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
        if (!Src1)
          break;
        auto IterIsSigned = checkDot4MulSignedness(
            TempNode->getOperand(AddIdx), *Src0, *Src1,
            TempNode->getOperand(AddIdx)->getOperand(0),
            TempNode->getOperand(AddIdx)->getOperand(1), DAG);
        if (!IterIsSigned)
          break;
        assert(IsSigned);
        if (*IterIsSigned != *IsSigned)
          break;
        placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
        Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
        ChainLength = I + 2;
        break;
      }

      TempNode = TempNode->getOperand(AddIdx);
      Src2s.push_back(TempNode);
      ChainLength = I + 1;
      if (TempNode->getNumOperands() < 2)
        break;
      LHS = TempNode->getOperand(0);
      RHS = TempNode->getOperand(1);
    }

    if (ChainLength < 2)
      return SDValue();

    // Masks were constructed with assumption that we would find a chain of
    // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
    // 0x0c) so they do not affect dot calculation.
    if (ChainLength < 4) {
      fixMasks(Src0s, ChainLength);
      fixMasks(Src1s, ChainLength);
    }

    SDValue Src0, Src1;

    // If we are just using a single source for both, and have permuted the
    // bytes consistently, we can just use the sources without permuting
    // (commutativity).
    bool UseOriginalSrc = false;
    if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
        Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
        Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
        Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
      SmallVector<unsigned, 4> SrcBytes;
      auto Src0Mask = Src0s.begin()->PermMask;
      SrcBytes.push_back(Src0Mask & 0xFF000000);
      bool UniqueEntries = true;
      for (auto I = 1; I < 4; I++) {
        auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));

        if (is_contained(SrcBytes, NextByte)) {
          UniqueEntries = false;
          break;
        }
        SrcBytes.push_back(NextByte);
      }

      if (UniqueEntries) {
        UseOriginalSrc = true;

        auto *FirstElt = Src0s.begin();
        auto FirstEltOp =
            getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);

        auto *SecondElt = Src1s.begin();
        auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
                                              SecondElt->DWordOffset);

        Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
                                             MVT::getIntegerVT(32));
        Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
                                             MVT::getIntegerVT(32));
      }
    }

    if (!UseOriginalSrc) {
      Src0 = resolveSources(DAG, SL, Src0s, false, true);
      Src1 = resolveSources(DAG, SL, Src1s, false, true);
    }

    SDValue Src2 =
        DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);

    SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
                                                  : Intrinsic::amdgcn_udot4,
                                        SL, MVT::i64);

    assert(!VT.isVector());
    auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
                           Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));

    return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
  }

  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
    return SDValue();

  // add x, zext (setcc) => uaddo_carry x, 0, setcc
  // add x, sext (setcc) => usubo_carry x, 0, setcc
  unsigned Opc = LHS.getOpcode();
  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
      Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
    std::swap(RHS, LHS);

  Opc = RHS.getOpcode();
  switch (Opc) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(0);
    // If this won't be a real VOPC output, we would still need to insert an
    // extra instruction anyway.
    if (!isBoolSGPR(Cond))
      break;
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
    return DAG.getNode(Opc, SL, VTList, Args);
  }
  case ISD::UADDO_CARRY: {
    // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
    if (!isNullConstant(RHS.getOperand(1)))
      break;
    SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
    return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
  }
  }

  return SDValue();
}
SDValue SITargetLowering::performSubCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (VT != MVT::i32)
    return SDValue();

  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // sub x, zext (setcc) => usubo_carry x, 0, setcc
  // sub x, sext (setcc) => uaddo_carry x, 0, setcc
  unsigned Opc = RHS.getOpcode();
  switch (Opc) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(0);
    // If this won't be a real VOPC output, we would still need to insert an
    // extra instruction anyway.
    if (!isBoolSGPR(Cond))
      break;
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
    return DAG.getNode(Opc, SL, VTList, Args);
  }
  }

  if (LHS.getOpcode() == ISD::USUBO_CARRY) {
    // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
    if (!isNullConstant(LHS.getOperand(1)))
      return SDValue();
    SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
    return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
  }

  return SDValue();
}

SDValue
SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  if (!isNullConstant(N->getOperand(1)))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);

  // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
  // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
  unsigned LHSOpc = LHS.getOpcode();
  unsigned Opc = N->getOpcode();
  if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
      (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
    SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
  }

  return SDValue();
}
SDValue SITargetLowering::performFAddCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // These should really be instruction patterns, but writing patterns with
  // source modifiers is a pain.

  // fadd (fadd (a, a), b) -> mad 2.0, a, b
  if (LHS.getOpcode() == ISD::FADD) {
    SDValue A = LHS.getOperand(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
      }
    }
  }

  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
  if (RHS.getOpcode() == ISD::FADD) {
    SDValue A = RHS.getOperand(0);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
      }
    }
  }

  return SDValue();
}

SDValue SITargetLowering::performFSubCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);
  assert(!VT.isVector());

  // Try to get the fneg to fold into the source modifier. This undoes generic
  // DAG combines and folds them into the mad.
  //
  // Only do this if we are not trying to support denormals. v_mad_f32 does
  // not support denormals ever.
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  if (LHS.getOpcode() == ISD::FADD) {
    // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
    SDValue A = LHS.getOperand(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

        return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
      }
    }
  }

  if (RHS.getOpcode() == ISD::FADD) {
    // (fsub c, (fadd a, a)) -> mad -2.0, a, c

    SDValue A = RHS.getOperand(0);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
      }
    }
  }

  return SDValue();
}
SDValue SITargetLowering::performFDivCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);
  if (VT != MVT::f16 || !Subtarget->has16BitInsts())
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  SDNodeFlags Flags = N->getFlags();
  SDNodeFlags RHSFlags = RHS->getFlags();
  if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
      !RHS->hasOneUse())
    return SDValue();

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    bool IsNegative = false;
    if (CLHS->isExactlyValue(1.0) ||
        (IsNegative = CLHS->isExactlyValue(-1.0))) {
      // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
      // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
      if (RHS.getOpcode() == ISD::FSQRT) {
        // TODO: Or in RHS flags, somehow missing from SDNodeFlags
        SDValue Rsq =
            DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
        return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
      }
    }
  }

  return SDValue();
}

SDValue SITargetLowering::performFMACombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);

  if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
    return SDValue();

  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
  // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
  SDValue Op1 = N->getOperand(0);
  SDValue Op2 = N->getOperand(1);
  SDValue FMA = N->getOperand(2);

  if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
      Op2.getOpcode() != ISD::FP_EXTEND)
    return SDValue();

  // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
  // regardless of the denorm mode setting. Therefore,
  // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
  const TargetOptions &Options = DAG.getTarget().Options;
  if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
      (N->getFlags().hasAllowContract() &&
       FMA->getFlags().hasAllowContract())) {
    Op1 = Op1.getOperand(0);
    Op2 = Op2.getOperand(0);
    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec1 = Op1.getOperand(0);
    SDValue Idx1 = Op1.getOperand(1);
    SDValue Vec2 = Op2.getOperand(0);

    SDValue FMAOp1 = FMA.getOperand(0);
    SDValue FMAOp2 = FMA.getOperand(1);
    SDValue FMAAcc = FMA.getOperand(2);

    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
        FMAOp2.getOpcode() != ISD::FP_EXTEND)
      return SDValue();

    FMAOp1 = FMAOp1.getOperand(0);
    FMAOp2 = FMAOp2.getOperand(0);
    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec3 = FMAOp1.getOperand(0);
    SDValue Vec4 = FMAOp2.getOperand(0);
    SDValue Idx2 = FMAOp1.getOperand(1);

    if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
        // Idx1 and Idx2 cannot be the same.
        Idx1 == Idx2)
      return SDValue();

    if (Vec1 == Vec2 || Vec3 == Vec4)
      return SDValue();

    if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
      return SDValue();

    if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
                         DAG.getTargetConstant(0, SL, MVT::i1));
    }
  }

  return SDValue();
}
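
// For illustration (assuming S0 and S1 are v2f16 and Acc is f32), the DAG
// shape matched above is roughly:
//   fma (fpext (extractelt S0, 0)), (fpext (extractelt S1, 0)),
//       (fma (fpext (extractelt S0, 1)), (fpext (extractelt S1, 1)), Acc)
// and it is rewritten into a single fdot2 S0, S1, Acc node.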

SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = LHS.getValueType();
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();

  auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (!CRHS) {
    CRHS = dyn_cast<ConstantSDNode>(LHS);
    if (CRHS) {
      std::swap(LHS, RHS);
      CC = getSetCCSwappedOperands(CC);
    }
  }

  if (CRHS) {
    if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
        isBoolSGPR(LHS.getOperand(0))) {
      // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
      // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
      // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
      // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
      if ((CRHS->isAllOnes() &&
           (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
          (CRHS->isZero() &&
           (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                           DAG.getConstant(-1, SL, MVT::i1));
      if ((CRHS->isAllOnes() &&
           (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
          (CRHS->isZero() &&
           (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
        return LHS.getOperand(0);
    }

    const APInt &CRHSVal = CRHS->getAPIntValue();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        LHS.getOpcode() == ISD::SELECT &&
        isa<ConstantSDNode>(LHS.getOperand(1)) &&
        isa<ConstantSDNode>(LHS.getOperand(2)) &&
        LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
        isBoolSGPR(LHS.getOperand(0))) {
      // setcc (select cc, CT, CF), CF, eq => xor cc, -1
      // setcc (select cc, CT, CF), CF, ne => cc
      // setcc (select cc, CT, CF), CT, ne => xor cc, -1
      // setcc (select cc, CT, CF), CT, eq => cc
      const APInt &CT = LHS.getConstantOperandAPInt(1);
      const APInt &CF = LHS.getConstantOperandAPInt(2);

      if ((CF == CRHSVal && CC == ISD::SETEQ) ||
          (CT == CRHSVal && CC == ISD::SETNE))
        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                           DAG.getConstant(-1, SL, MVT::i1));
      if ((CF == CRHSVal && CC == ISD::SETNE) ||
          (CT == CRHSVal && CC == ISD::SETEQ))
        return LHS.getOperand(0);
    }
  }

  if (VT != MVT::f32 && VT != MVT::f64 &&
      (!Subtarget->has16BitInsts() || VT != MVT::f16))
    return SDValue();

  // Match isinf/isfinite pattern
  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
  // (fcmp one (fabs x), inf) -> (fp_class x,
  // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
  if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
      LHS.getOpcode() == ISD::FABS) {
    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    if (!CRHS)
      return SDValue();

    const APFloat &APF = CRHS->getValueAPF();
    if (APF.isInfinity() && !APF.isNegative()) {
      const unsigned IsInfMask =
          SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
      const unsigned IsFiniteMask =
          SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
          SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
          SIInstrFlags::P_SUBNORMAL;
      unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
                         DAG.getConstant(Mask, SL, MVT::i32));
    }
  }

  return SDValue();
}
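
// For example, with the fabs/infinity pattern above, 'fcmp oeq (fabs x), +inf'
// becomes an AMDGPUISD::FP_CLASS node with the P_INFINITY | N_INFINITY mask,
// i.e. an isinf(x) test that typically selects to a single v_cmp_class
// instruction.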

SDValue
SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

  SDValue Src = N->getOperand(0);
  SDValue Shift = N->getOperand(0);

  // TODO: Extend type shouldn't matter (assuming legal types).
  if (Shift.getOpcode() == ISD::ZERO_EXTEND)
    Shift = Shift.getOperand(0);

  if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
    // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
    // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
    // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
    if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
      SDValue Shifted = DAG.getZExtOrTrunc(
          Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);

      unsigned ShiftOffset = 8 * Offset;
      if (Shift.getOpcode() == ISD::SHL)
        ShiftOffset -= C->getZExtValue();
      else
        ShiftOffset += C->getZExtValue();

      if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
                           MVT::f32, Shifted);
      }
    }
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
  if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
    // We simplified Src. If this node is not dead, visit it again so it is
    // folded properly.
    if (N->getOpcode() != ISD::DELETED_NODE)
      DCI.AddToWorklist(N);
    return SDValue(N, 0);
  }

  // Handle (or x, (srl y, 8)) pattern when known bits are zero.
  if (SDValue DemandedSrc =
          TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
    return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);

  return SDValue();
}
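
// Worked example for the shift folding above: for N = cvt_f32_ubyte0 and
// Shift = (srl x, 8), Offset is 0 and ShiftOffset becomes 0 + 8 = 8, so the
// node is rewritten to cvt_f32_ubyte1 x, i.e. it converts byte 1 of the
// unshifted value.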

SDValue SITargetLowering::performClampCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
  if (!CSrc)
    return SDValue();

  const MachineFunction &MF = DCI.DAG.getMachineFunction();
  const APFloat &F = CSrc->getValueAPF();
  APFloat Zero = APFloat::getZero(F.getSemantics());
  if (F < Zero ||
      (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
  }

  APFloat One(F.getSemantics(), "1.0");
  if (F > One)
    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));

  return SDValue(CSrc, 0);
}
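
// Examples of the constant folding above: clamp(2.5) folds to 1.0,
// clamp(-0.5) folds to 0.0, clamp(NaN) folds to 0.0 only when DX10Clamp is
// enabled, and any constant already in [0.0, 1.0] folds to its source value.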

SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::MUL:
  case ISD::SETCC:
  case ISD::SELECT:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
    if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
      return Res;
    break;
  default:
    break;
  }

  if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
    return SDValue();

  switch (N->getOpcode()) {
  case ISD::ADD:
    return performAddCombine(N, DCI);
  case ISD::SUB:
    return performSubCombine(N, DCI);
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    return performAddCarrySubCarryCombine(N, DCI);
  case ISD::FADD:
    return performFAddCombine(N, DCI);
  case ISD::FSUB:
    return performFSubCombine(N, DCI);
  case ISD::FDIV:
    return performFDivCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUM:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
    return performMinMaxCombine(N, DCI);
  case ISD::FMA:
    return performFMACombine(N, DCI);
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case ISD::FSHR: {
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
        TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
      return matchPERM(N, DCI);
    }
    break;
  }
  case ISD::XOR:
    return performXorCombine(N, DCI);
  case ISD::ZERO_EXTEND:
    return performZeroExtendCombine(N, DCI);
  case ISD::SIGN_EXTEND_INREG:
    return performSignExtendInRegCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  case AMDGPUISD::RCP:
    return performRcpCombine(N, DCI);
  case AMDGPUISD::FRACT:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::RSQ_CLAMP: {
    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
    SDValue Src = N->getOperand(0);
    if (Src.isUndef())
      return Src;
    break;
  }
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performUCharToFloatCombine(N, DCI);
  case ISD::FCOPYSIGN:
    return performFCopySignCombine(N, DCI);
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return performCvtF32UByteNCombine(N, DCI);
  case AMDGPUISD::FMED3:
    return performFMed3Combine(N, DCI);
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
    return performCvtPkRTZCombine(N, DCI);
  case AMDGPUISD::CLAMP:
    return performClampCombine(N, DCI);
  case ISD::SCALAR_TO_VECTOR: {
    SelectionDAG &DAG = DCI.DAG;
    EVT VT = N->getValueType(0);

    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
      SDLoc SL(N);
      SDValue Src = N->getOperand(0);
      EVT EltVT = Src.getValueType();
      if (EltVT != MVT::i16)
        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);

      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
      return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
    }

    break;
  }
  case ISD::EXTRACT_VECTOR_ELT:
    return performExtractVectorEltCombine(N, DCI);
  case ISD::INSERT_VECTOR_ELT:
    return performInsertVectorEltCombine(N, DCI);
  case ISD::FP_ROUND:
    return performFPRoundCombine(N, DCI);
  case ISD::LOAD: {
    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
      return Widened;
    [[fallthrough]];
  }
  default: {
    if (!DCI.isBeforeLegalize()) {
      if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
        return performMemSDNodeCombine(MemNode, DCI);
    }

    break;
  }
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

/// Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default:
    return ~0u;
  case AMDGPU::sub0:
    return 0;
  case AMDGPU::sub1:
    return 1;
  case AMDGPU::sub2:
    return 2;
  case AMDGPU::sub3:
    return 3;
  case AMDGPU::sub4:
    return 4; // Possible with TFE/LWE
  }
}

/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                          SelectionDAG &DAG) const {
  unsigned Opcode = Node->getMachineOpcode();

  // Subtract 1 because the vdata output is not a MachineSDNode operand.
  int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
    return Node; // not implemented for D16

  SDNode *Users[5] = {nullptr};
  unsigned Lane = 0;
  unsigned DmaskIdx =
      AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;
  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
  bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
                  (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)));
  unsigned TFCLane = 0;
  bool HasChain = Node->getNumValues() > 1;

  if (OldDmask == 0) {
    // These are folded out, but on the chance it happens don't assert.
    return Node;
  }

  unsigned OldBitsSet = llvm::popcount(OldDmask);
  // Work out which is the TFE/LWE lane if that is enabled.
  if (UsesTFC)
    TFCLane = OldBitsSet;

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); I != E;
       ++I) {

    // Don't look at users of the chain.
    if (I.getUse().getResNo() != 0)
      continue;

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return Node;

    // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));
    if (Lane == ~0u)
      return Node;

    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
    if (UsesTFC && Lane == TFCLane) {
      Users[Lane] = *I;
    } else {
      // Set which texture component corresponds to the lane.
      unsigned Comp;
      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
        Comp = llvm::countr_zero(Dmask);
        Dmask &= ~(1 << Comp);
      }

      // Abort if we have more than one user per component.
      if (Users[Lane])
        return Node;

      Users[Lane] = *I;
      NewDmask |= 1 << Comp;
    }
  }

  // Don't allow 0 dmask, as hardware assumes one channel enabled.
  bool NoChannels = !NewDmask;
  if (NoChannels) {
    if (!UsesTFC) {
      // No uses of the result and not using TFC. Then do nothing.
      return Node;
    }
    // If the original dmask has one channel - then nothing to do
    if (OldBitsSet == 1)
      return Node;
    // Use an arbitrary dmask - required for the instruction to work
    NewDmask = 1;
  }
  // Abort if there's no change
  if (NewDmask == OldDmask)
    return Node;

  unsigned BitsSet = llvm::popcount(NewDmask);

  // Check for TFE or LWE - increase the number of channels by one to account
  // for the extra return value
  // This will need adjustment for D16 if this is also included in
  // adjustWriteMask (this function) but at present D16 are excluded.
  unsigned NewChannels = BitsSet + UsesTFC;

  int NewOpcode =
      AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
  assert(NewOpcode != -1 &&
         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
         "failed to find equivalent MIMG op");

  // Adjust the writemask in the node
  SmallVector<SDValue, 12> Ops;
  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());

  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();

  MVT ResultVT = NewChannels == 1
                     ? SVT
                     : MVT::getVectorVT(SVT, NewChannels == 3   ? 4
                                             : NewChannels == 5 ? 8
                                                                : NewChannels);
  SDVTList NewVTList =
      HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);

  MachineSDNode *NewNode =
      DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);

  if (HasChain) {
    // Update chain.
    DAG.setNodeMemRefs(NewNode, Node->memoperands());
    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
  }

  if (NewChannels == 1) {
    assert(Node->hasNUsesOfValue(1, 0));
    SDNode *Copy =
        DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
                           Users[Lane]->getValueType(0), SDValue(NewNode, 0));
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return nullptr;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
    SDNode *User = Users[i];
    if (!User) {
      // Handle the special case of NoChannels. We set NewDmask to 1 above, but
      // Users[0] is still nullptr because channel 0 doesn't really have a use.
      if (i || !NoChannels)
        continue;
    } else {
      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
      SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
      if (NewUser != User) {
        DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
        DAG.RemoveDeadNode(User);
      }
    }

    switch (Idx) {
    default:
      break;
    case AMDGPU::sub0:
      Idx = AMDGPU::sub1;
      break;
    case AMDGPU::sub1:
      Idx = AMDGPU::sub2;
      break;
    case AMDGPU::sub2:
      Idx = AMDGPU::sub3;
      break;
    case AMDGPU::sub3:
      Idx = AMDGPU::sub4;
      break;
    }
  }

  DAG.RemoveDeadNode(Node);
  return nullptr;
}
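
// Illustrative case for adjustWritemask: an image sample selected with
// dmask = 0xf produces four channels, but if its only user is an
// EXTRACT_SUBREG of sub0, NewDmask becomes 0x1 and the node is rewritten to
// the single-channel variant of the opcode with a smaller result register.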

static bool isFrameIndexOp(SDValue Op) {
  if (Op.getOpcode() == ISD::AssertZext)
    Op = Op.getOperand(0);

  return isa<FrameIndexSDNode>(Op);
}

/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs are to these instructions are registers.
SDNode *
SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                SelectionDAG &DAG) const {
  if (Node->getOpcode() == ISD::CopyToReg) {
    RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
    SDValue SrcVal = Node->getOperand(2);

    // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    // to try understanding copies to physical registers.
    if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
      SDLoc SL(Node);
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      SDValue VReg = DAG.getRegister(
          MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);

      SDNode *Glued = Node->getGluedNode();
      SDValue ToVReg = DAG.getCopyToReg(
          Node->getOperand(0), SL, VReg, SrcVal,
          SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
      SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
                                             VReg, ToVReg.getValue(1));
      DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
      DAG.RemoveDeadNode(Node);
      return ToResultReg.getNode();
    }
  }

  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                             Node->getOperand(i).getValueType(),
                                             Node->getOperand(i)),
                          0));
  }

  return DAG.UpdateNodeOperands(Node, Ops);
}

/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  unsigned Opcode = Node->getMachineOpcode();

  if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
      !TII->isGather4(Opcode) &&
      AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
    return adjustWritemask(Node, DAG);
  }

  if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
    legalizeTargetIndependentNode(Node, DAG);
    return Node;
  }

  switch (Opcode) {
  case AMDGPU::V_DIV_SCALE_F32_e64:
  case AMDGPU::V_DIV_SCALE_F64_e64: {
    // Satisfy the operand register constraint when one of the inputs is
    // undefined. Ordinarily each undef value will have its own implicit_def of
    // a vreg, so force these to use a single register.
    SDValue Src0 = Node->getOperand(1);
    SDValue Src1 = Node->getOperand(3);
    SDValue Src2 = Node->getOperand(5);

    if ((Src0.isMachineOpcode() &&
         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
        (Src0 == Src1 || Src0 == Src2))
      break;

    MVT VT = Src0.getValueType().getSimpleVT();
    const TargetRegisterClass *RC =
        getRegClassFor(VT, Src0.getNode()->isDivergent());

    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);

    SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
                                      SDValue());

    // src0 must be the same register as src1 or src2, even if the value is
    // undefined, so make sure we don't violate this constraint.
    if (Src0.isMachineOpcode() &&
        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
      if (Src1.isMachineOpcode() &&
          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src1;
      else if (Src2.isMachineOpcode() &&
               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src2;
      else {
        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
        Src0 = UndefReg;
        Src1 = UndefReg;
      }
    } else
      break;

    SmallVector<SDValue, 9> Ops(Node->ops());
    Ops[1] = Src0;
    Ops[3] = Src1;
    Ops[5] = Src2;
    Ops.push_back(ImpDef.getValue(1));
    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
  }
  default:
    break;
  }

  return Node;
}

// Any MIMG instructions that use tfe or lwe require an initialization of the
// result register that will be written in the case of a memory access failure.
// The required code is also added to tie this init code to the result of the
// img instruction.
void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  MachineBasicBlock &MBB = *MI.getParent();

  int DstIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
  unsigned InitIdx = 0;

  if (TII->isImage(MI)) {
    MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
    MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
    MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);

    if (!TFE && !LWE) // intersect_ray
      return;

    unsigned TFEVal = TFE ? TFE->getImm() : 0;
    unsigned LWEVal = LWE ? LWE->getImm() : 0;
    unsigned D16Val = D16 ? D16->getImm() : 0;

    if (!TFEVal && !LWEVal)
      return;

    // At least one of TFE or LWE are non-zero
    // We have to insert a suitable initialization of the result value and
    // tie this to the dest of the image instruction.

    // Calculate which dword we have to initialize to 0.
    MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);

    // check that dmask operand is found.
    assert(MO_Dmask && "Expected dmask operand in instruction");

    unsigned dmask = MO_Dmask->getImm();
    // Determine the number of active lanes taking into account the
    // Gather4 special case
    unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);

    bool Packed = !Subtarget->hasUnpackedD16VMem();

    InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;

    // Abandon attempt if the dst size isn't large enough
    // - this is in fact an error but this is picked up elsewhere and
    // reported correctly.
    uint32_t DstSize =
        TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
    if (DstSize < InitIdx)
      return;
  } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
    InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
  } else {
    return;
  }

  const DebugLoc &DL = MI.getDebugLoc();

  // Create a register for the initialization value.
  Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
  unsigned NewDst = 0; // Final initialized value will be in here

  // If PRTStrictNull feature is enabled (the default) then initialize
  // all the result registers to 0, otherwise just the error indication
  // register (VGPRn+1)
  unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
  unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
  for (; SizeLeft; SizeLeft--, CurrIdx++) {
    NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
    // Initialize dword
    Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    // clang-format off
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
        .addImm(0);
    // clang-format on
    // Insert into the super-reg
    BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
        .addReg(PrevDst)
        .addReg(SubReg)
        .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));

    PrevDst = NewDst;
  }

  // Add as an implicit operand
  MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));

  // Tie the just added implicit operand to the dst
  MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
}
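
// For example, an image load with dmask = 0x3 and tfe = 1 has three result
// dwords (two data channels plus the TFE status dword). With PRTStrictNull
// all three are zero-initialized above; otherwise only the final status dword
// is, and the resulting register is added as a tied implicit operand.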

/// Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineFunction *MF = MI.getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  if (TII->isVOP3(MI.getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);

    // Prefer VGPRs over AGPRs in mAI instructions where possible.
    // This saves a chain-copy of registers and better balance register
    // use between vgpr and agpr as agpr tuples tend to be big.
    if (!MI.getDesc().operands().empty()) {
      unsigned Opc = MI.getOpcode();
      bool HasAGPRs = Info->mayNeedAGPRs();
      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      for (auto I :
           {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
        if (I == -1)
          break;
        if ((I == Src2Idx) && (HasAGPRs))
          break;
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !Op.getReg().isVirtual())
          continue;
        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
        if (!TRI->hasAGPRs(RC))
          continue;
        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
        if (!Src || !Src->isCopy() ||
            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
          continue;
        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
        // All uses of agpr64 and agpr32 can also accept vgpr except for
        // v_accvgpr_read, but we do not produce agpr reads during selection,
        // so no use checks are needed.
        MRI.setRegClass(Op.getReg(), NewRC);
      }

      if (TII->isMAI(MI)) {
        // The ordinary src0, src1, src2 were legalized above.
        //
        // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
        // as a separate instruction.
        int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                 AMDGPU::OpName::scale_src0);
        if (Src0Idx != -1) {
          int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::scale_src1);
          if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
              TII->usesConstantBus(MRI, MI, Src1Idx))
            TII->legalizeOpWithMove(MI, Src1Idx);
        }

        if (!HasAGPRs)
          return;

        // Resolve the rest of AV operands to AGPRs.
        if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
          if (Src2->isReg() && Src2->getReg().isVirtual()) {
            auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
            if (TRI->isVectorSuperClass(RC)) {
              auto *NewRC = TRI->getEquivalentAGPRClass(RC);
              MRI.setRegClass(Src2->getReg(), NewRC);
              if (Src2->isTied())
                MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
            }
          }
        }
      }
    }

    return;
  }

  if (TII->isImage(MI))
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
}

static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
                              uint64_t Val) {
  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}

MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  const SDValue Ops0[] = {
      DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
      buildSMovImm32(DAG, DL, 0),
      DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
      DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  SDValue SubRegHi = SDValue(
      DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);

  // Combine the constants and the pointer.
  const SDValue Ops1[] = {
      DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
      DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
      DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}

/// Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
                                           SDValue Ptr, uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
  if (RsrcDword1) {
    PtrHi =
        SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
                                   DAG.getConstant(RsrcDword1, DL, MVT::i32)),
                0);
  }

  SDValue DataLo =
      buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

  const SDValue Ops[] = {
      DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
      PtrLo,
      DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      PtrHi,
      DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
      DataLo,
      DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
      DataHi,
      DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
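
// For reference, the descriptor assembled above packs, per 32-bit dword:
//   dword0 = pointer lo, dword1 = pointer hi | RsrcDword1,
//   dword2 = RsrcDword2And3 & 0xffffffff, dword3 = RsrcDword2And3 >> 32.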

//===----------------------------------------------------------------------===//
// SI Inline Assembly Support
//===----------------------------------------------------------------------===//

std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
                                               StringRef Constraint,
                                               MVT VT) const {
  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);

  const TargetRegisterClass *RC = nullptr;
  if (Constraint.size() == 1) {
    const unsigned BitWidth = VT.getSizeInBits();
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's':
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::SReg_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      default:
        RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'v':
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::VGPR_32RegClass;
        break;
      default:
        RC = TRI->getVGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'a':
      if (!Subtarget->hasMAIInsts())
        break;
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::AGPR_32RegClass;
        break;
      default:
        RC = TRI->getAGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    }

    // We actually support i128, i16 and f16 as inline parameters
    // even if they are not reported as legal
    if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
               VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
      return std::pair(0U, RC);
  }

  if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
    StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
    if (RegName.consume_front("v")) {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (RegName.consume_front("s")) {
      RC = &AMDGPU::SGPR_32RegClass;
    } else if (RegName.consume_front("a")) {
      RC = &AMDGPU::AGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      if (RegName.consume_front("[")) {
        uint32_t End;
        bool Failed = RegName.consumeInteger(10, Idx);
        Failed |= !RegName.consume_front(":");
        Failed |= RegName.consumeInteger(10, End);
        Failed |= !RegName.consume_back("]");
        if (!Failed) {
          uint32_t Width = (End - Idx + 1) * 32;
          // Prohibit constraints for register ranges with a width that does not
          // match the required type.
          if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
            return std::pair(0U, nullptr);
          MCRegister Reg = RC->getRegister(Idx);
          if (SIRegisterInfo::isVGPRClass(RC))
            RC = TRI->getVGPRClassForBitWidth(Width);
          else if (SIRegisterInfo::isSGPRClass(RC))
            RC = TRI->getSGPRClassForBitWidth(Width);
          else if (SIRegisterInfo::isAGPRClass(RC))
            RC = TRI->getAGPRClassForBitWidth(Width);
          if (RC) {
            Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
            return std::pair(Reg, RC);
          }
        }
      } else {
        // Check for lossy scalar/vector conversions.
        if (VT.isVector() && VT.getSizeInBits() != 32)
          return std::pair(0U, nullptr);
        bool Failed = RegName.getAsInteger(10, Idx);
        if (!Failed && Idx < RC->getNumRegs())
          return std::pair(RC->getRegister(Idx), RC);
      }
    }
  }

  auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  if (Ret.first)
    Ret.second = TRI->getPhysRegBaseClass(Ret.first);

  return Ret;
}

static bool isImmConstraint(StringRef Constraint) {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 'I':
    case 'J':
    case 'A':
    case 'B':
    case 'C':
      return true;
    }
  } else if (Constraint == "DA" || Constraint == "DB") {
    return true;
  }
  return false;
}

SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 's':
    case 'v':
    case 'a':
      return C_RegisterClass;
    }
  }
  if (isImmConstraint(Constraint)) {
    return C_Other;
  }
  return TargetLowering::getConstraintType(Constraint);
}

static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
  if (!AMDGPU::isInlinableIntLiteral(Val)) {
    Val = Val & maskTrailingOnes<uint64_t>(Size);
  }
  return Val;
}

void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                    StringRef Constraint,
                                                    std::vector<SDValue> &Ops,
                                                    SelectionDAG &DAG) const {
  if (isImmConstraint(Constraint)) {
    uint64_t Val;
    if (getAsmOperandConstVal(Op, Val) &&
        checkAsmConstraintVal(Op, Constraint, Val)) {
      Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
      Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
    }
  } else {
    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
  }
}

bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
  unsigned Size = Op.getScalarValueSizeInBits();
  if (Size > 64)
    return false;

  if (Size == 16 && !Subtarget->has16BitInsts())
    return false;

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
    Val = C->getSExtValue();
    return true;
  }
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
    Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
    return true;
  }
  if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
    if (Size != 16 || Op.getNumOperands() != 2)
      return false;
    if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
      return false;
    if (ConstantSDNode *C = V->getConstantSplatNode()) {
      Val = C->getSExtValue();
      return true;
    }
    if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
      Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
      return true;
    }
  }

  return false;
}

bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
                                             uint64_t Val) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'I':
      return AMDGPU::isInlinableIntLiteral(Val);
    case 'J':
      return isInt<16>(Val);
    case 'A':
      return checkAsmConstraintValA(Op, Val);
    case 'B':
      return isInt<32>(Val);
    case 'C':
      return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
             AMDGPU::isInlinableIntLiteral(Val);
    default:
      break;
    }
  } else if (Constraint.size() == 2) {
    if (Constraint == "DA") {
      int64_t HiBits = static_cast<int32_t>(Val >> 32);
      int64_t LoBits = static_cast<int32_t>(Val);
      return checkAsmConstraintValA(Op, HiBits, 32) &&
             checkAsmConstraintValA(Op, LoBits, 32);
    }
    if (Constraint == "DB") {
      return true;
    }
  }
  llvm_unreachable("Invalid asm constraint");
}
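
// A hypothetical use of the single-character constraints validated above:
//   asm volatile("; use %0 %1" :: "I"(15), "B"(1000));
// The 'I' operand must be an inlinable integer literal (15 qualifies, 1000
// does not), while 'B' only requires a value that fits in a signed 32-bit
// immediate.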

bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
                                              unsigned MaxSize) const {
  unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
  bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
  if (Size == 16) {
    MVT VT = Op.getSimpleValueType();
    switch (VT.SimpleTy) {
    default:
      return false;
    case MVT::i16:
      return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
    case MVT::f16:
      return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
    case MVT::bf16:
      return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
    case MVT::v2i16:
      return AMDGPU::getInlineEncodingV2I16(Val).has_value();
    case MVT::v2f16:
      return AMDGPU::getInlineEncodingV2F16(Val).has_value();
    case MVT::v2bf16:
      return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
    }
  }
  if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
      (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
    return true;
  return false;
}
) {
15839 switch (UnalignedClassID
) {
15840 case AMDGPU::VReg_64RegClassID
:
15841 return AMDGPU::VReg_64_Align2RegClassID
;
15842 case AMDGPU::VReg_96RegClassID
:
15843 return AMDGPU::VReg_96_Align2RegClassID
;
15844 case AMDGPU::VReg_128RegClassID
:
15845 return AMDGPU::VReg_128_Align2RegClassID
;
15846 case AMDGPU::VReg_160RegClassID
:
15847 return AMDGPU::VReg_160_Align2RegClassID
;
15848 case AMDGPU::VReg_192RegClassID
:
15849 return AMDGPU::VReg_192_Align2RegClassID
;
15850 case AMDGPU::VReg_224RegClassID
:
15851 return AMDGPU::VReg_224_Align2RegClassID
;
15852 case AMDGPU::VReg_256RegClassID
:
15853 return AMDGPU::VReg_256_Align2RegClassID
;
15854 case AMDGPU::VReg_288RegClassID
:
15855 return AMDGPU::VReg_288_Align2RegClassID
;
15856 case AMDGPU::VReg_320RegClassID
:
15857 return AMDGPU::VReg_320_Align2RegClassID
;
15858 case AMDGPU::VReg_352RegClassID
:
15859 return AMDGPU::VReg_352_Align2RegClassID
;
15860 case AMDGPU::VReg_384RegClassID
:
15861 return AMDGPU::VReg_384_Align2RegClassID
;
15862 case AMDGPU::VReg_512RegClassID
:
15863 return AMDGPU::VReg_512_Align2RegClassID
;
15864 case AMDGPU::VReg_1024RegClassID
:
15865 return AMDGPU::VReg_1024_Align2RegClassID
;
15866 case AMDGPU::AReg_64RegClassID
:
15867 return AMDGPU::AReg_64_Align2RegClassID
;
15868 case AMDGPU::AReg_96RegClassID
:
15869 return AMDGPU::AReg_96_Align2RegClassID
;
15870 case AMDGPU::AReg_128RegClassID
:
15871 return AMDGPU::AReg_128_Align2RegClassID
;
15872 case AMDGPU::AReg_160RegClassID
:
15873 return AMDGPU::AReg_160_Align2RegClassID
;
15874 case AMDGPU::AReg_192RegClassID
:
15875 return AMDGPU::AReg_192_Align2RegClassID
;
15876 case AMDGPU::AReg_256RegClassID
:
15877 return AMDGPU::AReg_256_Align2RegClassID
;
15878 case AMDGPU::AReg_512RegClassID
:
15879 return AMDGPU::AReg_512_Align2RegClassID
;
15880 case AMDGPU::AReg_1024RegClassID
:
15881 return AMDGPU::AReg_1024_Align2RegClassID
;

// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
  }

  // TODO: Move this logic to getReservedRegs()
  // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  Register SReg = ST.isWave32()
                      ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
                      : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
                                                     &AMDGPU::SGPR_64RegClass);
  Info->setSGPRForEXECCopy(SReg);

  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                             Info->getStackPtrOffsetReg()));
  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());

  // We need to worry about replacing the default register with itself in case
  // of MIR testcases missing the MFI.
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());

  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());

  Info->limitOccupancy(MF);

  if (ST.isWave32() && !MF.empty()) {
    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);
      }
    }
  }

  // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
  // classes if required. Ideally the register class constraints would differ
  // per-subtarget, but there's no easy way to achieve that right now. This is
  // not a problem for VGPRs because the correctly aligned VGPR class is implied
  // from using them as the register class for legal types.
  if (ST.needsAlignedVGPRs()) {
    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
      const Register Reg = Register::index2VirtReg(I);
      const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
      if (!RC)
        continue;
      int NewClassID = getAlignedAGPRClassID(RC->getID());
      if (NewClassID != -1)
        MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
    }
  }

  TargetLoweringBase::finalizeLowering(MF);
}

void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                     KnownBits &Known,
                                                     const APInt &DemandedElts,
                                                     const SelectionDAG &DAG,
                                                     unsigned Depth) const {
  Known.resetAll();
  unsigned Opc = Op.getOpcode();
  switch (Opc) {
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = Op.getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      const GCNSubtarget &ST =
          DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
      // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
      // most 31 + src1.
      Known.Zero.setBitsFrom(
          IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
      KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
      Known = KnownBits::add(Known, Known2);
      return;
    }
    }
    break;
  }
  }
  return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
      Op, Known, DemandedElts, DAG, Depth);
}

void SITargetLowering::computeKnownBitsForFrameIndex(
    const int FI, KnownBits &Known, const MachineFunction &MF) const {
  TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);

  // Set the high bits to zero based on the maximum allowed scratch size per
  // wave. We can't use vaddr in MUBUF instructions if we don't know the address
  // calculation won't overflow, so assume the sign bit is never set.
  Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}

static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
                                   KnownBits &Known, unsigned Dim) {
  unsigned MaxValue =
      ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
  Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
}

void SITargetLowering::computeKnownBitsForTargetInstr(
    GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
    const MachineRegisterInfo &MRI, unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  switch (MI->getOpcode()) {
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
    switch (IID) {
    case Intrinsic::amdgcn_workitem_id_x:
      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
      break;
    case Intrinsic::amdgcn_workitem_id_y:
      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
      break;
    case Intrinsic::amdgcn_workitem_id_z:
      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
      break;
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
      // most 31 + src1.
      Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
                                 ? getSubtarget()->getWavefrontSizeLog2()
                                 : 5);
      KnownBits Known2;
      KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
                              Depth + 1);
      Known = KnownBits::add(Known, Known2);
      break;
    }
    case Intrinsic::amdgcn_groupstaticsize: {
      // We can report everything over the maximum size as 0. We can't report
      // based on the actual size because we don't know if it's accurate or not
      // at any given point.
      Known.Zero.setHighBits(
          llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    Known.Zero.setHighBits(24);
    break;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    Known.Zero.setHighBits(16);
    break;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();

    KnownBits Known2;
    KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1;
    KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0;
    KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  }
}

Align SITargetLowering::computeKnownAlignForTargetInstr(
    GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
    unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
    // FIXME: Can this move to generic code? What about the case where the call
    // site specifies a lower alignment?
    Intrinsic::ID IID = GI->getIntrinsicID();
    LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
    AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
    if (MaybeAlign RetAlign = Attrs.getRetAlignment())
      return *RetAlign;
  }
  return Align(1);
}

Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
  const Align CacheLineAlign = Align(64);

  // Pre-GFX10 target did not benefit from loop alignment
  if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 I$ is 4 x 64 bytes cache lines.
  // By default prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
  // behind and one ahead.
  // Therefore we can benefit from aligning loop headers if loop fits 192 bytes.
  // If loop fits 64 bytes it always spans no more than two cache lines and
  // does not need an alignment.
  // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
  // Else if loop is less or equal 192 bytes we need two lines behind.

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If inner loop block is aligned assume in average half of the alignment
    // size to be added as nops.
    if (MBB != Header)
      LoopSize += MBB->getAlignment().value() / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of parent loops is surrounded by prefetch instructions do not
  // insert new for inner loop, which would reset parent's settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  if (Pre && Exit) {
    auto PreTerm = Pre->getFirstTerminator();
    if (PreTerm == Pre->begin() ||
        std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
          .addImm(1); // prefetch 2 lines behind PC

    auto ExitHead = Exit->getFirstNonDebugInstr();
    if (ExitHead == Exit->end() ||
        ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
          .addImm(2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}

LLVM_ATTRIBUTE_UNUSED
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
  assert(N->getOpcode() == ISD::CopyFromReg);
  do {
    // Follow the chain until we find an INLINEASM node.
    N = N->getOperand(0).getNode();
    if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
      return true;
  } while (N->getOpcode() == ISD::CopyFromReg);
  return false;
}

bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
                                                  FunctionLoweringInfo *FLI,
                                                  UniformityInfo *UA) const {
  switch (N->getOpcode()) {
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
    const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    Register Reg = R->getReg();

    // FIXME: Why does this need to consider isLiveIn?
    if (Reg.isPhysical() || MRI.isLiveIn(Reg))
      return !TRI->isSGPRReg(MRI, Reg);

    if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
      return UA->isDivergent(V);

    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
    return !TRI->isSGPRReg(MRI, Reg);
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  }
  case ISD::CALLSEQ_END:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
  case ISD::INTRINSIC_W_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
  case AMDGPUISD::ATOMIC_CMP_SWAP:
  case AMDGPUISD::BUFFER_ATOMIC_SWAP:
  case AMDGPUISD::BUFFER_ATOMIC_ADD:
  case AMDGPUISD::BUFFER_ATOMIC_SUB:
  case AMDGPUISD::BUFFER_ATOMIC_SMIN:
  case AMDGPUISD::BUFFER_ATOMIC_UMIN:
  case AMDGPUISD::BUFFER_ATOMIC_SMAX:
  case AMDGPUISD::BUFFER_ATOMIC_UMAX:
  case AMDGPUISD::BUFFER_ATOMIC_AND:
  case AMDGPUISD::BUFFER_ATOMIC_OR:
  case AMDGPUISD::BUFFER_ATOMIC_XOR:
  case AMDGPUISD::BUFFER_ATOMIC_INC:
  case AMDGPUISD::BUFFER_ATOMIC_DEC:
  case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
  case AMDGPUISD::BUFFER_ATOMIC_CSUB:
  case AMDGPUISD::BUFFER_ATOMIC_FADD:
  case AMDGPUISD::BUFFER_ATOMIC_FMIN:
  case AMDGPUISD::BUFFER_ATOMIC_FMAX:
    // Target-specific read-modify-write atomics are sources of divergence.
    return true;
  default:
    if (auto *A = dyn_cast<AtomicSDNode>(N)) {
      // Generic read-modify-write atomics are sources of divergence.
      return A->readMem() && A->writeMem();
    }
  }

  return false;
}
16240 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG
&DAG
,
16242 switch (VT
.getScalarType().getSimpleVT().SimpleTy
) {
16244 return !denormalModeIsFlushAllF32(DAG
.getMachineFunction());
16247 return !denormalModeIsFlushAllF64F16(DAG
.getMachineFunction());
16253 bool SITargetLowering::denormalsEnabledForType(
16254 LLT Ty
, const MachineFunction
&MF
) const {
16255 switch (Ty
.getScalarSizeInBits()) {
16257 return !denormalModeIsFlushAllF32(MF
);
16260 return !denormalModeIsFlushAllF64F16(MF
);
16266 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op
,
16267 const SelectionDAG
&DAG
,
16269 unsigned Depth
) const {
16270 if (Op
.getOpcode() == AMDGPUISD::CLAMP
) {
16271 const MachineFunction
&MF
= DAG
.getMachineFunction();
16272 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
16274 if (Info
->getMode().DX10Clamp
)
16275 return true; // Clamped to 0.
16276 return DAG
.isKnownNeverNaN(Op
.getOperand(0), SNaN
, Depth
+ 1);
16279 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op
, DAG
, SNaN
,
16283 // On older subtargets, global FP atomic instructions have a hardcoded FP mode
16284 // and do not support FP32 denormals, and only support v2f16/f64 denormals.
16285 static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst
*RMW
) {
16286 if (RMW
->hasMetadata("amdgpu.ignore.denormal.mode"))
16289 const fltSemantics
&Flt
= RMW
->getType()->getScalarType()->getFltSemantics();
16290 auto DenormMode
= RMW
->getFunction()->getDenormalMode(Flt
);
16291 if (DenormMode
== DenormalMode::getPreserveSign())
16294 // TODO: Remove this.
16295 return RMW
->getFunction()
16296 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16300 static OptimizationRemark
emitAtomicRMWLegalRemark(const AtomicRMWInst
*RMW
) {
16301 LLVMContext
&Ctx
= RMW
->getContext();
16302 StringRef SS
= Ctx
.getSyncScopeName(RMW
->getSyncScopeID()).value_or("");
16303 StringRef MemScope
= SS
.empty() ? StringRef("system") : SS
;
16305 return OptimizationRemark(DEBUG_TYPE
, "Passed", RMW
)
16306 << "Hardware instruction generated for atomic "
16307 << RMW
->getOperationName(RMW
->getOperation())
16308 << " operation at memory scope " << MemScope
;
16311 static bool isV2F16OrV2BF16(Type
*Ty
) {
16312 if (auto *VT
= dyn_cast
<FixedVectorType
>(Ty
)) {
16313 Type
*EltTy
= VT
->getElementType();
16314 return VT
->getNumElements() == 2 &&
16315 (EltTy
->isHalfTy() || EltTy
->isBFloatTy());
16321 static bool isV2F16(Type
*Ty
) {
16322 FixedVectorType
*VT
= dyn_cast
<FixedVectorType
>(Ty
);
16323 return VT
&& VT
->getNumElements() == 2 && VT
->getElementType()->isHalfTy();
16326 static bool isV2BF16(Type
*Ty
) {
16327 FixedVectorType
*VT
= dyn_cast
<FixedVectorType
>(Ty
);
16328 return VT
&& VT
->getNumElements() == 2 && VT
->getElementType()->isBFloatTy();
16331 /// \return true if atomicrmw integer ops work for the type.
16332 static bool isAtomicRMWLegalIntTy(Type
*Ty
) {
16333 if (auto *IT
= dyn_cast
<IntegerType
>(Ty
)) {
16334 unsigned BW
= IT
->getBitWidth();
16335 return BW
== 32 || BW
== 64;
16341 /// \return true if this atomicrmw xchg type can be selected.
16342 static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst
*RMW
) {
16343 Type
*Ty
= RMW
->getType();
16344 if (isAtomicRMWLegalIntTy(Ty
))
16347 if (PointerType
*PT
= dyn_cast
<PointerType
>(Ty
)) {
16348 const DataLayout
&DL
= RMW
->getFunction()->getParent()->getDataLayout();
16349 unsigned BW
= DL
.getPointerSizeInBits(PT
->getAddressSpace());
16350 return BW
== 32 || BW
== 64;
16353 if (Ty
->isFloatTy() || Ty
->isDoubleTy())
16356 if (FixedVectorType
*VT
= dyn_cast
<FixedVectorType
>(Ty
)) {
16357 return VT
->getNumElements() == 2 &&
16358 VT
->getElementType()->getPrimitiveSizeInBits() == 16;
/// \returns true if it's valid to emit a native instruction for \p RMW, based
/// on the properties of the target memory.
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
                                        const AtomicRMWInst *RMW,
                                        bool HasSystemScope) {
  // The remote/fine-grained access logic is different from the integer
  // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
  // fine-grained access does not work, even for a device local allocation.
  //
  // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
  // allocations work.
  if (HasSystemScope) {
    if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
        RMW->hasMetadata("amdgpu.no.remote.memory"))
      return true;
  } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
    return true;

  return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
}
/// \return Action to perform on AtomicRMWInsts for integer operations.
static TargetLowering::AtomicExpansionKind
atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
  return isAtomicRMWLegalIntTy(RMW->getType())
             ? TargetLowering::AtomicExpansionKind::None
             : TargetLowering::AtomicExpansionKind::CmpXChg;
}
/// Return if a flat address space atomicrmw can access private memory.
static bool flatInstrMayAccessPrivate(const Instruction *I) {
  const MDNode *NoaliasAddrSpaceMD =
      I->getMetadata(LLVMContext::MD_noalias_addrspace);
  if (!NoaliasAddrSpaceMD)
    return true;

  for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
       ++I) {
    auto *Low = mdconst::extract<ConstantInt>(
        NoaliasAddrSpaceMD->getOperand(2 * I + 0));
    if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
      auto *High = mdconst::extract<ConstantInt>(
          NoaliasAddrSpaceMD->getOperand(2 * I + 1));
      return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
    }
  }

  return true;
}
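// Illustrative example (assumed IR): a flat atomic that is known not to touch
// the private address space (AMDGPUAS::PRIVATE_ADDRESS == 5) carries a
// !noalias.addrspace range covering 5:
//
//   %old = atomicrmw add ptr %p, i64 1 seq_cst, !noalias.addrspace !0
//   !0 = !{i32 5, i32 6}   ; excludes address spaces in [5, 6)
//
// For such an instruction the helper above returns false, so the 64-bit
// flat-may-alias-private expansion below is not triggered.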
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  unsigned AS = RMW->getPointerAddressSpace();
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return AtomicExpansionKind::NotAtomic;

  // 64-bit flat atomics that dynamically reside in private memory will
  // silently be dropped.
  //
  // Note that we will emit a new copy of the original atomic in the expansion,
  // which will be incrementally relegalized.
  const DataLayout &DL = RMW->getFunction()->getDataLayout();
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      DL.getTypeSizeInBits(RMW->getType()) == 64 &&
      flatInstrMayAccessPrivate(RMW))
    return AtomicExpansionKind::Expand;

  auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
    OptimizationRemarkEmitter ORE(RMW->getFunction());
    ORE.emit([=]() {
      return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
    });
    return Kind;
  };

  auto SSID = RMW->getSyncScopeID();
  bool HasSystemScope =
      SSID == SyncScope::System ||
      SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");

  auto Op = RMW->getOperation();
  switch (Op) {
  case AtomicRMWInst::Xchg: {
    // PCIe supports add and xchg for system atomics.
    return isAtomicRMWLegalXChgTy(RMW)
               ? TargetLowering::AtomicExpansionKind::None
               : TargetLowering::AtomicExpansionKind::CmpXChg;
  }
  case AtomicRMWInst::Add:
  case AtomicRMWInst::And:
  case AtomicRMWInst::UIncWrap:
  case AtomicRMWInst::UDecWrap:
    return atomicSupportedIfLegalIntType(RMW);
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor: {
    // Atomic sub/or/xor do not work over PCI express, but atomic add
    // does. InstCombine transforms these with 0 to or, so undo that.
    if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
      if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
          ConstVal && ConstVal->isNullValue())
        return AtomicExpansionKind::Expand;
    }

    return atomicSupportedIfLegalIntType(RMW);
  }
  case AtomicRMWInst::FAdd: {
    Type *Ty = RMW->getType();

    // TODO: Handle REGION_ADDRESS
    if (AS == AMDGPUAS::LOCAL_ADDRESS) {
      // DS F32 FP atomics do respect the denormal mode, but the rounding mode
      // is fixed to round-to-nearest-even.
      //
      // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
      // round-to-nearest-even.
      //
      // We ignore the rounding mode problem, even in strictfp. The C++ standard
      // suggests it is OK if the floating-point mode may not match the calling
      // thread's mode.
      if (Ty->isFloatTy()) {
        return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
                                                 : AtomicExpansionKind::CmpXChg;
      }

      if (Ty->isDoubleTy()) {
        // Ignores denormal mode, but we don't consider flushing mandatory.
        return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
                                                 : AtomicExpansionKind::CmpXChg;
      }

      if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
        return AtomicExpansionKind::None;

      return AtomicExpansionKind::CmpXChg;
    }

    // LDS atomics respect the denormal mode from the mode register.
    //
    // Traditionally f32 global/buffer memory atomics would unconditionally
    // flush denormals, but newer targets do not flush. f64/f16/bf16 cases
    // never flush.
    //
    // On targets with flat atomic fadd, denormals would flush depending on
    // whether the target address resides in LDS or global memory. We consider
    // this flat-maybe-flush as will-flush.
    if (Ty->isFloatTy() &&
        !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
        !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
      return AtomicExpansionKind::CmpXChg;

    // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
    // safe. The message phrasing also should be better.
    if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
      if (AS == AMDGPUAS::FLAT_ADDRESS) {
        if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
        // gfx90a, gfx940, gfx12
        if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);

        if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
        // gfx90a, gfx940, gfx12
        if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);

        // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
        // buffer. gfx12 does have the buffer version.
        if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      }

      // global and flat atomic fadd f64: gfx90a, gfx940.
      if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
        return ReportUnsafeHWInst(AtomicExpansionKind::None);

      if (AS != AMDGPUAS::FLAT_ADDRESS) {
        if (Ty->isFloatTy()) {
          // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
          // gfx11+.
          if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
            return ReportUnsafeHWInst(AtomicExpansionKind::None);
          // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
          if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
            return ReportUnsafeHWInst(AtomicExpansionKind::None);
        } else {
          if (RMW->use_empty() &&
              Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
              isV2F16(Ty))
            return ReportUnsafeHWInst(AtomicExpansionKind::None);
        }
      }

      // flat atomic fadd f32: gfx940, gfx11+.
      if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
        if (Subtarget->hasFlatAtomicFaddF32Inst())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);

        // If it is in the flat address space and the type is float, we will
        // try to expand it if the target supports both global and LDS atomic
        // fadd. The reason is that the expansion emits an address space check:
        // if the address is in the global address space, we emit the global
        // atomic fadd; if it is in the shared address space, we emit the LDS
        // atomic fadd.
        if (Subtarget->hasLDSFPAtomicAddF32()) {
          if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
            return AtomicExpansionKind::Expand;
          if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
            return AtomicExpansionKind::Expand;
        }
      }
    }

    return AtomicExpansionKind::CmpXChg;
  }
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax: {
    Type *Ty = RMW->getType();

    // LDS float and double fmin/fmax were always supported.
    if (AS == AMDGPUAS::LOCAL_ADDRESS) {
      return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
                                                 : AtomicExpansionKind::CmpXChg;
    }

    if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
      // For flat and global cases:
      // float, double in gfx7. Manual claims denormal support.
      // Removed in gfx8.
      // float, double restored in gfx10.
      // double removed again in gfx11, so only f32 for gfx11/gfx12.
      //
      // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
      // no f32.
      if (AS == AMDGPUAS::FLAT_ADDRESS) {
        if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
        if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
                 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
        if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
        if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      }
    }

    return AtomicExpansionKind::CmpXChg;
  }
  case AtomicRMWInst::Min:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::UMax: {
    if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
        AS == AMDGPUAS::BUFFER_FAT_POINTER) {
      // Always expand system scope min/max atomics.
      if (HasSystemScope)
        return AtomicExpansionKind::CmpXChg;
    }

    return atomicSupportedIfLegalIntType(RMW);
  }
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::FSub:
  default:
    return AtomicExpansionKind::CmpXChg;
  }

  llvm_unreachable("covered atomicrmw op switch");
}
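// For reference (a simplified sketch with assumed block and value names): when
// the hook above returns AtomicExpansionKind::CmpXChg, AtomicExpandPass
// rewrites the operation into a compare-exchange loop, e.g. for
//
//   %old = atomicrmw nand ptr addrspace(1) %p, i32 %v seq_cst
//
// roughly:
//
// atomicrmw.start:
//   %loaded = phi i32 [ %init, %entry ], [ %newloaded, %atomicrmw.start ]
//   %and = and i32 %loaded, %v
//   %new = xor i32 %and, -1
//   %pair = cmpxchg ptr addrspace(1) %p, i32 %loaded, i32 %new seq_cst seq_cst
//   %newloaded = extractvalue { i32, i1 } %pair, 0
//   %success = extractvalue { i32, i1 } %pair, 1
//   br i1 %success, label %atomicrmw.end, label %atomicrmw.start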
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
             ? AtomicExpansionKind::NotAtomic
             : AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
             ? AtomicExpansionKind::NotAtomic
             : AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
  unsigned AddrSpace = CmpX->getPointerAddressSpace();
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return AtomicExpansionKind::NotAtomic;

  if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
    return AtomicExpansionKind::None;

  const DataLayout &DL = CmpX->getDataLayout();

  Type *ValTy = CmpX->getNewValOperand()->getType();

  // If a 64-bit flat atomic may alias private, we need to avoid using the
  // atomic in the private case.
  return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
                                           : AtomicExpansionKind::None;
}
const TargetRegisterClass *
SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
  const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
    return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
                                 : &AMDGPU::SReg_32RegClass;
  if (!TRI->isSGPRClass(RC) && !isDivergent)
    return TRI->getEquivalentSGPRClass(RC);
  if (TRI->isSGPRClass(RC) && isDivergent)
    return TRI->getEquivalentVGPRClass(RC);

  return RC;
}
// FIXME: This is a workaround for DivergenceAnalysis not understanding always
// uniform values (as produced by the mask results of control flow intrinsics)
// used outside of divergent blocks. The phi users need to also be treated as
// always uniform.
//
// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
                      unsigned WaveSize) {
  // FIXME: We assume we never cast the mask results of a control flow
  // intrinsic.
  // Early exit if the type won't be consistent as a compile time hack.
  IntegerType *IT = dyn_cast<IntegerType>(V->getType());
  if (!IT || IT->getBitWidth() != WaveSize)
    return false;

  if (!isa<Instruction>(V))
    return false;
  if (!Visited.insert(V).second)
    return false;
  bool Result = false;
  for (const auto *U : V->users()) {
    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
      if (V == U->getOperand(1)) {
        switch (Intrinsic->getIntrinsicID()) {
        default:
          Result = false;
          break;
        case Intrinsic::amdgcn_if_break:
        case Intrinsic::amdgcn_if:
        case Intrinsic::amdgcn_else:
          Result = true;
          break;
        }
      }
      if (V == U->getOperand(0)) {
        switch (Intrinsic->getIntrinsicID()) {
        default:
          Result = false;
          break;
        case Intrinsic::amdgcn_end_cf:
        case Intrinsic::amdgcn_loop:
          Result = true;
          break;
        }
      }
    } else {
      Result = hasCFUser(U, Visited, WaveSize);
    }
    if (Result)
      break;
  }
  return Result;
}
bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
                                               const Value *V) const {
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm()) {
      // FIXME: This cannot give a correct answer. This should only trigger in
      // the case where inline asm returns mixed SGPR and VGPR results, used
      // outside the defining block. We don't have a specific result to
      // consider, so this assumes if any value is SGPR, the overall register
      // also needs to be SGPR.
      const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
      TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
          MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
      for (auto &TC : TargetConstraints) {
        if (TC.Type == InlineAsm::isOutput) {
          ComputeConstraintToUse(TC, SDValue());
          const TargetRegisterClass *RC =
              getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
                                           TC.ConstraintVT).second;
          if (RC && SIRI->isSGPRClass(RC))
            return true;
        }
      }
    }
  }

  SmallPtrSet<const Value *, 16> Visited;
  return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
}
bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
  SDNode::use_iterator I = N->use_begin(), E = N->use_end();
  for (; I != E; ++I) {
    if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
      if (getBasePtrIndex(M) == I.getOperandNo())
        return true;
    }
  }
  return false;
}
bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
                                           SDValue N1) const {
  if (!N0.hasOneUse())
    return false;
  // Take care of the opportunity to keep N0 uniform.
  if (N0->isDivergent() || !N1->isDivergent())
    return true;
  // Check if we have a good chance to form the memory access pattern with the
  // base and offset.
  return (DAG.isBaseWithConstantOffset(N0) &&
          hasMemSDNodeUser(*N0->use_begin()));
}
bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                           Register N0, Register N1) const {
  return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}
MachineMemOperand::Flags
SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
  // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
  MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
  if (I.getMetadata("amdgpu.noclobber"))
    Flags |= MONoClobber;
  if (I.getMetadata("amdgpu.last.use"))
    Flags |= MOLastUse;
  return Flags;
}
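// Illustrative example (assumed IR): AMDGPUAnnotateUniformValues tags loads it
// proves are not clobbered, e.g.
//
//   %v = load i32, ptr addrspace(1) %p, align 4, !amdgpu.noclobber !0
//   !0 = !{}
//
// and the hook above turns that metadata into the MONoClobber MMO flag.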
bool SITargetLowering::checkForPhysRegDependency(
    SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
    const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
  if (User->getOpcode() != ISD::CopyToReg)
    return false;
  if (!Def->isMachineOpcode())
    return false;
  MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
  if (!MDef)
    return false;

  unsigned ResNo = User->getOperand(Op).getResNo();
  if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
    return false;
  const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
  if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
    PhysReg = AMDGPU::SCC;
    const TargetRegisterClass *RC =
        TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
    Cost = RC->getCopyCost();
    return true;
  }
  return false;
}
void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
    Instruction *AI) const {
  // Given: atomicrmw fadd ptr %addr, float %val ordering
  //
  // With this expansion we produce the following code:
  //   [...]
  //   %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
  //   br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
  //
  // atomicrmw.shared:
  //   %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
  //   %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
  //                                   float %val ordering
  //   br label %atomicrmw.phi
  //
  // atomicrmw.check.private:
  //   %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
  //   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
  //
  // atomicrmw.private:
  //   %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  //   %loaded.private = load float, ptr addrspace(5) %cast.private
  //   %val.new = fadd float %loaded.private, %val
  //   store float %val.new, ptr addrspace(5) %cast.private
  //   br label %atomicrmw.phi
  //
  // atomicrmw.global:
  //   %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
  //   %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
  //                                   float %val ordering
  //   br label %atomicrmw.phi
  //
  // atomicrmw.phi:
  //   %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
  //                           [ %loaded.private, %atomicrmw.private ],
  //                           [ %loaded.global, %atomicrmw.global ]
  //   br label %atomicrmw.end
  //
  // atomicrmw.end:
  //   [...]
  //
  // For 64-bit atomics which may reside in private memory, we perform a
  // simpler version that only inserts the private check, and uses the flat
  // operation.

  IRBuilder<> Builder(AI);
  LLVMContext &Ctx = Builder.getContext();

  auto *RMW = dyn_cast<AtomicRMWInst>(AI);
  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
                                : AtomicCmpXchgInst::getPointerOperandIndex();
  Value *Addr = AI->getOperand(PtrOpIdx);

  /// TODO: Only need to check private, then emit flat-known-not private (no
  /// need for shared block, or cast to global).
  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);

  Align Alignment;
  if (RMW)
    Alignment = RMW->getAlign();
  else if (CX)
    Alignment = CX->getAlign();
  else
    llvm_unreachable("unhandled atomic operation");

  // FullFlatEmulation is true if we need to issue the private, shared, and
  // global cases.
  //
  // If this is false, we are only dealing with the flat-targeting-private case,
  // where we only insert a check for private and still use the flat instruction
  // for global and shared.
  bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
                           Subtarget->hasAtomicFaddInsts() &&
                           RMW->getType()->isFloatTy();

  // If the return value isn't used, do not introduce a false use in the phi.
  bool ReturnValueIsUsed = !AI->use_empty();

  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *SharedBB = nullptr;

  BasicBlock *CheckPrivateBB = BB;
  if (FullFlatEmulation) {
    SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
    CheckPrivateBB =
        BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
  }

  BasicBlock *PrivateBB =
      BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
  BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
  BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);

  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);

  Value *LoadedShared = nullptr;
  if (FullFlatEmulation) {
    CallInst *IsShared = Builder.CreateIntrinsic(
        Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
    Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
    Builder.SetInsertPoint(SharedBB);
    Value *CastToLocal = Builder.CreateAddrSpaceCast(
        Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));

    Instruction *Clone = AI->clone();
    Clone->insertInto(SharedBB, SharedBB->end());
    Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
    LoadedShared = Clone;

    Builder.CreateBr(PhiBB);
    Builder.SetInsertPoint(CheckPrivateBB);
  }

  CallInst *IsPrivate = Builder.CreateIntrinsic(
      Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
  Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);

  Builder.SetInsertPoint(PrivateBB);

  Value *CastToPrivate = Builder.CreateAddrSpaceCast(
      Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));

  Value *LoadedPrivate;
  if (RMW) {
    LoadedPrivate = Builder.CreateAlignedLoad(
        RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");

    Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
                                        LoadedPrivate, RMW->getValOperand());

    Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
  } else {
    auto [ResultLoad, Equal] =
        buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
                          CX->getNewValOperand(), CX->getAlign());

    Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
                                              ResultLoad, 0);
    LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
  }

  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(GlobalBB);

  // Continue using a flat instruction if we only emitted the check for private.
  Instruction *LoadedGlobal = AI;
  if (FullFlatEmulation) {
    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
        Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
    AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
  }

  AI->removeFromParent();
  AI->insertInto(GlobalBB, GlobalBB->end());

  // The new atomicrmw may go through another round of legalization later.
  if (!FullFlatEmulation) {
    // We inserted the runtime check already, make sure we do not try to
    // re-expand this.
    //
    // TODO: Should union with any existing metadata.
    MDBuilder MDB(F->getContext());
    MDNode *RangeNotPrivate =
        MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
                        APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
    LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
                              RangeNotPrivate);
  }

  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(PhiBB);

  if (ReturnValueIsUsed) {
    PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
    AI->replaceAllUsesWith(Loaded);
    if (FullFlatEmulation)
      Loaded->addIncoming(LoadedShared, SharedBB);
    Loaded->addIncoming(LoadedPrivate, PrivateBB);
    Loaded->addIncoming(LoadedGlobal, GlobalBB);
    Loaded->takeName(AI);
  }

  Builder.CreateBr(ExitBB);
}
void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  AtomicRMWInst::BinOp Op = AI->getOperation();

  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
      Op == AtomicRMWInst::Xor) {
    if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
        ConstVal && ConstVal->isNullValue()) {
      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
      AI->setOperation(AtomicRMWInst::Add);

      // We may still need the private-alias-flat handling below.

      // TODO: Skip this for cases where we cannot access remote memory.
    }
  }

  // The non-flat expansions should only perform the de-canonicalization of
  // identity values.
  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
    return;

  emitExpandAtomicAddrSpacePredicate(AI);
}
void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
  emitExpandAtomicAddrSpacePredicate(CI);
}
LoadInst *
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  IRBuilder<> Builder(AI);
  auto Order = AI->getOrdering();

  // The optimization removes the store aspect of the atomicrmw, so the cache
  // must be flushed if the atomic ordering had release semantics. That is not
  // necessarily a fence; a release fence just happens to perform that flush.
  // So avoid replacing an atomicrmw that has release semantics.
  if (isReleaseOrStronger(Order))
    return nullptr;

  LoadInst *LI = Builder.CreateAlignedLoad(
      AI->getType(), AI->getPointerOperand(), AI->getAlign());
  LI->setAtomic(Order, AI->getSyncScopeID());
  LI->copyMetadata(*AI);

  AI->replaceAllUsesWith(LI);
  AI->eraseFromParent();
  return LI;
}
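// Illustrative example (assumed IR, not taken from a test): an idempotent RMW
// such as
//
//   %old = atomicrmw or ptr addrspace(1) %p, i32 0 acquire
//
// is rewritten by the hook above into an atomic load that preserves the
// ordering and sync scope:
//
//   %old = load atomic i32, ptr addrspace(1) %p acquire, align 4
//
// Release (or stronger) orderings are left alone and the hook returns nullptr.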