//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));
// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  if (StoreSize % 32 == 0)
    return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);

  return VT;
}
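
// Count the bits actually needed to represent Op as an unsigned / signed
// value, based on the DAG's known-bits and sign-bit analysis. These helpers
// are used when matching narrow patterns such as 24-bit multiplies.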
unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must
  // be a sign bit.
  return DAG.ComputeMaxSignificantBits(Op);
}
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;

  // Enable ganging up loads and stores in the memcpy DAG lowering.
  MaxGluedStoresPerMemcpy = 16;

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
  setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
  setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
  setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
  setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
  setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
  setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
  setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
  setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
  setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
  setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
  setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
  setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
  setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
  setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
  setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
  setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
  setOperationAction(ISD::LOAD, MVT::i128, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);

  // TODO: Would be better to consume as directly legal
  setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
  setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
  setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
  setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
  setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
  setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
  setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }

  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
  setOperationAction(ISD::STORE, MVT::v3f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
  setOperationAction(ISD::STORE, MVT::v5f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
  setOperationAction(ISD::STORE, MVT::v6f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
  setOperationAction(ISD::STORE, MVT::v7f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
  setOperationAction(ISD::STORE, MVT::v9f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
  setOperationAction(ISD::STORE, MVT::v10f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
  setOperationAction(ISD::STORE, MVT::v11f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
  setOperationAction(ISD::STORE, MVT::v12f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
  setOperationAction(ISD::STORE, MVT::v3i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
  setOperationAction(ISD::STORE, MVT::v3f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
  setOperationAction(ISD::STORE, MVT::v4i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
  setOperationAction(ISD::STORE, MVT::v4f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
  setOperationAction(ISD::STORE, MVT::v8i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
  setOperationAction(ISD::STORE, MVT::v8f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
  setOperationAction(ISD::STORE, MVT::v16i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
  setOperationAction(ISD::STORE, MVT::v16f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
  setOperationAction(ISD::STORE, MVT::i128, Promote);
  AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);

  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);

  // For R600, this is totally unsupported, just custom lower to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
                     MVT::f32, Legal);

  setOperationAction(ISD::FLOG2, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);

  setOperationAction(
      {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
      Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  if (Subtarget->has16BitInsts())
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
  else {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
  }

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
                     Custom);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(
      ISD::IS_FPCLASS,
      {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
       MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
       MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
      Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // FIXME: Why is v8f16/v8bf16 missing?
  setOperationAction(
      ISD::EXTRACT_SUBVECTOR,
      {MVT::v2f16,  MVT::v2bf16, MVT::v2i16,  MVT::v4f16,  MVT::v4bf16,
       MVT::v4i16,  MVT::v2f32,  MVT::v2i32,  MVT::v3f32,  MVT::v3i32,
       MVT::v4f32,  MVT::v4i32,  MVT::v5f32,  MVT::v5i32,  MVT::v6f32,
       MVT::v6i32,  MVT::v7f32,  MVT::v7i32,  MVT::v8f32,  MVT::v8i32,
       MVT::v9f32,  MVT::v9i32,  MVT::v10i32, MVT::v10f32, MVT::v11i32,
       MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
       MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64,  MVT::v2i64,  MVT::v3f64,  MVT::v3i64,  MVT::v4f64,
       MVT::v4i64,  MVT::v8f64,  MVT::v8i64,  MVT::v16f64, MVT::v16i64,
       MVT::v32i16, MVT::v32f16, MVT::v32bf16},
      Custom);

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);

  setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
  setOperationAction(
      {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
      MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  setOperationAction(
      {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
      MVT::i64, Custom);

  for (auto VT : {MVT::i8, MVT::i16})
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction({ISD::ADD,        ISD::AND,     ISD::FP_TO_SINT,
                        ISD::FP_TO_UINT, ISD::MUL,     ISD::MULHU,
                        ISD::MULHS,      ISD::OR,      ISD::SHL,
                        ISD::SRA,        ISD::SRL,     ISD::ROTL,
                        ISD::ROTR,       ISD::SUB,     ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP, ISD::SDIV,    ISD::UDIV,
                        ISD::SREM,       ISD::UREM,    ISD::SMUL_LOHI,
                        ISD::UMUL_LOHI,  ISD::SDIVREM, ISD::UDIVREM,
                        ISD::SELECT,     ISD::VSELECT, ISD::SELECT_CC,
                        ISD::XOR,        ISD::BSWAP,   ISD::CTPOP,
                        ISD::CTTZ,       ISD::CTLZ,    ISD::VECTOR_SHUFFLE,
                        ISD::SETCC},
                       VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32,  MVT::v4f32,  MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(
        {ISD::FABS,          ISD::FMINNUM,        ISD::FMAXNUM,
         ISD::FADD,          ISD::FCEIL,          ISD::FCOS,
         ISD::FDIV,          ISD::FEXP2,          ISD::FEXP,
         ISD::FEXP10,        ISD::FLOG2,          ISD::FREM,
         ISD::FLOG,          ISD::FLOG10,         ISD::FPOW,
         ISD::FFLOOR,        ISD::FTRUNC,         ISD::FMUL,
         ISD::FMA,           ISD::FRINT,          ISD::FNEARBYINT,
         ISD::FSQRT,         ISD::FSIN,           ISD::FSUB,
         ISD::FNEG,          ISD::VSELECT,        ISD::SELECT_CC,
         ISD::FCOPYSIGN,     ISD::VECTOR_SHUFFLE, ISD::SETCC,
         ISD::FCANONICALIZE, ISD::FROUNDEVEN},
        VT, Expand);
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For now,
  // we don't have a way of knowing during instruction selection if a condition
  // will be uniform and we always use vector compares. Assume we are using
  // vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy  = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset  = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  setTargetDAGCombine({ISD::BITCAST,    ISD::SHL,
                       ISD::SRA,        ISD::SRL,
                       ISD::TRUNCATE,   ISD::MUL,
                       ISD::SMUL_LOHI,  ISD::UMUL_LOHI,
                       ISD::MULHU,      ISD::MULHS,
                       ISD::SELECT,     ISD::SELECT_CC,
                       ISD::STORE,      ISD::FADD,
                       ISD::FSUB,       ISD::FNEG,
                       ISD::FABS,       ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});

  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
  setMaxLargeFPConvertBitWidthSupported(64);
}
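
// Signed-zero handling may be relaxed either globally (the NoSignedZerosFPMath
// target option) or per-node via the nsz fast-math flag; either one is enough
// for combines to treat -0.0 and +0.0 interchangeably.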
bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//
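
// Opcodes for which an fneg of the result can instead be folded into the
// operation as a source modifier (or an equivalent negated form), avoiding a
// separate negate instruction.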
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}
static bool fnegFoldsIntoOp(const SDNode *N) {
  unsigned Opc = N->getOpcode();
  if (Opc == ISD::BITCAST) {
    // TODO: Is there a benefit to checking the conditions performFNegCombine
    // does? We don't for the other cases.
    SDValue BCSrc = N->getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      return BCSrc.getNumOperands() == 2 &&
             BCSrc.getOperand(1).getValueSizeInBits() == 32;
    }

    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
  }

  return fnegFoldsIntoOpcode(Opc);
}
/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the type for ISD::SELECT.
LLVM_READONLY
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}
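
/// Check that every user of \p N can absorb a source modifier, and that doing
/// so does not force more than \p CostThreshold users to grow from a short
/// encoding to a VOP3 encoding.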
bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in those cases. If there are
  // multiple users, and each one would necessitate using a VOP3 encoding,
  // there will be a code size increase. Try to avoid increasing code size
  // unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}
EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}
bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}
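
// Nodes whose result is uniform across the wave regardless of their operands:
// chain tokens, intrinsics the target declares always-uniform, loads from the
// 32-bit constant address space, and ballot-style AMDGPU setcc.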
bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(0);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}
SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth + 1);
    if (NegSrc)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//
bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // vector register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}
bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode *N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");
  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
      (N->use_begin()->getOpcode() == ISD::SRA ||
       N->use_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//
CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}
/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments()
///
/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting over
    // to get accurate in-memory offsets. The "PartOffset" is completely useless
    // to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different elements size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        assert(MemVT.getVectorNumElements() == 3 ||
               MemVT.getVectorNumElements() == 5 ||
               (MemVT.getVectorNumElements() >= 9 &&
                MemVT.getVectorNumElements() <= 12));
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}
SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}
//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}
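
// Build a TokenFactor that makes the call sequence depend on any loads of
// incoming stack arguments whose bytes overlap the clobbered frame index, so
// outgoing argument stores cannot be reordered ahead of them.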
SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument corresponding
  for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}
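
// Common diagnostic path for calls this target cannot lower: name the callee
// in an error, substitute undef for any expected return values, and return the
// entry chain so compilation can continue past the diagnostic.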
SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (ISD::InputArg &Arg : CLI.Ins)
      InVals.push_back(DAG.getUNDEF(Arg.VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}
SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}
SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}
void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FLOG:
  case ISD::FLOG10:
    if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP2:
    if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP:
  case ISD::FEXP10:
    if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
      Results.push_back(Lowered);
    return;
  default:
    return;
  }
}
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (!MFI->isModuleEntryFunction()) {
    if (std::optional<uint32_t> Address =
            AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
      return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
    }
  }

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds") {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning);
      DAG.getContext()->diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                        Trap, DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getUNDEF(Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
  }

  return SDValue();
}
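
// Concatenation of vectors with sub-32-bit elements is done by bitcasting the
// pieces to i32 (or i32 vectors), concatenating those, and bitcasting back;
// everything else is rebuilt element-by-element with a build_vector.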
SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDLoc SL(Op);

  EVT VT = Op.getValueType();
  if (VT.getVectorElementType().getSizeInBits() < 32) {
    unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
    if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
      unsigned NewNumElt = OpBitSize / 32;
      EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
                                      : EVT::getVectorVT(*DAG.getContext(),
                                                         MVT::i32, NewNumElt);
      for (const SDUse &U : Op->ops()) {
        SDValue In = U.get();
        SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
        if (NewNumElt > 1)
          DAG.ExtractVectorElements(NewIn, Args);
        else
          Args.push_back(NewIn);
      }

      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                   NewNumElt * Op.getNumOperands());
      SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
      return DAG.getNode(ISD::BITCAST, SL, VT, BV);
    }
  }

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}
1542 SDValue
AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op
,
1543 SelectionDAG
&DAG
) const {
1545 SmallVector
<SDValue
, 8> Args
;
1546 unsigned Start
= Op
.getConstantOperandVal(1);
1547 EVT VT
= Op
.getValueType();
1548 EVT SrcVT
= Op
.getOperand(0).getValueType();
1550 if (VT
.getScalarSizeInBits() == 16 && Start
% 2 == 0) {
1551 unsigned NumElt
= VT
.getVectorNumElements();
1552 unsigned NumSrcElt
= SrcVT
.getVectorNumElements();
1553 assert(NumElt
% 2 == 0 && NumSrcElt
% 2 == 0 && "expect legal types");
1555 // Extract 32-bit registers at a time.
1556 EVT NewSrcVT
= EVT::getVectorVT(*DAG
.getContext(), MVT::i32
, NumSrcElt
/ 2);
1557 EVT NewVT
= NumElt
== 2
1559 : EVT::getVectorVT(*DAG
.getContext(), MVT::i32
, NumElt
/ 2);
1560 SDValue Tmp
= DAG
.getNode(ISD::BITCAST
, SL
, NewSrcVT
, Op
.getOperand(0));
1562 DAG
.ExtractVectorElements(Tmp
, Args
, Start
/ 2, NumElt
/ 2);
1566 Tmp
= DAG
.getBuildVector(NewVT
, SL
, Args
);
1568 return DAG
.getNode(ISD::BITCAST
, SL
, VT
, Tmp
);
1571 DAG
.ExtractVectorElements(Op
.getOperand(0), Args
, Start
,
1572 VT
.getVectorNumElements());
1574 return DAG
.getBuildVector(Op
.getValueType(), SL
, Args
);
1577 // TODO: Handle fabs too
1578 static SDValue
peekFNeg(SDValue Val
) {
1579 if (Val
.getOpcode() == ISD::FNEG
)
1580 return Val
.getOperand(0);
1585 static SDValue
peekFPSignOps(SDValue Val
) {
1586 if (Val
.getOpcode() == ISD::FNEG
)
1587 Val
= Val
.getOperand(0);
1588 if (Val
.getOpcode() == ISD::FABS
)
1589 Val
= Val
.getOperand(0);
1590 if (Val
.getOpcode() == ISD::FCOPYSIGN
)
1591 Val
= Val
.getOperand(0);
1595 SDValue
AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1596 const SDLoc
&DL
, EVT VT
, SDValue LHS
, SDValue RHS
, SDValue True
,
1597 SDValue False
, SDValue CC
, DAGCombinerInfo
&DCI
) const {
1598 SelectionDAG
&DAG
= DCI
.DAG
;
1599 ISD::CondCode CCOpcode
= cast
<CondCodeSDNode
>(CC
)->get();
1608 case ISD::SETFALSE2
:
1617 return DAG
.getNode(AMDGPUISD::FMIN_LEGACY
, DL
, VT
, RHS
, LHS
);
1618 return DAG
.getNode(AMDGPUISD::FMAX_LEGACY
, DL
, VT
, LHS
, RHS
);
1624 // Ordered. Assume ordered for undefined.
1626 // Only do this after legalization to avoid interfering with other combines
1627 // which might occur.
1628 if (DCI
.getDAGCombineLevel() < AfterLegalizeDAG
&&
1629 !DCI
.isCalledByLegalizer())
1632 // We need to permute the operands to get the correct NaN behavior. The
1633 // selected operand is the second one based on the failing compare with NaN,
1634 // so permute it based on the compare type the hardware uses.
1636 return DAG
.getNode(AMDGPUISD::FMIN_LEGACY
, DL
, VT
, LHS
, RHS
);
1637 return DAG
.getNode(AMDGPUISD::FMAX_LEGACY
, DL
, VT
, RHS
, LHS
);
1642 return DAG
.getNode(AMDGPUISD::FMAX_LEGACY
, DL
, VT
, RHS
, LHS
);
1643 return DAG
.getNode(AMDGPUISD::FMIN_LEGACY
, DL
, VT
, LHS
, RHS
);
1649 if (DCI
.getDAGCombineLevel() < AfterLegalizeDAG
&&
1650 !DCI
.isCalledByLegalizer())
1654 return DAG
.getNode(AMDGPUISD::FMAX_LEGACY
, DL
, VT
, LHS
, RHS
);
1655 return DAG
.getNode(AMDGPUISD::FMIN_LEGACY
, DL
, VT
, RHS
, LHS
);
1657 case ISD::SETCC_INVALID
:
1658 llvm_unreachable("Invalid setcc condcode!");
1663 /// Generate Min/Max node
1664 SDValue
AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc
&DL
, EVT VT
,
1665 SDValue LHS
, SDValue RHS
,
1666 SDValue True
, SDValue False
,
1668 DAGCombinerInfo
&DCI
) const {
1669 if ((LHS
== True
&& RHS
== False
) || (LHS
== False
&& RHS
== True
))
1670 return combineFMinMaxLegacyImpl(DL
, VT
, LHS
, RHS
, True
, False
, CC
, DCI
);
1672 SelectionDAG
&DAG
= DCI
.DAG
;
1674 // If we can't directly match this, try to see if we can fold an fneg to
1677 ConstantFPSDNode
*CRHS
= dyn_cast
<ConstantFPSDNode
>(RHS
);
1678 ConstantFPSDNode
*CFalse
= dyn_cast
<ConstantFPSDNode
>(False
);
1679 SDValue NegTrue
= peekFNeg(True
);
1681 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1684 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1685 // -> fneg (fmin_legacy lhs, K)
1687 // TODO: Use getNegatedExpression
1688 if (LHS
== NegTrue
&& CFalse
&& CRHS
) {
1689 APFloat NegRHS
= neg(CRHS
->getValueAPF());
1690 if (NegRHS
== CFalse
->getValueAPF()) {
1692 combineFMinMaxLegacyImpl(DL
, VT
, LHS
, RHS
, NegTrue
, False
, CC
, DCI
);
1694 return DAG
.getNode(ISD::FNEG
, DL
, VT
, Combined
);
1702 std::pair
<SDValue
, SDValue
>
1703 AMDGPUTargetLowering::split64BitValue(SDValue Op
, SelectionDAG
&DAG
) const {
1706 SDValue Vec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i32
, Op
);
1708 const SDValue Zero
= DAG
.getConstant(0, SL
, MVT::i32
);
1709 const SDValue One
= DAG
.getConstant(1, SL
, MVT::i32
);
1711 SDValue Lo
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, Vec
, Zero
);
1712 SDValue Hi
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, Vec
, One
);
1714 return std::pair(Lo
, Hi
);
1717 SDValue
AMDGPUTargetLowering::getLoHalf64(SDValue Op
, SelectionDAG
&DAG
) const {
1720 SDValue Vec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i32
, Op
);
1721 const SDValue Zero
= DAG
.getConstant(0, SL
, MVT::i32
);
1722 return DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, Vec
, Zero
);
1725 SDValue
AMDGPUTargetLowering::getHiHalf64(SDValue Op
, SelectionDAG
&DAG
) const {
1728 SDValue Vec
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i32
, Op
);
1729 const SDValue One
= DAG
.getConstant(1, SL
, MVT::i32
);
1730 return DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, Vec
, One
);
1733 // Split a vector type into two parts. The first part is a power of two vector.
1734 // The second part is whatever is left over, and is a scalar if it would
1735 // otherwise be a 1-vector.
1737 AMDGPUTargetLowering::getSplitDestVTs(const EVT
&VT
, SelectionDAG
&DAG
) const {
1739 EVT EltVT
= VT
.getVectorElementType();
1740 unsigned NumElts
= VT
.getVectorNumElements();
1741 unsigned LoNumElts
= PowerOf2Ceil((NumElts
+ 1) / 2);
1742 LoVT
= EVT::getVectorVT(*DAG
.getContext(), EltVT
, LoNumElts
);
1743 HiVT
= NumElts
- LoNumElts
== 1
1745 : EVT::getVectorVT(*DAG
.getContext(), EltVT
, NumElts
- LoNumElts
);
1746 return std::pair(LoVT
, HiVT
);
1749 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1751 std::pair
<SDValue
, SDValue
>
1752 AMDGPUTargetLowering::splitVector(const SDValue
&N
, const SDLoc
&DL
,
1753 const EVT
&LoVT
, const EVT
&HiVT
,
1754 SelectionDAG
&DAG
) const {
1755 assert(LoVT
.getVectorNumElements() +
1756 (HiVT
.isVector() ? HiVT
.getVectorNumElements() : 1) <=
1757 N
.getValueType().getVectorNumElements() &&
1758 "More vector elements requested than available!");
1759 SDValue Lo
= DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, DL
, LoVT
, N
,
1760 DAG
.getVectorIdxConstant(0, DL
));
1761 SDValue Hi
= DAG
.getNode(
1762 HiVT
.isVector() ? ISD::EXTRACT_SUBVECTOR
: ISD::EXTRACT_VECTOR_ELT
, DL
,
1763 HiVT
, N
, DAG
.getVectorIdxConstant(LoVT
.getVectorNumElements(), DL
));
1764 return std::pair(Lo
, Hi
);
1767 SDValue
AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op
,
1768 SelectionDAG
&DAG
) const {
1769 LoadSDNode
*Load
= cast
<LoadSDNode
>(Op
);
1770 EVT VT
= Op
.getValueType();
1774 // If this is a 2 element vector, we really want to scalarize and not create
1775 // weird 1 element vectors.
1776 if (VT
.getVectorNumElements() == 2) {
1778 std::tie(Ops
[0], Ops
[1]) = scalarizeVectorLoad(Load
, DAG
);
1779 return DAG
.getMergeValues(Ops
, SL
);
1782 SDValue BasePtr
= Load
->getBasePtr();
1783 EVT MemVT
= Load
->getMemoryVT();
1785 const MachinePointerInfo
&SrcValue
= Load
->getMemOperand()->getPointerInfo();
1788 EVT LoMemVT
, HiMemVT
;
1791 std::tie(LoVT
, HiVT
) = getSplitDestVTs(VT
, DAG
);
1792 std::tie(LoMemVT
, HiMemVT
) = getSplitDestVTs(MemVT
, DAG
);
1793 std::tie(Lo
, Hi
) = splitVector(Op
, SL
, LoVT
, HiVT
, DAG
);
1795 unsigned Size
= LoMemVT
.getStoreSize();
1796 Align BaseAlign
= Load
->getAlign();
1797 Align HiAlign
= commonAlignment(BaseAlign
, Size
);
1799 SDValue LoLoad
= DAG
.getExtLoad(Load
->getExtensionType(), SL
, LoVT
,
1800 Load
->getChain(), BasePtr
, SrcValue
, LoMemVT
,
1801 BaseAlign
, Load
->getMemOperand()->getFlags());
1802 SDValue HiPtr
= DAG
.getObjectPtrOffset(SL
, BasePtr
, TypeSize::getFixed(Size
));
1804 DAG
.getExtLoad(Load
->getExtensionType(), SL
, HiVT
, Load
->getChain(),
1805 HiPtr
, SrcValue
.getWithOffset(LoMemVT
.getStoreSize()),
1806 HiMemVT
, HiAlign
, Load
->getMemOperand()->getFlags());
1810 // This is the case that the vector is power of two so was evenly split.
1811 Join
= DAG
.getNode(ISD::CONCAT_VECTORS
, SL
, VT
, LoLoad
, HiLoad
);
1813 Join
= DAG
.getNode(ISD::INSERT_SUBVECTOR
, SL
, VT
, DAG
.getUNDEF(VT
), LoLoad
,
1814 DAG
.getVectorIdxConstant(0, SL
));
1816 HiVT
.isVector() ? ISD::INSERT_SUBVECTOR
: ISD::INSERT_VECTOR_ELT
, SL
,
1818 DAG
.getVectorIdxConstant(LoVT
.getVectorNumElements(), SL
));
1821 SDValue Ops
[] = {Join
, DAG
.getNode(ISD::TokenFactor
, SL
, MVT::Other
,
1822 LoLoad
.getValue(1), HiLoad
.getValue(1))};
1824 return DAG
.getMergeValues(Ops
, SL
);
1827 SDValue
AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op
,
1828 SelectionDAG
&DAG
) const {
1829 LoadSDNode
*Load
= cast
<LoadSDNode
>(Op
);
1830 EVT VT
= Op
.getValueType();
1831 SDValue BasePtr
= Load
->getBasePtr();
1832 EVT MemVT
= Load
->getMemoryVT();
1834 const MachinePointerInfo
&SrcValue
= Load
->getMemOperand()->getPointerInfo();
1835 Align BaseAlign
= Load
->getAlign();
1836 unsigned NumElements
= MemVT
.getVectorNumElements();
1838 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1839 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1840 if (NumElements
!= 3 ||
1841 (BaseAlign
< Align(8) &&
1842 !SrcValue
.isDereferenceable(16, *DAG
.getContext(), DAG
.getDataLayout())))
1843 return SplitVectorLoad(Op
, DAG
);
1845 assert(NumElements
== 3);
1848 EVT::getVectorVT(*DAG
.getContext(), VT
.getVectorElementType(), 4);
1850 EVT::getVectorVT(*DAG
.getContext(), MemVT
.getVectorElementType(), 4);
1851 SDValue WideLoad
= DAG
.getExtLoad(
1852 Load
->getExtensionType(), SL
, WideVT
, Load
->getChain(), BasePtr
, SrcValue
,
1853 WideMemVT
, BaseAlign
, Load
->getMemOperand()->getFlags());
1854 return DAG
.getMergeValues(
1855 {DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, SL
, VT
, WideLoad
,
1856 DAG
.getVectorIdxConstant(0, SL
)),
1857 WideLoad
.getValue(1)},
1861 SDValue
AMDGPUTargetLowering::SplitVectorStore(SDValue Op
,
1862 SelectionDAG
&DAG
) const {
1863 StoreSDNode
*Store
= cast
<StoreSDNode
>(Op
);
1864 SDValue Val
= Store
->getValue();
1865 EVT VT
= Val
.getValueType();
1867 // If this is a 2 element vector, we really want to scalarize and not create
1868 // weird 1 element vectors.
1869 if (VT
.getVectorNumElements() == 2)
1870 return scalarizeVectorStore(Store
, DAG
);
1872 EVT MemVT
= Store
->getMemoryVT();
1873 SDValue Chain
= Store
->getChain();
1874 SDValue BasePtr
= Store
->getBasePtr();
1878 EVT LoMemVT
, HiMemVT
;
1881 std::tie(LoVT
, HiVT
) = getSplitDestVTs(VT
, DAG
);
1882 std::tie(LoMemVT
, HiMemVT
) = getSplitDestVTs(MemVT
, DAG
);
1883 std::tie(Lo
, Hi
) = splitVector(Val
, SL
, LoVT
, HiVT
, DAG
);
1885 SDValue HiPtr
= DAG
.getObjectPtrOffset(SL
, BasePtr
, LoMemVT
.getStoreSize());
1887 const MachinePointerInfo
&SrcValue
= Store
->getMemOperand()->getPointerInfo();
1888 Align BaseAlign
= Store
->getAlign();
1889 unsigned Size
= LoMemVT
.getStoreSize();
1890 Align HiAlign
= commonAlignment(BaseAlign
, Size
);
1893 DAG
.getTruncStore(Chain
, SL
, Lo
, BasePtr
, SrcValue
, LoMemVT
, BaseAlign
,
1894 Store
->getMemOperand()->getFlags());
1896 DAG
.getTruncStore(Chain
, SL
, Hi
, HiPtr
, SrcValue
.getWithOffset(Size
),
1897 HiMemVT
, HiAlign
, Store
->getMemOperand()->getFlags());
1899 return DAG
.getNode(ISD::TokenFactor
, SL
, MVT::Other
, LoStore
, HiStore
);
1902 // This is a shortcut for integer division because we have fast i32<->f32
1903 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1904 // float is enough to accurately represent up to a 24-bit signed integer.
1905 SDValue
AMDGPUTargetLowering::LowerDIVREM24(SDValue Op
, SelectionDAG
&DAG
,
1908 EVT VT
= Op
.getValueType();
1909 SDValue LHS
= Op
.getOperand(0);
1910 SDValue RHS
= Op
.getOperand(1);
1911 MVT IntVT
= MVT::i32
;
1912 MVT FltVT
= MVT::f32
;
1914 unsigned LHSSignBits
= DAG
.ComputeNumSignBits(LHS
);
1915 if (LHSSignBits
< 9)
1918 unsigned RHSSignBits
= DAG
.ComputeNumSignBits(RHS
);
1919 if (RHSSignBits
< 9)
1922 unsigned BitSize
= VT
.getSizeInBits();
1923 unsigned SignBits
= std::min(LHSSignBits
, RHSSignBits
);
1924 unsigned DivBits
= BitSize
- SignBits
;
1928 ISD::NodeType ToFp
= Sign
? ISD::SINT_TO_FP
: ISD::UINT_TO_FP
;
1929 ISD::NodeType ToInt
= Sign
? ISD::FP_TO_SINT
: ISD::FP_TO_UINT
;
1931 SDValue jq
= DAG
.getConstant(1, DL
, IntVT
);
1934 // char|short jq = ia ^ ib;
1935 jq
= DAG
.getNode(ISD::XOR
, DL
, VT
, LHS
, RHS
);
1937 // jq = jq >> (bitsize - 2)
1938 jq
= DAG
.getNode(ISD::SRA
, DL
, VT
, jq
,
1939 DAG
.getConstant(BitSize
- 2, DL
, VT
));
1942 jq
= DAG
.getNode(ISD::OR
, DL
, VT
, jq
, DAG
.getConstant(1, DL
, VT
));
1945 // int ia = (int)LHS;
1948 // int ib, (int)RHS;
1951 // float fa = (float)ia;
1952 SDValue fa
= DAG
.getNode(ToFp
, DL
, FltVT
, ia
);
1954 // float fb = (float)ib;
1955 SDValue fb
= DAG
.getNode(ToFp
, DL
, FltVT
, ib
);
1957 SDValue fq
= DAG
.getNode(ISD::FMUL
, DL
, FltVT
,
1958 fa
, DAG
.getNode(AMDGPUISD::RCP
, DL
, FltVT
, fb
));
1961 fq
= DAG
.getNode(ISD::FTRUNC
, DL
, FltVT
, fq
);
1963 // float fqneg = -fq;
1964 SDValue fqneg
= DAG
.getNode(ISD::FNEG
, DL
, FltVT
, fq
);
1966 MachineFunction
&MF
= DAG
.getMachineFunction();
1968 bool UseFmadFtz
= false;
1969 if (Subtarget
->isGCN()) {
1970 const SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
1972 MFI
->getMode().FP32Denormals
!= DenormalMode::getPreserveSign();
1975 // float fr = mad(fqneg, fb, fa);
1976 unsigned OpCode
= !Subtarget
->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1977 : UseFmadFtz
? (unsigned)AMDGPUISD::FMAD_FTZ
1978 : (unsigned)ISD::FMAD
;
1979 SDValue fr
= DAG
.getNode(OpCode
, DL
, FltVT
, fqneg
, fb
, fa
);
1981 // int iq = (int)fq;
1982 SDValue iq
= DAG
.getNode(ToInt
, DL
, IntVT
, fq
);
1985 fr
= DAG
.getNode(ISD::FABS
, DL
, FltVT
, fr
);
1988 fb
= DAG
.getNode(ISD::FABS
, DL
, FltVT
, fb
);
1990 EVT SetCCVT
= getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), VT
);
1992 // int cv = fr >= fb;
1993 SDValue cv
= DAG
.getSetCC(DL
, SetCCVT
, fr
, fb
, ISD::SETOGE
);
1995 // jq = (cv ? jq : 0);
1996 jq
= DAG
.getNode(ISD::SELECT
, DL
, VT
, cv
, jq
, DAG
.getConstant(0, DL
, VT
));
1999 SDValue Div
= DAG
.getNode(ISD::ADD
, DL
, VT
, iq
, jq
);
2001 // Rem needs compensation, it's easier to recompute it
2002 SDValue Rem
= DAG
.getNode(ISD::MUL
, DL
, VT
, Div
, RHS
);
2003 Rem
= DAG
.getNode(ISD::SUB
, DL
, VT
, LHS
, Rem
);
2005 // Truncate to number of bits this divide really is.
2008 = DAG
.getValueType(EVT::getIntegerVT(*DAG
.getContext(), DivBits
));
2009 Div
= DAG
.getNode(ISD::SIGN_EXTEND_INREG
, DL
, VT
, Div
, InRegSize
);
2010 Rem
= DAG
.getNode(ISD::SIGN_EXTEND_INREG
, DL
, VT
, Rem
, InRegSize
);
2012 SDValue TruncMask
= DAG
.getConstant((UINT64_C(1) << DivBits
) - 1, DL
, VT
);
2013 Div
= DAG
.getNode(ISD::AND
, DL
, VT
, Div
, TruncMask
);
2014 Rem
= DAG
.getNode(ISD::AND
, DL
, VT
, Rem
, TruncMask
);
2017 return DAG
.getMergeValues({ Div
, Rem
}, DL
);
2020 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op
,
2022 SmallVectorImpl
<SDValue
> &Results
) const {
2024 EVT VT
= Op
.getValueType();
2026 assert(VT
== MVT::i64
&& "LowerUDIVREM64 expects an i64");
2028 EVT HalfVT
= VT
.getHalfSizedIntegerVT(*DAG
.getContext());
2030 SDValue One
= DAG
.getConstant(1, DL
, HalfVT
);
2031 SDValue Zero
= DAG
.getConstant(0, DL
, HalfVT
);
2034 SDValue LHS_Lo
, LHS_Hi
;
2035 SDValue LHS
= Op
.getOperand(0);
2036 std::tie(LHS_Lo
, LHS_Hi
) = DAG
.SplitScalar(LHS
, DL
, HalfVT
, HalfVT
);
2038 SDValue RHS_Lo
, RHS_Hi
;
2039 SDValue RHS
= Op
.getOperand(1);
2040 std::tie(RHS_Lo
, RHS_Hi
) = DAG
.SplitScalar(RHS
, DL
, HalfVT
, HalfVT
);
2042 if (DAG
.MaskedValueIsZero(RHS
, APInt::getHighBitsSet(64, 32)) &&
2043 DAG
.MaskedValueIsZero(LHS
, APInt::getHighBitsSet(64, 32))) {
2045 SDValue Res
= DAG
.getNode(ISD::UDIVREM
, DL
, DAG
.getVTList(HalfVT
, HalfVT
),
2048 SDValue DIV
= DAG
.getBuildVector(MVT::v2i32
, DL
, {Res
.getValue(0), Zero
});
2049 SDValue REM
= DAG
.getBuildVector(MVT::v2i32
, DL
, {Res
.getValue(1), Zero
});
2051 Results
.push_back(DAG
.getNode(ISD::BITCAST
, DL
, MVT::i64
, DIV
));
2052 Results
.push_back(DAG
.getNode(ISD::BITCAST
, DL
, MVT::i64
, REM
));
2056 if (isTypeLegal(MVT::i64
)) {
2057 // The algorithm here is based on ideas from "Software Integer Division",
2058 // Tom Rodeheffer, August 2008.
2060 MachineFunction
&MF
= DAG
.getMachineFunction();
2061 const SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
2063 // Compute denominator reciprocal.
2065 !Subtarget
->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2066 : MFI
->getMode().FP32Denormals
== DenormalMode::getPreserveSign()
2067 ? (unsigned)ISD::FMAD
2068 : (unsigned)AMDGPUISD::FMAD_FTZ
;
2070 SDValue Cvt_Lo
= DAG
.getNode(ISD::UINT_TO_FP
, DL
, MVT::f32
, RHS_Lo
);
2071 SDValue Cvt_Hi
= DAG
.getNode(ISD::UINT_TO_FP
, DL
, MVT::f32
, RHS_Hi
);
2072 SDValue Mad1
= DAG
.getNode(FMAD
, DL
, MVT::f32
, Cvt_Hi
,
2073 DAG
.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL
, MVT::f32
),
2075 SDValue Rcp
= DAG
.getNode(AMDGPUISD::RCP
, DL
, MVT::f32
, Mad1
);
2076 SDValue Mul1
= DAG
.getNode(ISD::FMUL
, DL
, MVT::f32
, Rcp
,
2077 DAG
.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL
, MVT::f32
));
2078 SDValue Mul2
= DAG
.getNode(ISD::FMUL
, DL
, MVT::f32
, Mul1
,
2079 DAG
.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL
, MVT::f32
));
2080 SDValue Trunc
= DAG
.getNode(ISD::FTRUNC
, DL
, MVT::f32
, Mul2
);
2081 SDValue Mad2
= DAG
.getNode(FMAD
, DL
, MVT::f32
, Trunc
,
2082 DAG
.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL
, MVT::f32
),
2084 SDValue Rcp_Lo
= DAG
.getNode(ISD::FP_TO_UINT
, DL
, HalfVT
, Mad2
);
2085 SDValue Rcp_Hi
= DAG
.getNode(ISD::FP_TO_UINT
, DL
, HalfVT
, Trunc
);
2086 SDValue Rcp64
= DAG
.getBitcast(VT
,
2087 DAG
.getBuildVector(MVT::v2i32
, DL
, {Rcp_Lo
, Rcp_Hi
}));
2089 SDValue Zero64
= DAG
.getConstant(0, DL
, VT
);
2090 SDValue One64
= DAG
.getConstant(1, DL
, VT
);
2091 SDValue Zero1
= DAG
.getConstant(0, DL
, MVT::i1
);
2092 SDVTList HalfCarryVT
= DAG
.getVTList(HalfVT
, MVT::i1
);
2094 // First round of UNR (Unsigned integer Newton-Raphson).
2095 SDValue Neg_RHS
= DAG
.getNode(ISD::SUB
, DL
, VT
, Zero64
, RHS
);
2096 SDValue Mullo1
= DAG
.getNode(ISD::MUL
, DL
, VT
, Neg_RHS
, Rcp64
);
2097 SDValue Mulhi1
= DAG
.getNode(ISD::MULHU
, DL
, VT
, Rcp64
, Mullo1
);
2098 SDValue Mulhi1_Lo
, Mulhi1_Hi
;
2099 std::tie(Mulhi1_Lo
, Mulhi1_Hi
) =
2100 DAG
.SplitScalar(Mulhi1
, DL
, HalfVT
, HalfVT
);
2101 SDValue Add1_Lo
= DAG
.getNode(ISD::UADDO_CARRY
, DL
, HalfCarryVT
, Rcp_Lo
,
2103 SDValue Add1_Hi
= DAG
.getNode(ISD::UADDO_CARRY
, DL
, HalfCarryVT
, Rcp_Hi
,
2104 Mulhi1_Hi
, Add1_Lo
.getValue(1));
2105 SDValue Add1
= DAG
.getBitcast(VT
,
2106 DAG
.getBuildVector(MVT::v2i32
, DL
, {Add1_Lo
, Add1_Hi
}));
2108 // Second round of UNR.
2109 SDValue Mullo2
= DAG
.getNode(ISD::MUL
, DL
, VT
, Neg_RHS
, Add1
);
2110 SDValue Mulhi2
= DAG
.getNode(ISD::MULHU
, DL
, VT
, Add1
, Mullo2
);
2111 SDValue Mulhi2_Lo
, Mulhi2_Hi
;
2112 std::tie(Mulhi2_Lo
, Mulhi2_Hi
) =
2113 DAG
.SplitScalar(Mulhi2
, DL
, HalfVT
, HalfVT
);
2114 SDValue Add2_Lo
= DAG
.getNode(ISD::UADDO_CARRY
, DL
, HalfCarryVT
, Add1_Lo
,
2116 SDValue Add2_Hi
= DAG
.getNode(ISD::UADDO_CARRY
, DL
, HalfCarryVT
, Add1_Hi
,
2117 Mulhi2_Hi
, Add2_Lo
.getValue(1));
2118 SDValue Add2
= DAG
.getBitcast(VT
,
2119 DAG
.getBuildVector(MVT::v2i32
, DL
, {Add2_Lo
, Add2_Hi
}));
2121 SDValue Mulhi3
= DAG
.getNode(ISD::MULHU
, DL
, VT
, LHS
, Add2
);
2123 SDValue Mul3
= DAG
.getNode(ISD::MUL
, DL
, VT
, RHS
, Mulhi3
);
2125 SDValue Mul3_Lo
, Mul3_Hi
;
2126 std::tie(Mul3_Lo
, Mul3_Hi
) = DAG
.SplitScalar(Mul3
, DL
, HalfVT
, HalfVT
);
2127 SDValue Sub1_Lo
= DAG
.getNode(ISD::USUBO_CARRY
, DL
, HalfCarryVT
, LHS_Lo
,
2129 SDValue Sub1_Hi
= DAG
.getNode(ISD::USUBO_CARRY
, DL
, HalfCarryVT
, LHS_Hi
,
2130 Mul3_Hi
, Sub1_Lo
.getValue(1));
2131 SDValue Sub1_Mi
= DAG
.getNode(ISD::SUB
, DL
, HalfVT
, LHS_Hi
, Mul3_Hi
);
2132 SDValue Sub1
= DAG
.getBitcast(VT
,
2133 DAG
.getBuildVector(MVT::v2i32
, DL
, {Sub1_Lo
, Sub1_Hi
}));
2135 SDValue MinusOne
= DAG
.getConstant(0xffffffffu
, DL
, HalfVT
);
2136 SDValue C1
= DAG
.getSelectCC(DL
, Sub1_Hi
, RHS_Hi
, MinusOne
, Zero
,
2138 SDValue C2
= DAG
.getSelectCC(DL
, Sub1_Lo
, RHS_Lo
, MinusOne
, Zero
,
2140 SDValue C3
= DAG
.getSelectCC(DL
, Sub1_Hi
, RHS_Hi
, C2
, C1
, ISD::SETEQ
);
2142 // TODO: Here and below portions of the code can be enclosed into if/endif.
2143 // Currently control flow is unconditional and we have 4 selects after
2144 // potential endif to substitute PHIs.
2147 SDValue Sub2_Lo
= DAG
.getNode(ISD::USUBO_CARRY
, DL
, HalfCarryVT
, Sub1_Lo
,
2149 SDValue Sub2_Mi
= DAG
.getNode(ISD::USUBO_CARRY
, DL
, HalfCarryVT
, Sub1_Mi
,
2150 RHS_Hi
, Sub1_Lo
.getValue(1));
2151 SDValue Sub2_Hi
= DAG
.getNode(ISD::USUBO_CARRY
, DL
, HalfCarryVT
, Sub2_Mi
,
2152 Zero
, Sub2_Lo
.getValue(1));
2153 SDValue Sub2
= DAG
.getBitcast(VT
,
2154 DAG
.getBuildVector(MVT::v2i32
, DL
, {Sub2_Lo
, Sub2_Hi
}));
2156 SDValue Add3
= DAG
.getNode(ISD::ADD
, DL
, VT
, Mulhi3
, One64
);
2158 SDValue C4
= DAG
.getSelectCC(DL
, Sub2_Hi
, RHS_Hi
, MinusOne
, Zero
,
2160 SDValue C5
= DAG
.getSelectCC(DL
, Sub2_Lo
, RHS_Lo
, MinusOne
, Zero
,
2162 SDValue C6
= DAG
.getSelectCC(DL
, Sub2_Hi
, RHS_Hi
, C5
, C4
, ISD::SETEQ
);
2165 SDValue Add4
= DAG
.getNode(ISD::ADD
, DL
, VT
, Add3
, One64
);
2167 SDValue Sub3_Lo
= DAG
.getNode(ISD::USUBO_CARRY
, DL
, HalfCarryVT
, Sub2_Lo
,
2169 SDValue Sub3_Mi
= DAG
.getNode(ISD::USUBO_CARRY
, DL
, HalfCarryVT
, Sub2_Mi
,
2170 RHS_Hi
, Sub2_Lo
.getValue(1));
2171 SDValue Sub3_Hi
= DAG
.getNode(ISD::USUBO_CARRY
, DL
, HalfCarryVT
, Sub3_Mi
,
2172 Zero
, Sub3_Lo
.getValue(1));
2173 SDValue Sub3
= DAG
.getBitcast(VT
,
2174 DAG
.getBuildVector(MVT::v2i32
, DL
, {Sub3_Lo
, Sub3_Hi
}));
2179 SDValue Sel1
= DAG
.getSelectCC(DL
, C6
, Zero
, Add4
, Add3
, ISD::SETNE
);
2180 SDValue Div
= DAG
.getSelectCC(DL
, C3
, Zero
, Sel1
, Mulhi3
, ISD::SETNE
);
2182 SDValue Sel2
= DAG
.getSelectCC(DL
, C6
, Zero
, Sub3
, Sub2
, ISD::SETNE
);
2183 SDValue Rem
= DAG
.getSelectCC(DL
, C3
, Zero
, Sel2
, Sub1
, ISD::SETNE
);
2185 Results
.push_back(Div
);
2186 Results
.push_back(Rem
);
2192 // Get Speculative values
2193 SDValue DIV_Part
= DAG
.getNode(ISD::UDIV
, DL
, HalfVT
, LHS_Hi
, RHS_Lo
);
2194 SDValue REM_Part
= DAG
.getNode(ISD::UREM
, DL
, HalfVT
, LHS_Hi
, RHS_Lo
);
2196 SDValue REM_Lo
= DAG
.getSelectCC(DL
, RHS_Hi
, Zero
, REM_Part
, LHS_Hi
, ISD::SETEQ
);
2197 SDValue REM
= DAG
.getBuildVector(MVT::v2i32
, DL
, {REM_Lo
, Zero
});
2198 REM
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::i64
, REM
);
2200 SDValue DIV_Hi
= DAG
.getSelectCC(DL
, RHS_Hi
, Zero
, DIV_Part
, Zero
, ISD::SETEQ
);
2201 SDValue DIV_Lo
= Zero
;
2203 const unsigned halfBitWidth
= HalfVT
.getSizeInBits();
2205 for (unsigned i
= 0; i
< halfBitWidth
; ++i
) {
2206 const unsigned bitPos
= halfBitWidth
- i
- 1;
2207 SDValue POS
= DAG
.getConstant(bitPos
, DL
, HalfVT
);
2208 // Get value of high bit
2209 SDValue HBit
= DAG
.getNode(ISD::SRL
, DL
, HalfVT
, LHS_Lo
, POS
);
2210 HBit
= DAG
.getNode(ISD::AND
, DL
, HalfVT
, HBit
, One
);
2211 HBit
= DAG
.getNode(ISD::ZERO_EXTEND
, DL
, VT
, HBit
);
2214 REM
= DAG
.getNode(ISD::SHL
, DL
, VT
, REM
, DAG
.getConstant(1, DL
, VT
));
2216 REM
= DAG
.getNode(ISD::OR
, DL
, VT
, REM
, HBit
);
2218 SDValue BIT
= DAG
.getConstant(1ULL << bitPos
, DL
, HalfVT
);
2219 SDValue realBIT
= DAG
.getSelectCC(DL
, REM
, RHS
, BIT
, Zero
, ISD::SETUGE
);
2221 DIV_Lo
= DAG
.getNode(ISD::OR
, DL
, HalfVT
, DIV_Lo
, realBIT
);
2224 SDValue REM_sub
= DAG
.getNode(ISD::SUB
, DL
, VT
, REM
, RHS
);
2225 REM
= DAG
.getSelectCC(DL
, REM
, RHS
, REM_sub
, REM
, ISD::SETUGE
);
2228 SDValue DIV
= DAG
.getBuildVector(MVT::v2i32
, DL
, {DIV_Lo
, DIV_Hi
});
2229 DIV
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::i64
, DIV
);
2230 Results
.push_back(DIV
);
2231 Results
.push_back(REM
);
2234 SDValue
AMDGPUTargetLowering::LowerUDIVREM(SDValue Op
,
2235 SelectionDAG
&DAG
) const {
2237 EVT VT
= Op
.getValueType();
2239 if (VT
== MVT::i64
) {
2240 SmallVector
<SDValue
, 2> Results
;
2241 LowerUDIVREM64(Op
, DAG
, Results
);
2242 return DAG
.getMergeValues(Results
, DL
);
2245 if (VT
== MVT::i32
) {
2246 if (SDValue Res
= LowerDIVREM24(Op
, DAG
, false))
2250 SDValue X
= Op
.getOperand(0);
2251 SDValue Y
= Op
.getOperand(1);
2253 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2254 // algorithm used here.
2256 // Initial estimate of inv(y).
2257 SDValue Z
= DAG
.getNode(AMDGPUISD::URECIP
, DL
, VT
, Y
);
2259 // One round of UNR.
2260 SDValue NegY
= DAG
.getNode(ISD::SUB
, DL
, VT
, DAG
.getConstant(0, DL
, VT
), Y
);
2261 SDValue NegYZ
= DAG
.getNode(ISD::MUL
, DL
, VT
, NegY
, Z
);
2262 Z
= DAG
.getNode(ISD::ADD
, DL
, VT
, Z
,
2263 DAG
.getNode(ISD::MULHU
, DL
, VT
, Z
, NegYZ
));
2265 // Quotient/remainder estimate.
2266 SDValue Q
= DAG
.getNode(ISD::MULHU
, DL
, VT
, X
, Z
);
2268 DAG
.getNode(ISD::SUB
, DL
, VT
, X
, DAG
.getNode(ISD::MUL
, DL
, VT
, Q
, Y
));
2270 // First quotient/remainder refinement.
2271 EVT CCVT
= getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), VT
);
2272 SDValue One
= DAG
.getConstant(1, DL
, VT
);
2273 SDValue Cond
= DAG
.getSetCC(DL
, CCVT
, R
, Y
, ISD::SETUGE
);
2274 Q
= DAG
.getNode(ISD::SELECT
, DL
, VT
, Cond
,
2275 DAG
.getNode(ISD::ADD
, DL
, VT
, Q
, One
), Q
);
2276 R
= DAG
.getNode(ISD::SELECT
, DL
, VT
, Cond
,
2277 DAG
.getNode(ISD::SUB
, DL
, VT
, R
, Y
), R
);
2279 // Second quotient/remainder refinement.
2280 Cond
= DAG
.getSetCC(DL
, CCVT
, R
, Y
, ISD::SETUGE
);
2281 Q
= DAG
.getNode(ISD::SELECT
, DL
, VT
, Cond
,
2282 DAG
.getNode(ISD::ADD
, DL
, VT
, Q
, One
), Q
);
2283 R
= DAG
.getNode(ISD::SELECT
, DL
, VT
, Cond
,
2284 DAG
.getNode(ISD::SUB
, DL
, VT
, R
, Y
), R
);
2286 return DAG
.getMergeValues({Q
, R
}, DL
);
2289 SDValue
AMDGPUTargetLowering::LowerSDIVREM(SDValue Op
,
2290 SelectionDAG
&DAG
) const {
2292 EVT VT
= Op
.getValueType();
2294 SDValue LHS
= Op
.getOperand(0);
2295 SDValue RHS
= Op
.getOperand(1);
2297 SDValue Zero
= DAG
.getConstant(0, DL
, VT
);
2298 SDValue NegOne
= DAG
.getConstant(-1, DL
, VT
);
2300 if (VT
== MVT::i32
) {
2301 if (SDValue Res
= LowerDIVREM24(Op
, DAG
, true))
2305 if (VT
== MVT::i64
&&
2306 DAG
.ComputeNumSignBits(LHS
) > 32 &&
2307 DAG
.ComputeNumSignBits(RHS
) > 32) {
2308 EVT HalfVT
= VT
.getHalfSizedIntegerVT(*DAG
.getContext());
2311 SDValue LHS_Lo
= DAG
.getNode(ISD::EXTRACT_ELEMENT
, DL
, HalfVT
, LHS
, Zero
);
2312 SDValue RHS_Lo
= DAG
.getNode(ISD::EXTRACT_ELEMENT
, DL
, HalfVT
, RHS
, Zero
);
2313 SDValue DIVREM
= DAG
.getNode(ISD::SDIVREM
, DL
, DAG
.getVTList(HalfVT
, HalfVT
),
2316 DAG
.getNode(ISD::SIGN_EXTEND
, DL
, VT
, DIVREM
.getValue(0)),
2317 DAG
.getNode(ISD::SIGN_EXTEND
, DL
, VT
, DIVREM
.getValue(1))
2319 return DAG
.getMergeValues(Res
, DL
);
2322 SDValue LHSign
= DAG
.getSelectCC(DL
, LHS
, Zero
, NegOne
, Zero
, ISD::SETLT
);
2323 SDValue RHSign
= DAG
.getSelectCC(DL
, RHS
, Zero
, NegOne
, Zero
, ISD::SETLT
);
2324 SDValue DSign
= DAG
.getNode(ISD::XOR
, DL
, VT
, LHSign
, RHSign
);
2325 SDValue RSign
= LHSign
; // Remainder sign is the same as LHS
2327 LHS
= DAG
.getNode(ISD::ADD
, DL
, VT
, LHS
, LHSign
);
2328 RHS
= DAG
.getNode(ISD::ADD
, DL
, VT
, RHS
, RHSign
);
2330 LHS
= DAG
.getNode(ISD::XOR
, DL
, VT
, LHS
, LHSign
);
2331 RHS
= DAG
.getNode(ISD::XOR
, DL
, VT
, RHS
, RHSign
);
2333 SDValue Div
= DAG
.getNode(ISD::UDIVREM
, DL
, DAG
.getVTList(VT
, VT
), LHS
, RHS
);
2334 SDValue Rem
= Div
.getValue(1);
2336 Div
= DAG
.getNode(ISD::XOR
, DL
, VT
, Div
, DSign
);
2337 Rem
= DAG
.getNode(ISD::XOR
, DL
, VT
, Rem
, RSign
);
2339 Div
= DAG
.getNode(ISD::SUB
, DL
, VT
, Div
, DSign
);
2340 Rem
= DAG
.getNode(ISD::SUB
, DL
, VT
, Rem
, RSign
);
2346 return DAG
.getMergeValues(Res
, DL
);
2349 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2350 SDValue
AMDGPUTargetLowering::LowerFREM(SDValue Op
, SelectionDAG
&DAG
) const {
2352 EVT VT
= Op
.getValueType();
2353 auto Flags
= Op
->getFlags();
2354 SDValue X
= Op
.getOperand(0);
2355 SDValue Y
= Op
.getOperand(1);
2357 SDValue Div
= DAG
.getNode(ISD::FDIV
, SL
, VT
, X
, Y
, Flags
);
2358 SDValue Trunc
= DAG
.getNode(ISD::FTRUNC
, SL
, VT
, Div
, Flags
);
2359 SDValue Neg
= DAG
.getNode(ISD::FNEG
, SL
, VT
, Trunc
, Flags
);
2360 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2361 return DAG
.getNode(ISD::FMA
, SL
, VT
, Neg
, Y
, X
, Flags
);
2364 SDValue
AMDGPUTargetLowering::LowerFCEIL(SDValue Op
, SelectionDAG
&DAG
) const {
2366 SDValue Src
= Op
.getOperand(0);
2368 // result = trunc(src)
2369 // if (src > 0.0 && src != result)
2372 SDValue Trunc
= DAG
.getNode(ISD::FTRUNC
, SL
, MVT::f64
, Src
);
2374 const SDValue Zero
= DAG
.getConstantFP(0.0, SL
, MVT::f64
);
2375 const SDValue One
= DAG
.getConstantFP(1.0, SL
, MVT::f64
);
2378 getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), MVT::f64
);
2380 SDValue Lt0
= DAG
.getSetCC(SL
, SetCCVT
, Src
, Zero
, ISD::SETOGT
);
2381 SDValue NeTrunc
= DAG
.getSetCC(SL
, SetCCVT
, Src
, Trunc
, ISD::SETONE
);
2382 SDValue And
= DAG
.getNode(ISD::AND
, SL
, SetCCVT
, Lt0
, NeTrunc
);
2384 SDValue Add
= DAG
.getNode(ISD::SELECT
, SL
, MVT::f64
, And
, One
, Zero
);
2385 // TODO: Should this propagate fast-math-flags?
2386 return DAG
.getNode(ISD::FADD
, SL
, MVT::f64
, Trunc
, Add
);
2389 static SDValue
extractF64Exponent(SDValue Hi
, const SDLoc
&SL
,
2390 SelectionDAG
&DAG
) {
2391 const unsigned FractBits
= 52;
2392 const unsigned ExpBits
= 11;
2394 SDValue ExpPart
= DAG
.getNode(AMDGPUISD::BFE_U32
, SL
, MVT::i32
,
2396 DAG
.getConstant(FractBits
- 32, SL
, MVT::i32
),
2397 DAG
.getConstant(ExpBits
, SL
, MVT::i32
));
2398 SDValue Exp
= DAG
.getNode(ISD::SUB
, SL
, MVT::i32
, ExpPart
,
2399 DAG
.getConstant(1023, SL
, MVT::i32
));
2404 SDValue
AMDGPUTargetLowering::LowerFTRUNC(SDValue Op
, SelectionDAG
&DAG
) const {
2406 SDValue Src
= Op
.getOperand(0);
2408 assert(Op
.getValueType() == MVT::f64
);
2410 const SDValue Zero
= DAG
.getConstant(0, SL
, MVT::i32
);
2412 // Extract the upper half, since this is where we will find the sign and
2414 SDValue Hi
= getHiHalf64(Src
, DAG
);
2416 SDValue Exp
= extractF64Exponent(Hi
, SL
, DAG
);
2418 const unsigned FractBits
= 52;
2420 // Extract the sign bit.
2421 const SDValue SignBitMask
= DAG
.getConstant(UINT32_C(1) << 31, SL
, MVT::i32
);
2422 SDValue SignBit
= DAG
.getNode(ISD::AND
, SL
, MVT::i32
, Hi
, SignBitMask
);
2424 // Extend back to 64-bits.
2425 SDValue SignBit64
= DAG
.getBuildVector(MVT::v2i32
, SL
, {Zero
, SignBit
});
2426 SignBit64
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i64
, SignBit64
);
2428 SDValue BcInt
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i64
, Src
);
2429 const SDValue FractMask
2430 = DAG
.getConstant((UINT64_C(1) << FractBits
) - 1, SL
, MVT::i64
);
2432 SDValue Shr
= DAG
.getNode(ISD::SRA
, SL
, MVT::i64
, FractMask
, Exp
);
2433 SDValue Not
= DAG
.getNOT(SL
, Shr
, MVT::i64
);
2434 SDValue Tmp0
= DAG
.getNode(ISD::AND
, SL
, MVT::i64
, BcInt
, Not
);
2437 getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), MVT::i32
);
2439 const SDValue FiftyOne
= DAG
.getConstant(FractBits
- 1, SL
, MVT::i32
);
2441 SDValue ExpLt0
= DAG
.getSetCC(SL
, SetCCVT
, Exp
, Zero
, ISD::SETLT
);
2442 SDValue ExpGt51
= DAG
.getSetCC(SL
, SetCCVT
, Exp
, FiftyOne
, ISD::SETGT
);
2444 SDValue Tmp1
= DAG
.getNode(ISD::SELECT
, SL
, MVT::i64
, ExpLt0
, SignBit64
, Tmp0
);
2445 SDValue Tmp2
= DAG
.getNode(ISD::SELECT
, SL
, MVT::i64
, ExpGt51
, BcInt
, Tmp1
);
2447 return DAG
.getNode(ISD::BITCAST
, SL
, MVT::f64
, Tmp2
);
2450 SDValue
AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op
,
2451 SelectionDAG
&DAG
) const {
2453 SDValue Src
= Op
.getOperand(0);
2455 assert(Op
.getValueType() == MVT::f64
);
2457 APFloat
C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2458 SDValue C1
= DAG
.getConstantFP(C1Val
, SL
, MVT::f64
);
2459 SDValue CopySign
= DAG
.getNode(ISD::FCOPYSIGN
, SL
, MVT::f64
, C1
, Src
);
2461 // TODO: Should this propagate fast-math-flags?
2463 SDValue Tmp1
= DAG
.getNode(ISD::FADD
, SL
, MVT::f64
, Src
, CopySign
);
2464 SDValue Tmp2
= DAG
.getNode(ISD::FSUB
, SL
, MVT::f64
, Tmp1
, CopySign
);
2466 SDValue Fabs
= DAG
.getNode(ISD::FABS
, SL
, MVT::f64
, Src
);
2468 APFloat
C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2469 SDValue C2
= DAG
.getConstantFP(C2Val
, SL
, MVT::f64
);
2472 getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), MVT::f64
);
2473 SDValue Cond
= DAG
.getSetCC(SL
, SetCCVT
, Fabs
, C2
, ISD::SETOGT
);
2475 return DAG
.getSelect(SL
, MVT::f64
, Cond
, Src
, Tmp2
);
2478 SDValue
AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op
,
2479 SelectionDAG
&DAG
) const {
2480 // FNEARBYINT and FRINT are the same, except in their handling of FP
2481 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2482 // rint, so just treat them as equivalent.
2483 return DAG
.getNode(ISD::FROUNDEVEN
, SDLoc(Op
), Op
.getValueType(),
2487 SDValue
AMDGPUTargetLowering::LowerFRINT(SDValue Op
, SelectionDAG
&DAG
) const {
2488 auto VT
= Op
.getValueType();
2489 auto Arg
= Op
.getOperand(0u);
2490 return DAG
.getNode(ISD::FROUNDEVEN
, SDLoc(Op
), VT
, Arg
);
2493 // XXX - May require not supporting f32 denormals?
2495 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2496 // compare and vselect end up producing worse code than scalarizing the whole
2498 SDValue
AMDGPUTargetLowering::LowerFROUND(SDValue Op
, SelectionDAG
&DAG
) const {
2500 SDValue X
= Op
.getOperand(0);
2501 EVT VT
= Op
.getValueType();
2503 SDValue T
= DAG
.getNode(ISD::FTRUNC
, SL
, VT
, X
);
2505 // TODO: Should this propagate fast-math-flags?
2507 SDValue Diff
= DAG
.getNode(ISD::FSUB
, SL
, VT
, X
, T
);
2509 SDValue AbsDiff
= DAG
.getNode(ISD::FABS
, SL
, VT
, Diff
);
2511 const SDValue Zero
= DAG
.getConstantFP(0.0, SL
, VT
);
2512 const SDValue One
= DAG
.getConstantFP(1.0, SL
, VT
);
2515 getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), VT
);
2517 const SDValue Half
= DAG
.getConstantFP(0.5, SL
, VT
);
2518 SDValue Cmp
= DAG
.getSetCC(SL
, SetCCVT
, AbsDiff
, Half
, ISD::SETOGE
);
2519 SDValue OneOrZeroFP
= DAG
.getNode(ISD::SELECT
, SL
, VT
, Cmp
, One
, Zero
);
2521 SDValue SignedOffset
= DAG
.getNode(ISD::FCOPYSIGN
, SL
, VT
, OneOrZeroFP
, X
);
2522 return DAG
.getNode(ISD::FADD
, SL
, VT
, T
, SignedOffset
);
2525 SDValue
AMDGPUTargetLowering::LowerFFLOOR(SDValue Op
, SelectionDAG
&DAG
) const {
2527 SDValue Src
= Op
.getOperand(0);
2529 // result = trunc(src);
2530 // if (src < 0.0 && src != result)
2533 SDValue Trunc
= DAG
.getNode(ISD::FTRUNC
, SL
, MVT::f64
, Src
);
2535 const SDValue Zero
= DAG
.getConstantFP(0.0, SL
, MVT::f64
);
2536 const SDValue NegOne
= DAG
.getConstantFP(-1.0, SL
, MVT::f64
);
2539 getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), MVT::f64
);
2541 SDValue Lt0
= DAG
.getSetCC(SL
, SetCCVT
, Src
, Zero
, ISD::SETOLT
);
2542 SDValue NeTrunc
= DAG
.getSetCC(SL
, SetCCVT
, Src
, Trunc
, ISD::SETONE
);
2543 SDValue And
= DAG
.getNode(ISD::AND
, SL
, SetCCVT
, Lt0
, NeTrunc
);
2545 SDValue Add
= DAG
.getNode(ISD::SELECT
, SL
, MVT::f64
, And
, NegOne
, Zero
);
2546 // TODO: Should this propagate fast-math-flags?
2547 return DAG
.getNode(ISD::FADD
, SL
, MVT::f64
, Trunc
, Add
);
2550 /// Return true if it's known that \p Src can never be an f32 denormal value.
2551 static bool valueIsKnownNeverF32Denorm(SDValue Src
) {
2552 switch (Src
.getOpcode()) {
2553 case ISD::FP_EXTEND
:
2554 return Src
.getOperand(0).getValueType() == MVT::f16
;
2555 case ISD::FP16_TO_FP
:
2558 case ISD::INTRINSIC_WO_CHAIN
: {
2559 unsigned IntrinsicID
= Src
.getConstantOperandVal(0);
2560 switch (IntrinsicID
) {
2561 case Intrinsic::amdgcn_frexp_mant
:
2571 llvm_unreachable("covered opcode switch");
2574 bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG
&DAG
,
2575 SDNodeFlags Flags
) {
2576 if (Flags
.hasApproximateFuncs())
2578 auto &Options
= DAG
.getTarget().Options
;
2579 return Options
.UnsafeFPMath
|| Options
.ApproxFuncFPMath
;
2582 bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG
&DAG
,
2584 SDNodeFlags Flags
) {
2585 return !valueIsKnownNeverF32Denorm(Src
) &&
2586 DAG
.getMachineFunction()
2587 .getDenormalMode(APFloat::IEEEsingle())
2588 .Input
!= DenormalMode::PreserveSign
;
2591 SDValue
AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG
&DAG
,
2593 SDNodeFlags Flags
) const {
2595 EVT VT
= Src
.getValueType();
2596 const fltSemantics
&Semantics
= SelectionDAG::EVTToAPFloatSemantics(VT
);
2597 SDValue SmallestNormal
=
2598 DAG
.getConstantFP(APFloat::getSmallestNormalized(Semantics
), SL
, VT
);
2600 // Want to scale denormals up, but negatives and 0 work just as well on the
2602 SDValue IsLtSmallestNormal
= DAG
.getSetCC(
2603 SL
, getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), VT
), Src
,
2604 SmallestNormal
, ISD::SETOLT
);
2606 return IsLtSmallestNormal
;
2609 SDValue
AMDGPUTargetLowering::getIsFinite(SelectionDAG
&DAG
, SDValue Src
,
2610 SDNodeFlags Flags
) const {
2612 EVT VT
= Src
.getValueType();
2613 const fltSemantics
&Semantics
= SelectionDAG::EVTToAPFloatSemantics(VT
);
2614 SDValue Inf
= DAG
.getConstantFP(APFloat::getInf(Semantics
), SL
, VT
);
2616 SDValue Fabs
= DAG
.getNode(ISD::FABS
, SL
, VT
, Src
, Flags
);
2617 SDValue IsFinite
= DAG
.getSetCC(
2618 SL
, getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), VT
), Fabs
,
2623 /// If denormal handling is required return the scaled input to FLOG2, and the
2624 /// check for denormal range. Otherwise, return null values.
2625 std::pair
<SDValue
, SDValue
>
2626 AMDGPUTargetLowering::getScaledLogInput(SelectionDAG
&DAG
, const SDLoc SL
,
2627 SDValue Src
, SDNodeFlags Flags
) const {
2628 if (!needsDenormHandlingF32(DAG
, Src
, Flags
))
2632 const fltSemantics
&Semantics
= APFloat::IEEEsingle();
2633 SDValue SmallestNormal
=
2634 DAG
.getConstantFP(APFloat::getSmallestNormalized(Semantics
), SL
, VT
);
2636 SDValue IsLtSmallestNormal
= DAG
.getSetCC(
2637 SL
, getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), VT
), Src
,
2638 SmallestNormal
, ISD::SETOLT
);
2640 SDValue Scale32
= DAG
.getConstantFP(0x1.0p
+32, SL
, VT
);
2641 SDValue One
= DAG
.getConstantFP(1.0, SL
, VT
);
2642 SDValue ScaleFactor
=
2643 DAG
.getNode(ISD::SELECT
, SL
, VT
, IsLtSmallestNormal
, Scale32
, One
, Flags
);
2645 SDValue ScaledInput
= DAG
.getNode(ISD::FMUL
, SL
, VT
, Src
, ScaleFactor
, Flags
);
2646 return {ScaledInput
, IsLtSmallestNormal
};
2649 SDValue
AMDGPUTargetLowering::LowerFLOG2(SDValue Op
, SelectionDAG
&DAG
) const {
2650 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2651 // If we have to handle denormals, scale up the input and adjust the result.
2653 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2654 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2657 EVT VT
= Op
.getValueType();
2658 SDValue Src
= Op
.getOperand(0);
2659 SDNodeFlags Flags
= Op
->getFlags();
2661 if (VT
== MVT::f16
) {
2662 // Nothing in half is a denormal when promoted to f32.
2663 assert(!Subtarget
->has16BitInsts());
2664 SDValue Ext
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, Src
, Flags
);
2665 SDValue Log
= DAG
.getNode(AMDGPUISD::LOG
, SL
, MVT::f32
, Ext
, Flags
);
2666 return DAG
.getNode(ISD::FP_ROUND
, SL
, VT
, Log
,
2667 DAG
.getTargetConstant(0, SL
, MVT::i32
), Flags
);
2670 auto [ScaledInput
, IsLtSmallestNormal
] =
2671 getScaledLogInput(DAG
, SL
, Src
, Flags
);
2673 return DAG
.getNode(AMDGPUISD::LOG
, SL
, VT
, Src
, Flags
);
2675 SDValue Log2
= DAG
.getNode(AMDGPUISD::LOG
, SL
, VT
, ScaledInput
, Flags
);
2677 SDValue ThirtyTwo
= DAG
.getConstantFP(32.0, SL
, VT
);
2678 SDValue Zero
= DAG
.getConstantFP(0.0, SL
, VT
);
2679 SDValue ResultOffset
=
2680 DAG
.getNode(ISD::SELECT
, SL
, VT
, IsLtSmallestNormal
, ThirtyTwo
, Zero
);
2681 return DAG
.getNode(ISD::FSUB
, SL
, VT
, Log2
, ResultOffset
, Flags
);
2684 static SDValue
getMad(SelectionDAG
&DAG
, const SDLoc
&SL
, EVT VT
, SDValue X
,
2685 SDValue Y
, SDValue C
, SDNodeFlags Flags
= SDNodeFlags()) {
2686 SDValue Mul
= DAG
.getNode(ISD::FMUL
, SL
, VT
, X
, Y
, Flags
);
2687 return DAG
.getNode(ISD::FADD
, SL
, VT
, Mul
, C
, Flags
);
2690 SDValue
AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op
,
2691 SelectionDAG
&DAG
) const {
2692 SDValue X
= Op
.getOperand(0);
2693 EVT VT
= Op
.getValueType();
2694 SDNodeFlags Flags
= Op
->getFlags();
2697 const bool IsLog10
= Op
.getOpcode() == ISD::FLOG10
;
2698 assert(IsLog10
|| Op
.getOpcode() == ISD::FLOG
);
2700 const auto &Options
= getTargetMachine().Options
;
2701 if (VT
== MVT::f16
|| Flags
.hasApproximateFuncs() ||
2702 Options
.ApproxFuncFPMath
|| Options
.UnsafeFPMath
) {
2704 if (VT
== MVT::f16
&& !Subtarget
->has16BitInsts()) {
2705 // Log and multiply in f32 is good enough for f16.
2706 X
= DAG
.getNode(ISD::FP_EXTEND
, DL
, MVT::f32
, X
, Flags
);
2709 SDValue Lowered
= LowerFLOGUnsafe(X
, DL
, DAG
, IsLog10
, Flags
);
2710 if (VT
== MVT::f16
&& !Subtarget
->has16BitInsts()) {
2711 return DAG
.getNode(ISD::FP_ROUND
, DL
, VT
, Lowered
,
2712 DAG
.getTargetConstant(0, DL
, MVT::i32
), Flags
);
2718 auto [ScaledInput
, IsScaled
] = getScaledLogInput(DAG
, DL
, X
, Flags
);
2722 SDValue Y
= DAG
.getNode(AMDGPUISD::LOG
, DL
, VT
, X
, Flags
);
2725 if (Subtarget
->hasFastFMAF32()) {
2726 // c+cc are ln(2)/ln(10) to more than 49 bits
2727 const float c_log10
= 0x1.344134p
-2f
;
2728 const float cc_log10
= 0x1.09f79ep
-26f
;
2730 // c + cc is ln(2) to more than 49 bits
2731 const float c_log
= 0x1.62e42ep
-1f
;
2732 const float cc_log
= 0x1.efa39ep
-25f
;
2734 SDValue C
= DAG
.getConstantFP(IsLog10
? c_log10
: c_log
, DL
, VT
);
2735 SDValue CC
= DAG
.getConstantFP(IsLog10
? cc_log10
: cc_log
, DL
, VT
);
2737 R
= DAG
.getNode(ISD::FMUL
, DL
, VT
, Y
, C
, Flags
);
2738 SDValue NegR
= DAG
.getNode(ISD::FNEG
, DL
, VT
, R
, Flags
);
2739 SDValue FMA0
= DAG
.getNode(ISD::FMA
, DL
, VT
, Y
, C
, NegR
, Flags
);
2740 SDValue FMA1
= DAG
.getNode(ISD::FMA
, DL
, VT
, Y
, CC
, FMA0
, Flags
);
2741 R
= DAG
.getNode(ISD::FADD
, DL
, VT
, R
, FMA1
, Flags
);
2743 // ch+ct is ln(2)/ln(10) to more than 36 bits
2744 const float ch_log10
= 0x1.344000p
-2f
;
2745 const float ct_log10
= 0x1.3509f6p
-18f
;
2747 // ch + ct is ln(2) to more than 36 bits
2748 const float ch_log
= 0x1.62e000p
-1f
;
2749 const float ct_log
= 0x1.0bfbe8p
-15f
;
2751 SDValue CH
= DAG
.getConstantFP(IsLog10
? ch_log10
: ch_log
, DL
, VT
);
2752 SDValue CT
= DAG
.getConstantFP(IsLog10
? ct_log10
: ct_log
, DL
, VT
);
2754 SDValue YAsInt
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::i32
, Y
);
2755 SDValue MaskConst
= DAG
.getConstant(0xfffff000, DL
, MVT::i32
);
2756 SDValue YHInt
= DAG
.getNode(ISD::AND
, DL
, MVT::i32
, YAsInt
, MaskConst
);
2757 SDValue YH
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::f32
, YHInt
);
2758 SDValue YT
= DAG
.getNode(ISD::FSUB
, DL
, VT
, Y
, YH
, Flags
);
2760 SDValue YTCT
= DAG
.getNode(ISD::FMUL
, DL
, VT
, YT
, CT
, Flags
);
2761 SDValue Mad0
= getMad(DAG
, DL
, VT
, YH
, CT
, YTCT
, Flags
);
2762 SDValue Mad1
= getMad(DAG
, DL
, VT
, YT
, CH
, Mad0
, Flags
);
2763 R
= getMad(DAG
, DL
, VT
, YH
, CH
, Mad1
);
2766 const bool IsFiniteOnly
= (Flags
.hasNoNaNs() || Options
.NoNaNsFPMath
) &&
2767 (Flags
.hasNoInfs() || Options
.NoInfsFPMath
);
2769 // TODO: Check if known finite from source value.
2770 if (!IsFiniteOnly
) {
2771 SDValue IsFinite
= getIsFinite(DAG
, Y
, Flags
);
2772 R
= DAG
.getNode(ISD::SELECT
, DL
, VT
, IsFinite
, R
, Y
, Flags
);
2776 SDValue Zero
= DAG
.getConstantFP(0.0f
, DL
, VT
);
2778 DAG
.getConstantFP(IsLog10
? 0x1.344136p
+3f
: 0x1.62e430p
+4f
, DL
, VT
);
2780 DAG
.getNode(ISD::SELECT
, DL
, VT
, IsScaled
, ShiftK
, Zero
, Flags
);
2781 R
= DAG
.getNode(ISD::FSUB
, DL
, VT
, R
, Shift
, Flags
);
2787 SDValue
AMDGPUTargetLowering::LowerFLOG10(SDValue Op
, SelectionDAG
&DAG
) const {
2788 return LowerFLOGCommon(Op
, DAG
);
2791 // Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2792 // promote f16 operation.
2793 SDValue
AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src
, const SDLoc
&SL
,
2794 SelectionDAG
&DAG
, bool IsLog10
,
2795 SDNodeFlags Flags
) const {
2796 EVT VT
= Src
.getValueType();
2798 VT
== MVT::f32
? (unsigned)AMDGPUISD::LOG
: (unsigned)ISD::FLOG2
;
2800 double Log2BaseInverted
=
2801 IsLog10
? numbers::ln2
/ numbers::ln10
: numbers::ln2
;
2803 if (VT
== MVT::f32
) {
2804 auto [ScaledInput
, IsScaled
] = getScaledLogInput(DAG
, SL
, Src
, Flags
);
2806 SDValue LogSrc
= DAG
.getNode(AMDGPUISD::LOG
, SL
, VT
, ScaledInput
, Flags
);
2807 SDValue ScaledResultOffset
=
2808 DAG
.getConstantFP(-32.0 * Log2BaseInverted
, SL
, VT
);
2810 SDValue Zero
= DAG
.getConstantFP(0.0f
, SL
, VT
);
2812 SDValue ResultOffset
= DAG
.getNode(ISD::SELECT
, SL
, VT
, IsScaled
,
2813 ScaledResultOffset
, Zero
, Flags
);
2815 SDValue Log2Inv
= DAG
.getConstantFP(Log2BaseInverted
, SL
, VT
);
2817 if (Subtarget
->hasFastFMAF32())
2818 return DAG
.getNode(ISD::FMA
, SL
, VT
, LogSrc
, Log2Inv
, ResultOffset
,
2820 SDValue Mul
= DAG
.getNode(ISD::FMUL
, SL
, VT
, LogSrc
, Log2Inv
, Flags
);
2821 return DAG
.getNode(ISD::FADD
, SL
, VT
, Mul
, ResultOffset
);
2825 SDValue Log2Operand
= DAG
.getNode(LogOp
, SL
, VT
, Src
, Flags
);
2826 SDValue Log2BaseInvertedOperand
= DAG
.getConstantFP(Log2BaseInverted
, SL
, VT
);
2828 return DAG
.getNode(ISD::FMUL
, SL
, VT
, Log2Operand
, Log2BaseInvertedOperand
,
2832 SDValue
AMDGPUTargetLowering::lowerFEXP2(SDValue Op
, SelectionDAG
&DAG
) const {
2833 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2834 // If we have to handle denormals, scale up the input and adjust the result.
2837 EVT VT
= Op
.getValueType();
2838 SDValue Src
= Op
.getOperand(0);
2839 SDNodeFlags Flags
= Op
->getFlags();
2841 if (VT
== MVT::f16
) {
2842 // Nothing in half is a denormal when promoted to f32.
2843 assert(!Subtarget
->has16BitInsts());
2844 SDValue Ext
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, Src
, Flags
);
2845 SDValue Log
= DAG
.getNode(AMDGPUISD::EXP
, SL
, MVT::f32
, Ext
, Flags
);
2846 return DAG
.getNode(ISD::FP_ROUND
, SL
, VT
, Log
,
2847 DAG
.getTargetConstant(0, SL
, MVT::i32
), Flags
);
2850 assert(VT
== MVT::f32
);
2852 if (!needsDenormHandlingF32(DAG
, Src
, Flags
))
2853 return DAG
.getNode(AMDGPUISD::EXP
, SL
, MVT::f32
, Src
, Flags
);
2855 // bool needs_scaling = x < -0x1.f80000p+6f;
2856 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2858 // -nextafter(128.0, -1)
2859 SDValue RangeCheckConst
= DAG
.getConstantFP(-0x1.f80000p
+6f
, SL
, VT
);
2861 EVT SetCCVT
= getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), VT
);
2863 SDValue NeedsScaling
=
2864 DAG
.getSetCC(SL
, SetCCVT
, Src
, RangeCheckConst
, ISD::SETOLT
);
2866 SDValue SixtyFour
= DAG
.getConstantFP(0x1.0p
+6f
, SL
, VT
);
2867 SDValue Zero
= DAG
.getConstantFP(0.0, SL
, VT
);
2870 DAG
.getNode(ISD::SELECT
, SL
, VT
, NeedsScaling
, SixtyFour
, Zero
);
2872 SDValue AddInput
= DAG
.getNode(ISD::FADD
, SL
, VT
, Src
, AddOffset
, Flags
);
2873 SDValue Exp2
= DAG
.getNode(AMDGPUISD::EXP
, SL
, VT
, AddInput
, Flags
);
2875 SDValue TwoExpNeg64
= DAG
.getConstantFP(0x1.0p
-64f
, SL
, VT
);
2876 SDValue One
= DAG
.getConstantFP(1.0, SL
, VT
);
2877 SDValue ResultScale
=
2878 DAG
.getNode(ISD::SELECT
, SL
, VT
, NeedsScaling
, TwoExpNeg64
, One
);
2880 return DAG
.getNode(ISD::FMUL
, SL
, VT
, Exp2
, ResultScale
, Flags
);
2883 SDValue
AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X
, const SDLoc
&SL
,
2885 SDNodeFlags Flags
) const {
2886 EVT VT
= X
.getValueType();
2887 const SDValue Log2E
= DAG
.getConstantFP(numbers::log2e
, SL
, VT
);
2889 if (VT
!= MVT::f32
|| !needsDenormHandlingF32(DAG
, X
, Flags
)) {
2890 // exp2(M_LOG2E_F * f);
2891 SDValue Mul
= DAG
.getNode(ISD::FMUL
, SL
, VT
, X
, Log2E
, Flags
);
2892 return DAG
.getNode(VT
== MVT::f32
? (unsigned)AMDGPUISD::EXP
2893 : (unsigned)ISD::FEXP2
,
2894 SL
, VT
, Mul
, Flags
);
2897 EVT SetCCVT
= getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), VT
);
2899 SDValue Threshold
= DAG
.getConstantFP(-0x1.5d58a0p
+6f
, SL
, VT
);
2900 SDValue NeedsScaling
= DAG
.getSetCC(SL
, SetCCVT
, X
, Threshold
, ISD::SETOLT
);
2902 SDValue ScaleOffset
= DAG
.getConstantFP(0x1.0p
+6f
, SL
, VT
);
2904 SDValue ScaledX
= DAG
.getNode(ISD::FADD
, SL
, VT
, X
, ScaleOffset
, Flags
);
2907 DAG
.getNode(ISD::SELECT
, SL
, VT
, NeedsScaling
, ScaledX
, X
);
2909 SDValue ExpInput
= DAG
.getNode(ISD::FMUL
, SL
, VT
, AdjustedX
, Log2E
, Flags
);
2911 SDValue Exp2
= DAG
.getNode(AMDGPUISD::EXP
, SL
, VT
, ExpInput
, Flags
);
2913 SDValue ResultScaleFactor
= DAG
.getConstantFP(0x1.969d48p
-93f
, SL
, VT
);
2914 SDValue AdjustedResult
=
2915 DAG
.getNode(ISD::FMUL
, SL
, VT
, Exp2
, ResultScaleFactor
, Flags
);
2917 return DAG
.getNode(ISD::SELECT
, SL
, VT
, NeedsScaling
, AdjustedResult
, Exp2
,
2921 /// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2922 /// handled correctly.
2923 SDValue
AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X
, const SDLoc
&SL
,
2925 SDNodeFlags Flags
) const {
2926 const EVT VT
= X
.getValueType();
2927 const unsigned Exp2Op
= VT
== MVT::f32
? AMDGPUISD::EXP
: ISD::FEXP2
;
2929 if (VT
!= MVT::f32
|| !needsDenormHandlingF32(DAG
, X
, Flags
)) {
2930 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2931 SDValue K0
= DAG
.getConstantFP(0x1.a92000p
+1f
, SL
, VT
);
2932 SDValue K1
= DAG
.getConstantFP(0x1.4f0978p
-11f
, SL
, VT
);
2934 SDValue Mul0
= DAG
.getNode(ISD::FMUL
, SL
, VT
, X
, K0
, Flags
);
2935 SDValue Exp2_0
= DAG
.getNode(Exp2Op
, SL
, VT
, Mul0
, Flags
);
2936 SDValue Mul1
= DAG
.getNode(ISD::FMUL
, SL
, VT
, X
, K1
, Flags
);
2937 SDValue Exp2_1
= DAG
.getNode(Exp2Op
, SL
, VT
, Mul1
, Flags
);
2938 return DAG
.getNode(ISD::FMUL
, SL
, VT
, Exp2_0
, Exp2_1
);
2941 // bool s = x < -0x1.2f7030p+5f;
2942 // x += s ? 0x1.0p+5f : 0.0f;
2943 // exp10 = exp2(x * 0x1.a92000p+1f) *
2944 // exp2(x * 0x1.4f0978p-11f) *
2945 // (s ? 0x1.9f623ep-107f : 1.0f);
2947 EVT SetCCVT
= getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), VT
);
2949 SDValue Threshold
= DAG
.getConstantFP(-0x1.2f7030p
+5f
, SL
, VT
);
2950 SDValue NeedsScaling
= DAG
.getSetCC(SL
, SetCCVT
, X
, Threshold
, ISD::SETOLT
);
2952 SDValue ScaleOffset
= DAG
.getConstantFP(0x1.0p
+5f
, SL
, VT
);
2953 SDValue ScaledX
= DAG
.getNode(ISD::FADD
, SL
, VT
, X
, ScaleOffset
, Flags
);
2955 DAG
.getNode(ISD::SELECT
, SL
, VT
, NeedsScaling
, ScaledX
, X
);
2957 SDValue K0
= DAG
.getConstantFP(0x1.a92000p
+1f
, SL
, VT
);
2958 SDValue K1
= DAG
.getConstantFP(0x1.4f0978p
-11f
, SL
, VT
);
2960 SDValue Mul0
= DAG
.getNode(ISD::FMUL
, SL
, VT
, AdjustedX
, K0
, Flags
);
2961 SDValue Exp2_0
= DAG
.getNode(Exp2Op
, SL
, VT
, Mul0
, Flags
);
2962 SDValue Mul1
= DAG
.getNode(ISD::FMUL
, SL
, VT
, AdjustedX
, K1
, Flags
);
2963 SDValue Exp2_1
= DAG
.getNode(Exp2Op
, SL
, VT
, Mul1
, Flags
);
2965 SDValue MulExps
= DAG
.getNode(ISD::FMUL
, SL
, VT
, Exp2_0
, Exp2_1
, Flags
);
2967 SDValue ResultScaleFactor
= DAG
.getConstantFP(0x1.9f623ep
-107f
, SL
, VT
);
2968 SDValue AdjustedResult
=
2969 DAG
.getNode(ISD::FMUL
, SL
, VT
, MulExps
, ResultScaleFactor
, Flags
);
2971 return DAG
.getNode(ISD::SELECT
, SL
, VT
, NeedsScaling
, AdjustedResult
, MulExps
,
2975 SDValue
AMDGPUTargetLowering::lowerFEXP(SDValue Op
, SelectionDAG
&DAG
) const {
2976 EVT VT
= Op
.getValueType();
2978 SDValue X
= Op
.getOperand(0);
2979 SDNodeFlags Flags
= Op
->getFlags();
2980 const bool IsExp10
= Op
.getOpcode() == ISD::FEXP10
;
2982 if (VT
.getScalarType() == MVT::f16
) {
2983 // v_exp_f16 (fmul x, log2e)
2984 if (allowApproxFunc(DAG
, Flags
)) // TODO: Does this really require fast?
2985 return lowerFEXPUnsafe(X
, SL
, DAG
, Flags
);
2991 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2993 // Nothing in half is a denormal when promoted to f32.
2994 SDValue Ext
= DAG
.getNode(ISD::FP_EXTEND
, SL
, MVT::f32
, X
, Flags
);
2995 SDValue Lowered
= lowerFEXPUnsafe(Ext
, SL
, DAG
, Flags
);
2996 return DAG
.getNode(ISD::FP_ROUND
, SL
, VT
, Lowered
,
2997 DAG
.getTargetConstant(0, SL
, MVT::i32
), Flags
);
3000 assert(VT
== MVT::f32
);
  // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
  // library behavior. Also, is known-not-daz source sufficient?
  if (allowApproxFunc(DAG, Flags)) {
    return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
                   : lowerFEXPUnsafe(X, SL, DAG, Flags);
  }

  // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
  //
  // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
  // n = 64*m + j, 0 <= j < 64
  //
  // e^x = 2^((64*m + j + f)/64)
  //     = (2^m) * (2^(j/64)) * 2^(f/64)
  //     = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
  //
  // f = x*(64/ln(2)) - n
  // r = f*(ln(2)/64) = x - n*(ln(2)/64)
  //
  // e^x = (2^m) * (2^(j/64)) * e^r
  //
  // (2^(j/64)) is precomputed
  //
  // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //
  // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //
  // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
  SDNodeFlags FlagsNoContract = Flags;
  FlagsNoContract.setAllowContract(false);

  SDValue PH, PL;
  if (Subtarget->hasFastFMAF32()) {
    const float c_exp = numbers::log2ef;
    const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
    SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);

    PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
    SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
    SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
    PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
  } else {
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
    SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);

    SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
    SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
    SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
    SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
    SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);

    PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);

    SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
    SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
    PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
  }

  SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);

  // It is unsafe to contract this fsub into the PH multiply.
  SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);

  SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
  SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);

  SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);

  SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);

  SDValue UnderflowCheckConst =
      DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
  SDValue Underflow =
      DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);

  R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);

  const auto &Options = getTargetMachine().Options;
  if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
    SDValue OverflowCheckConst =
        DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
    SDValue Overflow =
        DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
    SDValue Inf =
        DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
    R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
  }

  return R;
}

static bool isCtlzOpc(unsigned Opc) {
  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
}

static bool isCttzOpc(unsigned Opc) {
  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
}

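// Lower i8/i16 ctlz by doing the count in 32 bits and adjusting the result
// for the narrower input width.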
SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
                                               SelectionDAG &DAG) const {
  auto SL = SDLoc(Op);
  auto Opc = Op.getOpcode();
  auto Arg = Op.getOperand(0u);
  auto ResultVT = Op.getValueType();

  if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
    return SDValue();

  assert(isCtlzOpc(Opc));
  assert(ResultVT == Arg.getValueType());

  const uint64_t NumBits = ResultVT.getFixedSizeInBits();
  SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
  SDValue NewOp;

  if (Opc == ISD::CTLZ_ZERO_UNDEF) {
    NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
    NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
    NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
  } else {
    NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
    NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
    NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
  }

  return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
}

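// Lower ctlz/cttz to the AMDGPU ffbh/ffbl nodes. The comments in each branch
// below show the expansions used for 32-bit, scalar 64-bit, and split 64-bit
// sources.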
SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
  bool Ctlz = isCtlzOpc(Op.getOpcode());
  unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;

  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
                   Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
  bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;

  if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
    // (ctlz hi:lo) -> (umin (ffbh src), 32)
    // (cttz hi:lo) -> (umin (ffbl src), 32)
    // (ctlz_zero_undef src) -> (ffbh src)
    // (cttz_zero_undef src) -> (ffbl src)
    //
    // The 64-bit scalar version produces a 32-bit result:
    // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
    // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
    // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
    // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
    SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
    if (!ZeroUndef) {
      const SDValue ConstVal = DAG.getConstant(
          Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
      NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
    }
    return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
  }

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(Src, DAG);

  SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
  SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);

  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
  // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
  unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
  const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
  if (Ctlz)
    OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
  else
    OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);

  SDValue NewOpr;
  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
  if (!ZeroUndef) {
    const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
    NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
  }

  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
}

SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  // The regular method converting a 64-bit integer to float roughly consists of
  // 2 steps: normalization and rounding. In fact, after normalization, the
  // conversion from a 64-bit integer to a float is essentially the same as the
  // one from a 32-bit integer. The only difference is that it has more
  // trailing bits to be rounded. To leverage the native 32-bit conversion, a
  // 64-bit integer could be preprocessed and fit into a 32-bit integer then
  // converted into the correct float number. The basic steps for the unsigned
  // conversion are illustrated in the following pseudo code:
  //
  // f32 uitofp(i64 u) {
  //   i32 hi, lo = split(u);
  //   // Only count the leading zeros in hi as we have native support of the
  //   // conversion from i32 to f32. If hi is all 0s, the conversion is
  //   // reduced to a 32-bit one automatically.
  //   i32 shamt = clz(hi); // Return 32 if hi is all 0s.
  //   u <<= shamt;
  //   hi, lo = split(u);
  //   hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
  //   // convert it as a 32-bit integer and scale the result back.
  //   return uitofp(hi) * 2^(32 - shamt);
  // }
  //
  // The signed one follows the same principle but uses 'ffbh_i32' to count its
  // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
  // converted instead, followed by negation based on its sign bit.

  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
  SDValue Sign;
  SDValue ShAmt;
  if (Signed && Subtarget->isGCN()) {
    // We also need to consider the sign bit in Lo if Hi has just sign bits,
    // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
    // account. That is, the maximal shift is
    // - 32 if Lo and Hi have opposite signs;
    // - 33 if Lo and Hi have the same sign.
    //
    // Or, MaxShAmt = 33 + OppositeSign, where
    //
    // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
    // - -1 if Lo and Hi have opposite signs; and
    // -  0 otherwise.
    //
    // All in all, ShAmt is calculated as
    //
    //  umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
    //
    // or
    //
    //  umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
    //
    // to reduce the critical path.
    SDValue OppositeSign = DAG.getNode(
        ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
        DAG.getConstant(31, SL, MVT::i32));
    SDValue MaxShAmt =
        DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
                    OppositeSign);
    // Count the leading sign bits.
    ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
    // Different from unsigned conversion, the shift should be one bit less to
    // preserve the sign bit.
    ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
                        DAG.getConstant(1, SL, MVT::i32));
    ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
  } else {
    if (Signed) {
      // Without 'ffbh_i32', only leading zeros could be counted. Take the
      // absolute value first.
      Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
                         DAG.getConstant(63, SL, MVT::i64));
      SDValue Abs =
          DAG.getNode(ISD::XOR, SL, MVT::i64,
                      DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
      std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
    }
    // Count the leading zeros.
    ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
    // The shift amount for signed integers is [0, 32].
  }
  // Normalize the given 64-bit integer.
  SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
  std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
  // Calculate the adjust bit for rounding.
  // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
  SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
                               DAG.getConstant(1, SL, MVT::i32), Lo);
  // Get the 32-bit normalized integer.
  Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
  // Convert the normalized 32-bit integer into f32.
  unsigned Opc =
      (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);

  // Finally, need to scale back the converted floating number as the original
  // 64-bit integer is converted as a 32-bit one.
  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
                      ShAmt);
  // On GCN, use LDEXP directly.
  if (Subtarget->isGCN())
    return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);

  // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
  // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
  // exponent is enough to avoid overflowing into the sign bit.
  SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
                            DAG.getConstant(23, SL, MVT::i32));
  SDValue IVal =
      DAG.getNode(ISD::ADD, SL, MVT::i32,
                  DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
  if (Signed) {
    // Set the sign bit.
    Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
                       DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
                       DAG.getConstant(31, SL, MVT::i32));
    IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
  }
  return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
}

SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(Src, DAG);

  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
                              SL, MVT::f64, Hi);

  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);

  SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
                              DAG.getConstant(32, SL, MVT::i32));
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
}

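// Custom lowering for uint-to-fp: i16 sources are promoted to i32, bf16
// results round through f32, and i64 sources use the 32-bit/64-bit helpers
// above.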
SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  // TODO: Factor out code common with LowerSINT_TO_FP.
  EVT DestVT = Op.getValueType();
  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();

  if (SrcVT == MVT::i16) {
    if (DestVT == MVT::f16)
      return Op;

    SDLoc DL(Op);
    // Promote src to i32
    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
    return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
  }

  if (DestVT == MVT::bf16) {
    SDLoc SL(Op);
    SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
  }

  if (SrcVT != MVT::i64)
    return Op;

  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    SDLoc DL(Op);

    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
    SDValue FPRoundFlag =
        DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
    SDValue FPRound =
        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);

    return FPRound;
  }

  if (DestVT == MVT::f32)
    return LowerINT_TO_FP32(Op, DAG, false);

  assert(DestVT == MVT::f64);
  return LowerINT_TO_FP64(Op, DAG, false);
}

SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  EVT DestVT = Op.getValueType();

  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();

  if (SrcVT == MVT::i16) {
    if (DestVT == MVT::f16)
      return Op;

    SDLoc DL(Op);
    // Promote src to i32
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
    return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
  }

  if (DestVT == MVT::bf16) {
    SDLoc SL(Op);
    SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
  }

  if (SrcVT != MVT::i64)
    return Op;

  // TODO: Factor out code common with LowerUINT_TO_FP.

  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    SDLoc DL(Op);
    SDValue Src = Op.getOperand(0);

    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
    SDValue FPRoundFlag =
        DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
    SDValue FPRound =
        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);

    return FPRound;
  }

  if (DestVT == MVT::f32)
    return LowerINT_TO_FP32(Op, DAG, true);

  assert(DestVT == MVT::f64);
  return LowerINT_TO_FP64(Op, DAG, true);
}

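// Lower an f32/f64 to 64-bit integer conversion by splitting the value into
// two 32-bit halves, as described in the comment below.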
SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  SDLoc SL(Op);

  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();

  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
  SDValue Sign;
  if (Signed && SrcVT == MVT::f32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
                       DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
                       DAG.getConstant(31, SL, MVT::i32));
    Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
  }

  SDValue K0, K1;
  if (SrcVT == MVT::f64) {
    K0 = DAG.getConstantFP(
        llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
        SrcVT);
    K1 = DAG.getConstantFP(
        llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
        SrcVT);
  } else {
    K0 = DAG.getConstantFP(
        llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
    K1 = DAG.getConstantFP(
        llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
  }
  // TODO: Should this propagate fast-math-flags?
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);

  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);

  SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);

  SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
                                                         : ISD::FP_TO_UINT,
                           SL, MVT::i32, FloorMul);
  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);

  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
                               DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));

  if (Signed && SrcVT == MVT::f32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
                       DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
    // r := xor(r, sign) - sign;
    Result =
        DAG.getNode(ISD::SUB, SL, MVT::i64,
                    DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
  }

  return Result;
}

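// Software lowering of fp-to-f16. f32 sources use the target FP_TO_FP16 node;
// the f64 path below implements the round-to-nearest-even conversion by hand.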
SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue N0 = Op.getOperand(0);

  // Convert to target node to get known bits
  if (N0.getValueType() == MVT::f32)
    return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);

  if (getTargetMachine().Options.UnsafeFPMath) {
    // There is a generic expand for FP_TO_FP16 with unsafe fast math.
    return SDValue();
  }

  assert(N0.getSimpleValueType() == MVT::f64);

  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;
  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
  SDValue One = DAG.getConstant(1, DL, MVT::i32);
  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
                           DAG.getConstant(32, DL, MVT::i64));
  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
                          DAG.getConstant(20, DL, MVT::i64));
  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
                  DAG.getConstant(ExpMask, DL, MVT::i32));
  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
                  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));

  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
                          DAG.getConstant(8, DL, MVT::i32));
  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
                  DAG.getConstant(0xffe, DL, MVT::i32));

  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
                                  DAG.getConstant(0x1ff, DL, MVT::i32));
  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);

  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);

  // (M != 0 ? 0x0200 : 0) | 0x7c00;
  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
      DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
                      Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));

  // N = M | (E << 12);
  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
      DAG.getNode(ISD::SHL, DL, MVT::i32, E,
                  DAG.getConstant(12, DL, MVT::i32)));

  // B = clamp(1-E, 0, 13);
  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
                                  One, E);
  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
                  DAG.getConstant(13, DL, MVT::i32));

  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
                                   DAG.getConstant(0x1000, DL, MVT::i32));

  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);

  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
                              DAG.getConstant(0x7, DL, MVT::i32));
  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
                  DAG.getConstant(2, DL, MVT::i32));
  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
                               One, Zero, ISD::SETEQ);
  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
                               One, Zero, ISD::SETGT);
  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);

  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
                      DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
                      I, V, ISD::SETEQ);

  // Extract the sign bit.
  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
                             DAG.getConstant(16, DL, MVT::i32));
  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
                     DAG.getConstant(0x8000, DL, MVT::i32));

  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
}

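// Dispatch fp-to-int lowering based on the source and destination types.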
SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
                                             SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  unsigned OpOpcode = Op.getOpcode();
  EVT SrcVT = Src.getValueType();
  EVT DestVT = Op.getValueType();

  // Will be selected natively
  if (SrcVT == MVT::f16 && DestVT == MVT::i16)
    return Op;

  if (SrcVT == MVT::bf16) {
    SDLoc DL(Op);
    SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
    return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
  }

  // Promote i16 to i32
  if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
    SDLoc DL(Op);

    SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
  }

  if (DestVT != MVT::i64)
    return Op;

  if (SrcVT == MVT::f16 ||
      (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
    SDLoc DL(Op);

    SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
    unsigned Ext =
        OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
  }

  if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
    return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);

  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                     SelectionDAG &DAG) const {
  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();

  assert(VT.isVector());

  SDValue Src = Op.getOperand(0);
  SDLoc DL(Op);

  // TODO: Don't scalarize on Evergreen?
  unsigned NElts = VT.getVectorNumElements();
  SmallVector<SDValue, 8> Args;
  DAG.ExtractVectorElements(Src, Args, 0, NElts);

  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
  for (unsigned I = 0; I < NElts; ++I)
    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);

  return DAG.getBuildVector(VT, DL, Args);
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

static bool isU24(SDValue Op, SelectionDAG &DAG) {
  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
}

static bool isI24(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
                                     // as unsigned 24-bit values.
         AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
}

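// Simplify a 24-bit multiply node (or the corresponding intrinsic) based on
// the demanded low 24 bits of its operands.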
static SDValue simplifyMul24(SDNode *Node24,
                             TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;

  SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
  SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
  unsigned NewOpcode = Node24->getOpcode();
  if (IsIntrin) {
    unsigned IID = Node24->getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_mul_i24:
      NewOpcode = AMDGPUISD::MUL_I24;
      break;
    case Intrinsic::amdgcn_mul_u24:
      NewOpcode = AMDGPUISD::MUL_U24;
      break;
    case Intrinsic::amdgcn_mulhi_i24:
      NewOpcode = AMDGPUISD::MULHI_I24;
      break;
    case Intrinsic::amdgcn_mulhi_u24:
      NewOpcode = AMDGPUISD::MULHI_U24;
      break;
    default:
      llvm_unreachable("Expected 24-bit mul intrinsic");
    }
  }

  APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);

  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
  // the operands to have other uses, but will only perform simplifications that
  // involve bypassing some nodes for this user.
  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
  if (DemandedLHS || DemandedRHS)
    return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
                       DemandedLHS ? DemandedLHS : LHS,
                       DemandedRHS ? DemandedRHS : RHS);

  // Now try SimplifyDemandedBits which can simplify the nodes used by our
  // operands if this node is the only user.
  if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
    return SDValue(Node24, 0);
  if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
    return SDValue(Node24, 0);

  return SDValue();
}

template <typename IntTy>
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
                               uint32_t Width, const SDLoc &DL) {
  if (Width + Offset < 32) {
    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
    return DAG.getConstant(Result, DL, MVT::i32);
  }

  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
}

static bool hasVolatileUser(SDNode *Val) {
  for (SDNode *U : Val->uses()) {
    if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
      if (M->isVolatile())
        return true;
    }
  }

  return false;
}

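// Returns true if a load/store of this type should be rewritten in terms of
// an equivalently sized i32-based type (see getEquivalentMemType).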
bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
  // i32 vectors are the canonical memory type.
  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
    return false;

  if (!VT.isByteSized())
    return false;

  unsigned Size = VT.getStoreSize();

  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
    return false;

  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
    return false;

  return true;
}

// Replace a load of an illegal type with a load of a bitcast to a friendlier
// type.
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  if (!DCI.isBeforeLegalize())
    return SDValue();

  LoadSDNode *LN = cast<LoadSDNode>(N);
  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
    return SDValue();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = LN->getMemoryVT();

  unsigned Size = VT.getStoreSize();
  Align Alignment = LN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = LN->getAddressSpace();

    // Expand unaligned loads earlier than legalization. Due to visitation order
    // problems during legalization, the emitted instructions to pack and unpack
    // the bytes again are not eliminated in the case of an unaligned copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorLoad(SDValue(LN, 0), DAG);

      SDValue Ops[2];
      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);

      return DAG.getMergeValues(Ops, SDLoc(N));
    }

    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);

  SDValue NewLoad
    = DAG.getLoad(NewVT, SL, LN->getChain(),
                  LN->getBasePtr(), LN->getMemOperand());

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
  DCI.CombineTo(N, BC, NewLoad.getValue(1));
  return SDValue(N, 0);
}

// Replace a store of an illegal type with a store of a bitcast to a friendlier
// type.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  if (!DCI.isBeforeLegalize())
    return SDValue();

  StoreSDNode *SN = cast<StoreSDNode>(N);
  if (!SN->isSimple() || !ISD::isNormalStore(SN))
    return SDValue();

  EVT VT = SN->getMemoryVT();
  unsigned Size = VT.getStoreSize();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  Align Alignment = SN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = SN->getAddressSpace();

    // Expand unaligned stores earlier than legalization. Due to visitation
    // order problems during legalization, the emitted instructions to pack and
    // unpack the bytes again are not eliminated in the case of an unaligned
    // copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorStore(SDValue(SN, 0), DAG);

      return expandUnalignedStore(SN, DAG);
    }

    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
  SDValue Val = SN->getValue();

  //DCI.AddToWorklist(Val.getNode());

  bool OtherUses = !Val.hasOneUse();
  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
  if (OtherUses) {
    SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
    DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
  }

  return DAG.getStore(SN->getChain(), SL, CastVal,
                      SN->getBasePtr(), SN->getMemOperand());
}

// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
// issues.
SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
                                                        DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);

  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
  //     (vt2 (truncate (assertzext vt0:x, vt1)))
  if (N0.getOpcode() == ISD::TRUNCATE) {
    SDValue N1 = N->getOperand(1);
    EVT ExtVT = cast<VTSDNode>(N1)->getVT();
    SDLoc SL(N);

    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();
    if (SrcVT.bitsGE(ExtVT)) {
      SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
      return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
    }
  }

  return SDValue();
}

SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  unsigned IID = N->getConstantOperandVal(0);
  switch (IID) {
  case Intrinsic::amdgcn_mul_i24:
  case Intrinsic::amdgcn_mul_u24:
  case Intrinsic::amdgcn_mulhi_i24:
  case Intrinsic::amdgcn_mulhi_u24:
    return simplifyMul24(N, DCI);
  case Intrinsic::amdgcn_fract:
  case Intrinsic::amdgcn_rsq:
  case Intrinsic::amdgcn_rcp_legacy:
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_rsq_clamp: {
    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
    SDValue Src = N->getOperand(1);
    return Src.isUndef() ? Src : SDValue();
  }
  case Intrinsic::amdgcn_frexp_exp: {
    // frexp_exp (fneg x) -> frexp_exp x
    // frexp_exp (fabs x) -> frexp_exp x
    // frexp_exp (fneg (fabs x)) -> frexp_exp x
    SDValue Src = N->getOperand(1);
    SDValue PeekSign = peekFPSignOps(Src);
    if (PeekSign == Src)
      return SDValue();
    return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
                   0);
  }
  default:
    return SDValue();
  }
}

/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
  DAGCombinerInfo &DCI, const SDLoc &SL,
  unsigned Opc, SDValue LHS,
  uint32_t ValLo, uint32_t ValHi) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);

  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);

  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);

  // Re-visit the ands. It's possible we eliminated one of them and it could
  // simplify the vector.
  DCI.AddToWorklist(Lo.getNode());
  DCI.AddToWorklist(Hi.getNode());

  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}

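// Combines for shl: form a packed build_vector for (shl (ext i16:x), 16),
// narrow shifts of extended values when the shift cannot overflow, and split
// 64-bit shifts by amounts >= 32 into a 32-bit shift plus a zero low half.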
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHS)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  unsigned RHSVal = RHS->getZExtValue();
  if (!RHSVal)
    return LHS;

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;

  switch (LHS->getOpcode()) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    SDValue X = LHS->getOperand(0);

    if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
        isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
      // Prefer build_vector as the canonical form if packed types are legal.
      // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
      SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
          { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
      return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
    }

    // shl (ext x) => zext (shl x), if shift does not overflow int
    if (VT != MVT::i64)
      break;
    KnownBits Known = DAG.computeKnownBits(X);
    unsigned LZ = Known.countMinLeadingZeros();
    if (LZ < RHSVal)
      break;
    EVT XVT = X.getValueType();
    SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
    return DAG.getZExtOrTrunc(Shl, SL, VT);
  }
  }

  if (VT != MVT::i64)
    return SDValue();

  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  if (RHSVal < 32)
    return SDValue();

  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);

  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}

SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i64)
    return SDValue();

  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHS)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  unsigned RHSVal = RHS->getZExtValue();

  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
  if (RHSVal == 32) {
    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
                                   DAG.getConstant(31, SL, MVT::i32));

    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
  }

  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
  if (RHSVal == 63) {
    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
                                   DAG.getConstant(31, SL, MVT::i32));
    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
  }

  return SDValue();
}

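// Combines for srl: expose BFE patterns by commuting srl with a shifted mask,
// and split 64-bit right shifts by amounts >= 32 into a 32-bit shift of the
// high half.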
SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHS)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  unsigned ShiftAmt = RHS->getZExtValue();
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
  // this improves the ability to match BFE patterns in isel.
  if (LHS.getOpcode() == ISD::AND) {
    if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
      unsigned MaskIdx, MaskLen;
      if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
          MaskIdx == ShiftAmt) {
        return DAG.getNode(
            ISD::AND, SL, VT,
            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
      }
    }
  }

  if (VT != MVT::i64)
    return SDValue();

  if (ShiftAmt < 32)
    return SDValue();

  // srl i64:x, C for C >= 32
  // =>
  //   build_pair (srl hi_32(x), C - 32), 0
  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);

  SDValue Hi = getHiHalf64(LHS, DAG);

  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);

  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});

  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
}

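// Combines for truncate: fold truncates of build_vector elements through
// bitcasts, and shrink 64-bit shifts whose results are truncated to 16 bits
// or less.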
SDValue AMDGPUTargetLowering::performTruncateCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);

  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
    SDValue Vec = Src.getOperand(0);
    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue Elt0 = Vec.getOperand(0);
      EVT EltVT = Elt0.getValueType();
      if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
        if (EltVT.isFloatingPoint()) {
          Elt0 = DAG.getNode(ISD::BITCAST, SL,
                             EltVT.changeTypeToInteger(), Elt0);
        }

        return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
      }
    }
  }

  // Equivalent of above for accessing the high element of a vector as an
  // integer operation.
  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
    if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
      if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
        SDValue BV = stripBitcast(Src.getOperand(0));
        if (BV.getOpcode() == ISD::BUILD_VECTOR &&
            BV.getValueType().getVectorNumElements() == 2) {
          SDValue SrcElt = BV.getOperand(1);
          EVT SrcEltVT = SrcElt.getValueType();
          if (SrcEltVT.isFloatingPoint()) {
            SrcElt = DAG.getNode(ISD::BITCAST, SL,
                                 SrcEltVT.changeTypeToInteger(), SrcElt);
          }

          return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
        }
      }
    }
  }

  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
  //
  // i16 (trunc (srl i64:x, K)), K <= 16 ->
  //      i16 (trunc (srl (i32 (trunc x), K)))
  if (VT.getScalarSizeInBits() < 32) {
    EVT SrcVT = Src.getValueType();
    if (SrcVT.getScalarSizeInBits() > 32 &&
        (Src.getOpcode() == ISD::SRL ||
         Src.getOpcode() == ISD::SRA ||
         Src.getOpcode() == ISD::SHL)) {
      SDValue Amt = Src.getOperand(1);
      KnownBits Known = DAG.computeKnownBits(Amt);

      // - For left shifts, do the transform as long as the shift
      //   amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
      // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
      //   losing information stored in the high bits when truncating.
      const unsigned MaxCstSize =
        (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
      if (Known.getMaxValue().ule(MaxCstSize)) {
        EVT MidVT = VT.isVector() ?
          EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                           VT.getVectorNumElements()) : MVT::i32;

        EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
                                    Src.getOperand(0));
        DCI.AddToWorklist(Trunc.getNode());

        if (Amt.getValueType() != NewShiftVT) {
          Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
          DCI.AddToWorklist(Amt.getNode());
        }

        SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
                                          Trunc, Amt);
        return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
      }
    }
  }

  return SDValue();
}

// We need to specifically handle i64 mul here to avoid unnecessary conversion
// instructions. If we only match on the legalized i64 mul expansion,
// SimplifyDemandedBits will be unable to remove them because there will be
// multiple uses due to the separate mul + mulh[su].
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
                        SDValue N0, SDValue N1, unsigned Size, bool Signed) {
  if (Size <= 32) {
    unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
    return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
  }

  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;

  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);

  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
}

/// If \p V is an add of a constant 1, returns the other operand. Otherwise
/// return SDValue().
static SDValue getAddOneOp(const SDNode *V) {
  if (V->getOpcode() != ISD::ADD)
    return SDValue();

  return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
}

SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::MUL);
  EVT VT = N->getValueType(0);

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  if (!N->isDivergent())
    return SDValue();

  unsigned Size = VT.getSizeInBits();
  if (VT.isVector() || Size > 64)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
  // matching.
  //
  // mul x, (add y, 1) -> add (mul x, y), x
  auto IsFoldableAdd = [](SDValue V) -> SDValue {
    SDValue AddOp = getAddOneOp(V.getNode());
    if (!AddOp)
      return SDValue();

    if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
          return U->getOpcode() == ISD::MUL;
        }))
      return AddOp;

    return SDValue();
  };

  // FIXME: The selection pattern is not properly checking for commuted
  // operands, so we have to place the mul in the LHS
  if (SDValue MulOper = IsFoldableAdd(N0)) {
    SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
    return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
  }

  if (SDValue MulOper = IsFoldableAdd(N1)) {
    SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
    return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
  }

  // There are i16 integer mul/mad.
  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
    return SDValue();

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated. Since
  // we can assume the high bits are whatever we want, use the underlying value
  // to avoid the unknown high bits from interfering.
  if (N0.getOpcode() == ISD::ANY_EXTEND)
    N0 = N0.getOperand(0);

  if (N1.getOpcode() == ISD::ANY_EXTEND)
    N1 = N1.getOperand(0);

  SDValue Mul;

  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
    Mul = getMul24(DAG, DL, N0, N1, Size, false);
  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
    Mul = getMul24(DAG, DL, N0, N1, Size, true);
  } else {
    return SDValue();
  }

  // We need to use sext even for MUL_U24, because MUL_U24 is used
  // for signed multiply of 8 and 16-bit types.
  return DAG.getSExtOrTrunc(Mul, DL, VT);
}

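// Combine [su]mul_lohi into a pair of 24-bit multiplies when both operands
// are known to fit in 24 bits.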
SDValue
AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated. Since
  // we can assume the high bits are whatever we want, use the underlying value
  // to avoid the unknown high bits from interfering.
  if (N0.getOpcode() == ISD::ANY_EXTEND)
    N0 = N0.getOperand(0);
  if (N1.getOpcode() == ISD::ANY_EXTEND)
    N1 = N1.getOperand(0);

  // Try to use two fast 24-bit multiplies (one for each half of the result)
  // instead of one slow extending multiply.
  unsigned LoOpcode = 0;
  unsigned HiOpcode = 0;
  if (Signed) {
    if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
      N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
      N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
      LoOpcode = AMDGPUISD::MUL_I24;
      HiOpcode = AMDGPUISD::MULHI_I24;
    }
  } else {
    if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
      N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
      N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
      LoOpcode = AMDGPUISD::MUL_U24;
      HiOpcode = AMDGPUISD::MULHI_U24;
    }
  }
  if (!LoOpcode)
    return SDValue();

  SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
  SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
  DCI.CombineTo(N, Lo, Hi);
  return SDValue(N, 0);
}

SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (!Subtarget->hasMulI24() || VT.isVector())
    return SDValue();

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
  // valu op anyway)
  if (Subtarget->hasSMulHi() && !N->isDivergent())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  if (!isI24(N0, DAG) || !isI24(N1, DAG))
    return SDValue();

  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);

  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
  DCI.AddToWorklist(Mulhi.getNode());
  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
}

SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
    return SDValue();

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
  // valu op anyway)
  if (Subtarget->hasSMulHi() && !N->isDivergent())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  if (!isU24(N0, DAG) || !isU24(N1, DAG))
    return SDValue();

  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);

  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
  DCI.AddToWorklist(Mulhi.getNode());
  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
}

SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
                                          SDValue Op,
                                          const SDLoc &DL,
                                          unsigned Opc) const {
  EVT VT = Op.getValueType();
  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
                              LegalVT != MVT::i16))
    return SDValue();

  if (VT != MVT::i32)
    Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);

  SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
  if (VT != MVT::i32)
    FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);

  return FFBX;
}

// The native instructions return -1 on 0 input. Optimize out a select that
// produces -1 on 0.
//
// TODO: If zero is not undef, we could also do this if the output is compared
// against the bitwidth.
//
// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
                                                 SDValue LHS, SDValue RHS,
                                                 DAGCombinerInfo &DCI) const {
  if (!isNullConstant(Cond.getOperand(1)))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
  SDValue CmpLHS = Cond.getOperand(0);

  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
  if (CCOpcode == ISD::SETEQ &&
      (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
      RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
    unsigned Opc =
        isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
  }

  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
  if (CCOpcode == ISD::SETNE &&
      (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
      LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
    unsigned Opc =
        isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;

    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
  }

  return SDValue();
}

static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
                                         unsigned Op,
                                         const SDLoc &SL,
                                         SDValue Cond,
                                         SDValue N1,
                                         SDValue N2) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N1.getValueType();

  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
                                  N1.getOperand(0), N2.getOperand(0));
  DCI.AddToWorklist(NewSelect.getNode());
  return DAG.getNode(Op, SL, VT, NewSelect);
}

// Pull a free FP operation out of a select so it may fold into uses.
//
// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
// select c, (fneg x), k -> fneg (select c, x, (fneg k))
//
// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
// select c, (fabs x), +k -> fabs (select c, x, k)
SDValue
AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
                                           SDValue N) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Cond = N.getOperand(0);
  SDValue LHS = N.getOperand(1);
  SDValue RHS = N.getOperand(2);

  EVT VT = N.getValueType();
  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
      (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
    if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
      return SDValue();

    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
                                     SDLoc(N), Cond, LHS, RHS);
  }

  bool Inv = false;
  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
    std::swap(LHS, RHS);
    Inv = true;
  }

  // TODO: Support vector constants.
  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
      !selectSupportsSourceMods(N.getNode())) {
    SDLoc SL(N);
    // If one side is an fneg/fabs and the other is a constant, we can push the
    // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
    SDValue NewLHS = LHS.getOperand(0);
    SDValue NewRHS = RHS;

    // Careful: if the neg can be folded up, don't try to pull it back down.
    bool ShouldFoldNeg = true;

    if (NewLHS.hasOneUse()) {
      unsigned Opc = NewLHS.getOpcode();
      if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
        ShouldFoldNeg = false;
      if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
        ShouldFoldNeg = false;
    }

    if (ShouldFoldNeg) {
      if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
        return SDValue();

      // We're going to be forced to use a source modifier anyway, there's no
      // point to pulling the negate out unless we can get a size reduction by
      // negating the constant.
      //
      // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
      // about cheaper constants.
      if (NewLHS.getOpcode() == ISD::FABS &&
          getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
        return SDValue();

      if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
        return SDValue();

      if (LHS.getOpcode() == ISD::FNEG)
        NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

      if (Inv)
        std::swap(NewLHS, NewRHS);

      SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
                                      Cond, NewLHS, NewRHS);
      DCI.AddToWorklist(NewSelect.getNode());
      return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
    }
  }

  return SDValue();
}

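// Combines for select: pull free fneg/fabs out of both arms, swap the inputs
// to move constants to the false operand, and match the legacy fmin/fmax and
// ffbh/ffbl patterns.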
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
    return Folded;

  SDValue Cond = N->getOperand(0);
  if (Cond.getOpcode() != ISD::SETCC)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue LHS = Cond.getOperand(0);
  SDValue RHS = Cond.getOperand(1);
  SDValue CC = Cond.getOperand(2);

  SDValue True = N->getOperand(1);
  SDValue False = N->getOperand(2);

  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
    SelectionDAG &DAG = DCI.DAG;
    if (DAG.isConstantValueOfAnyType(True) &&
        !DAG.isConstantValueOfAnyType(False)) {
      // Swap cmp + select pair to move constant to false input.
      // This will allow using VOPC cndmasks more often.
      // select (setcc x, y), k, x -> select (setccinv x, y), x, k

      SDLoc SL(N);
      ISD::CondCode NewCC =
          getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());

      SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
      return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
    }

    if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
      SDValue MinMax
        = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
      // Revisit this node so we can catch min3/max3/med3 patterns.
      //DCI.AddToWorklist(MinMax.getNode());
      return MinMax;
    }
  }

  // There's no reason to not do this if the condition has other uses.
  return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
}

static bool isInv2Pi(const APFloat &APF) {
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) ||
         APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}

// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
// additional cost to negate them.
TargetLowering::NegatibleCost
AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
  if (C->isZero())
    return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;

  if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
    return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;

  return NegatibleCost::Neutral;
}

bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
    return getConstantNegateCost(C) == NegatibleCost::Expensive;
  return false;
}

bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
    return getConstantNegateCost(C) == NegatibleCost::Cheaper;
  return false;
}

static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case ISD::FMAXNUM:
    return ISD::FMINNUM;
  case ISD::FMINNUM:
    return ISD::FMAXNUM;
  case ISD::FMAXNUM_IEEE:
    return ISD::FMINNUM_IEEE;
  case ISD::FMINNUM_IEEE:
    return ISD::FMAXNUM_IEEE;
  case ISD::FMAXIMUM:
    return ISD::FMINIMUM;
  case ISD::FMINIMUM:
    return ISD::FMAXIMUM;
  case AMDGPUISD::FMAX_LEGACY:
    return AMDGPUISD::FMIN_LEGACY;
  case AMDGPUISD::FMIN_LEGACY:
    return AMDGPUISD::FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

/// \return true if it's profitable to try to push an fneg into its source
/// instruction.
bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
  if (N0.hasOneUse()) {
    // This may be able to fold into the source, but at a code size cost. Don't
    // fold if the fold into the user is free.
    if (allUsesHaveSourceMods(N, 0))
      return false;
  } else {
    if (fnegFoldsIntoOp(N0.getNode()) &&
        (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
      return false;
  }

  return true;
}

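// Push an fneg into its source operation when shouldFoldFNegIntoSrc says it
// is profitable.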
4745 SDValue
AMDGPUTargetLowering::performFNegCombine(SDNode
*N
,
4746 DAGCombinerInfo
&DCI
) const {
4747 SelectionDAG
&DAG
= DCI
.DAG
;
4748 SDValue N0
= N
->getOperand(0);
4749 EVT VT
= N
->getValueType(0);
4751 unsigned Opc
= N0
.getOpcode();
4753 if (!shouldFoldFNegIntoSrc(N
, N0
))
4759 if (!mayIgnoreSignedZero(N0
))
4762 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4763 SDValue LHS
= N0
.getOperand(0);
4764 SDValue RHS
= N0
.getOperand(1);
4766 if (LHS
.getOpcode() != ISD::FNEG
)
4767 LHS
= DAG
.getNode(ISD::FNEG
, SL
, VT
, LHS
);
4769 LHS
= LHS
.getOperand(0);
4771 if (RHS
.getOpcode() != ISD::FNEG
)
4772 RHS
= DAG
.getNode(ISD::FNEG
, SL
, VT
, RHS
);
4774 RHS
= RHS
.getOperand(0);
4776 SDValue Res
= DAG
.getNode(ISD::FADD
, SL
, VT
, LHS
, RHS
, N0
->getFlags());
4777 if (Res
.getOpcode() != ISD::FADD
)
4778 return SDValue(); // Op got folded away.
4779 if (!N0
.hasOneUse())
4780 DAG
.ReplaceAllUsesWith(N0
, DAG
.getNode(ISD::FNEG
, SL
, VT
, Res
));
  case AMDGPUISD::FMUL_LEGACY: {
    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
    // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(0);
    else if (RHS.getOpcode() == ISD::FNEG)
      RHS = RHS.getOperand(0);
    else
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMA:
  case ISD::FMAD: {
    // TODO: handle llvm.amdgcn.fma.legacy
    if (!mayIgnoreSignedZero(N0))
      return SDValue();

    // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
    SDValue LHS = N0.getOperand(0);
    SDValue MHS = N0.getOperand(1);
    SDValue RHS = N0.getOperand(2);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(0);
    else if (MHS.getOpcode() == ISD::FNEG)
      MHS = MHS.getOperand(0);
    else
      MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    else
      RHS = RHS.getOperand(0);

    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMIN_LEGACY: {
    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
    // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
    // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
    // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)

    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    // 0 doesn't have a negated inline immediate.
    // TODO: This constant check should be generalized to other operations.
    if (isConstantCostlierToNegate(RHS))
      return SDValue();

    SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    unsigned Opposite = inverseMinMax(Opc);

    SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
    if (Res.getOpcode() != Opposite)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case AMDGPUISD::FMED3: {
    SDValue Ops[3];
    for (unsigned I = 0; I < 3; ++I)
      Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I),
                           N0->getFlags());

    SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
    if (Res.getOpcode() != AMDGPUISD::FMED3)
      return SDValue(); // Op got folded away.

    if (!N0.hasOneUse()) {
      SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
      DAG.ReplaceAllUsesWith(N0, Neg);

      for (SDNode *U : Neg->uses())
        DCI.AddToWorklist(U);
    }

    return Res;
  }
  case ISD::FP_EXTEND:
  case ISD::FNEARBYINT: // XXX - Should fround be handled?
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW: {
    SDValue CvtSrc = N0.getOperand(0);
    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_extend (fneg x))) -> (fp_extend x)
      // (fneg (rcp (fneg x))) -> (rcp x)
      return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_extend x)) -> (fp_extend (fneg x))
    // (fneg (rcp x)) -> (rcp (fneg x))
    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
  }
  case ISD::FP_ROUND: {
    SDValue CvtSrc = N0.getOperand(0);

    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_round (fneg x))) -> (fp_round x)
      return DAG.getNode(ISD::FP_ROUND, SL, VT,
                         CvtSrc.getOperand(0), N0.getOperand(1));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_round x)) -> (fp_round (fneg x))
    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
  }
  case ISD::FP16_TO_FP: {
    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
    // f16, but legalization of f16 fneg ends up pulling it out of the source.
    // Put the fneg back as a legal source operation that can be matched later.
    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();

    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
    SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
                                  DAG.getConstant(0x8000, SL, SrcVT));
    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
  }
  case ISD::SELECT: {
    // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
    // TODO: Invert conditions of foldFreeOpFromSelect
    return SDValue();
  }
  case ISD::BITCAST: {
    SDValue BCSrc = N0.getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
      if (HighBits.getValueType().getSizeInBits() != 32 ||
          !fnegFoldsIntoOp(HighBits.getNode()))
        return SDValue();

      // f64 fneg only really needs to operate on the high half of the
      // register, so try to force it to an f32 operation to help make use of
      // source modifiers.
      //
      // fneg (f64 (bitcast (build_vector x, y))) ->
      // f64 (bitcast (build_vector (bitcast i32:x to f32),
      //                            (fneg (bitcast i32:y to f32)))
      SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
      SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
      SDValue CastBack =
          DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);

      SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
      Ops.back() = CastBack;
      DCI.AddToWorklist(NegHi.getNode());
      SDValue Build =
          DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
      SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);

      if (!N0.hasOneUse())
        DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
      return Result;
    }

    if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
        BCSrc.hasOneUse()) {
      // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
      //   select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)

      // TODO: Cast back result for multiple uses is beneficial in some cases.

      SDValue LHS =
          DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
      SDValue RHS =
          DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));

      SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
      SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);

      return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
                         NegRHS);
    }

    return SDValue();
  }
  default:
    return SDValue();
  }
}
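// Net effect, sketched on a simple case: an expression like
//   %a = fmul float %x, %y
//   %b = fneg float %a
// is rewritten so the fneg sits on an operand (fmul %x, (fneg %y)), where
// instruction selection can usually absorb it as a source modifier instead of
// emitting a separate negate.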
SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);

  if (!N0.hasOneUse())
    return SDValue();

  switch (N0.getOpcode()) {
  case ISD::FP16_TO_FP: {
    assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
    SDLoc SL(N);
    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();

    // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
    SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
                                  DAG.getConstant(0x7fff, SL, SrcVT));
    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
  }
  default:
    return SDValue();
  }
}
SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
  if (!CFP)
    return SDValue();

  // XXX - Should this flush denormals?
  const APFloat &Val = CFP->getValueAPF();
  APFloat One(Val.getSemantics(), "1.0");
  return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
}
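// Constant-folding example: rcp of the constant 2.0 becomes the constant 0.5
// directly, leaving no RCP node in the DAG.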
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  switch(N->getOpcode()) {
  default:
    break;
  case ISD::BITCAST: {
    EVT DestVT = N->getValueType(0);

    // Push casts through vector builds. This helps avoid emitting a large
    // number of copies when materializing floating point vector constants.
    //
    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
    if (DestVT.isVector()) {
      SDValue Src = N->getOperand(0);
      if (Src.getOpcode() == ISD::BUILD_VECTOR &&
          (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
           isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
        EVT SrcVT = Src.getValueType();
        unsigned NElts = DestVT.getVectorNumElements();

        if (SrcVT.getVectorNumElements() == NElts) {
          EVT DestEltVT = DestVT.getVectorElementType();

          SmallVector<SDValue, 8> CastedElts;
          SDLoc SL(N);
          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
            SDValue Elt = Src.getOperand(I);
            CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
          }

          return DAG.getBuildVector(DestVT, SL, CastedElts);
        }
      }
    }

    if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
      break;

    // Fold bitcasts of constants.
    //
    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
    // TODO: Generalize and move to DAGCombiner
    SDValue Src = N->getOperand(0);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
      SDLoc SL(N);
      uint64_t CVal = C->getZExtValue();
      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                               DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
                               DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
      return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
    }

    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
      const APInt &Val = C->getValueAPF().bitcastToAPInt();
      SDLoc SL(N);
      uint64_t CVal = Val.getZExtValue();
      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
      return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
    }

    break;
  }
  case ISD::SHL: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    return performShlCombine(N, DCI);
  }
  case ISD::SRL: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    return performSrlCombine(N, DCI);
  }
  case ISD::SRA: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    return performSraCombine(N, DCI);
  }
  case ISD::TRUNCATE:
    return performTruncateCombine(N, DCI);
  case ISD::MUL:
    return performMulCombine(N, DCI);
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    if (SDValue Simplified = simplifyMul24(N, DCI))
      return Simplified;
    break;
  }
  case AMDGPUISD::MULHI_I24:
  case AMDGPUISD::MULHI_U24:
    return simplifyMul24(N, DCI);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return performMulLoHiCombine(N, DCI);
  case ISD::MULHS:
    return performMulhsCombine(N, DCI);
  case ISD::MULHU:
    return performMulhuCombine(N, DCI);
  case ISD::SELECT:
    return performSelectCombine(N, DCI);
  case ISD::FNEG:
    return performFNegCombine(N, DCI);
  case ISD::FABS:
    return performFAbsCombine(N, DCI);
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    assert(!N->getValueType(0).isVector() &&
           "Vector handling of BFE not implemented");
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    if (WidthVal == 0)
      return DAG.getConstant(0, DL, MVT::i32);

    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    SDValue BitsFrom = N->getOperand(0);
    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;

    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;

    if (OffsetVal == 0) {
      // This is already sign / zero extended, so try to fold away extra BFEs.
      unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);

      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
      if (OpSignBits >= SignBits)
        return BitsFrom;

      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
      if (Signed) {
        // This is a sign_extend_inreg. Replace it to take advantage of
        // existing DAG Combines. If not eliminated, we will match back to BFE
        // during selection.

        // TODO: The sext_inreg of extended types ends, although we could
        // handle them in a single BFE.
        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
                           DAG.getValueType(SmallVT));
      }

      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
    }

    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
      if (Signed) {
        return constantFoldBFE<int32_t>(DAG,
                                        CVal->getSExtValue(),
                                        OffsetVal,
                                        WidthVal,
                                        DL);
      }

      return constantFoldBFE<uint32_t>(DAG,
                                       CVal->getZExtValue(),
                                       OffsetVal,
                                       WidthVal,
                                       DL);
    }

    if ((OffsetVal + WidthVal) >= 32 &&
        !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
      SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
                         BitsFrom, ShiftVal);
    }

    if (BitsFrom.hasOneUse()) {
      APInt Demanded = APInt::getBitsSet(32,
                                         OffsetVal,
                                         OffsetVal + WidthVal);

      KnownBits Known;
      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                            !DCI.isBeforeLegalizeOps());
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
          TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
        DCI.CommitTargetLoweringOpt(TLO);
      }
    }

    break;
  }
  case ISD::LOAD:
    return performLoadCombine(N, DCI);
  case ISD::STORE:
    return performStoreCombine(N, DCI);
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_IFLAG:
    return performRcpCombine(N, DCI);
  case ISD::AssertZext:
  case ISD::AssertSext:
    return performAssertSZExtCombine(N, DCI);
  case ISD::INTRINSIC_WO_CHAIN:
    return performIntrinsicWOChainCombine(N, DCI);
  case AMDGPUISD::FMAD_FTZ: {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue N2 = N->getOperand(2);
    EVT VT = N->getValueType(0);

    // FMAD_FTZ is a FMAD + flush denormals to zero.
    // We flush the inputs, the intermediate step, and the output.
    ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
    ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
    ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
    if (N0CFP && N1CFP && N2CFP) {
      const auto FTZ = [](const APFloat &V) {
        if (V.isDenormal()) {
          APFloat Zero(V.getSemantics(), 0);
          return V.isNegative() ? -Zero : Zero;
        }
        return V;
      };

      APFloat V0 = FTZ(N0CFP->getValueAPF());
      APFloat V1 = FTZ(N1CFP->getValueAPF());
      APFloat V2 = FTZ(N2CFP->getValueAPF());
      V0.multiply(V1, APFloat::rmNearestTiesToEven);
      V0 = FTZ(V0);
      V0.add(V2, APFloat::rmNearestTiesToEven);
      return DAG.getConstantFP(FTZ(V0), DL, VT);
    }
    break;
  }
  }
  return SDValue();
}
//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                                   const TargetRegisterClass *RC,
                                                   Register Reg, EVT VT,
                                                   const SDLoc &SL,
                                                   bool RawReg) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register VReg;

  if (!MRI.isLiveIn(Reg)) {
    VReg = MRI.createVirtualRegister(RC);
    MRI.addLiveIn(Reg, VReg);
  } else {
    VReg = MRI.getLiveInVirtReg(Reg);
  }

  if (RawReg)
    return DAG.getRegister(VReg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
}
// This may be called multiple times, and nothing prevents creating multiple
// objects at the same offset. See if we already defined this object.
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
                                       int64_t Offset) {
  for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
    if (MFI.getObjectOffset(I) == Offset) {
      assert(MFI.getObjectSize(I) == Size);
      return I;
    }
  }

  return MFI.CreateFixedObject(Size, Offset, true);
}
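// Example of the reuse behaviour above: two calls with (Size = 4, Offset = 8)
// return the same fixed frame index instead of creating a second object at
// offset 8; a call with a different offset creates and returns a new fixed
// object.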
SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
                                                  EVT VT,
                                                  const SDLoc &SL,
                                                  int64_t Offset) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);

  auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
  SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);

  return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
                                                   const SDLoc &SL,
                                                   SDValue Chain,
                                                   SDValue ArgVal,
                                                   int64_t Offset) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
  // Stores to the argument stack area are relative to the stack pointer.
  SDValue SP =
      DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
  Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
  SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
                               MachineMemOperand::MODereferenceable);
  return Store;
}
SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
                                             const TargetRegisterClass *RC,
                                             EVT VT, const SDLoc &SL,
                                             const ArgDescriptor &Arg) const {
  assert(Arg && "Attempting to load missing argument");

  SDValue V = Arg.isRegister() ?
    CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
    loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());

  if (!Arg.isMasked())
    return V;

  unsigned Mask = Arg.getMask();
  unsigned Shift = llvm::countr_zero<unsigned>(Mask);
  V = DAG.getNode(ISD::SRL, SL, VT, V,
                  DAG.getShiftAmountConstant(Shift, VT, SL));
  return DAG.getNode(ISD::AND, SL, VT, V,
                     DAG.getConstant(Mask >> Shift, SL, VT));
}
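// Masked-argument sketch (hypothetical mask value): with Arg.getMask() ==
// 0x3ff00000, Shift is 20, so the loaded value is shifted right by 20 and
// then ANDed with 0x3ff to isolate the 10-bit field packed into the register.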
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
    uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
  unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
  const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
  uint64_t ArgOffset =
      alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
  switch (Param) {
  case FIRST_IMPLICIT:
    return ArgOffset;
  case PRIVATE_BASE:
    return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
  case SHARED_BASE:
    return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
  case QUEUE_PTR:
    return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
  }
  llvm_unreachable("unexpected implicit parameter type");
}

uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
    const MachineFunction &MF, const ImplicitParameter Param) const {
  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
  return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
}
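// Worked example (hypothetical sizes): with ExplicitKernArgSize = 36, an
// implicit-arg alignment of 8, and ExplicitArgOffset = 0, ArgOffset is
// alignTo(36, 8) + 0 = 40; each enumerator then adds its fixed byte offset
// (e.g. QUEUE_PTR_OFFSET) on top of that base.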
#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;

const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((AMDGPUISD::NodeType)Opcode) {
  case AMDGPUISD::FIRST_NUMBER: break;
  NODE_NAME_CASE(UMUL);
  NODE_NAME_CASE(BRANCH_COND);
  NODE_NAME_CASE(IF)
  NODE_NAME_CASE(ELSE)
  NODE_NAME_CASE(LOOP)
  NODE_NAME_CASE(CALL)
  NODE_NAME_CASE(TC_RETURN)
  NODE_NAME_CASE(TC_RETURN_GFX)
  NODE_NAME_CASE(TC_RETURN_CHAIN)
  NODE_NAME_CASE(TRAP)
  NODE_NAME_CASE(RET_GLUE)
  NODE_NAME_CASE(WAVE_ADDRESS)
  NODE_NAME_CASE(RETURN_TO_EPILOG)
  NODE_NAME_CASE(ENDPGM)
  NODE_NAME_CASE(ENDPGM_TRAP)
  NODE_NAME_CASE(SIMULATED_TRAP)
  NODE_NAME_CASE(DWORDADDR)
  NODE_NAME_CASE(FRACT)
  NODE_NAME_CASE(SETCC)
  NODE_NAME_CASE(SETREG)
  NODE_NAME_CASE(DENORM_MODE)
  NODE_NAME_CASE(FMA_W_CHAIN)
  NODE_NAME_CASE(FMUL_W_CHAIN)
  NODE_NAME_CASE(CLAMP)
  NODE_NAME_CASE(COS_HW)
  NODE_NAME_CASE(SIN_HW)
  NODE_NAME_CASE(FMAX_LEGACY)
  NODE_NAME_CASE(FMIN_LEGACY)
  NODE_NAME_CASE(FMAX3)
  NODE_NAME_CASE(SMAX3)
  NODE_NAME_CASE(UMAX3)
  NODE_NAME_CASE(FMIN3)
  NODE_NAME_CASE(SMIN3)
  NODE_NAME_CASE(UMIN3)
  NODE_NAME_CASE(FMED3)
  NODE_NAME_CASE(SMED3)
  NODE_NAME_CASE(UMED3)
  NODE_NAME_CASE(FMAXIMUM3)
  NODE_NAME_CASE(FMINIMUM3)
  NODE_NAME_CASE(FDOT2)
  NODE_NAME_CASE(URECIP)
  NODE_NAME_CASE(DIV_SCALE)
  NODE_NAME_CASE(DIV_FMAS)
  NODE_NAME_CASE(DIV_FIXUP)
  NODE_NAME_CASE(FMAD_FTZ)
  NODE_NAME_CASE(RCP)
  NODE_NAME_CASE(RSQ)
  NODE_NAME_CASE(RCP_LEGACY)
  NODE_NAME_CASE(RCP_IFLAG)
  NODE_NAME_CASE(FMUL_LEGACY)
  NODE_NAME_CASE(RSQ_CLAMP)
  NODE_NAME_CASE(FP_CLASS)
  NODE_NAME_CASE(DOT4)
  NODE_NAME_CASE(CARRY)
  NODE_NAME_CASE(BORROW)
  NODE_NAME_CASE(BFE_U32)
  NODE_NAME_CASE(BFE_I32)
  NODE_NAME_CASE(FFBH_U32)
  NODE_NAME_CASE(FFBH_I32)
  NODE_NAME_CASE(FFBL_B32)
  NODE_NAME_CASE(MUL_U24)
  NODE_NAME_CASE(MUL_I24)
  NODE_NAME_CASE(MULHI_U24)
  NODE_NAME_CASE(MULHI_I24)
  NODE_NAME_CASE(MAD_U24)
  NODE_NAME_CASE(MAD_I24)
  NODE_NAME_CASE(MAD_I64_I32)
  NODE_NAME_CASE(MAD_U64_U32)
  NODE_NAME_CASE(PERM)
  NODE_NAME_CASE(TEXTURE_FETCH)
  NODE_NAME_CASE(R600_EXPORT)
  NODE_NAME_CASE(CONST_ADDRESS)
  NODE_NAME_CASE(REGISTER_LOAD)
  NODE_NAME_CASE(REGISTER_STORE)
  NODE_NAME_CASE(SAMPLE)
  NODE_NAME_CASE(SAMPLEB)
  NODE_NAME_CASE(SAMPLED)
  NODE_NAME_CASE(SAMPLEL)
  NODE_NAME_CASE(CVT_F32_UBYTE0)
  NODE_NAME_CASE(CVT_F32_UBYTE1)
  NODE_NAME_CASE(CVT_F32_UBYTE2)
  NODE_NAME_CASE(CVT_F32_UBYTE3)
  NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
  NODE_NAME_CASE(CVT_PKNORM_I16_F32)
  NODE_NAME_CASE(CVT_PKNORM_U16_F32)
  NODE_NAME_CASE(CVT_PK_I16_I32)
  NODE_NAME_CASE(CVT_PK_U16_U32)
  NODE_NAME_CASE(FP_TO_FP16)
  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
  NODE_NAME_CASE(CONST_DATA_PTR)
  NODE_NAME_CASE(PC_ADD_REL_OFFSET)
  NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
  NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
  NODE_NAME_CASE(DUMMY_CHAIN)
  case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
  NODE_NAME_CASE(LOAD_D16_HI)
  NODE_NAME_CASE(LOAD_D16_LO)
  NODE_NAME_CASE(LOAD_D16_HI_I8)
  NODE_NAME_CASE(LOAD_D16_HI_U8)
  NODE_NAME_CASE(LOAD_D16_LO_I8)
  NODE_NAME_CASE(LOAD_D16_LO_U8)
  NODE_NAME_CASE(STORE_MSKOR)
  NODE_NAME_CASE(LOAD_CONSTANT)
  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
  NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
  NODE_NAME_CASE(DS_ORDERED_COUNT)
  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
  NODE_NAME_CASE(BUFFER_LOAD)
  NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
  NODE_NAME_CASE(BUFFER_LOAD_USHORT)
  NODE_NAME_CASE(BUFFER_LOAD_BYTE)
  NODE_NAME_CASE(BUFFER_LOAD_SHORT)
  NODE_NAME_CASE(BUFFER_LOAD_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
  NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
  NODE_NAME_CASE(SBUFFER_LOAD)
  NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
  NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
  NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
  NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
  NODE_NAME_CASE(BUFFER_STORE)
  NODE_NAME_CASE(BUFFER_STORE_BYTE)
  NODE_NAME_CASE(BUFFER_STORE_SHORT)
  NODE_NAME_CASE(BUFFER_STORE_FORMAT)
  NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
  NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
  NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
  NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
  NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
  NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
  NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
  NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
  NODE_NAME_CASE(BUFFER_ATOMIC_AND)
  NODE_NAME_CASE(BUFFER_ATOMIC_OR)
  NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
  NODE_NAME_CASE(BUFFER_ATOMIC_INC)
  NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
  NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
  NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
  NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
  NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
  NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
  NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
  case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
  }
  return nullptr;
}
SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
                                              SelectionDAG &DAG, int Enabled,
                                              int &RefinementSteps,
                                              bool &UseOneConstNR,
                                              bool Reciprocal) const {
  EVT VT = Operand.getValueType();

  if (VT == MVT::f32) {
    RefinementSteps = 0;
    return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
  }

  // TODO: There is also f64 rsq instruction, but the documentation is less
  // clear on its precision.

  return SDValue();
}
SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
                                               SelectionDAG &DAG, int Enabled,
                                               int &RefinementSteps) const {
  EVT VT = Operand.getValueType();

  if (VT == MVT::f32) {
    // Reciprocal, < 1 ulp error.
    //
    // This reciprocal approximation converges to < 0.5 ulp error with one
    // Newton-Raphson step performed with two fused multiply-adds (FMAs).

    RefinementSteps = 0;
    return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
  }

  // TODO: There is also f64 rcp instruction, but the documentation is less
  // clear on its precision.

  return SDValue();
}
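// For reference, one Newton-Raphson step for r ~= 1/x refines an estimate e as
//   e' = e + e * (1 - x * e)
// which is the two-FMA refinement the comment above refers to; with
// RefinementSteps left at 0 the raw RCP estimate is used directly for f32.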
static unsigned workitemIntrinsicDim(unsigned ID) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    return 0;
  case Intrinsic::amdgcn_workitem_id_y:
    return 1;
  case Intrinsic::amdgcn_workitem_id_z:
    return 2;
  default:
    llvm_unreachable("not a workitem intrinsic");
  }
}
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known,
    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
  Known.resetAll(); // Don't know anything.

  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  default:
    break;
  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW: {
    Known.Zero = APInt::getHighBitsSet(32, 31);
    break;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!CWidth)
      return;

    uint32_t Width = CWidth->getZExtValue() & 0x1f;

    if (Opc == AMDGPUISD::BFE_U32)
      Known.Zero = APInt::getHighBitsSet(32, 32 - Width);

    break;
  }
  case AMDGPUISD::FP_TO_FP16: {
    unsigned BitWidth = Known.getBitWidth();

    // High bits are zero.
    Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
    break;
  }
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
                      RHSKnown.countMinTrailingZeros();
    Known.Zero.setLowBits(std::min(TrailZ, 32u));
    // Skip extra check if all bits are known zeros.
    if (TrailZ >= 32)
      break;

    // Truncate to 24 bits.
    LHSKnown = LHSKnown.trunc(24);
    RHSKnown = RHSKnown.trunc(24);

    if (Opc == AMDGPUISD::MUL_I24) {
      unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
      unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits > 32)
        break;
      unsigned SignBits = 32 - MaxValBits + 1;
      bool LHSNegative = LHSKnown.isNegative();
      bool LHSNonNegative = LHSKnown.isNonNegative();
      bool LHSPositive = LHSKnown.isStrictlyPositive();
      bool RHSNegative = RHSKnown.isNegative();
      bool RHSNonNegative = RHSKnown.isNonNegative();
      bool RHSPositive = RHSKnown.isStrictlyPositive();

      if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
        Known.Zero.setHighBits(SignBits);
      else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
        Known.One.setHighBits(SignBits);
    } else {
      unsigned LHSValBits = LHSKnown.countMaxActiveBits();
      unsigned RHSValBits = RHSKnown.countMaxActiveBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits >= 32)
        break;
      Known.Zero.setBitsFrom(MaxValBits);
    }
    break;
  }
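  // Example for the unsigned path above: if each truncated operand has at
  // most 12 possibly-set bits, MaxValBits is 24, so bits 24..31 of the
  // MUL_U24 result are known zero.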
  case AMDGPUISD::PERM: {
    ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!CMask)
      return;

    KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    unsigned Sel = CMask->getZExtValue();

    for (unsigned I = 0; I < 32; I += 8) {
      unsigned SelBits = Sel & 0xff;
      if (SelBits < 4) {
        SelBits *= 8;
        Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits < 7) {
        SelBits = (SelBits & 3) * 8;
        Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits == 0x0c) {
        Known.Zero |= 0xFFull << I;
      } else if (SelBits > 0x0c) {
        Known.One |= 0xFFull << I;
      }
      Sel >>= 8;
    }
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_UBYTE: {
    Known.Zero.setHighBits(24);
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_USHORT: {
    Known.Zero.setHighBits(16);
    break;
  }
  case AMDGPUISD::LDS: {
    auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
    Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());

    Known.Zero.setHighBits(16);
    Known.Zero.setLowBits(Log2(Alignment));
    break;
  }
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = Op.getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_workitem_id_x:
    case Intrinsic::amdgcn_workitem_id_y:
    case Intrinsic::amdgcn_workitem_id_z: {
      unsigned MaxValue = Subtarget->getMaxWorkitemID(
          DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
      Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
      break;
    }
    default:
      break;
    }
    break;
  }
  }
}
unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  switch (Op.getOpcode()) {
  case AMDGPUISD::BFE_I32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!Width)
      return 1;

    unsigned SignBits = 32 - Width->getZExtValue() + 1;
    if (!isNullConstant(Op.getOperand(1)))
      return SignBits;

    // TODO: Could probably figure something out with non-0 offsets.
    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    return std::max(SignBits, Op0SignBits);
  }

  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
  }

  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW:
    return 31;
  case AMDGPUISD::BUFFER_LOAD_BYTE:
    return 25;
  case AMDGPUISD::BUFFER_LOAD_SHORT:
    return 17;
  case AMDGPUISD::BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPUISD::BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPUISD::FP_TO_FP16:
    return 16;
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
    if (Tmp2 == 1)
      return 1; // Early out.

    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
    if (Tmp1 == 1)
      return 1; // Early out.

    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    if (Tmp0 == 1)
      return 1; // Early out.

    return std::min({Tmp0, Tmp1, Tmp2});
  }
  default:
    return 1;
  }
}
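// Example: bfe_i32 with offset 0 and width 8 sign-extends an 8-bit field, so
// the result has at least 32 - 8 + 1 = 25 sign bits; bfe_u32 with width 8
// zero-extends it, giving 32 - 8 = 24 known sign bits.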
unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
    GISelKnownBits &Analysis, Register R,
    const APInt &DemandedElts, const MachineRegisterInfo &MRI,
    unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  if (!MI)
    return 1;

  // TODO: Check range metadata on MMO.
  switch (MI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
    return 25;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
    return 17;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
    unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
    if (Tmp2 == 1)
      return 1;
    unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
    if (Tmp1 == 1)
      return 1;
    unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
    if (Tmp0 == 1)
      return 1;
    return std::min({Tmp0, Tmp1, Tmp2});
  }
  default:
    return 1;
  }
}
bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                        const SelectionDAG &DAG,
                                                        bool SNaN,
                                                        unsigned Depth) const {
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    if (SNaN)
      return true;

    // TODO: Can check no nans on one of the operands for each one, but which
    // one?
    return false;
  }
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::CVT_PKRTZ_F16_F32: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
  }
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMINIMUM3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMAD_FTZ: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
  }
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return true;

  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_CLAMP: {
    if (SNaN)
      return true;

    // TODO: Need is known positive check.
    return false;
  }
  case AMDGPUISD::FRACT: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
    // TODO: Refine on operands.
    return SNaN;
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW: {
    // TODO: Need check for infinity
    return SNaN;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cubeid:
      return true;

    case Intrinsic::amdgcn_frexp_mant: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_cvt_pkrtz: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp: {
      if (SNaN)
        return true;

      // TODO: Need is known positive check.
      return false;
    }
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_fdot2:
      // TODO: Refine on operand
      return SNaN;
    case Intrinsic::amdgcn_fma_legacy:
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}
bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                               Register N0, Register N1) const {
  return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}
TargetLowering::AtomicExpansionKind
AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::Xchg: {
    const DataLayout &DL = RMW->getFunction()->getDataLayout();
    unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
    if (ValSize == 32 || ValSize == 64)
      return AtomicExpansionKind::None;
    return AtomicExpansionKind::CmpXChg;
  }
  default: {
    if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
      unsigned Size = IntTy->getBitWidth();
      if (Size == 32 || Size == 64)
        return AtomicExpansionKind::None;
    }

    return AtomicExpansionKind::CmpXChg;
  }
  }
}
/// Whether it is profitable to sink the operands of an
/// Instruction I to the basic block of I.
/// This helps using several modifiers (like abs and neg) more often.
bool AMDGPUTargetLowering::shouldSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace PatternMatch;

  for (auto &Op : I->operands()) {
    // Ensure we are not already sinking this operand.
    if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
      continue;

    if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
      Ops.push_back(&Op);
  }

  return !Ops.empty();
}