//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelDAGToDAG.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"

using namespace llvm;

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
/// createNVPTXISelDag - This pass converts a legalized DAG into a
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
                                       llvm::CodeGenOptLevel OptLevel) {
  return new NVPTXDAGToDAGISel(TM, OptLevel);
}

char NVPTXDAGToDAGISel::ID = 0;

INITIALIZE_PASS(NVPTXDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)

NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOptLevel OptLevel)
    : SelectionDAGISel(ID, tm, OptLevel), TM(tm) {
  doMulWide = (OptLevel > CodeGenOptLevel::None);
}
bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

int NVPTXDAGToDAGISel::getDivF32Level() const {
  return Subtarget->getTargetLowering()->getDivF32Level();
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32();
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowUnsafeFPMath(*MF);
}

bool NVPTXDAGToDAGISel::useShortPointers() const {
  return TM.useShortPointers();
}
/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  case NVPTXISD::Tex1DFloatS32:
  case NVPTXISD::Tex1DFloatFloat:
  case NVPTXISD::Tex1DFloatFloatLevel:
  case NVPTXISD::Tex1DFloatFloatGrad:
  case NVPTXISD::Tex1DS32S32:
  case NVPTXISD::Tex1DS32Float:
  case NVPTXISD::Tex1DS32FloatLevel:
  case NVPTXISD::Tex1DS32FloatGrad:
  case NVPTXISD::Tex1DU32S32:
  case NVPTXISD::Tex1DU32Float:
  case NVPTXISD::Tex1DU32FloatLevel:
  case NVPTXISD::Tex1DU32FloatGrad:
  case NVPTXISD::Tex1DArrayFloatS32:
  case NVPTXISD::Tex1DArrayFloatFloat:
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
  case NVPTXISD::Tex1DArrayS32S32:
  case NVPTXISD::Tex1DArrayS32Float:
  case NVPTXISD::Tex1DArrayS32FloatLevel:
  case NVPTXISD::Tex1DArrayS32FloatGrad:
  case NVPTXISD::Tex1DArrayU32S32:
  case NVPTXISD::Tex1DArrayU32Float:
  case NVPTXISD::Tex1DArrayU32FloatLevel:
  case NVPTXISD::Tex1DArrayU32FloatGrad:
  case NVPTXISD::Tex2DFloatS32:
  case NVPTXISD::Tex2DFloatFloat:
  case NVPTXISD::Tex2DFloatFloatLevel:
  case NVPTXISD::Tex2DFloatFloatGrad:
  case NVPTXISD::Tex2DS32S32:
  case NVPTXISD::Tex2DS32Float:
  case NVPTXISD::Tex2DS32FloatLevel:
  case NVPTXISD::Tex2DS32FloatGrad:
  case NVPTXISD::Tex2DU32S32:
  case NVPTXISD::Tex2DU32Float:
  case NVPTXISD::Tex2DU32FloatLevel:
  case NVPTXISD::Tex2DU32FloatGrad:
  case NVPTXISD::Tex2DArrayFloatS32:
  case NVPTXISD::Tex2DArrayFloatFloat:
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
  case NVPTXISD::Tex2DArrayS32S32:
  case NVPTXISD::Tex2DArrayS32Float:
  case NVPTXISD::Tex2DArrayS32FloatLevel:
  case NVPTXISD::Tex2DArrayS32FloatGrad:
  case NVPTXISD::Tex2DArrayU32S32:
  case NVPTXISD::Tex2DArrayU32Float:
  case NVPTXISD::Tex2DArrayU32FloatLevel:
  case NVPTXISD::Tex2DArrayU32FloatGrad:
  case NVPTXISD::Tex3DFloatS32:
  case NVPTXISD::Tex3DFloatFloat:
  case NVPTXISD::Tex3DFloatFloatLevel:
  case NVPTXISD::Tex3DFloatFloatGrad:
  case NVPTXISD::Tex3DS32S32:
  case NVPTXISD::Tex3DS32Float:
  case NVPTXISD::Tex3DS32FloatLevel:
  case NVPTXISD::Tex3DS32FloatGrad:
  case NVPTXISD::Tex3DU32S32:
  case NVPTXISD::Tex3DU32Float:
  case NVPTXISD::Tex3DU32FloatLevel:
  case NVPTXISD::Tex3DU32FloatGrad:
  case NVPTXISD::TexCubeFloatFloat:
  case NVPTXISD::TexCubeFloatFloatLevel:
  case NVPTXISD::TexCubeS32Float:
  case NVPTXISD::TexCubeS32FloatLevel:
  case NVPTXISD::TexCubeU32Float:
  case NVPTXISD::TexCubeU32FloatLevel:
  case NVPTXISD::TexCubeArrayFloatFloat:
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
  case NVPTXISD::TexCubeArrayS32Float:
  case NVPTXISD::TexCubeArrayS32FloatLevel:
  case NVPTXISD::TexCubeArrayU32Float:
  case NVPTXISD::TexCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4R2DFloatFloat:
  case NVPTXISD::Tld4G2DFloatFloat:
  case NVPTXISD::Tld4B2DFloatFloat:
  case NVPTXISD::Tld4A2DFloatFloat:
  case NVPTXISD::Tld4R2DS64Float:
  case NVPTXISD::Tld4G2DS64Float:
  case NVPTXISD::Tld4B2DS64Float:
  case NVPTXISD::Tld4A2DS64Float:
  case NVPTXISD::Tld4R2DU64Float:
  case NVPTXISD::Tld4G2DU64Float:
  case NVPTXISD::Tld4B2DU64Float:
  case NVPTXISD::Tld4A2DU64Float:
  case NVPTXISD::TexUnified1DFloatS32:
  case NVPTXISD::TexUnified1DFloatFloat:
  case NVPTXISD::TexUnified1DFloatFloatLevel:
  case NVPTXISD::TexUnified1DFloatFloatGrad:
  case NVPTXISD::TexUnified1DS32S32:
  case NVPTXISD::TexUnified1DS32Float:
  case NVPTXISD::TexUnified1DS32FloatLevel:
  case NVPTXISD::TexUnified1DS32FloatGrad:
  case NVPTXISD::TexUnified1DU32S32:
  case NVPTXISD::TexUnified1DU32Float:
  case NVPTXISD::TexUnified1DU32FloatLevel:
  case NVPTXISD::TexUnified1DU32FloatGrad:
  case NVPTXISD::TexUnified1DArrayFloatS32:
  case NVPTXISD::TexUnified1DArrayFloatFloat:
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified1DArrayS32S32:
  case NVPTXISD::TexUnified1DArrayS32Float:
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  case NVPTXISD::TexUnified1DArrayU32S32:
  case NVPTXISD::TexUnified1DArrayU32Float:
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  case NVPTXISD::TexUnified2DFloatS32:
  case NVPTXISD::TexUnified2DFloatFloat:
  case NVPTXISD::TexUnified2DFloatFloatLevel:
  case NVPTXISD::TexUnified2DFloatFloatGrad:
  case NVPTXISD::TexUnified2DS32S32:
  case NVPTXISD::TexUnified2DS32Float:
  case NVPTXISD::TexUnified2DS32FloatLevel:
  case NVPTXISD::TexUnified2DS32FloatGrad:
  case NVPTXISD::TexUnified2DU32S32:
  case NVPTXISD::TexUnified2DU32Float:
  case NVPTXISD::TexUnified2DU32FloatLevel:
  case NVPTXISD::TexUnified2DU32FloatGrad:
  case NVPTXISD::TexUnified2DArrayFloatS32:
  case NVPTXISD::TexUnified2DArrayFloatFloat:
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified2DArrayS32S32:
  case NVPTXISD::TexUnified2DArrayS32Float:
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  case NVPTXISD::TexUnified2DArrayU32S32:
  case NVPTXISD::TexUnified2DArrayU32Float:
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  case NVPTXISD::TexUnified3DFloatS32:
  case NVPTXISD::TexUnified3DFloatFloat:
  case NVPTXISD::TexUnified3DFloatFloatLevel:
  case NVPTXISD::TexUnified3DFloatFloatGrad:
  case NVPTXISD::TexUnified3DS32S32:
  case NVPTXISD::TexUnified3DS32Float:
  case NVPTXISD::TexUnified3DS32FloatLevel:
  case NVPTXISD::TexUnified3DS32FloatGrad:
  case NVPTXISD::TexUnified3DU32S32:
  case NVPTXISD::TexUnified3DU32Float:
  case NVPTXISD::TexUnified3DU32FloatLevel:
  case NVPTXISD::TexUnified3DU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeFloatFloat:
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeS32Float:
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeU32Float:
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  case NVPTXISD::Tld4UnifiedR2DS64Float:
  case NVPTXISD::Tld4UnifiedG2DS64Float:
  case NVPTXISD::Tld4UnifiedB2DS64Float:
  case NVPTXISD::Tld4UnifiedA2DS64Float:
  case NVPTXISD::Tld4UnifiedR2DU64Float:
  case NVPTXISD::Tld4UnifiedG2DU64Float:
  case NVPTXISD::Tld4UnifiedB2DU64Float:
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    if (tryTextureIntrinsic(N))
      return;
    break;
  case NVPTXISD::Suld1DI8Clamp:
  case NVPTXISD::Suld1DI16Clamp:
  case NVPTXISD::Suld1DI32Clamp:
  case NVPTXISD::Suld1DI64Clamp:
  case NVPTXISD::Suld1DV2I8Clamp:
  case NVPTXISD::Suld1DV2I16Clamp:
  case NVPTXISD::Suld1DV2I32Clamp:
  case NVPTXISD::Suld1DV2I64Clamp:
  case NVPTXISD::Suld1DV4I8Clamp:
  case NVPTXISD::Suld1DV4I16Clamp:
  case NVPTXISD::Suld1DV4I32Clamp:
  case NVPTXISD::Suld1DArrayI8Clamp:
  case NVPTXISD::Suld1DArrayI16Clamp:
  case NVPTXISD::Suld1DArrayI32Clamp:
  case NVPTXISD::Suld1DArrayI64Clamp:
  case NVPTXISD::Suld1DArrayV2I8Clamp:
  case NVPTXISD::Suld1DArrayV2I16Clamp:
  case NVPTXISD::Suld1DArrayV2I32Clamp:
  case NVPTXISD::Suld1DArrayV2I64Clamp:
  case NVPTXISD::Suld1DArrayV4I8Clamp:
  case NVPTXISD::Suld1DArrayV4I16Clamp:
  case NVPTXISD::Suld1DArrayV4I32Clamp:
  case NVPTXISD::Suld2DI8Clamp:
  case NVPTXISD::Suld2DI16Clamp:
  case NVPTXISD::Suld2DI32Clamp:
  case NVPTXISD::Suld2DI64Clamp:
  case NVPTXISD::Suld2DV2I8Clamp:
  case NVPTXISD::Suld2DV2I16Clamp:
  case NVPTXISD::Suld2DV2I32Clamp:
  case NVPTXISD::Suld2DV2I64Clamp:
  case NVPTXISD::Suld2DV4I8Clamp:
  case NVPTXISD::Suld2DV4I16Clamp:
  case NVPTXISD::Suld2DV4I32Clamp:
  case NVPTXISD::Suld2DArrayI8Clamp:
  case NVPTXISD::Suld2DArrayI16Clamp:
  case NVPTXISD::Suld2DArrayI32Clamp:
  case NVPTXISD::Suld2DArrayI64Clamp:
  case NVPTXISD::Suld2DArrayV2I8Clamp:
  case NVPTXISD::Suld2DArrayV2I16Clamp:
  case NVPTXISD::Suld2DArrayV2I32Clamp:
  case NVPTXISD::Suld2DArrayV2I64Clamp:
  case NVPTXISD::Suld2DArrayV4I8Clamp:
  case NVPTXISD::Suld2DArrayV4I16Clamp:
  case NVPTXISD::Suld2DArrayV4I32Clamp:
  case NVPTXISD::Suld3DI8Clamp:
  case NVPTXISD::Suld3DI16Clamp:
  case NVPTXISD::Suld3DI32Clamp:
  case NVPTXISD::Suld3DI64Clamp:
  case NVPTXISD::Suld3DV2I8Clamp:
  case NVPTXISD::Suld3DV2I16Clamp:
  case NVPTXISD::Suld3DV2I32Clamp:
  case NVPTXISD::Suld3DV2I64Clamp:
  case NVPTXISD::Suld3DV4I8Clamp:
  case NVPTXISD::Suld3DV4I16Clamp:
  case NVPTXISD::Suld3DV4I32Clamp:
  case NVPTXISD::Suld1DI8Trap:
  case NVPTXISD::Suld1DI16Trap:
  case NVPTXISD::Suld1DI32Trap:
  case NVPTXISD::Suld1DI64Trap:
  case NVPTXISD::Suld1DV2I8Trap:
  case NVPTXISD::Suld1DV2I16Trap:
  case NVPTXISD::Suld1DV2I32Trap:
  case NVPTXISD::Suld1DV2I64Trap:
  case NVPTXISD::Suld1DV4I8Trap:
  case NVPTXISD::Suld1DV4I16Trap:
  case NVPTXISD::Suld1DV4I32Trap:
  case NVPTXISD::Suld1DArrayI8Trap:
  case NVPTXISD::Suld1DArrayI16Trap:
  case NVPTXISD::Suld1DArrayI32Trap:
  case NVPTXISD::Suld1DArrayI64Trap:
  case NVPTXISD::Suld1DArrayV2I8Trap:
  case NVPTXISD::Suld1DArrayV2I16Trap:
  case NVPTXISD::Suld1DArrayV2I32Trap:
  case NVPTXISD::Suld1DArrayV2I64Trap:
  case NVPTXISD::Suld1DArrayV4I8Trap:
  case NVPTXISD::Suld1DArrayV4I16Trap:
  case NVPTXISD::Suld1DArrayV4I32Trap:
  case NVPTXISD::Suld2DI8Trap:
  case NVPTXISD::Suld2DI16Trap:
  case NVPTXISD::Suld2DI32Trap:
  case NVPTXISD::Suld2DI64Trap:
  case NVPTXISD::Suld2DV2I8Trap:
  case NVPTXISD::Suld2DV2I16Trap:
  case NVPTXISD::Suld2DV2I32Trap:
  case NVPTXISD::Suld2DV2I64Trap:
  case NVPTXISD::Suld2DV4I8Trap:
  case NVPTXISD::Suld2DV4I16Trap:
  case NVPTXISD::Suld2DV4I32Trap:
  case NVPTXISD::Suld2DArrayI8Trap:
  case NVPTXISD::Suld2DArrayI16Trap:
  case NVPTXISD::Suld2DArrayI32Trap:
  case NVPTXISD::Suld2DArrayI64Trap:
  case NVPTXISD::Suld2DArrayV2I8Trap:
  case NVPTXISD::Suld2DArrayV2I16Trap:
  case NVPTXISD::Suld2DArrayV2I32Trap:
  case NVPTXISD::Suld2DArrayV2I64Trap:
  case NVPTXISD::Suld2DArrayV4I8Trap:
  case NVPTXISD::Suld2DArrayV4I16Trap:
  case NVPTXISD::Suld2DArrayV4I32Trap:
  case NVPTXISD::Suld3DI8Trap:
  case NVPTXISD::Suld3DI16Trap:
  case NVPTXISD::Suld3DI32Trap:
  case NVPTXISD::Suld3DI64Trap:
  case NVPTXISD::Suld3DV2I8Trap:
  case NVPTXISD::Suld3DV2I16Trap:
  case NVPTXISD::Suld3DV2I32Trap:
  case NVPTXISD::Suld3DV2I64Trap:
  case NVPTXISD::Suld3DV4I8Trap:
  case NVPTXISD::Suld3DV4I16Trap:
  case NVPTXISD::Suld3DV4I32Trap:
  case NVPTXISD::Suld1DI8Zero:
  case NVPTXISD::Suld1DI16Zero:
  case NVPTXISD::Suld1DI32Zero:
  case NVPTXISD::Suld1DI64Zero:
  case NVPTXISD::Suld1DV2I8Zero:
  case NVPTXISD::Suld1DV2I16Zero:
  case NVPTXISD::Suld1DV2I32Zero:
  case NVPTXISD::Suld1DV2I64Zero:
  case NVPTXISD::Suld1DV4I8Zero:
  case NVPTXISD::Suld1DV4I16Zero:
  case NVPTXISD::Suld1DV4I32Zero:
  case NVPTXISD::Suld1DArrayI8Zero:
  case NVPTXISD::Suld1DArrayI16Zero:
  case NVPTXISD::Suld1DArrayI32Zero:
  case NVPTXISD::Suld1DArrayI64Zero:
  case NVPTXISD::Suld1DArrayV2I8Zero:
  case NVPTXISD::Suld1DArrayV2I16Zero:
  case NVPTXISD::Suld1DArrayV2I32Zero:
  case NVPTXISD::Suld1DArrayV2I64Zero:
  case NVPTXISD::Suld1DArrayV4I8Zero:
  case NVPTXISD::Suld1DArrayV4I16Zero:
  case NVPTXISD::Suld1DArrayV4I32Zero:
  case NVPTXISD::Suld2DI8Zero:
  case NVPTXISD::Suld2DI16Zero:
  case NVPTXISD::Suld2DI32Zero:
  case NVPTXISD::Suld2DI64Zero:
  case NVPTXISD::Suld2DV2I8Zero:
  case NVPTXISD::Suld2DV2I16Zero:
  case NVPTXISD::Suld2DV2I32Zero:
  case NVPTXISD::Suld2DV2I64Zero:
  case NVPTXISD::Suld2DV4I8Zero:
  case NVPTXISD::Suld2DV4I16Zero:
  case NVPTXISD::Suld2DV4I32Zero:
  case NVPTXISD::Suld2DArrayI8Zero:
  case NVPTXISD::Suld2DArrayI16Zero:
  case NVPTXISD::Suld2DArrayI32Zero:
  case NVPTXISD::Suld2DArrayI64Zero:
  case NVPTXISD::Suld2DArrayV2I8Zero:
  case NVPTXISD::Suld2DArrayV2I16Zero:
  case NVPTXISD::Suld2DArrayV2I32Zero:
  case NVPTXISD::Suld2DArrayV2I64Zero:
  case NVPTXISD::Suld2DArrayV4I8Zero:
  case NVPTXISD::Suld2DArrayV4I16Zero:
  case NVPTXISD::Suld2DArrayV4I32Zero:
  case NVPTXISD::Suld3DI8Zero:
  case NVPTXISD::Suld3DI16Zero:
  case NVPTXISD::Suld3DI32Zero:
  case NVPTXISD::Suld3DI64Zero:
  case NVPTXISD::Suld3DV2I8Zero:
  case NVPTXISD::Suld3DV2I16Zero:
  case NVPTXISD::Suld3DV2I32Zero:
  case NVPTXISD::Suld3DV2I64Zero:
  case NVPTXISD::Suld3DV4I8Zero:
  case NVPTXISD::Suld3DV4I16Zero:
  case NVPTXISD::Suld3DV4I32Zero:
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP(N))
      return;
    break;
  default:
    break;
  }
  SelectCode(N);
}
bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    return tryLDGLDU(N);
  }
}
// There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
// have to load them into an .(b)f16 register first.
bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
  if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
    return false;
  SDValue Val = CurDAG->getTargetConstantFP(
      cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
  SDNode *LoadConstF16 = CurDAG->getMachineNode(
      (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
                                      : NVPTX::LOAD_CONST_BF16),
      SDLoc(N), N->getValueType(0), Val);
  ReplaceNode(N, LoadConstF16);
  return true;
}
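
// Illustrative note (an assumption added for clarity, not from the original
// source): an f16 constant such as 1.0 (0xH3C00 in IR) has no inline-immediate
// encoding in PTX .f16 arithmetic, so the selector above materializes it into
// a register with a LOAD_CONST_F16/LOAD_CONST_BF16 mov instead.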
// Map ISD:CONDCODE value to appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode()
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
  using NVPTX::PTXCmpMode::CmpMode;
  unsigned PTXCmpMode = [](ISD::CondCode CC) {
    switch (CC) {
    default:
      llvm_unreachable("Unexpected condition code.");
    case ISD::SETOEQ: return CmpMode::EQ;
    case ISD::SETOGT: return CmpMode::GT;
    case ISD::SETOGE: return CmpMode::GE;
    case ISD::SETOLT: return CmpMode::LT;
    case ISD::SETOLE: return CmpMode::LE;
    case ISD::SETONE: return CmpMode::NE;
    case ISD::SETO:   return CmpMode::NUM;
    case ISD::SETUO:  return CmpMode::NotANumber;
    case ISD::SETUEQ: return CmpMode::EQU;
    case ISD::SETUGT: return CmpMode::GTU;
    case ISD::SETUGE: return CmpMode::GEU;
    case ISD::SETULT: return CmpMode::LTU;
    case ISD::SETULE: return CmpMode::LEU;
    case ISD::SETUNE: return CmpMode::NEU;
    case ISD::SETEQ:  return CmpMode::EQ;
    case ISD::SETGT:  return CmpMode::GT;
    case ISD::SETGE:  return CmpMode::GE;
    case ISD::SETLT:  return CmpMode::LT;
    case ISD::SETLE:  return CmpMode::LE;
    case ISD::SETNE:  return CmpMode::NE;
    }
  }(CondCode.get());
  if (FTZ)
    PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
  return PTXCmpMode;
}
bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}
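
// Note: SETP_f16x2rr yields two i1 results, one predicate per f16 lane of the
// v2f16 operands.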
// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  SDValue Vector = N->getOperand(0);

  // We only care about 16x2 as it's the only real vector type we
  // need to deal with.
  MVT VT = Vector.getSimpleValueType();
  if (!Isv2x16VT(VT))
    return false;

  // Find and record all uses of this vector that extract element 0 or 1.
  SmallVector<SDNode *, 4> E0, E1;
  for (auto *U : Vector.getNode()->uses()) {
    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      continue;
    if (U->getOperand(0) != Vector)
      continue;
    if (const ConstantSDNode *IdxConst =
            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
        E0.push_back(U);
      else if (IdxConst->getZExtValue() == 1)
        E1.push_back(U);
      else
        llvm_unreachable("Invalid vector index.");
    }
  }

  // There's no point scattering f16x2 if we only ever access one
  // element of it.
  if (E0.empty() || E1.empty())
    return false;

  // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
  // into f16,f16 SplitF16x2(V)
  MVT EltVT = VT.getVectorElementType();
  SDNode *ScatterOp =
      CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
  for (auto *Node : E0)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
  for (auto *Node : E1)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));

  return true;
}
static unsigned int getCodeAddrSpace(MemSDNode *N) {
  const Value *Src = N->getMemOperand()->getValue();

  if (!Src)
    return NVPTX::PTXLdStInstCode::GENERIC;

  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
    switch (PT->getAddressSpace()) {
    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
    default: break;
    }
  }
  return NVPTX::PTXLdStInstCode::GENERIC;
}
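
// The code returned here becomes the address-space immediate of the selected
// ld/st machine instruction and ultimately drives the ".global"/".shared"/...
// qualifier printed on the PTX instruction.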
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                          unsigned CodeAddrSpace, MachineFunction *F) {
  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
  // space.
  //
  // We have two ways of identifying invariant loads: Loads may be explicitly
  // marked as invariant, or we may infer them to be invariant.
  //
  // We currently infer invariance for loads from
  //  - constant global variables, and
  //  - kernel function pointer params that are noalias (i.e. __restrict) and
  //    never written to.
  //
  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
  // not during the SelectionDAG phase).
  //
  // TODO: Infer invariance only at -O2.  We still want to use ldg at -O0 for
  // explicitly invariant loads because these are how clang tells us to use ldg
  // when the user uses a builtin.
  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
    return false;

  if (N->isInvariant())
    return true;

  bool IsKernelFn = isKernelFunction(F->getFunction());

  // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
  // because the former looks through phi nodes while the latter does not. We
  // need to look through phi nodes to handle pointer induction variables.
  SmallVector<const Value *, 8> Objs;
  getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);

  return all_of(Objs, [&](const Value *V) {
    if (auto *A = dyn_cast<const Argument>(V))
      return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
    if (auto *GV = dyn_cast<const GlobalVariable>(V))
      return GV->isConstant();
    return false;
  });
}
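
// Illustrative example (an assumption, not from the original file): for a CUDA
// kernel such as
//   __global__ void k(const float *__restrict__ in, float *out)
// the 'in' parameter arrives as a noalias, readonly kernel Argument, so loads
// through it pass the checks above and may be selected as ld.global.nc.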
bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_texsurf_handle_internal:
    SelectTexSurfHandle(N);
    return true;
  }
}

void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
  // Op 0 is the intrinsic ID
  SDValue Wrapper = N->getOperand(1);
  SDValue GlobalVal = Wrapper.getOperand(0);
  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
                                        MVT::i64, GlobalVal));
}
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
                                               : NVPTX::cvta_shared_yes_64)
                         : NVPTX::cvta_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
                                               : NVPTX::cvta_const_yes_64)
                         : NVPTX::cvta_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
                                               : NVPTX::cvta_local_yes_64)
                         : NVPTX::cvta_local_yes;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
                         : NVPTX::cvta_to_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
                                               : NVPTX::cvta_to_shared_yes_64)
                         : NVPTX::cvta_to_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
                                               : NVPTX::cvta_to_const_yes_64)
                         : NVPTX::cvta_to_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
                                               : NVPTX::cvta_to_local_yes_64)
                         : NVPTX::cvta_to_local_yes;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}
// Helper function template to reduce amount of boilerplate code for
// opcode selection.
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
                unsigned Opcode_i16, unsigned Opcode_i32,
                std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
                std::optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
  case MVT::f16:
  case MVT::bf16:
    return Opcode_i16;
  case MVT::v2f16:
  case MVT::v2bf16:
  case MVT::v2i16:
  case MVT::v4i8:
    return Opcode_i32;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return std::nullopt;
  }
}
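
// Callers hand in std::nullopt for ld/st variants that do not exist (for
// example, there is no v4 load of 64-bit elements) and abort selection when no
// opcode comes back.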
static int getLdStRegType(EVT VT) {
  if (VT.isFloatingPoint())
    switch (VT.getSimpleVT().SimpleTy) {
    case MVT::f16:
    case MVT::bf16:
    case MVT::v2f16:
    case MVT::v2bf16:
      return NVPTX::PTXLdStInstCode::Untyped;
    default:
      return NVPTX::PTXLdStInstCode::Float;
    }
  else
    return NVPTX::PTXLdStInstCode::Unsigned;
}
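
// Note (assumption added for clarity): 'Untyped' makes the instruction printer
// emit a bit-type access (.b16/.b32) for the f16 family instead of a typed
// .f16 load/store.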
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2f16/v2bf16/v2i16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else
    fromType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
                             NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
                             NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
                          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
                          NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
                               NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
                               NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
                          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
                          NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
                               NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
                               NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}
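
// Illustrative example (an assumption, not from the original): a plain
//   %v = load i32, ptr addrspace(1) %p
// that matches the register+immediate path above prints as roughly
//   ld.global.u32 %r, [%rd];
// with the volatile/addrspace/vector/type immediates selecting the qualifiers.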
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only availalble for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else
    FromType = getLdStRegType(ScalarVT);

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8x16 is a special case. PTX doesn't have ld.v8.x16
  // instruction. Instead, we split the vector into v2x16 chunks and
  // load them with ld.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
                          std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
                          std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
                            NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
                            NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
                            std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
                            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
                            std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}
bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1;
  MemSDNode *Mem;
  bool IsLDG = true;

  // If this is an LDG intrinsic, the address is the third operand. If its an
  // LDG/LDU SD node (from custom vector handling), then its the second operand
  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    Op1 = N->getOperand(2);
    Mem = cast<MemIntrinsicSDNode>(N);
    unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IID) {
    default:
      return false;
    case Intrinsic::nvvm_ldg_global_f:
    case Intrinsic::nvvm_ldg_global_i:
    case Intrinsic::nvvm_ldg_global_p:
      IsLDG = true;
      break;
    case Intrinsic::nvvm_ldu_global_f:
    case Intrinsic::nvvm_ldu_global_i:
    case Intrinsic::nvvm_ldu_global_p:
      IsLDG = false;
      break;
    }
  } else {
    Op1 = N->getOperand(1);
    Mem = cast<MemSDNode>(N);
  }

  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  SDValue Base, Offset, Addr;
  EVT OrigType = N->getValueType(0);

  EVT EltVT = Mem->getMemoryVT();
  unsigned NumElts = 1;
  if (EltVT.isVector()) {
    NumElts = EltVT.getVectorNumElements();
    EltVT = EltVT.getVectorElementType();
    // vectors of 16bits type are loaded/stored as multiples of v2x16 elements.
    if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
        (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
        (EltVT == MVT::i16 && OrigType == MVT::v2i16)) {
      assert(NumElts % 2 == 0 && "Vector must have even number of elements");
      EltVT = OrigType;
      NumElts /= 2;
    } else if (OrigType == MVT::v4i8) {
      EltVT = OrigType;
      NumElts = 1;
    }
  }

  // Build the "promoted" result VTList for the load. If we are really loading
  // i8s, then the return type will be promoted to i16 since we do not expose
  // 8-bit registers in NVPTX.
  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
  SmallVector<EVT, 5> InstVTs;
  for (unsigned i = 0; i != NumElts; ++i) {
    InstVTs.push_back(NodeVT);
  }
  InstVTs.push_back(MVT::Other);
  SDVTList InstVTList = CurDAG->getVTList(InstVTs);

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case ISD::LOAD:
    case ISD::INTRINSIC_W_CHAIN:
      if (IsLDG)
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
      else
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
      break;
    case NVPTXISD::LoadV2:
    case NVPTXISD::LDGV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LDUV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LoadV4:
    case NVPTXISD::LDGV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
      break;
    case NVPTXISD::LDUV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {Base, Offset, Chain};
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  }

  // For automatic generation of LDG (through SelectLoad[Vector], not the
  // intrinsics), we may have an extending load like:
  //
  //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
  //
  // In this case, the matching logic above will select a load for the original
  // memory type (in this case, i8) and our types will not match (the node needs
  // to return an i32 in this case). Our LDG/LDU nodes do not support the
  // concept of sign-/zero-extension, so emulate it here by adding an explicit
  // CVT instruction. Ptxas should clean up any redundancies here.

  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);

  if (OrigType != EltVT &&
      (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
    // We have an extending-load. The instruction we selected operates on the
    // smaller type, but the SDNode we are replacing has the larger type. We
    // need to emit a CVT to make the types match.
    unsigned CvtOpc =
        GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);

    // For each output value, apply the manual sign/zero-extension and make sure
    // all users of the load go through that CVT.
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue Res(LD, i);
      SDValue OrigVal(N, i);

      SDNode *CvtNode =
          CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
                                 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
                                                           DL, MVT::i32));
      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
    }
  }

  ReplaceNode(N, LD);
  return true;
}
1639 bool NVPTXDAGToDAGISel::tryStore(SDNode
*N
) {
1641 MemSDNode
*ST
= cast
<MemSDNode
>(N
);
1642 assert(ST
->writeMem() && "Expected store");
1643 StoreSDNode
*PlainStore
= dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // do not support pre/post inc/dec
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  AtomicOrdering Ordering = ST->getSuccessOrdering();
  // In order to lower atomic stores with stronger guarantees we would need to
  // use store.release or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2x16 is stored using st.b32
    toTypeWidth = 32;
  }

  unsigned int toType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
                          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
                          NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64,
                               NVPTX::ST_i16_areg_64, NVPTX::ST_i32_areg_64,
                               NVPTX::ST_i64_areg_64, NVPTX::ST_f32_areg_64,
                               NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}
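
// Illustration of the four addressing forms matched above (a sketch; the
// symbol and register names are hypothetical, and the exact PTX depends on
// the address space and type qualifiers chosen earlier):
//   st.global.u32 [gvar], %r1;     // _avar: direct symbol address
//   st.global.u32 [gvar+8], %r1;   // _asi:  symbol + immediate offset
//   st.global.u32 [%rd1+8], %r1;   // _ari:  register + immediate offset
//   st.global.u32 [%rd1], %r1;     // _areg: register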
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *ST;
  EVT EltVT = Op1.getValueType();
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT StoreVT = MemSD->getMemoryVT();

  // Address Space Setting
  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
    report_fatal_error("Cannot store to pointer that points to constant "
                       "memory space");
  }
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  assert(StoreVT.isSimple() && "Store value is not simple");
  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
  unsigned ToType = getLdStRegType(ScalarVT);

  SmallVector<SDValue, 12> StOps;
  SDValue N2;
  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::StoreV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    N2 = N->getOperand(3);
    break;
  case NVPTXISD::StoreV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    StOps.push_back(N->getOperand(3));
    StOps.push_back(N->getOperand(4));
    N2 = N->getOperand(5);
    break;
  default:
    return false;
  }

  // v8x16 is a special case. PTX doesn't have st.v8.x16
  // instruction. Instead, we split the vector into v2x16 chunks and
  // store them with st.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
    EltVT = MVT::i32;
    ToType = NVPTX::PTXLdStInstCode::Untyped;
    ToTypeWidth = 32;
  }

  StOps.push_back(getI32Imm(IsVolatile, DL));
  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
  StOps.push_back(getI32Imm(VecType, DL));
  StOps.push_back(getI32Imm(ToType, DL));
  StOps.push_back(getI32Imm(ToTypeWidth, DL));

  if (SelectDirectAddr(N2, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
                               NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
                               NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
      break;
    case NVPTXISD::StoreV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
                               NVPTX::STV_i32_v4_avar, std::nullopt,
                               NVPTX::STV_f32_v4_avar, std::nullopt);
      break;
    }
    StOps.push_back(Addr);
  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
                               NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
                               NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
                          NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
                          std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
      break;
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
                            NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
                            NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
            NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
            NVPTX::STV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
                                 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
                                 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
                                 NVPTX::STV_i32_v4_ari, std::nullopt,
                                 NVPTX::STV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
            NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
            NVPTX::STV_f64_v2_areg_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
            NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
            NVPTX::STV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
                            NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
                            NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
                            NVPTX::STV_f64_v2_areg);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
                            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
                            std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    StOps.push_back(N2);
  }

  if (!Opcode)
    return false;

  StOps.push_back(Chain);

  ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});

  ReplaceNode(N, ST);
  return true;
}
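
// Illustration of the v2x16 special case above (a sketch; register names are
// hypothetical): a v8f16 store reaches here as StoreV4 whose elements are
// v2f16 chunks, and selects to a single 128-bit vector store, roughly:
//   st.global.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};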
bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
  SDValue Chain = Node->getOperand(0);
  SDValue Offset = Node->getOperand(2);
  SDValue Glue = Node->getOperand(3);
  SDLoc DL(Node);
  MemSDNode *Mem = cast<MemSDNode>(Node);

  unsigned VecSize;
  switch (Node->getOpcode()) {
  default:
    return false;
  case NVPTXISD::LoadParam:
    VecSize = 1;
    break;
  case NVPTXISD::LoadParamV2:
    VecSize = 2;
    break;
  case NVPTXISD::LoadParamV4:
    VecSize = 4;
    break;
  }

  EVT EltVT = Node->getValueType(0);
  EVT MemVT = Mem->getMemoryVT();

  std::optional<unsigned> Opcode;

  switch (VecSize) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
                             NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
                             NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
                             NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
    break;
  case 2:
    Opcode =
        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
                        NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
                        NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
                        NVPTX::LoadParamMemV2F64);
    break;
  case 4:
    Opcode =
        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
                        NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
                        std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
    break;
  }
  if (!Opcode)
    return false;

  SDVTList VTs;
  if (VecSize == 1) {
    VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
  } else if (VecSize == 2) {
    VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
  } else {
    EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
    VTs = CurDAG->getVTList(EVTs);
  }

  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();

  SmallVector<SDValue, 2> Ops;
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Glue);

  ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
  return true;
}
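
// Illustration (a sketch; the .param symbol and register names are
// hypothetical): LoadParam nodes read a callee's return value out of the
// param space, so LoadParamV2 of f32 would select to something like
//   ld.param.v2.f32 {%f1, %f2}, [retval0+0];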
bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Offset = N->getOperand(1);
  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
  MemSDNode *Mem = cast<MemSDNode>(N);

  // How many elements do we have?
  unsigned NumElts = 1;
  switch (N->getOpcode()) {
  default:
    return false;
  case NVPTXISD::StoreRetval:
    NumElts = 1;
    break;
  case NVPTXISD::StoreRetvalV2:
    NumElts = 2;
    break;
  case NVPTXISD::StoreRetvalV4:
    NumElts = 4;
    break;
  }

  // Build vector of operands
  SmallVector<SDValue, 6> Ops;
  for (unsigned i = 0; i < NumElts; ++i)
    Ops.push_back(N->getOperand(i + 2));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);

  // Determine target opcode
  // If we have an i1, use an 8-bit store. The lowering code in
  // NVPTXISelLowering will have already emitted an upcast.
  std::optional<unsigned> Opcode;
  switch (NumElts) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
                             NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
                             NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
    break;
  case 2:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
                             NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
                             NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
    break;
  case 4:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
                             NVPTX::StoreRetvalV4I32, std::nullopt,
                             NVPTX::StoreRetvalV4F32, std::nullopt);
    break;
  }
  if (!Opcode)
    return false;

  SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});

  ReplaceNode(N, Ret);
  return true;
}
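
// Illustration (a sketch; names are hypothetical): StoreRetval writes one
// element of the current function's return value, printing roughly as
//   st.param.b32 [func_retval0+0], %r1;
// with the offset taken from the constant operand selected above.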
bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Param = N->getOperand(1);
  unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
  SDValue Offset = N->getOperand(2);
  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
  MemSDNode *Mem = cast<MemSDNode>(N);
  SDValue Glue = N->getOperand(N->getNumOperands() - 1);

  // How many elements do we have?
  unsigned NumElts = 1;
  switch (N->getOpcode()) {
  default:
    return false;
  case NVPTXISD::StoreParamU32:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParam:
    NumElts = 1;
    break;
  case NVPTXISD::StoreParamV2:
    NumElts = 2;
    break;
  case NVPTXISD::StoreParamV4:
    NumElts = 4;
    break;
  }

  // Build vector of operands
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < NumElts; ++i)
    Ops.push_back(N->getOperand(i + 3));
  Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Glue);

  // Determine target opcode
  // If we have an i1, use an 8-bit store. The lowering code in
  // NVPTXISelLowering will have already emitted an upcast.
  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
  default:
    switch (NumElts) {
    default:
      return false;
    case 1:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamI8, NVPTX::StoreParamI16,
                               NVPTX::StoreParamI32, NVPTX::StoreParamI64,
                               NVPTX::StoreParamF32, NVPTX::StoreParamF64);
      break;
    case 2:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
                               NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
                               NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
      break;
    case 4:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
                               NVPTX::StoreParamV4I32, std::nullopt,
                               NVPTX::StoreParamV4F32, std::nullopt);
      break;
    }
    break;
  // Special case: if we have a sign-extend/zero-extend node, insert the
  // conversion instruction first, and use that as the value operand to
  // the selected StoreParam node.
  case NVPTXISD::StoreParamU32: {
    Opcode = NVPTX::StoreParamI32;
    SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
                                                MVT::i32);
    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
                                         MVT::i32, Ops[0], CvtNone);
    Ops[0] = SDValue(Cvt, 0);
    break;
  }
  case NVPTXISD::StoreParamS32: {
    Opcode = NVPTX::StoreParamI32;
    SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
                                                MVT::i32);
    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
                                         MVT::i32, Ops[0], CvtNone);
    Ops[0] = SDValue(Cvt, 0);
    break;
  }
  }

  if (!Opcode)
    return false;

  SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
  SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});

  ReplaceNode(N, Ret);
  return true;
}
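
// Illustration of the StoreParamU32 special case above (a sketch; register
// and parameter names are hypothetical): a 16-bit value passed as a 32-bit
// parameter is widened first and then stored, roughly:
//   cvt.u32.u16 %r1, %rs1;
//   st.param.b32 [param0+0], %r1;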
bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
  unsigned Opc = 0;

  switch (N->getOpcode()) {
  default: return false;
  case NVPTXISD::Tex1DFloatS32: Opc = NVPTX::TEX_1D_F32_S32_RR; break;
  case NVPTXISD::Tex1DFloatFloat: Opc = NVPTX::TEX_1D_F32_F32_RR; break;
  case NVPTXISD::Tex1DFloatFloatLevel: Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex1DFloatFloatGrad: Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR; break;
  case NVPTXISD::Tex1DS32S32: Opc = NVPTX::TEX_1D_S32_S32_RR; break;
  case NVPTXISD::Tex1DS32Float: Opc = NVPTX::TEX_1D_S32_F32_RR; break;
  case NVPTXISD::Tex1DS32FloatLevel: Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex1DS32FloatGrad: Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR; break;
  case NVPTXISD::Tex1DU32S32: Opc = NVPTX::TEX_1D_U32_S32_RR; break;
  case NVPTXISD::Tex1DU32Float: Opc = NVPTX::TEX_1D_U32_F32_RR; break;
  case NVPTXISD::Tex1DU32FloatLevel: Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex1DU32FloatGrad: Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR; break;
  case NVPTXISD::Tex1DArrayFloatS32: Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR; break;
  case NVPTXISD::Tex1DArrayFloatFloat: Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR; break;
  case NVPTXISD::Tex1DArrayFloatFloatLevel: Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex1DArrayFloatFloatGrad: Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR; break;
  case NVPTXISD::Tex1DArrayS32S32: Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR; break;
  case NVPTXISD::Tex1DArrayS32Float: Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR; break;
  case NVPTXISD::Tex1DArrayS32FloatLevel: Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex1DArrayS32FloatGrad: Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR; break;
  case NVPTXISD::Tex1DArrayU32S32: Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR; break;
  case NVPTXISD::Tex1DArrayU32Float: Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR; break;
  case NVPTXISD::Tex1DArrayU32FloatLevel: Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex1DArrayU32FloatGrad: Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR; break;
  case NVPTXISD::Tex2DFloatS32: Opc = NVPTX::TEX_2D_F32_S32_RR; break;
  case NVPTXISD::Tex2DFloatFloat: Opc = NVPTX::TEX_2D_F32_F32_RR; break;
  case NVPTXISD::Tex2DFloatFloatLevel: Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex2DFloatFloatGrad: Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR; break;
  case NVPTXISD::Tex2DS32S32: Opc = NVPTX::TEX_2D_S32_S32_RR; break;
  case NVPTXISD::Tex2DS32Float: Opc = NVPTX::TEX_2D_S32_F32_RR; break;
  case NVPTXISD::Tex2DS32FloatLevel: Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex2DS32FloatGrad: Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR; break;
  case NVPTXISD::Tex2DU32S32: Opc = NVPTX::TEX_2D_U32_S32_RR; break;
  case NVPTXISD::Tex2DU32Float: Opc = NVPTX::TEX_2D_U32_F32_RR; break;
  case NVPTXISD::Tex2DU32FloatLevel: Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex2DU32FloatGrad: Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR; break;
  case NVPTXISD::Tex2DArrayFloatS32: Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR; break;
  case NVPTXISD::Tex2DArrayFloatFloat: Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR; break;
  case NVPTXISD::Tex2DArrayFloatFloatLevel: Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex2DArrayFloatFloatGrad: Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR; break;
  case NVPTXISD::Tex2DArrayS32S32: Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR; break;
  case NVPTXISD::Tex2DArrayS32Float: Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR; break;
  case NVPTXISD::Tex2DArrayS32FloatLevel: Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex2DArrayS32FloatGrad: Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR; break;
  case NVPTXISD::Tex2DArrayU32S32: Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR; break;
  case NVPTXISD::Tex2DArrayU32Float: Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR; break;
  case NVPTXISD::Tex2DArrayU32FloatLevel: Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex2DArrayU32FloatGrad: Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR; break;
  case NVPTXISD::Tex3DFloatS32: Opc = NVPTX::TEX_3D_F32_S32_RR; break;
  case NVPTXISD::Tex3DFloatFloat: Opc = NVPTX::TEX_3D_F32_F32_RR; break;
  case NVPTXISD::Tex3DFloatFloatLevel: Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex3DFloatFloatGrad: Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR; break;
  case NVPTXISD::Tex3DS32S32: Opc = NVPTX::TEX_3D_S32_S32_RR; break;
  case NVPTXISD::Tex3DS32Float: Opc = NVPTX::TEX_3D_S32_F32_RR; break;
  case NVPTXISD::Tex3DS32FloatLevel: Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex3DS32FloatGrad: Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR; break;
  case NVPTXISD::Tex3DU32S32: Opc = NVPTX::TEX_3D_U32_S32_RR; break;
  case NVPTXISD::Tex3DU32Float: Opc = NVPTX::TEX_3D_U32_F32_RR; break;
  case NVPTXISD::Tex3DU32FloatLevel: Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex3DU32FloatGrad: Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR; break;
  case NVPTXISD::TexCubeFloatFloat: Opc = NVPTX::TEX_CUBE_F32_F32_RR; break;
  case NVPTXISD::TexCubeFloatFloatLevel: Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR; break;
  case NVPTXISD::TexCubeS32Float: Opc = NVPTX::TEX_CUBE_S32_F32_RR; break;
  case NVPTXISD::TexCubeS32FloatLevel: Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR; break;
  case NVPTXISD::TexCubeU32Float: Opc = NVPTX::TEX_CUBE_U32_F32_RR; break;
  case NVPTXISD::TexCubeU32FloatLevel: Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR; break;
  case NVPTXISD::TexCubeArrayFloatFloat: Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR; break;
  case NVPTXISD::TexCubeArrayFloatFloatLevel: Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR; break;
  case NVPTXISD::TexCubeArrayS32Float: Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR; break;
  case NVPTXISD::TexCubeArrayS32FloatLevel: Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR; break;
  case NVPTXISD::TexCubeArrayU32Float: Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR; break;
  case NVPTXISD::TexCubeArrayU32FloatLevel: Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR; break;
  case NVPTXISD::Tld4R2DFloatFloat: Opc = NVPTX::TLD4_R_2D_F32_F32_RR; break;
  case NVPTXISD::Tld4G2DFloatFloat: Opc = NVPTX::TLD4_G_2D_F32_F32_RR; break;
  case NVPTXISD::Tld4B2DFloatFloat: Opc = NVPTX::TLD4_B_2D_F32_F32_RR; break;
  case NVPTXISD::Tld4A2DFloatFloat: Opc = NVPTX::TLD4_A_2D_F32_F32_RR; break;
  case NVPTXISD::Tld4R2DS64Float: Opc = NVPTX::TLD4_R_2D_S32_F32_RR; break;
  case NVPTXISD::Tld4G2DS64Float: Opc = NVPTX::TLD4_G_2D_S32_F32_RR; break;
  case NVPTXISD::Tld4B2DS64Float: Opc = NVPTX::TLD4_B_2D_S32_F32_RR; break;
  case NVPTXISD::Tld4A2DS64Float: Opc = NVPTX::TLD4_A_2D_S32_F32_RR; break;
  case NVPTXISD::Tld4R2DU64Float: Opc = NVPTX::TLD4_R_2D_U32_F32_RR; break;
  case NVPTXISD::Tld4G2DU64Float: Opc = NVPTX::TLD4_G_2D_U32_F32_RR; break;
  case NVPTXISD::Tld4B2DU64Float: Opc = NVPTX::TLD4_B_2D_U32_F32_RR; break;
  case NVPTXISD::Tld4A2DU64Float: Opc = NVPTX::TLD4_A_2D_U32_F32_RR; break;
  case NVPTXISD::TexUnified1DFloatS32: Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R; break;
  case NVPTXISD::TexUnified1DFloatFloat: Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R; break;
  case NVPTXISD::TexUnified1DFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified1DFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified1DS32S32: Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R; break;
  case NVPTXISD::TexUnified1DS32Float: Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R; break;
  case NVPTXISD::TexUnified1DS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified1DS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified1DU32S32: Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R; break;
  case NVPTXISD::TexUnified1DU32Float: Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R; break;
  case NVPTXISD::TexUnified1DU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified1DU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified1DArrayFloatS32: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R; break;
  case NVPTXISD::TexUnified1DArrayFloatFloat: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R; break;
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified1DArrayS32S32: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R; break;
  case NVPTXISD::TexUnified1DArrayS32Float: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R; break;
  case NVPTXISD::TexUnified1DArrayS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified1DArrayS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified1DArrayU32S32: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R; break;
  case NVPTXISD::TexUnified1DArrayU32Float: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R; break;
  case NVPTXISD::TexUnified1DArrayU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified1DArrayU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified2DFloatS32: Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R; break;
  case NVPTXISD::TexUnified2DFloatFloat: Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R; break;
  case NVPTXISD::TexUnified2DFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified2DFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified2DS32S32: Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R; break;
  case NVPTXISD::TexUnified2DS32Float: Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R; break;
  case NVPTXISD::TexUnified2DS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified2DS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified2DU32S32: Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R; break;
  case NVPTXISD::TexUnified2DU32Float: Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R; break;
  case NVPTXISD::TexUnified2DU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified2DU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified2DArrayFloatS32: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R; break;
  case NVPTXISD::TexUnified2DArrayFloatFloat: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R; break;
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified2DArrayS32S32: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R; break;
  case NVPTXISD::TexUnified2DArrayS32Float: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R; break;
  case NVPTXISD::TexUnified2DArrayS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified2DArrayS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified2DArrayU32S32: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R; break;
  case NVPTXISD::TexUnified2DArrayU32Float: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R; break;
  case NVPTXISD::TexUnified2DArrayU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified2DArrayU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified3DFloatS32: Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R; break;
  case NVPTXISD::TexUnified3DFloatFloat: Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R; break;
  case NVPTXISD::TexUnified3DFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified3DFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified3DS32S32: Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R; break;
  case NVPTXISD::TexUnified3DS32Float: Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R; break;
  case NVPTXISD::TexUnified3DS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified3DS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified3DU32S32: Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R; break;
  case NVPTXISD::TexUnified3DU32Float: Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R; break;
  case NVPTXISD::TexUnified3DU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified3DU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R; break;
  case NVPTXISD::TexUnifiedCubeFloatFloat: Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R; break;
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnifiedCubeS32Float: Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R; break;
  case NVPTXISD::TexUnifiedCubeS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnifiedCubeU32Float: Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R; break;
  case NVPTXISD::TexUnifiedCubeU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R; break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnifiedCubeArrayS32Float: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R; break;
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnifiedCubeArrayU32Float: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R; break;
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R; break;
  case NVPTXISD::Tld4UnifiedR2DFloatFloat: Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R; break;
  case NVPTXISD::Tld4UnifiedG2DFloatFloat: Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R; break;
  case NVPTXISD::Tld4UnifiedB2DFloatFloat: Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R; break;
  case NVPTXISD::Tld4UnifiedA2DFloatFloat: Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R; break;
  case NVPTXISD::Tld4UnifiedR2DS64Float: Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R; break;
  case NVPTXISD::Tld4UnifiedG2DS64Float: Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R; break;
  case NVPTXISD::Tld4UnifiedB2DS64Float: Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R; break;
  case NVPTXISD::Tld4UnifiedA2DS64Float: Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R; break;
  case NVPTXISD::Tld4UnifiedR2DU64Float: Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R; break;
  case NVPTXISD::Tld4UnifiedG2DU64Float: Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R; break;
  case NVPTXISD::Tld4UnifiedB2DU64Float: Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R; break;
  case NVPTXISD::Tld4UnifiedA2DU64Float: Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R; break;
  }

  // Copy over operands
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}
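
// Illustration (a sketch; the texture symbol and registers are hypothetical):
// Tex2DFloatFloat, for example, selects to a machine node that prints roughly
// as
//   tex.2d.v4.f32.f32 {%f1, %f2, %f3, %f4}, [tex0, {%f5, %f6}];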
bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
  unsigned Opc = 0;

  switch (N->getOpcode()) {
  default: return false;
  case NVPTXISD::Suld1DI8Clamp: Opc = NVPTX::SULD_1D_I8_CLAMP_R; break;
  case NVPTXISD::Suld1DI16Clamp: Opc = NVPTX::SULD_1D_I16_CLAMP_R; break;
  case NVPTXISD::Suld1DI32Clamp: Opc = NVPTX::SULD_1D_I32_CLAMP_R; break;
  case NVPTXISD::Suld1DI64Clamp: Opc = NVPTX::SULD_1D_I64_CLAMP_R; break;
  case NVPTXISD::Suld1DV2I8Clamp: Opc = NVPTX::SULD_1D_V2I8_CLAMP_R; break;
  case NVPTXISD::Suld1DV2I16Clamp: Opc = NVPTX::SULD_1D_V2I16_CLAMP_R; break;
  case NVPTXISD::Suld1DV2I32Clamp: Opc = NVPTX::SULD_1D_V2I32_CLAMP_R; break;
  case NVPTXISD::Suld1DV2I64Clamp: Opc = NVPTX::SULD_1D_V2I64_CLAMP_R; break;
  case NVPTXISD::Suld1DV4I8Clamp: Opc = NVPTX::SULD_1D_V4I8_CLAMP_R; break;
  case NVPTXISD::Suld1DV4I16Clamp: Opc = NVPTX::SULD_1D_V4I16_CLAMP_R; break;
  case NVPTXISD::Suld1DV4I32Clamp: Opc = NVPTX::SULD_1D_V4I32_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayI8Clamp: Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayI16Clamp: Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayI32Clamp: Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayI64Clamp: Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayV2I8Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayV2I16Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayV2I32Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayV2I64Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayV4I8Clamp: Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayV4I16Clamp: Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayV4I32Clamp: Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R; break;
  case NVPTXISD::Suld2DI8Clamp: Opc = NVPTX::SULD_2D_I8_CLAMP_R; break;
  case NVPTXISD::Suld2DI16Clamp: Opc = NVPTX::SULD_2D_I16_CLAMP_R; break;
  case NVPTXISD::Suld2DI32Clamp: Opc = NVPTX::SULD_2D_I32_CLAMP_R; break;
  case NVPTXISD::Suld2DI64Clamp: Opc = NVPTX::SULD_2D_I64_CLAMP_R; break;
  case NVPTXISD::Suld2DV2I8Clamp: Opc = NVPTX::SULD_2D_V2I8_CLAMP_R; break;
  case NVPTXISD::Suld2DV2I16Clamp: Opc = NVPTX::SULD_2D_V2I16_CLAMP_R; break;
  case NVPTXISD::Suld2DV2I32Clamp: Opc = NVPTX::SULD_2D_V2I32_CLAMP_R; break;
  case NVPTXISD::Suld2DV2I64Clamp: Opc = NVPTX::SULD_2D_V2I64_CLAMP_R; break;
  case NVPTXISD::Suld2DV4I8Clamp: Opc = NVPTX::SULD_2D_V4I8_CLAMP_R; break;
  case NVPTXISD::Suld2DV4I16Clamp: Opc = NVPTX::SULD_2D_V4I16_CLAMP_R; break;
  case NVPTXISD::Suld2DV4I32Clamp: Opc = NVPTX::SULD_2D_V4I32_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayI8Clamp: Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayI16Clamp: Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayI32Clamp: Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayI64Clamp: Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayV2I8Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayV2I16Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayV2I32Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayV2I64Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayV4I8Clamp: Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayV4I16Clamp: Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayV4I32Clamp: Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R; break;
  case NVPTXISD::Suld3DI8Clamp: Opc = NVPTX::SULD_3D_I8_CLAMP_R; break;
  case NVPTXISD::Suld3DI16Clamp: Opc = NVPTX::SULD_3D_I16_CLAMP_R; break;
  case NVPTXISD::Suld3DI32Clamp: Opc = NVPTX::SULD_3D_I32_CLAMP_R; break;
  case NVPTXISD::Suld3DI64Clamp: Opc = NVPTX::SULD_3D_I64_CLAMP_R; break;
  case NVPTXISD::Suld3DV2I8Clamp: Opc = NVPTX::SULD_3D_V2I8_CLAMP_R; break;
  case NVPTXISD::Suld3DV2I16Clamp: Opc = NVPTX::SULD_3D_V2I16_CLAMP_R; break;
  case NVPTXISD::Suld3DV2I32Clamp: Opc = NVPTX::SULD_3D_V2I32_CLAMP_R; break;
  case NVPTXISD::Suld3DV2I64Clamp: Opc = NVPTX::SULD_3D_V2I64_CLAMP_R; break;
  case NVPTXISD::Suld3DV4I8Clamp: Opc = NVPTX::SULD_3D_V4I8_CLAMP_R; break;
  case NVPTXISD::Suld3DV4I16Clamp: Opc = NVPTX::SULD_3D_V4I16_CLAMP_R; break;
  case NVPTXISD::Suld3DV4I32Clamp: Opc = NVPTX::SULD_3D_V4I32_CLAMP_R; break;
  case NVPTXISD::Suld1DI8Trap: Opc = NVPTX::SULD_1D_I8_TRAP_R; break;
  case NVPTXISD::Suld1DI16Trap: Opc = NVPTX::SULD_1D_I16_TRAP_R; break;
  case NVPTXISD::Suld1DI32Trap: Opc = NVPTX::SULD_1D_I32_TRAP_R; break;
  case NVPTXISD::Suld1DI64Trap: Opc = NVPTX::SULD_1D_I64_TRAP_R; break;
  case NVPTXISD::Suld1DV2I8Trap: Opc = NVPTX::SULD_1D_V2I8_TRAP_R; break;
  case NVPTXISD::Suld1DV2I16Trap: Opc = NVPTX::SULD_1D_V2I16_TRAP_R; break;
  case NVPTXISD::Suld1DV2I32Trap: Opc = NVPTX::SULD_1D_V2I32_TRAP_R; break;
  case NVPTXISD::Suld1DV2I64Trap: Opc = NVPTX::SULD_1D_V2I64_TRAP_R; break;
  case NVPTXISD::Suld1DV4I8Trap: Opc = NVPTX::SULD_1D_V4I8_TRAP_R; break;
  case NVPTXISD::Suld1DV4I16Trap: Opc = NVPTX::SULD_1D_V4I16_TRAP_R; break;
  case NVPTXISD::Suld1DV4I32Trap: Opc = NVPTX::SULD_1D_V4I32_TRAP_R; break;
  case NVPTXISD::Suld1DArrayI8Trap: Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R; break;
  case NVPTXISD::Suld1DArrayI16Trap: Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R; break;
  case NVPTXISD::Suld1DArrayI32Trap: Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R; break;
  case NVPTXISD::Suld1DArrayI64Trap: Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R; break;
  case NVPTXISD::Suld1DArrayV2I8Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R; break;
  case NVPTXISD::Suld1DArrayV2I16Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R; break;
  case NVPTXISD::Suld1DArrayV2I32Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R; break;
  case NVPTXISD::Suld1DArrayV2I64Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R; break;
  case NVPTXISD::Suld1DArrayV4I8Trap: Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R; break;
  case NVPTXISD::Suld1DArrayV4I16Trap: Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R; break;
  case NVPTXISD::Suld1DArrayV4I32Trap: Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R; break;
  case NVPTXISD::Suld2DI8Trap: Opc = NVPTX::SULD_2D_I8_TRAP_R; break;
  case NVPTXISD::Suld2DI16Trap: Opc = NVPTX::SULD_2D_I16_TRAP_R; break;
  case NVPTXISD::Suld2DI32Trap: Opc = NVPTX::SULD_2D_I32_TRAP_R; break;
  case NVPTXISD::Suld2DI64Trap: Opc = NVPTX::SULD_2D_I64_TRAP_R; break;
  case NVPTXISD::Suld2DV2I8Trap: Opc = NVPTX::SULD_2D_V2I8_TRAP_R; break;
  case NVPTXISD::Suld2DV2I16Trap: Opc = NVPTX::SULD_2D_V2I16_TRAP_R; break;
  case NVPTXISD::Suld2DV2I32Trap: Opc = NVPTX::SULD_2D_V2I32_TRAP_R; break;
  case NVPTXISD::Suld2DV2I64Trap: Opc = NVPTX::SULD_2D_V2I64_TRAP_R; break;
  case NVPTXISD::Suld2DV4I8Trap: Opc = NVPTX::SULD_2D_V4I8_TRAP_R; break;
  case NVPTXISD::Suld2DV4I16Trap: Opc = NVPTX::SULD_2D_V4I16_TRAP_R; break;
  case NVPTXISD::Suld2DV4I32Trap: Opc = NVPTX::SULD_2D_V4I32_TRAP_R; break;
  case NVPTXISD::Suld2DArrayI8Trap: Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R; break;
  case NVPTXISD::Suld2DArrayI16Trap: Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R; break;
  case NVPTXISD::Suld2DArrayI32Trap: Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R; break;
  case NVPTXISD::Suld2DArrayI64Trap: Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R; break;
  case NVPTXISD::Suld2DArrayV2I8Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R; break;
  case NVPTXISD::Suld2DArrayV2I16Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R; break;
  case NVPTXISD::Suld2DArrayV2I32Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R; break;
  case NVPTXISD::Suld2DArrayV2I64Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R; break;
  case NVPTXISD::Suld2DArrayV4I8Trap: Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R; break;
  case NVPTXISD::Suld2DArrayV4I16Trap: Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R; break;
  case NVPTXISD::Suld2DArrayV4I32Trap: Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R; break;
  case NVPTXISD::Suld3DI8Trap: Opc = NVPTX::SULD_3D_I8_TRAP_R; break;
  case NVPTXISD::Suld3DI16Trap: Opc = NVPTX::SULD_3D_I16_TRAP_R; break;
  case NVPTXISD::Suld3DI32Trap: Opc = NVPTX::SULD_3D_I32_TRAP_R; break;
  case NVPTXISD::Suld3DI64Trap: Opc = NVPTX::SULD_3D_I64_TRAP_R; break;
  case NVPTXISD::Suld3DV2I8Trap: Opc = NVPTX::SULD_3D_V2I8_TRAP_R; break;
  case NVPTXISD::Suld3DV2I16Trap: Opc = NVPTX::SULD_3D_V2I16_TRAP_R; break;
  case NVPTXISD::Suld3DV2I32Trap: Opc = NVPTX::SULD_3D_V2I32_TRAP_R; break;
  case NVPTXISD::Suld3DV2I64Trap: Opc = NVPTX::SULD_3D_V2I64_TRAP_R; break;
  case NVPTXISD::Suld3DV4I8Trap: Opc = NVPTX::SULD_3D_V4I8_TRAP_R; break;
  case NVPTXISD::Suld3DV4I16Trap: Opc = NVPTX::SULD_3D_V4I16_TRAP_R; break;
  case NVPTXISD::Suld3DV4I32Trap: Opc = NVPTX::SULD_3D_V4I32_TRAP_R; break;
  case NVPTXISD::Suld1DI8Zero: Opc = NVPTX::SULD_1D_I8_ZERO_R; break;
  case NVPTXISD::Suld1DI16Zero: Opc = NVPTX::SULD_1D_I16_ZERO_R; break;
  case NVPTXISD::Suld1DI32Zero: Opc = NVPTX::SULD_1D_I32_ZERO_R; break;
  case NVPTXISD::Suld1DI64Zero: Opc = NVPTX::SULD_1D_I64_ZERO_R; break;
  case NVPTXISD::Suld1DV2I8Zero: Opc = NVPTX::SULD_1D_V2I8_ZERO_R; break;
  case NVPTXISD::Suld1DV2I16Zero: Opc = NVPTX::SULD_1D_V2I16_ZERO_R; break;
  case NVPTXISD::Suld1DV2I32Zero: Opc = NVPTX::SULD_1D_V2I32_ZERO_R; break;
  case NVPTXISD::Suld1DV2I64Zero: Opc = NVPTX::SULD_1D_V2I64_ZERO_R; break;
  case NVPTXISD::Suld1DV4I8Zero: Opc = NVPTX::SULD_1D_V4I8_ZERO_R; break;
  case NVPTXISD::Suld1DV4I16Zero: Opc = NVPTX::SULD_1D_V4I16_ZERO_R; break;
  case NVPTXISD::Suld1DV4I32Zero: Opc = NVPTX::SULD_1D_V4I32_ZERO_R; break;
  case NVPTXISD::Suld1DArrayI8Zero: Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R; break;
  case NVPTXISD::Suld1DArrayI16Zero: Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R; break;
  case NVPTXISD::Suld1DArrayI32Zero: Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R; break;
  case NVPTXISD::Suld1DArrayI64Zero: Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R; break;
  case NVPTXISD::Suld1DArrayV2I8Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R; break;
  case NVPTXISD::Suld1DArrayV2I16Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R; break;
  case NVPTXISD::Suld1DArrayV2I32Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R; break;
  case NVPTXISD::Suld1DArrayV2I64Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R; break;
  case NVPTXISD::Suld1DArrayV4I8Zero: Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R; break;
  case NVPTXISD::Suld1DArrayV4I16Zero: Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R; break;
  case NVPTXISD::Suld1DArrayV4I32Zero: Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R; break;
  case NVPTXISD::Suld2DI8Zero: Opc = NVPTX::SULD_2D_I8_ZERO_R; break;
  case NVPTXISD::Suld2DI16Zero: Opc = NVPTX::SULD_2D_I16_ZERO_R; break;
  case NVPTXISD::Suld2DI32Zero: Opc = NVPTX::SULD_2D_I32_ZERO_R; break;
  case NVPTXISD::Suld2DI64Zero: Opc = NVPTX::SULD_2D_I64_ZERO_R; break;
  case NVPTXISD::Suld2DV2I8Zero: Opc = NVPTX::SULD_2D_V2I8_ZERO_R; break;
  case NVPTXISD::Suld2DV2I16Zero: Opc = NVPTX::SULD_2D_V2I16_ZERO_R; break;
  case NVPTXISD::Suld2DV2I32Zero: Opc = NVPTX::SULD_2D_V2I32_ZERO_R; break;
  case NVPTXISD::Suld2DV2I64Zero: Opc = NVPTX::SULD_2D_V2I64_ZERO_R; break;
  case NVPTXISD::Suld2DV4I8Zero: Opc = NVPTX::SULD_2D_V4I8_ZERO_R; break;
  case NVPTXISD::Suld2DV4I16Zero: Opc = NVPTX::SULD_2D_V4I16_ZERO_R; break;
  case NVPTXISD::Suld2DV4I32Zero: Opc = NVPTX::SULD_2D_V4I32_ZERO_R; break;
  case NVPTXISD::Suld2DArrayI8Zero: Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R; break;
  case NVPTXISD::Suld2DArrayI16Zero: Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R; break;
  case NVPTXISD::Suld2DArrayI32Zero: Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R; break;
  case NVPTXISD::Suld2DArrayI64Zero: Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R; break;
  case NVPTXISD::Suld2DArrayV2I8Zero: Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R; break;
  case NVPTXISD::Suld2DArrayV2I16Zero: Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R; break;
  case NVPTXISD::Suld2DArrayV2I32Zero: Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R; break;
  case NVPTXISD::Suld2DArrayV2I64Zero: Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R; break;
  case NVPTXISD::Suld2DArrayV4I8Zero: Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R; break;
  case NVPTXISD::Suld2DArrayV4I16Zero: Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R; break;
  case NVPTXISD::Suld2DArrayV4I32Zero: Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R; break;
  case NVPTXISD::Suld3DI8Zero: Opc = NVPTX::SULD_3D_I8_ZERO_R; break;
  case NVPTXISD::Suld3DI16Zero: Opc = NVPTX::SULD_3D_I16_ZERO_R; break;
  case NVPTXISD::Suld3DI32Zero: Opc = NVPTX::SULD_3D_I32_ZERO_R; break;
  case NVPTXISD::Suld3DI64Zero: Opc = NVPTX::SULD_3D_I64_ZERO_R; break;
  case NVPTXISD::Suld3DV2I8Zero: Opc = NVPTX::SULD_3D_V2I8_ZERO_R; break;
  case NVPTXISD::Suld3DV2I16Zero: Opc = NVPTX::SULD_3D_V2I16_ZERO_R; break;
  case NVPTXISD::Suld3DV2I32Zero: Opc = NVPTX::SULD_3D_V2I32_ZERO_R; break;
  case NVPTXISD::Suld3DV2I64Zero: Opc = NVPTX::SULD_3D_V2I64_ZERO_R; break;
  case NVPTXISD::Suld3DV4I8Zero: Opc = NVPTX::SULD_3D_V4I8_ZERO_R; break;
  case NVPTXISD::Suld3DV4I16Zero: Opc = NVPTX::SULD_3D_V4I16_ZERO_R; break;
  case NVPTXISD::Suld3DV4I32Zero: Opc = NVPTX::SULD_3D_V4I32_ZERO_R; break;
  }

  // Copy over operands
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}
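
// Illustration (a sketch; the surface symbol and registers are hypothetical):
// Suld2DI32Trap, for example, selects to a surface load that prints roughly as
//   suld.b.2d.b32.trap {%r1}, [surf0, {%r2, %r3}];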
/// SelectBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Len;
  SDValue Start;
  SDValue Val;
  bool IsSigned = false;

  if (N->getOpcode() == ISD::AND) {
    // Canonicalize the operands
    // We want 'and %val, %mask'
    if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
      std::swap(LHS, RHS);
    }

    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
    if (!Mask) {
      // We need a constant mask on the RHS of the AND
      return false;
    }

    // Extract the mask bits
    uint64_t MaskVal = Mask->getZExtValue();
    if (!isMask_64(MaskVal)) {
      // We *could* handle shifted masks here, but doing so would require an
      // 'and' operation to fix up the low-order bits so we would trade
      // shr+and for bfe+and, which has the same throughput
      return false;
    }

    // How many bits are in our mask?
    int64_t NumBits = countr_one(MaskVal);
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
      // We have a 'srl/and' pair, extract the effective start bit and length
      Val = LHS.getNode()->getOperand(0);
      Start = LHS.getNode()->getOperand(1);
      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
      if (StartConst) {
        uint64_t StartVal = StartConst->getZExtValue();
        // How many "good" bits do we have left? "good" is defined here as bits
        // that exist in the original value, not shifted in.
        int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
        if (NumBits > GoodBits) {
          // Do not handle the case where bits have been shifted in. In theory
          // we could handle this, but the cost is likely higher than just
          // emitting the srl/and pair.
          return false;
        }
        Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
      } else {
        // Do not handle the case where the shift amount (can be zero if no srl
        // was found) is not constant. We could handle this case, but it would
        // require run-time logic that would be more expensive than just
        // emitting the srl/and pair.
        return false;
      }
    } else {
      // Do not handle the case where the LHS of the and is not a shift. While
      // it would be trivial to handle this case, it would just transform
      // 'and' -> 'bfe', but 'and' has higher throughput.
      return false;
    }
  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
    if (LHS->getOpcode() == ISD::AND) {
      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
      if (!ShiftCnst) {
        // Shift amount must be constant
        return false;
      }

      uint64_t ShiftAmt = ShiftCnst->getZExtValue();

      SDValue AndLHS = LHS->getOperand(0);
      SDValue AndRHS = LHS->getOperand(1);

      // Canonicalize the AND to have the mask on the RHS
      if (isa<ConstantSDNode>(AndLHS)) {
        std::swap(AndLHS, AndRHS);
      }

      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
      if (!MaskCnst) {
        // Mask must be constant
        return false;
      }

      uint64_t MaskVal = MaskCnst->getZExtValue();
      uint64_t NumZeros;
      uint64_t NumBits;
      if (isMask_64(MaskVal)) {
        NumZeros = 0;
        // The number of bits in the result bitfield will be the number of
        // trailing ones (the AND) minus the number of bits we shift off
        NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
      } else if (isShiftedMask_64(MaskVal)) {
        NumZeros = llvm::countr_zero(MaskVal);
        unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
        // The number of bits in the result bitfield will be the number of
        // trailing zeros plus the number of set bits in the mask minus the
        // number of bits we shift off
        NumBits = NumZeros + NumOnes - ShiftAmt;
      } else {
        // This is not a mask we can handle
        return false;
      }

      if (ShiftAmt < NumZeros) {
        // Handling this case would require extra logic that would make this
        // transformation non-profitable
        return false;
      }

      Val = AndLHS;
      Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
      Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
    } else if (LHS->getOpcode() == ISD::SHL) {
      // Here, we have a pattern like:
      //
      // (sra (shl val, NN), MM)
      // or
      // (srl (shl val, NN), MM)
      //
      // If MM >= NN, we can efficiently optimize this with bfe
      Val = LHS->getOperand(0);

      SDValue ShlRHS = LHS->getOperand(1);
      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
      if (!ShlCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();

      SDValue ShrRHS = RHS;
      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
      if (!ShrCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();

      // To avoid extra codegen and be profitable, we need Outer >= Inner
      if (OuterShiftAmt < InnerShiftAmt) {
        return false;
      }

      // If the outer shift is more than the type size, we have no bitfield to
      // extract (since we also check that the inner shift is <= the outer shift
      // then this also implies that the inner shift is < the type size)
      if (OuterShiftAmt >= Val.getValueSizeInBits()) {
        return false;
      }

      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
                                        MVT::i32);
      Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
                                      DL, MVT::i32);

      if (N->getOpcode() == ISD::SRA) {
        // If we have an arithmetic right shift, we need to use the signed bfe
        // variant
        IsSigned = true;
      }
    } else {
      // No can do...
      return false;
    }
  } else {
    // No can do...
    return false;
  }

  unsigned Opc;
  // For the BFE operations we form here from "and" and "srl", always use the
  // unsigned variants.
  if (Val.getValueType() == MVT::i32) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S32rii;
    } else {
      Opc = NVPTX::BFE_U32rii;
    }
  } else if (Val.getValueType() == MVT::i64) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S64rii;
    } else {
      Opc = NVPTX::BFE_U64rii;
    }
  } else {
    // We cannot handle this type
    return false;
  }

  SDValue Ops[] = {Val, Start, Len};

  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
  return true;
}
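
// Illustration of the rewrite above (register names are hypothetical): a
// shift+mask pair becomes a single bit-field extract. For a 32-bit %r1,
//   (and (srl %r1, 4), 0xff)   ==>   bfe.u32 %r2, %r1, 4, 8;
// i.e. extract 8 bits of %r1 starting at bit 4.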
// SelectDirectAddr - Match a direct address for DAG.
// A direct address could be a globaladdress or externalsymbol.
bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
  // Return true if TGA or ES.
  if (N.getOpcode() == ISD::TargetGlobalAddress ||
      N.getOpcode() == ISD::TargetExternalSymbol) {
    Address = N;
    return true;
  }
  if (N.getOpcode() == NVPTXISD::Wrapper) {
    Address = N.getOperand(0);
    return true;
  }
  // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
  if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
    if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
        CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
        CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
      return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
  }
  return false;
}
bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
  if (Addr.getOpcode() == ISD::ADD) {
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      SDValue base = Addr.getOperand(0);
      if (SelectDirectAddr(base, Base)) {
        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
                                           mvt);
        return true;
      }
    }
  }
  return false;
}

bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
}

bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
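
// Illustration (a sketch; names are hypothetical): SelectADDRsi matches a
// "symbol plus constant offset" subtree such as (add (Wrapper @gvar), 8), so
// a load or store using it can print as
//   st.global.u32 [gvar+8], %r1;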
bool NVPTXDAGToDAGISel::SelectADDRri_imp(
    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
    Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
    return true;
  }
  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
      Addr.getOpcode() == ISD::TargetGlobalAddress)
    return false; // direct calls.

  if (Addr.getOpcode() == ISD::ADD) {
    if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
      return false;
    }
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      if (FrameIndexSDNode *FIN =
              dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
        // Constant offset from frame ref.
        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
      else
        Base = Addr.getOperand(0);
      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
                                         mvt);
      return true;
    }
  }
  return false;
}

bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
}

bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
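
// Illustration (a sketch; names are hypothetical): SelectADDRri matches a
// "register plus constant offset" address, including frame indices, e.g.
// (add %rd1, 16) becomes the operand pair printed as [%rd1+16], and a bare
// frame index becomes a [<frame object>+0] pair.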
bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
                                                 unsigned int spN) const {
  const Value *Src = nullptr;
  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
      return true;
    Src = mN->getMemOperand()->getValue();
  }
  if (!Src)
    return false;
  if (auto *PT = dyn_cast<PointerType>(Src->getType()))
    return (PT->getAddressSpace() == spN);
  return false;
}
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1;
  switch (ConstraintID) {
  default:
    return true;
  case InlineAsm::ConstraintCode::m: // memory
    if (SelectDirectAddr(Op, Op0)) {
      OutOps.push_back(Op0);
      OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
      return false;
    }
    if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
      OutOps.push_back(Op0);
      OutOps.push_back(Op1);
      return false;
    }
    break;
  }
  return true;
}
/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
/// conversion from \p SrcTy to \p DestTy.
unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
                                             LoadSDNode *LdNode) {
  bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
  switch (SrcTy.SimpleTy) {
  default:
    llvm_unreachable("Unhandled source type");
  case MVT::i8:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
    }
  case MVT::i16:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
    }
  case MVT::i32:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
    }
  case MVT::i64:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
    }
  case MVT::f16:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::f32:
      return NVPTX::CVT_f32_f16;
    case MVT::f64:
      return NVPTX::CVT_f64_f16;