//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
14 #include "NVPTXISelLowering.h"
15 #include "MCTargetDesc/NVPTXBaseInfo.h"
17 #include "NVPTXSubtarget.h"
18 #include "NVPTXTargetMachine.h"
19 #include "NVPTXTargetObjectFile.h"
20 #include "NVPTXUtilities.h"
21 #include "llvm/ADT/APInt.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/ADT/StringRef.h"
24 #include "llvm/CodeGen/Analysis.h"
25 #include "llvm/CodeGen/MachineFunction.h"
26 #include "llvm/CodeGen/MachineMemOperand.h"
27 #include "llvm/CodeGen/SelectionDAG.h"
28 #include "llvm/CodeGen/SelectionDAGNodes.h"
29 #include "llvm/CodeGen/TargetCallingConv.h"
30 #include "llvm/CodeGen/TargetLowering.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/Argument.h"
33 #include "llvm/IR/Attributes.h"
34 #include "llvm/IR/CallSite.h"
35 #include "llvm/IR/Constants.h"
36 #include "llvm/IR/DataLayout.h"
37 #include "llvm/IR/DerivedTypes.h"
38 #include "llvm/IR/Function.h"
39 #include "llvm/IR/GlobalValue.h"
40 #include "llvm/IR/Instruction.h"
41 #include "llvm/IR/Instructions.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/IR/Type.h"
44 #include "llvm/IR/Value.h"
45 #include "llvm/Support/Casting.h"
46 #include "llvm/Support/CodeGen.h"
47 #include "llvm/Support/CommandLine.h"
48 #include "llvm/Support/ErrorHandling.h"
49 #include "llvm/Support/MachineValueType.h"
50 #include "llvm/Support/MathExtras.h"
51 #include "llvm/Support/raw_ostream.h"
52 #include "llvm/Target/TargetMachine.h"
53 #include "llvm/Target/TargetOptions.h"
63 #define DEBUG_TYPE "nvptx-lower"
static unsigned int uniqueCallSite = 0;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));

static cl::opt<unsigned>
FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
                             " 1: do it  2: do it aggressively"));

static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."));

static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."));

static cl::opt<bool> FtzEnabled(
    "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
    cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."));
int NVPTXTargetLowering::getDivF32Level() const {
  if (UsePrecDivF32.getNumOccurrences() > 0) {
    // If nvptx-prec-divf32=N is used on the command-line, always honor it
    return UsePrecDivF32;
  } else {
    // Otherwise, use div.approx if fast math is enabled
    if (getTargetMachine().Options.UnsafeFPMath)
      return 0;
    else
      return 2;
  }
}
bool NVPTXTargetLowering::usePrecSqrtF32() const {
  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
    return UsePrecSqrtF32;
  } else {
    // Otherwise, use sqrt.approx if fast math is enabled
    return !getTargetMachine().Options.UnsafeFPMath;
  }
}
bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  // TODO: Get rid of this flag; there can be only one way to do this.
  if (FtzEnabled.getNumOccurrences() > 0) {
    // If nvptx-f32ftz is used on the command-line, always honor it
    return FtzEnabled;
  }

  const Function &F = MF.getFunction();
  // Otherwise, check for an nvptx-f32ftz attribute on the function
  if (F.hasFnAttribute("nvptx-f32ftz"))
    return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";

  return false;
}
static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v8f16: // <4 x f16x2>
    return true;
  }
}
/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  // Special case for i128 - decompose to (i64, i64)
  if (Ty->isIntegerTy(128)) {
    ValueVTs.push_back(EVT(MVT::i64));
    ValueVTs.push_back(EVT(MVT::i64));

    if (Offsets) {
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
    }

    return;
  }

  // Given a struct type, recursively traverse the elements with custom
  // ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    auto ElementNum = 0;
    for (auto *EI : STy->elements()) {
      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
                         StartingOffset + SL->getElementOffset(ElementNum));
      ++ElementNum;
    }
    return;
  }

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    // Split vectors into individual elements, except for v2f16, which
    // we will pass as a single scalar.
    if (VT.isVector()) {
      unsigned NumElts = VT.getVectorNumElements();
      EVT EltVT = VT.getVectorElementType();
      // Vectors with an even number of f16 elements will be passed to
      // us as an array of v2f16 elements. We must match this so we
      // stay in sync with Ins/Outs.
      if (EltVT == MVT::f16 && NumElts % 2 == 0) {
        EltVT = MVT::v2f16;
        NumElts /= 2;
      }
      for (unsigned j = 0; j != NumElts; ++j) {
        ValueVTs.push_back(EltVT);
        if (Offsets)
          Offsets->push_back(Off + j * EltVT.getStoreSize());
      }
    } else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}
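
// For example, flattening a single <4 x half> value yields two v2f16 pieces at
// offsets 0 and 4 from StartingOffset, while an i128 always yields two i64
// pieces at offsets 0 and 8.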
// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
static unsigned CanMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
  assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");

  // Can't vectorize if param alignment is not sufficient.
  if (AccessSize > ParamAlignment)
    return 1;
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
    return 1;

  EVT EltVT = ValueVTs[Idx];
  unsigned EltSize = EltVT.getStoreSize();

  // Element is too large to vectorize.
  if (EltSize >= AccessSize)
    return 1;

  unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)
    return 1;

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())
    return 1;

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
  return NumElts;
}
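
// For example, with ValueVTs = {f32, f32, f32, f32}, Offsets = {0, 4, 8, 12}
// and ParamAlignment = 16, calling this with Idx = 0 and AccessSize = 16
// returns 4: all four pieces can be covered by a single 16-byte access.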
// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0,  // Middle elements of a vector.
  PVF_FIRST = 0x1,  // First element of the vector.
  PVF_LAST = 0x2,   // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};
// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector load/store).
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     unsigned ParamAlignment) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  // Check what we can vectorize using 128/64/32/16-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}
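
// For example, four contiguous i32 pieces of a 16-byte-aligned parameter come
// back as {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST} and will be accessed as
// one v4i32 operation, while an isolated piece keeps its PVF_SCALAR default.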
// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy or memmove.
  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);

  // By default, use the Source scheduling.
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoF16Action) {
    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
  };

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);

  // Conversion to/from FP16/FP16x2 is always legal.
  setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
  setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);

  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);

  // Operations not directly supported by NVPTX.
  for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
                 MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::BR_CC, VT, Expand);
  }

  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  // For others we will expand to a SHL/SRA pair.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // TODO: we may consider expanding ROTL/ROTR on older GPUs.  Currently on GPUs
  // that don't have h/w rotation we lower them to multi-instruction assembly.
  // See ROT*_sw in NVPTXIntrInfo.td
  setOperationAction(ISD::ROTL, MVT::i64, Legal);
  setOperationAction(ISD::ROTR, MVT::i64, Legal);
  setOperationAction(ISD::ROTL, MVT::i32, Legal);
  setOperationAction(ISD::ROTR, MVT::i32, Legal);

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant related memmove and memcpy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fpextend
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  // Turn FP truncstore into trunc + store.
  // FIXME: vector types should also be expanded
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load / store predicate registers
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // This is legal in NVPTX
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

  // TRAP can be lowered to PTX trap
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // Register custom handling for vector loads/stores
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  // Custom handling for i8 intrinsics
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  for (const auto &Ty : {MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::ABS, Ty, Legal);
    setOperationAction(ISD::SMIN, Ty, Legal);
    setOperationAction(ISD::SMAX, Ty, Legal);
    setOperationAction(ISD::UMIN, Ty, Legal);
    setOperationAction(ISD::UMAX, Ty, Legal);

    setOperationAction(ISD::CTPOP, Ty, Legal);
    setOperationAction(ISD::CTLZ, Ty, Legal);
  }

  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);

  // PTX does not directly support SELP of i1, so promote to i32 first
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // PTX cannot multiply two i64s in a single instruction.
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);

  // We have some custom DAG combine patterns for these nodes
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SREM);
  setTargetDAGCombine(ISD::UREM);

  // setcc for f16x2 needs special handling to prevent the legalizer's
  // attempt to scalarize it due to v2i1 not being legal.
  if (STI.allowFP16Math())
    setTargetDAGCombine(ISD::SETCC);

  // Promote fp16 arithmetic if fp16 hardware isn't available or the
  // user passed --nvptx-no-fp16-math. The flag is useful because,
  // although sm_53+ GPUs have some sort of FP16 support in
  // hardware, only sm_53 and sm_60 have a full implementation. Others
  // only have a token amount of hardware and are likely to run faster
  // by using fp32 units instead.
  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
  }

  // There's no neg.f16 instruction. Expand to (0-x).
  setOperationAction(ISD::FNEG, MVT::f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v2f16, Expand);

  // (would be) Library functions.

  // These map to conversion instructions for scalar FP types.
  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
                         ISD::FTRUNC}) {
    setOperationAction(Op, MVT::f16, Legal);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
  }

  setOperationAction(ISD::FROUND, MVT::f16, Promote);
  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, MVT::f64, Custom);

  // 'Expand' implements FCOPYSIGN without calling an external library.
  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);

  // These map to corresponding instructions for f32/f64. f16 must be
  // promoted to f32. v2f16 is expanded to f16, which is then promoted
  // to f32.
  for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
                         ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
  }
  setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
  setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
  setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
  setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);

  // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
  // No FPOW or FREM in PTX.

  // Now deduce the information based on the above mentioned
  // actions.
  computeRegisterProperties(STI.getRegisterInfo());
}
const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((NVPTXISD::NodeType)Opcode) {
  case NVPTXISD::FIRST_NUMBER:
    break;
  case NVPTXISD::CALL:
    return "NVPTXISD::CALL";
  case NVPTXISD::RET_FLAG:
    return "NVPTXISD::RET_FLAG";
  case NVPTXISD::LOAD_PARAM:
    return "NVPTXISD::LOAD_PARAM";
  case NVPTXISD::Wrapper:
    return "NVPTXISD::Wrapper";
  case NVPTXISD::DeclareParam:
    return "NVPTXISD::DeclareParam";
  case NVPTXISD::DeclareScalarParam:
    return "NVPTXISD::DeclareScalarParam";
  case NVPTXISD::DeclareRet:
    return "NVPTXISD::DeclareRet";
  case NVPTXISD::DeclareScalarRet:
    return "NVPTXISD::DeclareScalarRet";
  case NVPTXISD::DeclareRetParam:
    return "NVPTXISD::DeclareRetParam";
  case NVPTXISD::PrintCall:
    return "NVPTXISD::PrintCall";
  case NVPTXISD::PrintConvergentCall:
    return "NVPTXISD::PrintConvergentCall";
  case NVPTXISD::PrintCallUni:
    return "NVPTXISD::PrintCallUni";
  case NVPTXISD::PrintConvergentCallUni:
    return "NVPTXISD::PrintConvergentCallUni";
  case NVPTXISD::LoadParam:
    return "NVPTXISD::LoadParam";
  case NVPTXISD::LoadParamV2:
    return "NVPTXISD::LoadParamV2";
  case NVPTXISD::LoadParamV4:
    return "NVPTXISD::LoadParamV4";
  case NVPTXISD::StoreParam:
    return "NVPTXISD::StoreParam";
  case NVPTXISD::StoreParamV2:
    return "NVPTXISD::StoreParamV2";
  case NVPTXISD::StoreParamV4:
    return "NVPTXISD::StoreParamV4";
  case NVPTXISD::StoreParamS32:
    return "NVPTXISD::StoreParamS32";
  case NVPTXISD::StoreParamU32:
    return "NVPTXISD::StoreParamU32";
  case NVPTXISD::CallArgBegin:
    return "NVPTXISD::CallArgBegin";
  case NVPTXISD::CallArg:
    return "NVPTXISD::CallArg";
  case NVPTXISD::LastCallArg:
    return "NVPTXISD::LastCallArg";
  case NVPTXISD::CallArgEnd:
    return "NVPTXISD::CallArgEnd";
  case NVPTXISD::CallVoid:
    return "NVPTXISD::CallVoid";
  case NVPTXISD::CallVal:
    return "NVPTXISD::CallVal";
  case NVPTXISD::CallSymbol:
    return "NVPTXISD::CallSymbol";
  case NVPTXISD::Prototype:
    return "NVPTXISD::Prototype";
  case NVPTXISD::MoveParam:
    return "NVPTXISD::MoveParam";
  case NVPTXISD::StoreRetval:
    return "NVPTXISD::StoreRetval";
  case NVPTXISD::StoreRetvalV2:
    return "NVPTXISD::StoreRetvalV2";
  case NVPTXISD::StoreRetvalV4:
    return "NVPTXISD::StoreRetvalV4";
  case NVPTXISD::PseudoUseParam:
    return "NVPTXISD::PseudoUseParam";
  case NVPTXISD::RETURN:
    return "NVPTXISD::RETURN";
  case NVPTXISD::CallSeqBegin:
    return "NVPTXISD::CallSeqBegin";
  case NVPTXISD::CallSeqEnd:
    return "NVPTXISD::CallSeqEnd";
  case NVPTXISD::CallPrototype:
    return "NVPTXISD::CallPrototype";
  case NVPTXISD::ProxyReg:
    return "NVPTXISD::ProxyReg";
  case NVPTXISD::LoadV2:
    return "NVPTXISD::LoadV2";
  case NVPTXISD::LoadV4:
    return "NVPTXISD::LoadV4";
  case NVPTXISD::LDGV2:
    return "NVPTXISD::LDGV2";
  case NVPTXISD::LDGV4:
    return "NVPTXISD::LDGV4";
  case NVPTXISD::LDUV2:
    return "NVPTXISD::LDUV2";
  case NVPTXISD::LDUV4:
    return "NVPTXISD::LDUV4";
  case NVPTXISD::StoreV2:
    return "NVPTXISD::StoreV2";
  case NVPTXISD::StoreV4:
    return "NVPTXISD::StoreV4";
  case NVPTXISD::FUN_SHFL_CLAMP:
    return "NVPTXISD::FUN_SHFL_CLAMP";
  case NVPTXISD::FUN_SHFR_CLAMP:
    return "NVPTXISD::FUN_SHFR_CLAMP";
694 return "NVPTXISD::IMAD";
695 case NVPTXISD::SETP_F16X2
:
696 return "NVPTXISD::SETP_F16X2";
697 case NVPTXISD::Dummy
:
698 return "NVPTXISD::Dummy";
699 case NVPTXISD::MUL_WIDE_SIGNED
:
700 return "NVPTXISD::MUL_WIDE_SIGNED";
701 case NVPTXISD::MUL_WIDE_UNSIGNED
:
702 return "NVPTXISD::MUL_WIDE_UNSIGNED";
  case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
  case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
  case NVPTXISD::Tex1DFloatFloatLevel:
    return "NVPTXISD::Tex1DFloatFloatLevel";
  case NVPTXISD::Tex1DFloatFloatGrad:
    return "NVPTXISD::Tex1DFloatFloatGrad";
  case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
  case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
  case NVPTXISD::Tex1DS32FloatLevel:
    return "NVPTXISD::Tex1DS32FloatLevel";
  case NVPTXISD::Tex1DS32FloatGrad:
    return "NVPTXISD::Tex1DS32FloatGrad";
  case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
  case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
  case NVPTXISD::Tex1DU32FloatLevel:
    return "NVPTXISD::Tex1DU32FloatLevel";
  case NVPTXISD::Tex1DU32FloatGrad:
    return "NVPTXISD::Tex1DU32FloatGrad";
  case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    return "NVPTXISD::Tex1DArrayFloatFloatLevel";
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    return "NVPTXISD::Tex1DArrayFloatFloatGrad";
  case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
  case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    return "NVPTXISD::Tex1DArrayS32FloatLevel";
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    return "NVPTXISD::Tex1DArrayS32FloatGrad";
  case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
  case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    return "NVPTXISD::Tex1DArrayU32FloatLevel";
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    return "NVPTXISD::Tex1DArrayU32FloatGrad";
  case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
  case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
  case NVPTXISD::Tex2DFloatFloatLevel:
    return "NVPTXISD::Tex2DFloatFloatLevel";
  case NVPTXISD::Tex2DFloatFloatGrad:
    return "NVPTXISD::Tex2DFloatFloatGrad";
  case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
  case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
  case NVPTXISD::Tex2DS32FloatLevel:
    return "NVPTXISD::Tex2DS32FloatLevel";
  case NVPTXISD::Tex2DS32FloatGrad:
    return "NVPTXISD::Tex2DS32FloatGrad";
  case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
  case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
  case NVPTXISD::Tex2DU32FloatLevel:
    return "NVPTXISD::Tex2DU32FloatLevel";
  case NVPTXISD::Tex2DU32FloatGrad:
    return "NVPTXISD::Tex2DU32FloatGrad";
  case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
  case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
  case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    return "NVPTXISD::Tex2DArrayS32FloatLevel";
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    return "NVPTXISD::Tex2DArrayS32FloatGrad";
  case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
  case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    return "NVPTXISD::Tex2DArrayU32FloatLevel";
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    return "NVPTXISD::Tex2DArrayU32FloatGrad";
  case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
  case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
  case NVPTXISD::Tex3DFloatFloatLevel:
    return "NVPTXISD::Tex3DFloatFloatLevel";
  case NVPTXISD::Tex3DFloatFloatGrad:
    return "NVPTXISD::Tex3DFloatFloatGrad";
  case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
  case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
  case NVPTXISD::Tex3DS32FloatLevel:
    return "NVPTXISD::Tex3DS32FloatLevel";
  case NVPTXISD::Tex3DS32FloatGrad:
    return "NVPTXISD::Tex3DS32FloatGrad";
  case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
  case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
  case NVPTXISD::Tex3DU32FloatLevel:
    return "NVPTXISD::Tex3DU32FloatLevel";
  case NVPTXISD::Tex3DU32FloatGrad:
    return "NVPTXISD::Tex3DU32FloatGrad";
  case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
  case NVPTXISD::TexCubeFloatFloatLevel:
    return "NVPTXISD::TexCubeFloatFloatLevel";
  case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
  case NVPTXISD::TexCubeS32FloatLevel:
    return "NVPTXISD::TexCubeS32FloatLevel";
  case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
  case NVPTXISD::TexCubeU32FloatLevel:
    return "NVPTXISD::TexCubeU32FloatLevel";
  case NVPTXISD::TexCubeArrayFloatFloat:
    return "NVPTXISD::TexCubeArrayFloatFloat";
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    return "NVPTXISD::TexCubeArrayFloatFloatLevel";
  case NVPTXISD::TexCubeArrayS32Float:
    return "NVPTXISD::TexCubeArrayS32Float";
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    return "NVPTXISD::TexCubeArrayS32FloatLevel";
  case NVPTXISD::TexCubeArrayU32Float:
    return "NVPTXISD::TexCubeArrayU32Float";
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    return "NVPTXISD::TexCubeArrayU32FloatLevel";
  case NVPTXISD::Tld4R2DFloatFloat:
    return "NVPTXISD::Tld4R2DFloatFloat";
  case NVPTXISD::Tld4G2DFloatFloat:
    return "NVPTXISD::Tld4G2DFloatFloat";
  case NVPTXISD::Tld4B2DFloatFloat:
    return "NVPTXISD::Tld4B2DFloatFloat";
  case NVPTXISD::Tld4A2DFloatFloat:
    return "NVPTXISD::Tld4A2DFloatFloat";
  case NVPTXISD::Tld4R2DS64Float:
    return "NVPTXISD::Tld4R2DS64Float";
  case NVPTXISD::Tld4G2DS64Float:
    return "NVPTXISD::Tld4G2DS64Float";
  case NVPTXISD::Tld4B2DS64Float:
    return "NVPTXISD::Tld4B2DS64Float";
  case NVPTXISD::Tld4A2DS64Float:
    return "NVPTXISD::Tld4A2DS64Float";
  case NVPTXISD::Tld4R2DU64Float:
    return "NVPTXISD::Tld4R2DU64Float";
  case NVPTXISD::Tld4G2DU64Float:
    return "NVPTXISD::Tld4G2DU64Float";
  case NVPTXISD::Tld4B2DU64Float:
    return "NVPTXISD::Tld4B2DU64Float";
  case NVPTXISD::Tld4A2DU64Float:
    return "NVPTXISD::Tld4A2DU64Float";
  case NVPTXISD::TexUnified1DFloatS32:
    return "NVPTXISD::TexUnified1DFloatS32";
  case NVPTXISD::TexUnified1DFloatFloat:
    return "NVPTXISD::TexUnified1DFloatFloat";
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    return "NVPTXISD::TexUnified1DFloatFloatLevel";
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    return "NVPTXISD::TexUnified1DFloatFloatGrad";
  case NVPTXISD::TexUnified1DS32S32:
    return "NVPTXISD::TexUnified1DS32S32";
  case NVPTXISD::TexUnified1DS32Float:
    return "NVPTXISD::TexUnified1DS32Float";
  case NVPTXISD::TexUnified1DS32FloatLevel:
    return "NVPTXISD::TexUnified1DS32FloatLevel";
  case NVPTXISD::TexUnified1DS32FloatGrad:
    return "NVPTXISD::TexUnified1DS32FloatGrad";
  case NVPTXISD::TexUnified1DU32S32:
    return "NVPTXISD::TexUnified1DU32S32";
  case NVPTXISD::TexUnified1DU32Float:
    return "NVPTXISD::TexUnified1DU32Float";
  case NVPTXISD::TexUnified1DU32FloatLevel:
    return "NVPTXISD::TexUnified1DU32FloatLevel";
  case NVPTXISD::TexUnified1DU32FloatGrad:
    return "NVPTXISD::TexUnified1DU32FloatGrad";
  case NVPTXISD::TexUnified1DArrayFloatS32:
    return "NVPTXISD::TexUnified1DArrayFloatS32";
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    return "NVPTXISD::TexUnified1DArrayFloatFloat";
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
  case NVPTXISD::TexUnified1DArrayS32S32:
    return "NVPTXISD::TexUnified1DArrayS32S32";
  case NVPTXISD::TexUnified1DArrayS32Float:
    return "NVPTXISD::TexUnified1DArrayS32Float";
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
  case NVPTXISD::TexUnified1DArrayU32S32:
    return "NVPTXISD::TexUnified1DArrayU32S32";
  case NVPTXISD::TexUnified1DArrayU32Float:
    return "NVPTXISD::TexUnified1DArrayU32Float";
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
  case NVPTXISD::TexUnified2DFloatS32:
    return "NVPTXISD::TexUnified2DFloatS32";
  case NVPTXISD::TexUnified2DFloatFloat:
    return "NVPTXISD::TexUnified2DFloatFloat";
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    return "NVPTXISD::TexUnified2DFloatFloatLevel";
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    return "NVPTXISD::TexUnified2DFloatFloatGrad";
  case NVPTXISD::TexUnified2DS32S32:
    return "NVPTXISD::TexUnified2DS32S32";
  case NVPTXISD::TexUnified2DS32Float:
    return "NVPTXISD::TexUnified2DS32Float";
  case NVPTXISD::TexUnified2DS32FloatLevel:
    return "NVPTXISD::TexUnified2DS32FloatLevel";
  case NVPTXISD::TexUnified2DS32FloatGrad:
    return "NVPTXISD::TexUnified2DS32FloatGrad";
  case NVPTXISD::TexUnified2DU32S32:
    return "NVPTXISD::TexUnified2DU32S32";
  case NVPTXISD::TexUnified2DU32Float:
    return "NVPTXISD::TexUnified2DU32Float";
  case NVPTXISD::TexUnified2DU32FloatLevel:
    return "NVPTXISD::TexUnified2DU32FloatLevel";
  case NVPTXISD::TexUnified2DU32FloatGrad:
    return "NVPTXISD::TexUnified2DU32FloatGrad";
  case NVPTXISD::TexUnified2DArrayFloatS32:
    return "NVPTXISD::TexUnified2DArrayFloatS32";
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    return "NVPTXISD::TexUnified2DArrayFloatFloat";
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
  case NVPTXISD::TexUnified2DArrayS32S32:
    return "NVPTXISD::TexUnified2DArrayS32S32";
  case NVPTXISD::TexUnified2DArrayS32Float:
    return "NVPTXISD::TexUnified2DArrayS32Float";
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
  case NVPTXISD::TexUnified2DArrayU32S32:
    return "NVPTXISD::TexUnified2DArrayU32S32";
  case NVPTXISD::TexUnified2DArrayU32Float:
    return "NVPTXISD::TexUnified2DArrayU32Float";
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
  case NVPTXISD::TexUnified3DFloatS32:
    return "NVPTXISD::TexUnified3DFloatS32";
  case NVPTXISD::TexUnified3DFloatFloat:
    return "NVPTXISD::TexUnified3DFloatFloat";
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    return "NVPTXISD::TexUnified3DFloatFloatLevel";
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    return "NVPTXISD::TexUnified3DFloatFloatGrad";
  case NVPTXISD::TexUnified3DS32S32:
    return "NVPTXISD::TexUnified3DS32S32";
  case NVPTXISD::TexUnified3DS32Float:
    return "NVPTXISD::TexUnified3DS32Float";
  case NVPTXISD::TexUnified3DS32FloatLevel:
    return "NVPTXISD::TexUnified3DS32FloatLevel";
  case NVPTXISD::TexUnified3DS32FloatGrad:
    return "NVPTXISD::TexUnified3DS32FloatGrad";
  case NVPTXISD::TexUnified3DU32S32:
    return "NVPTXISD::TexUnified3DU32S32";
  case NVPTXISD::TexUnified3DU32Float:
    return "NVPTXISD::TexUnified3DU32Float";
  case NVPTXISD::TexUnified3DU32FloatLevel:
    return "NVPTXISD::TexUnified3DU32FloatLevel";
  case NVPTXISD::TexUnified3DU32FloatGrad:
    return "NVPTXISD::TexUnified3DU32FloatGrad";
  case NVPTXISD::TexUnifiedCubeFloatFloat:
    return "NVPTXISD::TexUnifiedCubeFloatFloat";
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
    return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
  case NVPTXISD::TexUnifiedCubeS32Float:
    return "NVPTXISD::TexUnifiedCubeS32Float";
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
  case NVPTXISD::TexUnifiedCubeU32Float:
    return "NVPTXISD::TexUnifiedCubeU32Float";
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
    return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
    return "NVPTXISD::TexUnifiedCubeArrayS32Float";
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
    return "NVPTXISD::TexUnifiedCubeArrayU32Float";
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
  case NVPTXISD::Tld4UnifiedR2DS64Float:
    return "NVPTXISD::Tld4UnifiedR2DS64Float";
  case NVPTXISD::Tld4UnifiedG2DS64Float:
    return "NVPTXISD::Tld4UnifiedG2DS64Float";
  case NVPTXISD::Tld4UnifiedB2DS64Float:
    return "NVPTXISD::Tld4UnifiedB2DS64Float";
  case NVPTXISD::Tld4UnifiedA2DS64Float:
    return "NVPTXISD::Tld4UnifiedA2DS64Float";
  case NVPTXISD::Tld4UnifiedR2DU64Float:
    return "NVPTXISD::Tld4UnifiedR2DU64Float";
  case NVPTXISD::Tld4UnifiedG2DU64Float:
    return "NVPTXISD::Tld4UnifiedG2DU64Float";
  case NVPTXISD::Tld4UnifiedB2DU64Float:
    return "NVPTXISD::Tld4UnifiedB2DU64Float";
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    return "NVPTXISD::Tld4UnifiedA2DU64Float";
  case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
  case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
  case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
  case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
  case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
  case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
  case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
  case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
  case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
  case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
  case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";

  case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
  case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
  case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
  case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
  case NVPTXISD::Suld1DArrayV2I16Clamp: return "NVPTXISD::Suld1DArrayV2I16Clamp";
  case NVPTXISD::Suld1DArrayV2I32Clamp: return "NVPTXISD::Suld1DArrayV2I32Clamp";
  case NVPTXISD::Suld1DArrayV2I64Clamp: return "NVPTXISD::Suld1DArrayV2I64Clamp";
  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
  case NVPTXISD::Suld1DArrayV4I16Clamp: return "NVPTXISD::Suld1DArrayV4I16Clamp";
  case NVPTXISD::Suld1DArrayV4I32Clamp: return "NVPTXISD::Suld1DArrayV4I32Clamp";

  case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
  case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
  case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
  case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
  case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
  case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
  case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
  case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
  case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
  case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
  case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";

  case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
  case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
  case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
  case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
  case NVPTXISD::Suld2DArrayV2I16Clamp: return "NVPTXISD::Suld2DArrayV2I16Clamp";
  case NVPTXISD::Suld2DArrayV2I32Clamp: return "NVPTXISD::Suld2DArrayV2I32Clamp";
  case NVPTXISD::Suld2DArrayV2I64Clamp: return "NVPTXISD::Suld2DArrayV2I64Clamp";
  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
  case NVPTXISD::Suld2DArrayV4I16Clamp: return "NVPTXISD::Suld2DArrayV4I16Clamp";
  case NVPTXISD::Suld2DArrayV4I32Clamp: return "NVPTXISD::Suld2DArrayV4I32Clamp";

  case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
  case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
  case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
  case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
  case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
  case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
  case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
  case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
  case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
  case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
  case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";

  case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
  case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
  case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
  case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
  case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
  case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
  case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
  case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
  case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
  case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
  case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";

  case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
  case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
  case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
  case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
  case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
  case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
  case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
  case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
  case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
  case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
  case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";

  case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
  case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
  case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
  case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
  case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
  case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
  case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
  case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
  case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
  case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
  case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";

  case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
  case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
  case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
  case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
  case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
  case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
  case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
  case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
  case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
  case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
  case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";

  case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
  case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
  case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
  case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
  case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
  case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
  case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
  case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
  case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
  case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
  case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";

  case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
  case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
  case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
  case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
  case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
  case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
  case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
  case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
  case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
  case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
  case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";

  case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
  case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
  case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
  case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
  case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
  case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
  case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
  case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
  case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
  case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
  case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";

  case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
  case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
  case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
  case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
  case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
  case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
  case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
  case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
  case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
  case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
  case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";

  case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
  case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
  case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
  case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
  case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
  case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
  case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
  case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
  case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
  case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
  case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";

  case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
  case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
  case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
  case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
  case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
  case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
  case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
  case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
  case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
  case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
  case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
  }
  return nullptr;
}
TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
  if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
    return TypeSplitVector;
  if (VT == MVT::v2f16)
    return TypeLegal;
  return TargetLoweringBase::getPreferredVectorAction(VT);
}
SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                             int Enabled, int &ExtraSteps,
                                             bool &UseOneConst,
                                             bool Reciprocal) const {
  if (!(Enabled == ReciprocalEstimate::Enabled ||
        (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
    return SDValue();

  if (ExtraSteps == ReciprocalEstimate::Unspecified)
    ExtraSteps = 0;

  SDLoc DL(Operand);
  EVT VT = Operand.getValueType();
  bool Ftz = useF32FTZ(DAG.getMachineFunction());

  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(IID, DL, MVT::i32), Operand);
  };

  // The sqrt and rsqrt refinement processes assume we always start out with an
  // approximation of the rsqrt.  Therefore, if we're going to do any refinement
  // (i.e. ExtraSteps > 0), we must return an rsqrt.  But if we're *not* doing
  // any refinement, we must return a regular sqrt.
  if (Reciprocal || ExtraSteps > 0) {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
                                   : Intrinsic::nvvm_rsqrt_approx_f);
    else if (VT == MVT::f64)
      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
    else
      return SDValue();
  } else {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                   : Intrinsic::nvvm_sqrt_approx_f);
    else {
      // There's no sqrt.approx.f64 instruction, so we emit
      // reciprocal(rsqrt(x)).  This is faster than
      // select(x == 0, 0, x * rsqrt(x)).  (In fact, it's faster than plain
      // x * rsqrt(x).)
      return DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, VT,
          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
    }
  }
}
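
// For example, a reciprocal-square-root estimate of an f32 in FTZ mode becomes
// a call to the llvm.nvvm.rsqrt.approx.ftz.f intrinsic, while a plain square
// root with no refinement steps uses the sqrt.approx form directly.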
SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}
std::string NVPTXTargetLowering::getPrototype(
    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
    const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
    ImmutableCallSite CS) const {
  auto PtrVT = getPointerTy(DL);

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");

  std::stringstream O;
  O << "prototype_" << uniqueCallSite << " : .callprototype ";

  if (retTy->getTypeID() == Type::VoidTyID) {
    O << "()";
  } else {
    O << "(";
    if (retTy->isFloatingPointTy() ||
        (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
      unsigned size = 0;
      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
        size = ITy->getBitWidth();
      } else {
        assert(retTy->isFloatingPointTy() &&
               "Floating point type expected here");
        size = retTy->getPrimitiveSizeInBits();
      }
      // PTX ABI requires all scalar return values to be at least 32
      // bits in size.  fp16 normally uses .b16 as its storage type in
      // PTX, so its size must be adjusted here, too.
      if (size < 32)
        size = 32;

      O << ".param .b" << size << " _";
    } else if (isa<PointerType>(retTy)) {
      O << ".param .b" << PtrVT.getSizeInBits() << " _";
    } else if (retTy->isAggregateType() || retTy->isVectorTy() ||
               retTy->isIntegerTy(128)) {
      O << ".param .align " << retAlignment << " .b8 _["
        << DL.getTypeAllocSize(retTy) << "]";
    } else {
      llvm_unreachable("Unknown return type");
    }
    O << ") ";
  }
  O << "_ (";

  unsigned OIdx = 0;
  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
    Type *Ty = Args[i].Ty;

    if (!Outs[OIdx].Flags.isByVal()) {
      if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
        unsigned align = 0;
        const CallInst *CallI = cast<CallInst>(CS.getInstruction());
        // +1 because index 0 is reserved for return type alignment
        if (!getAlign(*CallI, i + 1, align))
          align = DL.getABITypeAlignment(Ty);
        unsigned sz = DL.getTypeAllocSize(Ty);
        O << ".param .align " << align << " .b8 ";
        O << "_";
        O << "[" << sz << "]";
        // update the index for Outs
        SmallVector<EVT, 16> vtparts;
        ComputeValueVTs(*this, DL, Ty, vtparts);
        if (unsigned len = vtparts.size())
          OIdx += len - 1;
        continue;
      }
      // i8 types in IR will be i16 types in SDAG
      assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
              (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
             "type mismatch between callee prototype and arguments");
      unsigned sz = 0;
      if (isa<IntegerType>(Ty)) {
        sz = cast<IntegerType>(Ty)->getBitWidth();
        if (sz < 32)
          sz = 32;
      } else if (isa<PointerType>(Ty)) {
        sz = PtrVT.getSizeInBits();
      } else if (Ty->isHalfTy())
        // PTX ABI requires all scalar parameters to be at least 32
        // bits in size.  fp16 normally uses .b16 as its storage type
        // in PTX, so its size must be adjusted here, too.
        sz = 32;
      else
        sz = Ty->getPrimitiveSizeInBits();
      O << ".param .b" << sz << " ";
      O << "_";
      continue;
    }

    auto *PTy = dyn_cast<PointerType>(Ty);
    assert(PTy && "Param with byval attribute should be a pointer type");
    Type *ETy = PTy->getElementType();

    unsigned align = Outs[OIdx].Flags.getByValAlign();
    unsigned sz = DL.getTypeAllocSize(ETy);
    O << ".param .align " << align << " .b8 ";
    O << "_";
    O << "[" << sz << "]";
  }
  O << ");";
  return O.str();
}
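// getArgumentAlignment - Determine the alignment for a parameter or return
// value of a call (Idx 0 is the return value), preferring alignment metadata
// attached to the call or the callee and falling back to the ABI type
// alignment.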
unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
                                                   ImmutableCallSite CS,
                                                   Type *Ty, unsigned Idx,
                                                   const DataLayout &DL) const {
  if (!CS) {
    // CallSite is zero, fallback to ABI type alignment
    return DL.getABITypeAlignment(Ty);
  }

  unsigned Align = 0;
  const Value *DirectCallee = CS.getCalledFunction();

  if (!DirectCallee) {
    // We don't have a direct function symbol, but that may be because of
    // constant cast instructions in the call.
    const Instruction *CalleeI = CS.getInstruction();
    assert(CalleeI && "Call target is not a function or derived value?");

    // With bitcast'd call targets, the instruction will be the call
    if (isa<CallInst>(CalleeI)) {
      // Check if we have call alignment metadata
      if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
        return Align;

      const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
      // Ignore any bitcast instructions
      while (isa<ConstantExpr>(CalleeV)) {
        const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
        if (!CE->isCast())
          break;
        // Look through the bitcast
        CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
      }

      // We have now looked past all of the bitcasts. Do we finally have a
      // Function?
      if (isa<Function>(CalleeV))
        DirectCallee = CalleeV;
    }
  }

  // Check for function alignment information if we found that the
  // ultimate target is a Function
  if (DirectCallee)
    if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
      return Align;

  // Call is indirect or alignment information is not available, fall back to
  // the ABI type alignment
  return DL.getABITypeAlignment(Ty);
}
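// LowerCall - Lower an outgoing call into the NVPTX-specific sequence of
// param declarations (DeclareParam/DeclareScalarParam), param stores
// (StoreParam*), the printed call itself, and loads of the returned value
// (LoadParam*).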
SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                       SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  ArgListTy &Args = CLI.getArgs();
  Type *RetTy = CLI.RetTy;
  ImmutableCallSite CS = CLI.CS;
  const DataLayout &DL = DAG.getDataLayout();

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");

  SDValue tempChain = Chain;
  Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
  SDValue InFlag = Chain.getValue(1);

  unsigned paramCount = 0;
  // Args.size() and Outs.size() need not match.
  // Outs.size() will be larger
  //   * if there is an aggregate argument with multiple fields (each field
  //     showing up separately in Outs)
  //   * if there is a vector argument with more than typical vector-length
  //     elements (generally if more than 4) where each vector element is
  //     individually present in Outs.
  // So a different index should be used for indexing into Outs/OutVals.
  // See similar issue in LowerFormalArguments.
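  // Each argument is lowered either directly from its SDAG values (the
  // non-byval path below) or, for byval aggregates, by loading the pointed-to
  // memory element by element and storing each piece into the param space.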
  // Declare the .params or .reg need to pass values
  // to the function
  unsigned OIdx = 0;
  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
    EVT VT = Outs[OIdx].VT;
    Type *Ty = Args[i].Ty;

    if (!Outs[OIdx].Flags.isByVal()) {
      SmallVector<EVT, 16> VTs;
      SmallVector<uint64_t, 16> Offsets;
      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
      unsigned ArgAlign =
          getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
      unsigned AllocSize = DL.getTypeAllocSize(Ty);
      SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      bool NeedAlign; // Does argument declaration specify alignment?
      if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
        // declare .param .align <align> .b8 .param<n>[<size>];
        SDValue DeclareParamOps[] = {
            Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
            DAG.getConstant(paramCount, dl, MVT::i32),
            DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                            DeclareParamOps);
        NeedAlign = true;
      } else {
        // declare .param .b<size> .param<n>;
        if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
          // PTX ABI requires integral types to be at least 32 bits in
          // size. FP16 is loaded/stored using i16, so it's handled
          // here as well.
          AllocSize = 4;
        }
        SDValue DeclareScalarParamOps[] = {
            Chain, DAG.getConstant(paramCount, dl, MVT::i32),
            DAG.getConstant(AllocSize * 8, dl, MVT::i32),
            DAG.getConstant(0, dl, MVT::i32), InFlag};
        Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
                            DeclareScalarParamOps);
        NeedAlign = false;
      }
      InFlag = Chain.getValue(1);

      // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
      // than 32-bits are sign extended or zero extended, depending on
      // whether they are signed or unsigned types. This case applies
      // only to scalar parameters and not to aggregate values.
      bool ExtendIntegerParam =
          Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;

      auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
      SmallVector<SDValue, 6> StoreOperands;
      for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
        if (VectorInfo[j] & PVF_FIRST) {
          assert(StoreOperands.empty() && "Unfinished preceding store.");
          StoreOperands.push_back(Chain);
          StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
          StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
        }

        EVT EltVT = VTs[j];
        SDValue StVal = OutVals[OIdx];
        if (ExtendIntegerParam) {
          assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
          StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                        : ISD::ZERO_EXTEND,
                              dl, MVT::i32, StVal);
        } else if (EltVT.getSizeInBits() < 16) {
          // Use 16-bit registers for small stores as it's the
          // smallest general purpose register size supported by NVPTX.
          StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
        }

        // Record the value to store.
        StoreOperands.push_back(StVal);

        if (VectorInfo[j] & PVF_LAST) {
          unsigned NumElts = StoreOperands.size() - 3;
          NVPTXISD::NodeType Op;
          switch (NumElts) {
          case 1:
            Op = NVPTXISD::StoreParam;
            break;
          case 2:
            Op = NVPTXISD::StoreParamV2;
            break;
          case 4:
            Op = NVPTXISD::StoreParamV4;
            break;
          default:
            llvm_unreachable("Invalid vector info.");
          }

          StoreOperands.push_back(InFlag);

          // Adjust type of the store op if we've extended the scalar
          // return value.
          EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
          unsigned EltAlign =
              NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;

          Chain = DAG.getMemIntrinsicNode(
              Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
              TheStoreType, MachinePointerInfo(), EltAlign,
              MachineMemOperand::MOStore);
          InFlag = Chain.getValue(1);

          StoreOperands.clear();
        }
        ++OIdx;
      }
      assert(StoreOperands.empty() && "Unfinished parameter store.");
      if (VTs.size() > 0)
        --OIdx;
      ++paramCount;
      continue;
    }
    SmallVector<EVT, 16> VTs;
    SmallVector<uint64_t, 16> Offsets;
    auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
    assert(PTy && "Type of a byval parameter should be pointer");
    ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);

    // declare .param .align <align> .b8 .param<n>[<size>];
    unsigned sz = Outs[OIdx].Flags.getByValSize();
    SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
    // The ByValAlign in the Outs[OIdx].Flags is alway set at this point,
    // so we don't need to worry about natural alignment or not.
    // See TargetLowering::LowerCallTo().

    // Enforce minumum alignment of 4 to work around ptxas miscompile
    // for sm_50+. See corresponding alignment adjustment in
    // emitFunctionParamList() for details.
    if (ArgAlign < 4)
      ArgAlign = 4;
    SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
                                 DAG.getConstant(paramCount, dl, MVT::i32),
                                 DAG.getConstant(sz, dl, MVT::i32), InFlag};
    Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                        DeclareParamOps);
    InFlag = Chain.getValue(1);
    for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
      EVT elemtype = VTs[j];
      int curOffset = Offsets[j];
      unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
      auto PtrVT = getPointerTy(DL);
      SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
                                    DAG.getConstant(curOffset, dl, PtrVT));
      SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
                                   MachinePointerInfo(), PartAlign);
      if (elemtype.getSizeInBits() < 16) {
        theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
      }
      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue CopyParamOps[] = { Chain,
                                 DAG.getConstant(paramCount, dl, MVT::i32),
                                 DAG.getConstant(curOffset, dl, MVT::i32),
                                 theVal, InFlag };
      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
                                      CopyParamOps, elemtype,
                                      MachinePointerInfo(), /* Align */ 0,
                                      MachineMemOperand::MOStore);

      InFlag = Chain.getValue(1);
    }
    ++paramCount;
  }
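  // Declare the return value before the call, either as a scalar
  // (.param .b<bits> retval0) or as a byte array
  // (.param .align <a> .b8 retval0[<size>]), mirroring getPrototype above.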
  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
  unsigned retAlignment = 0;

  // Handle Result
  if (Ins.size() > 0) {
    SmallVector<EVT, 16> resvtparts;
    ComputeValueVTs(*this, DL, RetTy, resvtparts);

    // Declare
    //  .param .align 16 .b8 retval0[<size-in-bytes>], or
    //  .param .b<size-in-bits> retval0
    unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
    // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
    // these three types to match the logic in
    // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
    // Plus, this behavior is consistent with nvcc's.
    if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
        (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
      // Scalar needs to be at least 32bit wide
      if (resultsz < 32)
        resultsz = 32;
      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
                                  DAG.getConstant(resultsz, dl, MVT::i32),
                                  DAG.getConstant(0, dl, MVT::i32), InFlag };
      Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
                          DeclareRetOps);
      InFlag = Chain.getValue(1);
    } else {
      retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue DeclareRetOps[] = { Chain,
                                  DAG.getConstant(retAlignment, dl, MVT::i32),
                                  DAG.getConstant(resultsz / 8, dl, MVT::i32),
                                  DAG.getConstant(0, dl, MVT::i32), InFlag };
      Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
                          DeclareRetOps);
      InFlag = Chain.getValue(1);
    }
  }
  // Both indirect calls and libcalls have nullptr Func. In order to distinguish
  // between them we must rely on the call site value which is valid for
  // indirect calls but is always null for libcalls.
  bool isIndirectCall = !Func && CS;

  if (isa<ExternalSymbolSDNode>(Callee)) {
    Function* CalleeFunc = nullptr;

    // Try to find the callee in the current module.
    Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
    assert(CalleeFunc != nullptr && "Libcall callee must be set.");

    // Set the "libcall callee" attribute to indicate that the function
    // must always have a declaration.
    CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
  }
  if (isIndirectCall) {
    // This is indirect function call case : PTX requires a prototype of the
    // form
    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
    // to be emitted, and the label has to used as the last arg of call
    // instruction.
    // The prototype is embedded in a string and put as the operand for a
    // CallPrototype SDNode which will print out to the value of the string.
    SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
    const char *ProtoStr =
      nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
    SDValue ProtoOps[] = {
      Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
    };
    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
    InFlag = Chain.getValue(1);
  }
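  // Emit the call sequence itself. PrintCall/CallVoid/CallArg*/CallArgEnd are
  // pseudo nodes the asm printer expands into the textual PTX call, followed,
  // for indirect calls, by a reference to the prototype label declared above.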
  // Op to just print "call"
  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue PrintCallOps[] = {
    Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
  };
  // We model convergent calls as separate opcodes.
  unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
  if (CLI.IsConvergent)
    Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
                                              : NVPTXISD::PrintConvergentCall;
  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
  InFlag = Chain.getValue(1);

  // Ops to print out the function name
  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
  InFlag = Chain.getValue(1);

  // Ops to print out the param list
  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallArgBeginOps[] = { Chain, InFlag };
  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
                      CallArgBeginOps);
  InFlag = Chain.getValue(1);

  for (unsigned i = 0, e = paramCount; i != e; ++i) {
    unsigned opcode;
    if (i == (e - 1))
      opcode = NVPTXISD::LastCallArg;
    else
      opcode = NVPTXISD::CallArg;
    SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
                             DAG.getConstant(i, dl, MVT::i32), InFlag };
    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
    InFlag = Chain.getValue(1);
  }
  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallArgEndOps[] = { Chain,
                              DAG.getConstant(isIndirectCall ? 0 : 1, dl,
                                              MVT::i32),
                              InFlag };
  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
  InFlag = Chain.getValue(1);

  if (isIndirectCall) {
    SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue PrototypeOps[] = { Chain,
                               DAG.getConstant(uniqueCallSite, dl, MVT::i32),
                               InFlag };
    Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
    InFlag = Chain.getValue(1);
  }
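  // Load the returned value(s) back out of the retval param space. Each piece
  // is produced by a LoadParam* node and later routed through a ProxyReg node
  // so the glue/chain ordering around callseq_end is preserved.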
  SmallVector<SDValue, 16> ProxyRegOps;
  SmallVector<Optional<MVT>, 16> ProxyRegTruncates;

  // Generate loads from param memory/moves from registers for result
  if (Ins.size() > 0) {
    SmallVector<EVT, 16> VTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
    assert(VTs.size() == Ins.size() && "Bad value decomposition");

    unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
    auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);

    SmallVector<EVT, 6> LoadVTs;
    int VecIdx = -1; // Index of the first element of the vector.

    // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
    // 32-bits are sign extended or zero extended, depending on whether
    // they are signed or unsigned types.
    bool ExtendIntegerRetVal =
        RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;

    for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
      bool needTruncate = false;
      EVT TheLoadType = VTs[i];
      EVT EltType = Ins[i].VT;
      unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
      if (ExtendIntegerRetVal) {
        TheLoadType = MVT::i32;
        EltType = MVT::i32;
        needTruncate = true;
      } else if (TheLoadType.getSizeInBits() < 16) {
        if (VTs[i].isInteger())
          needTruncate = true;
        EltType = MVT::i16;
      }

      // Record index of the very first element of the vector.
      if (VectorInfo[i] & PVF_FIRST) {
        assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
        VecIdx = i;
      }

      LoadVTs.push_back(EltType);

      if (VectorInfo[i] & PVF_LAST) {
        unsigned NumElts = LoadVTs.size();
        LoadVTs.push_back(MVT::Other);
        LoadVTs.push_back(MVT::Glue);
        NVPTXISD::NodeType Op;
        switch (NumElts) {
        case 1:
          Op = NVPTXISD::LoadParam;
          break;
        case 2:
          Op = NVPTXISD::LoadParamV2;
          break;
        case 4:
          Op = NVPTXISD::LoadParamV4;
          break;
        default:
          llvm_unreachable("Invalid vector info.");
        }

        SDValue LoadOperands[] = {
            Chain, DAG.getConstant(1, dl, MVT::i32),
            DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
        SDValue RetVal = DAG.getMemIntrinsicNode(
            Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
            MachinePointerInfo(), EltAlign,
            MachineMemOperand::MOLoad);

        for (unsigned j = 0; j < NumElts; ++j) {
          ProxyRegOps.push_back(RetVal.getValue(j));

          if (needTruncate)
            ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT));
          else
            ProxyRegTruncates.push_back(Optional<MVT>());
        }

        Chain = RetVal.getValue(NumElts);
        InFlag = RetVal.getValue(NumElts + 1);

        // Cleanup.
        VecIdx = -1;
        LoadVTs.clear();
      }
    }
  }
  Chain = DAG.getCALLSEQ_END(Chain,
                             DAG.getIntPtrConstant(uniqueCallSite, dl, true),
                             DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
                                                   true),
                             InFlag, dl);
  InFlag = Chain.getValue(1);
  uniqueCallSite++;

  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
  // will not get lost. Otherwise, during libcalls expansion, the nodes can become
  // dangling.
  for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
    SDValue Ret = DAG.getNode(
      NVPTXISD::ProxyReg, dl,
      DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
      { Chain, ProxyRegOps[i], InFlag }
    );

    Chain = Ret.getValue(1);
    InFlag = Ret.getValue(2);

    if (ProxyRegTruncates[i].hasValue()) {
      Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret);
    }

    InVals.push_back(Ret);
  }

  // set isTailCall to false for now, until we figure out how to express
  // tail call optimization in PTX
  isTailCall = false;
  return Chain;
}
// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
// (see LegalizeDAG.cpp). This is slow and uses local memory.
// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
SDValue
NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  SmallVector<SDValue, 8> Ops;
  unsigned NumOperands = Node->getNumOperands();
  for (unsigned i = 0; i < NumOperands; ++i) {
    SDValue SubOp = Node->getOperand(i);
    EVT VVT = SubOp.getNode()->getValueType(0);
    EVT EltVT = VVT.getVectorElementType();
    unsigned NumSubElem = VVT.getVectorNumElements();
    for (unsigned j = 0; j < NumSubElem; ++j) {
      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
                                DAG.getIntPtrConstant(j, dl)));
    }
  }
  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
// We can init constant f16x2 with a single .b32 move. Normally it
// would get lowered as two constant loads and vector-packing move.
//        mov.b16         %h1, 0x4000;
//        mov.b16         %h2, 0x3C00;
//        mov.b32         %hh2, {%h2, %h1};
// Instead we want just a constant move:
//        mov.b32         %hh2, 0x40003C00
//
// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
// generates good SASS in both cases.
SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                               SelectionDAG &DAG) const {
  if (!(Op->getValueType(0) == MVT::v2f16 &&
        isa<ConstantFPSDNode>(Op->getOperand(0)) &&
        isa<ConstantFPSDNode>(Op->getOperand(1))))
    return Op;

  APInt E0 =
      cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
  APInt E1 =
      cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
  SDValue Const =
      DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
  return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
}
SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDValue Index = Op->getOperand(1);
  // Constant index will be matched by tablegen.
  if (isa<ConstantSDNode>(Index.getNode()))
    return Op;

  // Extract individual elements and select one of them.
  SDValue Vector = Op->getOperand(0);
  EVT VectorVT = Vector.getValueType();
  assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
  EVT EltVT = VectorVT.getVectorElementType();

  SDLoc dl(Op.getNode());
  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
                           DAG.getIntPtrConstant(0, dl));
  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
                           DAG.getIntPtrConstant(1, dl));
  return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
                         ISD::CondCode::SETEQ);
}
/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
///    amount.
SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

  if (VTBits == 32 && STI.getSmVersion() >= 35) {
    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
    // {dHi, dLo} = {aHi, aLo} >> Amt
    //   dLo = shf.r.clamp aLo, aHi, Amt

    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
    SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
                             ShAmt);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
  else {
    // {dHi, dLo} = {aHi, aLo} >> Amt
    // - if (Amt>=size) then
    //      dLo = aHi >> (Amt-size)
    //      dHi = aHi >> Amt (this is either all 0 or all 1)
    //   else
    //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))

    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DAG.getConstant(VTBits, dl, MVT::i32),
                                   ShAmt);
    SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                     DAG.getConstant(VTBits, dl, MVT::i32));
    SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
    SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);

    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
                               DAG.getConstant(VTBits, dl, MVT::i32),
                               ISD::SETGE);
    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
    SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
}
/// LowerShiftLeftParts - Lower SHL_PARTS, which
/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
///    amount.
SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
                                                 SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  assert(Op.getOpcode() == ISD::SHL_PARTS);

  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);

  if (VTBits == 32 && STI.getSmVersion() >= 35) {
    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
    // {dHi, dLo} = {aHi, aLo} << Amt
    //   dHi = shf.l.clamp aLo, aHi, Amt

    SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
                             ShAmt);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
  else {
    // {dHi, dLo} = {aHi, aLo} << Amt
    // - if (Amt>=size) then
    //      dLo = aLo << Amt (all 0)
    //      dHi = aLo << (Amt-size)
    //   else
    //      dLo = aLo << Amt
    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))

    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DAG.getConstant(VTBits, dl, MVT::i32),
                                   ShAmt);
    SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                     DAG.getConstant(VTBits, dl, MVT::i32));
    SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
    SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
                               DAG.getConstant(VTBits, dl, MVT::i32),
                               ISD::SETGE);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
    SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
}
SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFROUND32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFROUND64(Op, DAG);

  llvm_unreachable("unhandled type");
}
// This is the rounding method used in CUDA libdevice in C like code:
// float roundf(float A)
// {
//   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
//   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
//   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
// }
SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue A = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);

  // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
  SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
  const int SignBitMask = 0x80000000;
  SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
                             DAG.getConstant(SignBitMask, SL, MVT::i32));
  const int PointFiveInBits = 0x3F000000;
  SDValue PointFiveWithSignRaw =
      DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
                  DAG.getConstant(PointFiveInBits, SL, MVT::i32));
  SDValue PointFiveWithSign =
      DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);

  // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsLarge =
      DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
                   ISD::SETOGT);
  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);

  // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
  SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
  return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
}
// The implementation of round(double) is similar to that of round(float) in
// that they both separate the value range into three regions and use a method
// specific to the region to round the values. However, round(double) first
// calculates the round of the absolute value and then adds the sign back while
// round(float) directly rounds the value with sign.
SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue A = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);

  // double RoundedA = (double) (int) (abs(A) + 0.5f);
  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
                                  DAG.getConstantFP(0.5, SL, VT));
  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);

  // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
                         DAG.getConstantFP(0, SL, VT),
                         RoundedA);

  // Add sign to rounded_A
  RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
  DAG.getNode(ISD::FTRUNC, SL, VT, A);

  // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
  SDValue IsLarge =
      DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
                   ISD::SETOGT);
  return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
}
SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::RETURNADDR:
    return SDValue();
  case ISD::FRAMEADDR:
    return SDValue();
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN:
    return Op;
  case ISD::BUILD_VECTOR:
    return LowerBUILD_VECTOR(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:
    return Op;
  case ISD::EXTRACT_VECTOR_ELT:
    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::CONCAT_VECTORS:
    return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::LOAD:
    return LowerLOAD(Op, DAG);
  case ISD::SHL_PARTS:
    return LowerShiftLeftParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:
    return LowerShiftRightParts(Op, DAG);
  case ISD::SELECT:
    return LowerSelect(Op, DAG);
  case ISD::FROUND:
    return LowerFROUND(Op, DAG);
  default:
    llvm_unreachable("Custom lowering not defined for operation");
  }
}
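// i1 has no register class in NVPTX, so an i1 select is lowered by widening
// the operands to i32, selecting, and truncating the result back to i1.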
SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
  SDValue Op0 = Op->getOperand(0);
  SDValue Op1 = Op->getOperand(1);
  SDValue Op2 = Op->getOperand(2);
  SDLoc DL(Op.getNode());

  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");

  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);

  return Trunc;
}
SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::i1)
    return LowerLOADi1(Op, DAG);

  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
  // loads and have to handle it here.
  if (Op.getValueType() == MVT::v2f16) {
    LoadSDNode *Load = cast<LoadSDNode>(Op);
    EVT MemVT = Load->getMemoryVT();
    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        MemVT, *Load->getMemOperand())) {
      SDValue Ops[2];
      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
      return DAG.getMergeValues(Ops, SDLoc(Op));
    }
  }

  return SDValue();
}
// v1 = ld i8* addr (-> i16)
// v = trunc i16 to i1
SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  LoadSDNode *LD = cast<LoadSDNode>(Node);
  SDLoc dl(Node);
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
  assert(Node->getValueType(0) == MVT::i1 &&
         "Custom lowering for i1 load only");
  SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
                              LD->getPointerInfo(), LD->getAlignment(),
                              LD->getMemOperand()->getFlags());
  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
  // The legalizer (the caller) is expecting two values from the legalized
  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
  // in LegalizeDAG.cpp which also uses MergeValues.
  SDValue Ops[] = { result, LD->getChain() };
  return DAG.getMergeValues(Ops, dl);
}
SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1)
    return LowerSTOREi1(Op, DAG);

  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
  // stores and have to handle it here.
  if (VT == MVT::v2f16 &&
      !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      VT, *Store->getMemOperand()))
    return expandUnalignedStore(Store, DAG);

  if (VT.isVector())
    return LowerSTOREVector(Op, DAG);

  return SDValue();
}
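// Lower a vector store into a single StoreV2/StoreV4 target node when the
// vector type and alignment allow it; small element types are widened to i16
// and v8f16 is stored as four v2f16 chunks.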
SDValue
NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
  SDNode *N = Op.getNode();
  SDValue Val = N->getOperand(1);
  SDLoc DL(N);
  EVT ValVT = Val.getValueType();

  if (ValVT.isVector()) {
    // We only handle "native" vector sizes for now, e.g. <4 x double> is not
    // legal. We can (and should) split that into 2 stores of <2 x double> here
    // but I'm leaving that as a TODO for now.
    if (!ValVT.isSimple())
      return SDValue();
    switch (ValVT.getSimpleVT().SimpleTy) {
    default:
      return SDValue();
    case MVT::v8f16: // <4 x f16x2>
      // This is a "native" vector type
      break;
    }

    MemSDNode *MemSD = cast<MemSDNode>(N);
    const DataLayout &TD = DAG.getDataLayout();

    unsigned Align = MemSD->getAlignment();
    unsigned PrefAlign =
        TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
    if (Align < PrefAlign) {
      // This store is not sufficiently aligned, so bail out and let this vector
      // store be scalarized. Note that we may still be able to emit smaller
      // vector stores. For example, if we are storing a <4 x float> with an
      // alignment of 8, this check will fail but the legalizer will try again
      // with 2 x <2 x float>, which will succeed with an alignment of 8.
      return SDValue();
    }

    unsigned Opcode = 0;
    EVT EltVT = ValVT.getVectorElementType();
    unsigned NumElts = ValVT.getVectorNumElements();

    // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
    // Therefore, we must ensure the type is legal. For i1 and i8, we set the
    // stored type to i16 and propagate the "real" type as the memory type.
    bool NeedExt = false;
    if (EltVT.getSizeInBits() < 16)
      NeedExt = true;

    bool StoreF16x2 = false;
    switch (NumElts) {
    default:
      return SDValue();
    case 2:
      Opcode = NVPTXISD::StoreV2;
      break;
    case 4:
      Opcode = NVPTXISD::StoreV4;
      break;
    case 8:
      // v8f16 is a special case. PTX doesn't have st.v8.f16
      // instruction. Instead, we split the vector into v2f16 chunks and
      // store them with st.v4.b32.
      assert(EltVT == MVT::f16 && "Wrong type for the vector.");
      Opcode = NVPTXISD::StoreV4;
      StoreF16x2 = true;
      break;
    }

    SmallVector<SDValue, 8> Ops;

    // First is the chain
    Ops.push_back(N->getOperand(0));

    if (StoreF16x2) {
      // Combine f16,f16 -> v2f16
      NumElts /= 2;
      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
                                 DAG.getIntPtrConstant(i * 2, DL));
        SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
                                 DAG.getIntPtrConstant(i * 2 + 1, DL));
        SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
        Ops.push_back(V2);
      }
    } else {
      // Then the split values
      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
                                     DAG.getIntPtrConstant(i, DL));
        if (NeedExt)
          ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
        Ops.push_back(ExtVal);
      }
    }

    // Then any remaining arguments
    Ops.append(N->op_begin() + 2, N->op_end());

    SDValue NewSt =
        DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
                                MemSD->getMemoryVT(), MemSD->getMemOperand());

    // return DCI.CombineTo(N, NewSt, true);
    return NewSt;
  }

  return SDValue();
}
// v1 = zxt v to i16
// st.u8 i16, addr
SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  StoreSDNode *ST = cast<StoreSDNode>(Node);
  SDValue Tmp1 = ST->getChain();
  SDValue Tmp2 = ST->getBasePtr();
  SDValue Tmp3 = ST->getValue();
  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
  SDValue Result =
      DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
                        ST->getAlignment(), ST->getMemOperand()->getFlags());
  return Result;
}
SDValue
NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
  std::string ParamSym;
  raw_string_ostream ParamStr(ParamSym);

  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
  ParamStr.flush();

  std::string *SavedStr =
    nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
}
// Check to see if the kernel argument is image*_t or sampler_t

static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
  static const char *const specialTypes[] = { "struct._image2d_t",
                                              "struct._image3d_t",
                                              "struct._sampler_t" };

  Type *Ty = arg->getType();
  auto *PTy = dyn_cast<PointerType>(Ty);

  if (!PTy)
    return false;

  if (!context)
    return false;

  auto *STy = dyn_cast<StructType>(PTy->getElementType());
  if (!STy || STy->isLiteral())
    return false;

  return std::find(std::begin(specialTypes), std::end(specialTypes),
                   STy->getName()) != std::end(specialTypes);
}
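// LowerFormalArguments - Incoming arguments live in the .param address space.
// Non-byval arguments are loaded from their param symbol (vectorized where
// possible); byval arguments are rematerialized as MoveParam of the param
// symbol.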
SDValue NVPTXTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const DataLayout &DL = DAG.getDataLayout();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  const Function *F = &MF.getFunction();
  const AttributeList &PAL = F->getAttributes();
  const TargetLowering *TLI = STI.getTargetLowering();

  SDValue Root = DAG.getRoot();
  std::vector<SDValue> OutChains;

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");

  std::vector<Type *> argTypes;
  std::vector<const Argument *> theArgs;
  for (const Argument &I : F->args()) {
    theArgs.push_back(&I);
    argTypes.push_back(I.getType());
  }
  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
  // Ins.size() will be larger
  //   * if there is an aggregate argument with multiple fields (each field
  //     showing up separately in Ins)
  //   * if there is a vector argument with more than typical vector-length
  //     elements (generally if more than 4) where each vector element is
  //     individually present in Ins.
  // So a different index should be used for indexing into Ins.
  // See similar issue in LowerCall.
  unsigned InsIdx = 0;
  int idx = 0;
  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
    Type *Ty = argTypes[i];

    // If the kernel argument is image*_t or sampler_t, convert it to
    // a i32 constant holding the parameter position. This can later
    // matched in the AsmPrinter to output the correct mangled name.
    if (isImageOrSamplerVal(
            theArgs[i],
            (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
                                     : nullptr))) {
      assert(isKernelFunction(*F) &&
             "Only kernels can have image/sampler params");
      InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
      continue;
    }

    if (theArgs[i]->use_empty()) {
      // argument is dead
      if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
        SmallVector<EVT, 16> vtparts;

        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
        assert(vtparts.size() > 0 && "empty aggregate type not expected");
        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
             ++parti) {
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
          ++InsIdx;
        }
        if (vtparts.size() > 0)
          --InsIdx;
        continue;
      }
      if (Ty->isVectorTy()) {
        EVT ObjectVT = getValueType(DL, Ty);
        unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
        for (unsigned parti = 0; parti < NumRegs; ++parti) {
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
          ++InsIdx;
        }
        if (NumRegs > 0)
          --InsIdx;
        continue;
      }
      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
      continue;
    }
    // In the following cases, assign a node order of "idx+1"
    // to newly created nodes. The SDNodes for params have to
    // appear in the same order as their order of appearance
    // in the original function. "idx+1" holds that order.
    if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
      bool aggregateIsPacked = false;
      if (StructType *STy = dyn_cast<StructType>(Ty))
        aggregateIsPacked = STy->isPacked();

      SmallVector<EVT, 16> VTs;
      SmallVector<uint64_t, 16> Offsets;
      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
      assert(VTs.size() > 0 && "Unexpected empty type.");
      auto VectorInfo =
          VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));

      SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
      int VecIdx = -1; // Index of the first element of the current vector.
      for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
        if (VectorInfo[parti] & PVF_FIRST) {
          assert(VecIdx == -1 && "Orphaned vector.");
          VecIdx = parti;
        }

        // That's the last element of this store op.
        if (VectorInfo[parti] & PVF_LAST) {
          unsigned NumElts = parti - VecIdx + 1;
          EVT EltVT = VTs[parti];
          // i1 is loaded/stored as i8.
          EVT LoadVT = EltVT;
          if (EltVT == MVT::i1)
            LoadVT = MVT::i8;
          else if (EltVT == MVT::v2f16)
            // getLoad needs a vector type, but it can't handle
            // vectors which contain v2f16 elements. So we must load
            // using i32 here and then bitcast back.
            LoadVT = MVT::i32;

          EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
          SDValue VecAddr =
              DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
                          DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
          Value *srcValue = Constant::getNullValue(PointerType::get(
              EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
          SDValue P =
              DAG.getLoad(VecVT, dl, Root, VecAddr,
                          MachinePointerInfo(srcValue), aggregateIsPacked,
                          MachineMemOperand::MODereferenceable |
                              MachineMemOperand::MOInvariant);
          if (P.getNode())
            P.getNode()->setIROrder(idx + 1);
          for (unsigned j = 0; j < NumElts; ++j) {
            SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
                                      DAG.getIntPtrConstant(j, dl));
            // We've loaded i1 as an i8 and now must truncate it back to i1
            if (EltVT == MVT::i1)
              Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
            // v2f16 was loaded as an i32. Now we must bitcast it back.
            else if (EltVT == MVT::v2f16)
              Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
            // Extend the element if necessary (e.g. an i8 is loaded
            // into an i16 register)
            if (Ins[InsIdx].VT.isInteger() &&
                Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
              unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                           : ISD::ZERO_EXTEND;
              Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
            }
            InVals.push_back(Elt);
          }

          // Reset vector tracking state.
          VecIdx = -1;
        }
        ++InsIdx;
      }
      if (VTs.size() > 0)
        --InsIdx;
      continue;
    }
    // Param has ByVal attribute
    // Return MoveParam(param symbol).
    // Ideally, the param symbol can be returned directly,
    // but when SDNode builder decides to use it in a CopyToReg(),
    // machine instruction fails because TargetExternalSymbol
    // (not lowered) is target dependent, and CopyToReg assumes
    // the source is lowered.
    EVT ObjectVT = getValueType(DL, Ty);
    assert(ObjectVT == Ins[InsIdx].VT &&
           "Ins type did not match function type");
    SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
    SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
    if (p.getNode())
      p.getNode()->setIROrder(idx + 1);
    InVals.push_back(p);
  }

  // Clang will check explicit VarArg and issue error if any. However, Clang
  // will let code with
  // implicit var arg like f() pass. See bug 617733.
  // We treat this case as if the arg list is empty.
  // if (F.isVarArg()) {
  //   assert(0 && "VarArg not supported yet!");
  // }

  if (!OutChains.empty())
    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));

  return Chain;
}
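// LowerReturn - Return values are stored to the function's retval param space
// with StoreRetval/StoreRetvalV2/StoreRetvalV4 nodes, and the function ends
// with a RET_FLAG node.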
SDValue
NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                 bool isVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 const SmallVectorImpl<SDValue> &OutVals,
                                 const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  Type *RetTy = MF.getFunction().getReturnType();

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");

  const DataLayout DL = DAG.getDataLayout();
  SmallVector<EVT, 16> VTs;
  SmallVector<uint64_t, 16> Offsets;
  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");

  auto VectorInfo = VectorizePTXValueVTs(
      VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);

  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
  // 32-bits are sign extended or zero extended, depending on whether
  // they are signed or unsigned types.
  bool ExtendIntegerRetVal =
      RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;

  SmallVector<SDValue, 6> StoreOperands;
  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
    // New load/store. Record chain and offset operands.
    if (VectorInfo[i] & PVF_FIRST) {
      assert(StoreOperands.empty() && "Orphaned operand list.");
      StoreOperands.push_back(Chain);
      StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
    }

    SDValue RetVal = OutVals[i];
    if (ExtendIntegerRetVal) {
      RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                  : ISD::ZERO_EXTEND,
                           dl, MVT::i32, RetVal);
    } else if (RetVal.getValueSizeInBits() < 16) {
      // Use 16-bit registers for small load-stores as it's the
      // smallest general purpose register size supported by NVPTX.
      RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
    }

    // Record the value to return.
    StoreOperands.push_back(RetVal);

    // That's the last element of this store op.
    if (VectorInfo[i] & PVF_LAST) {
      NVPTXISD::NodeType Op;
      unsigned NumElts = StoreOperands.size() - 2;
      switch (NumElts) {
      case 1:
        Op = NVPTXISD::StoreRetval;
        break;
      case 2:
        Op = NVPTXISD::StoreRetvalV2;
        break;
      case 4:
        Op = NVPTXISD::StoreRetvalV4;
        break;
      default:
        llvm_unreachable("Invalid vector info.");
      }

      // Adjust type of load/store op if we've extended the scalar
      // return value.
      EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
      Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
                                      StoreOperands, TheStoreType,
                                      MachinePointerInfo(), /* Align */ 1,
                                      MachineMemOperand::MOStore);
      // Cleanup vector state.
      StoreOperands.clear();
    }
  }

  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
}
void NVPTXTargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  if (Constraint.length() > 1)
    return;
  else
    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
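// Map an NVVM texture/tld4 intrinsic ID to the corresponding NVPTXISD texture
// opcode. The naming encodes geometry (1D/2D/3D/Cube/Array), returned element
// type, and coordinate type.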
2753 static unsigned getOpcForTextureInstr(unsigned Intrinsic
) {
2754 switch (Intrinsic
) {
2758 case Intrinsic::nvvm_tex_1d_v4f32_s32
:
2759 return NVPTXISD::Tex1DFloatS32
;
2760 case Intrinsic::nvvm_tex_1d_v4f32_f32
:
2761 return NVPTXISD::Tex1DFloatFloat
;
2762 case Intrinsic::nvvm_tex_1d_level_v4f32_f32
:
2763 return NVPTXISD::Tex1DFloatFloatLevel
;
2764 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32
:
2765 return NVPTXISD::Tex1DFloatFloatGrad
;
2766 case Intrinsic::nvvm_tex_1d_v4s32_s32
:
2767 return NVPTXISD::Tex1DS32S32
;
2768 case Intrinsic::nvvm_tex_1d_v4s32_f32
:
2769 return NVPTXISD::Tex1DS32Float
;
2770 case Intrinsic::nvvm_tex_1d_level_v4s32_f32
:
2771 return NVPTXISD::Tex1DS32FloatLevel
;
2772 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32
:
2773 return NVPTXISD::Tex1DS32FloatGrad
;
2774 case Intrinsic::nvvm_tex_1d_v4u32_s32
:
2775 return NVPTXISD::Tex1DU32S32
;
2776 case Intrinsic::nvvm_tex_1d_v4u32_f32
:
2777 return NVPTXISD::Tex1DU32Float
;
2778 case Intrinsic::nvvm_tex_1d_level_v4u32_f32
:
2779 return NVPTXISD::Tex1DU32FloatLevel
;
2780 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32
:
2781 return NVPTXISD::Tex1DU32FloatGrad
;
2783 case Intrinsic::nvvm_tex_1d_array_v4f32_s32
:
2784 return NVPTXISD::Tex1DArrayFloatS32
;
2785 case Intrinsic::nvvm_tex_1d_array_v4f32_f32
:
2786 return NVPTXISD::Tex1DArrayFloatFloat
;
2787 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32
:
2788 return NVPTXISD::Tex1DArrayFloatFloatLevel
;
2789 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32
:
2790 return NVPTXISD::Tex1DArrayFloatFloatGrad
;
2791 case Intrinsic::nvvm_tex_1d_array_v4s32_s32
:
2792 return NVPTXISD::Tex1DArrayS32S32
;
2793 case Intrinsic::nvvm_tex_1d_array_v4s32_f32
:
2794 return NVPTXISD::Tex1DArrayS32Float
;
2795 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32
:
2796 return NVPTXISD::Tex1DArrayS32FloatLevel
;
2797 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32
:
2798 return NVPTXISD::Tex1DArrayS32FloatGrad
;
2799 case Intrinsic::nvvm_tex_1d_array_v4u32_s32
:
2800 return NVPTXISD::Tex1DArrayU32S32
;
2801 case Intrinsic::nvvm_tex_1d_array_v4u32_f32
:
2802 return NVPTXISD::Tex1DArrayU32Float
;
2803 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32
:
2804 return NVPTXISD::Tex1DArrayU32FloatLevel
;
2805 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32
:
2806 return NVPTXISD::Tex1DArrayU32FloatGrad
;
2808 case Intrinsic::nvvm_tex_2d_v4f32_s32
:
2809 return NVPTXISD::Tex2DFloatS32
;
2810 case Intrinsic::nvvm_tex_2d_v4f32_f32
:
2811 return NVPTXISD::Tex2DFloatFloat
;
2812 case Intrinsic::nvvm_tex_2d_level_v4f32_f32
:
2813 return NVPTXISD::Tex2DFloatFloatLevel
;
2814 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32
:
2815 return NVPTXISD::Tex2DFloatFloatGrad
;
2816 case Intrinsic::nvvm_tex_2d_v4s32_s32
:
2817 return NVPTXISD::Tex2DS32S32
;
2818 case Intrinsic::nvvm_tex_2d_v4s32_f32
:
2819 return NVPTXISD::Tex2DS32Float
;
2820 case Intrinsic::nvvm_tex_2d_level_v4s32_f32
:
2821 return NVPTXISD::Tex2DS32FloatLevel
;
2822 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32
:
2823 return NVPTXISD::Tex2DS32FloatGrad
;
2824 case Intrinsic::nvvm_tex_2d_v4u32_s32
:
2825 return NVPTXISD::Tex2DU32S32
;
2826 case Intrinsic::nvvm_tex_2d_v4u32_f32
:
2827 return NVPTXISD::Tex2DU32Float
;
2828 case Intrinsic::nvvm_tex_2d_level_v4u32_f32
:
2829 return NVPTXISD::Tex2DU32FloatLevel
;
2830 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32
:
2831 return NVPTXISD::Tex2DU32FloatGrad
;
2833 case Intrinsic::nvvm_tex_2d_array_v4f32_s32
:
2834 return NVPTXISD::Tex2DArrayFloatS32
;
2835 case Intrinsic::nvvm_tex_2d_array_v4f32_f32
:
2836 return NVPTXISD::Tex2DArrayFloatFloat
;
2837 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32
:
2838 return NVPTXISD::Tex2DArrayFloatFloatLevel
;
2839 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32
:
2840 return NVPTXISD::Tex2DArrayFloatFloatGrad
;
2841 case Intrinsic::nvvm_tex_2d_array_v4s32_s32
:
2842 return NVPTXISD::Tex2DArrayS32S32
;
2843 case Intrinsic::nvvm_tex_2d_array_v4s32_f32
:
2844 return NVPTXISD::Tex2DArrayS32Float
;
2845 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32
:
2846 return NVPTXISD::Tex2DArrayS32FloatLevel
;
2847 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32
:
2848 return NVPTXISD::Tex2DArrayS32FloatGrad
;
2849 case Intrinsic::nvvm_tex_2d_array_v4u32_s32
:
2850 return NVPTXISD::Tex2DArrayU32S32
;
2851 case Intrinsic::nvvm_tex_2d_array_v4u32_f32
:
2852 return NVPTXISD::Tex2DArrayU32Float
;
2853 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32
:
2854 return NVPTXISD::Tex2DArrayU32FloatLevel
;
2855 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32
:
2856 return NVPTXISD::Tex2DArrayU32FloatGrad
;
2858 case Intrinsic::nvvm_tex_3d_v4f32_s32
:
2859 return NVPTXISD::Tex3DFloatS32
;
2860 case Intrinsic::nvvm_tex_3d_v4f32_f32
:
2861 return NVPTXISD::Tex3DFloatFloat
;
2862 case Intrinsic::nvvm_tex_3d_level_v4f32_f32
:
2863 return NVPTXISD::Tex3DFloatFloatLevel
;
  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
    return NVPTXISD::Tex3DFloatFloatGrad;
  case Intrinsic::nvvm_tex_3d_v4s32_s32:
    return NVPTXISD::Tex3DS32S32;
  case Intrinsic::nvvm_tex_3d_v4s32_f32:
    return NVPTXISD::Tex3DS32Float;
  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
    return NVPTXISD::Tex3DS32FloatLevel;
  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
    return NVPTXISD::Tex3DS32FloatGrad;
  case Intrinsic::nvvm_tex_3d_v4u32_s32:
    return NVPTXISD::Tex3DU32S32;
  case Intrinsic::nvvm_tex_3d_v4u32_f32:
    return NVPTXISD::Tex3DU32Float;
  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
    return NVPTXISD::Tex3DU32FloatLevel;
  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
    return NVPTXISD::Tex3DU32FloatGrad;

  case Intrinsic::nvvm_tex_cube_v4f32_f32:
    return NVPTXISD::TexCubeFloatFloat;
  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
    return NVPTXISD::TexCubeFloatFloatLevel;
  case Intrinsic::nvvm_tex_cube_v4s32_f32:
    return NVPTXISD::TexCubeS32Float;
  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
    return NVPTXISD::TexCubeS32FloatLevel;
  case Intrinsic::nvvm_tex_cube_v4u32_f32:
    return NVPTXISD::TexCubeU32Float;
  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
    return NVPTXISD::TexCubeU32FloatLevel;

  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
    return NVPTXISD::TexCubeArrayFloatFloat;
  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
    return NVPTXISD::TexCubeArrayFloatFloatLevel;
  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
    return NVPTXISD::TexCubeArrayS32Float;
  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
    return NVPTXISD::TexCubeArrayS32FloatLevel;
  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
    return NVPTXISD::TexCubeArrayU32Float;
  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
    return NVPTXISD::TexCubeArrayU32FloatLevel;

  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
    return NVPTXISD::Tld4R2DFloatFloat;
  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
    return NVPTXISD::Tld4G2DFloatFloat;
  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
    return NVPTXISD::Tld4B2DFloatFloat;
  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
    return NVPTXISD::Tld4A2DFloatFloat;
  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
    return NVPTXISD::Tld4R2DS64Float;
  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
    return NVPTXISD::Tld4G2DS64Float;
  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
    return NVPTXISD::Tld4B2DS64Float;
  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
    return NVPTXISD::Tld4A2DS64Float;
  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
    return NVPTXISD::Tld4R2DU64Float;
  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
    return NVPTXISD::Tld4G2DU64Float;
  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
    return NVPTXISD::Tld4B2DU64Float;
  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
    return NVPTXISD::Tld4A2DU64Float;

  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
    return NVPTXISD::TexUnified1DFloatS32;
  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
    return NVPTXISD::TexUnified1DFloatFloat;
  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
    return NVPTXISD::TexUnified1DFloatFloatLevel;
  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
    return NVPTXISD::TexUnified1DFloatFloatGrad;
  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
    return NVPTXISD::TexUnified1DS32S32;
  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
    return NVPTXISD::TexUnified1DS32Float;
  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
    return NVPTXISD::TexUnified1DS32FloatLevel;
  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
    return NVPTXISD::TexUnified1DS32FloatGrad;
  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
    return NVPTXISD::TexUnified1DU32S32;
  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
    return NVPTXISD::TexUnified1DU32Float;
  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
    return NVPTXISD::TexUnified1DU32FloatLevel;
  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
    return NVPTXISD::TexUnified1DU32FloatGrad;

  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
    return NVPTXISD::TexUnified1DArrayFloatS32;
  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
    return NVPTXISD::TexUnified1DArrayFloatFloat;
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
    return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
    return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
    return NVPTXISD::TexUnified1DArrayS32S32;
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
    return NVPTXISD::TexUnified1DArrayS32Float;
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
    return NVPTXISD::TexUnified1DArrayS32FloatLevel;
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
    return NVPTXISD::TexUnified1DArrayS32FloatGrad;
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
    return NVPTXISD::TexUnified1DArrayU32S32;
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
    return NVPTXISD::TexUnified1DArrayU32Float;
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
    return NVPTXISD::TexUnified1DArrayU32FloatLevel;
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
    return NVPTXISD::TexUnified1DArrayU32FloatGrad;

  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
    return NVPTXISD::TexUnified2DFloatS32;
  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
    return NVPTXISD::TexUnified2DFloatFloat;
  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
    return NVPTXISD::TexUnified2DFloatFloatLevel;
  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
    return NVPTXISD::TexUnified2DFloatFloatGrad;
  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
    return NVPTXISD::TexUnified2DS32S32;
  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
    return NVPTXISD::TexUnified2DS32Float;
  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
    return NVPTXISD::TexUnified2DS32FloatLevel;
  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
    return NVPTXISD::TexUnified2DS32FloatGrad;
  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
    return NVPTXISD::TexUnified2DU32S32;
  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
    return NVPTXISD::TexUnified2DU32Float;
  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
    return NVPTXISD::TexUnified2DU32FloatLevel;
  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
    return NVPTXISD::TexUnified2DU32FloatGrad;

  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
    return NVPTXISD::TexUnified2DArrayFloatS32;
  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
    return NVPTXISD::TexUnified2DArrayFloatFloat;
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
    return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
    return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
    return NVPTXISD::TexUnified2DArrayS32S32;
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
    return NVPTXISD::TexUnified2DArrayS32Float;
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
    return NVPTXISD::TexUnified2DArrayS32FloatLevel;
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
    return NVPTXISD::TexUnified2DArrayS32FloatGrad;
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
    return NVPTXISD::TexUnified2DArrayU32S32;
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
    return NVPTXISD::TexUnified2DArrayU32Float;
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
    return NVPTXISD::TexUnified2DArrayU32FloatLevel;
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
    return NVPTXISD::TexUnified2DArrayU32FloatGrad;

  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
    return NVPTXISD::TexUnified3DFloatS32;
  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
    return NVPTXISD::TexUnified3DFloatFloat;
  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
    return NVPTXISD::TexUnified3DFloatFloatLevel;
  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
    return NVPTXISD::TexUnified3DFloatFloatGrad;
  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
    return NVPTXISD::TexUnified3DS32S32;
  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
    return NVPTXISD::TexUnified3DS32Float;
  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
    return NVPTXISD::TexUnified3DS32FloatLevel;
  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
    return NVPTXISD::TexUnified3DS32FloatGrad;
  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
    return NVPTXISD::TexUnified3DU32S32;
  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
    return NVPTXISD::TexUnified3DU32Float;
  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
    return NVPTXISD::TexUnified3DU32FloatLevel;
  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
    return NVPTXISD::TexUnified3DU32FloatGrad;

  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
    return NVPTXISD::TexUnifiedCubeFloatFloat;
  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
    return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
    return NVPTXISD::TexUnifiedCubeS32Float;
  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
    return NVPTXISD::TexUnifiedCubeS32FloatLevel;
  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
    return NVPTXISD::TexUnifiedCubeU32Float;
  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
    return NVPTXISD::TexUnifiedCubeU32FloatLevel;

  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
    return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
    return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
    return NVPTXISD::TexUnifiedCubeArrayS32Float;
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
    return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
    return NVPTXISD::TexUnifiedCubeArrayU32Float;
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
    return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;

  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
    return NVPTXISD::Tld4UnifiedR2DFloatFloat;
  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
    return NVPTXISD::Tld4UnifiedG2DFloatFloat;
  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
    return NVPTXISD::Tld4UnifiedB2DFloatFloat;
  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
    return NVPTXISD::Tld4UnifiedA2DFloatFloat;
  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
    return NVPTXISD::Tld4UnifiedR2DS64Float;
  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
    return NVPTXISD::Tld4UnifiedG2DS64Float;
  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
    return NVPTXISD::Tld4UnifiedB2DS64Float;
  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
    return NVPTXISD::Tld4UnifiedA2DS64Float;
  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
    return NVPTXISD::Tld4UnifiedR2DU64Float;
  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
    return NVPTXISD::Tld4UnifiedG2DU64Float;
  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
    return NVPTXISD::Tld4UnifiedB2DU64Float;
  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
    return NVPTXISD::Tld4UnifiedA2DU64Float;
  }
}
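// Map an NVVM surface-load (suld) intrinsic ID to the corresponding NVPTXISD
// surface opcode used during instruction selection.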
static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
  switch (Intrinsic) {
  default:
    return 0;
  case Intrinsic::nvvm_suld_1d_i8_clamp:
    return NVPTXISD::Suld1DI8Clamp;
  case Intrinsic::nvvm_suld_1d_i16_clamp:
    return NVPTXISD::Suld1DI16Clamp;
  case Intrinsic::nvvm_suld_1d_i32_clamp:
    return NVPTXISD::Suld1DI32Clamp;
  case Intrinsic::nvvm_suld_1d_i64_clamp:
    return NVPTXISD::Suld1DI64Clamp;
  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
    return NVPTXISD::Suld1DV2I8Clamp;
  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
    return NVPTXISD::Suld1DV2I16Clamp;
  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
    return NVPTXISD::Suld1DV2I32Clamp;
  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
    return NVPTXISD::Suld1DV2I64Clamp;
  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
    return NVPTXISD::Suld1DV4I8Clamp;
  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
    return NVPTXISD::Suld1DV4I16Clamp;
  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
    return NVPTXISD::Suld1DV4I32Clamp;
  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
    return NVPTXISD::Suld1DArrayI8Clamp;
  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
    return NVPTXISD::Suld1DArrayI16Clamp;
  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
    return NVPTXISD::Suld1DArrayI32Clamp;
  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
    return NVPTXISD::Suld1DArrayI64Clamp;
  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
    return NVPTXISD::Suld1DArrayV2I8Clamp;
  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
    return NVPTXISD::Suld1DArrayV2I16Clamp;
  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
    return NVPTXISD::Suld1DArrayV2I32Clamp;
  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
    return NVPTXISD::Suld1DArrayV2I64Clamp;
  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
    return NVPTXISD::Suld1DArrayV4I8Clamp;
  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
    return NVPTXISD::Suld1DArrayV4I16Clamp;
  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
    return NVPTXISD::Suld1DArrayV4I32Clamp;
  case Intrinsic::nvvm_suld_2d_i8_clamp:
    return NVPTXISD::Suld2DI8Clamp;
  case Intrinsic::nvvm_suld_2d_i16_clamp:
    return NVPTXISD::Suld2DI16Clamp;
  case Intrinsic::nvvm_suld_2d_i32_clamp:
    return NVPTXISD::Suld2DI32Clamp;
  case Intrinsic::nvvm_suld_2d_i64_clamp:
    return NVPTXISD::Suld2DI64Clamp;
  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
    return NVPTXISD::Suld2DV2I8Clamp;
  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
    return NVPTXISD::Suld2DV2I16Clamp;
  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
    return NVPTXISD::Suld2DV2I32Clamp;
  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
    return NVPTXISD::Suld2DV2I64Clamp;
  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
    return NVPTXISD::Suld2DV4I8Clamp;
  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
    return NVPTXISD::Suld2DV4I16Clamp;
  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
    return NVPTXISD::Suld2DV4I32Clamp;
  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
    return NVPTXISD::Suld2DArrayI8Clamp;
  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
    return NVPTXISD::Suld2DArrayI16Clamp;
  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
    return NVPTXISD::Suld2DArrayI32Clamp;
  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
    return NVPTXISD::Suld2DArrayI64Clamp;
  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
    return NVPTXISD::Suld2DArrayV2I8Clamp;
  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
    return NVPTXISD::Suld2DArrayV2I16Clamp;
  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
    return NVPTXISD::Suld2DArrayV2I32Clamp;
  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
    return NVPTXISD::Suld2DArrayV2I64Clamp;
  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
    return NVPTXISD::Suld2DArrayV4I8Clamp;
  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
    return NVPTXISD::Suld2DArrayV4I16Clamp;
  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
    return NVPTXISD::Suld2DArrayV4I32Clamp;
  case Intrinsic::nvvm_suld_3d_i8_clamp:
    return NVPTXISD::Suld3DI8Clamp;
  case Intrinsic::nvvm_suld_3d_i16_clamp:
    return NVPTXISD::Suld3DI16Clamp;
  case Intrinsic::nvvm_suld_3d_i32_clamp:
    return NVPTXISD::Suld3DI32Clamp;
  case Intrinsic::nvvm_suld_3d_i64_clamp:
    return NVPTXISD::Suld3DI64Clamp;
  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
    return NVPTXISD::Suld3DV2I8Clamp;
  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
    return NVPTXISD::Suld3DV2I16Clamp;
  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
    return NVPTXISD::Suld3DV2I32Clamp;
  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
    return NVPTXISD::Suld3DV2I64Clamp;
  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
    return NVPTXISD::Suld3DV4I8Clamp;
  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
    return NVPTXISD::Suld3DV4I16Clamp;
  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
    return NVPTXISD::Suld3DV4I32Clamp;
  case Intrinsic::nvvm_suld_1d_i8_trap:
    return NVPTXISD::Suld1DI8Trap;
  case Intrinsic::nvvm_suld_1d_i16_trap:
    return NVPTXISD::Suld1DI16Trap;
  case Intrinsic::nvvm_suld_1d_i32_trap:
    return NVPTXISD::Suld1DI32Trap;
  case Intrinsic::nvvm_suld_1d_i64_trap:
    return NVPTXISD::Suld1DI64Trap;
  case Intrinsic::nvvm_suld_1d_v2i8_trap:
    return NVPTXISD::Suld1DV2I8Trap;
  case Intrinsic::nvvm_suld_1d_v2i16_trap:
    return NVPTXISD::Suld1DV2I16Trap;
  case Intrinsic::nvvm_suld_1d_v2i32_trap:
    return NVPTXISD::Suld1DV2I32Trap;
  case Intrinsic::nvvm_suld_1d_v2i64_trap:
    return NVPTXISD::Suld1DV2I64Trap;
  case Intrinsic::nvvm_suld_1d_v4i8_trap:
    return NVPTXISD::Suld1DV4I8Trap;
  case Intrinsic::nvvm_suld_1d_v4i16_trap:
    return NVPTXISD::Suld1DV4I16Trap;
  case Intrinsic::nvvm_suld_1d_v4i32_trap:
    return NVPTXISD::Suld1DV4I32Trap;
  case Intrinsic::nvvm_suld_1d_array_i8_trap:
    return NVPTXISD::Suld1DArrayI8Trap;
  case Intrinsic::nvvm_suld_1d_array_i16_trap:
    return NVPTXISD::Suld1DArrayI16Trap;
  case Intrinsic::nvvm_suld_1d_array_i32_trap:
    return NVPTXISD::Suld1DArrayI32Trap;
  case Intrinsic::nvvm_suld_1d_array_i64_trap:
    return NVPTXISD::Suld1DArrayI64Trap;
  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
    return NVPTXISD::Suld1DArrayV2I8Trap;
  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
    return NVPTXISD::Suld1DArrayV2I16Trap;
  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
    return NVPTXISD::Suld1DArrayV2I32Trap;
  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
    return NVPTXISD::Suld1DArrayV2I64Trap;
  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
    return NVPTXISD::Suld1DArrayV4I8Trap;
  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
    return NVPTXISD::Suld1DArrayV4I16Trap;
  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
    return NVPTXISD::Suld1DArrayV4I32Trap;
  case Intrinsic::nvvm_suld_2d_i8_trap:
    return NVPTXISD::Suld2DI8Trap;
  case Intrinsic::nvvm_suld_2d_i16_trap:
    return NVPTXISD::Suld2DI16Trap;
  case Intrinsic::nvvm_suld_2d_i32_trap:
    return NVPTXISD::Suld2DI32Trap;
  case Intrinsic::nvvm_suld_2d_i64_trap:
    return NVPTXISD::Suld2DI64Trap;
  case Intrinsic::nvvm_suld_2d_v2i8_trap:
    return NVPTXISD::Suld2DV2I8Trap;
  case Intrinsic::nvvm_suld_2d_v2i16_trap:
    return NVPTXISD::Suld2DV2I16Trap;
  case Intrinsic::nvvm_suld_2d_v2i32_trap:
    return NVPTXISD::Suld2DV2I32Trap;
  case Intrinsic::nvvm_suld_2d_v2i64_trap:
    return NVPTXISD::Suld2DV2I64Trap;
  case Intrinsic::nvvm_suld_2d_v4i8_trap:
    return NVPTXISD::Suld2DV4I8Trap;
  case Intrinsic::nvvm_suld_2d_v4i16_trap:
    return NVPTXISD::Suld2DV4I16Trap;
  case Intrinsic::nvvm_suld_2d_v4i32_trap:
    return NVPTXISD::Suld2DV4I32Trap;
  case Intrinsic::nvvm_suld_2d_array_i8_trap:
    return NVPTXISD::Suld2DArrayI8Trap;
  case Intrinsic::nvvm_suld_2d_array_i16_trap:
    return NVPTXISD::Suld2DArrayI16Trap;
  case Intrinsic::nvvm_suld_2d_array_i32_trap:
    return NVPTXISD::Suld2DArrayI32Trap;
  case Intrinsic::nvvm_suld_2d_array_i64_trap:
    return NVPTXISD::Suld2DArrayI64Trap;
  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
    return NVPTXISD::Suld2DArrayV2I8Trap;
  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
    return NVPTXISD::Suld2DArrayV2I16Trap;
  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
    return NVPTXISD::Suld2DArrayV2I32Trap;
  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
    return NVPTXISD::Suld2DArrayV2I64Trap;
  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
    return NVPTXISD::Suld2DArrayV4I8Trap;
  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
    return NVPTXISD::Suld2DArrayV4I16Trap;
  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
    return NVPTXISD::Suld2DArrayV4I32Trap;
  case Intrinsic::nvvm_suld_3d_i8_trap:
    return NVPTXISD::Suld3DI8Trap;
  case Intrinsic::nvvm_suld_3d_i16_trap:
    return NVPTXISD::Suld3DI16Trap;
  case Intrinsic::nvvm_suld_3d_i32_trap:
    return NVPTXISD::Suld3DI32Trap;
  case Intrinsic::nvvm_suld_3d_i64_trap:
    return NVPTXISD::Suld3DI64Trap;
  case Intrinsic::nvvm_suld_3d_v2i8_trap:
    return NVPTXISD::Suld3DV2I8Trap;
  case Intrinsic::nvvm_suld_3d_v2i16_trap:
    return NVPTXISD::Suld3DV2I16Trap;
  case Intrinsic::nvvm_suld_3d_v2i32_trap:
    return NVPTXISD::Suld3DV2I32Trap;
  case Intrinsic::nvvm_suld_3d_v2i64_trap:
    return NVPTXISD::Suld3DV2I64Trap;
  case Intrinsic::nvvm_suld_3d_v4i8_trap:
    return NVPTXISD::Suld3DV4I8Trap;
  case Intrinsic::nvvm_suld_3d_v4i16_trap:
    return NVPTXISD::Suld3DV4I16Trap;
  case Intrinsic::nvvm_suld_3d_v4i32_trap:
    return NVPTXISD::Suld3DV4I32Trap;
  case Intrinsic::nvvm_suld_1d_i8_zero:
    return NVPTXISD::Suld1DI8Zero;
  case Intrinsic::nvvm_suld_1d_i16_zero:
    return NVPTXISD::Suld1DI16Zero;
  case Intrinsic::nvvm_suld_1d_i32_zero:
    return NVPTXISD::Suld1DI32Zero;
  case Intrinsic::nvvm_suld_1d_i64_zero:
    return NVPTXISD::Suld1DI64Zero;
  case Intrinsic::nvvm_suld_1d_v2i8_zero:
    return NVPTXISD::Suld1DV2I8Zero;
  case Intrinsic::nvvm_suld_1d_v2i16_zero:
    return NVPTXISD::Suld1DV2I16Zero;
  case Intrinsic::nvvm_suld_1d_v2i32_zero:
    return NVPTXISD::Suld1DV2I32Zero;
  case Intrinsic::nvvm_suld_1d_v2i64_zero:
    return NVPTXISD::Suld1DV2I64Zero;
  case Intrinsic::nvvm_suld_1d_v4i8_zero:
    return NVPTXISD::Suld1DV4I8Zero;
  case Intrinsic::nvvm_suld_1d_v4i16_zero:
    return NVPTXISD::Suld1DV4I16Zero;
  case Intrinsic::nvvm_suld_1d_v4i32_zero:
    return NVPTXISD::Suld1DV4I32Zero;
  case Intrinsic::nvvm_suld_1d_array_i8_zero:
    return NVPTXISD::Suld1DArrayI8Zero;
  case Intrinsic::nvvm_suld_1d_array_i16_zero:
    return NVPTXISD::Suld1DArrayI16Zero;
  case Intrinsic::nvvm_suld_1d_array_i32_zero:
    return NVPTXISD::Suld1DArrayI32Zero;
  case Intrinsic::nvvm_suld_1d_array_i64_zero:
    return NVPTXISD::Suld1DArrayI64Zero;
  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
    return NVPTXISD::Suld1DArrayV2I8Zero;
  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
    return NVPTXISD::Suld1DArrayV2I16Zero;
  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
    return NVPTXISD::Suld1DArrayV2I32Zero;
  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
    return NVPTXISD::Suld1DArrayV2I64Zero;
  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
    return NVPTXISD::Suld1DArrayV4I8Zero;
  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
    return NVPTXISD::Suld1DArrayV4I16Zero;
  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
    return NVPTXISD::Suld1DArrayV4I32Zero;
  case Intrinsic::nvvm_suld_2d_i8_zero:
    return NVPTXISD::Suld2DI8Zero;
  case Intrinsic::nvvm_suld_2d_i16_zero:
    return NVPTXISD::Suld2DI16Zero;
  case Intrinsic::nvvm_suld_2d_i32_zero:
    return NVPTXISD::Suld2DI32Zero;
  case Intrinsic::nvvm_suld_2d_i64_zero:
    return NVPTXISD::Suld2DI64Zero;
  case Intrinsic::nvvm_suld_2d_v2i8_zero:
    return NVPTXISD::Suld2DV2I8Zero;
  case Intrinsic::nvvm_suld_2d_v2i16_zero:
    return NVPTXISD::Suld2DV2I16Zero;
  case Intrinsic::nvvm_suld_2d_v2i32_zero:
    return NVPTXISD::Suld2DV2I32Zero;
  case Intrinsic::nvvm_suld_2d_v2i64_zero:
    return NVPTXISD::Suld2DV2I64Zero;
  case Intrinsic::nvvm_suld_2d_v4i8_zero:
    return NVPTXISD::Suld2DV4I8Zero;
  case Intrinsic::nvvm_suld_2d_v4i16_zero:
    return NVPTXISD::Suld2DV4I16Zero;
  case Intrinsic::nvvm_suld_2d_v4i32_zero:
    return NVPTXISD::Suld2DV4I32Zero;
  case Intrinsic::nvvm_suld_2d_array_i8_zero:
    return NVPTXISD::Suld2DArrayI8Zero;
  case Intrinsic::nvvm_suld_2d_array_i16_zero:
    return NVPTXISD::Suld2DArrayI16Zero;
  case Intrinsic::nvvm_suld_2d_array_i32_zero:
    return NVPTXISD::Suld2DArrayI32Zero;
  case Intrinsic::nvvm_suld_2d_array_i64_zero:
    return NVPTXISD::Suld2DArrayI64Zero;
  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
    return NVPTXISD::Suld2DArrayV2I8Zero;
  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
    return NVPTXISD::Suld2DArrayV2I16Zero;
  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
    return NVPTXISD::Suld2DArrayV2I32Zero;
  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
    return NVPTXISD::Suld2DArrayV2I64Zero;
  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
    return NVPTXISD::Suld2DArrayV4I8Zero;
  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
    return NVPTXISD::Suld2DArrayV4I16Zero;
  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
    return NVPTXISD::Suld2DArrayV4I32Zero;
  case Intrinsic::nvvm_suld_3d_i8_zero:
    return NVPTXISD::Suld3DI8Zero;
  case Intrinsic::nvvm_suld_3d_i16_zero:
    return NVPTXISD::Suld3DI16Zero;
  case Intrinsic::nvvm_suld_3d_i32_zero:
    return NVPTXISD::Suld3DI32Zero;
  case Intrinsic::nvvm_suld_3d_i64_zero:
    return NVPTXISD::Suld3DI64Zero;
  case Intrinsic::nvvm_suld_3d_v2i8_zero:
    return NVPTXISD::Suld3DV2I8Zero;
  case Intrinsic::nvvm_suld_3d_v2i16_zero:
    return NVPTXISD::Suld3DV2I16Zero;
  case Intrinsic::nvvm_suld_3d_v2i32_zero:
    return NVPTXISD::Suld3DV2I32Zero;
  case Intrinsic::nvvm_suld_3d_v2i64_zero:
    return NVPTXISD::Suld3DV2I64Zero;
  case Intrinsic::nvvm_suld_3d_v4i8_zero:
    return NVPTXISD::Suld3DV4I8Zero;
  case Intrinsic::nvvm_suld_3d_v4i16_zero:
    return NVPTXISD::Suld3DV4I16Zero;
  case Intrinsic::nvvm_suld_3d_v4i32_zero:
    return NVPTXISD::Suld3DV4I32Zero;
  }
}
// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
// TgtMemIntrinsic because we need the information that is only available in
// the "Value" type of the destination pointer. In particular, the address
// space information.
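// The IntrinsicInfo filled in by this hook is what SelectionDAG uses to attach
// a MachineMemOperand (memory VT, base pointer, load/store flags, alignment)
// to the lowered intrinsic node.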
bool NVPTXTargetLowering::getTgtMemIntrinsic(
    IntrinsicInfo &Info, const CallInst &I,
    MachineFunction &MF, unsigned Intrinsic) const {
  switch (Intrinsic) {
  default:
    return false;
  case Intrinsic::nvvm_match_all_sync_i32p:
  case Intrinsic::nvvm_match_all_sync_i64p:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
    // in order to model data exchange with other threads, but perform no real
    // memory accesses.
    Info.memVT = MVT::i1;

    // Our result depends on both our and other thread's arguments.
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
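  // Loads of WMMA matrix fragments. Each case below models the per-thread
  // fragment as a plain memory load of the vector type assigned to memVT,
  // through the pointer passed as the intrinsic's first operand.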
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v8f16;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v2i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(8);
    return true;
  }

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v4i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(4);
    return true;
  }

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v4f16;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v8f32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v8i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v2i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(8);
    return true;
  }

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::v4f16;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOStore;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::v8f32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOStore;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::v8i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOStore;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::v2i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOStore;
    Info.align = Align(8);
    return true;
  }
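  // Scoped (cta/sys) atomics on generic pointers. An atomic both reads and
  // writes memory, so it is modeled with MOLoad | MOStore and the memory type
  // of the intrinsic's result.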
  case Intrinsic::nvvm_atomic_load_inc_32:
  case Intrinsic::nvvm_atomic_load_dec_32:

  case Intrinsic::nvvm_atomic_add_gen_f_cta:
  case Intrinsic::nvvm_atomic_add_gen_f_sys:
  case Intrinsic::nvvm_atomic_add_gen_i_cta:
  case Intrinsic::nvvm_atomic_add_gen_i_sys:
  case Intrinsic::nvvm_atomic_and_gen_i_cta:
  case Intrinsic::nvvm_atomic_and_gen_i_sys:
  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
  case Intrinsic::nvvm_atomic_max_gen_i_cta:
  case Intrinsic::nvvm_atomic_max_gen_i_sys:
  case Intrinsic::nvvm_atomic_min_gen_i_cta:
  case Intrinsic::nvvm_atomic_min_gen_i_sys:
  case Intrinsic::nvvm_atomic_or_gen_i_cta:
  case Intrinsic::nvvm_atomic_or_gen_i_sys:
  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
    auto &DL = I.getModule()->getDataLayout();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = getValueType(DL, I.getType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  }
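  // ldu/ldg perform read-only loads from global memory; the required alignment
  // is supplied as the intrinsic's second (constant) argument.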
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    auto &DL = I.getModule()->getDataLayout();

    Info.opc = ISD::INTRINSIC_W_CHAIN;
    if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
      Info.memVT = getValueType(DL, I.getType());
    else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
      Info.memVT = getPointerTy(DL);
    else
      Info.memVT = getValueType(DL, I.getType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align =
        MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());

    return true;
  }
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p: {
    auto &DL = I.getModule()->getDataLayout();

    Info.opc = ISD::INTRINSIC_W_CHAIN;
    if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
      Info.memVT = getValueType(DL, I.getType());
    else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
      Info.memVT = getPointerTy(DL);
    else
      Info.memVT = getValueType(DL, I.getType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align =
        MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());

    return true;
  }
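  // Texture (tex.*) and tld4 fetches that return four f32 components. The
  // texture handle is not a real pointer, so ptrVal is left null.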
  case Intrinsic::nvvm_tex_1d_v4f32_s32:
  case Intrinsic::nvvm_tex_1d_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_v4f32_s32:
  case Intrinsic::nvvm_tex_2d_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_3d_v4f32_s32:
  case Intrinsic::nvvm_tex_3d_v4f32_f32:
  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
    Info.opc = getOpcForTextureInstr(Intrinsic);
    Info.memVT = MVT::v4f32;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
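  // Texture and tld4 fetches that return four 32-bit integer components
  // (signed or unsigned).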
  case Intrinsic::nvvm_tex_1d_v4s32_s32:
  case Intrinsic::nvvm_tex_1d_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_v4s32_s32:
  case Intrinsic::nvvm_tex_2d_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_v4s32_s32:
  case Intrinsic::nvvm_tex_3d_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_v4u32_s32:
  case Intrinsic::nvvm_tex_1d_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_v4u32_s32:
  case Intrinsic::nvvm_tex_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_v4u32_s32:
  case Intrinsic::nvvm_tex_3d_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
    Info.opc = getOpcForTextureInstr(Intrinsic);
    Info.memVT = MVT::v4i32;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
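  // Surface loads (suld.*), grouped by element width (i8/i16/i32/i64). The
  // surface handle is not a real pointer, so ptrVal is left null here as well.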
  case Intrinsic::nvvm_suld_1d_i8_clamp:
  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
  case Intrinsic::nvvm_suld_2d_i8_clamp:
  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
  case Intrinsic::nvvm_suld_3d_i8_clamp:
  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
  case Intrinsic::nvvm_suld_1d_i8_trap:
  case Intrinsic::nvvm_suld_1d_v2i8_trap:
  case Intrinsic::nvvm_suld_1d_v4i8_trap:
  case Intrinsic::nvvm_suld_1d_array_i8_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
  case Intrinsic::nvvm_suld_2d_i8_trap:
  case Intrinsic::nvvm_suld_2d_v2i8_trap:
  case Intrinsic::nvvm_suld_2d_v4i8_trap:
  case Intrinsic::nvvm_suld_2d_array_i8_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
  case Intrinsic::nvvm_suld_3d_i8_trap:
  case Intrinsic::nvvm_suld_3d_v2i8_trap:
  case Intrinsic::nvvm_suld_3d_v4i8_trap:
  case Intrinsic::nvvm_suld_1d_i8_zero:
  case Intrinsic::nvvm_suld_1d_v2i8_zero:
  case Intrinsic::nvvm_suld_1d_v4i8_zero:
  case Intrinsic::nvvm_suld_1d_array_i8_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
  case Intrinsic::nvvm_suld_2d_i8_zero:
  case Intrinsic::nvvm_suld_2d_v2i8_zero:
  case Intrinsic::nvvm_suld_2d_v4i8_zero:
  case Intrinsic::nvvm_suld_2d_array_i8_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
  case Intrinsic::nvvm_suld_3d_i8_zero:
  case Intrinsic::nvvm_suld_3d_v2i8_zero:
  case Intrinsic::nvvm_suld_3d_v4i8_zero:
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
    Info.memVT = MVT::i8;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;

  case Intrinsic::nvvm_suld_1d_i16_clamp:
  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
  case Intrinsic::nvvm_suld_2d_i16_clamp:
  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
  case Intrinsic::nvvm_suld_3d_i16_clamp:
  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
  case Intrinsic::nvvm_suld_1d_i16_trap:
  case Intrinsic::nvvm_suld_1d_v2i16_trap:
  case Intrinsic::nvvm_suld_1d_v4i16_trap:
  case Intrinsic::nvvm_suld_1d_array_i16_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
  case Intrinsic::nvvm_suld_2d_i16_trap:
  case Intrinsic::nvvm_suld_2d_v2i16_trap:
  case Intrinsic::nvvm_suld_2d_v4i16_trap:
  case Intrinsic::nvvm_suld_2d_array_i16_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
  case Intrinsic::nvvm_suld_3d_i16_trap:
  case Intrinsic::nvvm_suld_3d_v2i16_trap:
  case Intrinsic::nvvm_suld_3d_v4i16_trap:
  case Intrinsic::nvvm_suld_1d_i16_zero:
  case Intrinsic::nvvm_suld_1d_v2i16_zero:
  case Intrinsic::nvvm_suld_1d_v4i16_zero:
  case Intrinsic::nvvm_suld_1d_array_i16_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
  case Intrinsic::nvvm_suld_2d_i16_zero:
  case Intrinsic::nvvm_suld_2d_v2i16_zero:
  case Intrinsic::nvvm_suld_2d_v4i16_zero:
  case Intrinsic::nvvm_suld_2d_array_i16_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
  case Intrinsic::nvvm_suld_3d_i16_zero:
  case Intrinsic::nvvm_suld_3d_v2i16_zero:
  case Intrinsic::nvvm_suld_3d_v4i16_zero:
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
    Info.memVT = MVT::i16;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;

  case Intrinsic::nvvm_suld_1d_i32_clamp:
  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
  case Intrinsic::nvvm_suld_2d_i32_clamp:
  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
  case Intrinsic::nvvm_suld_3d_i32_clamp:
  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
  case Intrinsic::nvvm_suld_1d_i32_trap:
  case Intrinsic::nvvm_suld_1d_v2i32_trap:
  case Intrinsic::nvvm_suld_1d_v4i32_trap:
  case Intrinsic::nvvm_suld_1d_array_i32_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
  case Intrinsic::nvvm_suld_2d_i32_trap:
  case Intrinsic::nvvm_suld_2d_v2i32_trap:
  case Intrinsic::nvvm_suld_2d_v4i32_trap:
  case Intrinsic::nvvm_suld_2d_array_i32_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
  case Intrinsic::nvvm_suld_3d_i32_trap:
  case Intrinsic::nvvm_suld_3d_v2i32_trap:
  case Intrinsic::nvvm_suld_3d_v4i32_trap:
  case Intrinsic::nvvm_suld_1d_i32_zero:
  case Intrinsic::nvvm_suld_1d_v2i32_zero:
  case Intrinsic::nvvm_suld_1d_v4i32_zero:
  case Intrinsic::nvvm_suld_1d_array_i32_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
  case Intrinsic::nvvm_suld_2d_i32_zero:
  case Intrinsic::nvvm_suld_2d_v2i32_zero:
  case Intrinsic::nvvm_suld_2d_v4i32_zero:
  case Intrinsic::nvvm_suld_2d_array_i32_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
  case Intrinsic::nvvm_suld_3d_i32_zero:
  case Intrinsic::nvvm_suld_3d_v2i32_zero:
  case Intrinsic::nvvm_suld_3d_v4i32_zero:
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
    Info.memVT = MVT::i32;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;

  case Intrinsic::nvvm_suld_1d_i64_clamp:
  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
  case Intrinsic::nvvm_suld_2d_i64_clamp:
  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
  case Intrinsic::nvvm_suld_3d_i64_clamp:
  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
  case Intrinsic::nvvm_suld_1d_i64_trap:
  case Intrinsic::nvvm_suld_1d_v2i64_trap:
  case Intrinsic::nvvm_suld_1d_array_i64_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
  case Intrinsic::nvvm_suld_2d_i64_trap:
  case Intrinsic::nvvm_suld_2d_v2i64_trap:
  case Intrinsic::nvvm_suld_2d_array_i64_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
  case Intrinsic::nvvm_suld_3d_i64_trap:
  case Intrinsic::nvvm_suld_3d_v2i64_trap:
  case Intrinsic::nvvm_suld_1d_i64_zero:
  case Intrinsic::nvvm_suld_1d_v2i64_zero:
  case Intrinsic::nvvm_suld_1d_array_i64_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
  case Intrinsic::nvvm_suld_2d_i64_zero:
  case Intrinsic::nvvm_suld_2d_v2i64_zero:
  case Intrinsic::nvvm_suld_2d_array_i64_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
  case Intrinsic::nvvm_suld_3d_i64_zero:
  case Intrinsic::nvvm_suld_3d_v2i64_zero:
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
    Info.memVT = MVT::i64;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }
  return false;
}
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
/// Used to guide target specific optimizations, like loop strength reduction
/// (LoopStrengthReduce.cpp) and memory optimization for address mode
/// (CodeGenPrepare.cpp)
bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                const AddrMode &AM, Type *Ty,
                                                unsigned AS,
                                                Instruction *I) const {
  // AddrMode - This represents an addressing mode of
  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
  //
  // The legal address modes are
  // - [avar]
  // - [areg]
  // - [areg+immoff]
  // - [immAddr]
  if (AM.BaseGV) {
    return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
  }

  switch (AM.Scale) {
  case 0: // "r", "r+i" or "i" is allowed
    break;
  case 1:
    if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
      return false;
    // Otherwise we have r+i.
    break;
  default:
    // No scale > 1 is allowed
    return false;
  }
  return true;
}
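
// Illustrative sketch (assuming standard PTX addressing syntax): the modes
// accepted above correspond to operands such as
//   ld.global.f32 %f1, [gvar];       // [avar]
//   ld.global.f32 %f1, [%rd1];       // [areg]
//   ld.global.f32 %f1, [%rd1+16];    // [areg+immoff]
//   ld.global.f32 %f1, [0x1000];     // [immAddr]
// A reg+reg form such as [%rd1+%rd2] has no PTX encoding, which is why any
// mode combining a base register with a scaled register is rejected.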
//===----------------------------------------------------------------------===//
//                         NVPTX Inline Assembly Support
//===----------------------------------------------------------------------===//

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
NVPTXTargetLowering::ConstraintType
NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 'b':
    case 'r':
    case 'h':
    case 'c':
    case 'l':
    case 'f':
    case 'd':
    case '0':
    case 'N':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
std::pair<unsigned, const TargetRegisterClass *>
NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                  StringRef Constraint,
                                                  MVT VT) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'b':
      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
    case 'c':
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
    case 'h':
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
    case 'r':
      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
    case 'l':
    case 'N':
      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
    case 'f':
      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
    case 'd':
      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
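
// Illustrative sketch (assuming the usual CUDA inline-asm conventions): these
// letters appear as operand constraints, e.g.
//   asm("add.s32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b));
// where "r" picks a 32-bit integer register, while "h", "l", "f" and "d"
// pick 16-bit, 64-bit, f32 and f64 registers respectively.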
//===----------------------------------------------------------------------===//
// NVPTX DAG Combining
//===----------------------------------------------------------------------===//

bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
                                   CodeGenOpt::Level OptLevel) const {
  // Always honor command-line argument
  if (FMAContractLevelOpt.getNumOccurrences() > 0)
    return FMAContractLevelOpt > 0;

  // Do not contract if we're not optimizing the code.
  if (OptLevel == 0)
    return false;

  // Honor TargetOptions flags that explicitly say fusion is okay.
  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
    return true;

  return allowUnsafeFPMath(MF);
}
bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
  // Honor TargetOptions flags that explicitly say unsafe math is okay.
  if (MF.getTarget().Options.UnsafeFPMath)
    return true;

  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
  const Function &F = MF.getFunction();
  if (F.hasFnAttribute("unsafe-fp-math")) {
    Attribute Attr = F.getFnAttribute("unsafe-fp-math");
    StringRef Val = Attr.getValueAsString();
    return (Val == "true");
  }

  return false;
}
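
// Illustrative sketch (assuming standard LLVM IR attribute syntax): a
// function carrying
//   attributes #0 = { "unsafe-fp-math"="true" }
// is treated the same as compiling with UnsafeFPMath enabled, so FMA
// contraction below is permitted for it.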
/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             const NVPTXSubtarget &Subtarget,
                                             CodeGenOpt::Level OptLevel) {
  SelectionDAG &DAG = DCI.DAG;
  // Skip non-integer, non-scalar case
  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();

  // fold (add (mul a, b), c) -> (mad a, b, c)
  //
  if (N0.getOpcode() == ISD::MUL) {
    assert(VT.isInteger());
    // For integer:
    // Since integer multiply-add costs the same as integer multiply
    // but is more costly than integer add, do the fusion only when
    // the mul is only used in the add.
    if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
        !N0.getNode()->hasOneUse())
      return SDValue();

    // Do the folding
    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
                       N0.getOperand(0), N0.getOperand(1), N1);
  } else if (N0.getOpcode() == ISD::FMUL) {
    if (VT == MVT::f32 || VT == MVT::f64) {
      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
          &DAG.getTargetLoweringInfo());
      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
        return SDValue();

      // For floating point:
      // Do the fusion only when the mul has less than 5 uses and all
      // are add.
      // The heuristic is that if a use is not an add, then that use
      // cannot be fused into fma, therefore mul is still needed anyway.
      // If there are more than 4 uses, even if they are all add, fusing
      // them will increase register pressure.
      int numUses = 0;
      int nonAddCount = 0;
      for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
                                UE = N0.getNode()->use_end();
           UI != UE; ++UI) {
        numUses++;
        SDNode *User = *UI;
        if (User->getOpcode() != ISD::FADD)
          ++nonAddCount;
      }
      if (numUses >= 5)
        return SDValue();
      if (nonAddCount) {
        int orderNo = N->getIROrder();
        int orderNo2 = N0.getNode()->getIROrder();
        // Simple heuristic for estimating potential register pressure: the
        // difference in IR order is used to measure the distance between def
        // and use, and a longer distance is more likely to cause register
        // pressure.
        if (orderNo - orderNo2 < 500)
          return SDValue();

        // Now, check if at least one of the FMUL's operands is live beyond
        // the node N, which guarantees that the FMA will not increase
        // register pressure at node N.
        bool opIsLive = false;
        const SDNode *left = N0.getOperand(0).getNode();
        const SDNode *right = N0.getOperand(1).getNode();

        if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
          opIsLive = true;

        if (!opIsLive)
          for (SDNode::use_iterator UI = left->use_begin(),
                                    UE = left->use_end();
               UI != UE; ++UI) {
            SDNode *User = *UI;
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          for (SDNode::use_iterator UI = right->use_begin(),
                                    UE = right->use_end();
               UI != UE; ++UI) {
            SDNode *User = *UI;
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          return SDValue();
      }

      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                         N0.getOperand(0), N0.getOperand(1), N1);
    }
  }

  return SDValue();
}
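
// Illustrative sketch (assuming typical PTX selection): with this combine,
//   %m = mul i32 %a, %b       ; single use
//   %s = add i32 %m, %c
// becomes one IMAD node (selected as mad.lo.s32), and
//   %fm = fmul float %x, %y
//   %fs = fadd float %fm, %z
// becomes an ISD::FMA node (selected as fma.rn.f32) when the heuristics above
// allow contraction.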
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const NVPTXSubtarget &Subtarget,
                                 CodeGenOpt::Level OptLevel) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // First try with the default operand order.
  if (SDValue Result =
          PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
}
static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  // The type legalizer turns a vector load of i8 values into a zextload to i16
  // registers, optionally ANY_EXTENDs it (if target type is integer),
  // and ANDs off the high 8 bits. Since we turn this load into a
  // target-specific DAG node, the DAG combiner fails to eliminate these AND
  // nodes. Do that here.
  SDValue Val = N->getOperand(0);
  SDValue Mask = N->getOperand(1);

  if (isa<ConstantSDNode>(Val)) {
    std::swap(Val, Mask);
  }

  SDValue AExt;
  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
  if (Val.getOpcode() == ISD::ANY_EXTEND) {
    AExt = Val;
    Val = Val->getOperand(0);
  }

  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
    Val = Val->getOperand(0);
  }

  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
      Val->getOpcode() == NVPTXISD::LoadV4) {
    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
    if (!MaskCnst) {
      // Not an AND with a constant
      return SDValue();
    }

    uint64_t MaskVal = MaskCnst->getZExtValue();
    if (MaskVal != 0xff) {
      // Not an AND that chops off top 8 bits
      return SDValue();
    }

    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
    if (!Mem) {
      // Not a MemSDNode?!?
      return SDValue();
    }

    EVT MemVT = Mem->getMemoryVT();
    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
      // We only handle the i8 case
      return SDValue();
    }

    unsigned ExtType =
        cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
            getZExtValue();
    if (ExtType == ISD::SEXTLOAD) {
      // If for some reason the load is a sextload, the and is needed to zero
      // out the high 8 bits
      return SDValue();
    }

    bool AddTo = false;
    if (AExt.getNode() != nullptr) {
      // Re-insert the ext as a zext.
      Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
                            AExt.getValueType(), Val);
      AddTo = true;
    }

    // If we get here, the AND is unnecessary. Just replace it with the load
    DCI.CombineTo(N, Val, AddTo);
  }

  return SDValue();
}
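
// Illustrative sketch: the pattern removed above typically looks like
//   (and (any_extend (NVPTXISD::LoadV4 ..., memVT=v4i8)), 0xff)
// where the zero-extending i8 vector load already clears the high bits, so
// the AND can be replaced directly by the load result.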
static SDValue PerformREMCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);

  // Don't do anything at less than -O2.
  if (OptLevel < CodeGenOpt::Default)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  bool IsSigned = N->getOpcode() == ISD::SREM;
  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;

  const SDValue &Num = N->getOperand(0);
  const SDValue &Den = N->getOperand(1);

  for (const SDNode *U : Num->uses()) {
    if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
        U->getOperand(1) == Den) {
      // Num % Den -> Num - (Num / Den) * Den
      return DAG.getNode(ISD::SUB, DL, VT, Num,
                         DAG.getNode(ISD::MUL, DL, VT,
                                     DAG.getNode(DivOpc, DL, VT, Num, Den),
                                     Den));
    }
  }
  return SDValue();
}
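
// Illustrative sketch: when the IR contains both the quotient and the
// remainder of the same operands, e.g.
//   %q = udiv i32 %n, %d
//   %r = urem i32 %n, %d
// the remainder is rewritten as %n - (%n / %d) * %d so the existing division
// is reused instead of emitting a second divide.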
enum OperandSignedness {
  Signed = 0,
  Unsigned,
  Unknown
};

/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
/// that can be demoted to \p OptSize bits without loss of information. The
/// signedness of the operand, if determinable, is placed in \p S.
static bool IsMulWideOperandDemotable(SDValue Op,
                                      unsigned OptSize,
                                      OperandSignedness &S) {
  S = Unknown;

  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
      Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    EVT OrigVT = Op.getOperand(0).getValueType();
    if (OrigVT.getSizeInBits() <= OptSize) {
      S = Signed;
      return true;
    }
  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
    EVT OrigVT = Op.getOperand(0).getValueType();
    if (OrigVT.getSizeInBits() <= OptSize) {
      S = Unsigned;
      return true;
    }
  }

  return false;
}

/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
/// be demoted to \p OptSize bits without loss of information. If the operands
/// contain a constant, it should appear as the RHS operand. The signedness of
/// the operands is placed in \p IsSigned.
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
                                        unsigned OptSize,
                                        bool &IsSigned) {
  OperandSignedness LHSSign;

  // The LHS operand must be a demotable op
  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
    return false;

  // We should have been able to determine the signedness from the LHS
  if (LHSSign == Unknown)
    return false;

  IsSigned = (LHSSign == Signed);

  // The RHS can be a demotable op or a constant
  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
    const APInt &Val = CI->getAPIntValue();
    if (LHSSign == Unsigned)
      return Val.isIntN(OptSize);
    return Val.isSignedIntN(OptSize);
  }

  OperandSignedness RHSSign;
  if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
    return false;

  return LHSSign == RHSSign;
}
/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
/// amount.
static SDValue TryMULWIDECombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  EVT MulType = N->getValueType(0);
  if (MulType != MVT::i32 && MulType != MVT::i64) {
    return SDValue();
  }

  SDLoc DL(N);
  unsigned OptSize = MulType.getSizeInBits() >> 1;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Canonicalize the multiply so the constant (if any) is on the right
  if (N->getOpcode() == ISD::MUL) {
    if (isa<ConstantSDNode>(LHS)) {
      std::swap(LHS, RHS);
    }
  }

  // If we have a SHL, determine the actual multiply amount
  if (N->getOpcode() == ISD::SHL) {
    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
    if (!ShlRHS) {
      return SDValue();
    }

    APInt ShiftAmt = ShlRHS->getAPIntValue();
    unsigned BitWidth = MulType.getSizeInBits();
    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
      RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
    } else {
      return SDValue();
    }
  }

  bool Signed;
  // Verify that our operands are demotable
  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
    return SDValue();
  }

  EVT DemotedVT;
  if (MulType == MVT::i32) {
    DemotedVT = MVT::i16;
  } else {
    DemotedVT = MVT::i32;
  }

  // Truncate the operands to the correct size. Note that these are just for
  // type consistency and will (likely) be eliminated in later phases.
  SDValue TruncLHS =
      DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
  SDValue TruncRHS =
      DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);

  unsigned Opc;
  if (Signed) {
    Opc = NVPTXISD::MUL_WIDE_SIGNED;
  } else {
    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
  }

  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
}
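
// Illustrative sketch (assuming typical PTX selection): for
//   %a32 = sext i16 %a to i32
//   %b32 = sext i16 %b to i32
//   %p   = mul i32 %a32, %b32
// the combine emits MUL_WIDE_SIGNED on the truncated i16 operands, which is
// selected as mul.wide.s16 and yields the full 32-bit product directly.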
/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  if (OptLevel > 0) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
static SDValue PerformSHLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  if (OptLevel > 0) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}
static SDValue PerformSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  EVT CCType = N->getValueType(0);
  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);

  if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
    return SDValue();

  SDLoc DL(N);
  // setp.f16x2 returns two scalar predicates, which we need to
  // convert back to v2i1. The returned result will be scalarized by
  // the legalizer, but the comparison will remain a single vector
  // instruction.
  SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
                                   DCI.DAG.getVTList(MVT::i1, MVT::i1),
                                   {A, B, N->getOperand(2)});
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                         CCNode.getValue(1));
}
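
// Illustrative sketch (assuming f16x2 setp support in PTX): a comparison such
// as
//   %c = fcmp olt <2 x half> %a, %b
// stays a single setp-style f16x2 comparison producing two predicates, which
// are repacked here into a v2i1 BUILD_VECTOR for the original users.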
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::ADD:
  case ISD::FADD:
    return PerformADDCombine(N, DCI, STI, OptLevel);
  case ISD::MUL:
    return PerformMULCombine(N, DCI, OptLevel);
  case ISD::SHL:
    return PerformSHLCombine(N, DCI, OptLevel);
  case ISD::AND:
    return PerformANDCombine(N, DCI);
  case ISD::UREM:
  case ISD::SREM:
    return PerformREMCombine(N, DCI, OptLevel);
  case ISD::SETCC:
    return PerformSETCCCombine(N, DCI);
  }
  return SDValue();
}
/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 loads of <2 x double> here
  // but I'm leaving that as a TODO for now.
  assert(ResVT.isSimple() && "Can only handle simple types");
  switch (ResVT.getSimpleVT().SimpleTy) {
  default:
    return;
  case MVT::v2i8:
  case MVT::v2i16:
  case MVT::v2i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v2f32:
  case MVT::v2f64:
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v4i32:
  case MVT::v4f16:
  case MVT::v4f32:
  case MVT::v8f16: // <4 x f16x2>
    // This is a "native" vector type
    break;
  }

  LoadSDNode *LD = cast<LoadSDNode>(N);

  unsigned Align = LD->getAlignment();
  auto &TD = DAG.getDataLayout();
  unsigned PrefAlign =
      TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
  if (Align < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized. Note that we may still be able to emit smaller
    // vector loads. For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.
    return;
  }

  EVT EltVT = ResVT.getVectorElementType();
  unsigned NumElts = ResVT.getVectorNumElements();

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
  bool NeedTrunc = false;
  if (EltVT.getSizeInBits() < 16) {
    EltVT = MVT::i16;
    NeedTrunc = true;
  }

  unsigned Opcode = 0;
  SDVTList LdResVTs;
  bool LoadF16x2 = false;

  switch (NumElts) {
  default:
    return;
  case 2:
    Opcode = NVPTXISD::LoadV2;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    break;
  case 4: {
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  case 8: {
    // v8f16 is a special case. PTX doesn't have ld.v8.f16
    // instruction. Instead, we split the vector into v2f16 chunks and
    // load them with ld.v4.b32.
    assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
    LoadF16x2 = true;
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
                     MVT::Other};
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  }

  // Copy regular operands
  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());

  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));

  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                          LD->getMemoryVT(),
                                          LD->getMemOperand());

  SmallVector<SDValue, 8> ScalarRes;

  if (LoadF16x2) {
    // Split v2f16 subvectors back into individual elements.
    NumElts /= 2;
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue SubVector = NewLD.getValue(i);
      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(0, DL));
      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(1, DL));
      ScalarRes.push_back(E0);
      ScalarRes.push_back(E1);
    }
  } else {
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue Res = NewLD.getValue(i);
      if (NeedTrunc)
        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
      ScalarRes.push_back(Res);
    }
  }

  SDValue LoadChain = NewLD.getValue(NumElts);

  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

  Results.push_back(BuildVec);
  Results.push_back(LoadChain);
}
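
// Illustrative sketch (assuming typical PTX selection): a sufficiently aligned
//   %v = load <4 x float>, <4 x float>* %p, align 16
// is replaced by a single NVPTXISD::LoadV4 node (selected as a ld.v4.f32
// variant), and the four scalar results are reassembled with a BUILD_VECTOR.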
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
  SDValue Chain = N->getOperand(0);
  SDValue Intrin = N->getOperand(1);
  SDLoc DL(N);

  // Get the intrinsic ID
  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
  switch (IntrinNo) {
  default:
    return;
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    if (ResVT.isVector()) {
      // Vector LDG/LDU

      unsigned NumElts = ResVT.getVectorNumElements();
      EVT EltVT = ResVT.getVectorElementType();

      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization. Therefore, we must ensure the type is legal. For i1 and
      // i8, we set the loaded type to i16 and propagate the "real" type as
      // the memory type.
      bool NeedTrunc = false;
      if (EltVT.getSizeInBits() < 16) {
        EltVT = MVT::i16;
        NeedTrunc = true;
      }

      unsigned Opcode = 0;
      SDVTList LdResVTs;

      switch (NumElts) {
      default:
        return;
      case 2:
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV2;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV2;
          break;
        }
        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
        break;
      case 4: {
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV4;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV4;
          break;
        }
        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
        LdResVTs = DAG.getVTList(ListVTs);
        break;
      }
      }

      SmallVector<SDValue, 8> OtherOps;

      // Copy regular operands
      OtherOps.push_back(Chain); // Chain
      // Skip operand 1 (intrinsic ID)
      OtherOps.append(N->op_begin() + 2, N->op_end());

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                              MemSD->getMemoryVT(),
                                              MemSD->getMemOperand());

      SmallVector<SDValue, 4> ScalarRes;

      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue Res = NewLD.getValue(i);
        if (NeedTrunc)
          Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(),
                            Res);
        ScalarRes.push_back(Res);
      }

      SDValue LoadChain = NewLD.getValue(NumElts);

      SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

      Results.push_back(BuildVec);
      Results.push_back(LoadChain);
    } else {
      // i8 LDG/LDU
      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
             "Custom handling of non-i8 ldu/ldg?");

      // Just copy all operands as-is
      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());

      // Force output to i16
      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      // We make sure the memory type is i8, which will be used during isel
      // to select the proper instruction.
      SDValue NewLD =
          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
                                  MVT::i8, MemSD->getMemOperand());

      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                    NewLD.getValue(0)));
      Results.push_back(NewLD.getValue(1));
    }
  }
  }
}
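
// Illustrative sketch (assuming typical PTX selection): a vector
// @llvm.nvvm.ldg.global.f call returning <4 x float> is expanded into an
// NVPTXISD::LDGV4 node (an ld.global.nc.v4.f32 variant), while a scalar i8
// ldg/ldu result is widened to i16 and truncated back to i8 afterwards.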
void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    report_fatal_error("Unhandled custom legalization");
  case ISD::LOAD:
    ReplaceLoadVector(N, DAG, Results);
    return;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  }
}

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {}

MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
  return getDataSection();
}