//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"

#define DEBUG_TYPE "nvptx-lower"
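
// Counter used to give each lowered call site a unique index; the index shows
// up in the names of the emitted call prototypes and parameter symbols.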
static std::atomic<unsigned> GlobalUniqueCallSite;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));

static cl::opt<unsigned> FMAContractLevelOpt(
    "nvptx-fma-level", cl::Hidden,
    cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
             " 1: do it  2: do it aggressively"));

static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."));

static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."));

static cl::opt<bool> ForceMinByValParamAlign(
    "nvptx-force-min-byval-param-align", cl::Hidden,
    cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
             " params of device functions."));
int NVPTXTargetLowering::getDivF32Level() const {
  if (UsePrecDivF32.getNumOccurrences() > 0) {
    // If nvptx-prec-divf32=N is used on the command-line, always honor it
    return UsePrecDivF32;
  } else {
    // Otherwise, use div.approx if fast math is enabled
    if (getTargetMachine().Options.UnsafeFPMath)
      return 0;
    else
      return 2;
  }
}

bool NVPTXTargetLowering::usePrecSqrtF32() const {
  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
    return UsePrecSqrtF32;
  } else {
    // Otherwise, use sqrt.approx if fast math is enabled
    return !getTargetMachine().Options.UnsafeFPMath;
  }
}

bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
         DenormalMode::PreserveSign;
}
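
// Returns true for the fixed-width vector types that PTX can load/store
// directly as a single vector operation.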
static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v8i16:  // <4 x i16x2>
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v8bf16: // <4 x bf16x2>
    return true;
  }
}

static bool Is16bitsType(MVT VT) {
  return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
          VT.SimpleTy == MVT::i16);
}
/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
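///
/// For example, a struct { i32, i32 } flattens to two MVT::i32 pieces at byte
/// offsets 0 and 4, and a <4 x half> member becomes two MVT::v2f16 pieces.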
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  // Special case for i128 - decompose to (i64, i64)
  if (Ty->isIntegerTy(128)) {
    ValueVTs.push_back(EVT(MVT::i64));
    ValueVTs.push_back(EVT(MVT::i64));
    if (Offsets) {
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
    }
    return;
  }

  // Given a struct type, recursively traverse the elements with custom
  // ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    auto ElementNum = 0;
    for (auto *EI : STy->elements()) {
      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
                         StartingOffset + SL->getElementOffset(ElementNum));
      ++ElementNum;
    }
    return;
  }

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    // Split vectors into individual elements, except for v2f16, which
    // we will pass as a single scalar.
    if (VT.isVector()) {
      unsigned NumElts = VT.getVectorNumElements();
      EVT EltVT = VT.getVectorElementType();
      // Vectors with an even number of f16 elements will be passed to
      // us as an array of v2f16/v2bf16 elements. We must match this so we
      // stay in sync with Ins/Outs.
      if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
        switch (EltVT.getSimpleVT().SimpleTy) {
        case MVT::f16:
          EltVT = MVT::v2f16;
          break;
        case MVT::bf16:
          EltVT = MVT::v2bf16;
          break;
        case MVT::i16:
          EltVT = MVT::v2i16;
          break;
        default:
          llvm_unreachable("Unexpected type");
        }
        NumElts /= 2;
      } else if (EltVT.getSimpleVT() == MVT::i8 &&
                 (NumElts % 4 == 0 || NumElts == 3)) {
        // v*i8 are formally lowered as v4i8
        EltVT = MVT::v4i8;
        NumElts = (NumElts + 3) / 4;
      }
      for (unsigned j = 0; j != NumElts; ++j) {
        ValueVTs.push_back(EltVT);
        if (Offsets)
          Offsets->push_back(Off + j * EltVT.getStoreSize());
      }
    } else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}
/// PromoteScalarIntegerPTX
/// Used to make sure the arguments/returns are suitable for passing
/// and promote them to a larger size if they're not.
///
/// The promoted type is placed in \p PromotedVT if the function returns true.
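///
/// For example, an i12 value is promoted to i16 (PowerOf2Ceil(12) == 16) and
/// the function returns true; an i32 is already suitable and it returns false.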
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
  if (VT.isScalarInteger()) {
    switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
    default:
      llvm_unreachable(
          "Promotion is not suitable for scalars of size larger than 64-bits");
    case 1:
      *PromotedVT = MVT::i1;
      break;
    case 2:
    case 4:
    case 8:
      *PromotedVT = MVT::i8;
      break;
    case 16:
      *PromotedVT = MVT::i16;
      break;
    case 32:
      *PromotedVT = MVT::i32;
      break;
    case 64:
      *PromotedVT = MVT::i64;
      break;
    }
    return EVT(*PromotedVT) != VT;
  }
  return false;
}
// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store operation.
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment.  This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize.  If so, it returns the number of param pieces
// covered by the vector op.  Otherwise, it returns 1.
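//
// For example, four f32 pieces at offsets {0, 4, 8, 12} within a 16-byte
// aligned parameter can be merged with AccessSize == 16, so this returns 4.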
static unsigned CanMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {

  // Can't vectorize if param alignment is not sufficient.
  if (ParamAlignment < AccessSize)
    return 1;
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
    return 1;

  EVT EltVT = ValueVTs[Idx];
  unsigned EltSize = EltVT.getStoreSize();

  // Element is too large to vectorize.
  if (EltSize >= AccessSize)
    return 1;

  unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)
    return 1;

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())
    return 1;

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
  return NumElts;
}
// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0,  // Middle elements of a vector.
  PVF_FIRST = 0x1,  // First element of the vector.
  PVF_LAST = 0x2,   // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};
// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes.  We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector load/store).
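//
// For example, four contiguous f32 pieces in a 16-byte aligned parameter are
// marked {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST}, i.e. one vector access.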
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     Align ParamAlignment, bool IsVAArg = false) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  if (IsVAArg)
    return VectorInfo;

  // Check what we can vectorize using 128/64/32-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}
// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned)0xFFFFFFFF;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned)0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);

  // By default, use the Source scheduling.
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);
  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoF16Action) {
    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
  };
  auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoBF16Action) {
    bool IsOpSupported = STI.hasBF16Math();
    // A few instructions are available on sm_90 only.
    switch (Op) {
    case ISD::FNEARBYINT:
    case ISD::FROUNDEVEN:
      IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
      break;
    }
    setOperationAction(Op, VT, IsOpSupported ? Action : NoBF16Action);
  };
  auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                     LegalizeAction NoI16x2Action) {
    bool IsOpSupported = false;
    // Instructions are available on sm_90 only.
    switch (Op) {
    case ISD::ADD:
    case ISD::SMAX:
    case ISD::SMIN:
    case ISD::UMIN:
    case ISD::UMAX:
      IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
      break;
    }
    setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
  };
  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
  // Conversion to/from FP16/FP16x2 is always legal.
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
    setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);

  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
  // Conversion to/from BFP16/BFP16x2 is always legal.
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand);

  setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
  setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
  if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
    AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
  // Conversion to/from i16/i16x2 is always legal.
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);

  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
  // Only logical ops can be done on v4i8 directly, others must be done
  // elementwise.
  setOperationAction(
      {ISD::ABS,         ISD::ADD,        ISD::ADDC,        ISD::ADDE,
       ISD::BITREVERSE,  ISD::CTLZ,       ISD::CTPOP,       ISD::CTTZ,
       ISD::FP_TO_SINT,  ISD::FP_TO_UINT, ISD::FSHL,        ISD::FSHR,
       ISD::MUL,         ISD::MULHS,      ISD::MULHU,       ISD::PARITY,
       ISD::ROTL,        ISD::ROTR,       ISD::SADDO,       ISD::SADDO_CARRY,
       ISD::SADDSAT,     ISD::SDIV,       ISD::SDIVREM,     ISD::SELECT_CC,
       ISD::SETCC,       ISD::SHL,        ISD::SINT_TO_FP,  ISD::SMAX,
       ISD::SMIN,        ISD::SMULO,      ISD::SMUL_LOHI,   ISD::SRA,
       ISD::SREM,        ISD::SRL,        ISD::SSHLSAT,     ISD::SSUBO,
       ISD::SSUBO_CARRY, ISD::SSUBSAT,    ISD::SUB,         ISD::SUBC,
       ISD::SUBE,        ISD::UADDO,      ISD::UADDO_CARRY, ISD::UADDSAT,
       ISD::UDIV,        ISD::UDIVREM,    ISD::UINT_TO_FP,  ISD::UMAX,
       ISD::UMIN,        ISD::UMULO,      ISD::UMUL_LOHI,   ISD::UREM,
       ISD::USHLSAT,     ISD::USUBO,      ISD::USUBO_CARRY, ISD::VSELECT},
      MVT::v4i8, Expand);
  // Operations not directly supported by NVPTX.
  for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
                 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
                 MVT::i32, MVT::i64}) {
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::BR_CC, VT, Expand);
  }

  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  // For others we will expand to a SHL/SRA pair.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // TODO: we may consider expanding ROTL/ROTR on older GPUs.  Currently on GPUs
  // that don't have h/w rotation we lower them to multi-instruction assembly.
  // See ROT*_sw in NVPTXIntrInfo.td
  setOperationAction(ISD::ROTL, MVT::i64, Legal);
  setOperationAction(ISD::ROTR, MVT::i64, Legal);
  setOperationAction(ISD::ROTL, MVT::i32, Legal);
  setOperationAction(ISD::ROTR, MVT::i32, Legal);

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant related memmove and memcpy intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  // Turn FP extload into load/fpextend
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  // Turn FP truncstore into trunc + store.
  // FIXME: vector types should also be expanded
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  // PTX does not support load / store predicate registers
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // Expand extload of vector of integers.
  setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
                   MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
  // This is legal in NVPTX
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
  setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);

  // TRAP can be lowered to PTX trap
  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  // Register custom handling for vector loads/stores
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Custom handling for i8 intrinsics
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
  for (const auto &Ty : {MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::ABS, Ty, Legal);
    setOperationAction(ISD::SMIN, Ty, Legal);
    setOperationAction(ISD::SMAX, Ty, Legal);
    setOperationAction(ISD::UMIN, Ty, Legal);
    setOperationAction(ISD::UMAX, Ty, Legal);

    setOperationAction(ISD::CTPOP, Ty, Legal);
    setOperationAction(ISD::CTLZ, Ty, Legal);
  }
  setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
  setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);

  setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
  // Other arithmetic and logic ops are unsupported.
  setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
                      ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
                      ISD::SINT_TO_FP, ISD::UINT_TO_FP},
                     MVT::v2i16, Expand);
  setOperationAction(ISD::ADDC, MVT::i32, Legal);
  setOperationAction(ISD::ADDE, MVT::i32, Legal);
  setOperationAction(ISD::SUBC, MVT::i32, Legal);
  setOperationAction(ISD::SUBE, MVT::i32, Legal);
  if (STI.getPTXVersion() >= 43) {
    setOperationAction(ISD::ADDC, MVT::i64, Legal);
    setOperationAction(ISD::ADDE, MVT::i64, Legal);
    setOperationAction(ISD::SUBC, MVT::i64, Legal);
    setOperationAction(ISD::SUBE, MVT::i64, Legal);
  }
  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);

  // PTX does not directly support SELP of i1, so promote to i32 first
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // PTX cannot multiply two i64s in a single instruction.
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  // We have some custom DAG combine patterns for these nodes
  setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
                       ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM});

  // setcc for f16x2 and bf16x2 needs special handling to prevent
  // legalizer's attempt to scalarize it due to v2i1 not being legal.
  if (STI.allowFP16Math() || STI.hasBF16Math())
    setTargetDAGCombine(ISD::SETCC);
  // Promote fp16 arithmetic if fp16 hardware isn't available or the
  // user passed --nvptx-no-fp16-math. The flag is useful because,
  // although sm_53+ GPUs have some sort of FP16 support in
  // hardware, only sm_53 and sm_60 have full implementation. Others
  // only have a token amount of hardware and are likely to run faster
  // by using fp32 units instead.
  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
    // bf16 must be promoted to f32.
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }
  // f16/f16x2 neg was introduced in PTX 60, SM_53.
  const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
                                        STI.getPTXVersion() >= 60 &&
                                        STI.allowFP16Math();
  for (const auto &VT : {MVT::f16, MVT::v2f16})
    setOperationAction(ISD::FNEG, VT,
                       IsFP16FP16x2NegAvailable ? Legal : Expand);

  setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
  setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
  // (would be) Library functions.

  // These map to conversion instructions for scalar FP types.
  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
                         ISD::FROUNDEVEN, ISD::FTRUNC}) {
    setOperationAction(Op, MVT::f16, Legal);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
    setOperationAction(Op, MVT::v2bf16, Expand);
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }
  if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
    setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
  }
  if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
    for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
      setOperationAction(ISD::FP_EXTEND, VT, Custom);
      setOperationAction(ISD::FP_ROUND, VT, Custom);
    }
  }

  // sm_80 only has conversions between f32 and bf16. Custom lower all other
  // bf16 conversions.
  if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
    for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
      setOperationAction(
          {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
          VT, Custom);
    }
    setOperationAction(
        {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
        MVT::bf16, Custom);
  }
  setOperationAction(ISD::FROUND, MVT::f16, Promote);
  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
  setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, MVT::f64, Custom);
  setOperationAction(ISD::FROUND, MVT::bf16, Promote);
  AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
  // 'Expand' implements FCOPYSIGN without calling an external library.
  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  // These map to corresponding instructions for f32/f64. f16 must be
  // promoted to f32. v2f16 is expanded to f16, which is then promoted
  // to f32.
  for (const auto &Op :
       {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
    setOperationAction(Op, MVT::v2bf16, Expand);
    setOperationAction(Op, MVT::bf16, Promote);
    AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }
  for (const auto &Op : {ISD::FABS}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }
  // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
  auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
    bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
    return IsAtLeastSm80 ? Legal : NotSm80Action;
  };
  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }
  for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
    setFP16OperationAction(Op, MVT::bf16, Legal, Expand);
    setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
  }
  // Custom lowering for inline asm with 128-bit operands
  setOperationAction(ISD::CopyToReg, MVT::i128, Custom);
  setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);

  // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
  // No FPOW or FREM in PTX.

  // Now deduce the information based on the above mentioned
  // actions
  computeRegisterProperties(STI.getRegisterInfo());

  setMinCmpXchgSizeInBits(32);
  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
}
const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {

#define MAKE_CASE(V)                                                           \
  case V:                                                                      \
    return #V;

  switch ((NVPTXISD::NodeType)Opcode) {
  case NVPTXISD::FIRST_NUMBER:
    break;
    MAKE_CASE(NVPTXISD::CALL)
    MAKE_CASE(NVPTXISD::RET_GLUE)
    MAKE_CASE(NVPTXISD::LOAD_PARAM)
    MAKE_CASE(NVPTXISD::Wrapper)
    MAKE_CASE(NVPTXISD::DeclareParam)
    MAKE_CASE(NVPTXISD::DeclareScalarParam)
    MAKE_CASE(NVPTXISD::DeclareRet)
    MAKE_CASE(NVPTXISD::DeclareScalarRet)
    MAKE_CASE(NVPTXISD::DeclareRetParam)
    MAKE_CASE(NVPTXISD::PrintCall)
    MAKE_CASE(NVPTXISD::PrintConvergentCall)
    MAKE_CASE(NVPTXISD::PrintCallUni)
    MAKE_CASE(NVPTXISD::PrintConvergentCallUni)
    MAKE_CASE(NVPTXISD::LoadParam)
    MAKE_CASE(NVPTXISD::LoadParamV2)
    MAKE_CASE(NVPTXISD::LoadParamV4)
    MAKE_CASE(NVPTXISD::StoreParam)
    MAKE_CASE(NVPTXISD::StoreParamV2)
    MAKE_CASE(NVPTXISD::StoreParamV4)
    MAKE_CASE(NVPTXISD::StoreParamS32)
    MAKE_CASE(NVPTXISD::StoreParamU32)
    MAKE_CASE(NVPTXISD::CallArgBegin)
    MAKE_CASE(NVPTXISD::CallArg)
    MAKE_CASE(NVPTXISD::LastCallArg)
    MAKE_CASE(NVPTXISD::CallArgEnd)
    MAKE_CASE(NVPTXISD::CallVoid)
    MAKE_CASE(NVPTXISD::CallVal)
    MAKE_CASE(NVPTXISD::CallSymbol)
    MAKE_CASE(NVPTXISD::Prototype)
    MAKE_CASE(NVPTXISD::MoveParam)
    MAKE_CASE(NVPTXISD::StoreRetval)
    MAKE_CASE(NVPTXISD::StoreRetvalV2)
    MAKE_CASE(NVPTXISD::StoreRetvalV4)
    MAKE_CASE(NVPTXISD::PseudoUseParam)
    MAKE_CASE(NVPTXISD::RETURN)
    MAKE_CASE(NVPTXISD::CallSeqBegin)
    MAKE_CASE(NVPTXISD::CallSeqEnd)
    MAKE_CASE(NVPTXISD::CallPrototype)
    MAKE_CASE(NVPTXISD::ProxyReg)
    MAKE_CASE(NVPTXISD::LoadV2)
    MAKE_CASE(NVPTXISD::LoadV4)
    MAKE_CASE(NVPTXISD::LDGV2)
    MAKE_CASE(NVPTXISD::LDGV4)
    MAKE_CASE(NVPTXISD::LDUV2)
    MAKE_CASE(NVPTXISD::LDUV4)
    MAKE_CASE(NVPTXISD::StoreV2)
    MAKE_CASE(NVPTXISD::StoreV4)
    MAKE_CASE(NVPTXISD::FUN_SHFL_CLAMP)
    MAKE_CASE(NVPTXISD::FUN_SHFR_CLAMP)
    MAKE_CASE(NVPTXISD::IMAD)
    MAKE_CASE(NVPTXISD::BFE)
    MAKE_CASE(NVPTXISD::BFI)
    MAKE_CASE(NVPTXISD::PRMT)
    MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
    MAKE_CASE(NVPTXISD::SETP_F16X2)
    MAKE_CASE(NVPTXISD::SETP_BF16X2)
    MAKE_CASE(NVPTXISD::Dummy)
    MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED)
    MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED)
    MAKE_CASE(NVPTXISD::Tex1DFloatS32)
    MAKE_CASE(NVPTXISD::Tex1DFloatFloat)
    MAKE_CASE(NVPTXISD::Tex1DFloatFloatLevel)
    MAKE_CASE(NVPTXISD::Tex1DFloatFloatGrad)
    MAKE_CASE(NVPTXISD::Tex1DS32S32)
    MAKE_CASE(NVPTXISD::Tex1DS32Float)
    MAKE_CASE(NVPTXISD::Tex1DS32FloatLevel)
    MAKE_CASE(NVPTXISD::Tex1DS32FloatGrad)
    MAKE_CASE(NVPTXISD::Tex1DU32S32)
    MAKE_CASE(NVPTXISD::Tex1DU32Float)
    MAKE_CASE(NVPTXISD::Tex1DU32FloatLevel)
    MAKE_CASE(NVPTXISD::Tex1DU32FloatGrad)
    MAKE_CASE(NVPTXISD::Tex1DArrayFloatS32)
    MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloat)
    MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatLevel)
    MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatGrad)
    MAKE_CASE(NVPTXISD::Tex1DArrayS32S32)
    MAKE_CASE(NVPTXISD::Tex1DArrayS32Float)
    MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatLevel)
    MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatGrad)
    MAKE_CASE(NVPTXISD::Tex1DArrayU32S32)
    MAKE_CASE(NVPTXISD::Tex1DArrayU32Float)
    MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatLevel)
    MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatGrad)
    MAKE_CASE(NVPTXISD::Tex2DFloatS32)
    MAKE_CASE(NVPTXISD::Tex2DFloatFloat)
    MAKE_CASE(NVPTXISD::Tex2DFloatFloatLevel)
    MAKE_CASE(NVPTXISD::Tex2DFloatFloatGrad)
    MAKE_CASE(NVPTXISD::Tex2DS32S32)
    MAKE_CASE(NVPTXISD::Tex2DS32Float)
    MAKE_CASE(NVPTXISD::Tex2DS32FloatLevel)
    MAKE_CASE(NVPTXISD::Tex2DS32FloatGrad)
    MAKE_CASE(NVPTXISD::Tex2DU32S32)
    MAKE_CASE(NVPTXISD::Tex2DU32Float)
    MAKE_CASE(NVPTXISD::Tex2DU32FloatLevel)
    MAKE_CASE(NVPTXISD::Tex2DU32FloatGrad)
    MAKE_CASE(NVPTXISD::Tex2DArrayFloatS32)
    MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloat)
    MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatLevel)
    MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatGrad)
    MAKE_CASE(NVPTXISD::Tex2DArrayS32S32)
    MAKE_CASE(NVPTXISD::Tex2DArrayS32Float)
    MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatLevel)
    MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatGrad)
    MAKE_CASE(NVPTXISD::Tex2DArrayU32S32)
    MAKE_CASE(NVPTXISD::Tex2DArrayU32Float)
    MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatLevel)
    MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatGrad)
    MAKE_CASE(NVPTXISD::Tex3DFloatS32)
    MAKE_CASE(NVPTXISD::Tex3DFloatFloat)
    MAKE_CASE(NVPTXISD::Tex3DFloatFloatLevel)
    MAKE_CASE(NVPTXISD::Tex3DFloatFloatGrad)
    MAKE_CASE(NVPTXISD::Tex3DS32S32)
    MAKE_CASE(NVPTXISD::Tex3DS32Float)
    MAKE_CASE(NVPTXISD::Tex3DS32FloatLevel)
    MAKE_CASE(NVPTXISD::Tex3DS32FloatGrad)
    MAKE_CASE(NVPTXISD::Tex3DU32S32)
    MAKE_CASE(NVPTXISD::Tex3DU32Float)
    MAKE_CASE(NVPTXISD::Tex3DU32FloatLevel)
    MAKE_CASE(NVPTXISD::Tex3DU32FloatGrad)
    MAKE_CASE(NVPTXISD::TexCubeFloatFloat)
    MAKE_CASE(NVPTXISD::TexCubeFloatFloatLevel)
    MAKE_CASE(NVPTXISD::TexCubeS32Float)
    MAKE_CASE(NVPTXISD::TexCubeS32FloatLevel)
    MAKE_CASE(NVPTXISD::TexCubeU32Float)
    MAKE_CASE(NVPTXISD::TexCubeU32FloatLevel)
    MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloat)
    MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloatLevel)
    MAKE_CASE(NVPTXISD::TexCubeArrayS32Float)
    MAKE_CASE(NVPTXISD::TexCubeArrayS32FloatLevel)
    MAKE_CASE(NVPTXISD::TexCubeArrayU32Float)
    MAKE_CASE(NVPTXISD::TexCubeArrayU32FloatLevel)
    MAKE_CASE(NVPTXISD::Tld4R2DFloatFloat)
    MAKE_CASE(NVPTXISD::Tld4G2DFloatFloat)
    MAKE_CASE(NVPTXISD::Tld4B2DFloatFloat)
    MAKE_CASE(NVPTXISD::Tld4A2DFloatFloat)
    MAKE_CASE(NVPTXISD::Tld4R2DS64Float)
    MAKE_CASE(NVPTXISD::Tld4G2DS64Float)
    MAKE_CASE(NVPTXISD::Tld4B2DS64Float)
    MAKE_CASE(NVPTXISD::Tld4A2DS64Float)
    MAKE_CASE(NVPTXISD::Tld4R2DU64Float)
    MAKE_CASE(NVPTXISD::Tld4G2DU64Float)
    MAKE_CASE(NVPTXISD::Tld4B2DU64Float)
    MAKE_CASE(NVPTXISD::Tld4A2DU64Float)
    MAKE_CASE(NVPTXISD::TexUnified1DFloatS32)
    MAKE_CASE(NVPTXISD::TexUnified1DFloatFloat)
    MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatGrad)
    MAKE_CASE(NVPTXISD::TexUnified1DS32S32)
    MAKE_CASE(NVPTXISD::TexUnified1DS32Float)
    MAKE_CASE(NVPTXISD::TexUnified1DS32FloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified1DS32FloatGrad)
    MAKE_CASE(NVPTXISD::TexUnified1DU32S32)
    MAKE_CASE(NVPTXISD::TexUnified1DU32Float)
    MAKE_CASE(NVPTXISD::TexUnified1DU32FloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified1DU32FloatGrad)
    MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatS32)
    MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloat)
    MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatGrad)
    MAKE_CASE(NVPTXISD::TexUnified1DArrayS32S32)
    MAKE_CASE(NVPTXISD::TexUnified1DArrayS32Float)
    MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatGrad)
    MAKE_CASE(NVPTXISD::TexUnified1DArrayU32S32)
    MAKE_CASE(NVPTXISD::TexUnified1DArrayU32Float)
    MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatGrad)
    MAKE_CASE(NVPTXISD::TexUnified2DFloatS32)
    MAKE_CASE(NVPTXISD::TexUnified2DFloatFloat)
    MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatGrad)
    MAKE_CASE(NVPTXISD::TexUnified2DS32S32)
    MAKE_CASE(NVPTXISD::TexUnified2DS32Float)
    MAKE_CASE(NVPTXISD::TexUnified2DS32FloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified2DS32FloatGrad)
    MAKE_CASE(NVPTXISD::TexUnified2DU32S32)
    MAKE_CASE(NVPTXISD::TexUnified2DU32Float)
    MAKE_CASE(NVPTXISD::TexUnified2DU32FloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified2DU32FloatGrad)
    MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatS32)
    MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloat)
    MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatGrad)
    MAKE_CASE(NVPTXISD::TexUnified2DArrayS32S32)
    MAKE_CASE(NVPTXISD::TexUnified2DArrayS32Float)
    MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatGrad)
    MAKE_CASE(NVPTXISD::TexUnified2DArrayU32S32)
    MAKE_CASE(NVPTXISD::TexUnified2DArrayU32Float)
    MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatGrad)
    MAKE_CASE(NVPTXISD::TexUnified3DFloatS32)
    MAKE_CASE(NVPTXISD::TexUnified3DFloatFloat)
    MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatGrad)
    MAKE_CASE(NVPTXISD::TexUnified3DS32S32)
    MAKE_CASE(NVPTXISD::TexUnified3DS32Float)
    MAKE_CASE(NVPTXISD::TexUnified3DS32FloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified3DS32FloatGrad)
    MAKE_CASE(NVPTXISD::TexUnified3DU32S32)
    MAKE_CASE(NVPTXISD::TexUnified3DU32Float)
    MAKE_CASE(NVPTXISD::TexUnified3DU32FloatLevel)
    MAKE_CASE(NVPTXISD::TexUnified3DU32FloatGrad)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloat)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatLevel)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeS32Float)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatLevel)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeU32Float)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatLevel)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloat)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32Float)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatLevel)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32Float)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatLevel)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatGrad)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatGrad)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatGrad)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatGrad)
    MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatGrad)
    MAKE_CASE(NVPTXISD::Tld4UnifiedR2DFloatFloat)
    MAKE_CASE(NVPTXISD::Tld4UnifiedG2DFloatFloat)
    MAKE_CASE(NVPTXISD::Tld4UnifiedB2DFloatFloat)
    MAKE_CASE(NVPTXISD::Tld4UnifiedA2DFloatFloat)
    MAKE_CASE(NVPTXISD::Tld4UnifiedR2DS64Float)
    MAKE_CASE(NVPTXISD::Tld4UnifiedG2DS64Float)
    MAKE_CASE(NVPTXISD::Tld4UnifiedB2DS64Float)
    MAKE_CASE(NVPTXISD::Tld4UnifiedA2DS64Float)
    MAKE_CASE(NVPTXISD::Tld4UnifiedR2DU64Float)
    MAKE_CASE(NVPTXISD::Tld4UnifiedG2DU64Float)
    MAKE_CASE(NVPTXISD::Tld4UnifiedB2DU64Float)
    MAKE_CASE(NVPTXISD::Tld4UnifiedA2DU64Float)
    MAKE_CASE(NVPTXISD::Suld1DI8Clamp)
    MAKE_CASE(NVPTXISD::Suld1DI16Clamp)
    MAKE_CASE(NVPTXISD::Suld1DI32Clamp)
    MAKE_CASE(NVPTXISD::Suld1DI64Clamp)
    MAKE_CASE(NVPTXISD::Suld1DV2I8Clamp)
    MAKE_CASE(NVPTXISD::Suld1DV2I16Clamp)
    MAKE_CASE(NVPTXISD::Suld1DV2I32Clamp)
    MAKE_CASE(NVPTXISD::Suld1DV2I64Clamp)
    MAKE_CASE(NVPTXISD::Suld1DV4I8Clamp)
    MAKE_CASE(NVPTXISD::Suld1DV4I16Clamp)
    MAKE_CASE(NVPTXISD::Suld1DV4I32Clamp)

    MAKE_CASE(NVPTXISD::Suld1DArrayI8Clamp)
    MAKE_CASE(NVPTXISD::Suld1DArrayI16Clamp)
    MAKE_CASE(NVPTXISD::Suld1DArrayI32Clamp)
    MAKE_CASE(NVPTXISD::Suld1DArrayI64Clamp)
    MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Clamp)
    MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Clamp)
    MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Clamp)
    MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Clamp)
    MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Clamp)
    MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Clamp)
    MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Clamp)

    MAKE_CASE(NVPTXISD::Suld2DI8Clamp)
    MAKE_CASE(NVPTXISD::Suld2DI16Clamp)
    MAKE_CASE(NVPTXISD::Suld2DI32Clamp)
    MAKE_CASE(NVPTXISD::Suld2DI64Clamp)
    MAKE_CASE(NVPTXISD::Suld2DV2I8Clamp)
    MAKE_CASE(NVPTXISD::Suld2DV2I16Clamp)
    MAKE_CASE(NVPTXISD::Suld2DV2I32Clamp)
    MAKE_CASE(NVPTXISD::Suld2DV2I64Clamp)
    MAKE_CASE(NVPTXISD::Suld2DV4I8Clamp)
    MAKE_CASE(NVPTXISD::Suld2DV4I16Clamp)
    MAKE_CASE(NVPTXISD::Suld2DV4I32Clamp)

    MAKE_CASE(NVPTXISD::Suld2DArrayI8Clamp)
    MAKE_CASE(NVPTXISD::Suld2DArrayI16Clamp)
    MAKE_CASE(NVPTXISD::Suld2DArrayI32Clamp)
    MAKE_CASE(NVPTXISD::Suld2DArrayI64Clamp)
    MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Clamp)
    MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Clamp)
    MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Clamp)
    MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Clamp)
    MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Clamp)
    MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Clamp)
    MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Clamp)

    MAKE_CASE(NVPTXISD::Suld3DI8Clamp)
    MAKE_CASE(NVPTXISD::Suld3DI16Clamp)
    MAKE_CASE(NVPTXISD::Suld3DI32Clamp)
    MAKE_CASE(NVPTXISD::Suld3DI64Clamp)
    MAKE_CASE(NVPTXISD::Suld3DV2I8Clamp)
    MAKE_CASE(NVPTXISD::Suld3DV2I16Clamp)
    MAKE_CASE(NVPTXISD::Suld3DV2I32Clamp)
    MAKE_CASE(NVPTXISD::Suld3DV2I64Clamp)
    MAKE_CASE(NVPTXISD::Suld3DV4I8Clamp)
    MAKE_CASE(NVPTXISD::Suld3DV4I16Clamp)
    MAKE_CASE(NVPTXISD::Suld3DV4I32Clamp)

    MAKE_CASE(NVPTXISD::Suld1DI8Trap)
    MAKE_CASE(NVPTXISD::Suld1DI16Trap)
    MAKE_CASE(NVPTXISD::Suld1DI32Trap)
    MAKE_CASE(NVPTXISD::Suld1DI64Trap)
    MAKE_CASE(NVPTXISD::Suld1DV2I8Trap)
    MAKE_CASE(NVPTXISD::Suld1DV2I16Trap)
    MAKE_CASE(NVPTXISD::Suld1DV2I32Trap)
    MAKE_CASE(NVPTXISD::Suld1DV2I64Trap)
    MAKE_CASE(NVPTXISD::Suld1DV4I8Trap)
    MAKE_CASE(NVPTXISD::Suld1DV4I16Trap)
    MAKE_CASE(NVPTXISD::Suld1DV4I32Trap)

    MAKE_CASE(NVPTXISD::Suld1DArrayI8Trap)
    MAKE_CASE(NVPTXISD::Suld1DArrayI16Trap)
    MAKE_CASE(NVPTXISD::Suld1DArrayI32Trap)
    MAKE_CASE(NVPTXISD::Suld1DArrayI64Trap)
    MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Trap)
    MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Trap)
    MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Trap)
    MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Trap)
    MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Trap)
    MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Trap)
    MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Trap)

    MAKE_CASE(NVPTXISD::Suld2DI8Trap)
    MAKE_CASE(NVPTXISD::Suld2DI16Trap)
    MAKE_CASE(NVPTXISD::Suld2DI32Trap)
    MAKE_CASE(NVPTXISD::Suld2DI64Trap)
    MAKE_CASE(NVPTXISD::Suld2DV2I8Trap)
    MAKE_CASE(NVPTXISD::Suld2DV2I16Trap)
    MAKE_CASE(NVPTXISD::Suld2DV2I32Trap)
    MAKE_CASE(NVPTXISD::Suld2DV2I64Trap)
    MAKE_CASE(NVPTXISD::Suld2DV4I8Trap)
    MAKE_CASE(NVPTXISD::Suld2DV4I16Trap)
    MAKE_CASE(NVPTXISD::Suld2DV4I32Trap)

    MAKE_CASE(NVPTXISD::Suld2DArrayI8Trap)
    MAKE_CASE(NVPTXISD::Suld2DArrayI16Trap)
    MAKE_CASE(NVPTXISD::Suld2DArrayI32Trap)
    MAKE_CASE(NVPTXISD::Suld2DArrayI64Trap)
    MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Trap)
    MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Trap)
    MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Trap)
    MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Trap)
    MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Trap)
    MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Trap)
    MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Trap)

    MAKE_CASE(NVPTXISD::Suld3DI8Trap)
    MAKE_CASE(NVPTXISD::Suld3DI16Trap)
    MAKE_CASE(NVPTXISD::Suld3DI32Trap)
    MAKE_CASE(NVPTXISD::Suld3DI64Trap)
    MAKE_CASE(NVPTXISD::Suld3DV2I8Trap)
    MAKE_CASE(NVPTXISD::Suld3DV2I16Trap)
    MAKE_CASE(NVPTXISD::Suld3DV2I32Trap)
    MAKE_CASE(NVPTXISD::Suld3DV2I64Trap)
    MAKE_CASE(NVPTXISD::Suld3DV4I8Trap)
    MAKE_CASE(NVPTXISD::Suld3DV4I16Trap)
    MAKE_CASE(NVPTXISD::Suld3DV4I32Trap)

    MAKE_CASE(NVPTXISD::Suld1DI8Zero)
    MAKE_CASE(NVPTXISD::Suld1DI16Zero)
    MAKE_CASE(NVPTXISD::Suld1DI32Zero)
    MAKE_CASE(NVPTXISD::Suld1DI64Zero)
    MAKE_CASE(NVPTXISD::Suld1DV2I8Zero)
    MAKE_CASE(NVPTXISD::Suld1DV2I16Zero)
    MAKE_CASE(NVPTXISD::Suld1DV2I32Zero)
    MAKE_CASE(NVPTXISD::Suld1DV2I64Zero)
    MAKE_CASE(NVPTXISD::Suld1DV4I8Zero)
    MAKE_CASE(NVPTXISD::Suld1DV4I16Zero)
    MAKE_CASE(NVPTXISD::Suld1DV4I32Zero)

    MAKE_CASE(NVPTXISD::Suld1DArrayI8Zero)
    MAKE_CASE(NVPTXISD::Suld1DArrayI16Zero)
    MAKE_CASE(NVPTXISD::Suld1DArrayI32Zero)
    MAKE_CASE(NVPTXISD::Suld1DArrayI64Zero)
    MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Zero)
    MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Zero)
    MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Zero)
    MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Zero)
    MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Zero)
    MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Zero)
    MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Zero)

    MAKE_CASE(NVPTXISD::Suld2DI8Zero)
    MAKE_CASE(NVPTXISD::Suld2DI16Zero)
    MAKE_CASE(NVPTXISD::Suld2DI32Zero)
    MAKE_CASE(NVPTXISD::Suld2DI64Zero)
    MAKE_CASE(NVPTXISD::Suld2DV2I8Zero)
    MAKE_CASE(NVPTXISD::Suld2DV2I16Zero)
    MAKE_CASE(NVPTXISD::Suld2DV2I32Zero)
    MAKE_CASE(NVPTXISD::Suld2DV2I64Zero)
    MAKE_CASE(NVPTXISD::Suld2DV4I8Zero)
    MAKE_CASE(NVPTXISD::Suld2DV4I16Zero)
    MAKE_CASE(NVPTXISD::Suld2DV4I32Zero)

    MAKE_CASE(NVPTXISD::Suld2DArrayI8Zero)
    MAKE_CASE(NVPTXISD::Suld2DArrayI16Zero)
    MAKE_CASE(NVPTXISD::Suld2DArrayI32Zero)
    MAKE_CASE(NVPTXISD::Suld2DArrayI64Zero)
    MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Zero)
    MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Zero)
    MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Zero)
    MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Zero)
    MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Zero)
    MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Zero)
    MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Zero)

    MAKE_CASE(NVPTXISD::Suld3DI8Zero)
    MAKE_CASE(NVPTXISD::Suld3DI16Zero)
    MAKE_CASE(NVPTXISD::Suld3DI32Zero)
    MAKE_CASE(NVPTXISD::Suld3DI64Zero)
    MAKE_CASE(NVPTXISD::Suld3DV2I8Zero)
    MAKE_CASE(NVPTXISD::Suld3DV2I16Zero)
    MAKE_CASE(NVPTXISD::Suld3DV2I32Zero)
    MAKE_CASE(NVPTXISD::Suld3DV2I64Zero)
    MAKE_CASE(NVPTXISD::Suld3DV4I8Zero)
    MAKE_CASE(NVPTXISD::Suld3DV4I16Zero)
    MAKE_CASE(NVPTXISD::Suld3DV4I32Zero)
  }
  return nullptr;
}
TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
      VT.getScalarType() == MVT::i1)
    return TypeSplitVector;
  return TargetLoweringBase::getPreferredVectorAction(VT);
}
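
// Emit an approximate square-root (or reciprocal square-root) sequence using
// the NVVM approx intrinsics when a precise f32 sqrt is not required.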
SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                             int Enabled, int &ExtraSteps,
                                             bool &UseOneConst,
                                             bool Reciprocal) const {
  if (!(Enabled == ReciprocalEstimate::Enabled ||
        (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
    return SDValue();

  if (ExtraSteps == ReciprocalEstimate::Unspecified)
    ExtraSteps = 0;

  SDLoc DL(Operand);
  EVT VT = Operand.getValueType();
  bool Ftz = useF32FTZ(DAG.getMachineFunction());

  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(IID, DL, MVT::i32), Operand);
  };

  // The sqrt and rsqrt refinement processes assume we always start out with an
  // approximation of the rsqrt.  Therefore, if we're going to do any refinement
  // (i.e. ExtraSteps > 0), we must return an rsqrt.  But if we're *not* doing
  // any refinement, we must return a regular sqrt.
  if (Reciprocal || ExtraSteps > 0) {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
                                   : Intrinsic::nvvm_rsqrt_approx_f);
    else if (VT == MVT::f64)
      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
    else
      return SDValue();
  } else {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                   : Intrinsic::nvvm_sqrt_approx_f);
    else {
      // There's no sqrt.approx.f64 instruction, so we emit
      // reciprocal(rsqrt(x)).  This is faster than
      // select(x == 0, 0, x * rsqrt(x)).  (In fact, it's faster than plain
      // x * rsqrt(x).)
      return DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, VT,
          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
    }
  }
}
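
// Lower a GlobalAddress node to a target global address wrapped in an
// NVPTXISD::Wrapper node.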
SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}
static bool IsTypePassedAsArray(const Type *Ty) {
  return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
         Ty->isHalfTy() || Ty->isBFloatTy();
}
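
// Builds the ".callprototype" declaration that PTX requires for indirect
// calls, e.g. something of the form
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _);
// (illustrative only; the exact string depends on the callee's signature).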
std::string NVPTXTargetLowering::getPrototype(
    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
    const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
    std::optional<std::pair<unsigned, const APInt &>> VAInfo,
    const CallBase &CB, unsigned UniqueCallSite) const {
  auto PtrVT = getPointerTy(DL);

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return "";

  std::string Prototype;
  raw_string_ostream O(Prototype);
  O << "prototype_" << UniqueCallSite << " : .callprototype ";

  if (retTy->getTypeID() == Type::VoidTyID) {
    O << "()";
  } else {
    O << "(";
    if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
        !IsTypePassedAsArray(retTy)) {
      unsigned size = 0;
      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
        size = ITy->getBitWidth();
      } else {
        assert(retTy->isFloatingPointTy() &&
               "Floating point type expected here");
        size = retTy->getPrimitiveSizeInBits();
      }
      // PTX ABI requires all scalar return values to be at least 32
      // bits in size.  fp16 normally uses .b16 as its storage type in
      // PTX, so its size must be adjusted here, too.
      size = promoteScalarArgumentSize(size);

      O << ".param .b" << size << " _";
    } else if (isa<PointerType>(retTy)) {
      O << ".param .b" << PtrVT.getSizeInBits() << " _";
    } else if (IsTypePassedAsArray(retTy)) {
      O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
        << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
    } else {
      llvm_unreachable("Unknown return type");
    }
    O << ") ";
  }
  O << "_ (";

  bool first = true;

  unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
  for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
    Type *Ty = Args[i].Ty;
    if (!first) {
      O << ", ";
    }
    first = false;

    if (!Outs[OIdx].Flags.isByVal()) {
      if (IsTypePassedAsArray(Ty)) {
        Align ParamAlign =
            getArgumentAlignment(&CB, Ty, i + AttributeList::FirstArgIndex, DL);
        O << ".param .align " << ParamAlign.value() << " .b8 ";
        O << "_";
        O << "[" << DL.getTypeAllocSize(Ty) << "]";
        // update the index for Outs
        SmallVector<EVT, 16> vtparts;
        ComputeValueVTs(*this, DL, Ty, vtparts);
        if (unsigned len = vtparts.size())
          OIdx += len - 1;
        continue;
      }
      // i8 types in IR will be i16 types in SDAG
      assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
              (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
             "type mismatch between callee prototype and arguments");
      unsigned sz = 0;
      if (isa<IntegerType>(Ty)) {
        sz = cast<IntegerType>(Ty)->getBitWidth();
        sz = promoteScalarArgumentSize(sz);
      } else if (isa<PointerType>(Ty)) {
        sz = PtrVT.getSizeInBits();
      } else {
        sz = Ty->getPrimitiveSizeInBits();
      }
      O << ".param .b" << sz << " ";
      O << "_";
      continue;
    }

    // Indirect calls need strict ABI alignment so we disable optimizations by
    // not providing a function to optimize.
    Type *ETy = Args[i].IndirectType;
    Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
    Align ParamByValAlign =
        getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);

    O << ".param .align " << ParamByValAlign.value() << " .b8 ";
    O << "_";
    O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
  }

  if (VAInfo)
    O << (first ? "" : ",") << " .param .align " << VAInfo->second
      << " .b8 _[]\n";
  O << ")";
  if (shouldEmitPTXNoReturn(&CB, *nvTM))
    O << " .noreturn";
  O << ";";

  return Prototype;
}
Align NVPTXTargetLowering::getFunctionArgumentAlignment(
    const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
  return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
}
Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
                                                unsigned Idx,
                                                const DataLayout &DL) const {
  if (!CB) {
    // CallSite is zero, fallback to ABI type alignment
    return DL.getABITypeAlign(Ty);
  }

  const Function *DirectCallee = CB->getCalledFunction();

  if (!DirectCallee) {
    // We don't have a direct function symbol, but that may be because of
    // constant cast instructions in the call.
    //
    // With bitcast'd call targets, the instruction will be the call
    if (const auto *CI = dyn_cast<CallInst>(CB)) {
      // Check if we have call alignment metadata
      if (MaybeAlign StackAlign = getAlign(*CI, Idx))
        return StackAlign.value();
    }
    DirectCallee = getMaybeBitcastedCallee(CB);
  }

  // Check for function alignment information if we found that the
  // ultimate target is a Function
  if (DirectCallee)
    return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);

  // Call is indirect, fall back to the ABI type alignment
  return DL.getABITypeAlign(Ty);
}
static bool adjustElementType(EVT &ElementType) {
  switch (ElementType.getSimpleVT().SimpleTy) {
  case MVT::f16:
  case MVT::bf16:
    ElementType = MVT::i16;
    return true;
  case MVT::f32:
  case MVT::v2f16:
  case MVT::v2bf16:
    ElementType = MVT::i32;
    return true;
  case MVT::f64:
    ElementType = MVT::i64;
    return true;
  default:
    return false;
  }
}
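// Illustrative note: the mapping above lets the byte-wise helpers below run
// SRL/AND/SHL/OR bit logic on an integer of the same width, e.g. an f32 value
// is bitcast to i32 before it is split into (or reassembled from) four bytes.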
// Use byte-store when the param address of the argument value is unaligned.
// This may happen when the return value is a field of a packed structure.
//
// This is called in LowerCall() when passing the param values.
static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
                                        uint64_t Offset, EVT ElementType,
                                        SDValue StVal, SDValue &InGlue,
                                        unsigned ArgID, const SDLoc &dl) {
  // Bit logic only works on integer types
  if (adjustElementType(ElementType))
    StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);

  SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
    // Shift the byte to the last byte position
    SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
                                   DAG.getConstant(i * 8, dl, MVT::i32));
    SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
                               DAG.getConstant(Offset + i, dl, MVT::i32),
                               ShiftVal, InGlue};
    // Trunc store only the last byte by using
    //     st.param.b8
    // The register type can be larger than b8.
    Chain = DAG.getMemIntrinsicNode(
        NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
        MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
    InGlue = Chain.getValue(1);
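  // Illustrative PTX for one 4-byte element stored this way (register names
  // and offsets are made up for the example):
  //   st.param.b8 [param2+0], %rs1;
  //   st.param.b8 [param2+1], %rs2;
  //   st.param.b8 [param2+2], %rs3;
  //   st.param.b8 [param2+3], %rs4;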
// Use byte-load when the param address of the returned value is unaligned.
// This may happen when the returned value is a field of a packed structure.
static SDValue
LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
                           EVT ElementType, SDValue &InGlue,
                           SmallVectorImpl<SDValue> &TempProxyRegOps,
                           const SDLoc &dl) {
1591 // Bit logic only works on integer types
1592 EVT MergedType
= ElementType
;
1593 adjustElementType(MergedType
);
1595 // Load each byte and construct the whole value. Initial value to 0
1596 SDValue RetVal
= DAG
.getConstant(0, dl
, MergedType
);
1597 // LoadParamMemI8 loads into i16 register only
1598 SDVTList LoadVTs
= DAG
.getVTList(MVT::i16
, MVT::Other
, MVT::Glue
);
1599 for (unsigned i
= 0, n
= ElementType
.getSizeInBits() / 8; i
< n
; i
++) {
1600 SDValue LoadOperands
[] = {Chain
, DAG
.getConstant(1, dl
, MVT::i32
),
1601 DAG
.getConstant(Offset
+ i
, dl
, MVT::i32
),
1603 // This will be selected to LoadParamMemI8
1605 DAG
.getMemIntrinsicNode(NVPTXISD::LoadParam
, dl
, LoadVTs
, LoadOperands
,
1606 MVT::i8
, MachinePointerInfo(), Align(1));
1607 SDValue TmpLdVal
= LdVal
.getValue(0);
1608 Chain
= LdVal
.getValue(1);
1609 InGlue
= LdVal
.getValue(2);
1611 TmpLdVal
= DAG
.getNode(NVPTXISD::ProxyReg
, dl
,
1612 TmpLdVal
.getSimpleValueType(), TmpLdVal
);
1613 TempProxyRegOps
.push_back(TmpLdVal
);
1615 SDValue CMask
= DAG
.getConstant(255, dl
, MergedType
);
1616 SDValue CShift
= DAG
.getConstant(i
* 8, dl
, MVT::i32
);
1617 // Need to extend the i16 register to the whole width.
1618 TmpLdVal
= DAG
.getNode(ISD::ZERO_EXTEND
, dl
, MergedType
, TmpLdVal
);
1619 // Mask off the high bits. Leave only the lower 8bits.
1620 // Do this because we are using loadparam.b8.
1621 TmpLdVal
= DAG
.getNode(ISD::AND
, dl
, MergedType
, TmpLdVal
, CMask
);
1623 TmpLdVal
= DAG
.getNode(ISD::SHL
, dl
, MergedType
, TmpLdVal
, CShift
);
1624 RetVal
= DAG
.getNode(ISD::OR
, dl
, MergedType
, RetVal
, TmpLdVal
);
1626 if (ElementType
!= MergedType
)
1627 RetVal
= DAG
.getNode(ISD::BITCAST
, dl
, ElementType
, RetVal
);
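  // The loop above reassembles the value roughly as
  //   RetVal |= zext(b8 load at Offset + i) << (i * 8)
  // for each byte i, so the result matches what a single aligned ld.param of
  // the full width would have produced.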
1632 SDValue
NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo
&CLI
,
1633 SmallVectorImpl
<SDValue
> &InVals
) const {
1635 if (CLI
.IsVarArg
&& (STI
.getPTXVersion() < 60 || STI
.getSmVersion() < 30))
1637 "Support for variadic functions (unsized array parameter) introduced "
1638 "in PTX ISA version 6.0 and requires target sm_30.");
1640 SelectionDAG
&DAG
= CLI
.DAG
;
1642 SmallVectorImpl
<ISD::OutputArg
> &Outs
= CLI
.Outs
;
1643 SmallVectorImpl
<SDValue
> &OutVals
= CLI
.OutVals
;
1644 SmallVectorImpl
<ISD::InputArg
> &Ins
= CLI
.Ins
;
1645 SDValue Chain
= CLI
.Chain
;
1646 SDValue Callee
= CLI
.Callee
;
1647 bool &isTailCall
= CLI
.IsTailCall
;
1648 ArgListTy
&Args
= CLI
.getArgs();
1649 Type
*RetTy
= CLI
.RetTy
;
1650 const CallBase
*CB
= CLI
.CB
;
1651 const DataLayout
&DL
= DAG
.getDataLayout();
1653 bool isABI
= (STI
.getSmVersion() >= 20);
1654 assert(isABI
&& "Non-ABI compilation is not supported");
1658 // Variadic arguments.
1660 // Normally, for each argument, we declare a param scalar or a param
1661 // byte array in the .param space, and store the argument value to that
1662 // param scalar or array starting at offset 0.
1664 // In the case of the first variadic argument, we declare a vararg byte array
1665 // with size 0. The exact size of this array isn't known at this point, so
1666 // it'll be patched later. All the variadic arguments will be stored to this
1667 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1668 // initially set to 0, so it can be used for non-variadic arguments (which use
1669 // 0 offset) to simplify the code.
1671 // After all vararg is processed, 'VAOffset' holds the size of the
1672 // vararg byte array.
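  // Purely illustrative example: for a variadic callee with one fixed
  // argument, the variadic part is declared as a single byte array, e.g.
  //   .param .align 8 .b8 param1[0];
  // and the [0] size is later patched to the final VAOffset once all variadic
  // arguments have been laid out.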
1674 SDValue VADeclareParam
; // vararg byte array
1675 unsigned FirstVAArg
= CLI
.NumFixedArgs
; // position of the first variadic
1676 unsigned VAOffset
= 0; // current offset in the param array
1678 unsigned UniqueCallSite
= GlobalUniqueCallSite
.fetch_add(1);
1679 SDValue TempChain
= Chain
;
1680 Chain
= DAG
.getCALLSEQ_START(Chain
, UniqueCallSite
, 0, dl
);
1681 SDValue InGlue
= Chain
.getValue(1);
1683 unsigned ParamCount
= 0;
1684 // Args.size() and Outs.size() need not match.
1685 // Outs.size() will be larger
1686 // * if there is an aggregate argument with multiple fields (each field
1687 // showing up separately in Outs)
1688 // * if there is a vector argument with more than typical vector-length
1689 // elements (generally if more than 4) where each vector element is
1690 // individually present in Outs.
1691 // So a different index should be used for indexing into Outs/OutVals.
1692 // See similar issue in LowerFormalArguments.
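  // For example, a single aggregate argument of type {i32, float} contributes
  // one entry to Args but two entries to Outs/OutVals, which is why OIdx is
  // advanced independently of i below.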
1694 // Declare the .params or .reg need to pass values
1696 for (unsigned i
= 0, e
= Args
.size(); i
!= e
; ++i
, ++OIdx
) {
1697 EVT VT
= Outs
[OIdx
].VT
;
1698 Type
*Ty
= Args
[i
].Ty
;
1699 bool IsVAArg
= (i
>= CLI
.NumFixedArgs
);
1700 bool IsByVal
= Outs
[OIdx
].Flags
.isByVal();
1702 SmallVector
<EVT
, 16> VTs
;
1703 SmallVector
<uint64_t, 16> Offsets
;
1705 assert((!IsByVal
|| Args
[i
].IndirectType
) &&
1706 "byval arg must have indirect type");
1707 Type
*ETy
= (IsByVal
? Args
[i
].IndirectType
: Ty
);
1708 ComputePTXValueVTs(*this, DL
, ETy
, VTs
, &Offsets
, IsByVal
? 0 : VAOffset
);
1712 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1713 // so we don't need to worry whether it's naturally aligned or not.
1714 // See TargetLowering::LowerCallTo().
1715 Align InitialAlign
= Outs
[OIdx
].Flags
.getNonZeroByValAlign();
1716 ArgAlign
= getFunctionByValParamAlign(CB
->getCalledFunction(), ETy
,
1719 VAOffset
= alignTo(VAOffset
, ArgAlign
);
1721 ArgAlign
= getArgumentAlignment(CB
, Ty
, ParamCount
+ 1, DL
);
1725 (IsByVal
? Outs
[OIdx
].Flags
.getByValSize() : DL
.getTypeAllocSize(Ty
));
1726 SDVTList DeclareParamVTs
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
1728 bool NeedAlign
; // Does argument declaration specify alignment?
1729 bool PassAsArray
= IsByVal
|| IsTypePassedAsArray(Ty
);
1731 if (ParamCount
== FirstVAArg
) {
1732 SDValue DeclareParamOps
[] = {
1733 Chain
, DAG
.getConstant(STI
.getMaxRequiredAlignment(), dl
, MVT::i32
),
1734 DAG
.getConstant(ParamCount
, dl
, MVT::i32
),
1735 DAG
.getConstant(1, dl
, MVT::i32
), InGlue
};
1736 VADeclareParam
= Chain
= DAG
.getNode(NVPTXISD::DeclareParam
, dl
,
1737 DeclareParamVTs
, DeclareParamOps
);
1739 NeedAlign
= PassAsArray
;
1740 } else if (PassAsArray
) {
1741 // declare .param .align <align> .b8 .param<n>[<size>];
1742 SDValue DeclareParamOps
[] = {
1743 Chain
, DAG
.getConstant(ArgAlign
.value(), dl
, MVT::i32
),
1744 DAG
.getConstant(ParamCount
, dl
, MVT::i32
),
1745 DAG
.getConstant(TypeSize
, dl
, MVT::i32
), InGlue
};
1746 Chain
= DAG
.getNode(NVPTXISD::DeclareParam
, dl
, DeclareParamVTs
,
1750 // declare .param .b<size> .param<n>;
1751 if (VT
.isInteger() || VT
.isFloatingPoint()) {
1752 // PTX ABI requires integral types to be at least 32 bits in
1753 // size. FP16 is loaded/stored using i16, so it's handled
1755 TypeSize
= promoteScalarArgumentSize(TypeSize
* 8) / 8;
1757 SDValue DeclareScalarParamOps
[] = {
1758 Chain
, DAG
.getConstant(ParamCount
, dl
, MVT::i32
),
1759 DAG
.getConstant(TypeSize
* 8, dl
, MVT::i32
),
1760 DAG
.getConstant(0, dl
, MVT::i32
), InGlue
};
1761 Chain
= DAG
.getNode(NVPTXISD::DeclareScalarParam
, dl
, DeclareParamVTs
,
1762 DeclareScalarParamOps
);
1765 InGlue
= Chain
.getValue(1);
1767 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1768 // than 32-bits are sign extended or zero extended, depending on
1769 // whether they are signed or unsigned types. This case applies
1770 // only to scalar parameters and not to aggregate values.
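    // For example, an i8 or i16 scalar argument is declared as .param .b32 and
    // its value is sign- or zero-extended to 32 bits before being stored
    // (illustrative of the rule above, not an extra code path).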
1771 bool ExtendIntegerParam
=
1772 Ty
->isIntegerTy() && DL
.getTypeAllocSizeInBits(Ty
) < 32;
1774 auto VectorInfo
= VectorizePTXValueVTs(VTs
, Offsets
, ArgAlign
, IsVAArg
);
1775 SmallVector
<SDValue
, 6> StoreOperands
;
1776 for (unsigned j
= 0, je
= VTs
.size(); j
!= je
; ++j
) {
1778 int CurOffset
= Offsets
[j
];
1779 MaybeAlign PartAlign
;
1781 PartAlign
= commonAlignment(ArgAlign
, CurOffset
);
1783 SDValue StVal
= OutVals
[OIdx
];
1786 if (PromoteScalarIntegerPTX(EltVT
, &PromotedVT
)) {
1787 EltVT
= EVT(PromotedVT
);
1789 if (PromoteScalarIntegerPTX(StVal
.getValueType(), &PromotedVT
)) {
1790 llvm::ISD::NodeType Ext
=
1791 Outs
[OIdx
].Flags
.isSExt() ? ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND
;
1792 StVal
= DAG
.getNode(Ext
, dl
, PromotedVT
, StVal
);
1796 auto PtrVT
= getPointerTy(DL
);
1797 SDValue srcAddr
= DAG
.getNode(ISD::ADD
, dl
, PtrVT
, StVal
,
1798 DAG
.getConstant(CurOffset
, dl
, PtrVT
));
1799 StVal
= DAG
.getLoad(EltVT
, dl
, TempChain
, srcAddr
, MachinePointerInfo(),
1801 } else if (ExtendIntegerParam
) {
1802 assert(VTs
.size() == 1 && "Scalar can't have multiple parts.");
1804 StVal
= DAG
.getNode(Outs
[OIdx
].Flags
.isSExt() ? ISD::SIGN_EXTEND
1806 dl
, MVT::i32
, StVal
);
1809 if (!ExtendIntegerParam
&& EltVT
.getSizeInBits() < 16) {
1810 // Use 16-bit registers for small stores as it's the
1811 // smallest general purpose register size supported by NVPTX.
1812 StVal
= DAG
.getNode(ISD::ANY_EXTEND
, dl
, MVT::i16
, StVal
);
1815 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
1816 // scalar store. In such cases, fall back to byte stores.
1817 if (VectorInfo
[j
] == PVF_SCALAR
&& !IsVAArg
&& PartAlign
.has_value() &&
1819 DL
.getABITypeAlign(EltVT
.getTypeForEVT(*DAG
.getContext()))) {
        assert(StoreOperands.empty() && "Unfinished preceding store.");
1821 Chain
= LowerUnalignedStoreParam(
1822 DAG
, Chain
, IsByVal
? CurOffset
+ VAOffset
: CurOffset
, EltVT
,
1823 StVal
, InGlue
, ParamCount
, dl
);
1825 // LowerUnalignedStoreParam took care of inserting the necessary nodes
1826 // into the SDAG, so just move on to the next element.
1833 if (VectorInfo
[j
] & PVF_FIRST
) {
1834 assert(StoreOperands
.empty() && "Unfinished preceding store.");
1835 StoreOperands
.push_back(Chain
);
1836 StoreOperands
.push_back(
1837 DAG
.getConstant(IsVAArg
? FirstVAArg
: ParamCount
, dl
, MVT::i32
));
1839 StoreOperands
.push_back(DAG
.getConstant(
1840 IsByVal
? CurOffset
+ VAOffset
: (IsVAArg
? VAOffset
: CurOffset
),
1844 // Record the value to store.
1845 StoreOperands
.push_back(StVal
);
1847 if (VectorInfo
[j
] & PVF_LAST
) {
1848 unsigned NumElts
= StoreOperands
.size() - 3;
1849 NVPTXISD::NodeType Op
;
1852 Op
= NVPTXISD::StoreParam
;
1855 Op
= NVPTXISD::StoreParamV2
;
1858 Op
= NVPTXISD::StoreParamV4
;
1861 llvm_unreachable("Invalid vector info.");
1864 StoreOperands
.push_back(InGlue
);
1866 // Adjust type of the store op if we've extended the scalar
1868 EVT TheStoreType
= ExtendIntegerParam
? MVT::i32
: EltVT
;
1870 Chain
= DAG
.getMemIntrinsicNode(
1871 Op
, dl
, DAG
.getVTList(MVT::Other
, MVT::Glue
), StoreOperands
,
1872 TheStoreType
, MachinePointerInfo(), PartAlign
,
1873 MachineMemOperand::MOStore
);
1874 InGlue
= Chain
.getValue(1);
1877 StoreOperands
.clear();
1879 // TODO: We may need to support vector types that can be passed
1880 // as scalars in variadic arguments.
1881 if (!IsByVal
&& IsVAArg
) {
1882 assert(NumElts
== 1 &&
1883 "Vectorization is expected to be disabled for variadics.");
1884 VAOffset
+= DL
.getTypeAllocSize(
1885 TheStoreType
.getTypeForEVT(*DAG
.getContext()));
1891 assert(StoreOperands
.empty() && "Unfinished parameter store.");
1892 if (!IsByVal
&& VTs
.size() > 0)
1895 if (IsByVal
&& IsVAArg
)
1896 VAOffset
+= TypeSize
;
1899 GlobalAddressSDNode
*Func
= dyn_cast
<GlobalAddressSDNode
>(Callee
.getNode());
1900 MaybeAlign retAlignment
= std::nullopt
;
1903 if (Ins
.size() > 0) {
1904 SmallVector
<EVT
, 16> resvtparts
;
1905 ComputeValueVTs(*this, DL
, RetTy
, resvtparts
);
1908 // .param .align N .b8 retval0[<size-in-bytes>], or
1909 // .param .b<size-in-bits> retval0
1910 unsigned resultsz
= DL
.getTypeAllocSizeInBits(RetTy
);
1911 if (!IsTypePassedAsArray(RetTy
)) {
1912 resultsz
= promoteScalarArgumentSize(resultsz
);
1913 SDVTList DeclareRetVTs
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
1914 SDValue DeclareRetOps
[] = { Chain
, DAG
.getConstant(1, dl
, MVT::i32
),
1915 DAG
.getConstant(resultsz
, dl
, MVT::i32
),
1916 DAG
.getConstant(0, dl
, MVT::i32
), InGlue
};
1917 Chain
= DAG
.getNode(NVPTXISD::DeclareRet
, dl
, DeclareRetVTs
,
1919 InGlue
= Chain
.getValue(1);
1921 retAlignment
= getArgumentAlignment(CB
, RetTy
, 0, DL
);
1922 assert(retAlignment
&& "retAlignment is guaranteed to be set");
1923 SDVTList DeclareRetVTs
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
1924 SDValue DeclareRetOps
[] = {
1925 Chain
, DAG
.getConstant(retAlignment
->value(), dl
, MVT::i32
),
1926 DAG
.getConstant(resultsz
/ 8, dl
, MVT::i32
),
1927 DAG
.getConstant(0, dl
, MVT::i32
), InGlue
};
1928 Chain
= DAG
.getNode(NVPTXISD::DeclareRetParam
, dl
, DeclareRetVTs
,
1930 InGlue
= Chain
.getValue(1);
1934 bool HasVAArgs
= CLI
.IsVarArg
&& (CLI
.Args
.size() > CLI
.NumFixedArgs
);
1935 // Set the size of the vararg param byte array if the callee is a variadic
1936 // function and the variadic part is not empty.
1938 SDValue DeclareParamOps
[] = {
1939 VADeclareParam
.getOperand(0), VADeclareParam
.getOperand(1),
1940 VADeclareParam
.getOperand(2), DAG
.getConstant(VAOffset
, dl
, MVT::i32
),
1941 VADeclareParam
.getOperand(4)};
1942 DAG
.MorphNodeTo(VADeclareParam
.getNode(), VADeclareParam
.getOpcode(),
1943 VADeclareParam
->getVTList(), DeclareParamOps
);
1946 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1947 // between them we must rely on the call site value which is valid for
1948 // indirect calls but is always null for libcalls.
1949 bool isIndirectCall
= !Func
&& CB
;
1951 if (isa
<ExternalSymbolSDNode
>(Callee
)) {
1952 Function
* CalleeFunc
= nullptr;
1954 // Try to find the callee in the current module.
1955 Callee
= DAG
.getSymbolFunctionGlobalAddress(Callee
, &CalleeFunc
);
1956 assert(CalleeFunc
!= nullptr && "Libcall callee must be set.");
1958 // Set the "libcall callee" attribute to indicate that the function
1959 // must always have a declaration.
1960 CalleeFunc
->addFnAttr("nvptx-libcall-callee", "true");
1963 if (isIndirectCall
) {
    // This is the indirect function call case: PTX requires a prototype of the
    // form
    //   proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
    // to be emitted, and the label has to be used as the last arg of the call
    // instruction.
    // The prototype is embedded in a string and put as the operand for a
    // CallPrototype SDNode which will print out to the value of the string.
1971 SDVTList ProtoVTs
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
1972 std::string Proto
= getPrototype(
1973 DL
, RetTy
, Args
, Outs
, retAlignment
,
1975 ? std::optional
<std::pair
<unsigned, const APInt
&>>(std::make_pair(
1976 CLI
.NumFixedArgs
, VADeclareParam
->getConstantOperandAPInt(1)))
1978 *CB
, UniqueCallSite
);
1979 const char *ProtoStr
= nvTM
->getStrPool().save(Proto
).data();
1980 SDValue ProtoOps
[] = {
1982 DAG
.getTargetExternalSymbol(ProtoStr
, MVT::i32
),
1985 Chain
= DAG
.getNode(NVPTXISD::CallPrototype
, dl
, ProtoVTs
, ProtoOps
);
1986 InGlue
= Chain
.getValue(1);
1988 // Op to just print "call"
1989 SDVTList PrintCallVTs
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
1990 SDValue PrintCallOps
[] = {
1991 Chain
, DAG
.getConstant((Ins
.size() == 0) ? 0 : 1, dl
, MVT::i32
), InGlue
1993 // We model convergent calls as separate opcodes.
1994 unsigned Opcode
= isIndirectCall
? NVPTXISD::PrintCall
: NVPTXISD::PrintCallUni
;
1995 if (CLI
.IsConvergent
)
1996 Opcode
= Opcode
== NVPTXISD::PrintCallUni
? NVPTXISD::PrintConvergentCallUni
1997 : NVPTXISD::PrintConvergentCall
;
1998 Chain
= DAG
.getNode(Opcode
, dl
, PrintCallVTs
, PrintCallOps
);
1999 InGlue
= Chain
.getValue(1);
2001 // Ops to print out the function name
2002 SDVTList CallVoidVTs
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
2003 SDValue CallVoidOps
[] = { Chain
, Callee
, InGlue
};
2004 Chain
= DAG
.getNode(NVPTXISD::CallVoid
, dl
, CallVoidVTs
, CallVoidOps
);
2005 InGlue
= Chain
.getValue(1);
2007 // Ops to print out the param list
2008 SDVTList CallArgBeginVTs
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
2009 SDValue CallArgBeginOps
[] = { Chain
, InGlue
};
2010 Chain
= DAG
.getNode(NVPTXISD::CallArgBegin
, dl
, CallArgBeginVTs
,
2012 InGlue
= Chain
.getValue(1);
2014 for (unsigned i
= 0, e
= std::min(CLI
.NumFixedArgs
+ 1, ParamCount
); i
!= e
;
2018 opcode
= NVPTXISD::LastCallArg
;
2020 opcode
= NVPTXISD::CallArg
;
2021 SDVTList CallArgVTs
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
2022 SDValue CallArgOps
[] = { Chain
, DAG
.getConstant(1, dl
, MVT::i32
),
2023 DAG
.getConstant(i
, dl
, MVT::i32
), InGlue
};
2024 Chain
= DAG
.getNode(opcode
, dl
, CallArgVTs
, CallArgOps
);
2025 InGlue
= Chain
.getValue(1);
2027 SDVTList CallArgEndVTs
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
2028 SDValue CallArgEndOps
[] = { Chain
,
2029 DAG
.getConstant(isIndirectCall
? 0 : 1, dl
, MVT::i32
),
2031 Chain
= DAG
.getNode(NVPTXISD::CallArgEnd
, dl
, CallArgEndVTs
, CallArgEndOps
);
2032 InGlue
= Chain
.getValue(1);
2034 if (isIndirectCall
) {
2035 SDVTList PrototypeVTs
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
2036 SDValue PrototypeOps
[] = {
2037 Chain
, DAG
.getConstant(UniqueCallSite
, dl
, MVT::i32
), InGlue
};
2038 Chain
= DAG
.getNode(NVPTXISD::Prototype
, dl
, PrototypeVTs
, PrototypeOps
);
2039 InGlue
= Chain
.getValue(1);
2042 SmallVector
<SDValue
, 16> ProxyRegOps
;
2043 SmallVector
<std::optional
<MVT
>, 16> ProxyRegTruncates
;
2044 // An item of the vector is filled if the element does not need a ProxyReg
2045 // operation on it and should be added to InVals as is. ProxyRegOps and
2046 // ProxyRegTruncates contain empty/none items at the same index.
2047 SmallVector
<SDValue
, 16> RetElts
;
  // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
  // to use the values of `LoadParam`s; they are replaced later, when
  // `CALLSEQ_END` is added.
2051 SmallVector
<SDValue
, 16> TempProxyRegOps
;
2053 // Generate loads from param memory/moves from registers for result
2054 if (Ins
.size() > 0) {
2055 SmallVector
<EVT
, 16> VTs
;
2056 SmallVector
<uint64_t, 16> Offsets
;
2057 ComputePTXValueVTs(*this, DL
, RetTy
, VTs
, &Offsets
, 0);
2058 assert(VTs
.size() == Ins
.size() && "Bad value decomposition");
2060 Align RetAlign
= getArgumentAlignment(CB
, RetTy
, 0, DL
);
2061 auto VectorInfo
= VectorizePTXValueVTs(VTs
, Offsets
, RetAlign
);
2063 SmallVector
<EVT
, 6> LoadVTs
;
2064 int VecIdx
= -1; // Index of the first element of the vector.
2066 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2067 // 32-bits are sign extended or zero extended, depending on whether
2068 // they are signed or unsigned types.
2069 bool ExtendIntegerRetVal
=
2070 RetTy
->isIntegerTy() && DL
.getTypeAllocSizeInBits(RetTy
) < 32;
2072 for (unsigned i
= 0, e
= VTs
.size(); i
!= e
; ++i
) {
2073 bool needTruncate
= false;
2074 EVT TheLoadType
= VTs
[i
];
2075 EVT EltType
= Ins
[i
].VT
;
2076 Align EltAlign
= commonAlignment(RetAlign
, Offsets
[i
]);
2079 if (PromoteScalarIntegerPTX(TheLoadType
, &PromotedVT
)) {
2080 TheLoadType
= EVT(PromotedVT
);
2081 EltType
= EVT(PromotedVT
);
2082 needTruncate
= true;
2085 if (ExtendIntegerRetVal
) {
2086 TheLoadType
= MVT::i32
;
2088 needTruncate
= true;
2089 } else if (TheLoadType
.getSizeInBits() < 16) {
2090 if (VTs
[i
].isInteger())
2091 needTruncate
= true;
2095 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
2096 // scalar load. In such cases, fall back to byte loads.
2097 if (VectorInfo
[i
] == PVF_SCALAR
&& RetTy
->isAggregateType() &&
2098 EltAlign
< DL
.getABITypeAlign(
2099 TheLoadType
.getTypeForEVT(*DAG
.getContext()))) {
2100 assert(VecIdx
== -1 && LoadVTs
.empty() && "Orphaned operand list.");
2101 SDValue Ret
= LowerUnalignedLoadRetParam(
2102 DAG
, Chain
, Offsets
[i
], TheLoadType
, InGlue
, TempProxyRegOps
, dl
);
2103 ProxyRegOps
.push_back(SDValue());
2104 ProxyRegTruncates
.push_back(std::optional
<MVT
>());
2106 RetElts
.push_back(Ret
);
2111 // Record index of the very first element of the vector.
2112 if (VectorInfo
[i
] & PVF_FIRST
) {
2113 assert(VecIdx
== -1 && LoadVTs
.empty() && "Orphaned operand list.");
2117 LoadVTs
.push_back(EltType
);
2119 if (VectorInfo
[i
] & PVF_LAST
) {
2120 unsigned NumElts
= LoadVTs
.size();
2121 LoadVTs
.push_back(MVT::Other
);
2122 LoadVTs
.push_back(MVT::Glue
);
2123 NVPTXISD::NodeType Op
;
2126 Op
= NVPTXISD::LoadParam
;
2129 Op
= NVPTXISD::LoadParamV2
;
2132 Op
= NVPTXISD::LoadParamV4
;
2135 llvm_unreachable("Invalid vector info.");
2138 SDValue LoadOperands
[] = {
2139 Chain
, DAG
.getConstant(1, dl
, MVT::i32
),
2140 DAG
.getConstant(Offsets
[VecIdx
], dl
, MVT::i32
), InGlue
};
2141 SDValue RetVal
= DAG
.getMemIntrinsicNode(
2142 Op
, dl
, DAG
.getVTList(LoadVTs
), LoadOperands
, TheLoadType
,
2143 MachinePointerInfo(), EltAlign
,
2144 MachineMemOperand::MOLoad
);
2146 for (unsigned j
= 0; j
< NumElts
; ++j
) {
2147 ProxyRegOps
.push_back(RetVal
.getValue(j
));
2150 ProxyRegTruncates
.push_back(std::optional
<MVT
>(Ins
[VecIdx
+ j
].VT
));
2152 ProxyRegTruncates
.push_back(std::optional
<MVT
>());
2155 Chain
= RetVal
.getValue(NumElts
);
2156 InGlue
= RetVal
.getValue(NumElts
+ 1);
2166 DAG
.getCALLSEQ_END(Chain
, UniqueCallSite
, UniqueCallSite
+ 1, InGlue
, dl
);
2167 InGlue
= Chain
.getValue(1);
  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
  // will not get lost. Otherwise, during libcalls expansion, the nodes can
  // become dangling.
2172 for (unsigned i
= 0; i
< ProxyRegOps
.size(); ++i
) {
2173 if (i
< RetElts
.size() && RetElts
[i
]) {
2174 InVals
.push_back(RetElts
[i
]);
2178 SDValue Ret
= DAG
.getNode(
2179 NVPTXISD::ProxyReg
, dl
,
2180 DAG
.getVTList(ProxyRegOps
[i
].getSimpleValueType(), MVT::Other
, MVT::Glue
),
2181 { Chain
, ProxyRegOps
[i
], InGlue
}
2184 Chain
= Ret
.getValue(1);
2185 InGlue
= Ret
.getValue(2);
2187 if (ProxyRegTruncates
[i
]) {
2188 Ret
= DAG
.getNode(ISD::TRUNCATE
, dl
, *ProxyRegTruncates
[i
], Ret
);
2191 InVals
.push_back(Ret
);
2194 for (SDValue
&T
: TempProxyRegOps
) {
2195 SDValue Repl
= DAG
.getNode(
2196 NVPTXISD::ProxyReg
, dl
,
2197 DAG
.getVTList(T
.getSimpleValueType(), MVT::Other
, MVT::Glue
),
2198 {Chain
, T
.getOperand(0), InGlue
});
2199 DAG
.ReplaceAllUsesWith(T
, Repl
);
2200 DAG
.RemoveDeadNode(T
.getNode());
2202 Chain
= Repl
.getValue(1);
2203 InGlue
= Repl
.getValue(2);
2206 // set isTailCall to false for now, until we figure out how to express
2207 // tail call optimization in PTX
SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                     SelectionDAG &DAG) const {
  if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
    const Function &Fn = DAG.getMachineFunction().getFunction();

    DiagnosticInfoUnsupported NoDynamicAlloca(
        Fn,
        "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
        "requires target sm_52.",
        SDLoc(Op).getDebugLoc());
    DAG.getContext()->diagnose(NoDynamicAlloca);
    auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
                Op.getOperand(0)};
    return DAG.getMergeValues(Ops, SDLoc());
  }

  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  SDLoc DL(Op.getNode());

  // The size operand of the PTX alloca instruction is 64-bit for m64 and
  // 32-bit for m32.
  if (nvTM->is64Bit())
    Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64);
  else
    Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32);

  SDValue AllocOps[] = {Chain, Size,
                        DAG.getTargetConstant(Align, DL, MVT::i32)};
  SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL,
                               nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps);

  SDValue MergeOps[] = {Alloca, Chain};
  return DAG.getMergeValues(MergeOps, DL);
}
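// Illustrative PTX for the node built above (register names assumed for the
// example): on a 64-bit target this ends up as something like
//   alloca.u64 %rd2, %rd1, 8;
// where %rd1 holds the requested size and 8 is the alignment in bytes.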
// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
// (see LegalizeDAG.cpp). This is slow and uses local memory.
// We use extract/insert/build vector instead, just as LegalizeOp() did in LLVM 2.5.
2253 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op
, SelectionDAG
&DAG
) const {
2254 SDNode
*Node
= Op
.getNode();
2256 SmallVector
<SDValue
, 8> Ops
;
2257 unsigned NumOperands
= Node
->getNumOperands();
2258 for (unsigned i
= 0; i
< NumOperands
; ++i
) {
2259 SDValue SubOp
= Node
->getOperand(i
);
2260 EVT VVT
= SubOp
.getNode()->getValueType(0);
2261 EVT EltVT
= VVT
.getVectorElementType();
2262 unsigned NumSubElem
= VVT
.getVectorNumElements();
2263 for (unsigned j
= 0; j
< NumSubElem
; ++j
) {
2264 Ops
.push_back(DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, dl
, EltVT
, SubOp
,
2265 DAG
.getIntPtrConstant(j
, dl
)));
2268 return DAG
.getBuildVector(Node
->getValueType(0), dl
, Ops
);
2271 // We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2272 // would get lowered as two constant loads and vector-packing move.
2273 // Instead we want just a constant move:
2274 // mov.b32 %r2, 0x40003C00
2275 SDValue
NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op
,
2276 SelectionDAG
&DAG
) const {
2277 EVT VT
= Op
->getValueType(0);
2278 if (!(Isv2x16VT(VT
) || VT
== MVT::v4i8
))
2283 if (!llvm::all_of(Op
->ops(), [](SDValue Operand
) {
2284 return Operand
->isUndef() || isa
<ConstantSDNode
>(Operand
) ||
2285 isa
<ConstantFPSDNode
>(Operand
);
2287 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
2288 // to optimize calculation of constant parts.
2289 if (VT
== MVT::v4i8
) {
2290 SDValue C8
= DAG
.getConstant(8, DL
, MVT::i32
);
2291 SDValue E01
= DAG
.getNode(
2292 NVPTXISD::BFI
, DL
, MVT::i32
,
2293 DAG
.getAnyExtOrTrunc(Op
->getOperand(1), DL
, MVT::i32
),
2294 DAG
.getAnyExtOrTrunc(Op
->getOperand(0), DL
, MVT::i32
), C8
, C8
);
2296 DAG
.getNode(NVPTXISD::BFI
, DL
, MVT::i32
,
2297 DAG
.getAnyExtOrTrunc(Op
->getOperand(2), DL
, MVT::i32
),
2298 E01
, DAG
.getConstant(16, DL
, MVT::i32
), C8
);
2300 DAG
.getNode(NVPTXISD::BFI
, DL
, MVT::i32
,
2301 DAG
.getAnyExtOrTrunc(Op
->getOperand(3), DL
, MVT::i32
),
2302 E012
, DAG
.getConstant(24, DL
, MVT::i32
), C8
);
2303 return DAG
.getNode(ISD::BITCAST
, DL
, VT
, E0123
);
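    // In effect the three BFI nodes above assemble the bytes as (illustrative):
    //   E01   = bfi(b1, b0,   8, 8)  // bytes {b1, b0}
    //   E012  = bfi(b2, E01, 16, 8)  // bytes {b2, b1, b0}
    //   E0123 = bfi(b3, E012, 24, 8) // bytes {b3, b2, b1, b0}
    // where bN is Op->getOperand(N) any-extended/truncated to i32.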
2308 // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2309 auto GetOperand
= [](SDValue Op
, int N
) -> APInt
{
2310 const SDValue
&Operand
= Op
->getOperand(N
);
2311 EVT VT
= Op
->getValueType(0);
2312 if (Operand
->isUndef())
2313 return APInt(32, 0);
2315 if (VT
== MVT::v2f16
|| VT
== MVT::v2bf16
)
2316 Value
= cast
<ConstantFPSDNode
>(Operand
)->getValueAPF().bitcastToAPInt();
2317 else if (VT
== MVT::v2i16
|| VT
== MVT::v4i8
)
2318 Value
= Operand
->getAsAPIntVal();
2320 llvm_unreachable("Unsupported type");
2321 // i8 values are carried around as i16, so we need to zero out upper bits,
2322 // so they do not get in the way of combining individual byte values
2323 if (VT
== MVT::v4i8
)
2324 Value
= Value
.trunc(8);
2325 return Value
.zext(32);
2328 if (Isv2x16VT(VT
)) {
2329 Value
= GetOperand(Op
, 0) | GetOperand(Op
, 1).shl(16);
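    // For example, building <2 x half> <1.0, 2.0> packs 0x3C00 | (0x4000 << 16)
    // into the single 32-bit immediate 0x40003C00 used by the mov.b32 above.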
2330 } else if (VT
== MVT::v4i8
) {
2331 Value
= GetOperand(Op
, 0) | GetOperand(Op
, 1).shl(8) |
2332 GetOperand(Op
, 2).shl(16) | GetOperand(Op
, 3).shl(24);
2334 llvm_unreachable("Unsupported type");
2336 SDValue Const
= DAG
.getConstant(Value
, SDLoc(Op
), MVT::i32
);
2337 return DAG
.getNode(ISD::BITCAST
, SDLoc(Op
), Op
->getValueType(0), Const
);
2340 SDValue
NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op
,
2341 SelectionDAG
&DAG
) const {
2342 SDValue Index
= Op
->getOperand(1);
2343 SDValue Vector
= Op
->getOperand(0);
2345 EVT VectorVT
= Vector
.getValueType();
2347 if (VectorVT
== MVT::v4i8
) {
2349 DAG
.getNode(NVPTXISD::BFE
, DL
, MVT::i32
,
2351 DAG
.getNode(ISD::MUL
, DL
, MVT::i32
,
2352 DAG
.getZExtOrTrunc(Index
, DL
, MVT::i32
),
2353 DAG
.getConstant(8, DL
, MVT::i32
)),
2354 DAG
.getConstant(8, DL
, MVT::i32
)});
2355 return DAG
.getAnyExtOrTrunc(BFE
, DL
, Op
->getValueType(0));
2358 // Constant index will be matched by tablegen.
2359 if (isa
<ConstantSDNode
>(Index
.getNode()))
2362 // Extract individual elements and select one of them.
2363 assert(Isv2x16VT(VectorVT
) && "Unexpected vector type.");
2364 EVT EltVT
= VectorVT
.getVectorElementType();
2366 SDLoc
dl(Op
.getNode());
2367 SDValue E0
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, dl
, EltVT
, Vector
,
2368 DAG
.getIntPtrConstant(0, dl
));
2369 SDValue E1
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, dl
, EltVT
, Vector
,
2370 DAG
.getIntPtrConstant(1, dl
));
2371 return DAG
.getSelectCC(dl
, Index
, DAG
.getIntPtrConstant(0, dl
), E0
, E1
,
2372 ISD::CondCode::SETEQ
);
2375 SDValue
NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op
,
2376 SelectionDAG
&DAG
) const {
2377 SDValue Vector
= Op
->getOperand(0);
2378 EVT VectorVT
= Vector
.getValueType();
2380 if (VectorVT
!= MVT::v4i8
)
2383 SDValue Value
= Op
->getOperand(1);
2384 if (Value
->isUndef())
2387 SDValue Index
= Op
->getOperand(2);
2390 DAG
.getNode(NVPTXISD::BFI
, DL
, MVT::i32
,
2391 {DAG
.getZExtOrTrunc(Value
, DL
, MVT::i32
), Vector
,
2392 DAG
.getNode(ISD::MUL
, DL
, MVT::i32
,
2393 DAG
.getZExtOrTrunc(Index
, DL
, MVT::i32
),
2394 DAG
.getConstant(8, DL
, MVT::i32
)),
2395 DAG
.getConstant(8, DL
, MVT::i32
)});
2396 return DAG
.getNode(ISD::BITCAST
, DL
, Op
->getValueType(0), BFI
);
2399 SDValue
NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op
,
2400 SelectionDAG
&DAG
) const {
2401 SDValue V1
= Op
.getOperand(0);
2402 EVT VectorVT
= V1
.getValueType();
2403 if (VectorVT
!= MVT::v4i8
|| Op
.getValueType() != MVT::v4i8
)
2406 // Lower shuffle to PRMT instruction.
2407 const ShuffleVectorSDNode
*SVN
= cast
<ShuffleVectorSDNode
>(Op
.getNode());
2408 SDValue V2
= Op
.getOperand(1);
2409 uint32_t Selector
= 0;
2410 for (auto I
: llvm::enumerate(SVN
->getMask())) {
2411 if (I
.value() != -1) // -1 is a placeholder for undef.
2412 Selector
|= (I
.value() << (I
.index() * 4));
2416 return DAG
.getNode(NVPTXISD::PRMT
, DL
, MVT::v4i8
, V1
, V2
,
2417 DAG
.getConstant(Selector
, DL
, MVT::i32
),
2418 DAG
.getConstant(NVPTX::PTXPrmtMode::NONE
, DL
, MVT::i32
));
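  // For example, a shuffle mask <0, 4, 1, 5> (interleaving the low bytes of V1
  // and V2) yields Selector = 0x5140, since each nibble i selects source byte
  // mask[i] out of the 8 bytes {V1, V2}.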
/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
///    amount.
2425 SDValue
NVPTXTargetLowering::LowerShiftRightParts(SDValue Op
,
2426 SelectionDAG
&DAG
) const {
2427 assert(Op
.getNumOperands() == 3 && "Not a double-shift!");
2428 assert(Op
.getOpcode() == ISD::SRA_PARTS
|| Op
.getOpcode() == ISD::SRL_PARTS
);
2430 EVT VT
= Op
.getValueType();
2431 unsigned VTBits
= VT
.getSizeInBits();
2433 SDValue ShOpLo
= Op
.getOperand(0);
2434 SDValue ShOpHi
= Op
.getOperand(1);
2435 SDValue ShAmt
= Op
.getOperand(2);
2436 unsigned Opc
= (Op
.getOpcode() == ISD::SRA_PARTS
) ? ISD::SRA
: ISD::SRL
;
2438 if (VTBits
== 32 && STI
.getSmVersion() >= 35) {
2439 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2440 // {dHi, dLo} = {aHi, aLo} >> Amt
2442 // dLo = shf.r.clamp aLo, aHi, Amt
2444 SDValue Hi
= DAG
.getNode(Opc
, dl
, VT
, ShOpHi
, ShAmt
);
2445 SDValue Lo
= DAG
.getNode(NVPTXISD::FUN_SHFR_CLAMP
, dl
, VT
, ShOpLo
, ShOpHi
,
2448 SDValue Ops
[2] = { Lo
, Hi
};
2449 return DAG
.getMergeValues(Ops
, dl
);
2452 // {dHi, dLo} = {aHi, aLo} >> Amt
2453 // - if (Amt>=size) then
2454 // dLo = aHi >> (Amt-size)
2455 // dHi = aHi >> Amt (this is either all 0 or all 1)
2457 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2460 SDValue RevShAmt
= DAG
.getNode(ISD::SUB
, dl
, MVT::i32
,
2461 DAG
.getConstant(VTBits
, dl
, MVT::i32
),
2463 SDValue Tmp1
= DAG
.getNode(ISD::SRL
, dl
, VT
, ShOpLo
, ShAmt
);
2464 SDValue ExtraShAmt
= DAG
.getNode(ISD::SUB
, dl
, MVT::i32
, ShAmt
,
2465 DAG
.getConstant(VTBits
, dl
, MVT::i32
));
2466 SDValue Tmp2
= DAG
.getNode(ISD::SHL
, dl
, VT
, ShOpHi
, RevShAmt
);
2467 SDValue FalseVal
= DAG
.getNode(ISD::OR
, dl
, VT
, Tmp1
, Tmp2
);
2468 SDValue TrueVal
= DAG
.getNode(Opc
, dl
, VT
, ShOpHi
, ExtraShAmt
);
2470 SDValue Cmp
= DAG
.getSetCC(dl
, MVT::i1
, ShAmt
,
2471 DAG
.getConstant(VTBits
, dl
, MVT::i32
),
2473 SDValue Hi
= DAG
.getNode(Opc
, dl
, VT
, ShOpHi
, ShAmt
);
2474 SDValue Lo
= DAG
.getNode(ISD::SELECT
, dl
, VT
, Cmp
, TrueVal
, FalseVal
);
2476 SDValue Ops
[2] = { Lo
, Hi
};
2477 return DAG
.getMergeValues(Ops
, dl
);
/// LowerShiftLeftParts - Lower SHL_PARTS, which
/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
///    amount.
2486 SDValue
NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op
,
2487 SelectionDAG
&DAG
) const {
2488 assert(Op
.getNumOperands() == 3 && "Not a double-shift!");
2489 assert(Op
.getOpcode() == ISD::SHL_PARTS
);
2491 EVT VT
= Op
.getValueType();
2492 unsigned VTBits
= VT
.getSizeInBits();
2494 SDValue ShOpLo
= Op
.getOperand(0);
2495 SDValue ShOpHi
= Op
.getOperand(1);
2496 SDValue ShAmt
= Op
.getOperand(2);
2498 if (VTBits
== 32 && STI
.getSmVersion() >= 35) {
2499 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2500 // {dHi, dLo} = {aHi, aLo} << Amt
2501 // dHi = shf.l.clamp aLo, aHi, Amt
2504 SDValue Hi
= DAG
.getNode(NVPTXISD::FUN_SHFL_CLAMP
, dl
, VT
, ShOpLo
, ShOpHi
,
2506 SDValue Lo
= DAG
.getNode(ISD::SHL
, dl
, VT
, ShOpLo
, ShAmt
);
2508 SDValue Ops
[2] = { Lo
, Hi
};
2509 return DAG
.getMergeValues(Ops
, dl
);
2512 // {dHi, dLo} = {aHi, aLo} << Amt
2513 // - if (Amt>=size) then
2514 // dLo = aLo << Amt (all 0)
2515 // dLo = aLo << (Amt-size)
2518 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2520 SDValue RevShAmt
= DAG
.getNode(ISD::SUB
, dl
, MVT::i32
,
2521 DAG
.getConstant(VTBits
, dl
, MVT::i32
),
2523 SDValue Tmp1
= DAG
.getNode(ISD::SHL
, dl
, VT
, ShOpHi
, ShAmt
);
2524 SDValue ExtraShAmt
= DAG
.getNode(ISD::SUB
, dl
, MVT::i32
, ShAmt
,
2525 DAG
.getConstant(VTBits
, dl
, MVT::i32
));
2526 SDValue Tmp2
= DAG
.getNode(ISD::SRL
, dl
, VT
, ShOpLo
, RevShAmt
);
2527 SDValue FalseVal
= DAG
.getNode(ISD::OR
, dl
, VT
, Tmp1
, Tmp2
);
2528 SDValue TrueVal
= DAG
.getNode(ISD::SHL
, dl
, VT
, ShOpLo
, ExtraShAmt
);
2530 SDValue Cmp
= DAG
.getSetCC(dl
, MVT::i1
, ShAmt
,
2531 DAG
.getConstant(VTBits
, dl
, MVT::i32
),
2533 SDValue Lo
= DAG
.getNode(ISD::SHL
, dl
, VT
, ShOpLo
, ShAmt
);
2534 SDValue Hi
= DAG
.getNode(ISD::SELECT
, dl
, VT
, Cmp
, TrueVal
, FalseVal
);
2536 SDValue Ops
[2] = { Lo
, Hi
};
2537 return DAG
.getMergeValues(Ops
, dl
);
2541 SDValue
NVPTXTargetLowering::LowerFROUND(SDValue Op
, SelectionDAG
&DAG
) const {
2542 EVT VT
= Op
.getValueType();
2545 return LowerFROUND32(Op
, DAG
);
2548 return LowerFROUND64(Op
, DAG
);
2550 llvm_unreachable("unhandled type");
// This is the rounding method used in CUDA libdevice, in C-like code:
//   float roundf(float A)
//   {
//     float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
//     RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
//     return abs(A) < 0.5 ? (float)(int)A : RoundedA;
//   }
2560 SDValue
NVPTXTargetLowering::LowerFROUND32(SDValue Op
,
2561 SelectionDAG
&DAG
) const {
2563 SDValue A
= Op
.getOperand(0);
2564 EVT VT
= Op
.getValueType();
2566 SDValue AbsA
= DAG
.getNode(ISD::FABS
, SL
, VT
, A
);
2568 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2569 SDValue Bitcast
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i32
, A
);
2570 const int SignBitMask
= 0x80000000;
2571 SDValue Sign
= DAG
.getNode(ISD::AND
, SL
, MVT::i32
, Bitcast
,
2572 DAG
.getConstant(SignBitMask
, SL
, MVT::i32
));
2573 const int PointFiveInBits
= 0x3F000000;
2574 SDValue PointFiveWithSignRaw
=
2575 DAG
.getNode(ISD::OR
, SL
, MVT::i32
, Sign
,
2576 DAG
.getConstant(PointFiveInBits
, SL
, MVT::i32
));
2577 SDValue PointFiveWithSign
=
2578 DAG
.getNode(ISD::BITCAST
, SL
, VT
, PointFiveWithSignRaw
);
2579 SDValue AdjustedA
= DAG
.getNode(ISD::FADD
, SL
, VT
, A
, PointFiveWithSign
);
2580 SDValue RoundedA
= DAG
.getNode(ISD::FTRUNC
, SL
, VT
, AdjustedA
);
2582 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2583 EVT SetCCVT
= getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), VT
);
2585 DAG
.getSetCC(SL
, SetCCVT
, AbsA
, DAG
.getConstantFP(pow(2.0, 23.0), SL
, VT
),
2587 RoundedA
= DAG
.getNode(ISD::SELECT
, SL
, VT
, IsLarge
, A
, RoundedA
);
2589 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2590 SDValue IsSmall
=DAG
.getSetCC(SL
, SetCCVT
, AbsA
,
2591 DAG
.getConstantFP(0.5, SL
, VT
), ISD::SETOLT
);
2592 SDValue RoundedAForSmallA
= DAG
.getNode(ISD::FTRUNC
, SL
, VT
, A
);
2593 return DAG
.getNode(ISD::SELECT
, SL
, VT
, IsSmall
, RoundedAForSmallA
, RoundedA
);
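// Worked example of the sequence above (illustrative): for A = -2.5 the sign
// bit makes PointFiveWithSign = -0.5, so AdjustedA = -3.0 and FTRUNC yields
// -3.0, i.e. halfway cases round away from zero; for A = 0.3, abs(A) < 0.5
// selects FTRUNC(A) = 0.0 instead.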
2596 // The implementation of round(double) is similar to that of round(float) in
2597 // that they both separate the value range into three regions and use a method
2598 // specific to the region to round the values. However, round(double) first
2599 // calculates the round of the absolute value and then adds the sign back while
2600 // round(float) directly rounds the value with sign.
2601 SDValue
NVPTXTargetLowering::LowerFROUND64(SDValue Op
,
2602 SelectionDAG
&DAG
) const {
2604 SDValue A
= Op
.getOperand(0);
2605 EVT VT
= Op
.getValueType();
2607 SDValue AbsA
= DAG
.getNode(ISD::FABS
, SL
, VT
, A
);
2609 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2610 SDValue AdjustedA
= DAG
.getNode(ISD::FADD
, SL
, VT
, AbsA
,
2611 DAG
.getConstantFP(0.5, SL
, VT
));
2612 SDValue RoundedA
= DAG
.getNode(ISD::FTRUNC
, SL
, VT
, AdjustedA
);
2614 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2615 EVT SetCCVT
= getSetCCResultType(DAG
.getDataLayout(), *DAG
.getContext(), VT
);
2616 SDValue IsSmall
=DAG
.getSetCC(SL
, SetCCVT
, AbsA
,
2617 DAG
.getConstantFP(0.5, SL
, VT
), ISD::SETOLT
);
2618 RoundedA
= DAG
.getNode(ISD::SELECT
, SL
, VT
, IsSmall
,
2619 DAG
.getConstantFP(0, SL
, VT
),
2622 // Add sign to rounded_A
2623 RoundedA
= DAG
.getNode(ISD::FCOPYSIGN
, SL
, VT
, RoundedA
, A
);
2624 DAG
.getNode(ISD::FTRUNC
, SL
, VT
, A
);
2626 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2628 DAG
.getSetCC(SL
, SetCCVT
, AbsA
, DAG
.getConstantFP(pow(2.0, 52.0), SL
, VT
),
2630 return DAG
.getNode(ISD::SELECT
, SL
, VT
, IsLarge
, A
, RoundedA
);
2633 SDValue
NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op
,
2634 SelectionDAG
&DAG
) const {
2635 assert(STI
.getSmVersion() < 90 || STI
.getPTXVersion() < 78);
2637 if (Op
.getValueType() == MVT::bf16
) {
2640 ISD::FP_ROUND
, Loc
, MVT::bf16
,
2641 DAG
.getNode(Op
.getOpcode(), Loc
, MVT::f32
, Op
.getOperand(0)),
2642 DAG
.getIntPtrConstant(0, Loc
));
2645 // Everything else is considered legal.
2649 SDValue
NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op
,
2650 SelectionDAG
&DAG
) const {
2651 assert(STI
.getSmVersion() < 90 || STI
.getPTXVersion() < 78);
2653 if (Op
.getOperand(0).getValueType() == MVT::bf16
) {
2656 Op
.getOpcode(), Loc
, Op
.getValueType(),
2657 DAG
.getNode(ISD::FP_EXTEND
, Loc
, MVT::f32
, Op
.getOperand(0)));
2660 // Everything else is considered legal.
2664 SDValue
NVPTXTargetLowering::LowerFP_ROUND(SDValue Op
,
2665 SelectionDAG
&DAG
) const {
2666 EVT NarrowVT
= Op
.getValueType();
2667 SDValue Wide
= Op
.getOperand(0);
2668 EVT WideVT
= Wide
.getValueType();
2669 if (NarrowVT
.getScalarType() == MVT::bf16
) {
2670 const TargetLowering
*TLI
= STI
.getTargetLowering();
2671 if (STI
.getSmVersion() < 80 || STI
.getPTXVersion() < 70) {
2672 return TLI
->expandFP_ROUND(Op
.getNode(), DAG
);
2674 if (STI
.getSmVersion() < 90 || STI
.getPTXVersion() < 78) {
2675 // This combination was the first to support f32 -> bf16.
2676 if (STI
.getSmVersion() >= 80 && STI
.getPTXVersion() >= 70) {
2677 if (WideVT
.getScalarType() == MVT::f32
) {
2680 if (WideVT
.getScalarType() == MVT::f64
) {
2682 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2683 // the hardware f32 -> bf16 instruction.
2684 SDValue rod
= TLI
->expandRoundInexactToOdd(
2685 WideVT
.isVector() ? WideVT
.changeVectorElementType(MVT::f32
)
2688 return DAG
.getFPExtendOrRound(rod
, Loc
, NarrowVT
);
2691 return TLI
->expandFP_ROUND(Op
.getNode(), DAG
);
2695 // Everything else is considered legal.
2699 SDValue
NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op
,
2700 SelectionDAG
&DAG
) const {
2701 SDValue Narrow
= Op
.getOperand(0);
2702 EVT NarrowVT
= Narrow
.getValueType();
2703 EVT WideVT
= Op
.getValueType();
2704 if (NarrowVT
.getScalarType() == MVT::bf16
) {
2705 if (WideVT
.getScalarType() == MVT::f32
&&
2706 (STI
.getSmVersion() < 80 || STI
.getPTXVersion() < 71)) {
2708 return DAG
.getNode(ISD::BF16_TO_FP
, Loc
, WideVT
, Narrow
);
2710 if (WideVT
.getScalarType() == MVT::f64
&&
2711 (STI
.getSmVersion() < 90 || STI
.getPTXVersion() < 78)) {
2712 EVT F32
= NarrowVT
.isVector() ? NarrowVT
.changeVectorElementType(MVT::f32
)
2715 if (STI
.getSmVersion() >= 80 && STI
.getPTXVersion() >= 71) {
2716 Op
= DAG
.getNode(ISD::FP_EXTEND
, Loc
, F32
, Narrow
);
2718 Op
= DAG
.getNode(ISD::BF16_TO_FP
, Loc
, F32
, Narrow
);
2720 return DAG
.getNode(ISD::FP_EXTEND
, Loc
, WideVT
, Op
);
2724 // Everything else is considered legal.
2728 static SDValue
LowerVectorArith(SDValue Op
, SelectionDAG
&DAG
) {
2730 if (Op
.getValueType() != MVT::v2i16
)
2732 EVT EltVT
= Op
.getValueType().getVectorElementType();
2733 SmallVector
<SDValue
> VecElements
;
2734 for (int I
= 0, E
= Op
.getValueType().getVectorNumElements(); I
< E
; I
++) {
2735 SmallVector
<SDValue
> ScalarArgs
;
2736 llvm::transform(Op
->ops(), std::back_inserter(ScalarArgs
),
2737 [&](const SDUse
&O
) {
2738 return DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, EltVT
,
2739 O
.get(), DAG
.getIntPtrConstant(I
, DL
));
2741 VecElements
.push_back(DAG
.getNode(Op
.getOpcode(), DL
, EltVT
, ScalarArgs
));
2744 DAG
.getNode(ISD::BUILD_VECTOR
, DL
, Op
.getValueType(), VecElements
);
2749 NVPTXTargetLowering::LowerOperation(SDValue Op
, SelectionDAG
&DAG
) const {
2750 switch (Op
.getOpcode()) {
2751 case ISD::RETURNADDR
:
2753 case ISD::FRAMEADDR
:
2755 case ISD::GlobalAddress
:
2756 return LowerGlobalAddress(Op
, DAG
);
2757 case ISD::INTRINSIC_W_CHAIN
:
2759 case ISD::BUILD_VECTOR
:
2760 return LowerBUILD_VECTOR(Op
, DAG
);
2761 case ISD::EXTRACT_SUBVECTOR
:
2763 case ISD::EXTRACT_VECTOR_ELT
:
2764 return LowerEXTRACT_VECTOR_ELT(Op
, DAG
);
2765 case ISD::INSERT_VECTOR_ELT
:
2766 return LowerINSERT_VECTOR_ELT(Op
, DAG
);
2767 case ISD::VECTOR_SHUFFLE
:
2768 return LowerVECTOR_SHUFFLE(Op
, DAG
);
2769 case ISD::CONCAT_VECTORS
:
2770 return LowerCONCAT_VECTORS(Op
, DAG
);
2772 return LowerSTORE(Op
, DAG
);
2774 return LowerLOAD(Op
, DAG
);
2775 case ISD::SHL_PARTS
:
2776 return LowerShiftLeftParts(Op
, DAG
);
2777 case ISD::SRA_PARTS
:
2778 case ISD::SRL_PARTS
:
2779 return LowerShiftRightParts(Op
, DAG
);
2781 return LowerSelect(Op
, DAG
);
2783 return LowerFROUND(Op
, DAG
);
2784 case ISD::SINT_TO_FP
:
2785 case ISD::UINT_TO_FP
:
2786 return LowerINT_TO_FP(Op
, DAG
);
2787 case ISD::FP_TO_SINT
:
2788 case ISD::FP_TO_UINT
:
2789 return LowerFP_TO_INT(Op
, DAG
);
2791 return LowerFP_ROUND(Op
, DAG
);
2792 case ISD::FP_EXTEND
:
2793 return LowerFP_EXTEND(Op
, DAG
);
2795 return LowerVAARG(Op
, DAG
);
2797 return LowerVASTART(Op
, DAG
);
2809 return LowerVectorArith(Op
, DAG
);
2810 case ISD::DYNAMIC_STACKALLOC
:
2811 return LowerDYNAMIC_STACKALLOC(Op
, DAG
);
2812 case ISD::CopyToReg
:
2813 return LowerCopyToReg_128(Op
, DAG
);
2815 llvm_unreachable("Custom lowering not defined for operation");
2819 // This function is almost a copy of SelectionDAG::expandVAArg().
2820 // The only diff is that this one produces loads from local address space.
2821 SDValue
NVPTXTargetLowering::LowerVAARG(SDValue Op
, SelectionDAG
&DAG
) const {
2822 const TargetLowering
*TLI
= STI
.getTargetLowering();
2825 SDNode
*Node
= Op
.getNode();
2826 const Value
*V
= cast
<SrcValueSDNode
>(Node
->getOperand(2))->getValue();
2827 EVT VT
= Node
->getValueType(0);
2828 auto *Ty
= VT
.getTypeForEVT(*DAG
.getContext());
2829 SDValue Tmp1
= Node
->getOperand(0);
2830 SDValue Tmp2
= Node
->getOperand(1);
2831 const MaybeAlign
MA(Node
->getConstantOperandVal(3));
2833 SDValue VAListLoad
= DAG
.getLoad(TLI
->getPointerTy(DAG
.getDataLayout()), DL
,
2834 Tmp1
, Tmp2
, MachinePointerInfo(V
));
2835 SDValue VAList
= VAListLoad
;
2837 if (MA
&& *MA
> TLI
->getMinStackArgumentAlignment()) {
2838 VAList
= DAG
.getNode(
2839 ISD::ADD
, DL
, VAList
.getValueType(), VAList
,
2840 DAG
.getConstant(MA
->value() - 1, DL
, VAList
.getValueType()));
2842 VAList
= DAG
.getNode(
2843 ISD::AND
, DL
, VAList
.getValueType(), VAList
,
2844 DAG
.getConstant(-(int64_t)MA
->value(), DL
, VAList
.getValueType()));
2847 // Increment the pointer, VAList, to the next vaarg
2848 Tmp1
= DAG
.getNode(ISD::ADD
, DL
, VAList
.getValueType(), VAList
,
2849 DAG
.getConstant(DAG
.getDataLayout().getTypeAllocSize(Ty
),
2850 DL
, VAList
.getValueType()));
2852 // Store the incremented VAList to the legalized pointer
2853 Tmp1
= DAG
.getStore(VAListLoad
.getValue(1), DL
, Tmp1
, Tmp2
,
2854 MachinePointerInfo(V
));
2857 Constant::getNullValue(PointerType::get(Ty
, ADDRESS_SPACE_LOCAL
));
2859 // Load the actual argument out of the pointer VAList
2860 return DAG
.getLoad(VT
, DL
, Tmp1
, VAList
, MachinePointerInfo(SrcV
));
2863 SDValue
NVPTXTargetLowering::LowerVASTART(SDValue Op
, SelectionDAG
&DAG
) const {
2864 const TargetLowering
*TLI
= STI
.getTargetLowering();
2866 EVT PtrVT
= TLI
->getPointerTy(DAG
.getDataLayout());
2868 // Store the address of unsized array <function>_vararg[] in the ap object.
2869 SDValue Arg
= getParamSymbol(DAG
, /* vararg */ -1, PtrVT
);
2870 SDValue VAReg
= DAG
.getNode(NVPTXISD::Wrapper
, DL
, PtrVT
, Arg
);
2872 const Value
*SV
= cast
<SrcValueSDNode
>(Op
.getOperand(2))->getValue();
2873 return DAG
.getStore(Op
.getOperand(0), DL
, VAReg
, Op
.getOperand(1),
2874 MachinePointerInfo(SV
));
2877 SDValue
NVPTXTargetLowering::LowerSelect(SDValue Op
, SelectionDAG
&DAG
) const {
2878 SDValue Op0
= Op
->getOperand(0);
2879 SDValue Op1
= Op
->getOperand(1);
2880 SDValue Op2
= Op
->getOperand(2);
2881 SDLoc
DL(Op
.getNode());
2883 assert(Op
.getValueType() == MVT::i1
&& "Custom lowering enabled only for i1");
2885 Op1
= DAG
.getNode(ISD::ANY_EXTEND
, DL
, MVT::i32
, Op1
);
2886 Op2
= DAG
.getNode(ISD::ANY_EXTEND
, DL
, MVT::i32
, Op2
);
2887 SDValue Select
= DAG
.getNode(ISD::SELECT
, DL
, MVT::i32
, Op0
, Op1
, Op2
);
2888 SDValue Trunc
= DAG
.getNode(ISD::TRUNCATE
, DL
, MVT::i1
, Select
);
2893 SDValue
NVPTXTargetLowering::LowerLOAD(SDValue Op
, SelectionDAG
&DAG
) const {
2894 if (Op
.getValueType() == MVT::i1
)
2895 return LowerLOADi1(Op
, DAG
);
2897 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
2898 // unaligned loads and have to handle it here.
2899 EVT VT
= Op
.getValueType();
2900 if (Isv2x16VT(VT
) || VT
== MVT::v4i8
) {
2901 LoadSDNode
*Load
= cast
<LoadSDNode
>(Op
);
2902 EVT MemVT
= Load
->getMemoryVT();
2903 if (!allowsMemoryAccessForAlignment(*DAG
.getContext(), DAG
.getDataLayout(),
2904 MemVT
, *Load
->getMemOperand())) {
2906 std::tie(Ops
[0], Ops
[1]) = expandUnalignedLoad(Load
, DAG
);
2907 return DAG
.getMergeValues(Ops
, SDLoc(Op
));
2916 // v1 = ld i8* addr (-> i16)
2917 // v = trunc i16 to i1
2918 SDValue
NVPTXTargetLowering::LowerLOADi1(SDValue Op
, SelectionDAG
&DAG
) const {
2919 SDNode
*Node
= Op
.getNode();
2920 LoadSDNode
*LD
= cast
<LoadSDNode
>(Node
);
2922 assert(LD
->getExtensionType() == ISD::NON_EXTLOAD
);
2923 assert(Node
->getValueType(0) == MVT::i1
&&
2924 "Custom lowering for i1 load only");
2925 SDValue newLD
= DAG
.getExtLoad(ISD::ZEXTLOAD
, dl
, MVT::i16
, LD
->getChain(),
2926 LD
->getBasePtr(), LD
->getPointerInfo(),
2927 MVT::i8
, LD
->getAlign(),
2928 LD
->getMemOperand()->getFlags());
2929 SDValue result
= DAG
.getNode(ISD::TRUNCATE
, dl
, MVT::i1
, newLD
);
2930 // The legalizer (the caller) is expecting two values from the legalized
2931 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2932 // in LegalizeDAG.cpp which also uses MergeValues.
2933 SDValue Ops
[] = { result
, LD
->getChain() };
2934 return DAG
.getMergeValues(Ops
, dl
);
2937 SDValue
NVPTXTargetLowering::LowerSTORE(SDValue Op
, SelectionDAG
&DAG
) const {
2938 StoreSDNode
*Store
= cast
<StoreSDNode
>(Op
);
2939 EVT VT
= Store
->getMemoryVT();
2942 return LowerSTOREi1(Op
, DAG
);
2944 // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2945 // stores and have to handle it here.
2946 if ((Isv2x16VT(VT
) || VT
== MVT::v4i8
) &&
2947 !allowsMemoryAccessForAlignment(*DAG
.getContext(), DAG
.getDataLayout(),
2948 VT
, *Store
->getMemOperand()))
2949 return expandUnalignedStore(Store
, DAG
);
2951 // v2f16, v2bf16 and v2i16 don't need special handling.
2952 if (Isv2x16VT(VT
) || VT
== MVT::v4i8
)
2956 return LowerSTOREVector(Op
, DAG
);
2962 NVPTXTargetLowering::LowerSTOREVector(SDValue Op
, SelectionDAG
&DAG
) const {
2963 SDNode
*N
= Op
.getNode();
2964 SDValue Val
= N
->getOperand(1);
2966 EVT ValVT
= Val
.getValueType();
2968 if (ValVT
.isVector()) {
2969 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2970 // legal. We can (and should) split that into 2 stores of <2 x double> here
2971 // but I'm leaving that as a TODO for now.
2972 if (!ValVT
.isSimple())
2974 switch (ValVT
.getSimpleVT().SimpleTy
) {
2991 case MVT::v8f16
: // <4 x f16x2>
2992 case MVT::v8bf16
: // <4 x bf16x2>
2993 case MVT::v8i16
: // <4 x i16x2>
2994 // This is a "native" vector type
2998 MemSDNode
*MemSD
= cast
<MemSDNode
>(N
);
2999 const DataLayout
&TD
= DAG
.getDataLayout();
3001 Align Alignment
= MemSD
->getAlign();
3003 TD
.getPrefTypeAlign(ValVT
.getTypeForEVT(*DAG
.getContext()));
3004 if (Alignment
< PrefAlign
) {
3005 // This store is not sufficiently aligned, so bail out and let this vector
3006 // store be scalarized. Note that we may still be able to emit smaller
3007 // vector stores. For example, if we are storing a <4 x float> with an
3008 // alignment of 8, this check will fail but the legalizer will try again
3009 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3013 unsigned Opcode
= 0;
3014 EVT EltVT
= ValVT
.getVectorElementType();
3015 unsigned NumElts
= ValVT
.getVectorNumElements();
3017 // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
3018 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3019 // stored type to i16 and propagate the "real" type as the memory type.
3020 bool NeedExt
= false;
3021 if (EltVT
.getSizeInBits() < 16)
3024 bool StoreF16x2
= false;
3029 Opcode
= NVPTXISD::StoreV2
;
3032 Opcode
= NVPTXISD::StoreV4
;
3035 // v8f16 is a special case. PTX doesn't have st.v8.f16
3036 // instruction. Instead, we split the vector into v2f16 chunks and
3037 // store them with st.v4.b32.
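    // Illustrative result (register names assumed): the final store then looks
    // like
    //   st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};
    // where each %rN carries one packed pair of 16-bit elements.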
3038 assert(Is16bitsType(EltVT
.getSimpleVT()) && "Wrong type for the vector.");
3039 Opcode
= NVPTXISD::StoreV4
;
3044 SmallVector
<SDValue
, 8> Ops
;
3046 // First is the chain
3047 Ops
.push_back(N
->getOperand(0));
3050 // Combine f16,f16 -> v2f16
3052 for (unsigned i
= 0; i
< NumElts
; ++i
) {
3053 SDValue E0
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, EltVT
, Val
,
3054 DAG
.getIntPtrConstant(i
* 2, DL
));
3055 SDValue E1
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, EltVT
, Val
,
3056 DAG
.getIntPtrConstant(i
* 2 + 1, DL
));
3057 EVT VecVT
= EVT::getVectorVT(*DAG
.getContext(), EltVT
, 2);
3058 SDValue V2
= DAG
.getNode(ISD::BUILD_VECTOR
, DL
, VecVT
, E0
, E1
);
3062 // Then the split values
3063 for (unsigned i
= 0; i
< NumElts
; ++i
) {
3064 SDValue ExtVal
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, EltVT
, Val
,
3065 DAG
.getIntPtrConstant(i
, DL
));
3067 ExtVal
= DAG
.getNode(ISD::ANY_EXTEND
, DL
, MVT::i16
, ExtVal
);
3068 Ops
.push_back(ExtVal
);
3072 // Then any remaining arguments
3073 Ops
.append(N
->op_begin() + 2, N
->op_end());
3076 DAG
.getMemIntrinsicNode(Opcode
, DL
, DAG
.getVTList(MVT::Other
), Ops
,
3077 MemSD
->getMemoryVT(), MemSD
->getMemOperand());
3079 // return DCI.CombineTo(N, NewSt, true);
3088 // v1 = zxt v to i16
3090 SDValue
NVPTXTargetLowering::LowerSTOREi1(SDValue Op
, SelectionDAG
&DAG
) const {
3091 SDNode
*Node
= Op
.getNode();
3093 StoreSDNode
*ST
= cast
<StoreSDNode
>(Node
);
3094 SDValue Tmp1
= ST
->getChain();
3095 SDValue Tmp2
= ST
->getBasePtr();
3096 SDValue Tmp3
= ST
->getValue();
3097 assert(Tmp3
.getValueType() == MVT::i1
&& "Custom lowering for i1 store only");
3098 Tmp3
= DAG
.getNode(ISD::ZERO_EXTEND
, dl
, MVT::i16
, Tmp3
);
3100 DAG
.getTruncStore(Tmp1
, dl
, Tmp3
, Tmp2
, ST
->getPointerInfo(), MVT::i8
,
3101 ST
->getAlign(), ST
->getMemOperand()->getFlags());
3105 SDValue
NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op
,
3106 SelectionDAG
&DAG
) const {
3107 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
3108 // operand so that it can pass the legalization.
3110 assert(Op
.getOperand(1).getValueType() == MVT::i128
&&
3111 "Custom lowering for 128-bit CopyToReg only");
3113 SDNode
*Node
= Op
.getNode();
3116 SDValue Cast
= DAG
.getBitcast(MVT::v2i64
, Op
->getOperand(2));
3117 SDValue Lo
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i64
, Cast
,
3118 DAG
.getIntPtrConstant(0, DL
));
3119 SDValue Hi
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i64
, Cast
,
3120 DAG
.getIntPtrConstant(1, DL
));
3122 SmallVector
<SDValue
, 5> NewOps(Op
->getNumOperands() + 1);
3123 SmallVector
<EVT
, 3> ResultsType(Node
->values());
3125 NewOps
[0] = Op
->getOperand(0); // Chain
3126 NewOps
[1] = Op
->getOperand(1); // Dst Reg
3127 NewOps
[2] = Lo
; // Lower 64-bit
3128 NewOps
[3] = Hi
; // Higher 64-bit
3129 if (Op
.getNumOperands() == 4)
3130 NewOps
[4] = Op
->getOperand(3); // Glue if exists
3132 return DAG
.getNode(ISD::CopyToReg
, DL
, ResultsType
, NewOps
);
3135 unsigned NVPTXTargetLowering::getNumRegisters(
3136 LLVMContext
&Context
, EVT VT
,
3137 std::optional
<MVT
> RegisterVT
= std::nullopt
) const {
3138 if (VT
== MVT::i128
&& RegisterVT
== MVT::i128
)
3140 return TargetLoweringBase::getNumRegisters(Context
, VT
, RegisterVT
);
3143 bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3144 SelectionDAG
&DAG
, const SDLoc
&DL
, SDValue Val
, SDValue
*Parts
,
3145 unsigned NumParts
, MVT PartVT
, std::optional
<CallingConv::ID
> CC
) const {
3146 if (Val
.getValueType() == MVT::i128
&& NumParts
== 1) {
3153 // This creates target external symbol for a function parameter.
3154 // Name of the symbol is composed from its index and the function name.
3155 // Negative index corresponds to special parameter (unsized array) used for
3156 // passing variable arguments.
3157 SDValue
NVPTXTargetLowering::getParamSymbol(SelectionDAG
&DAG
, int idx
,
3159 StringRef SavedStr
= nvTM
->getStrPool().save(
3160 getParamName(&DAG
.getMachineFunction().getFunction(), idx
));
3161 return DAG
.getTargetExternalSymbol(SavedStr
.data(), v
);
3164 SDValue
NVPTXTargetLowering::LowerFormalArguments(
3165 SDValue Chain
, CallingConv::ID CallConv
, bool isVarArg
,
3166 const SmallVectorImpl
<ISD::InputArg
> &Ins
, const SDLoc
&dl
,
3167 SelectionDAG
&DAG
, SmallVectorImpl
<SDValue
> &InVals
) const {
3168 MachineFunction
&MF
= DAG
.getMachineFunction();
3169 const DataLayout
&DL
= DAG
.getDataLayout();
3170 auto PtrVT
= getPointerTy(DAG
.getDataLayout());
3172 const Function
*F
= &MF
.getFunction();
3173 const AttributeList
&PAL
= F
->getAttributes();
3174 const TargetLowering
*TLI
= STI
.getTargetLowering();
3176 SDValue Root
= DAG
.getRoot();
3177 std::vector
<SDValue
> OutChains
;
3179 bool isABI
= (STI
.getSmVersion() >= 20);
3180 assert(isABI
&& "Non-ABI compilation is not supported");
3184 std::vector
<Type
*> argTypes
;
3185 std::vector
<const Argument
*> theArgs
;
3186 for (const Argument
&I
: F
->args()) {
3187 theArgs
.push_back(&I
);
3188 argTypes
.push_back(I
.getType());
3190 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3191 // Ins.size() will be larger
3192 // * if there is an aggregate argument with multiple fields (each field
3193 // showing up separately in Ins)
3194 // * if there is a vector argument with more than typical vector-length
3195 // elements (generally if more than 4) where each vector element is
3196 // individually present in Ins.
3197 // So a different index should be used for indexing into Ins.
3198 // See similar issue in LowerCall.
3199 unsigned InsIdx
= 0;
3201 for (unsigned i
= 0, e
= theArgs
.size(); i
!= e
; ++i
, ++InsIdx
) {
3202 Type
*Ty
= argTypes
[i
];
3204 if (theArgs
[i
]->use_empty()) {
3206 if (IsTypePassedAsArray(Ty
) && !Ty
->isVectorTy()) {
3207 SmallVector
<EVT
, 16> vtparts
;
3209 ComputePTXValueVTs(*this, DAG
.getDataLayout(), Ty
, vtparts
);
3210 if (vtparts
.empty())
3211 report_fatal_error("Empty parameter types are not supported");
3213 for (unsigned parti
= 0, parte
= vtparts
.size(); parti
!= parte
;
3215 InVals
.push_back(DAG
.getNode(ISD::UNDEF
, dl
, Ins
[InsIdx
].VT
));
3218 if (vtparts
.size() > 0)
3222 if (Ty
->isVectorTy()) {
3223 EVT ObjectVT
= getValueType(DL
, Ty
);
3224 unsigned NumRegs
= TLI
->getNumRegisters(F
->getContext(), ObjectVT
);
3225 for (unsigned parti
= 0; parti
< NumRegs
; ++parti
) {
3226 InVals
.push_back(DAG
.getNode(ISD::UNDEF
, dl
, Ins
[InsIdx
].VT
));
3233 InVals
.push_back(DAG
.getNode(ISD::UNDEF
, dl
, Ins
[InsIdx
].VT
));
3237 // In the following cases, assign a node order of "i+1"
3238 // to newly created nodes. The SDNodes for params have to
3239 // appear in the same order as their order of appearance
3240 // in the original function. "i+1" holds that order.
3241 if (!PAL
.hasParamAttr(i
, Attribute::ByVal
)) {
3242 bool aggregateIsPacked
= false;
3243 if (StructType
*STy
= dyn_cast
<StructType
>(Ty
))
3244 aggregateIsPacked
= STy
->isPacked();
3246 SmallVector
<EVT
, 16> VTs
;
3247 SmallVector
<uint64_t, 16> Offsets
;
3248 ComputePTXValueVTs(*this, DL
, Ty
, VTs
, &Offsets
, 0);
3250 report_fatal_error("Empty parameter types are not supported");
3252 Align ArgAlign
= getFunctionArgumentAlignment(
3253 F
, Ty
, i
+ AttributeList::FirstArgIndex
, DL
);
3254 auto VectorInfo
= VectorizePTXValueVTs(VTs
, Offsets
, ArgAlign
);
3256 SDValue Arg
= getParamSymbol(DAG
, i
, PtrVT
);
3257 int VecIdx
= -1; // Index of the first element of the current vector.
3258 for (unsigned parti
= 0, parte
= VTs
.size(); parti
!= parte
; ++parti
) {
3259 if (VectorInfo
[parti
] & PVF_FIRST
) {
3260 assert(VecIdx
== -1 && "Orphaned vector.");
3264 // That's the last element of this store op.
3265 if (VectorInfo
[parti
] & PVF_LAST
) {
3266 unsigned NumElts
= parti
- VecIdx
+ 1;
3267 EVT EltVT
= VTs
[parti
];
3268 // i1 is loaded/stored as i8.
3270 if (EltVT
== MVT::i1
)
3272 else if (Isv2x16VT(EltVT
) || EltVT
== MVT::v4i8
)
3273 // getLoad needs a vector type, but it can't handle
3274 // vectors which contain v2f16 or v2bf16 elements. So we must load
3275 // using i32 here and then bitcast back.
3278 EVT VecVT
= EVT::getVectorVT(F
->getContext(), LoadVT
, NumElts
);
3280 DAG
.getNode(ISD::ADD
, dl
, PtrVT
, Arg
,
3281 DAG
.getConstant(Offsets
[VecIdx
], dl
, PtrVT
));
3282 Value
*srcValue
= Constant::getNullValue(PointerType::get(
3283 EltVT
.getTypeForEVT(F
->getContext()), ADDRESS_SPACE_PARAM
));
3285 const MaybeAlign PartAlign
= [&]() -> MaybeAlign
{
3286 if (aggregateIsPacked
)
3289 return std::nullopt
;
3291 DL
.getABITypeAlign(EltVT
.getTypeForEVT(F
->getContext()));
3292 return commonAlignment(PartAlign
, Offsets
[parti
]);
3294 SDValue P
= DAG
.getLoad(VecVT
, dl
, Root
, VecAddr
,
3295 MachinePointerInfo(srcValue
), PartAlign
,
3296 MachineMemOperand::MODereferenceable
|
3297 MachineMemOperand::MOInvariant
);
3299 P
.getNode()->setIROrder(i
+ 1);
3300 for (unsigned j
= 0; j
< NumElts
; ++j
) {
3301 SDValue Elt
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, dl
, LoadVT
, P
,
3302 DAG
.getIntPtrConstant(j
, dl
));
3303 // We've loaded i1 as an i8 and now must truncate it back to i1
3304 if (EltVT
== MVT::i1
)
3305 Elt
= DAG
.getNode(ISD::TRUNCATE
, dl
, MVT::i1
, Elt
);
3306 // v2f16 was loaded as an i32. Now we must bitcast it back.
3307 else if (EltVT
!= LoadVT
)
3308 Elt
= DAG
.getNode(ISD::BITCAST
, dl
, EltVT
, Elt
);
3310 // If a promoted integer type is used, truncate down to the original
3312 if (PromoteScalarIntegerPTX(EltVT
, &PromotedVT
)) {
3313 Elt
= DAG
.getNode(ISD::TRUNCATE
, dl
, EltVT
, Elt
);
3316 // Extend the element if necessary (e.g. an i8 is loaded
3317 // into an i16 register)
3318 if (Ins
[InsIdx
].VT
.isInteger() &&
3319 Ins
[InsIdx
].VT
.getFixedSizeInBits() >
3320 LoadVT
.getFixedSizeInBits()) {
3321 unsigned Extend
= Ins
[InsIdx
].Flags
.isSExt() ? ISD::SIGN_EXTEND
3323 Elt
= DAG
.getNode(Extend
, dl
, Ins
[InsIdx
].VT
, Elt
);
3325 InVals
.push_back(Elt
);
3328 // Reset vector tracking state.
3338 // Param has ByVal attribute
3339 // Return MoveParam(param symbol).
3340 // Ideally, the param symbol can be returned directly,
3341 // but when SDNode builder decides to use it in a CopyToReg(),
3342 // machine instruction fails because TargetExternalSymbol
3343 // (not lowered) is target dependent, and CopyToReg assumes
3344 // the source is lowered.
3345 EVT ObjectVT
= getValueType(DL
, Ty
);
3346 assert(ObjectVT
== Ins
[InsIdx
].VT
&&
3347 "Ins type did not match function type");
3348 SDValue Arg
= getParamSymbol(DAG
, i
, PtrVT
);
3349 SDValue p
= DAG
.getNode(NVPTXISD::MoveParam
, dl
, ObjectVT
, Arg
);
3351 p
.getNode()->setIROrder(i
+ 1);
3352 InVals
.push_back(p
);
3355 if (!OutChains
.empty())
3356 DAG
.setRoot(DAG
.getNode(ISD::TokenFactor
, dl
, MVT::Other
, OutChains
));
3361 // Use byte-store when the param adress of the return value is unaligned.
3362 // This may happen when the return value is a field of a packed structure.
3363 static SDValue
LowerUnalignedStoreRet(SelectionDAG
&DAG
, SDValue Chain
,
3364 uint64_t Offset
, EVT ElementType
,
3365 SDValue RetVal
, const SDLoc
&dl
) {
3366 // Bit logic only works on integer types
3367 if (adjustElementType(ElementType
))
3368 RetVal
= DAG
.getNode(ISD::BITCAST
, dl
, ElementType
, RetVal
);
3371 for (unsigned i
= 0, n
= ElementType
.getSizeInBits() / 8; i
< n
; i
++) {
3372 // Shift the byte to the last byte position
3373 SDValue ShiftVal
= DAG
.getNode(ISD::SRL
, dl
, ElementType
, RetVal
,
3374 DAG
.getConstant(i
* 8, dl
, MVT::i32
));
3375 SDValue StoreOperands
[] = {Chain
, DAG
.getConstant(Offset
+ i
, dl
, MVT::i32
),
3377 // Trunc store only the last byte by using
3379 // The register type can be larger than b8.
3380 Chain
= DAG
.getMemIntrinsicNode(NVPTXISD::StoreRetval
, dl
,
3381 DAG
.getVTList(MVT::Other
), StoreOperands
,
3382 MVT::i8
, MachinePointerInfo(), std::nullopt
,
3383 MachineMemOperand::MOStore
);
3389 NVPTXTargetLowering::LowerReturn(SDValue Chain
, CallingConv::ID CallConv
,
3391 const SmallVectorImpl
<ISD::OutputArg
> &Outs
,
3392 const SmallVectorImpl
<SDValue
> &OutVals
,
3393 const SDLoc
&dl
, SelectionDAG
&DAG
) const {
3394 const MachineFunction
&MF
= DAG
.getMachineFunction();
3395 const Function
&F
= MF
.getFunction();
3396 Type
*RetTy
= MF
.getFunction().getReturnType();
3398 bool isABI
= (STI
.getSmVersion() >= 20);
3399 assert(isABI
&& "Non-ABI compilation is not supported");
3403 const DataLayout
&DL
= DAG
.getDataLayout();
3404 SmallVector
<SDValue
, 16> PromotedOutVals
;
3405 SmallVector
<EVT
, 16> VTs
;
3406 SmallVector
<uint64_t, 16> Offsets
;
3407 ComputePTXValueVTs(*this, DL
, RetTy
, VTs
, &Offsets
);
3408 assert(VTs
.size() == OutVals
.size() && "Bad return value decomposition");
3410 for (unsigned i
= 0, e
= VTs
.size(); i
!= e
; ++i
) {
3411 SDValue PromotedOutVal
= OutVals
[i
];
3413 if (PromoteScalarIntegerPTX(VTs
[i
], &PromotedVT
)) {
3414 VTs
[i
] = EVT(PromotedVT
);
3416 if (PromoteScalarIntegerPTX(PromotedOutVal
.getValueType(), &PromotedVT
)) {
3417 llvm::ISD::NodeType Ext
=
3418 Outs
[i
].Flags
.isSExt() ? ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND
;
3419 PromotedOutVal
= DAG
.getNode(Ext
, dl
, PromotedVT
, PromotedOutVal
);
3421 PromotedOutVals
.push_back(PromotedOutVal
);
3424 auto VectorInfo
= VectorizePTXValueVTs(
3426 RetTy
->isSized() ? getFunctionParamOptimizedAlign(&F
, RetTy
, DL
)
3429 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3430 // 32-bits are sign extended or zero extended, depending on whether
3431 // they are signed or unsigned types.
3432 bool ExtendIntegerRetVal
=
3433 RetTy
->isIntegerTy() && DL
.getTypeAllocSizeInBits(RetTy
) < 32;
3435 SmallVector
<SDValue
, 6> StoreOperands
;
3436 for (unsigned i
= 0, e
= VTs
.size(); i
!= e
; ++i
) {
3437 SDValue OutVal
= OutVals
[i
];
3438 SDValue RetVal
= PromotedOutVals
[i
];
3440 if (ExtendIntegerRetVal
) {
3441 RetVal
= DAG
.getNode(Outs
[i
].Flags
.isSExt() ? ISD::SIGN_EXTEND
3443 dl
, MVT::i32
, RetVal
);
3444 } else if (OutVal
.getValueSizeInBits() < 16) {
3445 // Use 16-bit registers for small load-stores as it's the
3446 // smallest general purpose register size supported by NVPTX.
3447 RetVal
= DAG
.getNode(ISD::ANY_EXTEND
, dl
, MVT::i16
, RetVal
);
3450 // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
3451 // for a scalar store. In such cases, fall back to byte stores.
3452 if (VectorInfo
[i
] == PVF_SCALAR
&& RetTy
->isAggregateType()) {
3453 EVT ElementType
= ExtendIntegerRetVal
? MVT::i32
: VTs
[i
];
3454 Align ElementTypeAlign
=
3455 DL
.getABITypeAlign(ElementType
.getTypeForEVT(RetTy
->getContext()));
3456 Align ElementAlign
=
3457 commonAlignment(DL
.getABITypeAlign(RetTy
), Offsets
[i
]);
3458 if (ElementAlign
< ElementTypeAlign
) {
3459 assert(StoreOperands
.empty() && "Orphaned operand list.");
3460 Chain
= LowerUnalignedStoreRet(DAG
, Chain
, Offsets
[i
], ElementType
,
3463 // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
3464 // into the graph, so just move on to the next element.
3469 // New load/store. Record chain and offset operands.
3470 if (VectorInfo
[i
] & PVF_FIRST
) {
3471 assert(StoreOperands
.empty() && "Orphaned operand list.");
3472 StoreOperands
.push_back(Chain
);
3473 StoreOperands
.push_back(DAG
.getConstant(Offsets
[i
], dl
, MVT::i32
));
3476 // Record the value to return.
3477 StoreOperands
.push_back(RetVal
);
3479 // That's the last element of this store op.
3480 if (VectorInfo
[i
] & PVF_LAST
) {
3481 NVPTXISD::NodeType Op
;
3482 unsigned NumElts
= StoreOperands
.size() - 2;
3485 Op
= NVPTXISD::StoreRetval
;
3488 Op
= NVPTXISD::StoreRetvalV2
;
3491 Op
= NVPTXISD::StoreRetvalV4
;
3494 llvm_unreachable("Invalid vector info.");
3497 // Adjust type of load/store op if we've extended the scalar
3499 EVT TheStoreType
= ExtendIntegerRetVal
? MVT::i32
: VTs
[i
];
3500 Chain
= DAG
.getMemIntrinsicNode(
3501 Op
, dl
, DAG
.getVTList(MVT::Other
), StoreOperands
, TheStoreType
,
3502 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore
);
3503 // Cleanup vector state.
3504 StoreOperands
.clear();
3508 return DAG
.getNode(NVPTXISD::RET_GLUE
, dl
, MVT::Other
, Chain
);
3511 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3512 SDValue Op
, StringRef Constraint
, std::vector
<SDValue
> &Ops
,
3513 SelectionDAG
&DAG
) const {
3514 if (Constraint
.size() > 1)
3516 TargetLowering::LowerAsmOperandForConstraint(Op
, Constraint
, Ops
, DAG
);
3519 static unsigned getOpcForTextureInstr(unsigned Intrinsic
) {
3520 switch (Intrinsic
) {
3524 case Intrinsic::nvvm_tex_1d_v4f32_s32
:
3525 return NVPTXISD::Tex1DFloatS32
;
3526 case Intrinsic::nvvm_tex_1d_v4f32_f32
:
3527 return NVPTXISD::Tex1DFloatFloat
;
3528 case Intrinsic::nvvm_tex_1d_level_v4f32_f32
:
3529 return NVPTXISD::Tex1DFloatFloatLevel
;
3530 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32
:
3531 return NVPTXISD::Tex1DFloatFloatGrad
;
3532 case Intrinsic::nvvm_tex_1d_v4s32_s32
:
3533 return NVPTXISD::Tex1DS32S32
;
3534 case Intrinsic::nvvm_tex_1d_v4s32_f32
:
3535 return NVPTXISD::Tex1DS32Float
;
3536 case Intrinsic::nvvm_tex_1d_level_v4s32_f32
:
3537 return NVPTXISD::Tex1DS32FloatLevel
;
3538 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32
:
3539 return NVPTXISD::Tex1DS32FloatGrad
;
3540 case Intrinsic::nvvm_tex_1d_v4u32_s32
:
3541 return NVPTXISD::Tex1DU32S32
;
3542 case Intrinsic::nvvm_tex_1d_v4u32_f32
:
3543 return NVPTXISD::Tex1DU32Float
;
3544 case Intrinsic::nvvm_tex_1d_level_v4u32_f32
:
3545 return NVPTXISD::Tex1DU32FloatLevel
;
3546 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32
:
3547 return NVPTXISD::Tex1DU32FloatGrad
;
3549 case Intrinsic::nvvm_tex_1d_array_v4f32_s32
:
3550 return NVPTXISD::Tex1DArrayFloatS32
;
3551 case Intrinsic::nvvm_tex_1d_array_v4f32_f32
:
3552 return NVPTXISD::Tex1DArrayFloatFloat
;
3553 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32
:
3554 return NVPTXISD::Tex1DArrayFloatFloatLevel
;
3555 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32
:
3556 return NVPTXISD::Tex1DArrayFloatFloatGrad
;
3557 case Intrinsic::nvvm_tex_1d_array_v4s32_s32
:
3558 return NVPTXISD::Tex1DArrayS32S32
;
3559 case Intrinsic::nvvm_tex_1d_array_v4s32_f32
:
3560 return NVPTXISD::Tex1DArrayS32Float
;
3561 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32
:
3562 return NVPTXISD::Tex1DArrayS32FloatLevel
;
3563 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32
:
3564 return NVPTXISD::Tex1DArrayS32FloatGrad
;
3565 case Intrinsic::nvvm_tex_1d_array_v4u32_s32
:
3566 return NVPTXISD::Tex1DArrayU32S32
;
3567 case Intrinsic::nvvm_tex_1d_array_v4u32_f32
:
3568 return NVPTXISD::Tex1DArrayU32Float
;
3569 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32
:
3570 return NVPTXISD::Tex1DArrayU32FloatLevel
;
3571 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32
:
3572 return NVPTXISD::Tex1DArrayU32FloatGrad
;
3574 case Intrinsic::nvvm_tex_2d_v4f32_s32
:
3575 return NVPTXISD::Tex2DFloatS32
;
3576 case Intrinsic::nvvm_tex_2d_v4f32_f32
:
3577 return NVPTXISD::Tex2DFloatFloat
;
3578 case Intrinsic::nvvm_tex_2d_level_v4f32_f32
:
3579 return NVPTXISD::Tex2DFloatFloatLevel
;
3580 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32
:
3581 return NVPTXISD::Tex2DFloatFloatGrad
;
3582 case Intrinsic::nvvm_tex_2d_v4s32_s32
:
3583 return NVPTXISD::Tex2DS32S32
;
3584 case Intrinsic::nvvm_tex_2d_v4s32_f32
:
3585 return NVPTXISD::Tex2DS32Float
;
3586 case Intrinsic::nvvm_tex_2d_level_v4s32_f32
:
3587 return NVPTXISD::Tex2DS32FloatLevel
;
3588 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32
:
3589 return NVPTXISD::Tex2DS32FloatGrad
;
3590 case Intrinsic::nvvm_tex_2d_v4u32_s32
:
3591 return NVPTXISD::Tex2DU32S32
;
3592 case Intrinsic::nvvm_tex_2d_v4u32_f32
:
3593 return NVPTXISD::Tex2DU32Float
;
3594 case Intrinsic::nvvm_tex_2d_level_v4u32_f32
:
3595 return NVPTXISD::Tex2DU32FloatLevel
;
3596 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32
:
3597 return NVPTXISD::Tex2DU32FloatGrad
;
3599 case Intrinsic::nvvm_tex_2d_array_v4f32_s32
:
3600 return NVPTXISD::Tex2DArrayFloatS32
;
3601 case Intrinsic::nvvm_tex_2d_array_v4f32_f32
:
3602 return NVPTXISD::Tex2DArrayFloatFloat
;
3603 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32
:
3604 return NVPTXISD::Tex2DArrayFloatFloatLevel
;
3605 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32
:
3606 return NVPTXISD::Tex2DArrayFloatFloatGrad
;
3607 case Intrinsic::nvvm_tex_2d_array_v4s32_s32
:
3608 return NVPTXISD::Tex2DArrayS32S32
;
3609 case Intrinsic::nvvm_tex_2d_array_v4s32_f32
:
3610 return NVPTXISD::Tex2DArrayS32Float
;
3611 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32
:
3612 return NVPTXISD::Tex2DArrayS32FloatLevel
;
3613 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32
:
3614 return NVPTXISD::Tex2DArrayS32FloatGrad
;
3615 case Intrinsic::nvvm_tex_2d_array_v4u32_s32
:
3616 return NVPTXISD::Tex2DArrayU32S32
;
3617 case Intrinsic::nvvm_tex_2d_array_v4u32_f32
:
3618 return NVPTXISD::Tex2DArrayU32Float
;
3619 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32
:
3620 return NVPTXISD::Tex2DArrayU32FloatLevel
;
3621 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32
:
3622 return NVPTXISD::Tex2DArrayU32FloatGrad
;
3624 case Intrinsic::nvvm_tex_3d_v4f32_s32
:
3625 return NVPTXISD::Tex3DFloatS32
;
3626 case Intrinsic::nvvm_tex_3d_v4f32_f32
:
3627 return NVPTXISD::Tex3DFloatFloat
;
3628 case Intrinsic::nvvm_tex_3d_level_v4f32_f32
:
3629 return NVPTXISD::Tex3DFloatFloatLevel
;
3630 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32
:
3631 return NVPTXISD::Tex3DFloatFloatGrad
;
3632 case Intrinsic::nvvm_tex_3d_v4s32_s32
:
3633 return NVPTXISD::Tex3DS32S32
;
3634 case Intrinsic::nvvm_tex_3d_v4s32_f32
:
3635 return NVPTXISD::Tex3DS32Float
;
3636 case Intrinsic::nvvm_tex_3d_level_v4s32_f32
:
3637 return NVPTXISD::Tex3DS32FloatLevel
;
3638 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32
:
3639 return NVPTXISD::Tex3DS32FloatGrad
;
3640 case Intrinsic::nvvm_tex_3d_v4u32_s32
:
3641 return NVPTXISD::Tex3DU32S32
;
3642 case Intrinsic::nvvm_tex_3d_v4u32_f32
:
3643 return NVPTXISD::Tex3DU32Float
;
3644 case Intrinsic::nvvm_tex_3d_level_v4u32_f32
:
3645 return NVPTXISD::Tex3DU32FloatLevel
;
3646 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32
:
3647 return NVPTXISD::Tex3DU32FloatGrad
;
3649 case Intrinsic::nvvm_tex_cube_v4f32_f32
:
3650 return NVPTXISD::TexCubeFloatFloat
;
3651 case Intrinsic::nvvm_tex_cube_level_v4f32_f32
:
3652 return NVPTXISD::TexCubeFloatFloatLevel
;
3653 case Intrinsic::nvvm_tex_cube_v4s32_f32
:
3654 return NVPTXISD::TexCubeS32Float
;
3655 case Intrinsic::nvvm_tex_cube_level_v4s32_f32
:
3656 return NVPTXISD::TexCubeS32FloatLevel
;
3657 case Intrinsic::nvvm_tex_cube_v4u32_f32
:
3658 return NVPTXISD::TexCubeU32Float
;
3659 case Intrinsic::nvvm_tex_cube_level_v4u32_f32
:
3660 return NVPTXISD::TexCubeU32FloatLevel
;
3662 case Intrinsic::nvvm_tex_cube_array_v4f32_f32
:
3663 return NVPTXISD::TexCubeArrayFloatFloat
;
3664 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32
:
3665 return NVPTXISD::TexCubeArrayFloatFloatLevel
;
3666 case Intrinsic::nvvm_tex_cube_array_v4s32_f32
:
3667 return NVPTXISD::TexCubeArrayS32Float
;
3668 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32
:
3669 return NVPTXISD::TexCubeArrayS32FloatLevel
;
3670 case Intrinsic::nvvm_tex_cube_array_v4u32_f32
:
3671 return NVPTXISD::TexCubeArrayU32Float
;
3672 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32
:
3673 return NVPTXISD::TexCubeArrayU32FloatLevel
;
3675 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32
:
3676 return NVPTXISD::Tld4R2DFloatFloat
;
3677 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32
:
3678 return NVPTXISD::Tld4G2DFloatFloat
;
3679 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32
:
3680 return NVPTXISD::Tld4B2DFloatFloat
;
3681 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32
:
3682 return NVPTXISD::Tld4A2DFloatFloat
;
3683 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32
:
3684 return NVPTXISD::Tld4R2DS64Float
;
3685 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32
:
3686 return NVPTXISD::Tld4G2DS64Float
;
3687 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32
:
3688 return NVPTXISD::Tld4B2DS64Float
;
3689 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32
:
3690 return NVPTXISD::Tld4A2DS64Float
;
3691 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32
:
3692 return NVPTXISD::Tld4R2DU64Float
;
3693 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32
:
3694 return NVPTXISD::Tld4G2DU64Float
;
3695 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32
:
3696 return NVPTXISD::Tld4B2DU64Float
;
3697 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32
:
3698 return NVPTXISD::Tld4A2DU64Float
;
3700 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32
:
3701 return NVPTXISD::TexUnified1DFloatS32
;
3702 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32
:
3703 return NVPTXISD::TexUnified1DFloatFloat
;
3704 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32
:
3705 return NVPTXISD::TexUnified1DFloatFloatLevel
;
3706 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32
:
3707 return NVPTXISD::TexUnified1DFloatFloatGrad
;
3708 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32
:
3709 return NVPTXISD::TexUnified1DS32S32
;
3710 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32
:
3711 return NVPTXISD::TexUnified1DS32Float
;
3712 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32
:
3713 return NVPTXISD::TexUnified1DS32FloatLevel
;
3714 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32
:
3715 return NVPTXISD::TexUnified1DS32FloatGrad
;
3716 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32
:
3717 return NVPTXISD::TexUnified1DU32S32
;
3718 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32
:
3719 return NVPTXISD::TexUnified1DU32Float
;
3720 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32
:
3721 return NVPTXISD::TexUnified1DU32FloatLevel
;
3722 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32
:
3723 return NVPTXISD::TexUnified1DU32FloatGrad
;
3725 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32
:
3726 return NVPTXISD::TexUnified1DArrayFloatS32
;
3727 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32
:
3728 return NVPTXISD::TexUnified1DArrayFloatFloat
;
3729 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32
:
3730 return NVPTXISD::TexUnified1DArrayFloatFloatLevel
;
3731 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32
:
3732 return NVPTXISD::TexUnified1DArrayFloatFloatGrad
;
3733 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32
:
3734 return NVPTXISD::TexUnified1DArrayS32S32
;
3735 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32
:
3736 return NVPTXISD::TexUnified1DArrayS32Float
;
3737 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32
:
3738 return NVPTXISD::TexUnified1DArrayS32FloatLevel
;
3739 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32
:
3740 return NVPTXISD::TexUnified1DArrayS32FloatGrad
;
3741 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32
:
3742 return NVPTXISD::TexUnified1DArrayU32S32
;
3743 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32
:
3744 return NVPTXISD::TexUnified1DArrayU32Float
;
3745 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32
:
3746 return NVPTXISD::TexUnified1DArrayU32FloatLevel
;
3747 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32
:
3748 return NVPTXISD::TexUnified1DArrayU32FloatGrad
;
3750 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32
:
3751 return NVPTXISD::TexUnified2DFloatS32
;
3752 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32
:
3753 return NVPTXISD::TexUnified2DFloatFloat
;
3754 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32
:
3755 return NVPTXISD::TexUnified2DFloatFloatLevel
;
3756 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32
:
3757 return NVPTXISD::TexUnified2DFloatFloatGrad
;
3758 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32
:
3759 return NVPTXISD::TexUnified2DS32S32
;
3760 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32
:
3761 return NVPTXISD::TexUnified2DS32Float
;
3762 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32
:
3763 return NVPTXISD::TexUnified2DS32FloatLevel
;
3764 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32
:
3765 return NVPTXISD::TexUnified2DS32FloatGrad
;
3766 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32
:
3767 return NVPTXISD::TexUnified2DU32S32
;
3768 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32
:
3769 return NVPTXISD::TexUnified2DU32Float
;
3770 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32
:
3771 return NVPTXISD::TexUnified2DU32FloatLevel
;
3772 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32
:
3773 return NVPTXISD::TexUnified2DU32FloatGrad
;
3775 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32
:
3776 return NVPTXISD::TexUnified2DArrayFloatS32
;
3777 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32
:
3778 return NVPTXISD::TexUnified2DArrayFloatFloat
;
3779 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32
:
3780 return NVPTXISD::TexUnified2DArrayFloatFloatLevel
;
3781 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32
:
3782 return NVPTXISD::TexUnified2DArrayFloatFloatGrad
;
3783 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32
:
3784 return NVPTXISD::TexUnified2DArrayS32S32
;
3785 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32
:
3786 return NVPTXISD::TexUnified2DArrayS32Float
;
3787 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32
:
3788 return NVPTXISD::TexUnified2DArrayS32FloatLevel
;
3789 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32
:
3790 return NVPTXISD::TexUnified2DArrayS32FloatGrad
;
3791 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32
:
3792 return NVPTXISD::TexUnified2DArrayU32S32
;
3793 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32
:
3794 return NVPTXISD::TexUnified2DArrayU32Float
;
3795 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32
:
3796 return NVPTXISD::TexUnified2DArrayU32FloatLevel
;
3797 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32
:
3798 return NVPTXISD::TexUnified2DArrayU32FloatGrad
;
3800 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32
:
3801 return NVPTXISD::TexUnified3DFloatS32
;
3802 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32
:
3803 return NVPTXISD::TexUnified3DFloatFloat
;
3804 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32
:
3805 return NVPTXISD::TexUnified3DFloatFloatLevel
;
3806 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32
:
3807 return NVPTXISD::TexUnified3DFloatFloatGrad
;
3808 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32
:
3809 return NVPTXISD::TexUnified3DS32S32
;
3810 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32
:
3811 return NVPTXISD::TexUnified3DS32Float
;
3812 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32
:
3813 return NVPTXISD::TexUnified3DS32FloatLevel
;
3814 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32
:
3815 return NVPTXISD::TexUnified3DS32FloatGrad
;
3816 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32
:
3817 return NVPTXISD::TexUnified3DU32S32
;
3818 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32
:
3819 return NVPTXISD::TexUnified3DU32Float
;
3820 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32
:
3821 return NVPTXISD::TexUnified3DU32FloatLevel
;
3822 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32
:
3823 return NVPTXISD::TexUnified3DU32FloatGrad
;
3825 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32
:
3826 return NVPTXISD::TexUnifiedCubeFloatFloat
;
3827 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32
:
3828 return NVPTXISD::TexUnifiedCubeFloatFloatLevel
;
3829 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32
:
3830 return NVPTXISD::TexUnifiedCubeS32Float
;
3831 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32
:
3832 return NVPTXISD::TexUnifiedCubeS32FloatLevel
;
3833 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32
:
3834 return NVPTXISD::TexUnifiedCubeU32Float
;
3835 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32
:
3836 return NVPTXISD::TexUnifiedCubeU32FloatLevel
;
3838 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32
:
3839 return NVPTXISD::TexUnifiedCubeArrayFloatFloat
;
3840 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32
:
3841 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel
;
3842 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32
:
3843 return NVPTXISD::TexUnifiedCubeArrayS32Float
;
3844 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32
:
3845 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel
;
3846 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32
:
3847 return NVPTXISD::TexUnifiedCubeArrayU32Float
;
3848 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32
:
3849 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel
;
3851 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32
:
3852 return NVPTXISD::TexUnifiedCubeFloatFloatGrad
;
3853 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32
:
3854 return NVPTXISD::TexUnifiedCubeS32FloatGrad
;
3855 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32
:
3856 return NVPTXISD::TexUnifiedCubeU32FloatGrad
;
3857 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32
:
3858 return NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad
;
3859 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32
:
3860 return NVPTXISD::TexUnifiedCubeArrayS32FloatGrad
;
3861 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32
:
3862 return NVPTXISD::TexUnifiedCubeArrayU32FloatGrad
;
3864 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32
:
3865 return NVPTXISD::Tld4UnifiedR2DFloatFloat
;
3866 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32
:
3867 return NVPTXISD::Tld4UnifiedG2DFloatFloat
;
3868 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32
:
3869 return NVPTXISD::Tld4UnifiedB2DFloatFloat
;
3870 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32
:
3871 return NVPTXISD::Tld4UnifiedA2DFloatFloat
;
3872 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32
:
3873 return NVPTXISD::Tld4UnifiedR2DS64Float
;
3874 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32
:
3875 return NVPTXISD::Tld4UnifiedG2DS64Float
;
3876 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32
:
3877 return NVPTXISD::Tld4UnifiedB2DS64Float
;
3878 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32
:
3879 return NVPTXISD::Tld4UnifiedA2DS64Float
;
3880 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32
:
3881 return NVPTXISD::Tld4UnifiedR2DU64Float
;
3882 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32
:
3883 return NVPTXISD::Tld4UnifiedG2DU64Float
;
3884 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32
:
3885 return NVPTXISD::Tld4UnifiedB2DU64Float
;
3886 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32
:
3887 return NVPTXISD::Tld4UnifiedA2DU64Float
;
3891 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic
) {
3892 switch (Intrinsic
) {
3895 case Intrinsic::nvvm_suld_1d_i8_clamp
:
3896 return NVPTXISD::Suld1DI8Clamp
;
3897 case Intrinsic::nvvm_suld_1d_i16_clamp
:
3898 return NVPTXISD::Suld1DI16Clamp
;
3899 case Intrinsic::nvvm_suld_1d_i32_clamp
:
3900 return NVPTXISD::Suld1DI32Clamp
;
3901 case Intrinsic::nvvm_suld_1d_i64_clamp
:
3902 return NVPTXISD::Suld1DI64Clamp
;
3903 case Intrinsic::nvvm_suld_1d_v2i8_clamp
:
3904 return NVPTXISD::Suld1DV2I8Clamp
;
3905 case Intrinsic::nvvm_suld_1d_v2i16_clamp
:
3906 return NVPTXISD::Suld1DV2I16Clamp
;
3907 case Intrinsic::nvvm_suld_1d_v2i32_clamp
:
3908 return NVPTXISD::Suld1DV2I32Clamp
;
3909 case Intrinsic::nvvm_suld_1d_v2i64_clamp
:
3910 return NVPTXISD::Suld1DV2I64Clamp
;
3911 case Intrinsic::nvvm_suld_1d_v4i8_clamp
:
3912 return NVPTXISD::Suld1DV4I8Clamp
;
3913 case Intrinsic::nvvm_suld_1d_v4i16_clamp
:
3914 return NVPTXISD::Suld1DV4I16Clamp
;
3915 case Intrinsic::nvvm_suld_1d_v4i32_clamp
:
3916 return NVPTXISD::Suld1DV4I32Clamp
;
3917 case Intrinsic::nvvm_suld_1d_array_i8_clamp
:
3918 return NVPTXISD::Suld1DArrayI8Clamp
;
3919 case Intrinsic::nvvm_suld_1d_array_i16_clamp
:
3920 return NVPTXISD::Suld1DArrayI16Clamp
;
3921 case Intrinsic::nvvm_suld_1d_array_i32_clamp
:
3922 return NVPTXISD::Suld1DArrayI32Clamp
;
3923 case Intrinsic::nvvm_suld_1d_array_i64_clamp
:
3924 return NVPTXISD::Suld1DArrayI64Clamp
;
3925 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp
:
3926 return NVPTXISD::Suld1DArrayV2I8Clamp
;
3927 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp
:
3928 return NVPTXISD::Suld1DArrayV2I16Clamp
;
3929 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp
:
3930 return NVPTXISD::Suld1DArrayV2I32Clamp
;
3931 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp
:
3932 return NVPTXISD::Suld1DArrayV2I64Clamp
;
3933 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp
:
3934 return NVPTXISD::Suld1DArrayV4I8Clamp
;
3935 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp
:
3936 return NVPTXISD::Suld1DArrayV4I16Clamp
;
3937 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp
:
3938 return NVPTXISD::Suld1DArrayV4I32Clamp
;
3939 case Intrinsic::nvvm_suld_2d_i8_clamp
:
3940 return NVPTXISD::Suld2DI8Clamp
;
3941 case Intrinsic::nvvm_suld_2d_i16_clamp
:
3942 return NVPTXISD::Suld2DI16Clamp
;
3943 case Intrinsic::nvvm_suld_2d_i32_clamp
:
3944 return NVPTXISD::Suld2DI32Clamp
;
3945 case Intrinsic::nvvm_suld_2d_i64_clamp
:
3946 return NVPTXISD::Suld2DI64Clamp
;
3947 case Intrinsic::nvvm_suld_2d_v2i8_clamp
:
3948 return NVPTXISD::Suld2DV2I8Clamp
;
3949 case Intrinsic::nvvm_suld_2d_v2i16_clamp
:
3950 return NVPTXISD::Suld2DV2I16Clamp
;
3951 case Intrinsic::nvvm_suld_2d_v2i32_clamp
:
3952 return NVPTXISD::Suld2DV2I32Clamp
;
3953 case Intrinsic::nvvm_suld_2d_v2i64_clamp
:
3954 return NVPTXISD::Suld2DV2I64Clamp
;
3955 case Intrinsic::nvvm_suld_2d_v4i8_clamp
:
3956 return NVPTXISD::Suld2DV4I8Clamp
;
3957 case Intrinsic::nvvm_suld_2d_v4i16_clamp
:
3958 return NVPTXISD::Suld2DV4I16Clamp
;
3959 case Intrinsic::nvvm_suld_2d_v4i32_clamp
:
3960 return NVPTXISD::Suld2DV4I32Clamp
;
3961 case Intrinsic::nvvm_suld_2d_array_i8_clamp
:
3962 return NVPTXISD::Suld2DArrayI8Clamp
;
3963 case Intrinsic::nvvm_suld_2d_array_i16_clamp
:
3964 return NVPTXISD::Suld2DArrayI16Clamp
;
3965 case Intrinsic::nvvm_suld_2d_array_i32_clamp
:
3966 return NVPTXISD::Suld2DArrayI32Clamp
;
3967 case Intrinsic::nvvm_suld_2d_array_i64_clamp
:
3968 return NVPTXISD::Suld2DArrayI64Clamp
;
3969 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp
:
3970 return NVPTXISD::Suld2DArrayV2I8Clamp
;
3971 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp
:
3972 return NVPTXISD::Suld2DArrayV2I16Clamp
;
3973 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp
:
3974 return NVPTXISD::Suld2DArrayV2I32Clamp
;
3975 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp
:
3976 return NVPTXISD::Suld2DArrayV2I64Clamp
;
3977 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp
:
3978 return NVPTXISD::Suld2DArrayV4I8Clamp
;
3979 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp
:
3980 return NVPTXISD::Suld2DArrayV4I16Clamp
;
3981 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp
:
3982 return NVPTXISD::Suld2DArrayV4I32Clamp
;
3983 case Intrinsic::nvvm_suld_3d_i8_clamp
:
3984 return NVPTXISD::Suld3DI8Clamp
;
3985 case Intrinsic::nvvm_suld_3d_i16_clamp
:
3986 return NVPTXISD::Suld3DI16Clamp
;
3987 case Intrinsic::nvvm_suld_3d_i32_clamp
:
3988 return NVPTXISD::Suld3DI32Clamp
;
3989 case Intrinsic::nvvm_suld_3d_i64_clamp
:
3990 return NVPTXISD::Suld3DI64Clamp
;
3991 case Intrinsic::nvvm_suld_3d_v2i8_clamp
:
3992 return NVPTXISD::Suld3DV2I8Clamp
;
3993 case Intrinsic::nvvm_suld_3d_v2i16_clamp
:
3994 return NVPTXISD::Suld3DV2I16Clamp
;
3995 case Intrinsic::nvvm_suld_3d_v2i32_clamp
:
3996 return NVPTXISD::Suld3DV2I32Clamp
;
3997 case Intrinsic::nvvm_suld_3d_v2i64_clamp
:
3998 return NVPTXISD::Suld3DV2I64Clamp
;
3999 case Intrinsic::nvvm_suld_3d_v4i8_clamp
:
4000 return NVPTXISD::Suld3DV4I8Clamp
;
4001 case Intrinsic::nvvm_suld_3d_v4i16_clamp
:
4002 return NVPTXISD::Suld3DV4I16Clamp
;
4003 case Intrinsic::nvvm_suld_3d_v4i32_clamp
:
4004 return NVPTXISD::Suld3DV4I32Clamp
;
4005 case Intrinsic::nvvm_suld_1d_i8_trap
:
4006 return NVPTXISD::Suld1DI8Trap
;
4007 case Intrinsic::nvvm_suld_1d_i16_trap
:
4008 return NVPTXISD::Suld1DI16Trap
;
4009 case Intrinsic::nvvm_suld_1d_i32_trap
:
4010 return NVPTXISD::Suld1DI32Trap
;
4011 case Intrinsic::nvvm_suld_1d_i64_trap
:
4012 return NVPTXISD::Suld1DI64Trap
;
4013 case Intrinsic::nvvm_suld_1d_v2i8_trap
:
4014 return NVPTXISD::Suld1DV2I8Trap
;
4015 case Intrinsic::nvvm_suld_1d_v2i16_trap
:
4016 return NVPTXISD::Suld1DV2I16Trap
;
4017 case Intrinsic::nvvm_suld_1d_v2i32_trap
:
4018 return NVPTXISD::Suld1DV2I32Trap
;
4019 case Intrinsic::nvvm_suld_1d_v2i64_trap
:
4020 return NVPTXISD::Suld1DV2I64Trap
;
4021 case Intrinsic::nvvm_suld_1d_v4i8_trap
:
4022 return NVPTXISD::Suld1DV4I8Trap
;
4023 case Intrinsic::nvvm_suld_1d_v4i16_trap
:
4024 return NVPTXISD::Suld1DV4I16Trap
;
4025 case Intrinsic::nvvm_suld_1d_v4i32_trap
:
4026 return NVPTXISD::Suld1DV4I32Trap
;
4027 case Intrinsic::nvvm_suld_1d_array_i8_trap
:
4028 return NVPTXISD::Suld1DArrayI8Trap
;
4029 case Intrinsic::nvvm_suld_1d_array_i16_trap
:
4030 return NVPTXISD::Suld1DArrayI16Trap
;
4031 case Intrinsic::nvvm_suld_1d_array_i32_trap
:
4032 return NVPTXISD::Suld1DArrayI32Trap
;
4033 case Intrinsic::nvvm_suld_1d_array_i64_trap
:
4034 return NVPTXISD::Suld1DArrayI64Trap
;
4035 case Intrinsic::nvvm_suld_1d_array_v2i8_trap
:
4036 return NVPTXISD::Suld1DArrayV2I8Trap
;
4037 case Intrinsic::nvvm_suld_1d_array_v2i16_trap
:
4038 return NVPTXISD::Suld1DArrayV2I16Trap
;
4039 case Intrinsic::nvvm_suld_1d_array_v2i32_trap
:
4040 return NVPTXISD::Suld1DArrayV2I32Trap
;
4041 case Intrinsic::nvvm_suld_1d_array_v2i64_trap
:
4042 return NVPTXISD::Suld1DArrayV2I64Trap
;
4043 case Intrinsic::nvvm_suld_1d_array_v4i8_trap
:
4044 return NVPTXISD::Suld1DArrayV4I8Trap
;
4045 case Intrinsic::nvvm_suld_1d_array_v4i16_trap
:
4046 return NVPTXISD::Suld1DArrayV4I16Trap
;
4047 case Intrinsic::nvvm_suld_1d_array_v4i32_trap
:
4048 return NVPTXISD::Suld1DArrayV4I32Trap
;
4049 case Intrinsic::nvvm_suld_2d_i8_trap
:
4050 return NVPTXISD::Suld2DI8Trap
;
4051 case Intrinsic::nvvm_suld_2d_i16_trap
:
4052 return NVPTXISD::Suld2DI16Trap
;
4053 case Intrinsic::nvvm_suld_2d_i32_trap
:
4054 return NVPTXISD::Suld2DI32Trap
;
4055 case Intrinsic::nvvm_suld_2d_i64_trap
:
4056 return NVPTXISD::Suld2DI64Trap
;
4057 case Intrinsic::nvvm_suld_2d_v2i8_trap
:
4058 return NVPTXISD::Suld2DV2I8Trap
;
4059 case Intrinsic::nvvm_suld_2d_v2i16_trap
:
4060 return NVPTXISD::Suld2DV2I16Trap
;
4061 case Intrinsic::nvvm_suld_2d_v2i32_trap
:
4062 return NVPTXISD::Suld2DV2I32Trap
;
4063 case Intrinsic::nvvm_suld_2d_v2i64_trap
:
4064 return NVPTXISD::Suld2DV2I64Trap
;
4065 case Intrinsic::nvvm_suld_2d_v4i8_trap
:
4066 return NVPTXISD::Suld2DV4I8Trap
;
4067 case Intrinsic::nvvm_suld_2d_v4i16_trap
:
4068 return NVPTXISD::Suld2DV4I16Trap
;
4069 case Intrinsic::nvvm_suld_2d_v4i32_trap
:
4070 return NVPTXISD::Suld2DV4I32Trap
;
4071 case Intrinsic::nvvm_suld_2d_array_i8_trap
:
4072 return NVPTXISD::Suld2DArrayI8Trap
;
4073 case Intrinsic::nvvm_suld_2d_array_i16_trap
:
4074 return NVPTXISD::Suld2DArrayI16Trap
;
4075 case Intrinsic::nvvm_suld_2d_array_i32_trap
:
4076 return NVPTXISD::Suld2DArrayI32Trap
;
4077 case Intrinsic::nvvm_suld_2d_array_i64_trap
:
4078 return NVPTXISD::Suld2DArrayI64Trap
;
4079 case Intrinsic::nvvm_suld_2d_array_v2i8_trap
:
4080 return NVPTXISD::Suld2DArrayV2I8Trap
;
4081 case Intrinsic::nvvm_suld_2d_array_v2i16_trap
:
4082 return NVPTXISD::Suld2DArrayV2I16Trap
;
4083 case Intrinsic::nvvm_suld_2d_array_v2i32_trap
:
4084 return NVPTXISD::Suld2DArrayV2I32Trap
;
4085 case Intrinsic::nvvm_suld_2d_array_v2i64_trap
:
4086 return NVPTXISD::Suld2DArrayV2I64Trap
;
4087 case Intrinsic::nvvm_suld_2d_array_v4i8_trap
:
4088 return NVPTXISD::Suld2DArrayV4I8Trap
;
4089 case Intrinsic::nvvm_suld_2d_array_v4i16_trap
:
4090 return NVPTXISD::Suld2DArrayV4I16Trap
;
4091 case Intrinsic::nvvm_suld_2d_array_v4i32_trap
:
4092 return NVPTXISD::Suld2DArrayV4I32Trap
;
4093 case Intrinsic::nvvm_suld_3d_i8_trap
:
4094 return NVPTXISD::Suld3DI8Trap
;
4095 case Intrinsic::nvvm_suld_3d_i16_trap
:
4096 return NVPTXISD::Suld3DI16Trap
;
4097 case Intrinsic::nvvm_suld_3d_i32_trap
:
4098 return NVPTXISD::Suld3DI32Trap
;
4099 case Intrinsic::nvvm_suld_3d_i64_trap
:
4100 return NVPTXISD::Suld3DI64Trap
;
4101 case Intrinsic::nvvm_suld_3d_v2i8_trap
:
4102 return NVPTXISD::Suld3DV2I8Trap
;
4103 case Intrinsic::nvvm_suld_3d_v2i16_trap
:
4104 return NVPTXISD::Suld3DV2I16Trap
;
4105 case Intrinsic::nvvm_suld_3d_v2i32_trap
:
4106 return NVPTXISD::Suld3DV2I32Trap
;
4107 case Intrinsic::nvvm_suld_3d_v2i64_trap
:
4108 return NVPTXISD::Suld3DV2I64Trap
;
4109 case Intrinsic::nvvm_suld_3d_v4i8_trap
:
4110 return NVPTXISD::Suld3DV4I8Trap
;
4111 case Intrinsic::nvvm_suld_3d_v4i16_trap
:
4112 return NVPTXISD::Suld3DV4I16Trap
;
4113 case Intrinsic::nvvm_suld_3d_v4i32_trap
:
4114 return NVPTXISD::Suld3DV4I32Trap
;
4115 case Intrinsic::nvvm_suld_1d_i8_zero
:
4116 return NVPTXISD::Suld1DI8Zero
;
4117 case Intrinsic::nvvm_suld_1d_i16_zero
:
4118 return NVPTXISD::Suld1DI16Zero
;
4119 case Intrinsic::nvvm_suld_1d_i32_zero
:
4120 return NVPTXISD::Suld1DI32Zero
;
4121 case Intrinsic::nvvm_suld_1d_i64_zero
:
4122 return NVPTXISD::Suld1DI64Zero
;
4123 case Intrinsic::nvvm_suld_1d_v2i8_zero
:
4124 return NVPTXISD::Suld1DV2I8Zero
;
4125 case Intrinsic::nvvm_suld_1d_v2i16_zero
:
4126 return NVPTXISD::Suld1DV2I16Zero
;
4127 case Intrinsic::nvvm_suld_1d_v2i32_zero
:
4128 return NVPTXISD::Suld1DV2I32Zero
;
4129 case Intrinsic::nvvm_suld_1d_v2i64_zero
:
4130 return NVPTXISD::Suld1DV2I64Zero
;
4131 case Intrinsic::nvvm_suld_1d_v4i8_zero
:
4132 return NVPTXISD::Suld1DV4I8Zero
;
4133 case Intrinsic::nvvm_suld_1d_v4i16_zero
:
4134 return NVPTXISD::Suld1DV4I16Zero
;
4135 case Intrinsic::nvvm_suld_1d_v4i32_zero
:
4136 return NVPTXISD::Suld1DV4I32Zero
;
4137 case Intrinsic::nvvm_suld_1d_array_i8_zero
:
4138 return NVPTXISD::Suld1DArrayI8Zero
;
4139 case Intrinsic::nvvm_suld_1d_array_i16_zero
:
4140 return NVPTXISD::Suld1DArrayI16Zero
;
4141 case Intrinsic::nvvm_suld_1d_array_i32_zero
:
4142 return NVPTXISD::Suld1DArrayI32Zero
;
4143 case Intrinsic::nvvm_suld_1d_array_i64_zero
:
4144 return NVPTXISD::Suld1DArrayI64Zero
;
4145 case Intrinsic::nvvm_suld_1d_array_v2i8_zero
:
4146 return NVPTXISD::Suld1DArrayV2I8Zero
;
4147 case Intrinsic::nvvm_suld_1d_array_v2i16_zero
:
4148 return NVPTXISD::Suld1DArrayV2I16Zero
;
4149 case Intrinsic::nvvm_suld_1d_array_v2i32_zero
:
4150 return NVPTXISD::Suld1DArrayV2I32Zero
;
4151 case Intrinsic::nvvm_suld_1d_array_v2i64_zero
:
4152 return NVPTXISD::Suld1DArrayV2I64Zero
;
4153 case Intrinsic::nvvm_suld_1d_array_v4i8_zero
:
4154 return NVPTXISD::Suld1DArrayV4I8Zero
;
4155 case Intrinsic::nvvm_suld_1d_array_v4i16_zero
:
4156 return NVPTXISD::Suld1DArrayV4I16Zero
;
4157 case Intrinsic::nvvm_suld_1d_array_v4i32_zero
:
4158 return NVPTXISD::Suld1DArrayV4I32Zero
;
4159 case Intrinsic::nvvm_suld_2d_i8_zero
:
4160 return NVPTXISD::Suld2DI8Zero
;
4161 case Intrinsic::nvvm_suld_2d_i16_zero
:
4162 return NVPTXISD::Suld2DI16Zero
;
4163 case Intrinsic::nvvm_suld_2d_i32_zero
:
4164 return NVPTXISD::Suld2DI32Zero
;
4165 case Intrinsic::nvvm_suld_2d_i64_zero
:
4166 return NVPTXISD::Suld2DI64Zero
;
4167 case Intrinsic::nvvm_suld_2d_v2i8_zero
:
4168 return NVPTXISD::Suld2DV2I8Zero
;
4169 case Intrinsic::nvvm_suld_2d_v2i16_zero
:
4170 return NVPTXISD::Suld2DV2I16Zero
;
4171 case Intrinsic::nvvm_suld_2d_v2i32_zero
:
4172 return NVPTXISD::Suld2DV2I32Zero
;
4173 case Intrinsic::nvvm_suld_2d_v2i64_zero
:
4174 return NVPTXISD::Suld2DV2I64Zero
;
4175 case Intrinsic::nvvm_suld_2d_v4i8_zero
:
4176 return NVPTXISD::Suld2DV4I8Zero
;
4177 case Intrinsic::nvvm_suld_2d_v4i16_zero
:
4178 return NVPTXISD::Suld2DV4I16Zero
;
4179 case Intrinsic::nvvm_suld_2d_v4i32_zero
:
4180 return NVPTXISD::Suld2DV4I32Zero
;
4181 case Intrinsic::nvvm_suld_2d_array_i8_zero
:
4182 return NVPTXISD::Suld2DArrayI8Zero
;
4183 case Intrinsic::nvvm_suld_2d_array_i16_zero
:
4184 return NVPTXISD::Suld2DArrayI16Zero
;
4185 case Intrinsic::nvvm_suld_2d_array_i32_zero
:
4186 return NVPTXISD::Suld2DArrayI32Zero
;
4187 case Intrinsic::nvvm_suld_2d_array_i64_zero
:
4188 return NVPTXISD::Suld2DArrayI64Zero
;
4189 case Intrinsic::nvvm_suld_2d_array_v2i8_zero
:
4190 return NVPTXISD::Suld2DArrayV2I8Zero
;
4191 case Intrinsic::nvvm_suld_2d_array_v2i16_zero
:
4192 return NVPTXISD::Suld2DArrayV2I16Zero
;
4193 case Intrinsic::nvvm_suld_2d_array_v2i32_zero
:
4194 return NVPTXISD::Suld2DArrayV2I32Zero
;
4195 case Intrinsic::nvvm_suld_2d_array_v2i64_zero
:
4196 return NVPTXISD::Suld2DArrayV2I64Zero
;
4197 case Intrinsic::nvvm_suld_2d_array_v4i8_zero
:
4198 return NVPTXISD::Suld2DArrayV4I8Zero
;
4199 case Intrinsic::nvvm_suld_2d_array_v4i16_zero
:
4200 return NVPTXISD::Suld2DArrayV4I16Zero
;
4201 case Intrinsic::nvvm_suld_2d_array_v4i32_zero
:
4202 return NVPTXISD::Suld2DArrayV4I32Zero
;
4203 case Intrinsic::nvvm_suld_3d_i8_zero
:
4204 return NVPTXISD::Suld3DI8Zero
;
4205 case Intrinsic::nvvm_suld_3d_i16_zero
:
4206 return NVPTXISD::Suld3DI16Zero
;
4207 case Intrinsic::nvvm_suld_3d_i32_zero
:
4208 return NVPTXISD::Suld3DI32Zero
;
4209 case Intrinsic::nvvm_suld_3d_i64_zero
:
4210 return NVPTXISD::Suld3DI64Zero
;
4211 case Intrinsic::nvvm_suld_3d_v2i8_zero
:
4212 return NVPTXISD::Suld3DV2I8Zero
;
4213 case Intrinsic::nvvm_suld_3d_v2i16_zero
:
4214 return NVPTXISD::Suld3DV2I16Zero
;
4215 case Intrinsic::nvvm_suld_3d_v2i32_zero
:
4216 return NVPTXISD::Suld3DV2I32Zero
;
4217 case Intrinsic::nvvm_suld_3d_v2i64_zero
:
4218 return NVPTXISD::Suld3DV2I64Zero
;
4219 case Intrinsic::nvvm_suld_3d_v4i8_zero
:
4220 return NVPTXISD::Suld3DV4I8Zero
;
4221 case Intrinsic::nvvm_suld_3d_v4i16_zero
:
4222 return NVPTXISD::Suld3DV4I16Zero
;
4223 case Intrinsic::nvvm_suld_3d_v4i32_zero
:
4224 return NVPTXISD::Suld3DV4I32Zero
;
4228 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
4230 // because we need the information that is only available in the "Value" type
4232 // pointer. In particular, the address space information.
4233 bool NVPTXTargetLowering::getTgtMemIntrinsic(
4234 IntrinsicInfo
&Info
, const CallInst
&I
,
4235 MachineFunction
&MF
, unsigned Intrinsic
) const {
4236 switch (Intrinsic
) {
4239 case Intrinsic::nvvm_match_all_sync_i32p
:
4240 case Intrinsic::nvvm_match_all_sync_i64p
:
4241 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
4242 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
4243 // in order to model data exchange with other threads, but perform no real
4245 Info
.memVT
= MVT::i1
;
4247 // Our result depends on both our and other thread's arguments.
4248 Info
.flags
= MachineMemOperand::MOLoad
| MachineMemOperand::MOStore
;
4250 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col
:
4251 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row
:
4252 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride
:
4253 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride
:
4254 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col
:
4255 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row
:
4256 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride
:
4257 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride
:
4258 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col
:
4259 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row
:
4260 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride
:
4261 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride
:
4262 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col
:
4263 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row
:
4264 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride
:
4265 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride
:
4266 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col
:
4267 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row
:
4268 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride
:
4269 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride
:
4270 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col
:
4271 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row
:
4272 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride
:
4273 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride
: {
4274 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
4275 Info
.memVT
= MVT::v8f16
;
4276 Info
.ptrVal
= I
.getArgOperand(0);
4278 Info
.flags
= MachineMemOperand::MOLoad
;
4279 Info
.align
= Align(16);
4282 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col
:
4283 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride
:
4284 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride
:
4285 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col
:
4286 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row
:
4287 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride
:
4288 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride
:
4289 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row
:
4290 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col
:
4291 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride
:
4292 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row
:
4293 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride
:
4294 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col
:
4295 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride
:
4296 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride
:
4297 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col
:
4298 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row
:
4299 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride
:
4300 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride
:
4301 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row
:
4302 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col
:
4303 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride
:
4304 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row
:
4305 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride
: {
4306 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
4307 Info
.memVT
= MVT::v2i32
;
4308 Info
.ptrVal
= I
.getArgOperand(0);
4310 Info
.flags
= MachineMemOperand::MOLoad
;
4311 Info
.align
= Align(8);
4315 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col
:
4316 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride
:
4317 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride
:
4318 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col
:
4319 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row
:
4320 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride
:
4321 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride
:
4322 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row
:
4323 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col
:
4324 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride
:
4325 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row
:
4326 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride
:
4327 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col
:
4328 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride
:
4329 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row
:
4330 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride
:
4332 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col
:
4333 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride
:
4334 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride
:
4335 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col
:
4336 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row
:
4337 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride
:
4338 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride
:
4339 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row
:
4340 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col
:
4341 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride
:
4342 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row
:
4343 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride
:
4344 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col
:
4345 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride
:
4346 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row
:
4347 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride
:
4348 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16
:
4349 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16
: {
4350 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
4351 Info
.memVT
= MVT::v4i32
;
4352 Info
.ptrVal
= I
.getArgOperand(0);
4354 Info
.flags
= MachineMemOperand::MOLoad
;
4355 Info
.align
= Align(16);
4359 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col
:
4360 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride
:
4361 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride
:
4362 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col
:
4363 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row
:
4364 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride
:
4365 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride
:
4366 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row
:
4368 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col
:
4369 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride
:
4370 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride
:
4371 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col
:
4372 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row
:
4373 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride
:
4374 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride
:
4375 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row
:
4376 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row
:
4377 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride
:
4378 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col
:
4379 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride
:
4380 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row
:
4381 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride
:
4382 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride
:
4383 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row
:
4384 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col
:
4385 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride
:
4386 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride
:
4387 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col
:
4388 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16
:
4389 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16
: {
4390 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
4391 Info
.memVT
= MVT::i32
;
4392 Info
.ptrVal
= I
.getArgOperand(0);
4394 Info
.flags
= MachineMemOperand::MOLoad
;
4395 Info
.align
= Align(4);
4399 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col
:
4400 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row
:
4401 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride
:
4402 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride
:
4403 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col
:
4404 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row
:
4405 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride
:
4406 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride
:
4407 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col
:
4408 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row
:
4409 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride
:
4410 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride
: {
4411 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
4412 Info
.memVT
= MVT::v4f16
;
4413 Info
.ptrVal
= I
.getArgOperand(0);
4415 Info
.flags
= MachineMemOperand::MOLoad
;
4416 Info
.align
= Align(16);
4420 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col
:
4421 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row
:
4422 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride
:
4423 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride
:
4424 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col
:
4425 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row
:
4426 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride
:
4427 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride
:
4428 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col
:
4429 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row
:
4430 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride
:
4431 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride
:
4432 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col
:
4433 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row
:
4434 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride
:
4435 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride
: {
4436 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
4437 Info
.memVT
= MVT::v8f32
;
4438 Info
.ptrVal
= I
.getArgOperand(0);
4440 Info
.flags
= MachineMemOperand::MOLoad
;
4441 Info
.align
= Align(16);
4445 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col
:
4446 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride
:
4447 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row
:
4448 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride
:
4450 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col
:
4451 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride
:
4452 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row
:
4453 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride
:
4455 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col
:
4456 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride
:
4457 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row
:
4458 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride
:
4459 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col
:
4460 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride
:
4461 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row
:
4462 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride
:
4463 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col
:
4464 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride
:
4465 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row
:
4466 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride
: {
4467 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
4468 Info
.memVT
= MVT::v8i32
;
4469 Info
.ptrVal
= I
.getArgOperand(0);
4471 Info
.flags
= MachineMemOperand::MOLoad
;
4472 Info
.align
= Align(16);
4476 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col
:
4477 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride
:
4478 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row
:
4479 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride
:
4480 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col
:
4481 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride
:
4482 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row
:
4483 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride
:
4484 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16
:
4485 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16
: {
4486 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
4487 Info
.memVT
= MVT::v2i32
;
4488 Info
.ptrVal
= I
.getArgOperand(0);
4490 Info
.flags
= MachineMemOperand::MOLoad
;
4491 Info
.align
= Align(8);
4495 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col
:
4496 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride
:
4497 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row
:
4498 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride
:
4500 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col
:
4501 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride
:
4502 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row
:
4503 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride
: {
4504 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
4505 Info
.memVT
= MVT::f64
;
4506 Info
.ptrVal
= I
.getArgOperand(0);
4508 Info
.flags
= MachineMemOperand::MOLoad
;
4509 Info
.align
= Align(8);
4513 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col
:
4514 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride
:
4515 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row
:
4516 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride
: {
4517 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
4518 Info
.memVT
= MVT::v2f64
;
4519 Info
.ptrVal
= I
.getArgOperand(0);
4521 Info
.flags
= MachineMemOperand::MOLoad
;
4522 Info
.align
= Align(16);
4526 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col
:
4527 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row
:
4528 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride
:
4529 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride
:
4530 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col
:
4531 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row
:
4532 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride
:
4533 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride
:
4534 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col
:
4535 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row
:
4536 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride
:
4537 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride
: {
4538 Info
.opc
= ISD::INTRINSIC_VOID
;
4539 Info
.memVT
= MVT::v4f16
;
4540 Info
.ptrVal
= I
.getArgOperand(0);
4542 Info
.flags
= MachineMemOperand::MOStore
;
4543 Info
.align
= Align(16);
4547 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col
:
4548 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row
:
4549 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride
:
4550 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride
:
4551 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col
:
4552 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row
:
4553 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride
:
4554 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride
:
4555 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col
:
4556 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row
:
4557 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride
:
4558 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride
:
4559 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col
:
4560 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row
:
4561 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride
:
4562 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride
: {
4563 Info
.opc
= ISD::INTRINSIC_VOID
;
4564 Info
.memVT
= MVT::v8f32
;
4565 Info
.ptrVal
= I
.getArgOperand(0);
4567 Info
.flags
= MachineMemOperand::MOStore
;
4568 Info
.align
= Align(16);
4572 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col
:
4573 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride
:
4574 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row
:
4575 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride
:
4576 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col
:
4577 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride
:
4578 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row
:
4579 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride
:
4580 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col
:
4581 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride
:
4582 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row
:
4583 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride
: {
4584 Info
.opc
= ISD::INTRINSIC_VOID
;
4585 Info
.memVT
= MVT::v8i32
;
4586 Info
.ptrVal
= I
.getArgOperand(0);
4588 Info
.flags
= MachineMemOperand::MOStore
;
4589 Info
.align
= Align(16);
4593 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col
:
4594 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride
:
4595 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row
:
4596 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride
:
4597 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col
:
4598 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride
:
4599 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row
:
4600 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride
: {
4601 Info
.opc
= ISD::INTRINSIC_VOID
;
4602 Info
.memVT
= MVT::v2i32
;
4603 Info
.ptrVal
= I
.getArgOperand(0);
4605 Info
.flags
= MachineMemOperand::MOStore
;
4606 Info
.align
= Align(8);
4610 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col
:
4611 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride
:
4612 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row
:
4613 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride
: {
4614 Info
.opc
= ISD::INTRINSIC_VOID
;
4615 Info
.memVT
= MVT::v2f64
;
4616 Info
.ptrVal
= I
.getArgOperand(0);
4618 Info
.flags
= MachineMemOperand::MOStore
;
4619 Info
.align
= Align(16);
4623 case Intrinsic::nvvm_atomic_load_inc_32
:
4624 case Intrinsic::nvvm_atomic_load_dec_32
:
4626 case Intrinsic::nvvm_atomic_add_gen_f_cta
:
4627 case Intrinsic::nvvm_atomic_add_gen_f_sys
:
4628 case Intrinsic::nvvm_atomic_add_gen_i_cta
:
4629 case Intrinsic::nvvm_atomic_add_gen_i_sys
:
4630 case Intrinsic::nvvm_atomic_and_gen_i_cta
:
4631 case Intrinsic::nvvm_atomic_and_gen_i_sys
:
4632 case Intrinsic::nvvm_atomic_cas_gen_i_cta
:
4633 case Intrinsic::nvvm_atomic_cas_gen_i_sys
:
4634 case Intrinsic::nvvm_atomic_dec_gen_i_cta
:
4635 case Intrinsic::nvvm_atomic_dec_gen_i_sys
:
4636 case Intrinsic::nvvm_atomic_inc_gen_i_cta
:
4637 case Intrinsic::nvvm_atomic_inc_gen_i_sys
:
4638 case Intrinsic::nvvm_atomic_max_gen_i_cta
:
4639 case Intrinsic::nvvm_atomic_max_gen_i_sys
:
4640 case Intrinsic::nvvm_atomic_min_gen_i_cta
:
4641 case Intrinsic::nvvm_atomic_min_gen_i_sys
:
4642 case Intrinsic::nvvm_atomic_or_gen_i_cta
:
4643 case Intrinsic::nvvm_atomic_or_gen_i_sys
:
4644 case Intrinsic::nvvm_atomic_exch_gen_i_cta
:
4645 case Intrinsic::nvvm_atomic_exch_gen_i_sys
:
4646 case Intrinsic::nvvm_atomic_xor_gen_i_cta
:
4647 case Intrinsic::nvvm_atomic_xor_gen_i_sys
: {
4648 auto &DL
= I
.getDataLayout();
4649 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
4650 Info
.memVT
= getValueType(DL
, I
.getType());
4651 Info
.ptrVal
= I
.getArgOperand(0);
4653 Info
.flags
= MachineMemOperand::MOLoad
| MachineMemOperand::MOStore
;
4658 case Intrinsic::nvvm_ldu_global_i
:
4659 case Intrinsic::nvvm_ldu_global_f
:
4660 case Intrinsic::nvvm_ldu_global_p
: {
4661 auto &DL
= I
.getDataLayout();
4662 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
4663 if (Intrinsic
== Intrinsic::nvvm_ldu_global_i
)
4664 Info
.memVT
= getValueType(DL
, I
.getType());
4665 else if(Intrinsic
== Intrinsic::nvvm_ldu_global_p
)
4666 Info
.memVT
= getPointerTy(DL
);
4668 Info
.memVT
= getValueType(DL
, I
.getType());
4669 Info
.ptrVal
= I
.getArgOperand(0);
4671 Info
.flags
= MachineMemOperand::MOLoad
;
4672 Info
.align
= cast
<ConstantInt
>(I
.getArgOperand(1))->getMaybeAlignValue();
4676 case Intrinsic::nvvm_ldg_global_i
:
4677 case Intrinsic::nvvm_ldg_global_f
:
4678 case Intrinsic::nvvm_ldg_global_p
: {
4679 auto &DL
= I
.getDataLayout();
4681 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
4682 if (Intrinsic
== Intrinsic::nvvm_ldg_global_i
)
4683 Info
.memVT
= getValueType(DL
, I
.getType());
4684 else if(Intrinsic
== Intrinsic::nvvm_ldg_global_p
)
4685 Info
.memVT
= getPointerTy(DL
);
4687 Info
.memVT
= getValueType(DL
, I
.getType());
4688 Info
.ptrVal
= I
.getArgOperand(0);
4690 Info
.flags
= MachineMemOperand::MOLoad
;
4691 Info
.align
= cast
<ConstantInt
>(I
.getArgOperand(1))->getMaybeAlignValue();
4696 case Intrinsic::nvvm_tex_1d_v4f32_s32
:
4697 case Intrinsic::nvvm_tex_1d_v4f32_f32
:
4698 case Intrinsic::nvvm_tex_1d_level_v4f32_f32
:
4699 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32
:
4700 case Intrinsic::nvvm_tex_1d_array_v4f32_s32
:
4701 case Intrinsic::nvvm_tex_1d_array_v4f32_f32
:
4702 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32
:
4703 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32
:
4704 case Intrinsic::nvvm_tex_2d_v4f32_s32
:
4705 case Intrinsic::nvvm_tex_2d_v4f32_f32
:
4706 case Intrinsic::nvvm_tex_2d_level_v4f32_f32
:
4707 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32
:
4708 case Intrinsic::nvvm_tex_2d_array_v4f32_s32
:
4709 case Intrinsic::nvvm_tex_2d_array_v4f32_f32
:
4710 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32
:
4711 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32
:
4712 case Intrinsic::nvvm_tex_3d_v4f32_s32
:
4713 case Intrinsic::nvvm_tex_3d_v4f32_f32
:
4714 case Intrinsic::nvvm_tex_3d_level_v4f32_f32
:
4715 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32
:
4716 case Intrinsic::nvvm_tex_cube_v4f32_f32
:
4717 case Intrinsic::nvvm_tex_cube_level_v4f32_f32
:
4718 case Intrinsic::nvvm_tex_cube_array_v4f32_f32
:
4719 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32
:
4720 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32
:
4721 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32
:
4722 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32
:
4723 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32
:
4724 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32
:
4725 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32
:
4726 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32
:
4727 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32
:
4728 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32
:
4729 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32
:
4730 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32
:
4731 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32
:
4732 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32
:
4733 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32
:
4734 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32
:
4735 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32
:
4736 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32
:
4737 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32
:
4738 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32
:
4739 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32
:
4740 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32
:
4741 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32
:
4742 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32
:
4743 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32
:
4744 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32
:
4745 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32
:
4746 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32
:
4747 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32
:
4748 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32
:
4749 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32
:
4750 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32
:
4751 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32
:
4752 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32
:
4753 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32
:
4754 Info
.opc
= getOpcForTextureInstr(Intrinsic
);
4755 Info
.memVT
= MVT::v4f32
;
4756 Info
.ptrVal
= nullptr;
4758 Info
.flags
= MachineMemOperand::MOLoad
;
4759 Info
.align
= Align(16);
4762 case Intrinsic::nvvm_tex_1d_v4s32_s32
:
4763 case Intrinsic::nvvm_tex_1d_v4s32_f32
:
4764 case Intrinsic::nvvm_tex_1d_level_v4s32_f32
:
4765 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32
:
4766 case Intrinsic::nvvm_tex_1d_array_v4s32_s32
:
4767 case Intrinsic::nvvm_tex_1d_array_v4s32_f32
:
4768 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32
:
4769 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32
:
4770 case Intrinsic::nvvm_tex_2d_v4s32_s32
:
4771 case Intrinsic::nvvm_tex_2d_v4s32_f32
:
4772 case Intrinsic::nvvm_tex_2d_level_v4s32_f32
:
4773 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32
:
4774 case Intrinsic::nvvm_tex_2d_array_v4s32_s32
:
4775 case Intrinsic::nvvm_tex_2d_array_v4s32_f32
:
4776 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32
:
4777 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32
:
4778 case Intrinsic::nvvm_tex_3d_v4s32_s32
:
4779 case Intrinsic::nvvm_tex_3d_v4s32_f32
:
4780 case Intrinsic::nvvm_tex_3d_level_v4s32_f32
:
4781 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32
:
4782 case Intrinsic::nvvm_tex_cube_v4s32_f32
:
4783 case Intrinsic::nvvm_tex_cube_level_v4s32_f32
:
4784 case Intrinsic::nvvm_tex_cube_array_v4s32_f32
:
4785 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32
:
4786 case Intrinsic::nvvm_tex_cube_v4u32_f32
:
4787 case Intrinsic::nvvm_tex_cube_level_v4u32_f32
:
4788 case Intrinsic::nvvm_tex_cube_array_v4u32_f32
:
4789 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32
:
4790 case Intrinsic::nvvm_tex_1d_v4u32_s32
:
4791 case Intrinsic::nvvm_tex_1d_v4u32_f32
:
4792 case Intrinsic::nvvm_tex_1d_level_v4u32_f32
:
4793 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32
:
4794 case Intrinsic::nvvm_tex_1d_array_v4u32_s32
:
4795 case Intrinsic::nvvm_tex_1d_array_v4u32_f32
:
4796 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32
:
4797 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32
:
4798 case Intrinsic::nvvm_tex_2d_v4u32_s32
:
4799 case Intrinsic::nvvm_tex_2d_v4u32_f32
:
4800 case Intrinsic::nvvm_tex_2d_level_v4u32_f32
:
4801 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32
:
4802 case Intrinsic::nvvm_tex_2d_array_v4u32_s32
:
4803 case Intrinsic::nvvm_tex_2d_array_v4u32_f32
:
4804 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32
:
4805 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32
:
4806 case Intrinsic::nvvm_tex_3d_v4u32_s32
:
4807 case Intrinsic::nvvm_tex_3d_v4u32_f32
:
4808 case Intrinsic::nvvm_tex_3d_level_v4u32_f32
:
4809 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32
:
4810 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32
:
4811 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32
:
4812 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32
:
4813 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32
:
4814 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32
:
4815 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32
:
4816 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32
:
4817 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32
:
4818 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32
:
4819 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32
:
4820 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32
:
4821 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32
:
4822 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32
:
4823 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32
:
4824 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32
:
4825 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32
:
4826 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32
:
4827 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32
:
4828 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32
:
4829 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32
:
4830 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32
:
4831 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32
:
4832 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32
:
4833 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32
:
4834 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32
:
4835 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32
:
4836 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32
:
4837 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32
:
4838 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32
:
4839 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32
:
4840 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32
:
4841 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32
:
4842 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32
:
4843 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32
:
4844 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32
:
4845 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32
:
4846 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32
:
4847 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32
:
4848 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32
:
4849 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32
:
4850 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32
:
4851 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32
:
4852 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32
:
4853 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32
:
4854 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32
:
4855 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32
:
4856 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32
:
4857 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32
:
4858 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32
:
4859 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32
:
4860 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32
:
4861 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32
:
4862 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32
:
4863 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32
:
4864 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32
:
4865 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32
:
4866 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32
:
4867 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32
:
4868 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32
:
4869 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32
:
4870 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32
:
4871 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32
:
4872 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32
:
4873 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32
:
4874 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32
:
4875 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32
:
4876 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32
:
4877 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32
:
4878 Info
.opc
= getOpcForTextureInstr(Intrinsic
);
4879 Info
.memVT
= MVT::v4i32
;
4880 Info
.ptrVal
= nullptr;
4882 Info
.flags
= MachineMemOperand::MOLoad
;
4883 Info
.align
= Align(16);
4886 case Intrinsic::nvvm_suld_1d_i8_clamp
:
4887 case Intrinsic::nvvm_suld_1d_v2i8_clamp
:
4888 case Intrinsic::nvvm_suld_1d_v4i8_clamp
:
4889 case Intrinsic::nvvm_suld_1d_array_i8_clamp
:
4890 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp
:
4891 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp
:
4892 case Intrinsic::nvvm_suld_2d_i8_clamp
:
4893 case Intrinsic::nvvm_suld_2d_v2i8_clamp
:
4894 case Intrinsic::nvvm_suld_2d_v4i8_clamp
:
4895 case Intrinsic::nvvm_suld_2d_array_i8_clamp
:
4896 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp
:
4897 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp
:
4898 case Intrinsic::nvvm_suld_3d_i8_clamp
:
4899 case Intrinsic::nvvm_suld_3d_v2i8_clamp
:
4900 case Intrinsic::nvvm_suld_3d_v4i8_clamp
:
4901 case Intrinsic::nvvm_suld_1d_i8_trap
:
4902 case Intrinsic::nvvm_suld_1d_v2i8_trap
:
4903 case Intrinsic::nvvm_suld_1d_v4i8_trap
:
4904 case Intrinsic::nvvm_suld_1d_array_i8_trap
:
4905 case Intrinsic::nvvm_suld_1d_array_v2i8_trap
:
4906 case Intrinsic::nvvm_suld_1d_array_v4i8_trap
:
4907 case Intrinsic::nvvm_suld_2d_i8_trap
:
4908 case Intrinsic::nvvm_suld_2d_v2i8_trap
:
4909 case Intrinsic::nvvm_suld_2d_v4i8_trap
:
4910 case Intrinsic::nvvm_suld_2d_array_i8_trap
:
4911 case Intrinsic::nvvm_suld_2d_array_v2i8_trap
:
4912 case Intrinsic::nvvm_suld_2d_array_v4i8_trap
:
4913 case Intrinsic::nvvm_suld_3d_i8_trap
:
4914 case Intrinsic::nvvm_suld_3d_v2i8_trap
:
4915 case Intrinsic::nvvm_suld_3d_v4i8_trap
:
4916 case Intrinsic::nvvm_suld_1d_i8_zero
:
4917 case Intrinsic::nvvm_suld_1d_v2i8_zero
:
4918 case Intrinsic::nvvm_suld_1d_v4i8_zero
:
4919 case Intrinsic::nvvm_suld_1d_array_i8_zero
:
4920 case Intrinsic::nvvm_suld_1d_array_v2i8_zero
:
4921 case Intrinsic::nvvm_suld_1d_array_v4i8_zero
:
4922 case Intrinsic::nvvm_suld_2d_i8_zero
:
4923 case Intrinsic::nvvm_suld_2d_v2i8_zero
:
4924 case Intrinsic::nvvm_suld_2d_v4i8_zero
:
4925 case Intrinsic::nvvm_suld_2d_array_i8_zero
:
4926 case Intrinsic::nvvm_suld_2d_array_v2i8_zero
:
4927 case Intrinsic::nvvm_suld_2d_array_v4i8_zero
:
4928 case Intrinsic::nvvm_suld_3d_i8_zero
:
4929 case Intrinsic::nvvm_suld_3d_v2i8_zero
:
4930 case Intrinsic::nvvm_suld_3d_v4i8_zero
:
4931 Info
.opc
= getOpcForSurfaceInstr(Intrinsic
);
4932 Info
.memVT
= MVT::i8
;
4933 Info
.ptrVal
= nullptr;
4935 Info
.flags
= MachineMemOperand::MOLoad
;
4936 Info
.align
= Align(16);
4939 case Intrinsic::nvvm_suld_1d_i16_clamp
:
4940 case Intrinsic::nvvm_suld_1d_v2i16_clamp
:
4941 case Intrinsic::nvvm_suld_1d_v4i16_clamp
:
4942 case Intrinsic::nvvm_suld_1d_array_i16_clamp
:
4943 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp
:
4944 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp
:
4945 case Intrinsic::nvvm_suld_2d_i16_clamp
:
4946 case Intrinsic::nvvm_suld_2d_v2i16_clamp
:
4947 case Intrinsic::nvvm_suld_2d_v4i16_clamp
:
4948 case Intrinsic::nvvm_suld_2d_array_i16_clamp
:
4949 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp
:
4950 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp
:
4951 case Intrinsic::nvvm_suld_3d_i16_clamp
:
4952 case Intrinsic::nvvm_suld_3d_v2i16_clamp
:
4953 case Intrinsic::nvvm_suld_3d_v4i16_clamp
:
4954 case Intrinsic::nvvm_suld_1d_i16_trap
:
4955 case Intrinsic::nvvm_suld_1d_v2i16_trap
:
4956 case Intrinsic::nvvm_suld_1d_v4i16_trap
:
4957 case Intrinsic::nvvm_suld_1d_array_i16_trap
:
4958 case Intrinsic::nvvm_suld_1d_array_v2i16_trap
:
4959 case Intrinsic::nvvm_suld_1d_array_v4i16_trap
:
4960 case Intrinsic::nvvm_suld_2d_i16_trap
:
4961 case Intrinsic::nvvm_suld_2d_v2i16_trap
:
4962 case Intrinsic::nvvm_suld_2d_v4i16_trap
:
4963 case Intrinsic::nvvm_suld_2d_array_i16_trap
:
4964 case Intrinsic::nvvm_suld_2d_array_v2i16_trap
:
4965 case Intrinsic::nvvm_suld_2d_array_v4i16_trap
:
4966 case Intrinsic::nvvm_suld_3d_i16_trap
:
4967 case Intrinsic::nvvm_suld_3d_v2i16_trap
:
4968 case Intrinsic::nvvm_suld_3d_v4i16_trap
:
4969 case Intrinsic::nvvm_suld_1d_i16_zero
:
4970 case Intrinsic::nvvm_suld_1d_v2i16_zero
:
4971 case Intrinsic::nvvm_suld_1d_v4i16_zero
:
4972 case Intrinsic::nvvm_suld_1d_array_i16_zero
:
4973 case Intrinsic::nvvm_suld_1d_array_v2i16_zero
:
4974 case Intrinsic::nvvm_suld_1d_array_v4i16_zero
:
4975 case Intrinsic::nvvm_suld_2d_i16_zero
:
4976 case Intrinsic::nvvm_suld_2d_v2i16_zero
:
4977 case Intrinsic::nvvm_suld_2d_v4i16_zero
:
4978 case Intrinsic::nvvm_suld_2d_array_i16_zero
:
4979 case Intrinsic::nvvm_suld_2d_array_v2i16_zero
:
4980 case Intrinsic::nvvm_suld_2d_array_v4i16_zero
:
4981 case Intrinsic::nvvm_suld_3d_i16_zero
:
4982 case Intrinsic::nvvm_suld_3d_v2i16_zero
:
4983 case Intrinsic::nvvm_suld_3d_v4i16_zero
:
4984 Info
.opc
= getOpcForSurfaceInstr(Intrinsic
);
4985 Info
.memVT
= MVT::i16
;
4986 Info
.ptrVal
= nullptr;
4988 Info
.flags
= MachineMemOperand::MOLoad
;
4989 Info
.align
= Align(16);
4992 case Intrinsic::nvvm_suld_1d_i32_clamp
:
4993 case Intrinsic::nvvm_suld_1d_v2i32_clamp
:
4994 case Intrinsic::nvvm_suld_1d_v4i32_clamp
:
4995 case Intrinsic::nvvm_suld_1d_array_i32_clamp
:
4996 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp
:
4997 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp
:
4998 case Intrinsic::nvvm_suld_2d_i32_clamp
:
4999 case Intrinsic::nvvm_suld_2d_v2i32_clamp
:
5000 case Intrinsic::nvvm_suld_2d_v4i32_clamp
:
5001 case Intrinsic::nvvm_suld_2d_array_i32_clamp
:
5002 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp
:
5003 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp
:
5004 case Intrinsic::nvvm_suld_3d_i32_clamp
:
5005 case Intrinsic::nvvm_suld_3d_v2i32_clamp
:
5006 case Intrinsic::nvvm_suld_3d_v4i32_clamp
:
5007 case Intrinsic::nvvm_suld_1d_i32_trap
:
5008 case Intrinsic::nvvm_suld_1d_v2i32_trap
:
5009 case Intrinsic::nvvm_suld_1d_v4i32_trap
:
5010 case Intrinsic::nvvm_suld_1d_array_i32_trap
:
5011 case Intrinsic::nvvm_suld_1d_array_v2i32_trap
:
5012 case Intrinsic::nvvm_suld_1d_array_v4i32_trap
:
5013 case Intrinsic::nvvm_suld_2d_i32_trap
:
5014 case Intrinsic::nvvm_suld_2d_v2i32_trap
:
5015 case Intrinsic::nvvm_suld_2d_v4i32_trap
:
5016 case Intrinsic::nvvm_suld_2d_array_i32_trap
:
5017 case Intrinsic::nvvm_suld_2d_array_v2i32_trap
:
5018 case Intrinsic::nvvm_suld_2d_array_v4i32_trap
:
5019 case Intrinsic::nvvm_suld_3d_i32_trap
:
5020 case Intrinsic::nvvm_suld_3d_v2i32_trap
:
5021 case Intrinsic::nvvm_suld_3d_v4i32_trap
:
5022 case Intrinsic::nvvm_suld_1d_i32_zero
:
5023 case Intrinsic::nvvm_suld_1d_v2i32_zero
:
5024 case Intrinsic::nvvm_suld_1d_v4i32_zero
:
5025 case Intrinsic::nvvm_suld_1d_array_i32_zero
:
5026 case Intrinsic::nvvm_suld_1d_array_v2i32_zero
:
5027 case Intrinsic::nvvm_suld_1d_array_v4i32_zero
:
5028 case Intrinsic::nvvm_suld_2d_i32_zero
:
5029 case Intrinsic::nvvm_suld_2d_v2i32_zero
:
5030 case Intrinsic::nvvm_suld_2d_v4i32_zero
:
5031 case Intrinsic::nvvm_suld_2d_array_i32_zero
:
5032 case Intrinsic::nvvm_suld_2d_array_v2i32_zero
:
5033 case Intrinsic::nvvm_suld_2d_array_v4i32_zero
:
5034 case Intrinsic::nvvm_suld_3d_i32_zero
:
5035 case Intrinsic::nvvm_suld_3d_v2i32_zero
:
5036 case Intrinsic::nvvm_suld_3d_v4i32_zero
:
5037 Info
.opc
= getOpcForSurfaceInstr(Intrinsic
);
5038 Info
.memVT
= MVT::i32
;
5039 Info
.ptrVal
= nullptr;
5041 Info
.flags
= MachineMemOperand::MOLoad
;
5042 Info
.align
= Align(16);
5045 case Intrinsic::nvvm_suld_1d_i64_clamp
:
5046 case Intrinsic::nvvm_suld_1d_v2i64_clamp
:
5047 case Intrinsic::nvvm_suld_1d_array_i64_clamp
:
5048 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp
:
5049 case Intrinsic::nvvm_suld_2d_i64_clamp
:
5050 case Intrinsic::nvvm_suld_2d_v2i64_clamp
:
5051 case Intrinsic::nvvm_suld_2d_array_i64_clamp
:
5052 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp
:
5053 case Intrinsic::nvvm_suld_3d_i64_clamp
:
5054 case Intrinsic::nvvm_suld_3d_v2i64_clamp
:
5055 case Intrinsic::nvvm_suld_1d_i64_trap
:
5056 case Intrinsic::nvvm_suld_1d_v2i64_trap
:
5057 case Intrinsic::nvvm_suld_1d_array_i64_trap
:
5058 case Intrinsic::nvvm_suld_1d_array_v2i64_trap
:
5059 case Intrinsic::nvvm_suld_2d_i64_trap
:
5060 case Intrinsic::nvvm_suld_2d_v2i64_trap
:
5061 case Intrinsic::nvvm_suld_2d_array_i64_trap
:
5062 case Intrinsic::nvvm_suld_2d_array_v2i64_trap
:
5063 case Intrinsic::nvvm_suld_3d_i64_trap
:
5064 case Intrinsic::nvvm_suld_3d_v2i64_trap
:
5065 case Intrinsic::nvvm_suld_1d_i64_zero
:
5066 case Intrinsic::nvvm_suld_1d_v2i64_zero
:
5067 case Intrinsic::nvvm_suld_1d_array_i64_zero
:
5068 case Intrinsic::nvvm_suld_1d_array_v2i64_zero
:
5069 case Intrinsic::nvvm_suld_2d_i64_zero
:
5070 case Intrinsic::nvvm_suld_2d_v2i64_zero
:
5071 case Intrinsic::nvvm_suld_2d_array_i64_zero
:
5072 case Intrinsic::nvvm_suld_2d_array_v2i64_zero
:
5073 case Intrinsic::nvvm_suld_3d_i64_zero
:
5074 case Intrinsic::nvvm_suld_3d_v2i64_zero
:
5075 Info
.opc
= getOpcForSurfaceInstr(Intrinsic
);
5076 Info
.memVT
= MVT::i64
;
5077 Info
.ptrVal
= nullptr;
5079 Info
.flags
= MachineMemOperand::MOLoad
;
5080 Info
.align
= Align(16);
5086 /// getFunctionParamOptimizedAlign - since function arguments are passed via
5087 /// .param space, we may want to increase their alignment in a way that
5088 /// ensures that we can effectively vectorize their loads & stores. We can
5089 /// increase alignment only if the function has internal or has private
5090 /// linkage as for other linkage types callers may already rely on default
5091 /// alignment. To allow using 128-bit vectorized loads/stores, this function
5092 /// ensures that alignment is 16 or greater.
5093 Align
NVPTXTargetLowering::getFunctionParamOptimizedAlign(
5094 const Function
*F
, Type
*ArgTy
, const DataLayout
&DL
) const {
5095 // Capping the alignment to 128 bytes as that is the maximum alignment
5096 // supported by PTX.
5097 const Align ABITypeAlign
= std::min(Align(128), DL
.getABITypeAlign(ArgTy
));
5099 // If a function has linkage different from internal or private, we
5100 // must use default ABI alignment as external users rely on it. Same
5101 // for a function that may be called from a function pointer.
5102 if (!F
|| !F
->hasLocalLinkage() ||
5103 F
->hasAddressTaken(/*Users=*/nullptr,
5104 /*IgnoreCallbackUses=*/false,
5105 /*IgnoreAssumeLikeCalls=*/true,
5106 /*IgnoreLLVMUsed=*/true))
5107 return ABITypeAlign
;
5109 assert(!isKernelFunction(*F
) && "Expect kernels to have non-local linkage");
5110 return std::max(Align(16), ABITypeAlign
);
5113 /// Helper for computing alignment of a device function byval parameter.
5114 Align
NVPTXTargetLowering::getFunctionByValParamAlign(
5115 const Function
*F
, Type
*ArgTy
, Align InitialAlign
,
5116 const DataLayout
&DL
) const {
5117 Align ArgAlign
= InitialAlign
;
5118 // Try to increase alignment to enhance vectorization options.
5120 ArgAlign
= std::max(ArgAlign
, getFunctionParamOptimizedAlign(F
, ArgTy
, DL
));
5122 // Old ptx versions have a bug. When PTX code takes address of
5123 // byval parameter with alignment < 4, ptxas generates code to
5124 // spill argument into memory. Alas on sm_50+ ptxas generates
5125 // SASS code that fails with misaligned access. To work around
5126 // the problem, make sure that we align byval parameters by at
5127 // least 4. This bug seems to be fixed at least starting from
5129 // TODO: remove this after verifying the bug is not reproduced
5130 // on non-deprecated ptxas versions.
5131 if (ForceMinByValParamAlign
)
5132 ArgAlign
= std::max(ArgAlign
, Align(4));
5137 // Helper for getting a function parameter name. Name is composed from
5138 // its index and the function name. Negative index corresponds to special
5139 // parameter (unsized array) used for passing variable arguments.
5140 std::string
NVPTXTargetLowering::getParamName(const Function
*F
,
5142 std::string ParamName
;
5143 raw_string_ostream
ParamStr(ParamName
);
5145 ParamStr
<< getTargetMachine().getSymbol(F
)->getName();
5147 ParamStr
<< "_vararg";
5149 ParamStr
<< "_param_" << Idx
;
5154 /// isLegalAddressingMode - Return true if the addressing mode represented
5155 /// by AM is legal for this target, for a load/store of the specified type.
5156 /// Used to guide target specific optimizations, like loop strength reduction
5157 /// (LoopStrengthReduce.cpp) and memory optimization for address mode
5158 /// (CodeGenPrepare.cpp)
5159 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout
&DL
,
5160 const AddrMode
&AM
, Type
*Ty
,
5161 unsigned AS
, Instruction
*I
) const {
5162 // AddrMode - This represents an addressing mode of:
5163 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5165 // The legal address modes are
5171 // immoff must fit in a signed 32-bit int
5172 if (!APInt(64, AM
.BaseOffs
).isSignedIntN(32))
5176 return !AM
.BaseOffs
&& !AM
.HasBaseReg
&& !AM
.Scale
;
5179 case 0: // "r", "r+i" or "i" is allowed
5182 if (AM
.HasBaseReg
) // "r+r+i" or "r+r" is not allowed.
5184 // Otherwise we have r+i.
5187 // No scale > 1 is allowed
5193 //===----------------------------------------------------------------------===//
5194 // NVPTX Inline Assembly Support
5195 //===----------------------------------------------------------------------===//
5197 /// getConstraintType - Given a constraint letter, return the type of
5198 /// constraint it is for this target.
5199 NVPTXTargetLowering::ConstraintType
5200 NVPTXTargetLowering::getConstraintType(StringRef Constraint
) const {
5201 if (Constraint
.size() == 1) {
5202 switch (Constraint
[0]) {
5215 return C_RegisterClass
;
5218 return TargetLowering::getConstraintType(Constraint
);
5221 std::pair
<unsigned, const TargetRegisterClass
*>
5222 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo
*TRI
,
5223 StringRef Constraint
,
5225 if (Constraint
.size() == 1) {
5226 switch (Constraint
[0]) {
5228 return std::make_pair(0U, &NVPTX::Int1RegsRegClass
);
5230 return std::make_pair(0U, &NVPTX::Int16RegsRegClass
);
5232 return std::make_pair(0U, &NVPTX::Int16RegsRegClass
);
5234 return std::make_pair(0U, &NVPTX::Int32RegsRegClass
);
5237 return std::make_pair(0U, &NVPTX::Int64RegsRegClass
);
5239 if (STI
.getSmVersion() < 70)
5240 report_fatal_error("Inline asm with 128 bit operands is only "
5241 "supported for sm_70 and higher!");
5242 return std::make_pair(0U, &NVPTX::Int128RegsRegClass
);
5245 return std::make_pair(0U, &NVPTX::Float32RegsRegClass
);
5247 return std::make_pair(0U, &NVPTX::Float64RegsRegClass
);
5250 return TargetLowering::getRegForInlineAsmConstraint(TRI
, Constraint
, VT
);
5253 //===----------------------------------------------------------------------===//
5254 // NVPTX DAG Combining
5255 //===----------------------------------------------------------------------===//
5257 bool NVPTXTargetLowering::allowFMA(MachineFunction
&MF
,
5258 CodeGenOptLevel OptLevel
) const {
5259 // Always honor command-line argument
5260 if (FMAContractLevelOpt
.getNumOccurrences() > 0)
5261 return FMAContractLevelOpt
> 0;
5263 // Do not contract if we're not optimizing the code.
5264 if (OptLevel
== CodeGenOptLevel::None
)
5267 // Honor TargetOptions flags that explicitly say fusion is okay.
5268 if (MF
.getTarget().Options
.AllowFPOpFusion
== FPOpFusion::Fast
)
5271 return allowUnsafeFPMath(MF
);
5274 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction
&MF
) const {
5275 // Honor TargetOptions flags that explicitly say unsafe math is okay.
5276 if (MF
.getTarget().Options
.UnsafeFPMath
)
5279 // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
5280 const Function
&F
= MF
.getFunction();
5281 return F
.getFnAttribute("unsafe-fp-math").getValueAsBool();
5284 static bool isConstZero(const SDValue
&Operand
) {
5285 const auto *Const
= dyn_cast
<ConstantSDNode
>(Operand
);
5286 return Const
&& Const
->getZExtValue() == 0;
5289 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5290 /// operands N0 and N1. This is a helper for PerformADDCombine that is
5291 /// called with the default operands, and if that fails, with commuted
5294 PerformADDCombineWithOperands(SDNode
*N
, SDValue N0
, SDValue N1
,
5295 TargetLowering::DAGCombinerInfo
&DCI
) {
5296 EVT VT
= N0
.getValueType();
5298 // Since integer multiply-add costs the same as integer multiply
5299 // but is more costly than integer add, do the fusion only when
5300 // the mul is only used in the add.
5301 // TODO: this may not be true for later architectures, consider relaxing this
5302 if (!N0
.getNode()->hasOneUse())
5305 // fold (add (mul a, b), c) -> (mad a, b, c)
5307 if (N0
.getOpcode() == ISD::MUL
)
5308 return DCI
.DAG
.getNode(NVPTXISD::IMAD
, SDLoc(N
), VT
, N0
.getOperand(0),
5309 N0
.getOperand(1), N1
);
5311 // fold (add (select cond, 0, (mul a, b)), c)
5312 // -> (select cond, c, (mad a, b, c))
5314 if (N0
.getOpcode() == ISD::SELECT
) {
5316 if (isConstZero(N0
->getOperand(1)))
5318 else if (isConstZero(N0
->getOperand(2)))
5323 SDValue M
= N0
->getOperand((ZeroOpNum
== 1) ? 2 : 1);
5324 if (M
->getOpcode() != ISD::MUL
|| !M
.getNode()->hasOneUse())
5327 SDValue MAD
= DCI
.DAG
.getNode(NVPTXISD::IMAD
, SDLoc(N
), VT
,
5328 M
->getOperand(0), M
->getOperand(1), N1
);
5329 return DCI
.DAG
.getSelect(SDLoc(N
), VT
, N0
->getOperand(0),
5330 ((ZeroOpNum
== 1) ? N1
: MAD
),
5331 ((ZeroOpNum
== 1) ? MAD
: N1
));
5338 PerformFADDCombineWithOperands(SDNode
*N
, SDValue N0
, SDValue N1
,
5339 TargetLowering::DAGCombinerInfo
&DCI
,
5340 CodeGenOptLevel OptLevel
) {
5341 EVT VT
= N0
.getValueType();
5342 if (N0
.getOpcode() == ISD::FMUL
) {
5343 const auto *TLI
= static_cast<const NVPTXTargetLowering
*>(
5344 &DCI
.DAG
.getTargetLoweringInfo());
5345 if (!TLI
->allowFMA(DCI
.DAG
.getMachineFunction(), OptLevel
))
5348 // For floating point:
5349 // Do the fusion only when the mul has less than 5 uses and all
5351 // The heuristic is that if a use is not an add, then that use
5352 // cannot be fused into fma, therefore mul is still needed anyway.
5353 // If there are more than 4 uses, even if they are all add, fusing
5354 // them will increase register pressue.
5357 int nonAddCount
= 0;
5358 for (const SDNode
*User
: N0
.getNode()->uses()) {
5360 if (User
->getOpcode() != ISD::FADD
)
5366 int orderNo
= N
->getIROrder();
5367 int orderNo2
= N0
.getNode()->getIROrder();
5368 // simple heuristics here for considering potential register
5369 // pressure, the logics here is that the differnce are used
5370 // to measure the distance between def and use, the longer distance
5371 // more likely cause register pressure.
5372 if (orderNo
- orderNo2
< 500)
5375 // Now, check if at least one of the FMUL's operands is live beyond the
5376 // node N, which guarantees that the FMA will not increase register
5377 // pressure at node N.
5378 bool opIsLive
= false;
5379 const SDNode
*left
= N0
.getOperand(0).getNode();
5380 const SDNode
*right
= N0
.getOperand(1).getNode();
5382 if (isa
<ConstantSDNode
>(left
) || isa
<ConstantSDNode
>(right
))
5386 for (const SDNode
*User
: left
->uses()) {
5387 int orderNo3
= User
->getIROrder();
5388 if (orderNo3
> orderNo
) {
5395 for (const SDNode
*User
: right
->uses()) {
5396 int orderNo3
= User
->getIROrder();
5397 if (orderNo3
> orderNo
) {
5407 return DCI
.DAG
.getNode(ISD::FMA
, SDLoc(N
), VT
, N0
.getOperand(0),
5408 N0
.getOperand(1), N1
);
5414 static SDValue
PerformStoreCombineHelper(SDNode
*N
, std::size_t Front
,
5416 if (all_of(N
->ops().drop_front(Front
).drop_back(Back
),
5417 [](const SDUse
&U
) { return U
.get()->isUndef(); }))
5418 // Operand 0 is the previous value in the chain. Cannot return EntryToken
5419 // as the previous value will become unused and eliminated later.
5420 return N
->getOperand(0);
5425 static SDValue
PerformStoreParamCombine(SDNode
*N
) {
5426 // Operands from the 3rd to the 2nd last one are the values to be stored.
5427 // {Chain, ArgID, Offset, Val, Glue}
5428 return PerformStoreCombineHelper(N
, 3, 1);
5431 static SDValue
PerformStoreRetvalCombine(SDNode
*N
) {
5432 // Operands from the 2nd to the last one are the values to be stored
5433 return PerformStoreCombineHelper(N
, 2, 0);
5436 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5438 static SDValue
PerformADDCombine(SDNode
*N
,
5439 TargetLowering::DAGCombinerInfo
&DCI
,
5440 CodeGenOptLevel OptLevel
) {
5441 if (OptLevel
== CodeGenOptLevel::None
)
5444 SDValue N0
= N
->getOperand(0);
5445 SDValue N1
= N
->getOperand(1);
5447 // Skip non-integer, non-scalar case
5448 EVT VT
= N0
.getValueType();
5449 if (VT
.isVector() || VT
!= MVT::i32
)
5452 // First try with the default operand order.
5453 if (SDValue Result
= PerformADDCombineWithOperands(N
, N0
, N1
, DCI
))
5456 // If that didn't work, try again with the operands commuted.
5457 return PerformADDCombineWithOperands(N
, N1
, N0
, DCI
);
5460 /// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
5462 static SDValue
PerformFADDCombine(SDNode
*N
,
5463 TargetLowering::DAGCombinerInfo
&DCI
,
5464 CodeGenOptLevel OptLevel
) {
5465 SDValue N0
= N
->getOperand(0);
5466 SDValue N1
= N
->getOperand(1);
5468 EVT VT
= N0
.getValueType();
5469 if (VT
.isVector() || !(VT
== MVT::f32
|| VT
== MVT::f64
))
5472 // First try with the default operand order.
5473 if (SDValue Result
= PerformFADDCombineWithOperands(N
, N0
, N1
, DCI
, OptLevel
))
5476 // If that didn't work, try again with the operands commuted.
5477 return PerformFADDCombineWithOperands(N
, N1
, N0
, DCI
, OptLevel
);
5480 static SDValue
PerformANDCombine(SDNode
*N
,
5481 TargetLowering::DAGCombinerInfo
&DCI
) {
5482 // The type legalizer turns a vector load of i8 values into a zextload to i16
5483 // registers, optionally ANY_EXTENDs it (if target type is integer),
5484 // and ANDs off the high 8 bits. Since we turn this load into a
5485 // target-specific DAG node, the DAG combiner fails to eliminate these AND
5486 // nodes. Do that here.
5487 SDValue Val
= N
->getOperand(0);
5488 SDValue Mask
= N
->getOperand(1);
5490 if (isa
<ConstantSDNode
>(Val
)) {
5491 std::swap(Val
, Mask
);
5496 // Convert BFE-> truncate i16 -> and 255
5497 // To just BFE-> truncate i16, as the value already has all the bits in the
5499 if (Val
.getOpcode() == ISD::TRUNCATE
) {
5500 SDValue BFE
= Val
.getOperand(0);
5501 if (BFE
.getOpcode() != NVPTXISD::BFE
)
5504 ConstantSDNode
*BFEBits
= dyn_cast
<ConstantSDNode
>(BFE
.getOperand(0));
5507 uint64_t BFEBitsVal
= BFEBits
->getZExtValue();
5509 ConstantSDNode
*MaskCnst
= dyn_cast
<ConstantSDNode
>(Mask
);
5511 // Not an AND with a constant
5514 uint64_t MaskVal
= MaskCnst
->getZExtValue();
5516 if (MaskVal
!= (uint64_t(1) << BFEBitsVal
) - 1)
5518 // If we get here, the AND is unnecessary. Just replace it with the trunc
5519 DCI
.CombineTo(N
, Val
, false);
5521 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
5522 if (Val
.getOpcode() == ISD::ANY_EXTEND
) {
5524 Val
= Val
->getOperand(0);
5527 if (Val
->isMachineOpcode() && Val
->getMachineOpcode() == NVPTX::IMOV16rr
) {
5528 Val
= Val
->getOperand(0);
5531 if (Val
->getOpcode() == NVPTXISD::LoadV2
||
5532 Val
->getOpcode() == NVPTXISD::LoadV4
) {
5533 ConstantSDNode
*MaskCnst
= dyn_cast
<ConstantSDNode
>(Mask
);
5535 // Not an AND with a constant
5539 uint64_t MaskVal
= MaskCnst
->getZExtValue();
5540 if (MaskVal
!= 0xff) {
5541 // Not an AND that chops off top 8 bits
5545 MemSDNode
*Mem
= dyn_cast
<MemSDNode
>(Val
);
5547 // Not a MemSDNode?!?
5551 EVT MemVT
= Mem
->getMemoryVT();
5552 if (MemVT
!= MVT::v2i8
&& MemVT
!= MVT::v4i8
) {
5553 // We only handle the i8 case
5557 unsigned ExtType
= Val
->getConstantOperandVal(Val
->getNumOperands() - 1);
5558 if (ExtType
== ISD::SEXTLOAD
) {
5559 // If for some reason the load is a sextload, the and is needed to zero
5560 // out the high 8 bits
5565 if (AExt
.getNode() != nullptr) {
5566 // Re-insert the ext as a zext.
5567 Val
= DCI
.DAG
.getNode(ISD::ZERO_EXTEND
, SDLoc(N
),
5568 AExt
.getValueType(), Val
);
5572 // If we get here, the AND is unnecessary. Just replace it with the load
5573 DCI
.CombineTo(N
, Val
, AddTo
);
5579 static SDValue
PerformREMCombine(SDNode
*N
,
5580 TargetLowering::DAGCombinerInfo
&DCI
,
5581 CodeGenOptLevel OptLevel
) {
5582 assert(N
->getOpcode() == ISD::SREM
|| N
->getOpcode() == ISD::UREM
);
5584 // Don't do anything at less than -O2.
5585 if (OptLevel
< CodeGenOptLevel::Default
)
5588 SelectionDAG
&DAG
= DCI
.DAG
;
5590 EVT VT
= N
->getValueType(0);
5591 bool IsSigned
= N
->getOpcode() == ISD::SREM
;
5592 unsigned DivOpc
= IsSigned
? ISD::SDIV
: ISD::UDIV
;
5594 const SDValue
&Num
= N
->getOperand(0);
5595 const SDValue
&Den
= N
->getOperand(1);
5597 for (const SDNode
*U
: Num
->uses()) {
5598 if (U
->getOpcode() == DivOpc
&& U
->getOperand(0) == Num
&&
5599 U
->getOperand(1) == Den
) {
5600 // Num % Den -> Num - (Num / Den) * Den
5601 return DAG
.getNode(ISD::SUB
, DL
, VT
, Num
,
5602 DAG
.getNode(ISD::MUL
, DL
, VT
,
5603 DAG
.getNode(DivOpc
, DL
, VT
, Num
, Den
),
5610 enum OperandSignedness
{
5616 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5617 /// that can be demoted to \p OptSize bits without loss of information. The
5618 /// signedness of the operand, if determinable, is placed in \p S.
5619 static bool IsMulWideOperandDemotable(SDValue Op
,
5621 OperandSignedness
&S
) {
5624 if (Op
.getOpcode() == ISD::SIGN_EXTEND
||
5625 Op
.getOpcode() == ISD::SIGN_EXTEND_INREG
) {
5626 EVT OrigVT
= Op
.getOperand(0).getValueType();
5627 if (OrigVT
.getFixedSizeInBits() <= OptSize
) {
5631 } else if (Op
.getOpcode() == ISD::ZERO_EXTEND
) {
5632 EVT OrigVT
= Op
.getOperand(0).getValueType();
5633 if (OrigVT
.getFixedSizeInBits() <= OptSize
) {
5642 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5643 /// be demoted to \p OptSize bits without loss of information. If the operands
5644 /// contain a constant, it should appear as the RHS operand. The signedness of
5645 /// the operands is placed in \p IsSigned.
5646 static bool AreMulWideOperandsDemotable(SDValue LHS
, SDValue RHS
,
5649 OperandSignedness LHSSign
;
5651 // The LHS operand must be a demotable op
5652 if (!IsMulWideOperandDemotable(LHS
, OptSize
, LHSSign
))
5655 // We should have been able to determine the signedness from the LHS
5656 if (LHSSign
== Unknown
)
5659 IsSigned
= (LHSSign
== Signed
);
5661 // The RHS can be a demotable op or a constant
5662 if (ConstantSDNode
*CI
= dyn_cast
<ConstantSDNode
>(RHS
)) {
5663 const APInt
&Val
= CI
->getAPIntValue();
5664 if (LHSSign
== Unsigned
) {
5665 return Val
.isIntN(OptSize
);
5667 return Val
.isSignedIntN(OptSize
);
5670 OperandSignedness RHSSign
;
5671 if (!IsMulWideOperandDemotable(RHS
, OptSize
, RHSSign
))
5674 return LHSSign
== RHSSign
;
5678 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5679 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5680 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5682 static SDValue
TryMULWIDECombine(SDNode
*N
,
5683 TargetLowering::DAGCombinerInfo
&DCI
) {
5684 EVT MulType
= N
->getValueType(0);
5685 if (MulType
!= MVT::i32
&& MulType
!= MVT::i64
) {
5690 unsigned OptSize
= MulType
.getSizeInBits() >> 1;
5691 SDValue LHS
= N
->getOperand(0);
5692 SDValue RHS
= N
->getOperand(1);
5694 // Canonicalize the multiply so the constant (if any) is on the right
5695 if (N
->getOpcode() == ISD::MUL
) {
5696 if (isa
<ConstantSDNode
>(LHS
)) {
5697 std::swap(LHS
, RHS
);
5701 // If we have a SHL, determine the actual multiply amount
5702 if (N
->getOpcode() == ISD::SHL
) {
5703 ConstantSDNode
*ShlRHS
= dyn_cast
<ConstantSDNode
>(RHS
);
5708 APInt ShiftAmt
= ShlRHS
->getAPIntValue();
5709 unsigned BitWidth
= MulType
.getSizeInBits();
5710 if (ShiftAmt
.sge(0) && ShiftAmt
.slt(BitWidth
)) {
5711 APInt MulVal
= APInt(BitWidth
, 1) << ShiftAmt
;
5712 RHS
= DCI
.DAG
.getConstant(MulVal
, DL
, MulType
);
5719 // Verify that our operands are demotable
5720 if (!AreMulWideOperandsDemotable(LHS
, RHS
, OptSize
, Signed
)) {
5725 if (MulType
== MVT::i32
) {
5726 DemotedVT
= MVT::i16
;
5728 DemotedVT
= MVT::i32
;
5731 // Truncate the operands to the correct size. Note that these are just for
5732 // type consistency and will (likely) be eliminated in later phases.
5734 DCI
.DAG
.getNode(ISD::TRUNCATE
, DL
, DemotedVT
, LHS
);
5736 DCI
.DAG
.getNode(ISD::TRUNCATE
, DL
, DemotedVT
, RHS
);
5740 Opc
= NVPTXISD::MUL_WIDE_SIGNED
;
5742 Opc
= NVPTXISD::MUL_WIDE_UNSIGNED
;
5745 return DCI
.DAG
.getNode(Opc
, DL
, MulType
, TruncLHS
, TruncRHS
);
5748 static bool isConstOne(const SDValue
&Operand
) {
5749 const auto *Const
= dyn_cast
<ConstantSDNode
>(Operand
);
5750 return Const
&& Const
->getZExtValue() == 1;
5753 static SDValue
matchMADConstOnePattern(SDValue Add
) {
5754 if (Add
->getOpcode() != ISD::ADD
)
5757 if (isConstOne(Add
->getOperand(0)))
5758 return Add
->getOperand(1);
5760 if (isConstOne(Add
->getOperand(1)))
5761 return Add
->getOperand(0);
5766 static SDValue
combineMADConstOne(SDValue X
, SDValue Add
, EVT VT
, SDLoc DL
,
5767 TargetLowering::DAGCombinerInfo
&DCI
) {
5769 if (SDValue Y
= matchMADConstOnePattern(Add
))
5770 return DCI
.DAG
.getNode(NVPTXISD::IMAD
, DL
, VT
, X
, Y
, X
);
5775 static SDValue
combineMulSelectConstOne(SDValue X
, SDValue Select
, EVT VT
,
5777 TargetLowering::DAGCombinerInfo
&DCI
) {
5778 if (Select
->getOpcode() != ISD::SELECT
)
5781 SDValue Cond
= Select
->getOperand(0);
5784 if (isConstOne(Select
->getOperand(1)))
5786 else if (isConstOne(Select
->getOperand(2)))
5791 SDValue Y
= Select
->getOperand((ConstOpNo
== 1) ? 2 : 1);
5793 // Do not combine if the resulting sequence is not obviously profitable.
5794 if (!matchMADConstOnePattern(Y
))
5797 SDValue NewMul
= DCI
.DAG
.getNode(ISD::MUL
, DL
, VT
, X
, Y
);
5799 return DCI
.DAG
.getNode(ISD::SELECT
, DL
, VT
, Cond
,
5800 (ConstOpNo
== 1) ? X
: NewMul
,
5801 (ConstOpNo
== 1) ? NewMul
: X
);
5805 PerformMULCombineWithOperands(SDNode
*N
, SDValue N0
, SDValue N1
,
5806 TargetLowering::DAGCombinerInfo
&DCI
) {
5808 EVT VT
= N0
.getValueType();
5812 if (VT
!= MVT::i16
&& VT
!= MVT::i32
&& VT
!= MVT::i64
)
5817 // (mul x, (add y, 1)) -> (mad x, y, x)
5818 if (SDValue Res
= combineMADConstOne(N0
, N1
, VT
, DL
, DCI
))
5820 if (SDValue Res
= combineMADConstOne(N1
, N0
, VT
, DL
, DCI
))
5823 // (mul x, (select y, 1)) -> (select (mul x, y), x)
5824 if (SDValue Res
= combineMulSelectConstOne(N0
, N1
, VT
, DL
, DCI
))
5826 if (SDValue Res
= combineMulSelectConstOne(N1
, N0
, VT
, DL
, DCI
))
5832 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5833 static SDValue
PerformMULCombine(SDNode
*N
,
5834 TargetLowering::DAGCombinerInfo
&DCI
,
5835 CodeGenOptLevel OptLevel
) {
5836 if (OptLevel
== CodeGenOptLevel::None
)
5839 if (SDValue Ret
= TryMULWIDECombine(N
, DCI
))
5842 SDValue N0
= N
->getOperand(0);
5843 SDValue N1
= N
->getOperand(1);
5844 return PerformMULCombineWithOperands(N
, N0
, N1
, DCI
);
5847 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5848 static SDValue
PerformSHLCombine(SDNode
*N
,
5849 TargetLowering::DAGCombinerInfo
&DCI
,
5850 CodeGenOptLevel OptLevel
) {
5851 if (OptLevel
> CodeGenOptLevel::None
) {
5852 // Try mul.wide combining at OptLevel > 0
5853 if (SDValue Ret
= TryMULWIDECombine(N
, DCI
))
5860 static SDValue
PerformSETCCCombine(SDNode
*N
,
5861 TargetLowering::DAGCombinerInfo
&DCI
,
5862 unsigned int SmVersion
) {
5863 EVT CCType
= N
->getValueType(0);
5864 SDValue A
= N
->getOperand(0);
5865 SDValue B
= N
->getOperand(1);
5867 EVT AType
= A
.getValueType();
5868 if (!(CCType
== MVT::v2i1
&& (AType
== MVT::v2f16
|| AType
== MVT::v2bf16
)))
5871 if (A
.getValueType() == MVT::v2bf16
&& SmVersion
< 90)
5875 // setp.f16x2 returns two scalar predicates, which we need to
5876 // convert back to v2i1. The returned result will be scalarized by
5877 // the legalizer, but the comparison will remain a single vector
5879 SDValue CCNode
= DCI
.DAG
.getNode(
5880 A
.getValueType() == MVT::v2f16
? NVPTXISD::SETP_F16X2
5881 : NVPTXISD::SETP_BF16X2
,
5882 DL
, DCI
.DAG
.getVTList(MVT::i1
, MVT::i1
), {A
, B
, N
->getOperand(2)});
5883 return DCI
.DAG
.getNode(ISD::BUILD_VECTOR
, DL
, CCType
, CCNode
.getValue(0),
5884 CCNode
.getValue(1));
5887 static SDValue
PerformEXTRACTCombine(SDNode
*N
,
5888 TargetLowering::DAGCombinerInfo
&DCI
) {
5889 SDValue Vector
= N
->getOperand(0);
5891 EVT VectorVT
= Vector
.getValueType();
5892 if (Vector
->getOpcode() == ISD::LOAD
&& VectorVT
.isSimple() &&
5893 IsPTXVectorType(VectorVT
.getSimpleVT()))
5894 return SDValue(); // Native vector loads already combine nicely w/
5895 // extract_vector_elt.
5896 // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already
5898 if (VectorVT
.getVectorNumElements() == 1 || Isv2x16VT(VectorVT
) ||
5899 VectorVT
== MVT::v4i8
|| VectorVT
== MVT::v8i8
)
5902 // Don't mess with undef values as sra may be simplified to 0, not undef.
5903 if (Vector
->isUndef() || ISD::allOperandsUndef(Vector
.getNode()))
5906 uint64_t VectorBits
= VectorVT
.getSizeInBits();
5907 // We only handle the types we can extract in-register.
5908 if (!(VectorBits
== 16 || VectorBits
== 32 || VectorBits
== 64))
5911 ConstantSDNode
*Index
= dyn_cast
<ConstantSDNode
>(N
->getOperand(1));
5912 // Index == 0 is handled by generic DAG combiner.
5913 if (!Index
|| Index
->getZExtValue() == 0)
5916 MVT IVT
= MVT::getIntegerVT(VectorBits
);
5917 EVT EltVT
= VectorVT
.getVectorElementType();
5918 EVT EltIVT
= EltVT
.changeTypeToInteger();
5919 uint64_t EltBits
= EltVT
.getScalarSizeInBits();
5921 SDValue Result
= DCI
.DAG
.getNode(
5922 ISD::TRUNCATE
, DL
, EltIVT
,
5924 ISD::SRA
, DL
, IVT
, DCI
.DAG
.getNode(ISD::BITCAST
, DL
, IVT
, Vector
),
5925 DCI
.DAG
.getConstant(Index
->getZExtValue() * EltBits
, DL
, IVT
)));
5927 // If element has non-integer type, bitcast it back to the expected type.
5928 if (EltVT
!= EltIVT
)
5929 Result
= DCI
.DAG
.getNode(ISD::BITCAST
, DL
, EltVT
, Result
);
5930 // Past legalizer, we may need to extent i8 -> i16 to match the register type.
5931 if (EltVT
!= N
->getValueType(0))
5932 Result
= DCI
.DAG
.getNode(ISD::ANY_EXTEND
, DL
, N
->getValueType(0), Result
);
5937 static SDValue
PerformVSELECTCombine(SDNode
*N
,
5938 TargetLowering::DAGCombinerInfo
&DCI
) {
5939 SDValue VA
= N
->getOperand(1);
5940 EVT VectorVT
= VA
.getValueType();
5941 if (VectorVT
!= MVT::v4i8
)
5944 // We need to split vselect into individual per-element operations Because we
5945 // use BFE/BFI instruction for byte extraction/insertion, we do end up with
5946 // 32-bit values, so we may as well do comparison as i32 to avoid conversions
5947 // to/from i16 normally used for i8 values.
5948 SmallVector
<SDValue
, 4> E
;
5950 SDValue VCond
= N
->getOperand(0);
5951 SDValue VB
= N
->getOperand(2);
5952 for (int I
= 0; I
< 4; ++I
) {
5953 SDValue C
= DCI
.DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i1
, VCond
,
5954 DCI
.DAG
.getConstant(I
, DL
, MVT::i32
));
5955 SDValue EA
= DCI
.DAG
.getAnyExtOrTrunc(
5956 DCI
.DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i8
, VA
,
5957 DCI
.DAG
.getConstant(I
, DL
, MVT::i32
)),
5959 SDValue EB
= DCI
.DAG
.getAnyExtOrTrunc(
5960 DCI
.DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i8
, VB
,
5961 DCI
.DAG
.getConstant(I
, DL
, MVT::i32
)),
5963 E
.push_back(DCI
.DAG
.getAnyExtOrTrunc(
5964 DCI
.DAG
.getNode(ISD::SELECT
, DL
, MVT::i32
, C
, EA
, EB
), DL
, MVT::i8
));
5966 return DCI
.DAG
.getNode(ISD::BUILD_VECTOR
, DL
, MVT::v4i8
, E
);
5969 static SDValue
PerformLOADCombine(SDNode
*N
,
5970 TargetLowering::DAGCombinerInfo
&DCI
) {
5971 SelectionDAG
&DAG
= DCI
.DAG
;
5972 LoadSDNode
*LD
= cast
<LoadSDNode
>(N
);
5974 // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
5975 // letting ReplaceLoadVector split it into smaller loads during legalization.
5976 // This is done at dag-combine1 time, so that vector operations with i8
5977 // elements can be optimised away instead of being needlessly split during
5978 // legalization, which involves storing to the stack and loading it back.
5979 EVT VT
= N
->getValueType(0);
5980 if (VT
!= MVT::v16i8
)
5985 // Create a v4i32 vector load operation, effectively <4 x v4i8>.
5986 unsigned Opc
= NVPTXISD::LoadV4
;
5987 EVT NewVT
= MVT::v4i32
;
5988 EVT EltVT
= NewVT
.getVectorElementType();
5989 unsigned NumElts
= NewVT
.getVectorNumElements();
5990 EVT RetVTs
[] = {EltVT
, EltVT
, EltVT
, EltVT
, MVT::Other
};
5991 SDVTList RetVTList
= DAG
.getVTList(RetVTs
);
5992 SmallVector
<SDValue
, 8> Ops(N
->ops());
5993 Ops
.push_back(DAG
.getIntPtrConstant(LD
->getExtensionType(), DL
));
5994 SDValue NewLoad
= DAG
.getMemIntrinsicNode(Opc
, DL
, RetVTList
, Ops
, NewVT
,
5995 LD
->getMemOperand());
5996 SDValue NewChain
= NewLoad
.getValue(NumElts
);
5998 // Create a vector of the same type returned by the original load.
5999 SmallVector
<SDValue
, 4> Elts
;
6000 for (unsigned i
= 0; i
< NumElts
; i
++)
6001 Elts
.push_back(NewLoad
.getValue(i
));
6002 return DCI
.DAG
.getMergeValues(
6003 {DCI
.DAG
.getBitcast(VT
, DCI
.DAG
.getBuildVector(NewVT
, DL
, Elts
)),
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::ADD:
    return PerformADDCombine(N, DCI, OptLevel);
  case ISD::FADD:
    return PerformFADDCombine(N, DCI, OptLevel);
  case ISD::MUL:
    return PerformMULCombine(N, DCI, OptLevel);
  case ISD::SHL:
    return PerformSHLCombine(N, DCI, OptLevel);
  case ISD::AND:
    return PerformANDCombine(N, DCI);
  case ISD::UREM:
  case ISD::SREM:
    return PerformREMCombine(N, DCI, OptLevel);
  case ISD::SETCC:
    return PerformSETCCCombine(N, DCI, STI.getSmVersion());
  case ISD::LOAD:
    return PerformLOADCombine(N, DCI);
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    return PerformStoreRetvalCombine(N);
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
    return PerformStoreParamCombine(N);
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACTCombine(N, DCI);
  case ISD::VSELECT:
    return PerformVSELECTCombine(N, DCI);
  }
  return SDValue();
}
/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal.  We can (and should) split that into 2 loads of <2 x double> here
  // but I'm leaving that as a TODO for now.
  assert(ResVT.isSimple() && "Can only handle simple types");
  switch (ResVT.getSimpleVT().SimpleTy) {
  default:
    return;
  case MVT::v2i8:
  case MVT::v2i16:
  case MVT::v2i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v2f32:
  case MVT::v2f64:
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v4i32:
  case MVT::v4f16:
  case MVT::v4f32:
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v8i16:  // <4 x i16x2>
    // This is a "native" vector type
    break;
  }

  LoadSDNode *LD = cast<LoadSDNode>(N);

  Align Alignment = LD->getAlign();
  auto &TD = DAG.getDataLayout();
  Align PrefAlign =
      TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
  if (Alignment < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized.  Note that we may still be able to emit smaller
    // vector loads.  For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.
    return;
  }

  EVT EltVT = ResVT.getVectorElementType();
  unsigned NumElts = ResVT.getVectorNumElements();

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
  bool NeedTrunc = false;
  if (EltVT.getSizeInBits() < 16) {
    EltVT = MVT::i16;
    NeedTrunc = true;
  }

  unsigned Opcode = 0;
  SDVTList LdResVTs;
  bool Load16x2 = false;

  switch (NumElts) {
  default:
    return;
  case 2:
    Opcode = NVPTXISD::LoadV2;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    break;
  case 4: {
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  case 8: {
    // v8f16 is a special case. PTX doesn't have ld.v8.f16
    // instruction. Instead, we split the vector into v2f16 chunks and
    // load them with ld.v4.b32.
    assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
    Load16x2 = true;
    Opcode = NVPTXISD::LoadV4;
    EVT VVT;
    switch (EltVT.getSimpleVT().SimpleTy) {
    case MVT::f16:
      VVT = MVT::v2f16;
      break;
    case MVT::bf16:
      VVT = MVT::v2bf16;
      break;
    case MVT::i16:
      VVT = MVT::v2i16;
      break;
    default:
      llvm_unreachable("Unsupported v8 vector type.");
    }
    EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  }

  // Copy regular operands
  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());

  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));

  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                          LD->getMemoryVT(),
                                          LD->getMemOperand());

  SmallVector<SDValue, 8> ScalarRes;
  if (Load16x2) {
    // Split v2f16 subvectors back into individual elements.
    NumElts /= 2;
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue SubVector = NewLD.getValue(i);
      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(0, DL));
      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(1, DL));
      ScalarRes.push_back(E0);
      ScalarRes.push_back(E1);
    }
  } else {
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue Res = NewLD.getValue(i);
      if (NeedTrunc)
        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
      ScalarRes.push_back(Res);
    }
  }

  SDValue LoadChain = NewLD.getValue(NumElts);

  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

  Results.push_back(BuildVec);
  Results.push_back(LoadChain);
}
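
// The intrinsic replacement below follows the same pattern for ldg/ldu: e.g.
// an llvm.nvvm.ldg.global.f returning <4 x float> becomes an NVPTXISD::LDGV4
// with four f32 results plus a chain, while a scalar i8 ldg/ldu keeps its
// operands but is widened to an i16 result and truncated back to i8.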
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
  SDValue Chain = N->getOperand(0);
  SDValue Intrin = N->getOperand(1);
  SDLoc DL(N);

  // Get the intrinsic ID
  unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
  switch (IntrinNo) {
  default:
    return;
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    if (ResVT.isVector()) {
      // Vector LDG/LDU

      unsigned NumElts = ResVT.getVectorNumElements();
      EVT EltVT = ResVT.getVectorElementType();

      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization.
      // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
      // loaded type to i16 and propagate the "real" type as the memory type.
      bool NeedTrunc = false;
      if (EltVT.getSizeInBits() < 16) {
        EltVT = MVT::i16;
        NeedTrunc = true;
      }

      unsigned Opcode = 0;
      SDVTList LdResVTs;

      switch (NumElts) {
      default:
        return;
      case 2:
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV2;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV2;
          break;
        }
        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
        break;
      case 4: {
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV4;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV4;
          break;
        }
        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
        LdResVTs = DAG.getVTList(ListVTs);
        break;
      }
      }

      SmallVector<SDValue, 8> OtherOps;

      // Copy regular operands

      OtherOps.push_back(Chain); // Chain
      // Skip operand 1 (intrinsic ID)
      // Others
      OtherOps.append(N->op_begin() + 2, N->op_end());

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                              MemSD->getMemoryVT(),
                                              MemSD->getMemOperand());

      SmallVector<SDValue, 4> ScalarRes;

      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue Res = NewLD.getValue(i);
        if (NeedTrunc)
          Res =
              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
        ScalarRes.push_back(Res);
      }

      SDValue LoadChain = NewLD.getValue(NumElts);

      SDValue BuildVec =
          DAG.getBuildVector(ResVT, DL, ScalarRes);

      Results.push_back(BuildVec);
      Results.push_back(LoadChain);
    } else {
      // i8 LDG/LDU
      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
             "Custom handling of non-i8 ldu/ldg?");

      // Just copy all operands as-is
      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());

      // Force output to i16
      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      // We make sure the memory type is i8, which will be used during isel
      // to select the proper instruction.
      SDValue NewLD =
          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
                                  MVT::i8, MemSD->getMemOperand());

      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                    NewLD.getValue(0)));
      Results.push_back(NewLD.getValue(1));
    }
    break;
  }
  }
}
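
// For 128-bit values below: an (i128 (CopyFromReg ...)) is re-emitted as a
// CopyFromReg that yields two i64 results, and the i128 value visible to the
// rest of the DAG is rebuilt from them with ISD::BUILD_PAIR.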
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &Results) {
  // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
  // result so that it can pass the legalization
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Reg = N->getOperand(1);
  SDValue Glue = N->getOperand(2);

  assert(Reg.getValueType() == MVT::i128 &&
         "Custom lowering for CopyFromReg with 128-bit reg only");
  SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
                                     N->getValueType(2)};
  SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};

  SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
                             {NewValue.getValue(0), NewValue.getValue(1)});

  Results.push_back(Pair);
  Results.push_back(NewValue.getValue(2));
  Results.push_back(NewValue.getValue(3));
}
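
// ReplaceNodeResults routes each custom-legalized node to the matching
// Replace* helper above: vector loads, ldg/ldu intrinsic chains, and 128-bit
// CopyFromReg nodes; anything else is a fatal error.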
void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    report_fatal_error("Unhandled custom legalization");
  case ISD::LOAD:
    ReplaceLoadVector(N, DAG, Results);
    return;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  case ISD::CopyFromReg:
    ReplaceCopyFromReg_128(N, DAG, Results);
    return;
  }
}
NVPTXTargetLowering::AtomicExpansionKind
NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  Type *Ty = AI->getValOperand()->getType();

  if (AI->isFloatingPointOperation()) {
    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
      if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
          STI.getPTXVersion() >= 63)
        return AtomicExpansionKind::None;
      if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
          STI.getPTXVersion() >= 78)
        return AtomicExpansionKind::None;
      if (Ty->isFloatTy())
        return AtomicExpansionKind::None;
      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
        return AtomicExpansionKind::None;
    }
    return AtomicExpansionKind::CmpXChg;
  }

  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
  auto ITy = cast<llvm::IntegerType>(Ty);

  switch (AI->getOperation()) {
  default:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::BinOp::And:
  case AtomicRMWInst::BinOp::Or:
  case AtomicRMWInst::BinOp::Xor:
  case AtomicRMWInst::BinOp::Xchg:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomBitwise64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  case AtomicRMWInst::BinOp::Add:
  case AtomicRMWInst::BinOp::Sub:
  case AtomicRMWInst::BinOp::Max:
  case AtomicRMWInst::BinOp::Min:
  case AtomicRMWInst::BinOp::UMax:
  case AtomicRMWInst::BinOp::UMin:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomMinMax64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  }

  return AtomicExpansionKind::CmpXChg;
}

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;

MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
  return getDataSection();
}