//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static unsigned int uniqueCallSite = 0;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"),
    cl::init(false));

static cl::opt<unsigned>
    FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
                        cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
                                 " 1: do it, 2: do it aggressively)"),
                        cl::init(2));

static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."),
    cl::init(2));
static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
    cl::init(true));

static cl::opt<bool> FtzEnabled(
    "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
    cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
    cl::init(false));
int NVPTXTargetLowering::getDivF32Level() const {
  if (UsePrecDivF32.getNumOccurrences() > 0) {
    // If nvptx-prec-divf32=N is used on the command-line, always honor it
    return UsePrecDivF32;
  } else {
    // Otherwise, use div.approx if fast math is enabled
    if (getTargetMachine().Options.UnsafeFPMath)
      return 0;
    else
      return 2;
  }
}
bool NVPTXTargetLowering::usePrecSqrtF32() const {
  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
    return UsePrecSqrtF32;
  } else {
    // Otherwise, use sqrt.approx if fast math is enabled
    return !getTargetMachine().Options.UnsafeFPMath;
  }
}
bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  // TODO: Get rid of this flag; there can be only one way to do this.
  if (FtzEnabled.getNumOccurrences() > 0) {
    // If nvptx-f32ftz is used on the command-line, always honor it
    return FtzEnabled;
  } else {
    const Function &F = MF.getFunction();
    // Otherwise, check for an nvptx-f32ftz attribute on the function
    if (F.hasFnAttribute("nvptx-f32ftz"))
      return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
    else
      return false;
  }
}
static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v4f16:
  case MVT::v8f16: // <4 x f16x2>
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}
/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
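///
/// Illustrative example (hypothetical input, reasoning from the code below):
/// a <4 x float> is fully scalarized into ValueVTs = {f32, f32, f32, f32}
/// with Offsets = {0, 4, 8, 12}, while a <4 x half> is kept as f16 pairs,
/// giving ValueVTs = {v2f16, v2f16} with Offsets = {0, 4}.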
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  // Special case for i128 - decompose to (i64, i64)
  if (Ty->isIntegerTy(128)) {
    ValueVTs.push_back(EVT(MVT::i64));
    ValueVTs.push_back(EVT(MVT::i64));

    if (Offsets) {
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
    }

    return;
  }

  // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    auto ElementNum = 0;
    for (auto *EI : STy->elements()) {
      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
                         StartingOffset + SL->getElementOffset(ElementNum));
      ++ElementNum;
    }
    return;
  }

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    // Split vectors into individual elements, except for v2f16, which
    // we will pass as a single scalar.
    if (VT.isVector()) {
      unsigned NumElts = VT.getVectorNumElements();
      EVT EltVT = VT.getVectorElementType();
      // Vectors with an even number of f16 elements will be passed to
      // us as an array of v2f16 elements. We must match this so we
      // stay in sync with Ins/Outs.
      if (EltVT == MVT::f16 && NumElts % 2 == 0) {
        EltVT = MVT::v2f16;
        NumElts /= 2;
      }
      for (unsigned j = 0; j != NumElts; ++j) {
        ValueVTs.push_back(EltVT);
        if (Offsets)
          Offsets->push_back(Off + j * EltVT.getStoreSize());
      }
    } else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}
// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
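//
// Illustrative example (hypothetical values): with ValueVTs = {f32, f32, f32,
// f32}, Offsets = {0, 4, 8, 12} and ParamAlignment = 16, a query at Idx = 0
// with AccessSize = 16 returns 4, i.e. all four pieces can be covered by one
// 128-bit vectorized access; the same query with AccessSize = 8 would return 2.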
static unsigned CanMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
  assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");

  // Can't vectorize if param alignment is not sufficient.
  if (AccessSize > ParamAlignment)
    return 1;
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
    return 1;

  EVT EltVT = ValueVTs[Idx];
  unsigned EltSize = EltVT.getStoreSize();

  // Element is too large to vectorize.
  if (EltSize >= AccessSize)
    return 1;

  unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)
    return 1;

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())
    return 1;

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[i..i+NumElts)
  return NumElts;
}
// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0,  // Middle elements of a vector.
  PVF_FIRST = 0x1,  // First element of the vector.
  PVF_LAST = 0x2,   // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};
// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
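//
// Illustrative example (hypothetical values): for the four contiguous f32
// pieces from the example above, aligned to 16 bytes, this returns
// {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST}; a piece that cannot be merged
// with its neighbours stays PVF_SCALAR.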
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     unsigned ParamAlignment) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  // Check what we can vectorize using 128/64/32-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}
// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);

  // By default, use the Source scheduling
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoF16Action) {
    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
  };

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);

  // Conversion to/from FP16/FP16x2 is always legal.
  setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
  setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);

  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);

  // Operations not directly supported by NVPTX.
  for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
                 MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::BR_CC, VT, Expand);
  }

  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  // For others we will expand to a SHL/SRA pair.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
  // that don't have h/w rotation we lower them to multi-instruction assembly.
  // See ROT*_sw in NVPTXIntrInfo.td
  setOperationAction(ISD::ROTL, MVT::i64, Legal);
  setOperationAction(ISD::ROTR, MVT::i64, Legal);
  setOperationAction(ISD::ROTL, MVT::i32, Legal);
  setOperationAction(ISD::ROTR, MVT::i32, Legal);

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant-related memmove and memcpy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fpextend
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  // Turn FP truncstore into trunc + store.
  // FIXME: vector types should also be expanded
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load / store predicate registers
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // This is legal in NVPTX
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

  // TRAP can be lowered to PTX trap
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // Register custom handling for vector loads/stores
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  // Custom handling for i8 intrinsics
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  for (const auto &Ty : {MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::ABS, Ty, Legal);
    setOperationAction(ISD::SMIN, Ty, Legal);
    setOperationAction(ISD::SMAX, Ty, Legal);
    setOperationAction(ISD::UMIN, Ty, Legal);
    setOperationAction(ISD::UMAX, Ty, Legal);

    setOperationAction(ISD::CTPOP, Ty, Legal);
    setOperationAction(ISD::CTLZ, Ty, Legal);
  }

  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);

  // PTX does not directly support SELP of i1, so promote to i32 first
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // PTX cannot multiply two i64s in a single instruction.
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);

  // We have some custom DAG combine patterns for these nodes
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SREM);
  setTargetDAGCombine(ISD::UREM);

  // setcc for f16x2 needs special handling to prevent legalizer's
  // attempt to scalarize it due to v2i1 not being legal.
  if (STI.allowFP16Math())
    setTargetDAGCombine(ISD::SETCC);

  // Promote fp16 arithmetic if fp16 hardware isn't available or the
  // user passed --nvptx-no-fp16-math. The flag is useful because,
  // although sm_53+ GPUs have some sort of FP16 support in
  // hardware, only sm_53 and sm_60 have a full implementation. Others
  // only have a token amount of hardware and are likely to run faster
  // by using fp32 units instead.
  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
  }

  // There's no neg.f16 instruction. Expand to (0-x).
  setOperationAction(ISD::FNEG, MVT::f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v2f16, Expand);

  // (would be) Library functions.

  // These map to conversion instructions for scalar FP types.
  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
                         ISD::FTRUNC}) {
    setOperationAction(Op, MVT::f16, Legal);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
  }

  setOperationAction(ISD::FROUND, MVT::f16, Promote);
  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, MVT::f64, Custom);

  // 'Expand' implements FCOPYSIGN without calling an external library.
  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);

  // These map to corresponding instructions for f32/f64. f16 must be
  // promoted to f32. v2f16 is expanded to f16, which is then promoted
  // to f32.
  for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
                         ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
  }
  setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
  setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
  setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
  setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);

  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
  // No FPOW or FREM in PTX.

  // Now deduce the information based on the above mentioned
  // actions
  computeRegisterProperties(STI.getRegisterInfo());
}

const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((NVPTXISD::NodeType)Opcode) {
  case NVPTXISD::FIRST_NUMBER:
    break;
  case NVPTXISD::CALL:
    return "NVPTXISD::CALL";
  case NVPTXISD::RET_FLAG:
    return "NVPTXISD::RET_FLAG";
  case NVPTXISD::LOAD_PARAM:
    return "NVPTXISD::LOAD_PARAM";
  case NVPTXISD::Wrapper:
    return "NVPTXISD::Wrapper";
  case NVPTXISD::DeclareParam:
    return "NVPTXISD::DeclareParam";
  case NVPTXISD::DeclareScalarParam:
    return "NVPTXISD::DeclareScalarParam";
  case NVPTXISD::DeclareRet:
    return "NVPTXISD::DeclareRet";
  case NVPTXISD::DeclareScalarRet:
    return "NVPTXISD::DeclareScalarRet";
  case NVPTXISD::DeclareRetParam:
    return "NVPTXISD::DeclareRetParam";
  case NVPTXISD::PrintCall:
    return "NVPTXISD::PrintCall";
  case NVPTXISD::PrintConvergentCall:
    return "NVPTXISD::PrintConvergentCall";
  case NVPTXISD::PrintCallUni:
    return "NVPTXISD::PrintCallUni";
  case NVPTXISD::PrintConvergentCallUni:
    return "NVPTXISD::PrintConvergentCallUni";
  case NVPTXISD::LoadParam:
    return "NVPTXISD::LoadParam";
  case NVPTXISD::LoadParamV2:
    return "NVPTXISD::LoadParamV2";
  case NVPTXISD::LoadParamV4:
    return "NVPTXISD::LoadParamV4";
  case NVPTXISD::StoreParam:
    return "NVPTXISD::StoreParam";
  case NVPTXISD::StoreParamV2:
    return "NVPTXISD::StoreParamV2";
  case NVPTXISD::StoreParamV4:
    return "NVPTXISD::StoreParamV4";
  case NVPTXISD::StoreParamS32:
    return "NVPTXISD::StoreParamS32";
  case NVPTXISD::StoreParamU32:
    return "NVPTXISD::StoreParamU32";
  case NVPTXISD::CallArgBegin:
    return "NVPTXISD::CallArgBegin";
  case NVPTXISD::CallArg:
    return "NVPTXISD::CallArg";
  case NVPTXISD::LastCallArg:
    return "NVPTXISD::LastCallArg";
  case NVPTXISD::CallArgEnd:
    return "NVPTXISD::CallArgEnd";
  case NVPTXISD::CallVoid:
    return "NVPTXISD::CallVoid";
  case NVPTXISD::CallVal:
    return "NVPTXISD::CallVal";
  case NVPTXISD::CallSymbol:
    return "NVPTXISD::CallSymbol";
  case NVPTXISD::Prototype:
    return "NVPTXISD::Prototype";
  case NVPTXISD::MoveParam:
    return "NVPTXISD::MoveParam";
  case NVPTXISD::StoreRetval:
    return "NVPTXISD::StoreRetval";
  case NVPTXISD::StoreRetvalV2:
    return "NVPTXISD::StoreRetvalV2";
  case NVPTXISD::StoreRetvalV4:
    return "NVPTXISD::StoreRetvalV4";
  case NVPTXISD::PseudoUseParam:
    return "NVPTXISD::PseudoUseParam";
  case NVPTXISD::RETURN:
    return "NVPTXISD::RETURN";
  case NVPTXISD::CallSeqBegin:
    return "NVPTXISD::CallSeqBegin";
  case NVPTXISD::CallSeqEnd:
    return "NVPTXISD::CallSeqEnd";
  case NVPTXISD::CallPrototype:
    return "NVPTXISD::CallPrototype";
  case NVPTXISD::ProxyReg:
    return "NVPTXISD::ProxyReg";
  case NVPTXISD::LoadV2:
    return "NVPTXISD::LoadV2";
  case NVPTXISD::LoadV4:
    return "NVPTXISD::LoadV4";
  case NVPTXISD::LDGV2:
    return "NVPTXISD::LDGV2";
  case NVPTXISD::LDGV4:
    return "NVPTXISD::LDGV4";
  case NVPTXISD::LDUV2:
    return "NVPTXISD::LDUV2";
  case NVPTXISD::LDUV4:
    return "NVPTXISD::LDUV4";
  case NVPTXISD::StoreV2:
    return "NVPTXISD::StoreV2";
  case NVPTXISD::StoreV4:
    return "NVPTXISD::StoreV4";
  case NVPTXISD::FUN_SHFL_CLAMP:
    return "NVPTXISD::FUN_SHFL_CLAMP";
  case NVPTXISD::FUN_SHFR_CLAMP:
    return "NVPTXISD::FUN_SHFR_CLAMP";
  case NVPTXISD::IMAD:
    return "NVPTXISD::IMAD";
  case NVPTXISD::SETP_F16X2:
    return "NVPTXISD::SETP_F16X2";
  case NVPTXISD::Dummy:
    return "NVPTXISD::Dummy";
  case NVPTXISD::MUL_WIDE_SIGNED:
    return "NVPTXISD::MUL_WIDE_SIGNED";
  case NVPTXISD::MUL_WIDE_UNSIGNED:
    return "NVPTXISD::MUL_WIDE_UNSIGNED";
  case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
  case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
  case NVPTXISD::Tex1DFloatFloatLevel:
    return "NVPTXISD::Tex1DFloatFloatLevel";
  case NVPTXISD::Tex1DFloatFloatGrad:
    return "NVPTXISD::Tex1DFloatFloatGrad";
  case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
  case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
  case NVPTXISD::Tex1DS32FloatLevel:
    return "NVPTXISD::Tex1DS32FloatLevel";
  case NVPTXISD::Tex1DS32FloatGrad:
    return "NVPTXISD::Tex1DS32FloatGrad";
  case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
  case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
  case NVPTXISD::Tex1DU32FloatLevel:
    return "NVPTXISD::Tex1DU32FloatLevel";
  case NVPTXISD::Tex1DU32FloatGrad:
    return "NVPTXISD::Tex1DU32FloatGrad";
  case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    return "NVPTXISD::Tex1DArrayFloatFloatLevel";
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    return "NVPTXISD::Tex1DArrayFloatFloatGrad";
  case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
  case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    return "NVPTXISD::Tex1DArrayS32FloatLevel";
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    return "NVPTXISD::Tex1DArrayS32FloatGrad";
  case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
  case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    return "NVPTXISD::Tex1DArrayU32FloatLevel";
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    return "NVPTXISD::Tex1DArrayU32FloatGrad";
  case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
  case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
  case NVPTXISD::Tex2DFloatFloatLevel:
    return "NVPTXISD::Tex2DFloatFloatLevel";
  case NVPTXISD::Tex2DFloatFloatGrad:
    return "NVPTXISD::Tex2DFloatFloatGrad";
  case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
  case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
  case NVPTXISD::Tex2DS32FloatLevel:
    return "NVPTXISD::Tex2DS32FloatLevel";
  case NVPTXISD::Tex2DS32FloatGrad:
    return "NVPTXISD::Tex2DS32FloatGrad";
  case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
  case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
  case NVPTXISD::Tex2DU32FloatLevel:
    return "NVPTXISD::Tex2DU32FloatLevel";
  case NVPTXISD::Tex2DU32FloatGrad:
    return "NVPTXISD::Tex2DU32FloatGrad";
  case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
  case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
  case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    return "NVPTXISD::Tex2DArrayS32FloatLevel";
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    return "NVPTXISD::Tex2DArrayS32FloatGrad";
  case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
  case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    return "NVPTXISD::Tex2DArrayU32FloatLevel";
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    return "NVPTXISD::Tex2DArrayU32FloatGrad";
  case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
  case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
  case NVPTXISD::Tex3DFloatFloatLevel:
    return "NVPTXISD::Tex3DFloatFloatLevel";
  case NVPTXISD::Tex3DFloatFloatGrad:
    return "NVPTXISD::Tex3DFloatFloatGrad";
  case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
  case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
  case NVPTXISD::Tex3DS32FloatLevel:
    return "NVPTXISD::Tex3DS32FloatLevel";
  case NVPTXISD::Tex3DS32FloatGrad:
    return "NVPTXISD::Tex3DS32FloatGrad";
  case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
  case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
  case NVPTXISD::Tex3DU32FloatLevel:
    return "NVPTXISD::Tex3DU32FloatLevel";
  case NVPTXISD::Tex3DU32FloatGrad:
    return "NVPTXISD::Tex3DU32FloatGrad";
  case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
  case NVPTXISD::TexCubeFloatFloatLevel:
    return "NVPTXISD::TexCubeFloatFloatLevel";
  case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
  case NVPTXISD::TexCubeS32FloatLevel:
    return "NVPTXISD::TexCubeS32FloatLevel";
  case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
  case NVPTXISD::TexCubeU32FloatLevel:
    return "NVPTXISD::TexCubeU32FloatLevel";
  case NVPTXISD::TexCubeArrayFloatFloat:
    return "NVPTXISD::TexCubeArrayFloatFloat";
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    return "NVPTXISD::TexCubeArrayFloatFloatLevel";
  case NVPTXISD::TexCubeArrayS32Float:
    return "NVPTXISD::TexCubeArrayS32Float";
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    return "NVPTXISD::TexCubeArrayS32FloatLevel";
  case NVPTXISD::TexCubeArrayU32Float:
    return "NVPTXISD::TexCubeArrayU32Float";
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    return "NVPTXISD::TexCubeArrayU32FloatLevel";
  case NVPTXISD::Tld4R2DFloatFloat:
    return "NVPTXISD::Tld4R2DFloatFloat";
  case NVPTXISD::Tld4G2DFloatFloat:
    return "NVPTXISD::Tld4G2DFloatFloat";
  case NVPTXISD::Tld4B2DFloatFloat:
    return "NVPTXISD::Tld4B2DFloatFloat";
  case NVPTXISD::Tld4A2DFloatFloat:
    return "NVPTXISD::Tld4A2DFloatFloat";
  case NVPTXISD::Tld4R2DS64Float:
    return "NVPTXISD::Tld4R2DS64Float";
  case NVPTXISD::Tld4G2DS64Float:
    return "NVPTXISD::Tld4G2DS64Float";
  case NVPTXISD::Tld4B2DS64Float:
    return "NVPTXISD::Tld4B2DS64Float";
  case NVPTXISD::Tld4A2DS64Float:
    return "NVPTXISD::Tld4A2DS64Float";
  case NVPTXISD::Tld4R2DU64Float:
    return "NVPTXISD::Tld4R2DU64Float";
  case NVPTXISD::Tld4G2DU64Float:
    return "NVPTXISD::Tld4G2DU64Float";
  case NVPTXISD::Tld4B2DU64Float:
    return "NVPTXISD::Tld4B2DU64Float";
  case NVPTXISD::Tld4A2DU64Float:
    return "NVPTXISD::Tld4A2DU64Float";

  case NVPTXISD::TexUnified1DFloatS32:
    return "NVPTXISD::TexUnified1DFloatS32";
  case NVPTXISD::TexUnified1DFloatFloat:
    return "NVPTXISD::TexUnified1DFloatFloat";
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    return "NVPTXISD::TexUnified1DFloatFloatLevel";
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    return "NVPTXISD::TexUnified1DFloatFloatGrad";
  case NVPTXISD::TexUnified1DS32S32:
    return "NVPTXISD::TexUnified1DS32S32";
  case NVPTXISD::TexUnified1DS32Float:
    return "NVPTXISD::TexUnified1DS32Float";
  case NVPTXISD::TexUnified1DS32FloatLevel:
    return "NVPTXISD::TexUnified1DS32FloatLevel";
  case NVPTXISD::TexUnified1DS32FloatGrad:
    return "NVPTXISD::TexUnified1DS32FloatGrad";
  case NVPTXISD::TexUnified1DU32S32:
    return "NVPTXISD::TexUnified1DU32S32";
  case NVPTXISD::TexUnified1DU32Float:
    return "NVPTXISD::TexUnified1DU32Float";
  case NVPTXISD::TexUnified1DU32FloatLevel:
    return "NVPTXISD::TexUnified1DU32FloatLevel";
  case NVPTXISD::TexUnified1DU32FloatGrad:
    return "NVPTXISD::TexUnified1DU32FloatGrad";
  case NVPTXISD::TexUnified1DArrayFloatS32:
    return "NVPTXISD::TexUnified1DArrayFloatS32";
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    return "NVPTXISD::TexUnified1DArrayFloatFloat";
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
  case NVPTXISD::TexUnified1DArrayS32S32:
    return "NVPTXISD::TexUnified1DArrayS32S32";
  case NVPTXISD::TexUnified1DArrayS32Float:
    return "NVPTXISD::TexUnified1DArrayS32Float";
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
  case NVPTXISD::TexUnified1DArrayU32S32:
    return "NVPTXISD::TexUnified1DArrayU32S32";
  case NVPTXISD::TexUnified1DArrayU32Float:
    return "NVPTXISD::TexUnified1DArrayU32Float";
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
  case NVPTXISD::TexUnified2DFloatS32:
    return "NVPTXISD::TexUnified2DFloatS32";
  case NVPTXISD::TexUnified2DFloatFloat:
    return "NVPTXISD::TexUnified2DFloatFloat";
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    return "NVPTXISD::TexUnified2DFloatFloatLevel";
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    return "NVPTXISD::TexUnified2DFloatFloatGrad";
  case NVPTXISD::TexUnified2DS32S32:
    return "NVPTXISD::TexUnified2DS32S32";
  case NVPTXISD::TexUnified2DS32Float:
    return "NVPTXISD::TexUnified2DS32Float";
  case NVPTXISD::TexUnified2DS32FloatLevel:
    return "NVPTXISD::TexUnified2DS32FloatLevel";
  case NVPTXISD::TexUnified2DS32FloatGrad:
    return "NVPTXISD::TexUnified2DS32FloatGrad";
  case NVPTXISD::TexUnified2DU32S32:
    return "NVPTXISD::TexUnified2DU32S32";
  case NVPTXISD::TexUnified2DU32Float:
    return "NVPTXISD::TexUnified2DU32Float";
  case NVPTXISD::TexUnified2DU32FloatLevel:
    return "NVPTXISD::TexUnified2DU32FloatLevel";
  case NVPTXISD::TexUnified2DU32FloatGrad:
    return "NVPTXISD::TexUnified2DU32FloatGrad";
  case NVPTXISD::TexUnified2DArrayFloatS32:
    return "NVPTXISD::TexUnified2DArrayFloatS32";
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    return "NVPTXISD::TexUnified2DArrayFloatFloat";
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
  case NVPTXISD::TexUnified2DArrayS32S32:
    return "NVPTXISD::TexUnified2DArrayS32S32";
  case NVPTXISD::TexUnified2DArrayS32Float:
    return "NVPTXISD::TexUnified2DArrayS32Float";
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
  case NVPTXISD::TexUnified2DArrayU32S32:
    return "NVPTXISD::TexUnified2DArrayU32S32";
  case NVPTXISD::TexUnified2DArrayU32Float:
    return "NVPTXISD::TexUnified2DArrayU32Float";
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
  case NVPTXISD::TexUnified3DFloatS32:
    return "NVPTXISD::TexUnified3DFloatS32";
  case NVPTXISD::TexUnified3DFloatFloat:
    return "NVPTXISD::TexUnified3DFloatFloat";
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    return "NVPTXISD::TexUnified3DFloatFloatLevel";
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    return "NVPTXISD::TexUnified3DFloatFloatGrad";
  case NVPTXISD::TexUnified3DS32S32:
    return "NVPTXISD::TexUnified3DS32S32";
  case NVPTXISD::TexUnified3DS32Float:
    return "NVPTXISD::TexUnified3DS32Float";
  case NVPTXISD::TexUnified3DS32FloatLevel:
    return "NVPTXISD::TexUnified3DS32FloatLevel";
  case NVPTXISD::TexUnified3DS32FloatGrad:
    return "NVPTXISD::TexUnified3DS32FloatGrad";
  case NVPTXISD::TexUnified3DU32S32:
    return "NVPTXISD::TexUnified3DU32S32";
  case NVPTXISD::TexUnified3DU32Float:
    return "NVPTXISD::TexUnified3DU32Float";
  case NVPTXISD::TexUnified3DU32FloatLevel:
    return "NVPTXISD::TexUnified3DU32FloatLevel";
  case NVPTXISD::TexUnified3DU32FloatGrad:
    return "NVPTXISD::TexUnified3DU32FloatGrad";
  case NVPTXISD::TexUnifiedCubeFloatFloat:
    return "NVPTXISD::TexUnifiedCubeFloatFloat";
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
    return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
  case NVPTXISD::TexUnifiedCubeS32Float:
    return "NVPTXISD::TexUnifiedCubeS32Float";
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
  case NVPTXISD::TexUnifiedCubeU32Float:
    return "NVPTXISD::TexUnifiedCubeU32Float";
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
    return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
    return "NVPTXISD::TexUnifiedCubeArrayS32Float";
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
    return "NVPTXISD::TexUnifiedCubeArrayU32Float";
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
  case NVPTXISD::Tld4UnifiedR2DS64Float:
    return "NVPTXISD::Tld4UnifiedR2DS64Float";
  case NVPTXISD::Tld4UnifiedG2DS64Float:
    return "NVPTXISD::Tld4UnifiedG2DS64Float";
  case NVPTXISD::Tld4UnifiedB2DS64Float:
    return "NVPTXISD::Tld4UnifiedB2DS64Float";
  case NVPTXISD::Tld4UnifiedA2DS64Float:
    return "NVPTXISD::Tld4UnifiedA2DS64Float";
  case NVPTXISD::Tld4UnifiedR2DU64Float:
    return "NVPTXISD::Tld4UnifiedR2DU64Float";
  case NVPTXISD::Tld4UnifiedG2DU64Float:
    return "NVPTXISD::Tld4UnifiedG2DU64Float";
  case NVPTXISD::Tld4UnifiedB2DU64Float:
    return "NVPTXISD::Tld4UnifiedB2DU64Float";
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    return "NVPTXISD::Tld4UnifiedA2DU64Float";

  case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
  case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
  case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
  case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
  case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
  case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
  case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
  case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
  case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
  case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
  case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";

  case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
  case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
  case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
  case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";

  case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
  case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
  case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
  case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
  case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
  case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
  case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
  case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
  case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
  case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
  case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";

  case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
  case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
  case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
  case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";

  case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
  case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
  case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
  case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
  case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
  case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
  case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
  case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
  case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
  case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
  case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";

  case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
  case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
  case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
  case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
  case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
  case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
  case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
  case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
  case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
  case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
  case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";

  case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
  case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
  case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
  case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
  case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
  case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
  case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
  case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
  case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
  case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
  case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";

  case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
  case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
  case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
  case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
  case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
  case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
  case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
  case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
  case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
  case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
  case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";

  case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
  case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
  case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
  case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
  case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
  case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
  case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
  case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
  case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
  case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
  case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";

  case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
  case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
  case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
  case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
  case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
  case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
  case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
  case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
  case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
  case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
  case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";

  case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
  case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
  case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
  case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
  case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
  case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
  case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
  case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
  case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
  case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
  case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";

  case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
  case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
  case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
  case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
  case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
  case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
  case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
  case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
  case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
  case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
  case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";

  case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
  case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
  case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
  case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
  case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
  case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
  case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
  case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
  case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
  case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
  case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";

  case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
  case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
  case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
  case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
  case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
  case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
  case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
  case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
  case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
  case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
  case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";

  case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
  case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
  case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
  case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
  case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
  case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
  case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
  case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
  case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
  case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
  case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
  }
  return nullptr;
}

TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
  if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
    return TypeSplitVector;
  if (VT == MVT::v2f16)
    return TypeLegal;
  return TargetLoweringBase::getPreferredVectorAction(VT);
}

SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                             int Enabled, int &ExtraSteps,
                                             bool &UseOneConst,
                                             bool Reciprocal) const {
  if (!(Enabled == ReciprocalEstimate::Enabled ||
        (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
    return SDValue();

  if (ExtraSteps == ReciprocalEstimate::Unspecified)
    ExtraSteps = 0;

  SDLoc DL(Operand);
  EVT VT = Operand.getValueType();
  bool Ftz = useF32FTZ(DAG.getMachineFunction());

  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(IID, DL, MVT::i32), Operand);
  };

  // The sqrt and rsqrt refinement processes assume we always start out with an
  // approximation of the rsqrt. Therefore, if we're going to do any refinement
  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
  // any refinement, we must return a regular sqrt.
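  // Concretely (restating the code below, not new behavior): f32 maps to the
  // nvvm sqrt/rsqrt approximation intrinsics (with .ftz variants when flushing
  // subnormals); f64 only has an rsqrt approximation, so a plain f64 sqrt
  // estimate is built as reciprocal(rsqrt(x)).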
  if (Reciprocal || ExtraSteps > 0) {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
                                   : Intrinsic::nvvm_rsqrt_approx_f);
    else if (VT == MVT::f64)
      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
    else
      return SDValue();
  } else {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                   : Intrinsic::nvvm_sqrt_approx_f);
    else {
      // There's no sqrt.approx.f64 instruction, so we emit
      // reciprocal(rsqrt(x)). This is faster than
      // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
      // x * rsqrt(x).)
      return DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, VT,
          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
    }
  }
}

SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}

std::string NVPTXTargetLowering::getPrototype(
    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
    const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
    ImmutableCallSite CS) const {
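  // Illustrative example (assumed shape, not output captured from the
  // compiler): for a callee like "float @foo(float, i32)" this builds a string
  // roughly of the form
  //   prototype_<N> : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b32 _);
  // where <N> is the current uniqueCallSite value.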
1262 auto PtrVT = getPointerTy(DL);
1264 bool isABI = (STI.getSmVersion() >= 20);
1265 assert(isABI && "Non-ABI compilation is not supported");
1266 if (!isABI)
1267 return "";
1269 std::stringstream O;
1270 O << "prototype_" << uniqueCallSite << " : .callprototype ";
1272 if (retTy->getTypeID() == Type::VoidTyID) {
1273 O << "()";
1274 } else {
1275 O << "(";
1276 if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
1277 unsigned size = 0;
1278 if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1279 size = ITy->getBitWidth();
1280 } else {
1281 assert(retTy->isFloatingPointTy() &&
1282 "Floating point type expected here");
1283 size = retTy->getPrimitiveSizeInBits();
1285 // PTX ABI requires all scalar return values to be at least 32
1286 // bits in size. fp16 normally uses .b16 as its storage type in
1287 // PTX, so its size must be adjusted here, too.
1288 if (size < 32)
1289 size = 32;
1291 O << ".param .b" << size << " _";
1292 } else if (isa<PointerType>(retTy)) {
1293 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1294 } else if (retTy->isAggregateType() || retTy->isVectorTy() ||
1295 retTy->isIntegerTy(128)) {
1296 O << ".param .align " << retAlignment << " .b8 _["
1297 << DL.getTypeAllocSize(retTy) << "]";
1298 } else {
1299 llvm_unreachable("Unknown return type");
1301 O << ") ";
1303 O << "_ (";
1305 bool first = true;
1307 unsigned OIdx = 0;
1308 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1309 Type *Ty = Args[i].Ty;
1310 if (!first) {
1311 O << ", ";
1313 first = false;
1315 if (!Outs[OIdx].Flags.isByVal()) {
1316 if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1317 unsigned align = 0;
1318 const CallInst *CallI = cast<CallInst>(CS.getInstruction());
1319 // +1 because index 0 is reserved for return type alignment
1320 if (!getAlign(*CallI, i + 1, align))
1321 align = DL.getABITypeAlignment(Ty);
1322 unsigned sz = DL.getTypeAllocSize(Ty);
1323 O << ".param .align " << align << " .b8 ";
1324 O << "_";
1325 O << "[" << sz << "]";
1326 // update the index for Outs
1327 SmallVector<EVT, 16> vtparts;
1328 ComputeValueVTs(*this, DL, Ty, vtparts);
1329 if (unsigned len = vtparts.size())
1330 OIdx += len - 1;
1331 continue;
1333 // i8 types in IR will be i16 types in SDAG
1334 assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1335 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1336 "type mismatch between callee prototype and arguments");
1337 // scalar type
1338 unsigned sz = 0;
1339 if (isa<IntegerType>(Ty)) {
1340 sz = cast<IntegerType>(Ty)->getBitWidth();
1341 if (sz < 32)
1342 sz = 32;
1343 } else if (isa<PointerType>(Ty)) {
1344 sz = PtrVT.getSizeInBits();
1345 } else if (Ty->isHalfTy())
1346 // PTX ABI requires all scalar parameters to be at least 32
1347 // bits in size. fp16 normally uses .b16 as its storage type
1348 // in PTX, so its size must be adjusted here, too.
1349 sz = 32;
1350 else
1351 sz = Ty->getPrimitiveSizeInBits();
1352 O << ".param .b" << sz << " ";
1353 O << "_";
1354 continue;
1356 auto *PTy = dyn_cast<PointerType>(Ty);
1357 assert(PTy && "Param with byval attribute should be a pointer type");
1358 Type *ETy = PTy->getElementType();
1360 unsigned align = Outs[OIdx].Flags.getByValAlign();
1361 unsigned sz = DL.getTypeAllocSize(ETy);
1362 O << ".param .align " << align << " .b8 ";
1363 O << "_";
1364 O << "[" << sz << "]";
1366 O << ");";
1367 return O.str();
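// Compute the alignment (in bytes) to use for argument Idx of a call
// (Idx == 0 refers to the return value). Prefer explicit "align" annotations
// on the call site or on the resolved callee; otherwise fall back to the ABI
// type alignment.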
1370 unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
1371 ImmutableCallSite CS,
1372 Type *Ty, unsigned Idx,
1373 const DataLayout &DL) const {
1374 if (!CS) {
1375 // CallSite is null; fall back to the ABI type alignment
1376 return DL.getABITypeAlignment(Ty);
1379 unsigned Align = 0;
1380 const Value *DirectCallee = CS.getCalledFunction();
1382 if (!DirectCallee) {
1383 // We don't have a direct function symbol, but that may be because of
1384 // constant cast instructions in the call.
1385 const Instruction *CalleeI = CS.getInstruction();
1386 assert(CalleeI && "Call target is not a function or derived value?");
1388 // With bitcast'd call targets, the instruction will be the call
1389 if (isa<CallInst>(CalleeI)) {
1390 // Check if we have call alignment metadata
1391 if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
1392 return Align;
1394 const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
1395 // Ignore any bitcast instructions
1396 while (isa<ConstantExpr>(CalleeV)) {
1397 const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
1398 if (!CE->isCast())
1399 break;
1400 // Look through the bitcast
1401 CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
1404 // We have now looked past all of the bitcasts. Do we finally have a
1405 // Function?
1406 if (isa<Function>(CalleeV))
1407 DirectCallee = CalleeV;
1411 // Check for function alignment information if we found that the
1412 // ultimate target is a Function
1413 if (DirectCallee)
1414 if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
1415 return Align;
1417 // Call is indirect or alignment information is not available, fall back to
1418 // the ABI type alignment
1419 return DL.getABITypeAlignment(Ty);
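// Lower an outgoing call: declare each argument as a PTX .param (or a byval
// copy of one), store the argument values into param space, emit the printed
// call sequence, and finally load any returned values back out of the retval
// param.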
1422 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1423 SmallVectorImpl<SDValue> &InVals) const {
1424 SelectionDAG &DAG = CLI.DAG;
1425 SDLoc dl = CLI.DL;
1426 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1427 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1428 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1429 SDValue Chain = CLI.Chain;
1430 SDValue Callee = CLI.Callee;
1431 bool &isTailCall = CLI.IsTailCall;
1432 ArgListTy &Args = CLI.getArgs();
1433 Type *RetTy = CLI.RetTy;
1434 ImmutableCallSite CS = CLI.CS;
1435 const DataLayout &DL = DAG.getDataLayout();
1437 bool isABI = (STI.getSmVersion() >= 20);
1438 assert(isABI && "Non-ABI compilation is not supported");
1439 if (!isABI)
1440 return Chain;
1442 SDValue tempChain = Chain;
1443 Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
1444 SDValue InFlag = Chain.getValue(1);
1446 unsigned paramCount = 0;
1447 // Args.size() and Outs.size() need not match.
1448 // Outs.size() will be larger
1449 // * if there is an aggregate argument with multiple fields (each field
1450 // showing up separately in Outs)
1451 // * if there is a vector argument with more than typical vector-length
1452 // elements (generally if more than 4) where each vector element is
1453 // individually present in Outs.
1454 // So a different index should be used for indexing into Outs/OutVals.
1455 // See similar issue in LowerFormalArguments.
1456 unsigned OIdx = 0;
1457 // Declare the .param or .reg spaces needed to pass values
1458 // to the function
1459 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1460 EVT VT = Outs[OIdx].VT;
1461 Type *Ty = Args[i].Ty;
1463 if (!Outs[OIdx].Flags.isByVal()) {
1464 SmallVector<EVT, 16> VTs;
1465 SmallVector<uint64_t, 16> Offsets;
1466 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
1467 unsigned ArgAlign =
1468 getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
1469 unsigned AllocSize = DL.getTypeAllocSize(Ty);
1470 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1471 bool NeedAlign; // Does argument declaration specify alignment?
1472 if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1473 // declare .param .align <align> .b8 .param<n>[<size>];
1474 SDValue DeclareParamOps[] = {
1475 Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1476 DAG.getConstant(paramCount, dl, MVT::i32),
1477 DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
1478 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1479 DeclareParamOps);
1480 NeedAlign = true;
1481 } else {
1482 // declare .param .b<size> .param<n>;
1483 if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
1484 // PTX ABI requires integral types to be at least 32 bits in
1485 // size. FP16 is loaded/stored using i16, so it's handled
1486 // here as well.
1487 AllocSize = 4;
1489 SDValue DeclareScalarParamOps[] = {
1490 Chain, DAG.getConstant(paramCount, dl, MVT::i32),
1491 DAG.getConstant(AllocSize * 8, dl, MVT::i32),
1492 DAG.getConstant(0, dl, MVT::i32), InFlag};
1493 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1494 DeclareScalarParamOps);
1495 NeedAlign = false;
1497 InFlag = Chain.getValue(1);
1499 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1500 // than 32-bits are sign extended or zero extended, depending on
1501 // whether they are signed or unsigned types. This case applies
1502 // only to scalar parameters and not to aggregate values.
1503 bool ExtendIntegerParam =
1504 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1506 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
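// Each VectorInfo entry carries PVF_FIRST/PVF_LAST flags marking where a group
// of elements that can be stored with a single StoreParam/StoreParamV2/
// StoreParamV4 node begins and ends; a single-element group carries both flags.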
1507 SmallVector<SDValue, 6> StoreOperands;
1508 for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1509 // New store.
1510 if (VectorInfo[j] & PVF_FIRST) {
1511 assert(StoreOperands.empty() && "Unfinished preceding store.");
1512 StoreOperands.push_back(Chain);
1513 StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
1514 StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
1517 EVT EltVT = VTs[j];
1518 SDValue StVal = OutVals[OIdx];
1519 if (ExtendIntegerParam) {
1520 assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1521 // zext/sext to i32
1522 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1523 : ISD::ZERO_EXTEND,
1524 dl, MVT::i32, StVal);
1525 } else if (EltVT.getSizeInBits() < 16) {
1526 // Use 16-bit registers for small stores as it's the
1527 // smallest general purpose register size supported by NVPTX.
1528 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1531 // Record the value to store.
1532 StoreOperands.push_back(StVal);
1534 if (VectorInfo[j] & PVF_LAST) {
1535 unsigned NumElts = StoreOperands.size() - 3;
1536 NVPTXISD::NodeType Op;
1537 switch (NumElts) {
1538 case 1:
1539 Op = NVPTXISD::StoreParam;
1540 break;
1541 case 2:
1542 Op = NVPTXISD::StoreParamV2;
1543 break;
1544 case 4:
1545 Op = NVPTXISD::StoreParamV4;
1546 break;
1547 default:
1548 llvm_unreachable("Invalid vector info.");
1551 StoreOperands.push_back(InFlag);
1553 // Adjust type of the store op if we've extended the scalar
1554 // param value.
1555 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
1556 unsigned EltAlign =
1557 NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;
1559 Chain = DAG.getMemIntrinsicNode(
1560 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1561 TheStoreType, MachinePointerInfo(), EltAlign,
1562 MachineMemOperand::MOStore);
1563 InFlag = Chain.getValue(1);
1565 // Cleanup.
1566 StoreOperands.clear();
1568 ++OIdx;
1570 assert(StoreOperands.empty() && "Unfinished parameter store.");
1571 if (VTs.size() > 0)
1572 --OIdx;
1573 ++paramCount;
1574 continue;
1577 // ByVal arguments
1578 SmallVector<EVT, 16> VTs;
1579 SmallVector<uint64_t, 16> Offsets;
1580 auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
1581 assert(PTy && "Type of a byval parameter should be pointer");
1582 ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
1584 // declare .param .align <align> .b8 .param<n>[<size>];
1585 unsigned sz = Outs[OIdx].Flags.getByValSize();
1586 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1587 unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
1588 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1589 // so we don't need to fall back to the natural type alignment here.
1590 // See TargetLowering::LowerCallTo().
1592 // Enforce minimum alignment of 4 to work around ptxas miscompile
1593 // for sm_50+. See corresponding alignment adjustment in
1594 // emitFunctionParamList() for details.
1595 if (ArgAlign < 4)
1596 ArgAlign = 4;
1597 SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1598 DAG.getConstant(paramCount, dl, MVT::i32),
1599 DAG.getConstant(sz, dl, MVT::i32), InFlag};
1600 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1601 DeclareParamOps);
1602 InFlag = Chain.getValue(1);
1603 for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1604 EVT elemtype = VTs[j];
1605 int curOffset = Offsets[j];
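// A piece at byte offset curOffset within a block aligned to ArgAlign can only
// be assumed aligned to gcd(ArgAlign, curOffset).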
1606 unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
1607 auto PtrVT = getPointerTy(DL);
1608 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
1609 DAG.getConstant(curOffset, dl, PtrVT));
1610 SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
1611 MachinePointerInfo(), PartAlign);
1612 if (elemtype.getSizeInBits() < 16) {
1613 theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
1615 SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1616 SDValue CopyParamOps[] = { Chain,
1617 DAG.getConstant(paramCount, dl, MVT::i32),
1618 DAG.getConstant(curOffset, dl, MVT::i32),
1619 theVal, InFlag };
1620 Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
1621 CopyParamOps, elemtype,
1622 MachinePointerInfo(), /* Align */ 0,
1623 MachineMemOperand::MOStore);
1625 InFlag = Chain.getValue(1);
1627 ++paramCount;
1630 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1631 unsigned retAlignment = 0;
1633 // Handle Result
1634 if (Ins.size() > 0) {
1635 SmallVector<EVT, 16> resvtparts;
1636 ComputeValueVTs(*this, DL, RetTy, resvtparts);
1638 // Declare
1639 // .param .align 16 .b8 retval0[<size-in-bytes>], or
1640 // .param .b<size-in-bits> retval0
1641 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1642 // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
1643 // these three types to match the logic in
1644 // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
1645 // Plus, this behavior is consistent with nvcc's.
1646 if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
1647 (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
1648 // Scalar needs to be at least 32 bits wide
1649 if (resultsz < 32)
1650 resultsz = 32;
1651 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1652 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1653 DAG.getConstant(resultsz, dl, MVT::i32),
1654 DAG.getConstant(0, dl, MVT::i32), InFlag };
1655 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1656 DeclareRetOps);
1657 InFlag = Chain.getValue(1);
1658 } else {
1659 retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1660 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1661 SDValue DeclareRetOps[] = { Chain,
1662 DAG.getConstant(retAlignment, dl, MVT::i32),
1663 DAG.getConstant(resultsz / 8, dl, MVT::i32),
1664 DAG.getConstant(0, dl, MVT::i32), InFlag };
1665 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1666 DeclareRetOps);
1667 InFlag = Chain.getValue(1);
1671 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1672 // between them we must rely on the call site value which is valid for
1673 // indirect calls but is always null for libcalls.
1674 bool isIndirectCall = !Func && CS;
1676 if (isa<ExternalSymbolSDNode>(Callee)) {
1677 Function* CalleeFunc = nullptr;
1679 // Try to find the callee in the current module.
1680 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1681 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1683 // Set the "libcall callee" attribute to indicate that the function
1684 // must always have a declaration.
1685 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1688 if (isIndirectCall) {
1689 // This is the indirect function call case: PTX requires a prototype of the
1690 // form
1691 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1692 // to be emitted, and that label has to be used as the last arg of the call
1693 // instruction.
1694 // The prototype is embedded in a string and passed as the operand of a
1695 // CallPrototype SDNode, which prints it out as the value of the string.
1696 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1697 std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
1698 const char *ProtoStr =
1699 nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
1700 SDValue ProtoOps[] = {
1701 Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
1703 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1704 InFlag = Chain.getValue(1);
1706 // Op to just print "call"
1707 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1708 SDValue PrintCallOps[] = {
1709 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
1711 // We model convergent calls as separate opcodes.
1712 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
1713 if (CLI.IsConvergent)
1714 Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
1715 : NVPTXISD::PrintConvergentCall;
1716 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1717 InFlag = Chain.getValue(1);
1719 // Ops to print out the function name
1720 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1721 SDValue CallVoidOps[] = { Chain, Callee, InFlag };
1722 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1723 InFlag = Chain.getValue(1);
1725 // Ops to print out the param list
1726 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1727 SDValue CallArgBeginOps[] = { Chain, InFlag };
1728 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1729 CallArgBeginOps);
1730 InFlag = Chain.getValue(1);
1732 for (unsigned i = 0, e = paramCount; i != e; ++i) {
1733 unsigned opcode;
1734 if (i == (e - 1))
1735 opcode = NVPTXISD::LastCallArg;
1736 else
1737 opcode = NVPTXISD::CallArg;
1738 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1739 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1740 DAG.getConstant(i, dl, MVT::i32), InFlag };
1741 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1742 InFlag = Chain.getValue(1);
1744 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1745 SDValue CallArgEndOps[] = { Chain,
1746 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
1747 InFlag };
1748 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1749 InFlag = Chain.getValue(1);
1751 if (isIndirectCall) {
1752 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1753 SDValue PrototypeOps[] = { Chain,
1754 DAG.getConstant(uniqueCallSite, dl, MVT::i32),
1755 InFlag };
1756 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1757 InFlag = Chain.getValue(1);
1760 SmallVector<SDValue, 16> ProxyRegOps;
1761 SmallVector<Optional<MVT>, 16> ProxyRegTruncates;
1763 // Generate loads from param memory/moves from registers for result
1764 if (Ins.size() > 0) {
1765 SmallVector<EVT, 16> VTs;
1766 SmallVector<uint64_t, 16> Offsets;
1767 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
1768 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1770 unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1771 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
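// The same PVF_FIRST/PVF_LAST grouping used for the outgoing parameters above,
// here used to batch elements into LoadParam/LoadParamV2/LoadParamV4 nodes.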
1773 SmallVector<EVT, 6> LoadVTs;
1774 int VecIdx = -1; // Index of the first element of the vector.
1776 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1777 // 32-bits are sign extended or zero extended, depending on whether
1778 // they are signed or unsigned types.
1779 bool ExtendIntegerRetVal =
1780 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1782 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
1783 bool needTruncate = false;
1784 EVT TheLoadType = VTs[i];
1785 EVT EltType = Ins[i].VT;
1786 unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
1787 if (ExtendIntegerRetVal) {
1788 TheLoadType = MVT::i32;
1789 EltType = MVT::i32;
1790 needTruncate = true;
1791 } else if (TheLoadType.getSizeInBits() < 16) {
1792 if (VTs[i].isInteger())
1793 needTruncate = true;
1794 EltType = MVT::i16;
1797 // Record index of the very first element of the vector.
1798 if (VectorInfo[i] & PVF_FIRST) {
1799 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
1800 VecIdx = i;
1803 LoadVTs.push_back(EltType);
1805 if (VectorInfo[i] & PVF_LAST) {
1806 unsigned NumElts = LoadVTs.size();
1807 LoadVTs.push_back(MVT::Other);
1808 LoadVTs.push_back(MVT::Glue);
1809 NVPTXISD::NodeType Op;
1810 switch (NumElts) {
1811 case 1:
1812 Op = NVPTXISD::LoadParam;
1813 break;
1814 case 2:
1815 Op = NVPTXISD::LoadParamV2;
1816 break;
1817 case 4:
1818 Op = NVPTXISD::LoadParamV4;
1819 break;
1820 default:
1821 llvm_unreachable("Invalid vector info.");
1824 SDValue LoadOperands[] = {
1825 Chain, DAG.getConstant(1, dl, MVT::i32),
1826 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
1827 SDValue RetVal = DAG.getMemIntrinsicNode(
1828 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
1829 MachinePointerInfo(), EltAlign,
1830 MachineMemOperand::MOLoad);
1832 for (unsigned j = 0; j < NumElts; ++j) {
1833 ProxyRegOps.push_back(RetVal.getValue(j));
1835 if (needTruncate)
1836 ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT));
1837 else
1838 ProxyRegTruncates.push_back(Optional<MVT>());
1841 Chain = RetVal.getValue(NumElts);
1842 InFlag = RetVal.getValue(NumElts + 1);
1844 // Cleanup
1845 VecIdx = -1;
1846 LoadVTs.clear();
1851 Chain = DAG.getCALLSEQ_END(Chain,
1852 DAG.getIntPtrConstant(uniqueCallSite, dl, true),
1853 DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
1854 true),
1855 InFlag, dl);
1856 InFlag = Chain.getValue(1);
1857 uniqueCallSite++;
1859 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1860 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1861 // dangling.
1862 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
1863 SDValue Ret = DAG.getNode(
1864 NVPTXISD::ProxyReg, dl,
1865 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
1866 { Chain, ProxyRegOps[i], InFlag }
1869 Chain = Ret.getValue(1);
1870 InFlag = Ret.getValue(2);
1872 if (ProxyRegTruncates[i].hasValue()) {
1873 Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret);
1876 InVals.push_back(Ret);
1879 // set isTailCall to false for now, until we figure out how to express
1880 // tail call optimization in PTX
1881 isTailCall = false;
1882 return Chain;
1885 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1886 // (see LegalizeDAG.cpp). This is slow and uses local memory.
1887 // We use extract/insert/build-vector here, just as LegalizeOp() did in LLVM 2.5.
1888 SDValue
1889 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1890 SDNode *Node = Op.getNode();
1891 SDLoc dl(Node);
1892 SmallVector<SDValue, 8> Ops;
1893 unsigned NumOperands = Node->getNumOperands();
1894 for (unsigned i = 0; i < NumOperands; ++i) {
1895 SDValue SubOp = Node->getOperand(i);
1896 EVT VVT = SubOp.getNode()->getValueType(0);
1897 EVT EltVT = VVT.getVectorElementType();
1898 unsigned NumSubElem = VVT.getVectorNumElements();
1899 for (unsigned j = 0; j < NumSubElem; ++j) {
1900 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1901 DAG.getIntPtrConstant(j, dl)));
1904 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1907 // We can initialize a constant f16x2 with a single .b32 move. Normally it
1908 // would get lowered as two constant loads and a vector-packing move:
1909 // mov.b16 %h1, 0x4000;
1910 // mov.b16 %h2, 0x3C00;
1911 // mov.b32 %hh2, {%h2, %h1};
1912 // Instead we want just a constant move:
1913 // mov.b32 %hh2, 0x40003C00
1915 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
1916 // generates good SASS in both cases.
1917 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
1918 SelectionDAG &DAG) const {
1920 if (!(Op->getValueType(0) == MVT::v2f16 &&
1921 isa<ConstantFPSDNode>(Op->getOperand(0)) &&
1922 isa<ConstantFPSDNode>(Op->getOperand(1))))
1923 return Op;
1925 APInt E0 =
1926 cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
1927 APInt E1 =
1928 cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
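// Pack the two halves into one 32-bit constant: element 1 occupies the high
// 16 bits and element 0 the low 16 bits, matching the packing-mov example in
// the comment above.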
1929 SDValue Const =
1930 DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
1931 return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
1934 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
1935 SelectionDAG &DAG) const {
1936 SDValue Index = Op->getOperand(1);
1937 // Constant index will be matched by tablegen.
1938 if (isa<ConstantSDNode>(Index.getNode()))
1939 return Op;
1941 // Extract individual elements and select one of them.
1942 SDValue Vector = Op->getOperand(0);
1943 EVT VectorVT = Vector.getValueType();
1944 assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
1945 EVT EltVT = VectorVT.getVectorElementType();
1947 SDLoc dl(Op.getNode());
1948 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1949 DAG.getIntPtrConstant(0, dl));
1950 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1951 DAG.getIntPtrConstant(1, dl));
1952 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
1953 ISD::CondCode::SETEQ);
1956 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
1957 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
1958 /// amount, or
1959 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
1960 /// amount.
1961 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
1962 SelectionDAG &DAG) const {
1963 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1964 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
1966 EVT VT = Op.getValueType();
1967 unsigned VTBits = VT.getSizeInBits();
1968 SDLoc dl(Op);
1969 SDValue ShOpLo = Op.getOperand(0);
1970 SDValue ShOpHi = Op.getOperand(1);
1971 SDValue ShAmt = Op.getOperand(2);
1972 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
1974 if (VTBits == 32 && STI.getSmVersion() >= 35) {
1975 // For 32-bit shifts on sm_35 and newer, we can use the funnel shift 'shf' instruction.
1976 // {dHi, dLo} = {aHi, aLo} >> Amt
1977 // dHi = aHi >> Amt
1978 // dLo = shf.r.clamp aLo, aHi, Amt
1980 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1981 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
1982 ShAmt);
1984 SDValue Ops[2] = { Lo, Hi };
1985 return DAG.getMergeValues(Ops, dl);
1987 else {
1988 // {dHi, dLo} = {aHi, aLo} >> Amt
1989 // - if (Amt>=size) then
1990 // dLo = aHi >> (Amt-size)
1991 // dHi = aHi >> Amt (this is either all 0 or all 1)
1992 // else
1993 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
1994 // dHi = aHi >> Amt
1996 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
1997 DAG.getConstant(VTBits, dl, MVT::i32),
1998 ShAmt);
1999 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2000 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2001 DAG.getConstant(VTBits, dl, MVT::i32));
2002 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2003 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2004 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2006 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2007 DAG.getConstant(VTBits, dl, MVT::i32),
2008 ISD::SETGE);
2009 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2010 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2012 SDValue Ops[2] = { Lo, Hi };
2013 return DAG.getMergeValues(Ops, dl);
2017 /// LowerShiftLeftParts - Lower SHL_PARTS, which
2018 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2019 /// amount, or
2020 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2021 /// amount.
2022 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2023 SelectionDAG &DAG) const {
2024 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2025 assert(Op.getOpcode() == ISD::SHL_PARTS);
2027 EVT VT = Op.getValueType();
2028 unsigned VTBits = VT.getSizeInBits();
2029 SDLoc dl(Op);
2030 SDValue ShOpLo = Op.getOperand(0);
2031 SDValue ShOpHi = Op.getOperand(1);
2032 SDValue ShAmt = Op.getOperand(2);
2034 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2035 // For 32-bit shifts on sm_35 and newer, we can use the funnel shift 'shf' instruction.
2036 // {dHi, dLo} = {aHi, aLo} << Amt
2037 // dHi = shf.l.clamp aLo, aHi, Amt
2038 // dLo = aLo << Amt
2040 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2041 ShAmt);
2042 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2044 SDValue Ops[2] = { Lo, Hi };
2045 return DAG.getMergeValues(Ops, dl);
2047 else {
2048 // {dHi, dLo} = {aHi, aLo} << Amt
2049 // - if (Amt>=size) then
2050 // dLo = aLo << Amt (all 0)
2051 // dHi = aLo << (Amt-size)
2052 // else
2053 // dLo = aLo << Amt
2054 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2056 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2057 DAG.getConstant(VTBits, dl, MVT::i32),
2058 ShAmt);
2059 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2060 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2061 DAG.getConstant(VTBits, dl, MVT::i32));
2062 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2063 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2064 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2066 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2067 DAG.getConstant(VTBits, dl, MVT::i32),
2068 ISD::SETGE);
2069 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2070 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2072 SDValue Ops[2] = { Lo, Hi };
2073 return DAG.getMergeValues(Ops, dl);
2077 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2078 EVT VT = Op.getValueType();
2080 if (VT == MVT::f32)
2081 return LowerFROUND32(Op, DAG);
2083 if (VT == MVT::f64)
2084 return LowerFROUND64(Op, DAG);
2086 llvm_unreachable("unhandled type");
2089 // This is the rounding method used in CUDA libdevice, in C-like code:
2090 // float roundf(float A)
2091 // {
2092 // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2093 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2094 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2095 // }
2096 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2097 SelectionDAG &DAG) const {
2098 SDLoc SL(Op);
2099 SDValue A = Op.getOperand(0);
2100 EVT VT = Op.getValueType();
2102 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2104 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2105 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2106 const int SignBitMask = 0x80000000;
2107 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2108 DAG.getConstant(SignBitMask, SL, MVT::i32));
2109 const int PointFiveInBits = 0x3F000000;
2110 SDValue PointFiveWithSignRaw =
2111 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2112 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2113 SDValue PointFiveWithSign =
2114 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2115 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2116 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2118 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2119 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2120 SDValue IsLarge =
2121 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2122 ISD::SETOGT);
2123 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2125 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2126 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2127 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2128 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2129 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2132 // The implementation of round(double) is similar to that of round(float) in
2133 // that they both separate the value range into three regions and use a method
2134 // specific to the region to round the values. However, round(double) first
2135 // calculates the round of the absolute value and then adds the sign back while
2136 // round(float) directly rounds the value with sign.
2137 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2138 SelectionDAG &DAG) const {
2139 SDLoc SL(Op);
2140 SDValue A = Op.getOperand(0);
2141 EVT VT = Op.getValueType();
2143 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2145 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2146 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2147 DAG.getConstantFP(0.5, SL, VT));
2148 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2150 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2151 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2152 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2153 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2154 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2155 DAG.getConstantFP(0, SL, VT),
2156 RoundedA);
2158 // Add sign to rounded_A
2159 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2162 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2163 SDValue IsLarge =
2164 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2165 ISD::SETOGT);
2166 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2171 SDValue
2172 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2173 switch (Op.getOpcode()) {
2174 case ISD::RETURNADDR:
2175 return SDValue();
2176 case ISD::FRAMEADDR:
2177 return SDValue();
2178 case ISD::GlobalAddress:
2179 return LowerGlobalAddress(Op, DAG);
2180 case ISD::INTRINSIC_W_CHAIN:
2181 return Op;
2182 case ISD::BUILD_VECTOR:
2183 return LowerBUILD_VECTOR(Op, DAG);
2184 case ISD::EXTRACT_SUBVECTOR:
2185 return Op;
2186 case ISD::EXTRACT_VECTOR_ELT:
2187 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2188 case ISD::CONCAT_VECTORS:
2189 return LowerCONCAT_VECTORS(Op, DAG);
2190 case ISD::STORE:
2191 return LowerSTORE(Op, DAG);
2192 case ISD::LOAD:
2193 return LowerLOAD(Op, DAG);
2194 case ISD::SHL_PARTS:
2195 return LowerShiftLeftParts(Op, DAG);
2196 case ISD::SRA_PARTS:
2197 case ISD::SRL_PARTS:
2198 return LowerShiftRightParts(Op, DAG);
2199 case ISD::SELECT:
2200 return LowerSelect(Op, DAG);
2201 case ISD::FROUND:
2202 return LowerFROUND(Op, DAG);
2203 default:
2204 llvm_unreachable("Custom lowering not defined for operation");
2208 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2209 SDValue Op0 = Op->getOperand(0);
2210 SDValue Op1 = Op->getOperand(1);
2211 SDValue Op2 = Op->getOperand(2);
2212 SDLoc DL(Op.getNode());
2214 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2216 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2217 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2218 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2219 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2221 return Trunc;
2224 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2225 if (Op.getValueType() == MVT::i1)
2226 return LowerLOADi1(Op, DAG);
2228 // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
2229 // loads; we have to handle them here.
2230 if (Op.getValueType() == MVT::v2f16) {
2231 LoadSDNode *Load = cast<LoadSDNode>(Op);
2232 EVT MemVT = Load->getMemoryVT();
2233 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2234 MemVT, *Load->getMemOperand())) {
2235 SDValue Ops[2];
2236 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2237 return DAG.getMergeValues(Ops, SDLoc(Op));
2241 return SDValue();
2244 // v = ld i1* addr
2245 // =>
2246 // v1 = ld i8* addr (-> i16)
2247 // v = trunc i16 to i1
2248 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2249 SDNode *Node = Op.getNode();
2250 LoadSDNode *LD = cast<LoadSDNode>(Node);
2251 SDLoc dl(Node);
2252 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2253 assert(Node->getValueType(0) == MVT::i1 &&
2254 "Custom lowering for i1 load only");
2255 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2256 LD->getPointerInfo(), LD->getAlignment(),
2257 LD->getMemOperand()->getFlags());
2258 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2259 // The legalizer (the caller) is expecting two values from the legalized
2260 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2261 // in LegalizeDAG.cpp which also uses MergeValues.
2262 SDValue Ops[] = { result, LD->getChain() };
2263 return DAG.getMergeValues(Ops, dl);
2266 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2267 StoreSDNode *Store = cast<StoreSDNode>(Op);
2268 EVT VT = Store->getMemoryVT();
2270 if (VT == MVT::i1)
2271 return LowerSTOREi1(Op, DAG);
2273 // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
2274 // stores; we have to handle them here.
2275 if (VT == MVT::v2f16 &&
2276 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2277 VT, *Store->getMemOperand()))
2278 return expandUnalignedStore(Store, DAG);
2280 if (VT.isVector())
2281 return LowerSTOREVector(Op, DAG);
2283 return SDValue();
2286 SDValue
2287 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2288 SDNode *N = Op.getNode();
2289 SDValue Val = N->getOperand(1);
2290 SDLoc DL(N);
2291 EVT ValVT = Val.getValueType();
2293 if (ValVT.isVector()) {
2294 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2295 // legal. We can (and should) split that into 2 stores of <2 x double> here
2296 // but I'm leaving that as a TODO for now.
2297 if (!ValVT.isSimple())
2298 return SDValue();
2299 switch (ValVT.getSimpleVT().SimpleTy) {
2300 default:
2301 return SDValue();
2302 case MVT::v2i8:
2303 case MVT::v2i16:
2304 case MVT::v2i32:
2305 case MVT::v2i64:
2306 case MVT::v2f16:
2307 case MVT::v2f32:
2308 case MVT::v2f64:
2309 case MVT::v4i8:
2310 case MVT::v4i16:
2311 case MVT::v4i32:
2312 case MVT::v4f16:
2313 case MVT::v4f32:
2314 case MVT::v8f16: // <4 x f16x2>
2315 // This is a "native" vector type
2316 break;
2319 MemSDNode *MemSD = cast<MemSDNode>(N);
2320 const DataLayout &TD = DAG.getDataLayout();
2322 unsigned Align = MemSD->getAlignment();
2323 unsigned PrefAlign =
2324 TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
2325 if (Align < PrefAlign) {
2326 // This store is not sufficiently aligned, so bail out and let this vector
2327 // store be scalarized. Note that we may still be able to emit smaller
2328 // vector stores. For example, if we are storing a <4 x float> with an
2329 // alignment of 8, this check will fail but the legalizer will try again
2330 // with 2 x <2 x float>, which will succeed with an alignment of 8.
2331 return SDValue();
2334 unsigned Opcode = 0;
2335 EVT EltVT = ValVT.getVectorElementType();
2336 unsigned NumElts = ValVT.getVectorNumElements();
2338 // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2339 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
2340 // stored type to i16 and propagate the "real" type as the memory type.
2341 bool NeedExt = false;
2342 if (EltVT.getSizeInBits() < 16)
2343 NeedExt = true;
2345 bool StoreF16x2 = false;
2346 switch (NumElts) {
2347 default:
2348 return SDValue();
2349 case 2:
2350 Opcode = NVPTXISD::StoreV2;
2351 break;
2352 case 4:
2353 Opcode = NVPTXISD::StoreV4;
2354 break;
2355 case 8:
2356 // v8f16 is a special case. PTX doesn't have st.v8.f16
2357 // instruction. Instead, we split the vector into v2f16 chunks and
2358 // store them with st.v4.b32.
2359 assert(EltVT == MVT::f16 && "Wrong type for the vector.");
2360 Opcode = NVPTXISD::StoreV4;
2361 StoreF16x2 = true;
2362 break;
2365 SmallVector<SDValue, 8> Ops;
2367 // First is the chain
2368 Ops.push_back(N->getOperand(0));
2370 if (StoreF16x2) {
2371 // Combine f16,f16 -> v2f16
2372 NumElts /= 2;
2373 for (unsigned i = 0; i < NumElts; ++i) {
2374 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2375 DAG.getIntPtrConstant(i * 2, DL));
2376 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2377 DAG.getIntPtrConstant(i * 2 + 1, DL));
2378 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
2379 Ops.push_back(V2);
2381 } else {
2382 // Then the split values
2383 for (unsigned i = 0; i < NumElts; ++i) {
2384 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2385 DAG.getIntPtrConstant(i, DL));
2386 if (NeedExt)
2387 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2388 Ops.push_back(ExtVal);
2392 // Then any remaining arguments
2393 Ops.append(N->op_begin() + 2, N->op_end());
2395 SDValue NewSt =
2396 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2397 MemSD->getMemoryVT(), MemSD->getMemOperand());
2400 return NewSt;
2403 return SDValue();
2406 // st i1 v, addr
2407 // =>
2408 // v1 = zxt v to i16
2409 // st.u8 i16, addr
2410 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2411 SDNode *Node = Op.getNode();
2412 SDLoc dl(Node);
2413 StoreSDNode *ST = cast<StoreSDNode>(Node);
2414 SDValue Tmp1 = ST->getChain();
2415 SDValue Tmp2 = ST->getBasePtr();
2416 SDValue Tmp3 = ST->getValue();
2417 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2418 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2419 SDValue Result =
2420 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
2421 ST->getAlignment(), ST->getMemOperand()->getFlags());
2422 return Result;
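// Return a TargetExternalSymbol naming parameter `idx` of the current
// function, e.g. "foo_param_2" for the third parameter of foo(). The string is
// interned in the managed string pool so it outlives this SelectionDAG.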
2425 SDValue
2426 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
2427 std::string ParamSym;
2428 raw_string_ostream ParamStr(ParamSym);
2430 ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
2431 ParamStr.flush();
2433 std::string *SavedStr =
2434 nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
2435 return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
2438 // Check to see if the kernel argument is image*_t or sampler_t
2440 static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
2441 static const char *const specialTypes[] = { "struct._image2d_t",
2442 "struct._image3d_t",
2443 "struct._sampler_t" };
2445 Type *Ty = arg->getType();
2446 auto *PTy = dyn_cast<PointerType>(Ty);
2448 if (!PTy)
2449 return false;
2451 if (!context)
2452 return false;
2454 auto *STy = dyn_cast<StructType>(PTy->getElementType());
2455 if (!STy || STy->isLiteral())
2456 return false;
2458 return std::find(std::begin(specialTypes), std::end(specialTypes),
2459 STy->getName()) != std::end(specialTypes);
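// Lower the incoming formal arguments. Each argument lives in .param space;
// most are loaded directly from their param symbol, while byval aggregates are
// wrapped in a MoveParam node so later passes see a lowered value.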
2462 SDValue NVPTXTargetLowering::LowerFormalArguments(
2463 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2464 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2465 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2466 MachineFunction &MF = DAG.getMachineFunction();
2467 const DataLayout &DL = DAG.getDataLayout();
2468 auto PtrVT = getPointerTy(DAG.getDataLayout());
2470 const Function *F = &MF.getFunction();
2471 const AttributeList &PAL = F->getAttributes();
2472 const TargetLowering *TLI = STI.getTargetLowering();
2474 SDValue Root = DAG.getRoot();
2475 std::vector<SDValue> OutChains;
2477 bool isABI = (STI.getSmVersion() >= 20);
2478 assert(isABI && "Non-ABI compilation is not supported");
2479 if (!isABI)
2480 return Chain;
2482 std::vector<Type *> argTypes;
2483 std::vector<const Argument *> theArgs;
2484 for (const Argument &I : F->args()) {
2485 theArgs.push_back(&I);
2486 argTypes.push_back(I.getType());
2488 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2489 // Ins.size() will be larger
2490 // * if there is an aggregate argument with multiple fields (each field
2491 // showing up separately in Ins)
2492 // * if there is a vector argument with more than typical vector-length
2493 // elements (generally if more than 4) where each vector element is
2494 // individually present in Ins.
2495 // So a different index should be used for indexing into Ins.
2496 // See similar issue in LowerCall.
2497 unsigned InsIdx = 0;
2499 int idx = 0;
2500 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2501 Type *Ty = argTypes[i];
2503 // If the kernel argument is image*_t or sampler_t, convert it to
2504 // an i32 constant holding the parameter position. This can later be
2505 // matched in the AsmPrinter to output the correct mangled name.
2506 if (isImageOrSamplerVal(
2507 theArgs[i],
2508 (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
2509 : nullptr))) {
2510 assert(isKernelFunction(*F) &&
2511 "Only kernels can have image/sampler params");
2512 InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
2513 continue;
2516 if (theArgs[i]->use_empty()) {
2517 // argument is dead
2518 if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
2519 SmallVector<EVT, 16> vtparts;
2521 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2522 assert(vtparts.size() > 0 && "empty aggregate type not expected");
2523 for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2524 ++parti) {
2525 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2526 ++InsIdx;
2528 if (vtparts.size() > 0)
2529 --InsIdx;
2530 continue;
2532 if (Ty->isVectorTy()) {
2533 EVT ObjectVT = getValueType(DL, Ty);
2534 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2535 for (unsigned parti = 0; parti < NumRegs; ++parti) {
2536 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2537 ++InsIdx;
2539 if (NumRegs > 0)
2540 --InsIdx;
2541 continue;
2543 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2544 continue;
2547 // In the following cases, assign a node order of "idx+1"
2548 // to newly created nodes. The SDNodes for params have to
2549 // appear in the same order as their order of appearance
2550 // in the original function. "idx+1" holds that order.
2551 if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
2552 bool aggregateIsPacked = false;
2553 if (StructType *STy = dyn_cast<StructType>(Ty))
2554 aggregateIsPacked = STy->isPacked();
2556 SmallVector<EVT, 16> VTs;
2557 SmallVector<uint64_t, 16> Offsets;
2558 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
2559 assert(VTs.size() > 0 && "Unexpected empty type.");
2560 auto VectorInfo =
2561 VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
2563 SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2564 int VecIdx = -1; // Index of the first element of the current vector.
2565 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
2566 if (VectorInfo[parti] & PVF_FIRST) {
2567 assert(VecIdx == -1 && "Orphaned vector.");
2568 VecIdx = parti;
2571 // That's the last element of this store op.
2572 if (VectorInfo[parti] & PVF_LAST) {
2573 unsigned NumElts = parti - VecIdx + 1;
2574 EVT EltVT = VTs[parti];
2575 // i1 is loaded/stored as i8.
2576 EVT LoadVT = EltVT;
2577 if (EltVT == MVT::i1)
2578 LoadVT = MVT::i8;
2579 else if (EltVT == MVT::v2f16)
2580 // getLoad needs a vector type, but it can't handle
2581 // vectors which contain v2f16 elements. So we must load
2582 // using i32 here and then bitcast back.
2583 LoadVT = MVT::i32;
2585 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
2586 SDValue VecAddr =
2587 DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2588 DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
2589 Value *srcValue = Constant::getNullValue(PointerType::get(
2590 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
2591 SDValue P =
2592 DAG.getLoad(VecVT, dl, Root, VecAddr,
2593 MachinePointerInfo(srcValue), aggregateIsPacked,
2594 MachineMemOperand::MODereferenceable |
2595 MachineMemOperand::MOInvariant);
2596 if (P.getNode())
2597 P.getNode()->setIROrder(idx + 1);
2598 for (unsigned j = 0; j < NumElts; ++j) {
2599 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
2600 DAG.getIntPtrConstant(j, dl));
2601 // We've loaded i1 as an i8 and now must truncate it back to i1
2602 if (EltVT == MVT::i1)
2603 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
2604 // v2f16 was loaded as an i32. Now we must bitcast it back.
2605 else if (EltVT == MVT::v2f16)
2606 Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
2607 // Extend the element if necessary (e.g. an i8 is loaded
2608 // into an i16 register)
2609 if (Ins[InsIdx].VT.isInteger() &&
2610 Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
2611 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
2612 : ISD::ZERO_EXTEND;
2613 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
2615 InVals.push_back(Elt);
2618 // Reset vector tracking state.
2619 VecIdx = -1;
2621 ++InsIdx;
2623 if (VTs.size() > 0)
2624 --InsIdx;
2625 continue;
2628 // Param has the ByVal attribute.
2629 // Return MoveParam(param symbol).
2630 // Ideally, the param symbol could be returned directly,
2631 // but when the SDNode builder decides to use it in a CopyToReg(),
2632 // the machine instruction fails because a TargetExternalSymbol
2633 // (not lowered) is target dependent, and CopyToReg assumes
2634 // the source is lowered.
2635 EVT ObjectVT = getValueType(DL, Ty);
2636 assert(ObjectVT == Ins[InsIdx].VT &&
2637 "Ins type did not match function type");
2638 SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2639 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
2640 if (p.getNode())
2641 p.getNode()->setIROrder(idx + 1);
2642 InVals.push_back(p);
2645 // Clang will check for explicit varargs and issue an error if any are
2646 // present. However, Clang will let code with an implicit vararg declaration
2647 // like f() pass. See bug 617733.
2648 // We treat this case as if the arg list is empty.
2649 // if (F.isVarArg()) {
2650 // assert(0 && "VarArg not supported yet!");
2653 if (!OutChains.empty())
2654 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
2656 return Chain;
2659 SDValue
2660 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2661 bool isVarArg,
2662 const SmallVectorImpl<ISD::OutputArg> &Outs,
2663 const SmallVectorImpl<SDValue> &OutVals,
2664 const SDLoc &dl, SelectionDAG &DAG) const {
2665 MachineFunction &MF = DAG.getMachineFunction();
2666 Type *RetTy = MF.getFunction().getReturnType();
2668 bool isABI = (STI.getSmVersion() >= 20);
2669 assert(isABI && "Non-ABI compilation is not supported");
2670 if (!isABI)
2671 return Chain;
2673 const DataLayout DL = DAG.getDataLayout();
2674 SmallVector<EVT, 16> VTs;
2675 SmallVector<uint64_t, 16> Offsets;
2676 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
2677 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
2679 auto VectorInfo = VectorizePTXValueVTs(
2680 VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);
2682 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2683 // 32-bits are sign extended or zero extended, depending on whether
2684 // they are signed or unsigned types.
2685 bool ExtendIntegerRetVal =
2686 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2688 SmallVector<SDValue, 6> StoreOperands;
2689 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2690 // New load/store. Record chain and offset operands.
2691 if (VectorInfo[i] & PVF_FIRST) {
2692 assert(StoreOperands.empty() && "Orphaned operand list.");
2693 StoreOperands.push_back(Chain);
2694 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
2697 SDValue RetVal = OutVals[i];
2698 if (ExtendIntegerRetVal) {
2699 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
2700 : ISD::ZERO_EXTEND,
2701 dl, MVT::i32, RetVal);
2702 } else if (RetVal.getValueSizeInBits() < 16) {
2703 // Use 16-bit registers for small load-stores as it's the
2704 // smallest general purpose register size supported by NVPTX.
2705 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
2708 // Record the value to return.
2709 StoreOperands.push_back(RetVal);
2711 // That's the last element of this store op.
2712 if (VectorInfo[i] & PVF_LAST) {
2713 NVPTXISD::NodeType Op;
2714 unsigned NumElts = StoreOperands.size() - 2;
2715 switch (NumElts) {
2716 case 1:
2717 Op = NVPTXISD::StoreRetval;
2718 break;
2719 case 2:
2720 Op = NVPTXISD::StoreRetvalV2;
2721 break;
2722 case 4:
2723 Op = NVPTXISD::StoreRetvalV4;
2724 break;
2725 default:
2726 llvm_unreachable("Invalid vector info.");
2729 // Adjust type of load/store op if we've extended the scalar
2730 // return value.
2731 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
2732 Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
2733 StoreOperands, TheStoreType,
2734 MachinePointerInfo(), /* Align */ 1,
2735 MachineMemOperand::MOStore);
2736 // Cleanup vector state.
2737 StoreOperands.clear();
2741 return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
2744 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
2745 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
2746 SelectionDAG &DAG) const {
2747 if (Constraint.length() > 1)
2748 return;
2749 else
2750 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
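// Map an NVVM texture/tld4 intrinsic ID to the corresponding NVPTXISD texture
// opcode; returns 0 for intrinsics it does not handle.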
2753 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
2754 switch (Intrinsic) {
2755 default:
2756 return 0;
2758 case Intrinsic::nvvm_tex_1d_v4f32_s32:
2759 return NVPTXISD::Tex1DFloatS32;
2760 case Intrinsic::nvvm_tex_1d_v4f32_f32:
2761 return NVPTXISD::Tex1DFloatFloat;
2762 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
2763 return NVPTXISD::Tex1DFloatFloatLevel;
2764 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
2765 return NVPTXISD::Tex1DFloatFloatGrad;
2766 case Intrinsic::nvvm_tex_1d_v4s32_s32:
2767 return NVPTXISD::Tex1DS32S32;
2768 case Intrinsic::nvvm_tex_1d_v4s32_f32:
2769 return NVPTXISD::Tex1DS32Float;
2770 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
2771 return NVPTXISD::Tex1DS32FloatLevel;
2772 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
2773 return NVPTXISD::Tex1DS32FloatGrad;
2774 case Intrinsic::nvvm_tex_1d_v4u32_s32:
2775 return NVPTXISD::Tex1DU32S32;
2776 case Intrinsic::nvvm_tex_1d_v4u32_f32:
2777 return NVPTXISD::Tex1DU32Float;
2778 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
2779 return NVPTXISD::Tex1DU32FloatLevel;
2780 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
2781 return NVPTXISD::Tex1DU32FloatGrad;
2783 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
2784 return NVPTXISD::Tex1DArrayFloatS32;
2785 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
2786 return NVPTXISD::Tex1DArrayFloatFloat;
2787 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
2788 return NVPTXISD::Tex1DArrayFloatFloatLevel;
2789 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
2790 return NVPTXISD::Tex1DArrayFloatFloatGrad;
2791 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
2792 return NVPTXISD::Tex1DArrayS32S32;
2793 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
2794 return NVPTXISD::Tex1DArrayS32Float;
2795 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
2796 return NVPTXISD::Tex1DArrayS32FloatLevel;
2797 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
2798 return NVPTXISD::Tex1DArrayS32FloatGrad;
2799 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
2800 return NVPTXISD::Tex1DArrayU32S32;
2801 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
2802 return NVPTXISD::Tex1DArrayU32Float;
2803 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
2804 return NVPTXISD::Tex1DArrayU32FloatLevel;
2805 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
2806 return NVPTXISD::Tex1DArrayU32FloatGrad;
2808 case Intrinsic::nvvm_tex_2d_v4f32_s32:
2809 return NVPTXISD::Tex2DFloatS32;
2810 case Intrinsic::nvvm_tex_2d_v4f32_f32:
2811 return NVPTXISD::Tex2DFloatFloat;
2812 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
2813 return NVPTXISD::Tex2DFloatFloatLevel;
2814 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
2815 return NVPTXISD::Tex2DFloatFloatGrad;
2816 case Intrinsic::nvvm_tex_2d_v4s32_s32:
2817 return NVPTXISD::Tex2DS32S32;
2818 case Intrinsic::nvvm_tex_2d_v4s32_f32:
2819 return NVPTXISD::Tex2DS32Float;
2820 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
2821 return NVPTXISD::Tex2DS32FloatLevel;
2822 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
2823 return NVPTXISD::Tex2DS32FloatGrad;
2824 case Intrinsic::nvvm_tex_2d_v4u32_s32:
2825 return NVPTXISD::Tex2DU32S32;
2826 case Intrinsic::nvvm_tex_2d_v4u32_f32:
2827 return NVPTXISD::Tex2DU32Float;
2828 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
2829 return NVPTXISD::Tex2DU32FloatLevel;
2830 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
2831 return NVPTXISD::Tex2DU32FloatGrad;
2833 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
2834 return NVPTXISD::Tex2DArrayFloatS32;
2835 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
2836 return NVPTXISD::Tex2DArrayFloatFloat;
2837 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
2838 return NVPTXISD::Tex2DArrayFloatFloatLevel;
2839 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
2840 return NVPTXISD::Tex2DArrayFloatFloatGrad;
2841 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
2842 return NVPTXISD::Tex2DArrayS32S32;
2843 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
2844 return NVPTXISD::Tex2DArrayS32Float;
2845 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
2846 return NVPTXISD::Tex2DArrayS32FloatLevel;
2847 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
2848 return NVPTXISD::Tex2DArrayS32FloatGrad;
2849 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
2850 return NVPTXISD::Tex2DArrayU32S32;
2851 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
2852 return NVPTXISD::Tex2DArrayU32Float;
2853 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
2854 return NVPTXISD::Tex2DArrayU32FloatLevel;
2855 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
2856 return NVPTXISD::Tex2DArrayU32FloatGrad;
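// 3D textures.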
2858 case Intrinsic::nvvm_tex_3d_v4f32_s32:
2859 return NVPTXISD::Tex3DFloatS32;
2860 case Intrinsic::nvvm_tex_3d_v4f32_f32:
2861 return NVPTXISD::Tex3DFloatFloat;
2862 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
2863 return NVPTXISD::Tex3DFloatFloatLevel;
2864 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
2865 return NVPTXISD::Tex3DFloatFloatGrad;
2866 case Intrinsic::nvvm_tex_3d_v4s32_s32:
2867 return NVPTXISD::Tex3DS32S32;
2868 case Intrinsic::nvvm_tex_3d_v4s32_f32:
2869 return NVPTXISD::Tex3DS32Float;
2870 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
2871 return NVPTXISD::Tex3DS32FloatLevel;
2872 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
2873 return NVPTXISD::Tex3DS32FloatGrad;
2874 case Intrinsic::nvvm_tex_3d_v4u32_s32:
2875 return NVPTXISD::Tex3DU32S32;
2876 case Intrinsic::nvvm_tex_3d_v4u32_f32:
2877 return NVPTXISD::Tex3DU32Float;
2878 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
2879 return NVPTXISD::Tex3DU32FloatLevel;
2880 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
2881 return NVPTXISD::Tex3DU32FloatGrad;
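// Cubemap textures.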
2883 case Intrinsic::nvvm_tex_cube_v4f32_f32:
2884 return NVPTXISD::TexCubeFloatFloat;
2885 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
2886 return NVPTXISD::TexCubeFloatFloatLevel;
2887 case Intrinsic::nvvm_tex_cube_v4s32_f32:
2888 return NVPTXISD::TexCubeS32Float;
2889 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
2890 return NVPTXISD::TexCubeS32FloatLevel;
2891 case Intrinsic::nvvm_tex_cube_v4u32_f32:
2892 return NVPTXISD::TexCubeU32Float;
2893 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
2894 return NVPTXISD::TexCubeU32FloatLevel;
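// Cubemap array textures.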
2896 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
2897 return NVPTXISD::TexCubeArrayFloatFloat;
2898 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
2899 return NVPTXISD::TexCubeArrayFloatFloatLevel;
2900 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
2901 return NVPTXISD::TexCubeArrayS32Float;
2902 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
2903 return NVPTXISD::TexCubeArrayS32FloatLevel;
2904 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
2905 return NVPTXISD::TexCubeArrayU32Float;
2906 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
2907 return NVPTXISD::TexCubeArrayU32FloatLevel;
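// tld4 gathers: fetch the four texels of the 2D bilinear footprint and
// return the selected component (r/g/b/a) from each.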
2909 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
2910 return NVPTXISD::Tld4R2DFloatFloat;
2911 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
2912 return NVPTXISD::Tld4G2DFloatFloat;
2913 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
2914 return NVPTXISD::Tld4B2DFloatFloat;
2915 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
2916 return NVPTXISD::Tld4A2DFloatFloat;
2917 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
2918 return NVPTXISD::Tld4R2DS64Float;
2919 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
2920 return NVPTXISD::Tld4G2DS64Float;
2921 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
2922 return NVPTXISD::Tld4B2DS64Float;
2923 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
2924 return NVPTXISD::Tld4A2DS64Float;
2925 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
2926 return NVPTXISD::Tld4R2DU64Float;
2927 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
2928 return NVPTXISD::Tld4G2DU64Float;
2929 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
2930 return NVPTXISD::Tld4B2DU64Float;
2931 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
2932 return NVPTXISD::Tld4A2DU64Float;
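// "Unified" variants are used in unified texture mode, where the sampler
// state is carried by the texture handle itself instead of a separate
// sampler operand.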
2934 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
2935 return NVPTXISD::TexUnified1DFloatS32;
2936 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
2937 return NVPTXISD::TexUnified1DFloatFloat;
2938 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
2939 return NVPTXISD::TexUnified1DFloatFloatLevel;
2940 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
2941 return NVPTXISD::TexUnified1DFloatFloatGrad;
2942 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
2943 return NVPTXISD::TexUnified1DS32S32;
2944 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
2945 return NVPTXISD::TexUnified1DS32Float;
2946 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
2947 return NVPTXISD::TexUnified1DS32FloatLevel;
2948 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
2949 return NVPTXISD::TexUnified1DS32FloatGrad;
2950 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
2951 return NVPTXISD::TexUnified1DU32S32;
2952 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
2953 return NVPTXISD::TexUnified1DU32Float;
2954 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
2955 return NVPTXISD::TexUnified1DU32FloatLevel;
2956 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
2957 return NVPTXISD::TexUnified1DU32FloatGrad;
2959 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
2960 return NVPTXISD::TexUnified1DArrayFloatS32;
2961 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
2962 return NVPTXISD::TexUnified1DArrayFloatFloat;
2963 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
2964 return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
2965 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
2966 return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
2967 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
2968 return NVPTXISD::TexUnified1DArrayS32S32;
2969 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
2970 return NVPTXISD::TexUnified1DArrayS32Float;
2971 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
2972 return NVPTXISD::TexUnified1DArrayS32FloatLevel;
2973 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
2974 return NVPTXISD::TexUnified1DArrayS32FloatGrad;
2975 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
2976 return NVPTXISD::TexUnified1DArrayU32S32;
2977 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
2978 return NVPTXISD::TexUnified1DArrayU32Float;
2979 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
2980 return NVPTXISD::TexUnified1DArrayU32FloatLevel;
2981 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
2982 return NVPTXISD::TexUnified1DArrayU32FloatGrad;
2984 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
2985 return NVPTXISD::TexUnified2DFloatS32;
2986 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
2987 return NVPTXISD::TexUnified2DFloatFloat;
2988 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
2989 return NVPTXISD::TexUnified2DFloatFloatLevel;
2990 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
2991 return NVPTXISD::TexUnified2DFloatFloatGrad;
2992 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
2993 return NVPTXISD::TexUnified2DS32S32;
2994 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
2995 return NVPTXISD::TexUnified2DS32Float;
2996 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
2997 return NVPTXISD::TexUnified2DS32FloatLevel;
2998 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
2999 return NVPTXISD::TexUnified2DS32FloatGrad;
3000 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3001 return NVPTXISD::TexUnified2DU32S32;
3002 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3003 return NVPTXISD::TexUnified2DU32Float;
3004 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3005 return NVPTXISD::TexUnified2DU32FloatLevel;
3006 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3007 return NVPTXISD::TexUnified2DU32FloatGrad;
3009 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3010 return NVPTXISD::TexUnified2DArrayFloatS32;
3011 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3012 return NVPTXISD::TexUnified2DArrayFloatFloat;
3013 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3014 return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
3015 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3016 return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
3017 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3018 return NVPTXISD::TexUnified2DArrayS32S32;
3019 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3020 return NVPTXISD::TexUnified2DArrayS32Float;
3021 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3022 return NVPTXISD::TexUnified2DArrayS32FloatLevel;
3023 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3024 return NVPTXISD::TexUnified2DArrayS32FloatGrad;
3025 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3026 return NVPTXISD::TexUnified2DArrayU32S32;
3027 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3028 return NVPTXISD::TexUnified2DArrayU32Float;
3029 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3030 return NVPTXISD::TexUnified2DArrayU32FloatLevel;
3031 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3032 return NVPTXISD::TexUnified2DArrayU32FloatGrad;
3034 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3035 return NVPTXISD::TexUnified3DFloatS32;
3036 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3037 return NVPTXISD::TexUnified3DFloatFloat;
3038 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3039 return NVPTXISD::TexUnified3DFloatFloatLevel;
3040 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3041 return NVPTXISD::TexUnified3DFloatFloatGrad;
3042 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3043 return NVPTXISD::TexUnified3DS32S32;
3044 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3045 return NVPTXISD::TexUnified3DS32Float;
3046 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3047 return NVPTXISD::TexUnified3DS32FloatLevel;
3048 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3049 return NVPTXISD::TexUnified3DS32FloatGrad;
3050 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3051 return NVPTXISD::TexUnified3DU32S32;
3052 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3053 return NVPTXISD::TexUnified3DU32Float;
3054 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3055 return NVPTXISD::TexUnified3DU32FloatLevel;
3056 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3057 return NVPTXISD::TexUnified3DU32FloatGrad;
3059 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3060 return NVPTXISD::TexUnifiedCubeFloatFloat;
3061 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3062 return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
3063 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3064 return NVPTXISD::TexUnifiedCubeS32Float;
3065 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3066 return NVPTXISD::TexUnifiedCubeS32FloatLevel;
3067 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3068 return NVPTXISD::TexUnifiedCubeU32Float;
3069 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3070 return NVPTXISD::TexUnifiedCubeU32FloatLevel;
3072 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3073 return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
3074 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3075 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
3076 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3077 return NVPTXISD::TexUnifiedCubeArrayS32Float;
3078 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3079 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
3080 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3081 return NVPTXISD::TexUnifiedCubeArrayU32Float;
3082 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3083 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
3085 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3086 return NVPTXISD::Tld4UnifiedR2DFloatFloat;
3087 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3088 return NVPTXISD::Tld4UnifiedG2DFloatFloat;
3089 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3090 return NVPTXISD::Tld4UnifiedB2DFloatFloat;
3091 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3092 return NVPTXISD::Tld4UnifiedA2DFloatFloat;
3093 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3094 return NVPTXISD::Tld4UnifiedR2DS64Float;
3095 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3096 return NVPTXISD::Tld4UnifiedG2DS64Float;
3097 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3098 return NVPTXISD::Tld4UnifiedB2DS64Float;
3099 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3100 return NVPTXISD::Tld4UnifiedA2DS64Float;
3101 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3102 return NVPTXISD::Tld4UnifiedR2DU64Float;
3103 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3104 return NVPTXISD::Tld4UnifiedG2DU64Float;
3105 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3106 return NVPTXISD::Tld4UnifiedB2DU64Float;
3107 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3108 return NVPTXISD::Tld4UnifiedA2DU64Float;
3109 }
3110 }
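// Map a surface load (suld) intrinsic to the matching NVPTXISD surface
// opcode; returns 0 for intrinsics that are not surface loads.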
3112 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3113 switch (Intrinsic) {
3114 default:
3115 return 0;
3116 case Intrinsic::nvvm_suld_1d_i8_clamp:
3117 return NVPTXISD::Suld1DI8Clamp;
3118 case Intrinsic::nvvm_suld_1d_i16_clamp:
3119 return NVPTXISD::Suld1DI16Clamp;
3120 case Intrinsic::nvvm_suld_1d_i32_clamp:
3121 return NVPTXISD::Suld1DI32Clamp;
3122 case Intrinsic::nvvm_suld_1d_i64_clamp:
3123 return NVPTXISD::Suld1DI64Clamp;
3124 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3125 return NVPTXISD::Suld1DV2I8Clamp;
3126 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3127 return NVPTXISD::Suld1DV2I16Clamp;
3128 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3129 return NVPTXISD::Suld1DV2I32Clamp;
3130 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3131 return NVPTXISD::Suld1DV2I64Clamp;
3132 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3133 return NVPTXISD::Suld1DV4I8Clamp;
3134 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3135 return NVPTXISD::Suld1DV4I16Clamp;
3136 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3137 return NVPTXISD::Suld1DV4I32Clamp;
3138 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3139 return NVPTXISD::Suld1DArrayI8Clamp;
3140 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3141 return NVPTXISD::Suld1DArrayI16Clamp;
3142 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3143 return NVPTXISD::Suld1DArrayI32Clamp;
3144 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3145 return NVPTXISD::Suld1DArrayI64Clamp;
3146 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3147 return NVPTXISD::Suld1DArrayV2I8Clamp;
3148 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3149 return NVPTXISD::Suld1DArrayV2I16Clamp;
3150 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3151 return NVPTXISD::Suld1DArrayV2I32Clamp;
3152 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3153 return NVPTXISD::Suld1DArrayV2I64Clamp;
3154 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3155 return NVPTXISD::Suld1DArrayV4I8Clamp;
3156 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3157 return NVPTXISD::Suld1DArrayV4I16Clamp;
3158 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3159 return NVPTXISD::Suld1DArrayV4I32Clamp;
3160 case Intrinsic::nvvm_suld_2d_i8_clamp:
3161 return NVPTXISD::Suld2DI8Clamp;
3162 case Intrinsic::nvvm_suld_2d_i16_clamp:
3163 return NVPTXISD::Suld2DI16Clamp;
3164 case Intrinsic::nvvm_suld_2d_i32_clamp:
3165 return NVPTXISD::Suld2DI32Clamp;
3166 case Intrinsic::nvvm_suld_2d_i64_clamp:
3167 return NVPTXISD::Suld2DI64Clamp;
3168 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3169 return NVPTXISD::Suld2DV2I8Clamp;
3170 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3171 return NVPTXISD::Suld2DV2I16Clamp;
3172 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3173 return NVPTXISD::Suld2DV2I32Clamp;
3174 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3175 return NVPTXISD::Suld2DV2I64Clamp;
3176 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3177 return NVPTXISD::Suld2DV4I8Clamp;
3178 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3179 return NVPTXISD::Suld2DV4I16Clamp;
3180 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3181 return NVPTXISD::Suld2DV4I32Clamp;
3182 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3183 return NVPTXISD::Suld2DArrayI8Clamp;
3184 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3185 return NVPTXISD::Suld2DArrayI16Clamp;
3186 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3187 return NVPTXISD::Suld2DArrayI32Clamp;
3188 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3189 return NVPTXISD::Suld2DArrayI64Clamp;
3190 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3191 return NVPTXISD::Suld2DArrayV2I8Clamp;
3192 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3193 return NVPTXISD::Suld2DArrayV2I16Clamp;
3194 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3195 return NVPTXISD::Suld2DArrayV2I32Clamp;
3196 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3197 return NVPTXISD::Suld2DArrayV2I64Clamp;
3198 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3199 return NVPTXISD::Suld2DArrayV4I8Clamp;
3200 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3201 return NVPTXISD::Suld2DArrayV4I16Clamp;
3202 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3203 return NVPTXISD::Suld2DArrayV4I32Clamp;
3204 case Intrinsic::nvvm_suld_3d_i8_clamp:
3205 return NVPTXISD::Suld3DI8Clamp;
3206 case Intrinsic::nvvm_suld_3d_i16_clamp:
3207 return NVPTXISD::Suld3DI16Clamp;
3208 case Intrinsic::nvvm_suld_3d_i32_clamp:
3209 return NVPTXISD::Suld3DI32Clamp;
3210 case Intrinsic::nvvm_suld_3d_i64_clamp:
3211 return NVPTXISD::Suld3DI64Clamp;
3212 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3213 return NVPTXISD::Suld3DV2I8Clamp;
3214 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3215 return NVPTXISD::Suld3DV2I16Clamp;
3216 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3217 return NVPTXISD::Suld3DV2I32Clamp;
3218 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3219 return NVPTXISD::Suld3DV2I64Clamp;
3220 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3221 return NVPTXISD::Suld3DV4I8Clamp;
3222 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3223 return NVPTXISD::Suld3DV4I16Clamp;
3224 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3225 return NVPTXISD::Suld3DV4I32Clamp;
3226 case Intrinsic::nvvm_suld_1d_i8_trap:
3227 return NVPTXISD::Suld1DI8Trap;
3228 case Intrinsic::nvvm_suld_1d_i16_trap:
3229 return NVPTXISD::Suld1DI16Trap;
3230 case Intrinsic::nvvm_suld_1d_i32_trap:
3231 return NVPTXISD::Suld1DI32Trap;
3232 case Intrinsic::nvvm_suld_1d_i64_trap:
3233 return NVPTXISD::Suld1DI64Trap;
3234 case Intrinsic::nvvm_suld_1d_v2i8_trap:
3235 return NVPTXISD::Suld1DV2I8Trap;
3236 case Intrinsic::nvvm_suld_1d_v2i16_trap:
3237 return NVPTXISD::Suld1DV2I16Trap;
3238 case Intrinsic::nvvm_suld_1d_v2i32_trap:
3239 return NVPTXISD::Suld1DV2I32Trap;
3240 case Intrinsic::nvvm_suld_1d_v2i64_trap:
3241 return NVPTXISD::Suld1DV2I64Trap;
3242 case Intrinsic::nvvm_suld_1d_v4i8_trap:
3243 return NVPTXISD::Suld1DV4I8Trap;
3244 case Intrinsic::nvvm_suld_1d_v4i16_trap:
3245 return NVPTXISD::Suld1DV4I16Trap;
3246 case Intrinsic::nvvm_suld_1d_v4i32_trap:
3247 return NVPTXISD::Suld1DV4I32Trap;
3248 case Intrinsic::nvvm_suld_1d_array_i8_trap:
3249 return NVPTXISD::Suld1DArrayI8Trap;
3250 case Intrinsic::nvvm_suld_1d_array_i16_trap:
3251 return NVPTXISD::Suld1DArrayI16Trap;
3252 case Intrinsic::nvvm_suld_1d_array_i32_trap:
3253 return NVPTXISD::Suld1DArrayI32Trap;
3254 case Intrinsic::nvvm_suld_1d_array_i64_trap:
3255 return NVPTXISD::Suld1DArrayI64Trap;
3256 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3257 return NVPTXISD::Suld1DArrayV2I8Trap;
3258 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3259 return NVPTXISD::Suld1DArrayV2I16Trap;
3260 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3261 return NVPTXISD::Suld1DArrayV2I32Trap;
3262 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3263 return NVPTXISD::Suld1DArrayV2I64Trap;
3264 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3265 return NVPTXISD::Suld1DArrayV4I8Trap;
3266 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3267 return NVPTXISD::Suld1DArrayV4I16Trap;
3268 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3269 return NVPTXISD::Suld1DArrayV4I32Trap;
3270 case Intrinsic::nvvm_suld_2d_i8_trap:
3271 return NVPTXISD::Suld2DI8Trap;
3272 case Intrinsic::nvvm_suld_2d_i16_trap:
3273 return NVPTXISD::Suld2DI16Trap;
3274 case Intrinsic::nvvm_suld_2d_i32_trap:
3275 return NVPTXISD::Suld2DI32Trap;
3276 case Intrinsic::nvvm_suld_2d_i64_trap:
3277 return NVPTXISD::Suld2DI64Trap;
3278 case Intrinsic::nvvm_suld_2d_v2i8_trap:
3279 return NVPTXISD::Suld2DV2I8Trap;
3280 case Intrinsic::nvvm_suld_2d_v2i16_trap:
3281 return NVPTXISD::Suld2DV2I16Trap;
3282 case Intrinsic::nvvm_suld_2d_v2i32_trap:
3283 return NVPTXISD::Suld2DV2I32Trap;
3284 case Intrinsic::nvvm_suld_2d_v2i64_trap:
3285 return NVPTXISD::Suld2DV2I64Trap;
3286 case Intrinsic::nvvm_suld_2d_v4i8_trap:
3287 return NVPTXISD::Suld2DV4I8Trap;
3288 case Intrinsic::nvvm_suld_2d_v4i16_trap:
3289 return NVPTXISD::Suld2DV4I16Trap;
3290 case Intrinsic::nvvm_suld_2d_v4i32_trap:
3291 return NVPTXISD::Suld2DV4I32Trap;
3292 case Intrinsic::nvvm_suld_2d_array_i8_trap:
3293 return NVPTXISD::Suld2DArrayI8Trap;
3294 case Intrinsic::nvvm_suld_2d_array_i16_trap:
3295 return NVPTXISD::Suld2DArrayI16Trap;
3296 case Intrinsic::nvvm_suld_2d_array_i32_trap:
3297 return NVPTXISD::Suld2DArrayI32Trap;
3298 case Intrinsic::nvvm_suld_2d_array_i64_trap:
3299 return NVPTXISD::Suld2DArrayI64Trap;
3300 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3301 return NVPTXISD::Suld2DArrayV2I8Trap;
3302 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3303 return NVPTXISD::Suld2DArrayV2I16Trap;
3304 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3305 return NVPTXISD::Suld2DArrayV2I32Trap;
3306 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3307 return NVPTXISD::Suld2DArrayV2I64Trap;
3308 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3309 return NVPTXISD::Suld2DArrayV4I8Trap;
3310 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3311 return NVPTXISD::Suld2DArrayV4I16Trap;
3312 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3313 return NVPTXISD::Suld2DArrayV4I32Trap;
3314 case Intrinsic::nvvm_suld_3d_i8_trap:
3315 return NVPTXISD::Suld3DI8Trap;
3316 case Intrinsic::nvvm_suld_3d_i16_trap:
3317 return NVPTXISD::Suld3DI16Trap;
3318 case Intrinsic::nvvm_suld_3d_i32_trap:
3319 return NVPTXISD::Suld3DI32Trap;
3320 case Intrinsic::nvvm_suld_3d_i64_trap:
3321 return NVPTXISD::Suld3DI64Trap;
3322 case Intrinsic::nvvm_suld_3d_v2i8_trap:
3323 return NVPTXISD::Suld3DV2I8Trap;
3324 case Intrinsic::nvvm_suld_3d_v2i16_trap:
3325 return NVPTXISD::Suld3DV2I16Trap;
3326 case Intrinsic::nvvm_suld_3d_v2i32_trap:
3327 return NVPTXISD::Suld3DV2I32Trap;
3328 case Intrinsic::nvvm_suld_3d_v2i64_trap:
3329 return NVPTXISD::Suld3DV2I64Trap;
3330 case Intrinsic::nvvm_suld_3d_v4i8_trap:
3331 return NVPTXISD::Suld3DV4I8Trap;
3332 case Intrinsic::nvvm_suld_3d_v4i16_trap:
3333 return NVPTXISD::Suld3DV4I16Trap;
3334 case Intrinsic::nvvm_suld_3d_v4i32_trap:
3335 return NVPTXISD::Suld3DV4I32Trap;
3336 case Intrinsic::nvvm_suld_1d_i8_zero:
3337 return NVPTXISD::Suld1DI8Zero;
3338 case Intrinsic::nvvm_suld_1d_i16_zero:
3339 return NVPTXISD::Suld1DI16Zero;
3340 case Intrinsic::nvvm_suld_1d_i32_zero:
3341 return NVPTXISD::Suld1DI32Zero;
3342 case Intrinsic::nvvm_suld_1d_i64_zero:
3343 return NVPTXISD::Suld1DI64Zero;
3344 case Intrinsic::nvvm_suld_1d_v2i8_zero:
3345 return NVPTXISD::Suld1DV2I8Zero;
3346 case Intrinsic::nvvm_suld_1d_v2i16_zero:
3347 return NVPTXISD::Suld1DV2I16Zero;
3348 case Intrinsic::nvvm_suld_1d_v2i32_zero:
3349 return NVPTXISD::Suld1DV2I32Zero;
3350 case Intrinsic::nvvm_suld_1d_v2i64_zero:
3351 return NVPTXISD::Suld1DV2I64Zero;
3352 case Intrinsic::nvvm_suld_1d_v4i8_zero:
3353 return NVPTXISD::Suld1DV4I8Zero;
3354 case Intrinsic::nvvm_suld_1d_v4i16_zero:
3355 return NVPTXISD::Suld1DV4I16Zero;
3356 case Intrinsic::nvvm_suld_1d_v4i32_zero:
3357 return NVPTXISD::Suld1DV4I32Zero;
3358 case Intrinsic::nvvm_suld_1d_array_i8_zero:
3359 return NVPTXISD::Suld1DArrayI8Zero;
3360 case Intrinsic::nvvm_suld_1d_array_i16_zero:
3361 return NVPTXISD::Suld1DArrayI16Zero;
3362 case Intrinsic::nvvm_suld_1d_array_i32_zero:
3363 return NVPTXISD::Suld1DArrayI32Zero;
3364 case Intrinsic::nvvm_suld_1d_array_i64_zero:
3365 return NVPTXISD::Suld1DArrayI64Zero;
3366 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3367 return NVPTXISD::Suld1DArrayV2I8Zero;
3368 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3369 return NVPTXISD::Suld1DArrayV2I16Zero;
3370 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3371 return NVPTXISD::Suld1DArrayV2I32Zero;
3372 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3373 return NVPTXISD::Suld1DArrayV2I64Zero;
3374 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3375 return NVPTXISD::Suld1DArrayV4I8Zero;
3376 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3377 return NVPTXISD::Suld1DArrayV4I16Zero;
3378 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3379 return NVPTXISD::Suld1DArrayV4I32Zero;
3380 case Intrinsic::nvvm_suld_2d_i8_zero:
3381 return NVPTXISD::Suld2DI8Zero;
3382 case Intrinsic::nvvm_suld_2d_i16_zero:
3383 return NVPTXISD::Suld2DI16Zero;
3384 case Intrinsic::nvvm_suld_2d_i32_zero:
3385 return NVPTXISD::Suld2DI32Zero;
3386 case Intrinsic::nvvm_suld_2d_i64_zero:
3387 return NVPTXISD::Suld2DI64Zero;
3388 case Intrinsic::nvvm_suld_2d_v2i8_zero:
3389 return NVPTXISD::Suld2DV2I8Zero;
3390 case Intrinsic::nvvm_suld_2d_v2i16_zero:
3391 return NVPTXISD::Suld2DV2I16Zero;
3392 case Intrinsic::nvvm_suld_2d_v2i32_zero:
3393 return NVPTXISD::Suld2DV2I32Zero;
3394 case Intrinsic::nvvm_suld_2d_v2i64_zero:
3395 return NVPTXISD::Suld2DV2I64Zero;
3396 case Intrinsic::nvvm_suld_2d_v4i8_zero:
3397 return NVPTXISD::Suld2DV4I8Zero;
3398 case Intrinsic::nvvm_suld_2d_v4i16_zero:
3399 return NVPTXISD::Suld2DV4I16Zero;
3400 case Intrinsic::nvvm_suld_2d_v4i32_zero:
3401 return NVPTXISD::Suld2DV4I32Zero;
3402 case Intrinsic::nvvm_suld_2d_array_i8_zero:
3403 return NVPTXISD::Suld2DArrayI8Zero;
3404 case Intrinsic::nvvm_suld_2d_array_i16_zero:
3405 return NVPTXISD::Suld2DArrayI16Zero;
3406 case Intrinsic::nvvm_suld_2d_array_i32_zero:
3407 return NVPTXISD::Suld2DArrayI32Zero;
3408 case Intrinsic::nvvm_suld_2d_array_i64_zero:
3409 return NVPTXISD::Suld2DArrayI64Zero;
3410 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3411 return NVPTXISD::Suld2DArrayV2I8Zero;
3412 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3413 return NVPTXISD::Suld2DArrayV2I16Zero;
3414 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3415 return NVPTXISD::Suld2DArrayV2I32Zero;
3416 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3417 return NVPTXISD::Suld2DArrayV2I64Zero;
3418 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3419 return NVPTXISD::Suld2DArrayV4I8Zero;
3420 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3421 return NVPTXISD::Suld2DArrayV4I16Zero;
3422 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3423 return NVPTXISD::Suld2DArrayV4I32Zero;
3424 case Intrinsic::nvvm_suld_3d_i8_zero:
3425 return NVPTXISD::Suld3DI8Zero;
3426 case Intrinsic::nvvm_suld_3d_i16_zero:
3427 return NVPTXISD::Suld3DI16Zero;
3428 case Intrinsic::nvvm_suld_3d_i32_zero:
3429 return NVPTXISD::Suld3DI32Zero;
3430 case Intrinsic::nvvm_suld_3d_i64_zero:
3431 return NVPTXISD::Suld3DI64Zero;
3432 case Intrinsic::nvvm_suld_3d_v2i8_zero:
3433 return NVPTXISD::Suld3DV2I8Zero;
3434 case Intrinsic::nvvm_suld_3d_v2i16_zero:
3435 return NVPTXISD::Suld3DV2I16Zero;
3436 case Intrinsic::nvvm_suld_3d_v2i32_zero:
3437 return NVPTXISD::Suld3DV2I32Zero;
3438 case Intrinsic::nvvm_suld_3d_v2i64_zero:
3439 return NVPTXISD::Suld3DV2I64Zero;
3440 case Intrinsic::nvvm_suld_3d_v4i8_zero:
3441 return NVPTXISD::Suld3DV4I8Zero;
3442 case Intrinsic::nvvm_suld_3d_v4i16_zero:
3443 return NVPTXISD::Suld3DV4I16Zero;
3444 case Intrinsic::nvvm_suld_3d_v4i32_zero:
3445 return NVPTXISD::Suld3DV4I32Zero;
3446 }
3447 }
3449 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3450 // TgtMemIntrinsic because we need information that is only available in
3451 // the "Value" type of the destination pointer; in particular, its address
3452 // space.
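// getTgtMemIntrinsic fills in IntrinsicInfo (node opcode, memory type, base
// pointer, alignment and load/store flags) for NVVM intrinsics that touch
// memory, so the SelectionDAG builder can attach a MachineMemOperand to the
// generated node.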
3454 bool NVPTXTargetLowering::getTgtMemIntrinsic(
3455 IntrinsicInfo &Info, const CallInst &I,
3456 MachineFunction &MF, unsigned Intrinsic) const {
3457 switch (Intrinsic) {
3458 default:
3459 return false;
3460 case Intrinsic::nvvm_match_all_sync_i32p:
3461 case Intrinsic::nvvm_match_all_sync_i64p:
3462 Info.opc = ISD::INTRINSIC_W_CHAIN;
3463 // memVT is bogus. These intrinsics have the IntrInaccessibleMemOnly
3464 // attribute in order to model data exchange with other threads, but they
3465 // perform no real memory accesses.
3466 Info.memVT = MVT::i1;
3468 // Our result depends on both our own and the other threads' arguments.
3469 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3470 return true;
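// WMMA f16 A/B fragment loads: each per-thread fragment holds eight f16
// values, modeled as a 16-byte aligned v8f16 load.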
3471 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3472 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3473 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3474 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3475 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3476 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3477 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3478 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3479 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3480 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3481 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3482 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3483 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3484 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3485 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3486 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3487 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3488 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3489 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3490 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3491 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3492 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3493 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3494 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3495 Info.opc = ISD::INTRINSIC_W_CHAIN;
3496 Info.memVT = MVT::v8f16;
3497 Info.ptrVal = I.getArgOperand(0);
3498 Info.offset = 0;
3499 Info.flags = MachineMemOperand::MOLoad;
3500 Info.align = Align(16);
3501 return true;
3502 }
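// m16n16k16 s8/u8 A/B fragments occupy two i32 registers per thread, so
// they are modeled as an 8-byte aligned v2i32 load.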
3503 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3504 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3505 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3506 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3507 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3508 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3509 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3510 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3511 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3512 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3513 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3514 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3515 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3516 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3517 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3518 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: {
3519 Info.opc = ISD::INTRINSIC_W_CHAIN;
3520 Info.memVT = MVT::v2i32;
3521 Info.ptrVal = I.getArgOperand(0);
3522 Info.offset = 0;
3523 Info.flags = MachineMemOperand::MOLoad;
3524 Info.align = Align(8);
3525 return true;
3526 }
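// The wider s8/u8 fragments (m32n8k16 A and m8n32k16 B) take four i32
// registers per thread: a 16-byte aligned v4i32 load.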
3528 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3529 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3530 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3531 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3532 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3533 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3534 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3535 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3537 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3538 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3539 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3540 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3541 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3542 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3543 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3544 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: {
3545 Info.opc = ISD::INTRINSIC_W_CHAIN;
3546 Info.memVT = MVT::v4i32;
3547 Info.ptrVal = I.getArgOperand(0);
3548 Info.offset = 0;
3549 Info.flags = MachineMemOperand::MOLoad;
3550 Info.align = Align(16);
3551 return true;
3552 }
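// Fragments that fit in a single i32 per thread: the remaining s8/u8 A/B
// shapes plus the sub-byte b1 and s4/u4 fragments, modeled as a 4-byte
// aligned i32 load.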
3554 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3555 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3556 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3557 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3558 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3559 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3560 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3561 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3563 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3564 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3565 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3566 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3567 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3568 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3569 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3570 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3571 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3572 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3573 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3574 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3575 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3576 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3577 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3578 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3579 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3580 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3581 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3582 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: {
3583 Info.opc = ISD::INTRINSIC_W_CHAIN;
3584 Info.memVT = MVT::i32;
3585 Info.ptrVal = I.getArgOperand(0);
3586 Info.offset = 0;
3587 Info.flags = MachineMemOperand::MOLoad;
3588 Info.align = Align(4);
3589 return true;
3590 }
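// f16 accumulator (C) fragment loads: v4f16, 16-byte aligned.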
3592 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3593 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3594 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3595 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3596 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3597 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3598 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3599 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3600 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3601 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3602 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3603 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3604 Info.opc = ISD::INTRINSIC_W_CHAIN;
3605 Info.memVT = MVT::v4f16;
3606 Info.ptrVal = I.getArgOperand(0);
3607 Info.offset = 0;
3608 Info.flags = MachineMemOperand::MOLoad;
3609 Info.align = Align(16);
3610 return true;
3611 }
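// f32 accumulator (C) fragment loads: v8f32, 16-byte aligned.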
3613 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3614 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3615 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3616 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3617 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3618 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3619 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3620 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3621 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3622 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3623 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3624 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: {
3625 Info.opc = ISD::INTRINSIC_W_CHAIN;
3626 Info.memVT = MVT::v8f32;
3627 Info.ptrVal = I.getArgOperand(0);
3628 Info.offset = 0;
3629 Info.flags = MachineMemOperand::MOLoad;
3630 Info.align = Align(16);
3631 return true;
3632 }
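// s32 accumulator (C) loads for the m16n16k16 / m32n8k16 / m8n32k16 shapes:
// v8i32, 16-byte aligned.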
3634 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
3635 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
3636 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
3637 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
3638 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
3639 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
3640 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
3641 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
3642 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
3643 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
3644 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
3645 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
3646 Info.opc = ISD::INTRINSIC_W_CHAIN;
3647 Info.memVT = MVT::v8i32;
3648 Info.ptrVal = I.getArgOperand(0);
3649 Info.offset = 0;
3650 Info.flags = MachineMemOperand::MOLoad;
3651 Info.align = Align(16);
3652 return true;
3653 }
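// s32 accumulator (C) loads for the m8n8 shapes: v2i32, 8-byte aligned.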
3655 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
3656 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
3657 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
3658 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
3659 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
3660 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
3661 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
3662 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: {
3663 Info.opc = ISD::INTRINSIC_W_CHAIN;
3664 Info.memVT = MVT::v2i32;
3665 Info.ptrVal = I.getArgOperand(0);
3666 Info.offset = 0;
3667 Info.flags = MachineMemOperand::MOLoad;
3668 Info.align = Align(8);
3669 return true;
3670 }
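// WMMA result (D) stores mirror the corresponding C loads; f16 results are
// stored as a 16-byte aligned v4f16.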
3672 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3673 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3674 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3675 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3676 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3677 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3678 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3679 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3680 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3681 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3682 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3683 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
3684 Info.opc = ISD::INTRINSIC_VOID;
3685 Info.memVT = MVT::v4f16;
3686 Info.ptrVal = I.getArgOperand(0);
3687 Info.offset = 0;
3688 Info.flags = MachineMemOperand::MOStore;
3689 Info.align = Align(16);
3690 return true;
3691 }
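// f32 result (D) stores: v8f32, 16-byte aligned.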
3693 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3694 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3695 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3696 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3697 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3698 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3699 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3700 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3701 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3702 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3703 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3704 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: {
3705 Info.opc = ISD::INTRINSIC_VOID;
3706 Info.memVT = MVT::v8f32;
3707 Info.ptrVal = I.getArgOperand(0);
3708 Info.offset = 0;
3709 Info.flags = MachineMemOperand::MOStore;
3710 Info.align = Align(16);
3711 return true;
3712 }
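// s32 result (D) stores for the m16n16k16 / m32n8k16 / m8n32k16 shapes:
// v8i32, 16-byte aligned.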
3714 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
3715 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
3716 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
3717 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
3718 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
3719 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
3720 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
3721 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
3722 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
3723 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
3724 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
3725 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
3726 Info.opc = ISD::INTRINSIC_VOID;
3727 Info.memVT = MVT::v8i32;
3728 Info.ptrVal = I.getArgOperand(0);
3729 Info.offset = 0;
3730 Info.flags = MachineMemOperand::MOStore;
3731 Info.align = Align(16);
3732 return true;
3733 }
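// s32 result (D) stores for the m8n8 shapes: v2i32, 8-byte aligned.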
3735 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
3736 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
3737 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
3738 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
3739 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
3740 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
3741 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
3742 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
3743 Info.opc = ISD::INTRINSIC_VOID;
3744 Info.memVT = MVT::v2i32;
3745 Info.ptrVal = I.getArgOperand(0);
3746 Info.offset = 0;
3747 Info.flags = MachineMemOperand::MOStore;
3748 Info.align = Align(8);
3749 return true;
3750 }
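// Atomic intrinsics (including the cta/sys scoped variants) both read and
// modify memory, so they are marked MOLoad | MOStore; the memory type is
// the intrinsic's result type and no particular alignment is assumed.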
3752 case Intrinsic::nvvm_atomic_load_inc_32:
3753 case Intrinsic::nvvm_atomic_load_dec_32:
3755 case Intrinsic::nvvm_atomic_add_gen_f_cta:
3756 case Intrinsic::nvvm_atomic_add_gen_f_sys:
3757 case Intrinsic::nvvm_atomic_add_gen_i_cta:
3758 case Intrinsic::nvvm_atomic_add_gen_i_sys:
3759 case Intrinsic::nvvm_atomic_and_gen_i_cta:
3760 case Intrinsic::nvvm_atomic_and_gen_i_sys:
3761 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
3762 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
3763 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
3764 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
3765 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
3766 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
3767 case Intrinsic::nvvm_atomic_max_gen_i_cta:
3768 case Intrinsic::nvvm_atomic_max_gen_i_sys:
3769 case Intrinsic::nvvm_atomic_min_gen_i_cta:
3770 case Intrinsic::nvvm_atomic_min_gen_i_sys:
3771 case Intrinsic::nvvm_atomic_or_gen_i_cta:
3772 case Intrinsic::nvvm_atomic_or_gen_i_sys:
3773 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
3774 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
3775 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
3776 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
3777 auto &DL = I.getModule()->getDataLayout();
3778 Info.opc = ISD::INTRINSIC_W_CHAIN;
3779 Info.memVT = getValueType(DL, I.getType());
3780 Info.ptrVal = I.getArgOperand(0);
3781 Info.offset = 0;
3782 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3783 Info.align.reset();
3784 return true;
3785 }
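// nvvm.ldu.global.*: read-only loads from global memory. memVT is the
// result type (the target pointer type for the _p form) and the alignment
// comes from the intrinsic's constant alignment operand.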
3787 case Intrinsic::nvvm_ldu_global_i:
3788 case Intrinsic::nvvm_ldu_global_f:
3789 case Intrinsic::nvvm_ldu_global_p: {
3790 auto &DL = I.getModule()->getDataLayout();
3791 Info.opc = ISD::INTRINSIC_W_CHAIN;
3792 if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
3793 Info.memVT = getValueType(DL, I.getType());
3794 else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
3795 Info.memVT = getPointerTy(DL);
3796 else
3797 Info.memVT = getValueType(DL, I.getType());
3798 Info.ptrVal = I.getArgOperand(0);
3799 Info.offset = 0;
3800 Info.flags = MachineMemOperand::MOLoad;
3801 Info.align =
3802 MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
3804 return true;
3805 }
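// nvvm.ldg.global.*: handled exactly like the ldu intrinsics above.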
3806 case Intrinsic::nvvm_ldg_global_i:
3807 case Intrinsic::nvvm_ldg_global_f:
3808 case Intrinsic::nvvm_ldg_global_p: {
3809 auto &DL = I.getModule()->getDataLayout();
3811 Info.opc = ISD::INTRINSIC_W_CHAIN;
3812 if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
3813 Info.memVT = getValueType(DL, I.getType());
3815 else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
3815 Info.memVT = getPointerTy(DL);
3816 else
3817 Info.memVT = getValueType(DL, I.getType());
3818 Info.ptrVal = I.getArgOperand(0);
3819 Info.offset = 0;
3820 Info.flags = MachineMemOperand::MOLoad;
3821 Info.align =
3822 MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
3824 return true;
3825 }
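// Texture and tld4 intrinsics returning four floats: the opcode comes from
// getOpcForTextureInstr() and the access is modeled as a 16-byte v4f32 load
// with no known base pointer.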
3827 case Intrinsic::nvvm_tex_1d_v4f32_s32:
3828 case Intrinsic::nvvm_tex_1d_v4f32_f32:
3829 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3830 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3831 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3832 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3833 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3834 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3835 case Intrinsic::nvvm_tex_2d_v4f32_s32:
3836 case Intrinsic::nvvm_tex_2d_v4f32_f32:
3837 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3838 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3839 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3840 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3841 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3842 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3843 case Intrinsic::nvvm_tex_3d_v4f32_s32:
3844 case Intrinsic::nvvm_tex_3d_v4f32_f32:
3845 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3846 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3847 case Intrinsic::nvvm_tex_cube_v4f32_f32:
3848 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3849 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3850 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3851 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3852 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3853 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3854 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3855 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3856 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3857 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3858 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3859 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3860 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3861 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3862 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3863 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3864 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3865 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3866 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3867 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3868 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3869 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3870 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3871 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3872 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3873 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3874 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3875 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3876 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3877 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3878 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3879 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3880 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3881 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3882 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3883 Info.opc = getOpcForTextureInstr(Intrinsic);
3884 Info.memVT = MVT::v4f32;
3885 Info.ptrVal = nullptr;
3886 Info.offset = 0;
3887 Info.flags = MachineMemOperand::MOLoad;
3888 Info.align = Align(16);
3889 return true;
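// Texture and tld4 intrinsics returning four 32-bit integers (signed or
// unsigned) get the same treatment with a v4i32 memory type.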
3891 case Intrinsic::nvvm_tex_1d_v4s32_s32:
3892 case Intrinsic::nvvm_tex_1d_v4s32_f32:
3893 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3894 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3895 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3896 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3897 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3898 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3899 case Intrinsic::nvvm_tex_2d_v4s32_s32:
3900 case Intrinsic::nvvm_tex_2d_v4s32_f32:
3901 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3902 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3903 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3904 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3905 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3906 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3907 case Intrinsic::nvvm_tex_3d_v4s32_s32:
3908 case Intrinsic::nvvm_tex_3d_v4s32_f32:
3909 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3910 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3911 case Intrinsic::nvvm_tex_cube_v4s32_f32:
3912 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3913 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3914 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3915 case Intrinsic::nvvm_tex_cube_v4u32_f32:
3916 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3917 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3918 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3919 case Intrinsic::nvvm_tex_1d_v4u32_s32:
3920 case Intrinsic::nvvm_tex_1d_v4u32_f32:
3921 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3922 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3923 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3924 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3925 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3926 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3927 case Intrinsic::nvvm_tex_2d_v4u32_s32:
3928 case Intrinsic::nvvm_tex_2d_v4u32_f32:
3929 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3930 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3931 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3932 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3933 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3934 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3935 case Intrinsic::nvvm_tex_3d_v4u32_s32:
3936 case Intrinsic::nvvm_tex_3d_v4u32_f32:
3937 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3938 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3939 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3940 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3941 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3942 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3943 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3944 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3945 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3946 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3947 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3948 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3949 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3950 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3951 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3952 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3953 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3954 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3955 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3956 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3957 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3958 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3959 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3960 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3961 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3962 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3963 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3964 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3965 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3966 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3967 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3968 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3969 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3970 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3971 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3972 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3973 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3974 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3975 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3976 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3977 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3978 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3979 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3980 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3981 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3982 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3983 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3984 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3985 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3986 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3987 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3988 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3989 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3990 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3991 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3992 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3993 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3994 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3995 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3996 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3997 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3998 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3999 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4000 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4001 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4002 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4003 Info.opc = getOpcForTextureInstr(Intrinsic);
4004 Info.memVT = MVT::v4i32;
4005 Info.ptrVal = nullptr;
4006 Info.offset = 0;
4007 Info.flags = MachineMemOperand::MOLoad;
4008 Info.align = Align(16);
4009 return true;
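// Surface loads of i8 elements (scalar, v2 and v4, for every geometry and
// clamp/trap/zero mode): memVT is the element type, i8.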
4011 case Intrinsic::nvvm_suld_1d_i8_clamp:
4012 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4013 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4014 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4015 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4016 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4017 case Intrinsic::nvvm_suld_2d_i8_clamp:
4018 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4019 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4020 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4021 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4022 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4023 case Intrinsic::nvvm_suld_3d_i8_clamp:
4024 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4025 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4026 case Intrinsic::nvvm_suld_1d_i8_trap:
4027 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4028 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4029 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4030 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4031 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4032 case Intrinsic::nvvm_suld_2d_i8_trap:
4033 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4034 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4035 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4036 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4037 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4038 case Intrinsic::nvvm_suld_3d_i8_trap:
4039 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4040 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4041 case Intrinsic::nvvm_suld_1d_i8_zero:
4042 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4043 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4044 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4045 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4046 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4047 case Intrinsic::nvvm_suld_2d_i8_zero:
4048 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4049 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4050 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4051 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4052 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4053 case Intrinsic::nvvm_suld_3d_i8_zero:
4054 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4055 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4056 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4057 Info.memVT = MVT::i8;
4058 Info.ptrVal = nullptr;
4059 Info.offset = 0;
4060 Info.flags = MachineMemOperand::MOLoad;
4061 Info.align = Align(16);
4062 return true;
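// Surface loads of i16 elements: identical handling with an i16 memory type.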
4064 case Intrinsic::nvvm_suld_1d_i16_clamp:
4065 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4066 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4067 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4068 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4069 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4070 case Intrinsic::nvvm_suld_2d_i16_clamp:
4071 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4072 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4073 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4074 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4075 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4076 case Intrinsic::nvvm_suld_3d_i16_clamp:
4077 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4078 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4079 case Intrinsic::nvvm_suld_1d_i16_trap:
4080 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4081 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4082 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4083 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4084 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4085 case Intrinsic::nvvm_suld_2d_i16_trap:
4086 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4087 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4088 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4089 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4090 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4091 case Intrinsic::nvvm_suld_3d_i16_trap:
4092 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4093 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4094 case Intrinsic::nvvm_suld_1d_i16_zero:
4095 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4096 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4097 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4098 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4099 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4100 case Intrinsic::nvvm_suld_2d_i16_zero:
4101 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4102 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4103 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4104 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4105 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4106 case Intrinsic::nvvm_suld_3d_i16_zero:
4107 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4108 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4109 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4110 Info.memVT = MVT::i16;
4111 Info.ptrVal = nullptr;
4112 Info.offset = 0;
4113 Info.flags = MachineMemOperand::MOLoad;
4114 Info.align = Align(16);
4115 return true;
4117 case Intrinsic::nvvm_suld_1d_i32_clamp:
4118 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4119 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4120 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4121 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4122 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4123 case Intrinsic::nvvm_suld_2d_i32_clamp:
4124 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4125 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4126 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4127 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4128 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4129 case Intrinsic::nvvm_suld_3d_i32_clamp:
4130 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4131 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4132 case Intrinsic::nvvm_suld_1d_i32_trap:
4133 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4134 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4135 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4136 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4137 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4138 case Intrinsic::nvvm_suld_2d_i32_trap:
4139 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4140 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4141 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4142 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4143 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4144 case Intrinsic::nvvm_suld_3d_i32_trap:
4145 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4146 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4147 case Intrinsic::nvvm_suld_1d_i32_zero:
4148 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4149 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4150 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4151 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4152 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4153 case Intrinsic::nvvm_suld_2d_i32_zero:
4154 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4155 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4156 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4157 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4158 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4159 case Intrinsic::nvvm_suld_3d_i32_zero:
4160 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4161 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4162 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4163 Info.memVT = MVT::i32;
4164 Info.ptrVal = nullptr;
4165 Info.offset = 0;
4166 Info.flags = MachineMemOperand::MOLoad;
4167 Info.align = Align(16);
4168 return true;
4170 case Intrinsic::nvvm_suld_1d_i64_clamp:
4171 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4172 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4173 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4174 case Intrinsic::nvvm_suld_2d_i64_clamp:
4175 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4176 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4177 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4178 case Intrinsic::nvvm_suld_3d_i64_clamp:
4179 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4180 case Intrinsic::nvvm_suld_1d_i64_trap:
4181 case Intrinsic::nvvm_suld_1d_v2i64_trap:
4182 case Intrinsic::nvvm_suld_1d_array_i64_trap:
4183 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4184 case Intrinsic::nvvm_suld_2d_i64_trap:
4185 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4186 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4187 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4188 case Intrinsic::nvvm_suld_3d_i64_trap:
4189 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4190 case Intrinsic::nvvm_suld_1d_i64_zero:
4191 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4192 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4193 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4194 case Intrinsic::nvvm_suld_2d_i64_zero:
4195 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4196 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4197 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4198 case Intrinsic::nvvm_suld_3d_i64_zero:
4199 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4200 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4201 Info.memVT = MVT::i64;
4202 Info.ptrVal = nullptr;
4203 Info.offset = 0;
4204 Info.flags = MachineMemOperand::MOLoad;
4205 Info.align = Align(16);
4206 return true;
4208 return false;
4211 /// isLegalAddressingMode - Return true if the addressing mode represented
4212 /// by AM is legal for this target, for a load/store of the specified type.
4213 /// Used to guide target-specific optimizations, like loop strength reduction
4214 /// (LoopStrengthReduce.cpp) and address-mode optimization for memory accesses
4215 /// (CodeGenPrepare.cpp).
4216 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
4217 const AddrMode &AM, Type *Ty,
4218 unsigned AS, Instruction *I) const {
4219 // AddrMode - This represents an addressing mode of:
4220 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
4222 // The legal address modes are
4223 // - [avar]
4224 // - [areg]
4225 // - [areg+immoff]
4226 // - [immAddr]
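//
// Informally (a sketch, not an exhaustive list), these modes correspond to PTX
// operands such as the following, where 'gvar' and '%rd1' are placeholder names:
//   ld.global.f32 %f0, [gvar];        // [avar]
//   ld.global.f32 %f0, [%rd1];        // [areg]
//   ld.global.f32 %f0, [%rd1+16];     // [areg+immoff]
//   ld.global.f32 %f0, [16];          // [immAddr]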
4228 if (AM.BaseGV) {
4229 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
4232 switch (AM.Scale) {
4233 case 0: // "r", "r+i" or "i" is allowed
4234 break;
4235 case 1:
4236 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
4237 return false;
4238 // Otherwise we have r+i.
4239 break;
4240 default:
4241 // No scale > 1 is allowed
4242 return false;
4244 return true;
4247 //===----------------------------------------------------------------------===//
4248 // NVPTX Inline Assembly Support
4249 //===----------------------------------------------------------------------===//
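// As an informal illustration (not part of this file's logic), the constraint
// letters handled below are the ones used in NVPTX inline asm, e.g. in CUDA
// source, where res/a/b/x/y are placeholder variable names:
//   asm("add.s32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b));  // 'r': 32-bit int reg
//   asm("add.f32 %0, %1, %2;" : "=f"(res) : "f"(x), "f"(y));  // 'f': 32-bit float reg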
4251 /// getConstraintType - Given a constraint letter, return the type of
4252 /// constraint it is for this target.
4253 NVPTXTargetLowering::ConstraintType
4254 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
4255 if (Constraint.size() == 1) {
4256 switch (Constraint[0]) {
4257 default:
4258 break;
4259 case 'b':
4260 case 'r':
4261 case 'h':
4262 case 'c':
4263 case 'l':
4264 case 'f':
4265 case 'd':
4266 case '0':
4267 case 'N':
4268 return C_RegisterClass;
4271 return TargetLowering::getConstraintType(Constraint);
4274 std::pair<unsigned, const TargetRegisterClass *>
4275 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
4276 StringRef Constraint,
4277 MVT VT) const {
4278 if (Constraint.size() == 1) {
4279 switch (Constraint[0]) {
4280 case 'b':
4281 return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
4282 case 'c':
4283 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4284 case 'h':
4285 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4286 case 'r':
4287 return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
4288 case 'l':
4289 case 'N':
4290 return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
4291 case 'f':
4292 return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
4293 case 'd':
4294 return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
4297 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
4300 //===----------------------------------------------------------------------===//
4301 // NVPTX DAG Combining
4302 //===----------------------------------------------------------------------===//
4304 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
4305 CodeGenOpt::Level OptLevel) const {
4306 // Always honor command-line argument
4307 if (FMAContractLevelOpt.getNumOccurrences() > 0)
4308 return FMAContractLevelOpt > 0;
4310 // Do not contract if we're not optimizing the code.
4311 if (OptLevel == 0)
4312 return false;
4314 // Honor TargetOptions flags that explicitly say fusion is okay.
4315 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
4316 return true;
4318 return allowUnsafeFPMath(MF);
4321 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
4322 // Honor TargetOptions flags that explicitly say unsafe math is okay.
4323 if (MF.getTarget().Options.UnsafeFPMath)
4324 return true;
4326 // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
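// For reference (informal sketch), the attribute checked below shows up in
// LLVM IR roughly as follows, where @foo is a placeholder function name:
//   define float @foo(float %x) #0 { ... }
//   attributes #0 = { "unsafe-fp-math"="true" }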
4327 const Function &F = MF.getFunction();
4328 if (F.hasFnAttribute("unsafe-fp-math")) {
4329 Attribute Attr = F.getFnAttribute("unsafe-fp-math");
4330 StringRef Val = Attr.getValueAsString();
4331 if (Val == "true")
4332 return true;
4335 return false;
4338 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4339 /// operands N0 and N1. This is a helper for PerformADDCombine that is
4340 /// called with the default operands, and if that fails, with commuted
4341 /// operands.
4342 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
4343 TargetLowering::DAGCombinerInfo &DCI,
4344 const NVPTXSubtarget &Subtarget,
4345 CodeGenOpt::Level OptLevel) {
4346 SelectionDAG &DAG = DCI.DAG;
4347 // Skip vector types; this combine only handles scalar values.
4348 EVT VT = N0.getValueType();
4349 if (VT.isVector())
4350 return SDValue();
4352 // fold (add (mul a, b), c) -> (mad a, b, c)
4354 if (N0.getOpcode() == ISD::MUL) {
4355 assert (VT.isInteger());
4356 // For integer:
4357 // Since an integer multiply-add costs the same as an integer multiply
4358 // but more than an integer add, do the fusion only when the mul is
4359 // used solely by this add.
4360 if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
4361 !N0.getNode()->hasOneUse())
4362 return SDValue();
4364 // Do the folding
4365 return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
4366 N0.getOperand(0), N0.getOperand(1), N1);
4368 else if (N0.getOpcode() == ISD::FMUL) {
4369 if (VT == MVT::f32 || VT == MVT::f64) {
4370 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4371 &DAG.getTargetLoweringInfo());
4372 if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
4373 return SDValue();
4375 // For floating point:
4376 // Do the fusion only when the mul has fewer than 5 uses and all of
4377 // them are adds.
4378 // The heuristic is that if a use is not an add, then that use
4379 // cannot be fused into an fma, so the mul is still needed anyway.
4380 // If there are more than 4 uses, even if they are all adds, fusing
4381 // them will increase register pressure.
4383 int numUses = 0;
4384 int nonAddCount = 0;
4385 for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
4386 UE = N0.getNode()->use_end();
4387 UI != UE; ++UI) {
4388 numUses++;
4389 SDNode *User = *UI;
4390 if (User->getOpcode() != ISD::FADD)
4391 ++nonAddCount;
4393 if (numUses >= 5)
4394 return SDValue();
4395 if (nonAddCount) {
4396 int orderNo = N->getIROrder();
4397 int orderNo2 = N0.getNode()->getIROrder();
4398 // A simple heuristic for estimating potential register pressure: the
4399 // difference in IR order approximates the distance between the def and
4400 // its use, and the longer that distance, the more likely the fusion is
4401 // to increase register pressure.
4402 if (orderNo - orderNo2 < 500)
4403 return SDValue();
4405 // Now, check if at least one of the FMUL's operands is live beyond the node N,
4406 // which guarantees that the FMA will not increase register pressure at node N.
4407 bool opIsLive = false;
4408 const SDNode *left = N0.getOperand(0).getNode();
4409 const SDNode *right = N0.getOperand(1).getNode();
4411 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
4412 opIsLive = true;
4414 if (!opIsLive)
4415 for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
4416 SDNode *User = *UI;
4417 int orderNo3 = User->getIROrder();
4418 if (orderNo3 > orderNo) {
4419 opIsLive = true;
4420 break;
4424 if (!opIsLive)
4425 for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
4426 SDNode *User = *UI;
4427 int orderNo3 = User->getIROrder();
4428 if (orderNo3 > orderNo) {
4429 opIsLive = true;
4430 break;
4434 if (!opIsLive)
4435 return SDValue();
4438 return DAG.getNode(ISD::FMA, SDLoc(N), VT,
4439 N0.getOperand(0), N0.getOperand(1), N1);
4443 return SDValue();
4446 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
4448 static SDValue PerformADDCombine(SDNode *N,
4449 TargetLowering::DAGCombinerInfo &DCI,
4450 const NVPTXSubtarget &Subtarget,
4451 CodeGenOpt::Level OptLevel) {
4452 SDValue N0 = N->getOperand(0);
4453 SDValue N1 = N->getOperand(1);
4455 // First try with the default operand order.
4456 if (SDValue Result =
4457 PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
4458 return Result;
4460 // If that didn't work, try again with the operands commuted.
4461 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
4464 static SDValue PerformANDCombine(SDNode *N,
4465 TargetLowering::DAGCombinerInfo &DCI) {
4466 // The type legalizer turns a vector load of i8 values into a zextload to i16
4467 // registers, optionally ANY_EXTENDs it (if the target type is an integer),
4468 // and ANDs off the high 8 bits. Since we turn this load into a
4469 // target-specific DAG node, the DAG combiner fails to eliminate these AND
4470 // nodes. Do that here.
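//
// Informally, the pattern being matched is roughly:
//   (and (any_extend (LoadV2/LoadV4 ..., memVT = v2i8/v4i8)), 0xff)
// and, when the load is not a sextload, the AND is redundant and can be
// replaced by the (zero-extended) load result itself.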
4471 SDValue Val = N->getOperand(0);
4472 SDValue Mask = N->getOperand(1);
4474 if (isa<ConstantSDNode>(Val)) {
4475 std::swap(Val, Mask);
4478 SDValue AExt;
4479 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
4480 if (Val.getOpcode() == ISD::ANY_EXTEND) {
4481 AExt = Val;
4482 Val = Val->getOperand(0);
4485 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
4486 Val = Val->getOperand(0);
4489 if (Val->getOpcode() == NVPTXISD::LoadV2 ||
4490 Val->getOpcode() == NVPTXISD::LoadV4) {
4491 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
4492 if (!MaskCnst) {
4493 // Not an AND with a constant
4494 return SDValue();
4497 uint64_t MaskVal = MaskCnst->getZExtValue();
4498 if (MaskVal != 0xff) {
4499 // Not an AND that chops off top 8 bits
4500 return SDValue();
4503 MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
4504 if (!Mem) {
4505 // Not a MemSDNode?!?
4506 return SDValue();
4509 EVT MemVT = Mem->getMemoryVT();
4510 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
4511 // We only handle the i8 case
4512 return SDValue();
4515 unsigned ExtType =
4516 cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
4517 getZExtValue();
4518 if (ExtType == ISD::SEXTLOAD) {
4519 // If for some reason the load is a sextload, the and is needed to zero
4520 // out the high 8 bits
4521 return SDValue();
4524 bool AddTo = false;
4525 if (AExt.getNode() != nullptr) {
4526 // Re-insert the ext as a zext.
4527 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
4528 AExt.getValueType(), Val);
4529 AddTo = true;
4532 // If we get here, the AND is unnecessary. Just replace it with the load
4533 DCI.CombineTo(N, Val, AddTo);
4536 return SDValue();
4539 static SDValue PerformREMCombine(SDNode *N,
4540 TargetLowering::DAGCombinerInfo &DCI,
4541 CodeGenOpt::Level OptLevel) {
4542 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
4544 // Don't do anything at less than -O2.
4545 if (OptLevel < CodeGenOpt::Default)
4546 return SDValue();
4548 SelectionDAG &DAG = DCI.DAG;
4549 SDLoc DL(N);
4550 EVT VT = N->getValueType(0);
4551 bool IsSigned = N->getOpcode() == ISD::SREM;
4552 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
4554 const SDValue &Num = N->getOperand(0);
4555 const SDValue &Den = N->getOperand(1);
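// Look for an existing division of Num by Den among Num's users; if one
// exists, the remainder can be rewritten in terms of it so the (expensive)
// divide is computed only once and shared.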
4557 for (const SDNode *U : Num->uses()) {
4558 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
4559 U->getOperand(1) == Den) {
4560 // Num % Den -> Num - (Num / Den) * Den
4561 return DAG.getNode(ISD::SUB, DL, VT, Num,
4562 DAG.getNode(ISD::MUL, DL, VT,
4563 DAG.getNode(DivOpc, DL, VT, Num, Den),
4564 Den));
4567 return SDValue();
4570 enum OperandSignedness {
4571 Signed = 0,
4572 Unsigned,
4573 Unknown
4576 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
4577 /// that can be demoted to \p OptSize bits without loss of information. The
4578 /// signedness of the operand, if determinable, is placed in \p S.
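///
/// For example (informally), with OptSize = 16:
///   (sext i16 %a to i32)   -> demotable, S = Signed
///   (zext i16 %a to i32)   -> demotable, S = Unsigned
///   anything else          -> not demotable, S = Unknown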
4579 static bool IsMulWideOperandDemotable(SDValue Op,
4580 unsigned OptSize,
4581 OperandSignedness &S) {
4582 S = Unknown;
4584 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
4585 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
4586 EVT OrigVT = Op.getOperand(0).getValueType();
4587 if (OrigVT.getSizeInBits() <= OptSize) {
4588 S = Signed;
4589 return true;
4591 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
4592 EVT OrigVT = Op.getOperand(0).getValueType();
4593 if (OrigVT.getSizeInBits() <= OptSize) {
4594 S = Unsigned;
4595 return true;
4599 return false;
4602 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
4603 /// be demoted to \p OptSize bits without loss of information. If the operands
4604 /// contain a constant, it should appear as the RHS operand. The signedness of
4605 /// the operands is placed in \p IsSigned.
4606 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
4607 unsigned OptSize,
4608 bool &IsSigned) {
4609 OperandSignedness LHSSign;
4611 // The LHS operand must be a demotable op
4612 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
4613 return false;
4615 // We should have been able to determine the signedness from the LHS
4616 if (LHSSign == Unknown)
4617 return false;
4619 IsSigned = (LHSSign == Signed);
4621 // The RHS can be a demotable op or a constant
4622 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
4623 const APInt &Val = CI->getAPIntValue();
4624 if (LHSSign == Unsigned) {
4625 return Val.isIntN(OptSize);
4626 } else {
4627 return Val.isSignedIntN(OptSize);
4629 } else {
4630 OperandSignedness RHSSign;
4631 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
4632 return false;
4634 return LHSSign == RHSSign;
4638 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
4639 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
4640 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
4641 /// amount.
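///
/// Informally, for a 32-bit multiply (so OptSize = 16) this turns e.g.
///   mul i32 (sext i16 %a), (sext i16 %b)  -->  mul.wide.s16 %a, %b
///   shl i32 (zext i16 %a), 4              -->  mul.wide.u16 %a, 16
/// where %a and %b stand for arbitrary 16-bit values.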
4642 static SDValue TryMULWIDECombine(SDNode *N,
4643 TargetLowering::DAGCombinerInfo &DCI) {
4644 EVT MulType = N->getValueType(0);
4645 if (MulType != MVT::i32 && MulType != MVT::i64) {
4646 return SDValue();
4649 SDLoc DL(N);
4650 unsigned OptSize = MulType.getSizeInBits() >> 1;
4651 SDValue LHS = N->getOperand(0);
4652 SDValue RHS = N->getOperand(1);
4654 // Canonicalize the multiply so the constant (if any) is on the right
4655 if (N->getOpcode() == ISD::MUL) {
4656 if (isa<ConstantSDNode>(LHS)) {
4657 std::swap(LHS, RHS);
4661 // If we have a SHL, determine the actual multiply amount
4662 if (N->getOpcode() == ISD::SHL) {
4663 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
4664 if (!ShlRHS) {
4665 return SDValue();
4668 APInt ShiftAmt = ShlRHS->getAPIntValue();
4669 unsigned BitWidth = MulType.getSizeInBits();
4670 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
4671 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
4672 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
4673 } else {
4674 return SDValue();
4678 bool Signed;
4679 // Verify that our operands are demotable
4680 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
4681 return SDValue();
4684 EVT DemotedVT;
4685 if (MulType == MVT::i32) {
4686 DemotedVT = MVT::i16;
4687 } else {
4688 DemotedVT = MVT::i32;
4691 // Truncate the operands to the correct size. Note that these are just for
4692 // type consistency and will (likely) be eliminated in later phases.
4693 SDValue TruncLHS =
4694 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
4695 SDValue TruncRHS =
4696 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
4698 unsigned Opc;
4699 if (Signed) {
4700 Opc = NVPTXISD::MUL_WIDE_SIGNED;
4701 } else {
4702 Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
4705 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
4708 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
4709 static SDValue PerformMULCombine(SDNode *N,
4710 TargetLowering::DAGCombinerInfo &DCI,
4711 CodeGenOpt::Level OptLevel) {
4712 if (OptLevel > 0) {
4713 // Try mul.wide combining at OptLevel > 0
4714 if (SDValue Ret = TryMULWIDECombine(N, DCI))
4715 return Ret;
4718 return SDValue();
4721 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
4722 static SDValue PerformSHLCombine(SDNode *N,
4723 TargetLowering::DAGCombinerInfo &DCI,
4724 CodeGenOpt::Level OptLevel) {
4725 if (OptLevel > 0) {
4726 // Try mul.wide combining at OptLevel > 0
4727 if (SDValue Ret = TryMULWIDECombine(N, DCI))
4728 return Ret;
4731 return SDValue();
4734 static SDValue PerformSETCCCombine(SDNode *N,
4735 TargetLowering::DAGCombinerInfo &DCI) {
4736 EVT CCType = N->getValueType(0);
4737 SDValue A = N->getOperand(0);
4738 SDValue B = N->getOperand(1);
4740 if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
4741 return SDValue();
4743 SDLoc DL(N);
4744 // setp.f16x2 returns two scalar predicates, which we need to
4745 // convert back to v2i1. The returned result will be scalarized by
4746 // the legalizer, but the comparison will remain a single vector
4747 // instruction.
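// Informally, for
//   setcc v2i1 (v2f16 %a, v2f16 %b, cc)
// we emit one SETP_F16X2 node producing two i1 results and rebuild the v2i1
// value from them with BUILD_VECTOR.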
4748 SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
4749 DCI.DAG.getVTList(MVT::i1, MVT::i1),
4750 {A, B, N->getOperand(2)});
4751 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
4752 CCNode.getValue(1));
4755 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
4756 DAGCombinerInfo &DCI) const {
4757 CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
4758 switch (N->getOpcode()) {
4759 default: break;
4760 case ISD::ADD:
4761 case ISD::FADD:
4762 return PerformADDCombine(N, DCI, STI, OptLevel);
4763 case ISD::MUL:
4764 return PerformMULCombine(N, DCI, OptLevel);
4765 case ISD::SHL:
4766 return PerformSHLCombine(N, DCI, OptLevel);
4767 case ISD::AND:
4768 return PerformANDCombine(N, DCI);
4769 case ISD::UREM:
4770 case ISD::SREM:
4771 return PerformREMCombine(N, DCI, OptLevel);
4772 case ISD::SETCC:
4773 return PerformSETCCCombine(N, DCI);
4775 return SDValue();
4778 /// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
4779 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
4780 SmallVectorImpl<SDValue> &Results) {
4781 EVT ResVT = N->getValueType(0);
4782 SDLoc DL(N);
4784 assert(ResVT.isVector() && "Vector load must have vector type");
4786 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
4787 // legal. We can (and should) split that into 2 loads of <2 x double> here
4788 // but I'm leaving that as a TODO for now.
4789 assert(ResVT.isSimple() && "Can only handle simple types");
4790 switch (ResVT.getSimpleVT().SimpleTy) {
4791 default:
4792 return;
4793 case MVT::v2i8:
4794 case MVT::v2i16:
4795 case MVT::v2i32:
4796 case MVT::v2i64:
4797 case MVT::v2f16:
4798 case MVT::v2f32:
4799 case MVT::v2f64:
4800 case MVT::v4i8:
4801 case MVT::v4i16:
4802 case MVT::v4i32:
4803 case MVT::v4f16:
4804 case MVT::v4f32:
4805 case MVT::v8f16: // <4 x f16x2>
4806 // This is a "native" vector type
4807 break;
4810 LoadSDNode *LD = cast<LoadSDNode>(N);
4812 unsigned Align = LD->getAlignment();
4813 auto &TD = DAG.getDataLayout();
4814 unsigned PrefAlign =
4815 TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
4816 if (Align < PrefAlign) {
4817 // This load is not sufficiently aligned, so bail out and let this vector
4818 // load be scalarized. Note that we may still be able to emit smaller
4819 // vector loads. For example, if we are loading a <4 x float> with an
4820 // alignment of 8, this check will fail but the legalizer will try again
4821 // with 2 x <2 x float>, which will succeed with an alignment of 8.
4822 return;
4825 EVT EltVT = ResVT.getVectorElementType();
4826 unsigned NumElts = ResVT.getVectorNumElements();
4828 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
4829 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
4830 // loaded type to i16 and propagate the "real" type as the memory type.
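// E.g. (informally) a <2 x i8> load becomes a LoadV2 that produces two i16
// values with memVT = v2i8; the extra high bits are truncated away below when
// the elements are reassembled into the result vector.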
4831 bool NeedTrunc = false;
4832 if (EltVT.getSizeInBits() < 16) {
4833 EltVT = MVT::i16;
4834 NeedTrunc = true;
4837 unsigned Opcode = 0;
4838 SDVTList LdResVTs;
4839 bool LoadF16x2 = false;
4841 switch (NumElts) {
4842 default:
4843 return;
4844 case 2:
4845 Opcode = NVPTXISD::LoadV2;
4846 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
4847 break;
4848 case 4: {
4849 Opcode = NVPTXISD::LoadV4;
4850 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
4851 LdResVTs = DAG.getVTList(ListVTs);
4852 break;
4854 case 8: {
4855 // v8f16 is a special case. PTX doesn't have ld.v8.f16
4856 // instruction. Instead, we split the vector into v2f16 chunks and
4857 // load them with ld.v4.b32.
4858 assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
4859 LoadF16x2 = true;
4860 Opcode = NVPTXISD::LoadV4;
4861 EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
4862 MVT::Other};
4863 LdResVTs = DAG.getVTList(ListVTs);
4864 break;
4868 // Copy regular operands
4869 SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
4871 // The select routine does not have access to the LoadSDNode instance, so
4872 // pass along the extension information
4873 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
4875 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
4876 LD->getMemoryVT(),
4877 LD->getMemOperand());
4879 SmallVector<SDValue, 8> ScalarRes;
4880 if (LoadF16x2) {
4881 // Split v2f16 subvectors back into individual elements.
4882 NumElts /= 2;
4883 for (unsigned i = 0; i < NumElts; ++i) {
4884 SDValue SubVector = NewLD.getValue(i);
4885 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
4886 DAG.getIntPtrConstant(0, DL));
4887 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
4888 DAG.getIntPtrConstant(1, DL));
4889 ScalarRes.push_back(E0);
4890 ScalarRes.push_back(E1);
4892 } else {
4893 for (unsigned i = 0; i < NumElts; ++i) {
4894 SDValue Res = NewLD.getValue(i);
4895 if (NeedTrunc)
4896 Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
4897 ScalarRes.push_back(Res);
4901 SDValue LoadChain = NewLD.getValue(NumElts);
4903 SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
4905 Results.push_back(BuildVec);
4906 Results.push_back(LoadChain);
4909 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
4910 SmallVectorImpl<SDValue> &Results) {
4911 SDValue Chain = N->getOperand(0);
4912 SDValue Intrin = N->getOperand(1);
4913 SDLoc DL(N);
4915 // Get the intrinsic ID
4916 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
4917 switch (IntrinNo) {
4918 default:
4919 return;
4920 case Intrinsic::nvvm_ldg_global_i:
4921 case Intrinsic::nvvm_ldg_global_f:
4922 case Intrinsic::nvvm_ldg_global_p:
4923 case Intrinsic::nvvm_ldu_global_i:
4924 case Intrinsic::nvvm_ldu_global_f:
4925 case Intrinsic::nvvm_ldu_global_p: {
4926 EVT ResVT = N->getValueType(0);
4928 if (ResVT.isVector()) {
4929 // Vector LDG/LDU
4931 unsigned NumElts = ResVT.getVectorNumElements();
4932 EVT EltVT = ResVT.getVectorElementType();
4934 // Since LDU/LDG are target nodes, we cannot rely on DAG type
4935 // legalization.
4936 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
4937 // loaded type to i16 and propagate the "real" type as the memory type.
4938 bool NeedTrunc = false;
4939 if (EltVT.getSizeInBits() < 16) {
4940 EltVT = MVT::i16;
4941 NeedTrunc = true;
4944 unsigned Opcode = 0;
4945 SDVTList LdResVTs;
4947 switch (NumElts) {
4948 default:
4949 return;
4950 case 2:
4951 switch (IntrinNo) {
4952 default:
4953 return;
4954 case Intrinsic::nvvm_ldg_global_i:
4955 case Intrinsic::nvvm_ldg_global_f:
4956 case Intrinsic::nvvm_ldg_global_p:
4957 Opcode = NVPTXISD::LDGV2;
4958 break;
4959 case Intrinsic::nvvm_ldu_global_i:
4960 case Intrinsic::nvvm_ldu_global_f:
4961 case Intrinsic::nvvm_ldu_global_p:
4962 Opcode = NVPTXISD::LDUV2;
4963 break;
4965 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
4966 break;
4967 case 4: {
4968 switch (IntrinNo) {
4969 default:
4970 return;
4971 case Intrinsic::nvvm_ldg_global_i:
4972 case Intrinsic::nvvm_ldg_global_f:
4973 case Intrinsic::nvvm_ldg_global_p:
4974 Opcode = NVPTXISD::LDGV4;
4975 break;
4976 case Intrinsic::nvvm_ldu_global_i:
4977 case Intrinsic::nvvm_ldu_global_f:
4978 case Intrinsic::nvvm_ldu_global_p:
4979 Opcode = NVPTXISD::LDUV4;
4980 break;
4982 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
4983 LdResVTs = DAG.getVTList(ListVTs);
4984 break;
4988 SmallVector<SDValue, 8> OtherOps;
4990 // Copy regular operands
4992 OtherOps.push_back(Chain); // Chain
4993 // Skip operand 1 (intrinsic ID)
4994 // Others
4995 OtherOps.append(N->op_begin() + 2, N->op_end());
4997 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
4999 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5000 MemSD->getMemoryVT(),
5001 MemSD->getMemOperand());
5003 SmallVector<SDValue, 4> ScalarRes;
5005 for (unsigned i = 0; i < NumElts; ++i) {
5006 SDValue Res = NewLD.getValue(i);
5007 if (NeedTrunc)
5008 Res =
5009 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5010 ScalarRes.push_back(Res);
5013 SDValue LoadChain = NewLD.getValue(NumElts);
5015 SDValue BuildVec =
5016 DAG.getBuildVector(ResVT, DL, ScalarRes);
5018 Results.push_back(BuildVec);
5019 Results.push_back(LoadChain);
5020 } else {
5021 // i8 LDG/LDU
5022 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
5023 "Custom handling of non-i8 ldu/ldg?");
5025 // Just copy all operands as-is
5026 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
5028 // Force output to i16
5029 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
5031 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
5033 // We make sure the memory type is i8, which will be used during isel
5034 // to select the proper instruction.
5035 SDValue NewLD =
5036 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
5037 MVT::i8, MemSD->getMemOperand());
5039 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
5040 NewLD.getValue(0)));
5041 Results.push_back(NewLD.getValue(1));
5047 void NVPTXTargetLowering::ReplaceNodeResults(
5048 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
5049 switch (N->getOpcode()) {
5050 default:
5051 report_fatal_error("Unhandled custom legalization");
5052 case ISD::LOAD:
5053 ReplaceLoadVector(N, DAG, Results);
5054 return;
5055 case ISD::INTRINSIC_W_CHAIN:
5056 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
5057 return;
5061 // Pin NVPTXTargetObjectFile's vtables to this file.
5062 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {}
5064 MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
5065 const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
5066 return getDataSection();