1 //===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file describes the PTX instructions in TableGen format.
11 //===----------------------------------------------------------------------===//
13 include "NVPTXInstrFormats.td"
15 let OperandType = "OPERAND_IMMEDIATE" in {
16 def f16imm : Operand<f16>;
17 def bf16imm : Operand<bf16>;
21 // List of vector specific properties
// Each record below instantiates VecInstTypeEnum with its own integer code
// (1 through 7, plus 15 for isVecOther). VecInstTypeEnum is not defined in
// this file; presumably it comes from the included NVPTXInstrFormats.td --
// TODO confirm.
22 def isVecLD : VecInstTypeEnum<1>;
23 def isVecST : VecInstTypeEnum<2>;
24 def isVecBuild : VecInstTypeEnum<3>;
25 def isVecShuffle : VecInstTypeEnum<4>;
26 def isVecExtract : VecInstTypeEnum<5>;
27 def isVecInsert : VecInstTypeEnum<6>;
28 def isVecDest : VecInstTypeEnum<7>;
29 def isVecOther : VecInstTypeEnum<15>;
31 //===----------------------------------------------------------------------===//
32 // NVPTX Operand Definitions.
33 //===----------------------------------------------------------------------===//
35 def brtarget : Operand<OtherVT>;
37 // CVT conversion modes
38 // These must match the enum in NVPTX.h
//
// Layout of the i32 values defined below:
//   0x0 - 0x9  base leaves (CvtNONE through CvtRNA).
//   0x10       set on every *_FTZ leaf; the low nibble repeats the base leaf
//              (e.g. CvtRN = 0x5, CvtRN_FTZ = 0x15).
//   0x20       CvtSAT; CvtSAT_FTZ = 0x30 is 0x20 | 0x10.
//   0x40       set on every *_RELU leaf (e.g. CvtRN_RELU = 0x45).
// There is no FTZ counterpart of CvtRNA (0x19) in this list.
39 def CvtNONE : PatLeaf<(i32 0x0)>;
40 def CvtRNI : PatLeaf<(i32 0x1)>;
41 def CvtRZI : PatLeaf<(i32 0x2)>;
42 def CvtRMI : PatLeaf<(i32 0x3)>;
43 def CvtRPI : PatLeaf<(i32 0x4)>;
44 def CvtRN : PatLeaf<(i32 0x5)>;
45 def CvtRZ : PatLeaf<(i32 0x6)>;
46 def CvtRM : PatLeaf<(i32 0x7)>;
47 def CvtRP : PatLeaf<(i32 0x8)>;
48 def CvtRNA : PatLeaf<(i32 0x9)>;
// FTZ variants: base value | 0x10.
50 def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
51 def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
52 def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
53 def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
54 def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
55 def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
56 def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
57 def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
58 def CvtRP_FTZ : PatLeaf<(i32 0x18)>;
// SAT variants: 0x20, optionally combined with the FTZ bit.
60 def CvtSAT : PatLeaf<(i32 0x20)>;
61 def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;
// RELU variants: base value | 0x40. Only NONE, RN and RZ have one here.
63 def CvtNONE_RELU : PatLeaf<(i32 0x40)>;
64 def CvtRN_RELU : PatLeaf<(i32 0x45)>;
65 def CvtRZ_RELU : PatLeaf<(i32 0x46)>;
67 def CvtMode : Operand<i32> {
68 let PrintMethod = "printCvtMode";
72 // These must match the enum in NVPTX.h
//
// Comparison-mode leaves. Values 0-17 are the base modes; each *_FTZ leaf
// further down is the matching base value with 0x100 added (e.g. CmpEQ = 0 and
// CmpEQ_FTZ = 0x100; CmpNAN = 17 = 0x11 and CmpNAN_FTZ = 0x111).
73 def CmpEQ : PatLeaf<(i32 0)>;
74 def CmpNE : PatLeaf<(i32 1)>;
75 def CmpLT : PatLeaf<(i32 2)>;
76 def CmpLE : PatLeaf<(i32 3)>;
77 def CmpGT : PatLeaf<(i32 4)>;
78 def CmpGE : PatLeaf<(i32 5)>;
79 def CmpLO : PatLeaf<(i32 6)>;
80 def CmpLS : PatLeaf<(i32 7)>;
81 def CmpHI : PatLeaf<(i32 8)>;
82 def CmpHS : PatLeaf<(i32 9)>;
83 def CmpEQU : PatLeaf<(i32 10)>;
84 def CmpNEU : PatLeaf<(i32 11)>;
85 def CmpLTU : PatLeaf<(i32 12)>;
86 def CmpLEU : PatLeaf<(i32 13)>;
87 def CmpGTU : PatLeaf<(i32 14)>;
88 def CmpGEU : PatLeaf<(i32 15)>;
89 def CmpNUM : PatLeaf<(i32 16)>;
90 def CmpNAN : PatLeaf<(i32 17)>;
// FTZ variants: base value | 0x100. CmpLO/CmpLS/CmpHI/CmpHS (6-9) have no FTZ
// leaf, which is why the values jump from 0x105 to 0x10A.
92 def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
93 def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
94 def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
95 def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
96 def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
97 def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
98 def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
99 def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
100 def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
101 def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
102 def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
103 def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
104 def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
105 def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;
107 def CmpMode : Operand<i32> {
108 let PrintMethod = "printCmpMode";
110 def VecElement : Operand<i32> {
111 let PrintMethod = "printVecElement";
115 // These must match the enum in NVPTX.h
//
// Mode leaves with consecutive i32 values 0x0 through 0x6. The Prmt prefix
// matches the PrmtMode operand defined right after this list.
116 def PrmtNONE : PatLeaf<(i32 0x0)>;
117 def PrmtF4E : PatLeaf<(i32 0x1)>;
118 def PrmtB4E : PatLeaf<(i32 0x2)>;
119 def PrmtRC8 : PatLeaf<(i32 0x3)>;
120 def PrmtECL : PatLeaf<(i32 0x4)>;
121 def PrmtECR : PatLeaf<(i32 0x5)>;
122 def PrmtRC16 : PatLeaf<(i32 0x6)>;
124 def PrmtMode : Operand<i32> {
125 let PrintMethod = "printPrmtMode";
129 //===----------------------------------------------------------------------===//
130 // NVPTX Instruction Predicate Definitions
131 //===----------------------------------------------------------------------===//
// Each Predicate wraps a condition string. The first group forwards to the
// Subtarget accessor of the same name.
134 def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
135 def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
136 def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
137 def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
138 def hasVote : Predicate<"Subtarget->hasVote()">;
139 def hasDouble : Predicate<"Subtarget->hasDouble()">;
140 def hasLDG : Predicate<"Subtarget->hasLDG()">;
141 def hasLDU : Predicate<"Subtarget->hasLDU()">;
// The following conditions call unqualified helpers (no "Subtarget->"); where
// those helpers are declared is not visible in this file.
143 def doF32FTZ : Predicate<"useF32FTZ()">;
144 def doNoF32FTZ : Predicate<"!useF32FTZ()">;
145 def doRsqrtOpt : Predicate<"doRsqrtOpt()">;
// Unlike its neighbours, this condition is a bare identifier rather than a call.
147 def doMulWide : Predicate<"doMulWide">;
// Positive/negative pairs over the same helper.
149 def allowFMA : Predicate<"allowFMA()">;
150 def noFMA : Predicate<"!allowFMA()">;
151 def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
152 def noUnsafeFPMath : Predicate<"!allowUnsafeFPMath()">;
// Selected by the value returned from getDivF32Level(): 0 or 1.
154 def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
155 def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
// Complementary pair over usePrecSqrtF32().
157 def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
158 def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
160 def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
161 def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
// Constant predicates.
163 def True : Predicate<"true">;
164 def False : Predicate<"false">;
// Parameterized predicates: `version` is appended to the ">= " text, so
// hasPTX<N> / hasSM<N> hold when the reported version is at least N.
166 class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
167 class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>;
169 // Explicit records for arch-accelerated SM versions
// Note this is an exact match (== 901) on getFullSmVersion(), not a >= test.
170 def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">;
172 // non-sync shfl instructions are not available on sm_70+ in PTX6.4+
// The two adjacent string literals form a single condition:
// !(SmVersion >= 70 && PTXVersion >= 64).
173 def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
174 "&& Subtarget->getPTXVersion() >= 64)">;
// True when TM.is64Bit() and the pointer size reported for the named address
// space is 32 bits.
176 def useShortPtrLocal : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_LOCAL) == 32">;
177 def useShortPtrShared : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32">;
178 def useShortPtrConst : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_CONST) == 32">;
// Note the differing accessor names: allowFP16Math() vs hasBF16Math().
180 def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
181 def hasBF16Math: Predicate<"Subtarget->hasBF16Math()">;
183 // Helper class to aid conversion between ValueType and a matching RegisterClass.
//
// `name` is the record name of T converted to a string; `ret` is picked by the
// !cond arm whose string equals `name`. Per the arms below, the 16-bit scalars
// (i16, f16, bf16) map to Int16Regs and the two-element vectors (v2i16, v2f16,
// v2bf16) map to Int32Regs, alongside i32.
// NOTE(review): the last arm tests "if64", while the three arms before it use
// an "a" prefix ("ai32", "ai64", "af32"). This may be a typo for "af64" --
// confirm before relying on that arm.
185 class ValueToRegClass<ValueType T> {
186 string name = !cast<string>(T);
187 NVPTXRegClass ret = !cond(
188 !eq(name, "i1"): Int1Regs,
189 !eq(name, "i16"): Int16Regs,
190 !eq(name, "v2i16"): Int32Regs,
191 !eq(name, "i32"): Int32Regs,
192 !eq(name, "i64"): Int64Regs,
193 !eq(name, "f16"): Int16Regs,
194 !eq(name, "v2f16"): Int32Regs,
195 !eq(name, "bf16"): Int16Regs,
196 !eq(name, "v2bf16"): Int32Regs,
197 !eq(name, "f32"): Float32Regs,
198 !eq(name, "f64"): Float64Regs,
199 !eq(name, "ai32"): Int32ArgRegs,
200 !eq(name, "ai64"): Int64ArgRegs,
201 !eq(name, "af32"): Float32ArgRegs,
202 !eq(name, "if64"): Float64ArgRegs,
207 //===----------------------------------------------------------------------===//
208 // Some Common Instruction Class Templates
209 //===----------------------------------------------------------------------===//
211 // Template for instructions which take three int64, int32, or int16 args.
212 // The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
213 multiclass I3<string OpcStr, SDNode OpNode> {
215 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
216 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
217 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
219 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
220 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
221 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
223 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
224 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
225 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
227 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
228 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
229 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;
231 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
232 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
233 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
235 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
236 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
237 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
240 class I16x2<string OpcStr, SDNode OpNode> :
241 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
242 !strconcat(OpcStr, "16x2 \t$dst, $a, $b;"),
243 [(set Int32Regs:$dst, (OpNode (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)))]>,
244 Requires<[hasPTX<80>, hasSM<90>]>;
246 // Template for instructions which take 3 int args. The instructions are
247 // named "<OpcStr>.s32" (e.g. "addc.cc.s32") or "<OpcStr>.s64"; the .s64
// forms additionally require hasPTX<43>.
248 multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
249 let hasSideEffects = 1 in {
251 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
252 !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
253 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
255 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
256 !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
257 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;
259 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
260 !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
261 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>,
262 Requires<[hasPTX<43>]>;
264 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
265 !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
266 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>,
267 Requires<[hasPTX<43>]>;
271 // Template for instructions which take three fp64 or fp32 args. The
272 // instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
274 // Also defines ftz (flush subnormal inputs and results to sign-preserving
275 // zero) variants for fp32 functions.
277 // This multiclass should be used for nodes that cannot be folded into FMAs.
278 // For nodes that can be folded into FMAs (i.e. adds and muls), use
// F3_fma_component.
280 multiclass F3<string OpcStr, SDNode OpNode> {
282 NVPTXInst<(outs Float64Regs:$dst),
283 (ins Float64Regs:$a, Float64Regs:$b),
284 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
285 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
287 NVPTXInst<(outs Float64Regs:$dst),
288 (ins Float64Regs:$a, f64imm:$b),
289 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
290 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
292 NVPTXInst<(outs Float32Regs:$dst),
293 (ins Float32Regs:$a, Float32Regs:$b),
294 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
295 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
296 Requires<[doF32FTZ]>;
298 NVPTXInst<(outs Float32Regs:$dst),
299 (ins Float32Regs:$a, f32imm:$b),
300 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
301 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
302 Requires<[doF32FTZ]>;
304 NVPTXInst<(outs Float32Regs:$dst),
305 (ins Float32Regs:$a, Float32Regs:$b),
306 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
307 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
309 NVPTXInst<(outs Float32Regs:$dst),
310 (ins Float32Regs:$a, f32imm:$b),
311 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
312 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
315 NVPTXInst<(outs Int16Regs:$dst),
316 (ins Int16Regs:$a, Int16Regs:$b),
317 !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
318 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
319 Requires<[useFP16Math, doF32FTZ]>;
321 NVPTXInst<(outs Int16Regs:$dst),
322 (ins Int16Regs:$a, Int16Regs:$b),
323 !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
324 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
325 Requires<[useFP16Math]>;
328 NVPTXInst<(outs Int32Regs:$dst),
329 (ins Int32Regs:$a, Int32Regs:$b),
330 !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
331 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
332 Requires<[useFP16Math, doF32FTZ]>;
334 NVPTXInst<(outs Int32Regs:$dst),
335 (ins Int32Regs:$a, Int32Regs:$b),
336 !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
337 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
338 Requires<[useFP16Math]>;
340 NVPTXInst<(outs Int16Regs:$dst),
341 (ins Int16Regs:$a, Int16Regs:$b),
342 !strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"),
343 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
344 Requires<[hasBF16Math, doF32FTZ]>;
346 NVPTXInst<(outs Int16Regs:$dst),
347 (ins Int16Regs:$a, Int16Regs:$b),
348 !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
349 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
350 Requires<[hasBF16Math]>;
353 NVPTXInst<(outs Int32Regs:$dst),
354 (ins Int32Regs:$a, Int32Regs:$b),
355 !strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"),
356 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
357 Requires<[hasBF16Math, doF32FTZ]>;
359 NVPTXInst<(outs Int32Regs:$dst),
360 (ins Int32Regs:$a, Int32Regs:$b),
361 !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
362 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
363 Requires<[hasBF16Math]>;
366 // Template for instructions which take three FP args. The
367 // instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
369 // Also defines ftz (flush subnormal inputs and results to sign-preserving
370 // zero) variants for fp32/fp16 functions.
372 // This multiclass should be used for nodes that can be folded to make fma ops.
373 // In this case, we use the ".rn" variant when FMA is disabled, as this behaves
374 // just like the non ".rn" op, but prevents ptxas from creating FMAs.
375 multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
377 NVPTXInst<(outs Float64Regs:$dst),
378 (ins Float64Regs:$a, Float64Regs:$b),
379 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
380 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
381 Requires<[allowFMA]>;
383 NVPTXInst<(outs Float64Regs:$dst),
384 (ins Float64Regs:$a, f64imm:$b),
385 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
386 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
387 Requires<[allowFMA]>;
389 NVPTXInst<(outs Float32Regs:$dst),
390 (ins Float32Regs:$a, Float32Regs:$b),
391 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
392 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
393 Requires<[allowFMA, doF32FTZ]>;
395 NVPTXInst<(outs Float32Regs:$dst),
396 (ins Float32Regs:$a, f32imm:$b),
397 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
398 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
399 Requires<[allowFMA, doF32FTZ]>;
401 NVPTXInst<(outs Float32Regs:$dst),
402 (ins Float32Regs:$a, Float32Regs:$b),
403 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
404 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
405 Requires<[allowFMA]>;
407 NVPTXInst<(outs Float32Regs:$dst),
408 (ins Float32Regs:$a, f32imm:$b),
409 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
410 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
411 Requires<[allowFMA]>;
414 NVPTXInst<(outs Int16Regs:$dst),
415 (ins Int16Regs:$a, Int16Regs:$b),
416 !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
417 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
418 Requires<[useFP16Math, allowFMA, doF32FTZ]>;
420 NVPTXInst<(outs Int16Regs:$dst),
421 (ins Int16Regs:$a, Int16Regs:$b),
422 !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
423 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
424 Requires<[useFP16Math, allowFMA]>;
427 NVPTXInst<(outs Int32Regs:$dst),
428 (ins Int32Regs:$a, Int32Regs:$b),
429 !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
430 [(set (v2f16 Int32Regs:$dst), (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
431 Requires<[useFP16Math, allowFMA, doF32FTZ]>;
433 NVPTXInst<(outs Int32Regs:$dst),
434 (ins Int32Regs:$a, Int32Regs:$b),
435 !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
436 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
437 Requires<[useFP16Math, allowFMA]>;
439 NVPTXInst<(outs Int16Regs:$dst),
440 (ins Int16Regs:$a, Int16Regs:$b),
441 !strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"),
442 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
443 Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
445 NVPTXInst<(outs Int16Regs:$dst),
446 (ins Int16Regs:$a, Int16Regs:$b),
447 !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
448 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
449 Requires<[hasBF16Math, allowFMA]>;
452 NVPTXInst<(outs Int32Regs:$dst),
453 (ins Int32Regs:$a, Int32Regs:$b),
454 !strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"),
455 [(set (v2bf16 Int32Regs:$dst), (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
456 Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
458 NVPTXInst<(outs Int32Regs:$dst),
459 (ins Int32Regs:$a, Int32Regs:$b),
460 !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
461 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
462 Requires<[hasBF16Math, allowFMA]>;
463 // These have strange names so we don't perturb existing mir tests.
465 NVPTXInst<(outs Float64Regs:$dst),
466 (ins Float64Regs:$a, Float64Regs:$b),
467 !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
468 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
471 NVPTXInst<(outs Float64Regs:$dst),
472 (ins Float64Regs:$a, f64imm:$b),
473 !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
474 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
477 NVPTXInst<(outs Float32Regs:$dst),
478 (ins Float32Regs:$a, Float32Regs:$b),
479 !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
480 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
481 Requires<[noFMA, doF32FTZ]>;
483 NVPTXInst<(outs Float32Regs:$dst),
484 (ins Float32Regs:$a, f32imm:$b),
485 !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
486 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
487 Requires<[noFMA, doF32FTZ]>;
489 NVPTXInst<(outs Float32Regs:$dst),
490 (ins Float32Regs:$a, Float32Regs:$b),
491 !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
492 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
495 NVPTXInst<(outs Float32Regs:$dst),
496 (ins Float32Regs:$a, f32imm:$b),
497 !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
498 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
501 NVPTXInst<(outs Int16Regs:$dst),
502 (ins Int16Regs:$a, Int16Regs:$b),
503 !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),
504 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
505 Requires<[useFP16Math, noFMA, doF32FTZ]>;
507 NVPTXInst<(outs Int16Regs:$dst),
508 (ins Int16Regs:$a, Int16Regs:$b),
509 !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
510 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
511 Requires<[useFP16Math, noFMA]>;
513 NVPTXInst<(outs Int32Regs:$dst),
514 (ins Int32Regs:$a, Int32Regs:$b),
515 !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
516 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
517 Requires<[useFP16Math, noFMA, doF32FTZ]>;
519 NVPTXInst<(outs Int32Regs:$dst),
520 (ins Int32Regs:$a, Int32Regs:$b),
521 !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
522 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
523 Requires<[useFP16Math, noFMA]>;
525 NVPTXInst<(outs Int16Regs:$dst),
526 (ins Int16Regs:$a, Int16Regs:$b),
527 !strconcat(OpcStr, ".rn.ftz.bf16 \t$dst, $a, $b;"),
528 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
529 Requires<[hasBF16Math, noFMA, doF32FTZ]>;
531 NVPTXInst<(outs Int16Regs:$dst),
532 (ins Int16Regs:$a, Int16Regs:$b),
533 !strconcat(OpcStr, ".rn.bf16 \t$dst, $a, $b;"),
534 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
535 Requires<[hasBF16Math, noFMA]>;
536 def _rnbf16x2rr_ftz :
537 NVPTXInst<(outs Int32Regs:$dst),
538 (ins Int32Regs:$a, Int32Regs:$b),
539 !strconcat(OpcStr, ".rn.ftz.bf16x2 \t$dst, $a, $b;"),
540 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
541 Requires<[hasBF16Math, noFMA, doF32FTZ]>;
543 NVPTXInst<(outs Int32Regs:$dst),
544 (ins Int32Regs:$a, Int32Regs:$b),
545 !strconcat(OpcStr, ".rn.bf16x2 \t$dst, $a, $b;"),
546 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
547 Requires<[hasBF16Math, noFMA]>;
550 // Template for operations which take two f32 or f64 operands. Provides three
551 // instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
552 // subnormal inputs and results to zero).
553 multiclass F2<string OpcStr, SDNode OpNode> {
554 def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
555 !strconcat(OpcStr, ".f64 \t$dst, $a;"),
556 [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
557 def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
558 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
559 [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
560 Requires<[doF32FTZ]>;
561 def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
562 !strconcat(OpcStr, ".f32 \t$dst, $a;"),
563 [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
566 multiclass F2_Support_Half<string OpcStr, SDNode OpNode> {
567 def bf16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
568 !strconcat(OpcStr, ".bf16 \t$dst, $a;"),
569 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a)))]>,
570 Requires<[hasSM<80>, hasPTX<70>]>;
571 def bf16x2 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
572 !strconcat(OpcStr, ".bf16x2 \t$dst, $a;"),
573 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a)))]>,
574 Requires<[hasSM<80>, hasPTX<70>]>;
575 def f16_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
576 !strconcat(OpcStr, ".ftz.f16 \t$dst, $a;"),
577 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>,
578 Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
579 def f16x2_ftz : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
580 !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a;"),
581 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>,
582 Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
583 def f16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
584 !strconcat(OpcStr, ".f16 \t$dst, $a;"),
585 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>,
586 Requires<[hasSM<53>, hasPTX<65>]>;
587 def f16x2 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
588 !strconcat(OpcStr, ".f16x2 \t$dst, $a;"),
589 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>,
590 Requires<[hasSM<53>, hasPTX<65>]>;
594 //===----------------------------------------------------------------------===//
595 // NVPTX Instructions.
596 //===----------------------------------------------------------------------===//
598 //-----------------------------------
600 //-----------------------------------
602 let hasSideEffects = false in {
603 // Generate a cvt to the given type from all possible types. Each instance
604 // takes a CvtMode immediate that defines the conversion mode to use. It can
605 // be CvtNONE to omit a conversion mode.
606 multiclass CVT_FROM_ALL<string ToType, RegisterClass RC, list<Predicate> Preds = []> {
608 NVPTXInst<(outs RC:$dst),
609 (ins Int16Regs:$src, CvtMode:$mode),
610 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
611 ToType, ".s8 \t$dst, $src;"), []>,
614 NVPTXInst<(outs RC:$dst),
615 (ins Int16Regs:$src, CvtMode:$mode),
616 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
617 ToType, ".u8 \t$dst, $src;"), []>,
620 NVPTXInst<(outs RC:$dst),
621 (ins Int16Regs:$src, CvtMode:$mode),
622 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
623 ToType, ".s16 \t$dst, $src;"), []>,
626 NVPTXInst<(outs RC:$dst),
627 (ins Int16Regs:$src, CvtMode:$mode),
628 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
629 ToType, ".u16 \t$dst, $src;"), []>,
632 NVPTXInst<(outs RC:$dst),
633 (ins Int32Regs:$src, CvtMode:$mode),
634 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
635 ToType, ".s32 \t$dst, $src;"), []>,
638 NVPTXInst<(outs RC:$dst),
639 (ins Int32Regs:$src, CvtMode:$mode),
640 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
641 ToType, ".u32 \t$dst, $src;"), []>,
644 NVPTXInst<(outs RC:$dst),
645 (ins Int64Regs:$src, CvtMode:$mode),
646 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
647 ToType, ".s64 \t$dst, $src;"), []>,
650 NVPTXInst<(outs RC:$dst),
651 (ins Int64Regs:$src, CvtMode:$mode),
652 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
653 ToType, ".u64 \t$dst, $src;"), []>,
656 NVPTXInst<(outs RC:$dst),
657 (ins Int16Regs:$src, CvtMode:$mode),
658 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
659 ToType, ".f16 \t$dst, $src;"), []>,
662 NVPTXInst<(outs RC:$dst),
663 (ins Int16Regs:$src, CvtMode:$mode),
664 !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
665 ToType, ".bf16 \t$dst, $src;"), []>,
666 Requires<!if(!eq(ToType, "f32"),
667 // bf16->f32 was introduced early.
668 [hasPTX<71>, hasSM<80>],
669 // bf16->everything else needs sm90/ptx78
670 [hasPTX<78>, hasSM<90>])>;
672 NVPTXInst<(outs RC:$dst),
673 (ins Float32Regs:$src, CvtMode:$mode),
674 !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
675 ToType, ".f32 \t$dst, $src;"), []>,
676 Requires<!if(!eq(ToType, "bf16"),
677 // f32->bf16 was introduced early.
678 [hasPTX<70>, hasSM<80>],
681 NVPTXInst<(outs RC:$dst),
682 (ins Float64Regs:$src, CvtMode:$mode),
683 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
684 ToType, ".f64 \t$dst, $src;"), []>,
688 // Generate cvts from all types to all types.
// Each defm instantiates CVT_FROM_ALL with one destination type string and the
// register class used for the result. The s8/u8 and f16/bf16 destinations use
// Int16Regs, the same class as s16/u16. Only CVT_bf16 passes an explicit
// predicate list (hasPTX<78>, hasSM<90>); the rest rely on the default [].
689 defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>;
690 defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
691 defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
692 defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
693 defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
694 defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
695 defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
696 defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
697 defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
698 defm CVT_bf16 : CVT_FROM_ALL<"bf16", Int16Regs, [hasPTX<78>, hasSM<90>]>;
699 defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
700 defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
702 // These cvts are different from those above: The source and dest registers
703 // are of the same type.
// Names follow CVT_INREG_<dst>_<src>, mirroring the "cvt.<dst>.<src>" assembly
// string. Unlike the CVT_FROM_ALL instances, these take no CvtMode operand.
// Every record has an empty pattern list ([]), so none is matched through a
// pattern attached to the instruction itself.
704 def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
705 "cvt.s16.s8 \t$dst, $src;", []>;
706 def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
707 "cvt.s32.s8 \t$dst, $src;", []>;
708 def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
709 "cvt.s32.s16 \t$dst, $src;", []>;
710 def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
711 "cvt.s64.s8 \t$dst, $src;", []>;
712 def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
713 "cvt.s64.s16 \t$dst, $src;", []>;
714 def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
715 "cvt.s64.s32 \t$dst, $src;", []>;
717 multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> {
719 NVPTXInst<(outs RC:$dst),
720 (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
721 !strconcat("cvt${mode:base}${mode:relu}.",
722 FromName, ".f32 \t$dst, $src1, $src2;"), []>,
723 Requires<[hasPTX<70>, hasSM<80>]>;
726 defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>;
727 defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
730 //-----------------------------------
731 // Selection instructions (selp)
732 //-----------------------------------
734 // TODO: Missing slct
736 // selp instructions that don't have any pattern matches; we explicitly use
737 // them within this file.
738 let hasSideEffects = false in {
739 multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
740 def rr : NVPTXInst<(outs RC:$dst),
741 (ins RC:$a, RC:$b, Int1Regs:$p),
742 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
743 def ri : NVPTXInst<(outs RC:$dst),
744 (ins RC:$a, ImmCls:$b, Int1Regs:$p),
745 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
746 def ir : NVPTXInst<(outs RC:$dst),
747 (ins ImmCls:$a, RC:$b, Int1Regs:$p),
748 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
749 def ii : NVPTXInst<(outs RC:$dst),
750 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
751 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
754 multiclass SELP_PATTERN<string TypeStr, ValueType T, RegisterClass RC,
755 Operand ImmCls, SDNode ImmNode> {
757 NVPTXInst<(outs RC:$dst),
758 (ins RC:$a, RC:$b, Int1Regs:$p),
759 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
760 [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T RC:$b)))]>;
762 NVPTXInst<(outs RC:$dst),
763 (ins RC:$a, ImmCls:$b, Int1Regs:$p),
764 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
765 [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T ImmNode:$b)))]>;
767 NVPTXInst<(outs RC:$dst),
768 (ins ImmCls:$a, RC:$b, Int1Regs:$p),
769 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
770 [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, (T RC:$b)))]>;
772 NVPTXInst<(outs RC:$dst),
773 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
774 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
775 [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
779 // Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
// good.
// The b16/b32/b64 and floating-point entries instantiate SELP_PATTERN, whose
// records carry a `select` pattern. The s/u entries instantiate plain SELP,
// whose records have empty pattern lists.
781 defm SELP_b16 : SELP_PATTERN<"b16", i16, Int16Regs, i16imm, imm>;
782 defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
783 defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
784 defm SELP_b32 : SELP_PATTERN<"b32", i32, Int32Regs, i32imm, imm>;
785 defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
786 defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
787 defm SELP_b64 : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>;
788 defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
789 defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
// f16 and bf16 reuse the "b16" type string on Int16Regs, with fpimm as the
// immediate node.
790 defm SELP_f16 : SELP_PATTERN<"b16", f16, Int16Regs, f16imm, fpimm>;
791 defm SELP_bf16 : SELP_PATTERN<"b16", bf16, Int16Regs, bf16imm, fpimm>;
793 defm SELP_f32 : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>;
794 defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
796 // This does not work as tablegen fails to infer the type of 'imm'.
797 // def v2f16imm : Operand<v2f16>;
798 // defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;
800 foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
801 def : Pat<(vt (select Int1Regs:$p, (vt Int32Regs:$a), (vt Int32Regs:$b))),
802 (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
805 //-----------------------------------
807 //-----------------------------------
809 def TESTINF_f32r : NVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a),
810 "testp.infinite.f32 \t$p, $a;",
812 def TESTINF_f32i : NVPTXInst<(outs Int1Regs:$p), (ins f32imm:$a),
813 "testp.infinite.f32 \t$p, $a;",
815 def TESTINF_f64r : NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
816 "testp.infinite.f64 \t$p, $a;",
818 def TESTINF_f64i : NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a),
819 "testp.infinite.f64 \t$p, $a;",
822 //-----------------------------------
823 // Integer Arithmetic
824 //-----------------------------------
826 // Template for xor masquerading as int1 arithmetic.
827 multiclass ADD_SUB_i1<SDNode OpNode> {
828 def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
829 "xor.pred \t$dst, $a, $b;",
830 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
831 def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
832 "xor.pred \t$dst, $a, $b;",
833 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
836 // int1 addition and subtraction are both just xor.
837 defm ADD_i1 : ADD_SUB_i1<add>;
838 defm SUB_i1 : ADD_SUB_i1<sub>;
840 // int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
841 // also use these for unsigned arithmetic.
842 defm ADD : I3<"add.s", add>;
843 defm SUB : I3<"sub.s", sub>;
// add.s through the I16x2 class.
// NOTE(review): I16x2 is defined outside this excerpt; presumably it is the
// packed two-lane 16-bit form -- confirm.
845 def ADD16x2 : I16x2<"add.s", add>;
847 // int32 and int64 addition and subtraction with carry-out.
848 defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
849 defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;
851 // int32 and int64 addition and subtraction with carry-in and carry-out.
852 defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
853 defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;
// Low half of the product: the generic mul node is selected to mul.lo.s.
855 defm MULT : I3<"mul.lo.s", mul>;
// High half of the product, signed (mulhs) and unsigned (mulhu).
857 defm MULTHS : I3<"mul.hi.s", mulhs>;
858 defm MULTHU : I3<"mul.hi.u", mulhu>;
// Signed and unsigned division.
860 defm SDIV : I3<"div.s", sdiv>;
861 defm UDIV : I3<"div.u", udiv>;
863 // The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
865 defm SREM : I3<"rem.s", srem>;
866 defm UREM : I3<"rem.u", urem>;
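868 // Integer absolute value, selected directly to the PTX abs instruction; SizeName
869 // is the type suffix appended to the mnemonic (e.g. ".s16").
870 // NOTE(review): an earlier wording of this comment described a NumBits parameter and the idiom at http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs; the multiclass below has neither.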
871 multiclass ABS<ValueType T, RegisterClass RC, string SizeName> {
872 def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
873 !strconcat("abs", SizeName, " \t$dst, $a;"),
874 [(set (T RC:$dst), (abs (T RC:$a)))]>;
// abs for each integer width; the third argument is the type suffix that ABS
// appends to the "abs" mnemonic.
876 defm ABS_16 : ABS<i16, Int16Regs, ".s16">;
877 defm ABS_32 : ABS<i32, Int32Regs, ".s32">;
878 defm ABS_64 : ABS<i64, Int64Regs, ".s64">;
// Signed and unsigned integer max/min.
881 defm SMAX : I3<"max.s", smax>;
882 defm UMAX : I3<"max.u", umax>;
883 defm SMIN : I3<"min.s", smin>;
884 defm UMIN : I3<"min.u", umin>;
// The same four operations through the I16x2 class.
// NOTE(review): I16x2 is defined outside this excerpt; presumably the packed
// two-lane 16-bit form -- confirm.
886 def SMAX16x2 : I16x2<"max.s", smax>;
887 def UMAX16x2 : I16x2<"max.u", umax>;
888 def SMIN16x2 : I16x2<"min.s", smin>;
889 def UMIN16x2 : I16x2<"min.u", umin>;
893 // Wide multiplication
896 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
897 "mul.wide.s32 \t$dst, $a, $b;", []>;
899 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
900 "mul.wide.s32 \t$dst, $a, $b;", []>;
// mul.wide.s32 whose immediate operand is declared i64imm instead of i32imm.
// It carries no selection pattern of its own ([]); it is chosen by the
// (mul (sext ...), (i64 SInt32Const)) pattern, which hands over an i64 constant.
901 def MULWIDES64Imm64 :
902 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
903 "mul.wide.s32 \t$dst, $a, $b;", []>;
906 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
907 "mul.wide.u32 \t$dst, $a, $b;", []>;
909 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
910 "mul.wide.u32 \t$dst, $a, $b;", []>;
// mul.wide.u32 whose immediate operand is declared i64imm instead of i32imm.
// It carries no selection pattern of its own ([]); it is chosen by the
// (mul (zext ...), (i64 UInt32Const)) pattern, which hands over an i64 constant.
911 def MULWIDEU64Imm64 :
912 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
913 "mul.wide.u32 \t$dst, $a, $b;", []>;
916 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
917 "mul.wide.s16 \t$dst, $a, $b;", []>;
919 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
920 "mul.wide.s16 \t$dst, $a, $b;", []>;
// mul.wide.s16 whose immediate operand is declared i32imm instead of i16imm.
// It carries no selection pattern of its own ([]); it is chosen by the
// (mul (sext ...), (i32 SInt16Const)) pattern, which hands over an i32 constant.
921 def MULWIDES32Imm32 :
922 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
923 "mul.wide.s16 \t$dst, $a, $b;", []>;
926 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
927 "mul.wide.u16 \t$dst, $a, $b;", []>;
929 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
930 "mul.wide.u16 \t$dst, $a, $b;", []>;
// mul.wide.u16 whose immediate operand is declared i32imm instead of i16imm.
// It carries no selection pattern of its own ([]); it is chosen by the
// (mul (zext ...), (i32 UInt16Const)) pattern, which hands over an i32 constant.
931 def MULWIDEU32Imm32 :
932 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
933 "mul.wide.u16 \t$dst, $a, $b;", []>;
// Type profile shared by the mul.wide nodes: one result and two operands,
// with the two operands constrained to the same type. The result type is not
// constrained by this profile.
935 def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
// Target-specific DAG nodes for signed and unsigned widening multiply.
936 def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
937 def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
939 // Matchers for signed, unsigned mul.wide ISD nodes.
// Every pattern here is gated on doMulWide. Each node gets a
// register/register form and a register/immediate form, for 16->32 and for
// 32->64 bit widening. Result operands name the register class throughout.
// 16-bit inputs, 32-bit result.
940 def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
941 (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
942 Requires<[doMulWide]>;
943 def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
944 (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
945 Requires<[doMulWide]>;
946 def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
947 (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
948 Requires<[doMulWide]>;
949 def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
950 (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
951 Requires<[doMulWide]>;
// 32-bit inputs, 64-bit result.
953 def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
954 (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
955 Requires<[doMulWide]>;
956 def : Pat<(i64 (mul_wide_signed (i32 Int32Regs:$a), imm:$b)),
957 (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
958 Requires<[doMulWide]>;
959 def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
960 (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
961 Requires<[doMulWide]>;
962 def : Pat<(i64 (mul_wide_unsigned (i32 Int32Regs:$a), imm:$b)),
963 (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
964 Requires<[doMulWide]>;
966 // Predicates used for converting some patterns to mul.wide.
967 def SInt32Const : PatLeaf<(imm), [{
968 const APInt &v = N->getAPIntValue();
969 return v.isSignedIntN(32);
972 def UInt32Const : PatLeaf<(imm), [{
973 const APInt &v = N->getAPIntValue();
977 def SInt16Const : PatLeaf<(imm), [{
978 const APInt &v = N->getAPIntValue();
979 return v.isSignedIntN(16);
982 def UInt16Const : PatLeaf<(imm), [{
983 const APInt &v = N->getAPIntValue();
987 def IntConst_0_30 : PatLeaf<(imm), [{
988 // Check if 0 <= v < 31; only then will the result of (x << v) be an int32.
989 const APInt &v = N->getAPIntValue();
990 return v.sge(0) && v.slt(31);
993 def IntConst_0_14 : PatLeaf<(imm), [{
994 // Check if 0 <= v < 15; only then will the result of (x << v) be an int16.
995 const APInt &v = N->getAPIntValue();
996 return v.sge(0) && v.slt(15);
999 def SHL2MUL32 : SDNodeXForm<imm, [{
1000 const APInt &v = N->getAPIntValue();
1002 return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
1005 def SHL2MUL16 : SDNodeXForm<imm, [{
1006 const APInt &v = N->getAPIntValue();
1008 return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
1011 // Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
// The shift amount must satisfy IntConst_0_30 (32-bit source) or
// IntConst_0_14 (16-bit source) and is rewritten by SHL2MUL32 / SHL2MUL16
// into the immediate multiplier operand.
// NOTE(review): the SHL2MUL* bodies are only partly visible in this excerpt;
// they appear to return the constant 1 << amt -- confirm.
// 32-bit source, 64-bit result.
1012 def : Pat<(shl (sext Int32Regs:$a), (i32 IntConst_0_30:$b)),
1013 (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
1014 Requires<[doMulWide]>;
1015 def : Pat<(shl (zext Int32Regs:$a), (i32 IntConst_0_30:$b)),
1016 (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
1017 Requires<[doMulWide]>;
// 16-bit source, 32-bit result.
1019 def : Pat<(shl (sext Int16Regs:$a), (i16 IntConst_0_14:$b)),
1020 (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
1021 Requires<[doMulWide]>;
1022 def : Pat<(shl (zext Int16Regs:$a), (i16 IntConst_0_14:$b)),
1023 (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
1024 Requires<[doMulWide]>;
1026 // Convert "sign/zero-extend then multiply" to mul.wide.
// Register/register forms require both factors to be extended the same way.
// Register/constant forms require the wide constant to pass the matching
// range leaf (SInt32Const, UInt32Const, SInt16Const, UInt16Const) and select
// the Imm64/Imm32 instruction variants, whose immediate operand has the wide
// type. All patterns are gated on doMulWide.
// Signed, 32-bit factors.
1027 def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
1028 (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
1029 Requires<[doMulWide]>;
1030 def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
1031 (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
1032 Requires<[doMulWide]>;
// Unsigned, 32-bit factors.
1034 def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
1035 (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
1036 Requires<[doMulWide]>;
1037 def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
1038 (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
1039 Requires<[doMulWide]>;
// Signed, 16-bit factors.
1041 def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
1042 (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
1043 Requires<[doMulWide]>;
1044 def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
1045 (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
1046 Requires<[doMulWide]>;
// Unsigned, 16-bit factors.
1048 def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
1049 (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
1050 Requires<[doMulWide]>;
1051 def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
1052 (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
1053 Requires<[doMulWide]>;
1056 // Integer multiply-add
1059 SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
1060 SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
1061 def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
1064 NVPTXInst<(outs Int16Regs:$dst),
1065 (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
1066 "mad.lo.s16 \t$dst, $a, $b, $c;",
1067 [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
1069 NVPTXInst<(outs Int16Regs:$dst),
1070 (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
1071 "mad.lo.s16 \t$dst, $a, $b, $c;",
1072 [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
1074 NVPTXInst<(outs Int16Regs:$dst),
1075 (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
1076 "mad.lo.s16 \t$dst, $a, $b, $c;",
1077 [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
1079 NVPTXInst<(outs Int16Regs:$dst),
1080 (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
1081 "mad.lo.s16 \t$dst, $a, $b, $c;",
1082 [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
1085 NVPTXInst<(outs Int32Regs:$dst),
1086 (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
1087 "mad.lo.s32 \t$dst, $a, $b, $c;",
1088 [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>;
1090 NVPTXInst<(outs Int32Regs:$dst),
1091 (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
1092 "mad.lo.s32 \t$dst, $a, $b, $c;",
1093 [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), imm:$c))]>;
1095 NVPTXInst<(outs Int32Regs:$dst),
1096 (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
1097 "mad.lo.s32 \t$dst, $a, $b, $c;",
1098 [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), imm:$b, (i32 Int32Regs:$c)))]>;
1100 NVPTXInst<(outs Int32Regs:$dst),
1101 (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
1102 "mad.lo.s32 \t$dst, $a, $b, $c;",
1103 [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), imm:$b, imm:$c))]>;
1106 NVPTXInst<(outs Int64Regs:$dst),
1107 (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
1108 "mad.lo.s64 \t$dst, $a, $b, $c;",
1109 [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
1111 NVPTXInst<(outs Int64Regs:$dst),
1112 (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
1113 "mad.lo.s64 \t$dst, $a, $b, $c;",
1114 [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
1116 NVPTXInst<(outs Int64Regs:$dst),
1117 (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
1118 "mad.lo.s64 \t$dst, $a, $b, $c;",
1119 [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
1121 NVPTXInst<(outs Int64Regs:$dst),
1122 (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
1123 "mad.lo.s64 \t$dst, $a, $b, $c;",
1124 [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
1127 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
1128 "neg.s16 \t$dst, $src;",
1129 [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
1131 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
1132 "neg.s32 \t$dst, $src;",
1133 [(set (i32 Int32Regs:$dst), (ineg (i32 Int32Regs:$src)))]>;
1135 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
1136 "neg.s64 \t$dst, $src;",
1137 [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
1139 //-----------------------------------
1140 // Floating Point Arithmetic
1141 //-----------------------------------
1144 def FloatConst1 : PatLeaf<(fpimm), [{
1145 return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
1146 N->getValueAPF().convertToFloat() == 1.0f;
1148 // Constant 1.0 (double)
1149 def DoubleConst1 : PatLeaf<(fpimm), [{
1150 return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
1151 N->getValueAPF().convertToDouble() == 1.0;
1153 // Constant -1.0 (double)
1154 def DoubleConstNeg1 : PatLeaf<(fpimm), [{
1155 return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
1156 N->getValueAPF().convertToDouble() == -1.0;
1160 // Constant -X -> X (double)
1161 def NegDoubleConst : SDNodeXForm<fpimm, [{
1162 return CurDAG->getTargetConstantFP(-(N->getValueAPF()),
1163 SDLoc(N), MVT::f64);
1166 // Loads FP16 constant into a register.
1168 // ptxas does not have hex representation for fp16, so we can't use
1169 // fp16 immediate values in .f16 instructions. Instead we have to load
1170 // the constant into a register using mov.b16.
// Neither record carries a selection pattern ([]), so they are not matched
// automatically from these definitions.
1171 def LOAD_CONST_F16 :
1172 NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a),
1173 "mov.b16 \t$dst, $a;", []>;
// Same mov.b16 form, taking a bf16 immediate operand.
1174 def LOAD_CONST_BF16 :
1175 NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a),
1176 "mov.b16 \t$dst, $a;", []>;
// Floating-point add/sub/mul are instantiated through F3_fma_component; the
// other binary operations use F3 and the unary ones use F2 / F2_Support_Half.
// NOTE(review): all four multiclasses are defined outside this excerpt;
// confirm which types and ftz variants each one expands to.
1177 defm FADD : F3_fma_component<"add", fadd>;
1178 defm FSUB : F3_fma_component<"sub", fsub>;
1179 defm FMUL : F3_fma_component<"mul", fmul>;
// fminnum/fmaxnum map to min/max.
1181 defm FMIN : F3<"min", fminnum>;
1182 defm FMAX : F3<"max", fmaxnum>;
// fminimum/fmaximum map to the .NaN forms.
1183 // Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
1184 defm FMINNAN : F3<"min.NaN", fminimum>;
1185 defm FMAXNAN : F3<"max.NaN", fmaximum>;
// Unary abs/neg, plus separate half-precision instantiations.
1187 defm FABS : F2<"abs", fabs>;
1188 defm FNEG : F2<"neg", fneg>;
1189 defm FABS_H: F2_Support_Half<"abs", fabs>;
1190 defm FNEG_H: F2_Support_Half<"neg", fneg>;
// Square root with the .rn rounding modifier.
1192 defm FSQRT : F2<"sqrt.rn", fsqrt>;
// fneg for f16 and packed v2f16 values.
//   OpcStr - full mnemonic, printed before the "$dst, $src" operands.
//   T      - value type matched by the fneg pattern.
//   RC     - register class of both source and destination.
//   Pred   - extra predicate appended to the fixed requirements
//            (useFP16Math, hasPTX<60>, hasSM<53>).
1197 class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
1198 NVPTXInst<(outs RC:$dst), (ins RC:$src),
1199 !strconcat(OpcStr, " \t$dst, $src;"),
1200 [(set RC:$dst, (fneg (T RC:$src)))]>,
1201 Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>;
// Scalar f16 in Int16Regs and v2f16 in Int32Regs; the _ftz records add
// doF32FTZ, the plain records pass True.
1202 def FNEG16_ftz : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>;
1203 def FNEG16 : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>;
1204 def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
1205 def FNEG16x2 : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>;
// fneg for bf16 and packed v2bf16 values.
//   OpcStr - full mnemonic, printed before the "$dst, $src" operands.
//   T      - value type matched by the fneg pattern.
//   RC     - register class of both source and destination.
//   Pred   - extra predicate appended to the fixed requirements
//            (hasBF16Math, hasPTX<70>, hasSM<80>).
1211 class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
1212 NVPTXInst<(outs RC:$dst), (ins RC:$src),
1213 !strconcat(OpcStr, " \t$dst, $src;"),
1214 [(set RC:$dst, (fneg (T RC:$src)))]>,
1215 Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, Pred]>;
// Scalar bf16 in Int16Regs and v2bf16 in Int32Regs; the _ftz records add
// doF32FTZ, the plain records pass True.
1216 def BFNEG16_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
1217 def BFNEG16 : FNEG_BF16_F16X2<"neg.bf16", bf16, Int16Regs, True>;
1218 def BFNEG16x2_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
1219 def BFNEG16x2 : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>;
1225 NVPTXInst<(outs Float64Regs:$dst),
1226 (ins f64imm:$a, Float64Regs:$b),
1227 "rcp.rn.f64 \t$dst, $b;",
1228 [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
1230 NVPTXInst<(outs Float64Regs:$dst),
1231 (ins Float64Regs:$a, Float64Regs:$b),
1232 "div.rn.f64 \t$dst, $a, $b;",
1233 [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
1235 NVPTXInst<(outs Float64Regs:$dst),
1236 (ins Float64Regs:$a, f64imm:$b),
1237 "div.rn.f64 \t$dst, $a, $b;",
1238 [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
1240 // fdiv will be converted to rcp
1241 // fneg (fdiv 1.0, X) => fneg (rcp.rn X)
1242 def : Pat<(fdiv DoubleConstNeg1:$a, Float64Regs:$b),
1243 (FNEGf64 (FDIV641r (NegDoubleConst node:$a), Float64Regs:$b))>;
1246 // F32 Approximate reciprocal
1249 NVPTXInst<(outs Float32Regs:$dst),
1250 (ins f32imm:$a, Float32Regs:$b),
1251 "rcp.approx.ftz.f32 \t$dst, $b;",
1252 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1253 Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1255 NVPTXInst<(outs Float32Regs:$dst),
1256 (ins f32imm:$a, Float32Regs:$b),
1257 "rcp.approx.f32 \t$dst, $b;",
1258 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1259 Requires<[do_DIVF32_APPROX]>;
1261 // F32 Approximate division
// Four records: {register, immediate} divisor x {ftz, non-ftz}. All require
// do_DIVF32_APPROX; the _ftz forms additionally require doF32FTZ and print
// div.approx.ftz.f32 instead of div.approx.f32.
1263 def FDIV32approxrr_ftz :
1264 NVPTXInst<(outs Float32Regs:$dst),
1265 (ins Float32Regs:$a, Float32Regs:$b),
1266 "div.approx.ftz.f32 \t$dst, $a, $b;",
1267 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1268 Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1269 def FDIV32approxri_ftz :
1270 NVPTXInst<(outs Float32Regs:$dst),
1271 (ins Float32Regs:$a, f32imm:$b),
1272 "div.approx.ftz.f32 \t$dst, $a, $b;",
1273 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1274 Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1275 def FDIV32approxrr :
1276 NVPTXInst<(outs Float32Regs:$dst),
1277 (ins Float32Regs:$a, Float32Regs:$b),
1278 "div.approx.f32 \t$dst, $a, $b;",
1279 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1280 Requires<[do_DIVF32_APPROX]>;
1281 def FDIV32approxri :
1282 NVPTXInst<(outs Float32Regs:$dst),
1283 (ins Float32Regs:$a, f32imm:$b),
1284 "div.approx.f32 \t$dst, $a, $b;",
1285 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1286 Requires<[do_DIVF32_APPROX]>;
1288 // F32 Semi-accurate reciprocal
1290 // rcp.approx gives the same result as div.full(1.0f, a) and is faster.
// Both records match (fdiv 1.0f, $b) under do_DIVF32_FULL. The $a operand only
// carries the matched FloatConst1 constant and is not printed in the assembly.
1292 def FDIV321r_approx_ftz :
1293 NVPTXInst<(outs Float32Regs:$dst),
1294 (ins f32imm:$a, Float32Regs:$b),
1295 "rcp.approx.ftz.f32 \t$dst, $b;",
1296 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1297 Requires<[do_DIVF32_FULL, doF32FTZ]>;
1298 def FDIV321r_approx :
1299 NVPTXInst<(outs Float32Regs:$dst),
1300 (ins f32imm:$a, Float32Regs:$b),
1301 "rcp.approx.f32 \t$dst, $b;",
1302 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1303 Requires<[do_DIVF32_FULL]>;
1305 // F32 Semi-accurate division
1308 NVPTXInst<(outs Float32Regs:$dst),
1309 (ins Float32Regs:$a, Float32Regs:$b),
1310 "div.full.ftz.f32 \t$dst, $a, $b;",
1311 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1312 Requires<[do_DIVF32_FULL, doF32FTZ]>;
1314 NVPTXInst<(outs Float32Regs:$dst),
1315 (ins Float32Regs:$a, f32imm:$b),
1316 "div.full.ftz.f32 \t$dst, $a, $b;",
1317 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1318 Requires<[do_DIVF32_FULL, doF32FTZ]>;
1320 NVPTXInst<(outs Float32Regs:$dst),
1321 (ins Float32Regs:$a, Float32Regs:$b),
1322 "div.full.f32 \t$dst, $a, $b;",
1323 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1324 Requires<[do_DIVF32_FULL]>;
1326 NVPTXInst<(outs Float32Regs:$dst),
1327 (ins Float32Regs:$a, f32imm:$b),
1328 "div.full.f32 \t$dst, $a, $b;",
1329 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1330 Requires<[do_DIVF32_FULL]>;
1332 // F32 Accurate reciprocal
// Matches (fdiv 1.0f, $b) when doF32FTZ holds and prints rcp.rn.ftz.f32. The
// $a operand only carries the matched FloatConst1 constant and is not printed.
1334 def FDIV321r_prec_ftz :
1335 NVPTXInst<(outs Float32Regs:$dst),
1336 (ins f32imm:$a, Float32Regs:$b),
1337 "rcp.rn.ftz.f32 \t$dst, $b;",
1338 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1339 Requires<[doF32FTZ]>;
1341 NVPTXInst<(outs Float32Regs:$dst),
1342 (ins f32imm:$a, Float32Regs:$b),
1343 "rcp.rn.f32 \t$dst, $b;",
1344 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
1346 // F32 Accurate division
// div.rn.ftz.f32 with a register or an immediate divisor. These forms are
// gated only on doF32FTZ; no do_DIVF32_* predicate is listed.
1348 def FDIV32rr_prec_ftz :
1349 NVPTXInst<(outs Float32Regs:$dst),
1350 (ins Float32Regs:$a, Float32Regs:$b),
1351 "div.rn.ftz.f32 \t$dst, $a, $b;",
1352 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1353 Requires<[doF32FTZ]>;
1354 def FDIV32ri_prec_ftz :
1355 NVPTXInst<(outs Float32Regs:$dst),
1356 (ins Float32Regs:$a, f32imm:$b),
1357 "div.rn.ftz.f32 \t$dst, $a, $b;",
1358 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1359 Requires<[doF32FTZ]>;
1361 NVPTXInst<(outs Float32Regs:$dst),
1362 (ins Float32Regs:$a, Float32Regs:$b),
1363 "div.rn.f32 \t$dst, $a, $b;",
1364 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
1366 NVPTXInst<(outs Float32Regs:$dst),
1367 (ins Float32Regs:$a, f32imm:$b),
1368 "div.rn.f32 \t$dst, $a, $b;",
1369 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
1375 multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
1376 def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
1377 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1378 [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
1380 def rri : NVPTXInst<(outs RC:$dst),
1381 (ins RC:$a, RC:$b, ImmCls:$c),
1382 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1383 [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
1385 def rir : NVPTXInst<(outs RC:$dst),
1386 (ins RC:$a, ImmCls:$b, RC:$c),
1387 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1388 [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
1390 def rii : NVPTXInst<(outs RC:$dst),
1391 (ins RC:$a, ImmCls:$b, ImmCls:$c),
1392 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1393 [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
1397 multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
1398 def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
1399 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1400 [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
1401 Requires<[useFP16Math, Pred]>;
1404 multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
1405 def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
1406 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1407 [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
1408 Requires<[hasBF16Math, Pred]>;
// Fused multiply-add instantiations, all using the .rn rounding modifier.
// f16/bf16 (scalar and packed x2) go through FMA_F16 / FMA_BF16, which define
// only the all-register rrr form. The _ftz records pass doF32FTZ as the extra
// predicate; the others pass True.
1411 defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>;
1412 defm FMA16 : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>;
1413 defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
1414 defm FMA16x2 : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>;
1415 defm BFMA16_ftz : FMA_BF16<"fma.rn.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
1416 defm BFMA16 : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>;
1417 defm BFMA16x2_ftz : FMA_BF16<"fma.rn.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
1418 defm BFMA16x2 : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>;
// f32/f64 go through FMA, which also defines the immediate-operand forms
// (rri, rir, rii). Only f32 has an ftz instantiation.
1419 defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
1420 defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
1421 defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;
// f32 sine and cosine via the .approx instructions. Both are selected only
// when allowUnsafeFPMath holds.
1424 def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
1425 "sin.approx.f32 \t$dst, $src;",
1426 [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
1427 Requires<[allowUnsafeFPMath]>;
1428 def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
1429 "cos.approx.f32 \t$dst, $src;",
1430 [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
1431 Requires<[allowUnsafeFPMath]>;
1433 // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
1434 // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches the
1435 // semantics of LLVM's frem.
1438 def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
1439 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
1440 (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
1442 Requires<[doF32FTZ, allowUnsafeFPMath]>;
1443 def : Pat<(frem Float32Regs:$x, fpimm:$y),
1444 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
1445 (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
1447 Requires<[doF32FTZ, allowUnsafeFPMath]>;
1449 def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
1450 (SELP_f32rr Float32Regs:$x,
1451 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
1452 (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
1454 (TESTINF_f32r Float32Regs:$y))>,
1455 Requires<[doF32FTZ, noUnsafeFPMath]>;
1456 def : Pat<(frem Float32Regs:$x, fpimm:$y),
1457 (SELP_f32rr Float32Regs:$x,
1458 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
1459 (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
1461 (TESTINF_f32i fpimm:$y))>,
1462 Requires<[doF32FTZ, noUnsafeFPMath]>;
1465 def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
1466 (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
1467 (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
1469 Requires<[allowUnsafeFPMath]>;
1470 def : Pat<(frem Float32Regs:$x, fpimm:$y),
1471 (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
1472 (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
1474 Requires<[allowUnsafeFPMath]>;
1476 def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
1477 (SELP_f32rr Float32Regs:$x,
1478 (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
1479 (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
1481 (TESTINF_f32r Float32Regs:$y))>,
1482 Requires<[noUnsafeFPMath]>;
1483 def : Pat<(frem Float32Regs:$x, fpimm:$y),
1484 (SELP_f32rr Float32Regs:$x,
1485 (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
1486 (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
1488 (TESTINF_f32i fpimm:$y))>,
1489 Requires<[noUnsafeFPMath]>;
1492 def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
1493 (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
1494 (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
1496 Requires<[allowUnsafeFPMath]>;
1497 def : Pat<(frem Float64Regs:$x, fpimm:$y),
1498 (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
1499 (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
1501 Requires<[allowUnsafeFPMath]>;
1503 def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
1504 (SELP_f64rr Float64Regs:$x,
1505 (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
1506 (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
1508 (TESTINF_f64r Float64Regs:$y))>,
1509 Requires<[noUnsafeFPMath]>;
// frem f64 with an immediate divisor, noUnsafeFPMath variant: computes
// x - trunc(x / y) * y and selects $x instead when the divisor tests as
// infinite. $y is bound as fpimm in the source pattern, so the infinity test
// uses the immediate form TESTINF_f64i, mirroring the f32 patterns, which use
// TESTINF_f32i for their immediate divisor.
1510 def : Pat<(frem Float64Regs:$x, fpimm:$y),
1511 (SELP_f64rr Float64Regs:$x,
1512 (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
1513 (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
1515 (TESTINF_f64i fpimm:$y))>,
1516 Requires<[noUnsafeFPMath]>;
1518 //-----------------------------------
1519 // Bitwise operations
1520 //-----------------------------------
1522 // Template for three-arg bitwise operations. Takes three args and creates .b16,
1523 // .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
1524 multiclass BITWISE<string OpcStr, SDNode OpNode> {
1526 NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
1527 !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
1528 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
1530 NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
1531 !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
1532 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
1534 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
1535 !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
1536 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
1538 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
1539 !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
1540 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
1542 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
1543 !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
1544 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
1546 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
1547 !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
1548 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;
1550 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
1551 !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
1552 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
1554 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
1555 !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
1556 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
// Instantiate the BITWISE template for or, and, xor.
1559 defm OR : BITWISE<"or", or>;
1560 defm AND : BITWISE<"and", and>;
1561 defm XOR : BITWISE<"xor", xor>;
1563 // PTX does not support mul on predicates; convert to and instructions.
1564 def : Pat<(mul Int1Regs:$a, Int1Regs:$b), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
1565 def : Pat<(mul Int1Regs:$a, (i1 imm:$b)), (ANDb1ri Int1Regs:$a, imm:$b)>;
1567 // These transformations were once reliably performed by instcombine, but thanks
1568 // to poison semantics they are no longer safe for LLVM IR, so perform them here:
//   select a, b, 0  ->  and a, b
//   select a, 1, b  ->  or  a, b
1570 def : Pat<(select Int1Regs:$a, Int1Regs:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
1571 def : Pat<(select Int1Regs:$a, 1, Int1Regs:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>;
1573 // Lower logical v2i16/v4i8 ops as bitwise ops on b32.
1574 foreach vt = [v2i16, v4i8] in {
1575 def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
1576 (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
1577 def: Pat<(xor (vt Int32Regs:$a), (vt Int32Regs:$b)),
1578 (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
1579 def: Pat<(and (vt Int32Regs:$a), (vt Int32Regs:$b)),
1580 (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;
1582 // The constants get legalized into a bitcast from i32, so that's what we need
1584 def: Pat<(or Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
1585 (ORb32ri Int32Regs:$a, imm:$b)>;
1586 def: Pat<(xor Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
1587 (XORb32ri Int32Regs:$a, imm:$b)>;
1588 def: Pat<(and Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
1589 (ANDb32ri Int32Regs:$a, imm:$b)>;
// Bitwise NOT for predicates (not.pred) and for 16/32/64-bit integers
// (not.b16 / not.b32 / not.b64).
1592 def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
1593 "not.pred \t$dst, $src;",
1594 [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
1595 def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
1596 "not.b16 \t$dst, $src;",
1597 [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
// Only the 32-bit pattern spells out the i32 value type.
// NOTE(review): presumably because Int32Regs also holds the packed vector
// types used elsewhere in this file (v2f16, v2bf16, v2i16, v4i8) -- confirm.
1598 def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
1599 "not.b32 \t$dst, $src;",
1600 [(set (i32 Int32Regs:$dst), (not (i32 Int32Regs:$src)))]>;
1601 def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
1602 "not.b64 \t$dst, $src;",
1603 [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
1605 // Template for left/right shifts. Takes three operands,
1606 // [dest (reg), src (reg), shift (reg or imm)].
1607 // dest and src may be int64, int32, or int16, but shift is always int32.
1609 // This template also defines a 32-bit shift (imm, imm) instruction.
1610 multiclass SHIFT<string OpcStr, SDNode OpNode> {
1612 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
1613 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
1614 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 Int32Regs:$b)))]>;
1616 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
1617 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
1618 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
1620 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
1621 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1622 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
1624 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
1625 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1626 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 imm:$b)))]>;
1628 NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
1629 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1630 [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
1632 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
1633 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
1634 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 Int32Regs:$b)))]>;
1636 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
1637 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
1638 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
// Left shift, arithmetic right shift and logical right shift. SHIFT completes
// each opcode prefix with the operand width ("16", "32" or "64").
1641 defm SHL : SHIFT<"shl.b", shl>;
1642 defm SRA : SHIFT<"shr.s", sra>;
1643 defm SRL : SHIFT<"shr.u", srl>;
1647 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
1648 "brev.b32 \t$dst, $a;",
1649 [(set Int32Regs:$dst, (bitreverse (i32 Int32Regs:$a)))]>;
1651 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
1652 "brev.b64 \t$dst, $a;",
1653 [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
1656 // Rotate: Use ptx shf instruction if available.
1659 // 32 bit r2 = rotl r1, n
1661 // r2 = shf.l r1, r1, n
1663 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
1664 "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
1665 [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
1666 Requires<[hasHWROT32]>;
1669 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1670 "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
1671 [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
1672 Requires<[hasHWROT32]>;
1674 // 32 bit r2 = rotr r1, n
1676 // r2 = shf.r r1, r1, n
1678 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
1679 "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
1680 [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
1681 Requires<[hasHWROT32]>;
1684 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1685 "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
1686 [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
1687 Requires<[hasHWROT32]>;
1689 // 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1.
1691 NVPTXInst<(outs Int32Regs:$dst),
1692 (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
1694 ".reg .b32 %lhs;\n\t"
1695 ".reg .b32 %rhs;\n\t"
1696 "shl.b32 \t%lhs, $src, $amt1;\n\t"
1697 "shr.b32 \t%rhs, $src, $amt2;\n\t"
1698 "add.u32 \t$dst, %lhs, %rhs;\n\t"
1702 def SUB_FRM_32 : SDNodeXForm<imm, [{
1703 return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
// 32-bit rotate by immediate, gated on the noHWROT32 predicate. Both
// directions reuse ROT32imm_sw: rotl passes ($amt, 32 - $amt) as the
// (left-shift, right-shift) amounts, rotr passes the swapped pair.
// SUB_FRM_32 computes the complementary amount at selection time.
1706 def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)),
1707 (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
1708 Requires<[noHWROT32]>;
1709 def : Pat<(rotr (i32 Int32Regs:$src), (i32 imm:$amt)),
1710 (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
1711 Requires<[noHWROT32]>;
1713 // 32-bit software rotate left by register.
1715 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1717 ".reg .b32 %lhs;\n\t"
1718 ".reg .b32 %rhs;\n\t"
1719 ".reg .b32 %amt2;\n\t"
1720 "shl.b32 \t%lhs, $src, $amt;\n\t"
1721 "sub.s32 \t%amt2, 32, $amt;\n\t"
1722 "shr.b32 \t%rhs, $src, %amt2;\n\t"
1723 "add.u32 \t$dst, %lhs, %rhs;\n\t"
1725 [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
1726 Requires<[noHWROT32]>;
1728 // 32-bit software rotate right by register.
1730 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1732 ".reg .b32 %lhs;\n\t"
1733 ".reg .b32 %rhs;\n\t"
1734 ".reg .b32 %amt2;\n\t"
1735 "shr.b32 \t%lhs, $src, $amt;\n\t"
1736 "sub.s32 \t%amt2, 32, $amt;\n\t"
1737 "shl.b32 \t%rhs, $src, %amt2;\n\t"
1738 "add.u32 \t$dst, %lhs, %rhs;\n\t"
1740 [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
1741 Requires<[noHWROT32]>;
1743 // 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1.
1745 NVPTXInst<(outs Int64Regs:$dst),
1746 (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
1748 ".reg .b64 %lhs;\n\t"
1749 ".reg .b64 %rhs;\n\t"
1750 "shl.b64 \t%lhs, $src, $amt1;\n\t"
1751 "shr.b64 \t%rhs, $src, $amt2;\n\t"
1752 "add.u64 \t$dst, %lhs, %rhs;\n\t"
1756 def SUB_FRM_64 : SDNodeXForm<imm, [{
1757 return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
// 64-bit rotate by immediate. Both directions reuse ROT64imm_sw: rotl passes
// ($amt, 64 - $amt), rotr passes the swapped pair. Unlike the 32-bit
// patterns, these carry no Requires<> predicate.
1760 def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
1761 (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
1762 def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
1763 (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
1765 // 64-bit software rotate left by register.
1767 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
1769 ".reg .b64 %lhs;\n\t"
1770 ".reg .b64 %rhs;\n\t"
1771 ".reg .u32 %amt2;\n\t"
1772 "and.b32 \t%amt2, $amt, 63;\n\t"
1773 "shl.b64 \t%lhs, $src, %amt2;\n\t"
1774 "sub.u32 \t%amt2, 64, %amt2;\n\t"
1775 "shr.b64 \t%rhs, $src, %amt2;\n\t"
1776 "add.u64 \t$dst, %lhs, %rhs;\n\t"
1778 [(set Int64Regs:$dst, (rotl Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
1781 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
1783 ".reg .b64 %lhs;\n\t"
1784 ".reg .b64 %rhs;\n\t"
1785 ".reg .u32 %amt2;\n\t"
1786 "and.b32 \t%amt2, $amt, 63;\n\t"
1787 "shr.b64 \t%lhs, $src, %amt2;\n\t"
1788 "sub.u32 \t%amt2, 64, %amt2;\n\t"
1789 "shl.b64 \t%rhs, $src, %amt2;\n\t"
1790 "add.u64 \t$dst, %lhs, %rhs;\n\t"
1792 [(set Int64Regs:$dst, (rotr Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
1795 // Funnel shift in clamp mode
1798 // Create SDNodes so they can be used in the DAG code, e.g.
1799 // NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
1800 def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
1801 def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
1804 NVPTXInst<(outs Int32Regs:$dst),
1805 (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
1806 "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
1807 [(set Int32Regs:$dst,
1808 (FUN_SHFL_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>;
1811 NVPTXInst<(outs Int32Regs:$dst),
1812 (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
1813 "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
1814 [(set Int32Regs:$dst,
1815 (FUN_SHFR_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>;
1818 // BFE - bit-field extract
1821 // Template for BFE/BFI instructions.
1822 // Args: [dest (reg), src (reg), start (reg or imm), length (reg or imm)].
1823 // Start may be an imm only if length is also an imm. FIXME: Is this a
1824 // restriction in PTX?
1826 // dest and src may be int32 or int64, but start and end are always int32.
1828 SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
1829 SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
1830 def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>;
1833 SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
1834 SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
1835 def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>;
1838 SDTypeProfile<1, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
1839 SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>,]>;
1840 def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;
1842 multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
1844 : NVPTXInst<(outs RC:$d),
1845 (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
1846 !strconcat(Instr, " \t$d, $a, $b, $c;"),
1847 [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>;
1849 : NVPTXInst<(outs RC:$d),
1850 (ins RC:$a, Int32Regs:$b, i32imm:$c),
1851 !strconcat(Instr, " \t$d, $a, $b, $c;"),
1852 [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 imm:$c)))]>;
1854 : NVPTXInst<(outs RC:$d),
1855 (ins RC:$a, i32imm:$b, i32imm:$c),
1856 !strconcat(Instr, " \t$d, $a, $b, $c;"),
1857 [(set (T RC:$d), (bfe (T RC:$a), (i32 imm:$b), (i32 imm:$c)))]>;
1860 multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
1862 : NVPTXInst<(outs RC:$f),
1863 (ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
1864 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1865 [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
1867 : NVPTXInst<(outs RC:$f),
1868 (ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d),
1869 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1870 [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>;
1872 : NVPTXInst<(outs RC:$f),
1873 (ins RC:$a, RC:$b, i32imm:$c, i32imm:$d),
1874 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1875 [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>;
1877 : NVPTXInst<(outs RC:$f),
1878 (ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
1879 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1880 [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
1882 : NVPTXInst<(outs RC:$f),
1883 (ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d),
1884 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1885 [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>;
1887 : NVPTXInst<(outs RC:$f),
1888 (ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d),
1889 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1890 [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>;
1893 multiclass PRMT<ValueType T, RegisterClass RC> {
1895 : NVPTXInst<(outs RC:$d),
1896 (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode),
1897 !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
1898 [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), imm:$mode))]>;
1900 : NVPTXInst<(outs RC:$d),
1901 (ins RC:$a, Int32Regs:$b, i32imm:$c, PrmtMode:$mode),
1902 !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
1903 [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 imm:$c), imm:$mode))]>;
1905 : NVPTXInst<(outs RC:$d),
1906 (ins RC:$a, i32imm:$b, i32imm:$c, PrmtMode:$mode),
1907 !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
1908 [(set (T RC:$d), (prmt (T RC:$a), (T imm:$b), (i32 imm:$c), imm:$mode))]>;
1911 let hasSideEffects = false in {
1912 // order is somewhat important here. signed/unsigned variants match
1913 // the same patterns, so the first one wins. Having unsigned byte extraction
1914 // has the benefit of always having zero in unused bits, which makes some
1915 // optimizations easier (e.g. no need to mask them).
1916 defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>;
1917 defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>;
1918 defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>;
1919 defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>;
1921 defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>;
1922 defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>;
1924 defm PRMT_B32 : PRMT<i32, Int32Regs>;
1928 // byte extraction + signed/unsigned extension to i32.
1929 def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), i8)),
1930 (BFE_S32rri Int32Regs:$s, Int32Regs:$o, 8)>;
1931 def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), i8)),
1932 (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
1933 def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), 255)),
1934 (BFE_U32rri Int32Regs:$s, Int32Regs:$o, 8)>;
1935 def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), 255)),
1936 (BFE_U32rii Int32Regs:$s, imm:$o, 8)>;
1938 // byte extraction + signed extension to i16
1939 def : Pat<(i16 (sext_inreg (trunc (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8)), i8)),
1940 (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
1943 // Byte extraction via shift/trunc/sext
// These patterns recognize generic DAG idioms that pick one byte out of a
// wider integer and sign-extend it, and select BFE and/or CVT for them.
//
// Low byte of an i32, sign-extended into an i16: a single CVT_s8_s32.
1944 def : Pat<(i16 (sext_inreg (trunc Int32Regs:$s), i8)),
1945 (CVT_s8_s32 Int32Regs:$s, CvtNONE)>;
// Byte at bit offset $o of an i32: signed BFE of 8 bits, then narrow to i16.
1946 def : Pat<(i16 (sext_inreg (trunc (srl (i32 Int32Regs:$s), (i32 imm:$o))), i8)),
1947 (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
// Same extraction when the result is used at the source width.
1948 def : Pat<(sext_inreg (srl (i32 Int32Regs:$s), (i32 imm:$o)), i8),
1949 (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
// Arithmetic shift right by 8 of the truncated i16 is the sign-extended
// second byte: BFE at offset 8 with length 8.
1950 def : Pat<(i16 (sra (i16 (trunc Int32Regs:$s)), (i32 8))),
1951 (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 8, 8), CvtNONE)>;
// 64-bit source variants of the patterns above.
1952 def : Pat<(sext_inreg (srl (i64 Int64Regs:$s), (i32 imm:$o)), i8),
1953 (BFE_S64rii Int64Regs:$s, imm:$o, 8)>;
1954 def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)),
1955 (CVT_s8_s64 Int64Regs:$s, CvtNONE)>;
1956 def : Pat<(i16 (sext_inreg (trunc (srl (i64 Int64Regs:$s), (i32 imm:$o))), i8)),
1957 (CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>;
1959 //-----------------------------------
1960 // Comparison instructions (setp, set)
1961 //-----------------------------------
1963 // FIXME: This doesn't cover versions of set and setp that combine with a
1964 // boolean predicate, e.g. setp.eq.and.b16.
1966 let hasSideEffects = false in {
1967 multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
1969 NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
1970 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1971 " \t$dst, $a, $b;"), []>;
1973 NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
1974 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1975 " \t$dst, $a, $b;"), []>;
1977 NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
1978 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1979 " \t$dst, $a, $b;"), []>;
1983 defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
1984 defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
1985 defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
1986 defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
1987 defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
1988 defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
1989 defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
1990 defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
1991 defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
1992 defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
1993 defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
1995 NVPTXInst<(outs Int1Regs:$dst),
1996 (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
1997 "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
1998 []>, Requires<[useFP16Math]>;
2001 NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
2002 (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
2003 "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
2005 Requires<[useFP16Math]>;
2007 NVPTXInst<(outs Int1Regs:$dst),
2008 (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
2009 "setp${cmp:base}${cmp:ftz}.bf16 \t$dst, $a, $b;",
2010 []>, Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;
2013 NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
2014 (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
2015 "setp${cmp:base}${cmp:ftz}.bf16x2 \t$p|$q, $a, $b;",
2017 Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;
2020 // FIXME: This doesn't appear to be correct. The "set" mnemonic has the form
2021 // "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
2022 // reg, either u32, s32, or f32. Anyway these aren't used at the moment.
2024 let hasSideEffects = false in {
2025 multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
2026 def rr : NVPTXInst<(outs Int32Regs:$dst),
2027 (ins RC:$a, RC:$b, CmpMode:$cmp),
2028 !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
2029 def ri : NVPTXInst<(outs Int32Regs:$dst),
2030 (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
2031 !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
2032 def ir : NVPTXInst<(outs Int32Regs:$dst),
2033 (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
2034 !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
2038 defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
2039 defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
2040 defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
2041 defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
2042 defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
2043 defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
2044 defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
2045 defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
2046 defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
2047 defm SET_f16 : SET<"f16", Int16Regs, f16imm>;
2048 defm SET_bf16 : SET<"bf16", Int16Regs, bf16imm>, Requires<[hasPTX<78>, hasSM<90>]>;
2049 defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
2050 defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
2052 //-----------------------------------
2053 // Data Movement (Load / Store, Move)
2054 //-----------------------------------
2056 def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
2058 def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
2060 def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;
2062 def MEMri : Operand<i32> {
2063 let PrintMethod = "printMemOperand";
2064 let MIOperandInfo = (ops Int32Regs, i32imm);
2066 def MEMri64 : Operand<i64> {
2067 let PrintMethod = "printMemOperand";
2068 let MIOperandInfo = (ops Int64Regs, i64imm);
2071 def imem : Operand<iPTR> {
2072 let PrintMethod = "printOperand";
2075 def imemAny : Operand<iPTRAny> {
2076 let PrintMethod = "printOperand";
2079 def LdStCode : Operand<i32> {
2080 let PrintMethod = "printLdStCode";
2083 def MmaCode : Operand<i32> {
2084 let PrintMethod = "printMmaCode";
2087 def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
2088 def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
2090 // Load a memory address into a u32 or u64 register.
2091 def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
2092 "mov.u32 \t$dst, $a;",
2093 [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
2094 def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
2095 "mov.u64 \t$dst, $a;",
2096 [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
2098 // Get pointer to local stack.
2099 let hasSideEffects = false in {
2100 def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
2101 "mov.u32 \t$d, __local_depot$num;", []>;
2102 def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
2103 "mov.u64 \t$d, __local_depot$num;", []>;
2107 // copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
2108 let IsSimpleMove=1, hasSideEffects=0 in {
2109 def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
2110 "mov.pred \t$dst, $sss;", []>;
2111 def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
2112 "mov.u16 \t$dst, $sss;", []>;
2113 def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
2114 "mov.u32 \t$dst, $sss;", []>;
2115 def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
2116 "mov.u64 \t$dst, $sss;", []>;
2117 def IMOV128rr : NVPTXInst<(outs Int128Regs:$dst), (ins Int128Regs:$sss),
2118 "mov.b128 \t$dst, $sss;", []>;
2120 def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
2121 "mov.b16 \t$dst, $sss;", []>;
2122 def IMOVB32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
2123 "mov.b32 \t$dst, $sss;", []>;
2124 def IMOVB64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
2125 "mov.b64 \t$dst, $sss;", []>;
2127 def FMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
2128 // We have to use .b16 here as there's no mov.f16.
2129 "mov.b16 \t$dst, $src;", []>;
2130 def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
2131 "mov.f32 \t$dst, $src;", []>;
2132 def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
2133 "mov.f64 \t$dst, $src;", []>;
// Move-immediate instructions. The IMOV*ri and FMOV*ri forms carry selection
// patterns that match integer (imm) and floating-point (fpimm) constants.
// The IMOVB*ri forms have empty pattern lists, so instruction selection
// never picks them from these definitions.
2136 def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
2137 "mov.pred \t$dst, $src;",
2138 [(set Int1Regs:$dst, imm:$src)]>;
2139 def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
2140 "mov.u16 \t$dst, $src;",
2141 [(set Int16Regs:$dst, imm:$src)]>;
2142 def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
2143 "mov.u32 \t$dst, $src;",
2144 [(set (i32 Int32Regs:$dst), imm:$src)]>;
2145 def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
2146 "mov.u64 \t$dst, $src;",
2147 [(set Int64Regs:$dst, imm:$src)]>;
// Untyped (.b16/.b32/.b64) immediate moves; no selection patterns.
2149 def IMOVB16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
2150 "mov.b16 \t$dst, $src;", []>;
2151 def IMOVB32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
2152 "mov.b32 \t$dst, $src;", []>;
2153 def IMOVB64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
2154 "mov.b64 \t$dst, $src;", []>;
// Floating-point immediates.
2156 def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
2157 "mov.f32 \t$dst, $src;",
2158 [(set Float32Regs:$dst, fpimm:$src)]>;
2159 def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
2160 "mov.f64 \t$dst, $src;",
2161 [(set Float64Regs:$dst, fpimm:$src)]>;
// A wrapped external symbol is materialized with a move-immediate of the
// matching pointer width (IMOV32ri for i32, IMOV64ri for i64).
2163 def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
2164 def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>;
2166 //---- Copy Frame Index ----
2167 def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
2168 "add.u32 \t$dst, ${addr:add};",
2169 [(set Int32Regs:$dst, ADDRri:$addr)]>;
2170 def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
2171 "add.u64 \t$dst, ${addr:add};",
2172 [(set Int64Regs:$dst, ADDRri64:$addr)]>;
2174 //-----------------------------------
2175 // Comparison and Selection
2176 //-----------------------------------
2178 multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
2179 Instruction setp_16rr,
2180 Instruction setp_16ri,
2181 Instruction setp_16ir,
2182 Instruction setp_32rr,
2183 Instruction setp_32ri,
2184 Instruction setp_32ir,
2185 Instruction setp_64rr,
2186 Instruction setp_64ri,
2187 Instruction setp_64ir,
2188 Instruction set_16rr,
2189 Instruction set_16ri,
2190 Instruction set_16ir,
2191 Instruction set_32rr,
2192 Instruction set_32ri,
2193 Instruction set_32ir,
2194 Instruction set_64rr,
2195 Instruction set_64ri,
2196 Instruction set_64ir> {
2198 def : Pat<(i1 (OpNode i16:$a, i16:$b)),
2199 (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
2200 def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
2201 (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
2202 def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
2203 (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
2205 def : Pat<(i1 (OpNode i32:$a, i32:$b)),
2206 (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
2207 def : Pat<(i1 (OpNode (i32 Int32Regs:$a), imm:$b)),
2208 (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
2209 def : Pat<(i1 (OpNode imm:$a, (i32 Int32Regs:$b))),
2210 (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
2212 def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
2213 (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
2214 def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
2215 (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
2216 def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
2217 (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
2220 def : Pat<(i32 (OpNode i16:$a, i16:$b)),
2221 (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
2222 def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
2223 (set_16ri Int16Regs:$a, imm:$b, Mode)>;
2224 def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
2225 (set_16ir imm:$a, Int16Regs:$b, Mode)>;
2227 def : Pat<(i32 (OpNode i32:$a, i32:$b)),
2228 (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
2229 def : Pat<(i32 (OpNode (i32 Int32Regs:$a), imm:$b)),
2230 (set_32ri Int32Regs:$a, imm:$b, Mode)>;
2231 def : Pat<(i32 (OpNode imm:$a, (i32 Int32Regs:$b))),
2232 (set_32ir imm:$a, Int32Regs:$b, Mode)>;
2234 def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
2235 (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
2236 def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
2237 (set_64ri Int64Regs:$a, imm:$b, Mode)>;
2238 def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
2239 (set_64ir imm:$a, Int64Regs:$b, Mode)>;
2242 multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
2243 : ISET_FORMAT<OpNode, Mode,
2244 SETP_s16rr, SETP_s16ri, SETP_s16ir,
2245 SETP_s32rr, SETP_s32ri, SETP_s32ir,
2246 SETP_s64rr, SETP_s64ri, SETP_s64ir,
2247 SET_s16rr, SET_s16ri, SET_s16ir,
2248 SET_s32rr, SET_s32ri, SET_s32ir,
2249 SET_s64rr, SET_s64ri, SET_s64ir> {
2250 // TableGen doesn't like empty multiclasses.
2251 def : PatLeaf<(i32 0)>;
2254 multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
2255 : ISET_FORMAT<OpNode, Mode,
2256 SETP_u16rr, SETP_u16ri, SETP_u16ir,
2257 SETP_u32rr, SETP_u32ri, SETP_u32ir,
2258 SETP_u64rr, SETP_u64ri, SETP_u64ir,
2259 SET_u16rr, SET_u16ri, SET_u16ir,
2260 SET_u32rr, SET_u32ri, SET_u32ir,
2261 SET_u64rr, SET_u64ri, SET_u64ir> {
2262 // TableGen doesn't like empty multiclasses.
2263 def : PatLeaf<(i32 0)>;
// Integer comparison patterns. The signed condition codes select the
// SETP_s*/SET_s* instructions via ISET_FORMAT_SIGNED. The unsigned ones
// (setu*) select SETP_u*/SET_u* via ISET_FORMAT_UNSIGNED and reuse the same
// CmpGT/CmpLT/CmpGE/CmpLE/CmpEQ/CmpNE mode leaves, so signedness is carried
// by the instruction chosen rather than by the mode.
2266 defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
2267 defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
2268 defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
2269 defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
2270 defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
2271 defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
2272 defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
2273 defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
2274 defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
2275 defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
2276 defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
2277 defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
// Comparisons of two i1 (predicate) registers. Inequality is the XOR of the
// two predicates; equality is the negated XOR. The signed-style and
// unsigned-style condition codes map to the same instructions.
2280 def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
2281 (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
2282 def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
2283 (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
2285 def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
2286 (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
2287 def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
2288 (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
2290 // comparisons of i8 extracted with BFE as i32
2291 // It's faster to do comparison directly on i32 extracted by BFE,
2292 // instead of the long conversion and sign extending.
2293 def: Pat<(setgt (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
2294 (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
2295 (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpGT)>;
2296 def: Pat<(setgt (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
2297 (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
2298 (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpGT)>;
2299 def: Pat<(setge (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
2300 (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
2301 (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpGE)>;
2302 def: Pat<(setge (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
2303 (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
2304 (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpGE)>;
2305 def: Pat<(setlt (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
2306 (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
2307 (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpLT)>;
2308 def: Pat<(setlt (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
2309 (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
2310 (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpLT)>;
2311 def: Pat<(setle (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
2312 (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
2313 (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpLE)>;
2314 def: Pat<(setle (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
2315 (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
2316 (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpLE)>;
2318 def: Pat<(setugt (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
2319 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
2320 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpHI)>;
2321 def: Pat<(setugt (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
2322 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
2323 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpHI)>;
2324 def: Pat<(setuge (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
2325 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
2326 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpHS)>;
2327 def: Pat<(setuge (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
2328 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
2329 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpHS)>;
2330 def: Pat<(setult (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
2331 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
2332 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpLO)>;
2333 def: Pat<(setult (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
2334 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
2335 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpLO)>;
2336 def: Pat<(setule (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
2337 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
2338 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpLS)>;
2339 def: Pat<(setule (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
2340 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
2341 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpLS)>;
2342 def: Pat<(seteq (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
2343 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
2344 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpEQ)>;
2345 def: Pat<(seteq (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
2346 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
2347 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpEQ)>;
2348 def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
2349 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
2350 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpNE)>;
2351 def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
2352 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
2353 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpNE)>;
2355 // i1 compare -> i32
// The i32 result is -1 (all ones) for true and 0 for false. Both patterns
// select on the XOR of the two predicates: setne yields -1 when they differ,
// seteq yields -1 when they match (hence the swapped SELP arms).
2356 def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
2357 (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
2358 def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
2359 (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
2363 multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
2365 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
2366 (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
2367 Requires<[useFP16Math,doF32FTZ]>;
2368 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
2369 (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
2370 Requires<[useFP16Math]>;
2371 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
2372 (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
2373 Requires<[useFP16Math,doF32FTZ]>;
2374 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
2375 (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
2376 Requires<[useFP16Math]>;
2377 def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
2378 (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
2379 Requires<[useFP16Math,doF32FTZ]>;
2380 def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
2381 (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
2382 Requires<[useFP16Math]>;
2385 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
2386 (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
2387 Requires<[hasBF16Math,doF32FTZ]>;
2388 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
2389 (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
2390 Requires<[hasBF16Math]>;
2391 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
2392 (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
2393 Requires<[hasBF16Math,doF32FTZ]>;
2394 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
2395 (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
2396 Requires<[hasBF16Math]>;
2397 def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
2398 (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
2399 Requires<[hasBF16Math,doF32FTZ]>;
2400 def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
2401 (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
2402 Requires<[hasBF16Math]>;
2405 def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
2406 (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
2407 Requires<[doF32FTZ]>;
2408 def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
2409 (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
2410 def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
2411 (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
2412 Requires<[doF32FTZ]>;
2413 def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
2414 (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
2415 def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
2416 (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
2417 Requires<[doF32FTZ]>;
2418 def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
2419 (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
2422 def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
2423 (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
2424 def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
2425 (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
2426 def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
2427 (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
// f16 compares producing an i32 result (SET rather than SETP). All patterns
// require useFP16Math; each operand shape has a doF32FTZ-guarded pattern using
// ModeFTZ and a second pattern using Mode.
//
// An f16 immediate is never passed to the instruction directly: it is wrapped
// in LOAD_CONST_F16, whose result is a register. Every shape therefore selects
// the register-register form SET_f16rr, including the immediate-on-the-left
// case. The "ir" form takes an immediate as its first operand (compare the f32
// patterns, which pass fpimm:$a straight to SET_f32ir), so handing it the
// LOAD_CONST_F16 result would put a register in an immediate operand slot.
2430 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
2431 (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
2432 Requires<[useFP16Math, doF32FTZ]>;
2433 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
2434 (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
2435 Requires<[useFP16Math]>;
2436 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
2437 (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
2438 Requires<[useFP16Math, doF32FTZ]>;
2439 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
2440 (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
2441 Requires<[useFP16Math]>;
2442 def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
2443 (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
2444 Requires<[useFP16Math, doF32FTZ]>;
2445 def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
2446 (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
2447 Requires<[useFP16Math]>;
// bf16 compares producing an i32 result (SET rather than SETP). All patterns
// require hasBF16Math; each operand shape has a doF32FTZ-guarded pattern using
// ModeFTZ and a second pattern using Mode.
//
// A bf16 immediate is never passed to the instruction directly: it is wrapped
// in LOAD_CONST_BF16, whose result is a register. Every shape therefore selects
// the register-register form SET_bf16rr, including the immediate-on-the-left
// case. The "ir" form takes an immediate as its first operand (compare the f32
// patterns, which pass fpimm:$a straight to SET_f32ir), so handing it the
// LOAD_CONST_BF16 result would put a register in an immediate operand slot.
2450 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
2451 (SET_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
2452 Requires<[hasBF16Math, doF32FTZ]>;
2453 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
2454 (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
2455 Requires<[hasBF16Math]>;
2456 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
2457 (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
2458 Requires<[hasBF16Math, doF32FTZ]>;
2459 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
2460 (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
2461 Requires<[hasBF16Math]>;
2462 def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
2463 (SET_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
2464 Requires<[hasBF16Math, doF32FTZ]>;
2465 def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
2466 (SET_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
2467 Requires<[hasBF16Math]>;
// f32 compares producing an i32 result. Shapes: reg/reg (SET_f32rr), reg/imm
// (SET_f32ri) and imm/reg (SET_f32ir); immediates are passed directly as
// fpimm. Each shape has a doF32FTZ-guarded ModeFTZ pattern and an unguarded
// Mode pattern.
2470 def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
2471 (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
2472 Requires<[doF32FTZ]>;
2473 def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
2474 (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
2475 def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
2476 (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
2477 Requires<[doF32FTZ]>;
2478 def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
2479 (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
2480 def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
2481 (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
2482 Requires<[doF32FTZ]>;
2483 def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
2484 (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
// f64 compares producing an i32 result. Only the plain Mode is used; there are
// no doF32FTZ-guarded variants for f64.
2487 def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
2488 (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
2489 def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
2490 (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
2491 def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
2492 (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
// Instantiate FSET_FORMAT once per floating-point condition code. The
// arguments are <condition-code node, compare mode, FTZ compare mode>.
//
// Ordered condition codes (seto*) use the plain Cmp* modes.
2495 defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
2496 defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
2497 defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
2498 defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
2499 defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
2500 defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
// Unordered condition codes (setu*) use the Cmp*U modes.
2502 defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
2503 defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
2504 defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
2505 defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
2506 defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
2507 defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
// Condition codes without an explicit ordered/unordered qualifier are mapped
// to the same modes as their ordered counterparts.
2509 defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
2510 defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
2511 defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
2512 defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
2513 defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
2514 defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
// seto maps to CmpNUM and setuo maps to CmpNAN.
2516 defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
2517 defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
// Type profiles for the NVPTX call/parameter SelectionDAG nodes defined below.
// SDTypeProfile<NumResults, NumOperands, Constraints>; constraint indices
// number the results first and then the operands.
//
// Declare{Param,ScalarParam}: no results, three integer operands. The two
// profiles are identical.
2519 def SDTDeclareParamProfile :
2520 SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
2521 def SDTDeclareScalarParamProfile :
2522 SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
// LoadParam{,V2,V4}: 1/2/4 results and two integer operands. Only the V2
// profile constrains its results to share a type.
2523 def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
2524 def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
2525 def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
// PrintCall / PrintCallUni: a single integer operand.
2526 def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
2527 def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
// StoreParam{,V2,V4,32}: the first two operands are integers; the remaining
// 1/2/4 operands (the stored values) are unconstrained.
2528 def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
2529 def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
2530 def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
2531 def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
// Call-sequence markers and arguments.
2532 def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
2533 def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
2534 def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
2535 def SDTCallValProfile : SDTypeProfile<1, 0, []>;
2536 def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
// StoreRetval{,V2,V4}: an integer first operand followed by 1/2/4
// unconstrained operands.
2537 def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
2538 def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
2539 def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
2540 def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
2541 def SDTProxyRegProfile : SDTypeProfile<1, 1, []>;
// SelectionDAG node definitions binding NVPTXISD opcodes to the SDT*Profile
// type profiles. Most nodes carry a chain, in/out glue and the side-effect
// flag. The exceptions are noted next to the affected nodes.
2544 SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
2545 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2546 def DeclareScalarParam :
2547 SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
2548 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2549 def DeclareRetParam :
2550 SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
2551 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2553 SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
2554 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// The LoadParam* nodes are marked SDNPMayLoad instead of SDNPSideEffect.
2556 SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
2557 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
2559 SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
2560 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
2562 SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
2563 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
2565 SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
2566 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2567 def PrintConvergentCall :
2568 SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
2569 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2571 SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
2572 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2573 def PrintConvergentCallUni :
2574 SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
2575 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2577 SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
2578 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2580 SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
2581 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2583 SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
2584 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// StoreParamU32 and StoreParamS32 share SDTStoreParam32Profile.
2586 SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
2587 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2589 SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
2590 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2592 SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
2593 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2595 SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
2596 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2598 SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
2599 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// CallArgEnd, CallVoid and Prototype all use SDTCallVoidProfile.
2601 SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
2602 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2604 SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
2605 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2607 SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
2608 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2610 SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
2611 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// MoveParam has an empty property list: no chain, glue or side effects.
2613 SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
// The StoreRetval* nodes are chained and side-effecting but carry no glue.
2615 SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
2616 [SDNPHasChain, SDNPSideEffect]>;
2618 SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
2619 [SDNPHasChain, SDNPSideEffect]>;
2621 SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
2622 [SDNPHasChain, SDNPSideEffect]>;
2623 def PseudoUseParam :
2624 SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
2625 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// RETURN reuses the zero-operand SDTCallArgMarkProfile and carries no glue.
2627 SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
2628 [SDNPHasChain, SDNPSideEffect]>;
2630 SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
2631 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Instruction classes that read a call result back with ld.param from the
// retval0 symbol at byte offset $b. `opstr` is the type suffix appended to the
// mnemonic (for example ".b32"). The scalar, .v2 and .v4 variants differ only
// in how many destination registers they define.
2633 let mayLoad = true in {
2634 class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
2635 NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
2636 !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"),
// Two-element vector load; defined with an empty pattern list.
2639 class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
2640 NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
2641 !strconcat("ld.param.v2", opstr,
2642 " \t{{$dst, $dst2}}, [retval0+$b];"), []>;
// Four-element vector load.
2644 class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
2645 NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
2648 !strconcat("ld.param.v4", opstr,
2649 " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
// Register form of a call-result read: emits "mov<opstr> $dst, retval<b>;".
// Selected for (LoadParam (i32 0), imm:$b), i.e. when the first LoadParam
// operand is the constant 0; $b picks which retvalN name is printed.
2653 class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
2654 NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
2655 "mov" # opstr # " \t$dst, retval$b;",
2656 [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
// Instruction classes and multiclasses for st.param: outgoing call arguments
// (written to param<a> at byte offset $b) and function return values (written
// to func_retval0 at byte offset $a). `opstr` is the type suffix appended to
// the mnemonic.
2658 let mayStore = true in {
// Scalar store of one value. One def is emitted per operand kind in
// [IMMType, regclass], named "_r" for the register form and "_i" for the
// immediate form. When support_imm is false only the register form is emitted.
2660 multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr, bit support_imm = true> {
2661 foreach op = [IMMType, regclass] in
2662 if !or(support_imm, !isa<NVPTXRegClass>(op)) then
2663 def _ # !if(!isa<NVPTXRegClass>(op), "r", "i")
2665 (ins op:$val, i32imm:$a, i32imm:$b),
2666 "st.param" # opstr # " \t[param$a+$b], $val;",
// Two-element vector store. Every register/immediate combination of the two
// values gets its own def; the name suffix has one "r"/"i" letter per value.
2670 multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
2671 foreach op1 = [IMMType, regclass] in
2672 foreach op2 = [IMMType, regclass] in
2673 def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
2674 # !if(!isa<NVPTXRegClass>(op2), "r", "i")
2676 (ins op1:$val1, op2:$val2,
2677 i32imm:$a, i32imm:$b),
2678 "st.param.v2" # opstr # " \t[param$a+$b], {{$val1, $val2}};",
// Four-element vector store. As for V2, with four "r"/"i" letters in the
// suffix, one per value.
2682 multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
2683 foreach op1 = [IMMType, regclass] in
2684 foreach op2 = [IMMType, regclass] in
2685 foreach op3 = [IMMType, regclass] in
2686 foreach op4 = [IMMType, regclass] in
2687 def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
2688 # !if(!isa<NVPTXRegClass>(op2), "r", "i")
2689 # !if(!isa<NVPTXRegClass>(op3), "r", "i")
2690 # !if(!isa<NVPTXRegClass>(op4), "r", "i")
2693 (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4,
2694 i32imm:$a, i32imm:$b),
2695 "st.param.v4" # opstr #
2696 " \t[param$a+$b], {{$val1, $val2, $val3, $val4}};",
// Return-value stores: register operands only, written to func_retval0+$a.
2700 class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
2701 NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
2702 !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"),
2705 class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
2706 NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
2707 !strconcat("st.param.v2", opstr,
2708 " \t[func_retval0+$a], {{$val, $val2}};"),
2711 class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
2713 (ins regclass:$val, regclass:$val2, regclass:$val3,
2714 regclass:$val4, i32imm:$a),
2715 !strconcat("st.param.v4", opstr,
2716 " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
// Defines the instructions that print the leading part of a call statement.
// OpcStr is the call mnemonic and OpNode the SelectionDAG node to match.
// PrintCallNoRetInst matches (OpNode (i32 0)) and prints only the mnemonic.
// PrintCallRetInst<N> matches (OpNode (i32 N)) and prints the mnemonic
// followed by a parenthesised list of retval names and a trailing ", ".
2721 multiclass CALL<string OpcStr, SDNode OpNode> {
2722 def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
2723 !strconcat(OpcStr, " "), [(OpNode (i32 0))]>;
2724 def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
2725 !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>;
2726 def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
2727 !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>;
2728 def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
2729 !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>;
2730 def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
2731 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
2732 [(OpNode (i32 4))]>;
2733 def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
2734 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "),
2735 [(OpNode (i32 5))]>;
2736 def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
2737 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
2739 [(OpNode (i32 6))]>;
2740 def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
2741 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
2742 "retval5, retval6), "),
2743 [(OpNode (i32 7))]>;
2744 def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
2745 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
2746 "retval5, retval6, retval7), "),
2747 [(OpNode (i32 8))]>;
// Non-convergent call printers: "call" for PrintCall and "call.uni" for
// PrintCallUni.
2751 defm Call : CALL<"call", PrintCall>;
2752 defm CallUni : CALL<"call.uni", PrintCallUni>;
2754 // Convergent call instructions. These are identical to regular calls, except
2755 // they have the isConvergent bit set.
2756 let isConvergent=1 in {
2757 defm ConvergentCall : CALL<"call", PrintConvergentCall>;
2758 defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
// Concrete ld.param instructions for reading call results. Integer variants
// use the untyped .bN suffixes; the 8-bit variants load ".b8" into Int16Regs.
// There is no v4 variant for 64-bit integers in this list.
2761 def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">;
2762 def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">;
2763 def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">;
2764 def LoadParamMemI8 : LoadParamMemInst<Int16Regs, ".b8">;
2765 def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">;
2766 def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">;
2767 def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">;
2768 def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">;
2769 def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
2770 def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
2771 def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">;
// Floating-point variants use the typed .f32/.f64 suffixes. There is no v4
// variant for f64 in this list.
2772 def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">;
2773 def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">;
2774 def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
2775 def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
2776 def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
// Concrete st.param instructions for outgoing call arguments. Each defm
// expands to the register/immediate variants generated by the corresponding
// StoreParam*Inst multiclass.
2778 defm StoreParamI64 : StoreParamInst<Int64Regs, i64imm, ".b64">;
2779 defm StoreParamI32 : StoreParamInst<Int32Regs, i32imm, ".b32">;
2780 defm StoreParamI16 : StoreParamInst<Int16Regs, i16imm, ".b16">;
2781 defm StoreParamI8 : StoreParamInst<Int16Regs, i8imm, ".b8">;
// 8-bit stores whose source lives in a 32- or 64-bit register. Immediate
// forms are disabled, so only the register variant is generated.
2783 defm StoreParamI8TruncI32 : StoreParamInst<Int32Regs, i8imm, ".b8", /* support_imm */ false>;
2784 defm StoreParamI8TruncI64 : StoreParamInst<Int64Regs, i8imm, ".b8", /* support_imm */ false>;
2786 defm StoreParamV2I64 : StoreParamV2Inst<Int64Regs, i64imm, ".b64">;
2787 defm StoreParamV2I32 : StoreParamV2Inst<Int32Regs, i32imm, ".b32">;
2788 defm StoreParamV2I16 : StoreParamV2Inst<Int16Regs, i16imm, ".b16">;
2789 defm StoreParamV2I8 : StoreParamV2Inst<Int16Regs, i8imm, ".b8">;
2791 defm StoreParamV4I32 : StoreParamV4Inst<Int32Regs, i32imm, ".b32">;
2792 defm StoreParamV4I16 : StoreParamV4Inst<Int16Regs, i16imm, ".b16">;
2793 defm StoreParamV4I8 : StoreParamV4Inst<Int16Regs, i8imm, ".b8">;
// Floating-point variants.
2795 defm StoreParamF32 : StoreParamInst<Float32Regs, f32imm, ".f32">;
2796 defm StoreParamF64 : StoreParamInst<Float64Regs, f64imm, ".f64">;
2798 defm StoreParamV2F32 : StoreParamV2Inst<Float32Regs, f32imm, ".f32">;
2799 defm StoreParamV2F64 : StoreParamV2Inst<Float64Regs, f64imm, ".f64">;
2801 defm StoreParamV4F32 : StoreParamV4Inst<Float32Regs, f32imm, ".f32">;
// Concrete st.param instructions for writing function return values. The
// I8TruncI32/I8TruncI64 variants store ".b8" from a 32-/64-bit register.
2803 def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
2804 def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
2805 def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">;
2806 def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">;
2807 def StoreRetvalI8TruncI32 : StoreRetvalInst<Int32Regs, ".b8">;
2808 def StoreRetvalI8TruncI64 : StoreRetvalInst<Int64Regs, ".b8">;
2809 def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">;
2810 def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">;
2811 def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">;
2812 def StoreRetvalV2I8 : StoreRetvalV2Inst<Int16Regs, ".b8">;
2813 def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">;
2814 def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">;
2815 def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">;
// Floating-point variants.
2817 def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">;
2818 def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">;
2819 def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
2820 def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
2821 def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
// Punctuation pieces of a printed call statement. CallArgBegin prints "(".
// CallArgEnd prints ");" when its operand is 1 and ")" when it is 0.
// RETURNInst prints "ret;".
2823 def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
2824 def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
2825 def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
2826 def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
// Prints one register call argument followed by ", ". Matches CallArg with a
// first operand of 0; the operand type is taken from the register class.
2828 class CallArgInst<NVPTXRegClass regclass> :
2829 NVPTXInst<(outs), (ins regclass:$a), "$a, ",
2830 [(CallArg (i32 0), regclass:$a)]>;
// Same as CallArgInst, but the pattern names the value type `vt` explicitly
// instead of using the register class.
2832 class CallArgInstVT<NVPTXRegClass regclass, ValueType vt> :
2833 NVPTXInst<(outs), (ins regclass:$a), "$a, ",
2834 [(CallArg (i32 0), vt:$a)]>;
// Final-argument counterparts: print the argument without a trailing ", "
// and match LastCallArg instead of CallArg.
2836 class LastCallArgInst<NVPTXRegClass regclass> :
2837 NVPTXInst<(outs), (ins regclass:$a), "$a",
2838 [(LastCallArg (i32 0), regclass:$a)]>;
2839 class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt> :
2840 NVPTXInst<(outs), (ins regclass:$a), "$a",
2841 [(LastCallArg (i32 0), vt:$a)]>;
// Register call-argument printers. The i32 and i16 variants use the explicit
// value-type classes; the others take the type from the register class.
2843 def CallArgI64 : CallArgInst<Int64Regs>;
2844 def CallArgI32 : CallArgInstVT<Int32Regs, i32>;
2845 def CallArgI16 : CallArgInstVT<Int16Regs, i16>;
2846 def CallArgF64 : CallArgInst<Float64Regs>;
2847 def CallArgF32 : CallArgInst<Float32Regs>;
2849 def LastCallArgI64 : LastCallArgInst<Int64Regs>;
2850 def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>;
2851 def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>;
2852 def LastCallArgF64 : LastCallArgInst<Float64Regs>;
2853 def LastCallArgF32 : LastCallArgInst<Float32Regs>;
// Immediate i32 argument, printed as the literal value.
2855 def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
2856 [(CallArg (i32 0), (i32 imm:$a))]>;
2857 def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
2858 [(LastCallArg (i32 0), (i32 imm:$a))]>;
// When the first CallArg operand is 1, the immediate is an index and the
// argument is printed as the symbol "param<a>".
2860 def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
2861 [(CallArg (i32 1), (i32 imm:$a))]>;
2862 def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
2863 [(LastCallArg (i32 1), (i32 imm:$a))]>;
// Prints the callee of a call statement followed by ", ". Three forms are
// provided: a wrapped global address, a 32-bit register and a 64-bit register.
2865 def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
2866 [(CallVoid (Wrapper tglobaladdr:$addr))]>;
2867 def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
2868 [(CallVoid i32:$addr)]>;
2869 def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
2870 [(CallVoid Int64Regs:$addr)]>;
// Prints ", prototype_<val>;", which also terminates the statement.
2871 def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
2872 [(Prototype (i32 imm:$val))]>;
// Declarations for the return-value slot of a call.
// DeclareRetMemInst declares an aligned .b8 array named retval<num>.
2874 def DeclareRetMemInst :
2875 NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
2876 ".param .align $align .b8 retval$num[$size];",
2877 [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
// DeclareRet's first operand selects the storage kind: 1 prints a .param
// declaration and 2 prints a .reg declaration.
2878 def DeclareRetScalarInst :
2879 NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
2880 ".param .b$size retval$num;",
2881 [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
2882 def DeclareRetRegInst :
2883 NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
2884 ".reg .b$size retval$num;",
2885 [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
// Declarations for outgoing call parameters.
// DeclareParamInst declares an aligned .b8 array named param<a>.
2887 def DeclareParamInst :
2888 NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
2889 ".param .align $align .b8 param$a[$size];",
2890 [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
// DeclareScalarParam's last operand selects the storage kind: 0 prints a
// .param declaration and 1 prints a .reg declaration.
2891 def DeclareScalarParamInst :
2892 NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
2893 ".param .b$size param$a;",
2894 [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
2895 def DeclareScalarRegInst :
2896 NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
2897 ".reg .b$size param$a;",
2898 [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
// Register-to-register lowering of the MoveParam node for value type T:
// emits "mov<asmstr> $dst, $src;" with both operands in `regclass`.
2900 class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr> :
2901 NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
2902 "mov" # asmstr # " \t$dst, $src;",
2903 [(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>;
// MoveParam whose source is an external symbol (texternalsym) rather than a
// register; the symbol is carried in an operand of kind `srcty` and the result
// has value type `vt`.
2905 class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt,
2907 NVPTXInst<(outs regclass:$dst), (ins srcty:$src),
2908 !strconcat("mov", asmstr, " \t$dst, $src;"),
2909 [(set vt:$dst, (MoveParam texternalsym:$src))]>;
2911 def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">;
2912 def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">;
2914 def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">;
2915 def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">;
// i16 MoveParam. NOTE(review): $src and $dst are both Int16Regs, yet the
// mnemonic is cvt.u16.u32 rather than a 16-bit mov like the other widths; the
// reason is not documented here and should be confirmed.
2918 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
2919 "cvt.u16.u32 \t$dst, $src;", // TODO: why cvt.u16.u32 and not a 16-bit mov?
2920 [(set i16:$dst, (MoveParam i16:$src))]>;
2921 def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">;
2922 def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">;
// Matches the PseudoUseParam node for a register of value type `vt`. The
// instruction has no outputs and its assembly text is only a comment line
// ("// Pseudo use of <reg>").
2924 class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt> :
2925 NVPTXInst<(outs), (ins regclass:$src),
2926 "// Pseudo use of $src",
2927 [(PseudoUseParam vt:$src)]>;
// One instance per scalar register class.
2929 def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs, i64>;
2930 def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs, i32>;
2931 def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs, i16>;
2932 def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>;
2933 def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>;
// Lowering of the ProxyReg node for value type T: a same-class register copy
// printed as "mov.<SzStr> $dst, $src;". SzStr is the type suffix without the
// leading dot (for example "b32" or "pred").
2935 class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass> :
2936 NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
2937 "mov." # SzStr # " \t$dst, $src;",
2938 [(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>;
// ProxyReg instances, one per scalar register class.
2940 def ProxyRegI1 : ProxyRegInst<"pred", i1, Int1Regs>;
2941 def ProxyRegI16 : ProxyRegInst<"b16", i16, Int16Regs>;
2942 def ProxyRegI32 : ProxyRegInst<"b32", i32, Int32Regs>;
2943 def ProxyRegI64 : ProxyRegInst<"b64", i64, Int64Regs>;
2944 def ProxyRegF32 : ProxyRegInst<"f32", f32, Float32Regs>;
2945 def ProxyRegF64 : ProxyRegInst<"f64", f64, Float64Regs>;
// f16 and bf16 values are held in Int16Regs, so they reuse ProxyRegI16.
2947 foreach vt = [f16, bf16] in {
2948 def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 Int16Regs:$src)>;
// The packed vector types listed here are held in Int32Regs, so they reuse
// ProxyRegI32.
2951 foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
2952 def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 Int32Regs:$src)>;
2956 // Load / Store Handling
// Scalar load instructions producing one `regclass` register. All variants
// share the modifier operands ($isVol, $addsp, $Vec, $Sign, $fromWidth), which
// are printed as part of the mnemonic, and differ only in addressing form:
//   _avar     symbol (imem)
//   _areg     32-bit register
//   _areg_64  64-bit register
//   _ari      32-bit register + immediate offset
//   _ari_64   64-bit register + immediate offset
//   _asi      symbol (imem) + immediate offset
// Every def has an empty pattern list, so none of them is matched by a
// TableGen pattern in this multiclass.
2958 multiclass LD<NVPTXRegClass regclass> {
2959 def _avar : NVPTXInst<
2960 (outs regclass:$dst),
2961 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2962 i32imm:$fromWidth, imem:$addr),
2963 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2964 "\t$dst, [$addr];", []>;
2965 def _areg : NVPTXInst<
2966 (outs regclass:$dst),
2967 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2968 i32imm:$fromWidth, Int32Regs:$addr),
2969 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2970 "\t$dst, [$addr];", []>;
2971 def _areg_64 : NVPTXInst<
2972 (outs regclass:$dst),
2973 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2974 i32imm:$fromWidth, Int64Regs:$addr),
2975 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2976 "\t$dst, [$addr];", []>;
2977 def _ari : NVPTXInst<
2978 (outs regclass:$dst),
2979 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2980 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
2981 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2982 "\t$dst, [$addr+$offset];", []>;
2983 def _ari_64 : NVPTXInst<
2984 (outs regclass:$dst),
2985 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2986 LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
2987 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2988 "\t$dst, [$addr+$offset];", []>;
2989 def _asi : NVPTXInst<
2990 (outs regclass:$dst),
2991 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2992 LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
2993 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2994 "\t$dst, [$addr+$offset];", []>;
// Scalar load instantiations, flagged as loads without side effects. The i8
// and i16 variants both use Int16Regs as the destination class.
2997 let mayLoad=1, hasSideEffects=0 in {
2998 defm LD_i8 : LD<Int16Regs>;
2999 defm LD_i16 : LD<Int16Regs>;
3000 defm LD_i32 : LD<Int32Regs>;
3001 defm LD_i64 : LD<Int64Regs>;
3002 defm LD_f32 : LD<Float32Regs>;
3003 defm LD_f64 : LD<Float64Regs>;
// Scalar store instructions taking the stored value in a `regclass` register
// ($src). The modifier operands ($isVol, $addsp, $Vec, $Sign, $toWidth) are
// printed as part of the mnemonic. The addressing forms mirror the LD
// multiclass:
//   _avar     symbol (imem)
//   _areg     32-bit register
//   _areg_64  64-bit register
//   _ari      32-bit register + immediate offset
//   _ari_64   64-bit register + immediate offset
//   _asi      symbol (imem) + immediate offset
// Every def has an empty pattern list.
3006 multiclass ST<NVPTXRegClass regclass> {
3007 def _avar : NVPTXInst<
3009 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
3010 LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
3011 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
3012 " \t[$addr], $src;", []>;
3013 def _areg : NVPTXInst<
3015 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
3016 LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
3017 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
3018 " \t[$addr], $src;", []>;
3019 def _areg_64 : NVPTXInst<
3021 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
3022 LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
3023 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
3024 " \t[$addr], $src;", []>;
3025 def _ari : NVPTXInst<
3027 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
3028 LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
3029 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
3030 " \t[$addr+$offset], $src;", []>;
3031 def _ari_64 : NVPTXInst<
3033 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
3034 LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
3035 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
3036 " \t[$addr+$offset], $src;", []>;
3037 def _asi : NVPTXInst<
3039 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
3040 LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
3041 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
3042 " \t[$addr+$offset], $src;", []>;
// Scalar store instantiations, flagged as stores without side effects. The i8
// and i16 variants both take the value from Int16Regs.
3045 let mayStore=1, hasSideEffects=0 in {
3046 defm ST_i8 : ST<Int16Regs>;
3047 defm ST_i16 : ST<Int16Regs>;
3048 defm ST_i32 : ST<Int32Regs>;
3049 defm ST_i64 : ST<Int64Regs>;
3050 defm ST_f32 : ST<Float32Regs>;
3051 defm ST_f64 : ST<Float64Regs>;
3054 // The following is used only in and after vector elementizations. Vector
3055 // elementization happens at the machine instruction level, so the following
3056 // instructions never appear in the DAG.
// Vector load instructions. The _v2_* defs produce two `regclass` registers
// and the _v4_* defs produce four, printed as a brace-enclosed list. Each
// width has the same six addressing forms as the scalar LD multiclass (_avar,
// _areg, _areg_64, _ari, _ari_64, _asi) and the same modifier operands. All
// defs have empty pattern lists.
3057 multiclass LD_VEC<NVPTXRegClass regclass> {
3058 def _v2_avar : NVPTXInst<
3059 (outs regclass:$dst1, regclass:$dst2),
3060 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3061 i32imm:$fromWidth, imem:$addr),
3062 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3063 "\t{{$dst1, $dst2}}, [$addr];", []>;
3064 def _v2_areg : NVPTXInst<
3065 (outs regclass:$dst1, regclass:$dst2),
3066 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3067 i32imm:$fromWidth, Int32Regs:$addr),
3068 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3069 "\t{{$dst1, $dst2}}, [$addr];", []>;
3070 def _v2_areg_64 : NVPTXInst<
3071 (outs regclass:$dst1, regclass:$dst2),
3072 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3073 i32imm:$fromWidth, Int64Regs:$addr),
3074 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3075 "\t{{$dst1, $dst2}}, [$addr];", []>;
3076 def _v2_ari : NVPTXInst<
3077 (outs regclass:$dst1, regclass:$dst2),
3078 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3079 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
3080 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3081 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
3082 def _v2_ari_64 : NVPTXInst<
3083 (outs regclass:$dst1, regclass:$dst2),
3084 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3085 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
3086 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3087 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
3088 def _v2_asi : NVPTXInst<
3089 (outs regclass:$dst1, regclass:$dst2),
3090 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3091 i32imm:$fromWidth, imem:$addr, i32imm:$offset),
3092 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3093 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
3094 def _v4_avar : NVPTXInst<
3095 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
3096 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3097 i32imm:$fromWidth, imem:$addr),
3098 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3099 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
3100 def _v4_areg : NVPTXInst<
3101 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
3102 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3103 i32imm:$fromWidth, Int32Regs:$addr),
3104 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3105 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
3106 def _v4_areg_64 : NVPTXInst<
3107 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
3108 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3109 i32imm:$fromWidth, Int64Regs:$addr),
3110 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3111 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
3112 def _v4_ari : NVPTXInst<
3113 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
3114 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3115 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
3116 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3117 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
3118 def _v4_ari_64 : NVPTXInst<
3119 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
3120 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3121 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
3122 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3123 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
3124 def _v4_asi : NVPTXInst<
3125 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
3126 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3127 i32imm:$fromWidth, imem:$addr, i32imm:$offset),
3128 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3129 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
// Vector load instantiations, flagged as loads without side effects. The i8
// and i16 variants both use Int16Regs for their destination registers.
3131 let mayLoad=1, hasSideEffects=0 in {
3132 defm LDV_i8 : LD_VEC<Int16Regs>;
3133 defm LDV_i16 : LD_VEC<Int16Regs>;
3134 defm LDV_i32 : LD_VEC<Int32Regs>;
3135 defm LDV_i64 : LD_VEC<Int64Regs>;
3136 defm LDV_f32 : LD_VEC<Float32Regs>;
3137 defm LDV_f64 : LD_VEC<Float64Regs>;
// Vector store instruction definitions, parameterized over the register class
// of the stored elements. Two element counts are generated (_v2 and _v4), each
// in six address forms distinguished by their address operands:
//   _avar    - imem:$addr
//   _areg    - Int32Regs:$addr
//   _areg_64 - Int64Regs:$addr
//   _ari     - Int32Regs:$addr, printed as [$addr+$offset]
//   _ari_64  - Int64Regs:$addr, printed as [$addr+$offset]
//   _asi     - imem:$addr, printed as [$addr+$offset]
// The volatile, address-space, vector and sign qualifiers are operands
// (LdStCode) rendered through the ${op:modifier} syntax in the asm string.
// Every variant has an empty pattern list ([]).
3140 multiclass ST_VEC<NVPTXRegClass regclass> {
// Two-element stores.
3141 def _v2_avar : NVPTXInst<
3143 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
3144 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
3145 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3146 "\t[$addr], {{$src1, $src2}};", []>;
3147 def _v2_areg : NVPTXInst<
3149 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
3150 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
3151 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3152 "\t[$addr], {{$src1, $src2}};", []>;
3153 def _v2_areg_64 : NVPTXInst<
3155 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
3156 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
3157 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3158 "\t[$addr], {{$src1, $src2}};", []>;
3159 def _v2_ari : NVPTXInst<
3161 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
3162 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
3164 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3165 "\t[$addr+$offset], {{$src1, $src2}};", []>;
3166 def _v2_ari_64 : NVPTXInst<
3168 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
3169 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
3171 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3172 "\t[$addr+$offset], {{$src1, $src2}};", []>;
3173 def _v2_asi : NVPTXInst<
3175 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
3176 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
3178 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3179 "\t[$addr+$offset], {{$src1, $src2}};", []>;
// Four-element stores.
3180 def _v4_avar : NVPTXInst<
3182 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3183 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3184 i32imm:$fromWidth, imem:$addr),
3185 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3186 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
3187 def _v4_areg : NVPTXInst<
3189 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3190 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3191 i32imm:$fromWidth, Int32Regs:$addr),
3192 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3193 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
3194 def _v4_areg_64 : NVPTXInst<
3196 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3197 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3198 i32imm:$fromWidth, Int64Regs:$addr),
3199 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3200 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
3201 def _v4_ari : NVPTXInst<
3203 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3204 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3205 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
3206 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3207 "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
3208 def _v4_ari_64 : NVPTXInst<
3210 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3211 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3212 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
3213 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3214 "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
3215 def _v4_asi : NVPTXInst<
3217 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3218 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3219 i32imm:$fromWidth, imem:$addr, i32imm:$offset),
3220 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
3221 "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
// Instantiate ST_VEC once per element type. Every record created here is
// marked mayStore with hasSideEffects cleared. As with the vector loads, the
// i8 variant is instantiated with Int16Regs.
3224 let mayStore=1, hasSideEffects=0 in {
3225 defm STV_i8 : ST_VEC<Int16Regs>;
3226 defm STV_i16 : ST_VEC<Int16Regs>;
3227 defm STV_i32 : ST_VEC<Int32Regs>;
3228 defm STV_i64 : ST_VEC<Int64Regs>;
3229 defm STV_f32 : ST_VEC<Float32Regs>;
3230 defm STV_f64 : ST_VEC<Float64Regs>;
3233 //---- Conversion ----
// Bit-for-bit reinterpretation between two value types, emitted as
// "mov.b<SzStr>". SzStr is the bit width spliced into the mnemonic; TIn/TOut
// are the source and result value types. The register classes default to
// whatever ValueToRegClass yields for TIn and TOut. The attached pattern
// selects this instruction for a bitconvert from TIn to TOut.
3235 class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut,
3236 NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret,
3237 NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret> :
3238 NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
3239 !strconcat("mov.b", SzStr, " \t$d, $a;"),
3240 [(set (TOut regclassOut:$d), (bitconvert (TIn regclassIn:$a)))]>;
// Integer <-> floating-point reinterpretation at 32 and 64 bits.
3242 def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>;
3243 def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>;
3244 def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>;
3245 def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;
// f32 <-> 32-bit packed vector types: reuse the 32-bit mov-based bitconverts,
// since the vector operands here live in Int32Regs.
3247 foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
3248 def: Pat<(vt (bitconvert (f32 Float32Regs:$a))),
3249 (BITCONVERT_32_F2I Float32Regs:$a)>;
3250 def: Pat<(f32 (bitconvert (vt Int32Regs:$a))),
3251 (BITCONVERT_32_I2F Int32Regs:$a)>;
// f16/bf16 <-> i16: both sides use Int16Regs, so a constant becomes an
// IMOVB16ri and a register operand is routed through ProxyRegI16.
3253 foreach vt = [f16, bf16] in {
3254 def: Pat<(vt (bitconvert (i16 UInt16Const:$a))),
3255 (IMOVB16ri UInt16Const:$a)>;
3256 def: Pat<(vt (bitconvert (i16 Int16Regs:$a))),
3257 (ProxyRegI16 Int16Regs:$a)>;
3258 def: Pat<(i16 (bitconvert (vt Int16Regs:$a))),
3259 (ProxyRegI16 Int16Regs:$a)>;
// Bitconverts among the types held in Int32Regs: a 32-bit constant becomes an
// IMOVB32ri; register operands go through ProxyRegI32. The !ne guard skips the
// identity pair (ta == tb).
3262 foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
3263 def: Pat<(ta (bitconvert (i32 UInt32Const:$a))),
3264 (IMOVB32ri UInt32Const:$a)>;
3265 foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in {
3266 if !ne(ta, tb) then {
3267 def: Pat<(ta (bitconvert (tb Int32Regs:$a))),
3268 (ProxyRegI32 Int32Regs:$a)>;
3273 // NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
3274 // we cannot specify floating-point literals in isel patterns. Therefore, we
3275 // use an integer selp to select either 1 or 0 and then cvt to floating-point.
// All integer -> floating-point patterns below pass CvtRN as the conversion
// mode. An i1 source is first widened to a 32-bit 1/0 with SELP_u32ii.

// sint -> f16
3278 def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
3279 (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
3280 def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
3281 (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
3282 def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
3283 (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
3284 def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
3285 (CVT_f16_s64 Int64Regs:$a, CvtRN)>;
// uint -> f16
3288 def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
3289 (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
3290 def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
3291 (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
3292 def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
3293 (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
3294 def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
3295 (CVT_f16_u64 Int64Regs:$a, CvtRN)>;
// sint -> bf16 (every bf16 pattern is gated on hasPTX<78> and hasSM<90>)
3298 def : Pat<(bf16 (sint_to_fp Int1Regs:$a)),
3299 (CVT_bf16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3300 def : Pat<(bf16 (sint_to_fp Int16Regs:$a)),
3301 (CVT_bf16_s16 Int16Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3302 def : Pat<(bf16 (sint_to_fp Int32Regs:$a)),
3303 (CVT_bf16_s32 Int32Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3304 def : Pat<(bf16 (sint_to_fp Int64Regs:$a)),
3305 (CVT_bf16_s64 Int64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
// uint -> bf16
3308 def : Pat<(bf16 (uint_to_fp Int1Regs:$a)),
3309 (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3310 def : Pat<(bf16 (uint_to_fp Int16Regs:$a)),
3311 (CVT_bf16_u16 Int16Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3312 def : Pat<(bf16 (uint_to_fp Int32Regs:$a)),
3313 (CVT_bf16_u32 Int32Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3314 def : Pat<(bf16 (uint_to_fp Int64Regs:$a)),
3315 (CVT_bf16_u64 Int64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
// Integer -> f32/f64 conversions. All use CvtRN; an i1 source is first widened
// to a 32-bit 1/0 with SELP_u32ii and then converted.

// sint -> f32
3318 def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
3319 (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
3320 def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
3321 (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
3322 def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
3323 (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
3324 def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
3325 (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
// uint -> f32
3328 def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
3329 (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
3330 def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
3331 (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
3332 def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
3333 (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
3334 def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
3335 (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
// sint -> f64
3338 def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
3339 (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
3340 def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
3341 (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
3342 def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
3343 (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
3344 def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
3345 (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
// uint -> f64
3348 def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
3349 (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
3350 def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
3351 (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
3352 def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
3353 (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
3354 def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
3355 (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
// f16/bf16 -> integer conversions. Integer results of 16 bits or more use a cvt
// with the CvtRZI mode. An i1 result is produced by comparing the raw 16-bit
// register against 0 with CmpEQ.
// NOTE(review): CmpEQ makes the predicate true when the bits are zero, which
// looks inverted for a nonzero-means-true result - confirm whether these i1
// patterns are reachable and intended.

// f16 -> sint
3359 def : Pat<(i1 (fp_to_sint (f16 Int16Regs:$a))),
3360 (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
3361 def : Pat<(i16 (fp_to_sint (f16 Int16Regs:$a))),
3362 (CVT_s16_f16 (f16 Int16Regs:$a), CvtRZI)>;
3363 def : Pat<(i32 (fp_to_sint (f16 Int16Regs:$a))),
3364 (CVT_s32_f16 (f16 Int16Regs:$a), CvtRZI)>;
3365 def : Pat<(i64 (fp_to_sint (f16 Int16Regs:$a))),
3366 (CVT_s64_f16 Int16Regs:$a, CvtRZI)>;
// f16 -> uint
3369 def : Pat<(i1 (fp_to_uint (f16 Int16Regs:$a))),
3370 (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
3371 def : Pat<(i16 (fp_to_uint (f16 Int16Regs:$a))),
3372 (CVT_u16_f16 Int16Regs:$a, CvtRZI)>;
3373 def : Pat<(i32 (fp_to_uint (f16 Int16Regs:$a))),
3374 (CVT_u32_f16 Int16Regs:$a, CvtRZI)>;
3375 def : Pat<(i64 (fp_to_uint (f16 Int16Regs:$a))),
3376 (CVT_u64_f16 Int16Regs:$a, CvtRZI)>;
// bf16 -> sint
3379 def : Pat<(i1 (fp_to_sint (bf16 Int16Regs:$a))),
3380 (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
3381 def : Pat<(i16 (fp_to_sint (bf16 Int16Regs:$a))),
3382 (CVT_s16_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
3383 def : Pat<(i32 (fp_to_sint (bf16 Int16Regs:$a))),
3384 (CVT_s32_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
3385 def : Pat<(i64 (fp_to_sint (bf16 Int16Regs:$a))),
3386 (CVT_s64_bf16 Int16Regs:$a, CvtRZI)>;
// bf16 -> uint
3389 def : Pat<(i1 (fp_to_uint (bf16 Int16Regs:$a))),
3390 (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
3391 def : Pat<(i16 (fp_to_uint (bf16 Int16Regs:$a))),
3392 (CVT_u16_bf16 Int16Regs:$a, CvtRZI)>;
3393 def : Pat<(i32 (fp_to_uint (bf16 Int16Regs:$a))),
3394 (CVT_u32_bf16 Int16Regs:$a, CvtRZI)>;
3395 def : Pat<(i64 (fp_to_uint (bf16 Int16Regs:$a))),
3396 (CVT_u64_bf16 Int16Regs:$a, CvtRZI)>;
// f32/f64 -> integer conversions. Integer results of 16 bits or more use a cvt
// with CvtRZI. For f32 sources each width has two patterns: a CvtRZI_FTZ form
// gated on doF32FTZ, listed first, followed by the unguarded CvtRZI form.
// An i1 result is produced by moving the float bits to an integer register and
// comparing against 0 with CmpEQ.
// NOTE(review): same CmpEQ-vs-zero question as the f16/bf16 i1 patterns -
// confirm the intended truth value.

// f32 -> sint
3398 def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
3399 (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
3400 def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
3401 (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3402 def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
3403 (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
3404 def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
3405 (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3406 def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
3407 (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
3408 def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
3409 (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3410 def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
3411 (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
// f32 -> uint
3414 def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
3415 (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
3416 def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
3417 (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3418 def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
3419 (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
3420 def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
3421 (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3422 def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
3423 (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
3424 def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
3425 (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3426 def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
3427 (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
// f64 -> sint
3430 def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
3431 (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
3432 def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
3433 (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
3434 def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
3435 (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
3436 def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
3437 (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
// f64 -> uint
3440 def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
3441 (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
3442 def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
3443 (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
3444 def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
3445 (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
3446 def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
3447 (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
// Integer extension and truncation patterns.
// Extending from i1 is done with an immediate select on the predicate; wider
// sources use a cvt with CvtNONE. Truncating to i1 masks the low bit and
// compares it against 1.

// sext i1: select -1 or 0.
3450 def : Pat<(i16 (sext Int1Regs:$a)),
3451 (SELP_s16ii -1, 0, Int1Regs:$a)>;
3452 def : Pat<(i32 (sext Int1Regs:$a)),
3453 (SELP_s32ii -1, 0, Int1Regs:$a)>;
3454 def : Pat<(i64 (sext Int1Regs:$a)),
3455 (SELP_s64ii -1, 0, Int1Regs:$a)>;
// zext i1: select 1 or 0.
3458 def : Pat<(i16 (zext Int1Regs:$a)),
3459 (SELP_u16ii 1, 0, Int1Regs:$a)>;
3460 def : Pat<(i32 (zext Int1Regs:$a)),
3461 (SELP_u32ii 1, 0, Int1Regs:$a)>;
3462 def : Pat<(i64 (zext Int1Regs:$a)),
3463 (SELP_u64ii 1, 0, Int1Regs:$a)>;
// anyext i1: select -1 or 0 (upper bits are unconstrained for anyext).
3466 def : Pat<(i16 (anyext Int1Regs:$a)),
3467 (SELP_u16ii -1, 0, Int1Regs:$a)>;
3468 def : Pat<(i32 (anyext Int1Regs:$a)),
3469 (SELP_u32ii -1, 0, Int1Regs:$a)>;
3470 def : Pat<(i64 (anyext Int1Regs:$a)),
3471 (SELP_u64ii -1, 0, Int1Regs:$a)>;
// sext i16
3474 def : Pat<(i32 (sext Int16Regs:$a)),
3475 (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
3476 def : Pat<(i64 (sext Int16Regs:$a)),
3477 (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
// zext i16
3480 def : Pat<(i32 (zext Int16Regs:$a)),
3481 (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
3482 def : Pat<(i64 (zext Int16Regs:$a)),
3483 (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
// anyext i16 (same instructions as zext)
3486 def : Pat<(i32 (anyext Int16Regs:$a)),
3487 (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
3488 def : Pat<(i64 (anyext Int16Regs:$a)),
3489 (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
// sext i32
3492 def : Pat<(i64 (sext Int32Regs:$a)),
3493 (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
// zext i32
3496 def : Pat<(i64 (zext Int32Regs:$a)),
3497 (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
// anyext i32 (same instruction as zext)
3500 def : Pat<(i64 (anyext Int32Regs:$a)),
3501 (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
// truncate i64
3505 def : Pat<(i32 (trunc Int64Regs:$a)),
3506 (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
3507 def : Pat<(i16 (trunc Int64Regs:$a)),
3508 (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
3509 def : Pat<(i1 (trunc Int64Regs:$a)),
3510 (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
// truncate i32
3513 def : Pat<(i16 (trunc Int32Regs:$a)),
3514 (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
3515 def : Pat<(i1 (trunc Int32Regs:$a)),
3516 (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
// truncate i16
3519 def : Pat<(i1 (trunc Int16Regs:$a)),
3520 (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
// sext_inreg: sign-extend a narrower value held in a wider register.
3523 def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
3524 def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
3525 def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
3526 def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
3527 def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
3528 def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
3531 // Select instructions with 32-bit predicates
// The i32 condition is turned into a predicate by masking its low bit
// (ANDb32ri ..., 1) and comparing the result against 1; that predicate then
// drives the register-register SELP of the matching value type.
3532 def : Pat<(select (i32 Int32Regs:$pred), i16:$a, i16:$b),
3533 (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
3534 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3535 def : Pat<(select (i32 Int32Regs:$pred), i32:$a, i32:$b),
3536 (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
3537 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3538 def : Pat<(select (i32 Int32Regs:$pred), Int64Regs:$a, Int64Regs:$b),
3539 (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
3540 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3541 def : Pat<(select (i32 Int32Regs:$pred), (f16 Int16Regs:$a), (f16 Int16Regs:$b)),
3542 (SELP_f16rr Int16Regs:$a, Int16Regs:$b,
3543 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3544 def : Pat<(select (i32 Int32Regs:$pred), (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)),
3545 (SELP_bf16rr Int16Regs:$a, Int16Regs:$b,
3546 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3547 def : Pat<(select (i32 Int32Regs:$pred), Float32Regs:$a, Float32Regs:$b),
3548 (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
3549 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3550 def : Pat<(select (i32 Int32Regs:$pred), Float64Regs:$a, Float64Regs:$b),
3551 (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
3552 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
// Register pack/unpack helpers built on the vector form of mov. None of these
// carries a selection pattern of its own.
3555 let hasSideEffects = false in {
3556 // pack a set of smaller int registers to a larger int register
3557 def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
3558 (ins Int16Regs:$s1, Int16Regs:$s2,
3559 Int16Regs:$s3, Int16Regs:$s4),
3560 "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
3561 def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
3562 (ins Int16Regs:$s1, Int16Regs:$s2),
3563 "mov.b32 \t$d, {{$s1, $s2}};", []>;
3564 def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
3565 (ins Int32Regs:$s1, Int32Regs:$s2),
3566 "mov.b64 \t$d, {{$s1, $s2}};", []>;
3567 def V2I64toI128 : NVPTXInst<(outs Int128Regs:$d),
3568 (ins Int64Regs:$s1, Int64Regs:$s2),
3569 "mov.b128 \t$d, {{$s1, $s2}};", []>;
3570 def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
3571 (ins Float32Regs:$s1, Float32Regs:$s2),
3572 "mov.b64 \t$d, {{$s1, $s2}};", []>;
3574 // unpack a larger int register to a set of smaller int registers
3575 def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
3576 Int16Regs:$d3, Int16Regs:$d4),
3578 "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
3579 def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
3581 "mov.b32 \t{{$d1, $d2}}, $s;", []>;
3582 def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
3584 "mov.b64 \t{{$d1, $d2}}, $s;", []>;
3585 def I128toV2I64: NVPTXInst<(outs Int64Regs:$d1, Int64Regs:$d2),
3586 (ins Int128Regs:$s),
3587 "mov.b128 \t{{$d1, $d2}}, $s;", []>;
3588 def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
3589 (ins Float64Regs:$s),
3590 "mov.b64 \t{{$d1, $d2}}, $s;", []>;
// Extract only one half of a register. The emitted text opens a brace scope,
// declares a scratch register named tmp to receive the unwanted half, and
// unpacks $s into {low, high}; only the requested half is an output operand.
3592 def I32toI16H : NVPTXInst<(outs Int16Regs:$high),
3594 "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}",
3596 def I32toI16L : NVPTXInst<(outs Int16Regs:$low),
3598 "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}",
3600 def I64toI32H : NVPTXInst<(outs Int32Regs:$high),
3602 "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}",
3604 def I64toI32L : NVPTXInst<(outs Int32Regs:$low),
3606 "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
3611 // Using partial vectorized move produces better SASS code for extraction of
3612 // upper/lower parts of an integer.
// A truncate of a right shift by half the width selects the high-half
// extraction directly. Logical (srl) and arithmetic (sra) shifts map to the
// same instruction because the truncate discards the bits where they differ.
3613 def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))),
3614 (I32toI16H Int32Regs:$s)>;
3615 def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))),
3616 (I32toI16H Int32Regs:$s)>;
3617 def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))),
3618 (I64toI32H Int64Regs:$s)>;
3619 def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))),
3620 (I64toI32H Int64Regs:$s)>;
// Sign-extending element 0 of a v2i16 is an in-register 16 -> 32 sign extend.
3622 def: Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))),
3623 (CVT_INREG_s32_s16 Int32Regs:$src)>;
// Element extraction from two-element 16-bit vectors held in Int32Regs:
// index 0 uses the low-half extract, index 1 the high-half extract.
3625 foreach vt = [v2f16, v2bf16, v2i16] in {
3626 def : Pat<(extractelt (vt Int32Regs:$src), 0),
3627 (I32toI16L Int32Regs:$src)>;
3628 def : Pat<(extractelt (vt Int32Regs:$src), 1),
3629 (I32toI16H Int32Regs:$src)>;
// Building a two-element 16-bit vector packs both halves with V2I16toI32,
// regardless of whether the elements are f16, bf16 or i16.
3631 def : Pat<(v2f16 (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
3632 (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
3633 def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
3634 (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
3635 def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))),
3636 (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
// scalar_to_vector only needs element 0 defined; a u16 -> u32 cvt places the
// scalar in the low half.
3638 def: Pat<(v2i16 (scalar_to_vector (i16 Int16Regs:$a))),
3639 (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
3641 // Count leading zeros
// Both instructions write an Int32Regs result, including the 64-bit form.
3642 let hasSideEffects = false in {
3643 def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
3644 "clz.b32 \t$d, $a;", []>;
3645 def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
3646 "clz.b64 \t$d, $a;", []>;
3649 // 32-bit has a direct PTX instruction
3650 def : Pat<(i32 (ctlz (i32 Int32Regs:$a))), (CLZr32 Int32Regs:$a)>;
3652 // The return type of the ctlz ISD node is the same as its input, but the PTX
3653 // clz instruction always returns a 32-bit value. For ctlz.i64, convert the
3654 // ptx value to 64 bits to match the ISD node's semantics, unless we know we're
3655 // truncating back down to 32 bits.
3656 def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
3657 def : Pat<(i32 (trunc (i64 (ctlz Int64Regs:$a)))), (CLZr64 Int64Regs:$a)>;
3659 // For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
3660 // result back to 16-bits if necessary. We also need to subtract 16 because
3661 // the high-order 16 zeros were counted.
3663 // TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
3664 // use to save one SASS instruction (on sm_35 anyway):
3666 // mov.b32 $tmp, {0xffff, $a}
3667 // clz.b32 $result, $tmp
3669 // That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
3670 // and then ctlz that value. This way we don't have to subtract 16 from the
3671 // result. Unfortunately today we don't have a way to generate
3672 // "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
3673 def : Pat<(i16 (ctlz Int16Regs:$a)),
3674 (SUBi16ri (CVT_u16_u32
3675 (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
3676 def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
3677 (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
// Population count. Both instructions write an Int32Regs result, including the
// 64-bit form.
3680 let hasSideEffects = false in {
3681 def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
3682 "popc.b32 \t$d, $a;", []>;
3683 def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
3684 "popc.b64 \t$d, $a;", []>;
3687 // 32-bit has a direct PTX instruction
3688 def : Pat<(i32 (ctpop (i32 Int32Regs:$a))), (POPCr32 Int32Regs:$a)>;
3690 // For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
3691 // to match the LLVM semantics. Just as with ctlz.i64, we provide a second
3692 // pattern that avoids the type conversion if we're truncating the result to
// 32 bits.
3694 def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
3695 def : Pat<(i32 (trunc (i64 (ctpop Int64Regs:$a)))), (POPCr64 Int64Regs:$a)>;
3697 // For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
3698 // If we know that we're storing into an i32, we can avoid the final trunc.
3699 def : Pat<(ctpop Int16Regs:$a),
3700 (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
3701 def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
3702 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
// Floating-point narrowing uses CvtRN. Where the result is f32 there are two
// patterns: the _FTZ mode gated on doF32FTZ is listed first, followed by the
// unguarded non-FTZ mode.
3704 // fpround f32 -> f16
3705 def : Pat<(f16 (fpround Float32Regs:$a)),
3706 (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
3708 // fpround f32 -> bf16
3709 def : Pat<(bf16 (fpround Float32Regs:$a)),
3710 (CVT_bf16_f32 Float32Regs:$a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>;
3712 // fpround f64 -> f16
3713 def : Pat<(f16 (fpround Float64Regs:$a)),
3714 (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
3716 // fpround f64 -> bf16
3717 def : Pat<(bf16 (fpround Float64Regs:$a)),
3718 (CVT_bf16_f64 Float64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3719 // fpround f64 -> f32
3720 def : Pat<(f32 (fpround Float64Regs:$a)),
3721 (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
3722 def : Pat<(f32 (fpround Float64Regs:$a)),
3723 (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
// Widening conversions need no rounding, so they use CvtNONE / CvtNONE_FTZ.
3725 // fpextend f16 -> f32
3726 def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
3727 (CVT_f32_f16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
3728 def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
3729 (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
3730 // fpextend bf16 -> f32
// Both forms select the same CVT_f32_bf16 instruction, so both carry the same
// PTX/SM availability predicates. The FTZ form additionally requires doF32FTZ
// and is listed first so it is preferred when flush-to-zero is enabled.
3731 def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
3732 (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ, hasPTX<71>, hasSM<80>]>;
3733 def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
3734 (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
// Widening to f64 uses CvtNONE. Only the f32 source has an FTZ variant
// (CvtNONE_FTZ), gated on doF32FTZ and listed before the unguarded form.
3736 // fpextend f16 -> f64
3737 def : Pat<(f64 (fpextend (f16 Int16Regs:$a))),
3738 (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
3740 // fpextend bf16 -> f64
3741 def : Pat<(f64 (fpextend (bf16 Int16Regs:$a))),
3742 (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>;
3744 // fpextend f32 -> f64
3745 def : Pat<(f64 (fpextend Float32Regs:$a)),
3746 (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
3747 def : Pat<(f64 (fpextend Float32Regs:$a)),
3748 (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
// SelectionDAG node for NVPTXISD::RET_GLUE: no operands or results (SDTNone),
// chained, with optional incoming glue. Used as the pattern of the Return
// instruction.
3750 def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
3751 [SDNPHasChain, SDNPOptInGlue]>;
3753 // fceil, ffloor, froundeven, ftrunc.
// Maps a round-to-integral node onto a same-type cvt for f16, bf16, f32 and
// f64. Mode is the conversion mode used everywhere except f32 with
// flush-to-zero enabled, which uses ModeFTZ. The two f32 patterns are mutually
// exclusive through doF32FTZ / doNoF32FTZ.
3755 multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
3756 def : Pat<(OpNode (f16 Int16Regs:$a)),
3757 (CVT_f16_f16 Int16Regs:$a, Mode)>;
3758 def : Pat<(OpNode (bf16 Int16Regs:$a)),
3759 (CVT_bf16_bf16 Int16Regs:$a, Mode)>;
3760 def : Pat<(OpNode Float32Regs:$a),
3761 (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>;
3762 def : Pat<(OpNode Float32Regs:$a),
3763 (CVT_f32_f32 Float32Regs:$a, Mode)>, Requires<[doNoF32FTZ]>;
3764 def : Pat<(OpNode Float64Regs:$a),
3765 (CVT_f64_f64 Float64Regs:$a, Mode)>;
// ceil -> RPI, floor -> RMI, roundeven -> RNI, trunc -> RZI.
3768 defm : CVT_ROUND<fceil, CvtRPI, CvtRPI_FTZ>;
3769 defm : CVT_ROUND<ffloor, CvtRMI, CvtRMI_FTZ>;
3770 defm : CVT_ROUND<froundeven, CvtRNI, CvtRNI_FTZ>;
3771 defm : CVT_ROUND<ftrunc, CvtRZI, CvtRZI_FTZ>;
3773 // nearbyint and rint are implemented as rounding to nearest even. This isn't
3774 // strictly correct, because it causes us to ignore the rounding mode. But it
3775 // matches what CUDA's "libm" does.
3777 defm : CVT_ROUND<fnearbyint, CvtRNI, CvtRNI_FTZ>;
3778 defm : CVT_ROUND<frint, CvtRNI, CvtRNI_FTZ>;
3780 //-----------------------------------
// Control flow: return and branch instructions
3782 //-----------------------------------
3784 let isTerminator=1 in {
3785 let isReturn=1, isBarrier=1 in
3786 def Return : NVPTXInst<(outs), (ins), "ret;", [(retglue)]>;
// Conditional branch taken when predicate $a is true.
3789 def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
3790 "@$a bra \t$target;",
3791 [(brcond Int1Regs:$a, bb:$target)]>;
// Conditional branch taken when predicate $a is false ("@!$a"). It has no
// pattern of its own; it is selected by the setne pattern further down.
3793 def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
3794 "@!$a bra \t$target;", []>;
// Unconditional branch.
3796 let isBranch=1, isBarrier=1 in
3797 def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
3798 "bra.uni \t$target;", [(br bb:$target)]>;
// A branch on an i32 condition first materializes a predicate with
// "$a != 0" and then uses the ordinary predicated branch.
3801 def : Pat<(brcond (i32 Int32Regs:$a), bb:$target),
3802 (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
3804 // SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
3805 // conditional branch if the target block is the next block so that the code
3806 // can fall through to the target block. The inversion is done by 'xor
3807 // condition, 1', which will be translated to (setne condition, -1). Since ptx
3808 // supports '@!pred bra target', we should use it.
3809 def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
3810 (CBranchOther Int1Regs:$a, bb:$target)>;
// Type profiles and SelectionDAG nodes for call sequences and calls.
// callseq_start / callseq_end wrap ISD::CALLSEQ_START / ISD::CALLSEQ_END;
// both are chained and produce glue.
3813 def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
3815 def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
3817 def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
3818 [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
3819 def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
3820 [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
// NVPTXISD::CALL takes a single i32 operand and produces no result.
3823 def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
3824 def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
3825 [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
3826 def calltarget : Operand<i32>;
// CALL has no inline pattern; the two Pat records below select it for global
// addresses and external symbols.
3828 def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
3831 def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
3832 def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
3834 // Pseudo instructions.
// Pseudo adds nothing to NVPTXInst; it forwards all four arguments unchanged.
3835 class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
3836 : NVPTXInst<outs, ins, asmstr, pattern>;
// Call-sequence start marker: selected for callseq_start and printed as an
// opening brace followed by a "// callseq" comment carrying both amounts.
3839 NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
3840 "\\{ // callseq $amt1, $amt2",
3841 [(callseq_start timm:$amt1, timm:$amt2)]>;
// Call-sequence end marker: selected for callseq_end and printed as a closing
// brace followed by a "// callseq" comment carrying only $amt1.
3843 NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
3844 "\\} // callseq $amt1",
3845 [(callseq_end timm:$amt1, timm:$amt2)]>;
// Selected for the generic trap node; takes no operands.
3848 // Emit an `exit` as well to convey to ptxas that `trap` exits the CFG.
3849 // This won't be necessary in a future version of ptxas.
3850 def trapinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>;
3852 // Call prototype wrapper
// The NVPTXISD::CallPrototype node takes one integer operand and produces no
// result; it is chained, consumes and produces glue, and has side effects.
3853 def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
3855 SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
3856 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Operand printed through the printProtoIdent print method.
3857 def ProtoIdent : Operand<i32> {
3858 let PrintMethod = "printProtoIdent";
// The emitted text consists solely of the printed $ident operand.
3860 def CALL_PROTOTYPE :
3861 NVPTXInst<(outs), (ins ProtoIdent:$ident),
3862 "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
// Dynamic stack allocation.
// Type profile: one result with the same type as operand 1; operands 1 and 2
// are both integers. The patterns below bind them as $size and $align.
3864 def SDTDynAllocaOp :
3865 SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>;
3868 SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp,
3869 [SDNPHasChain, SDNPSideEffect]>;
// 32-bit form: emits alloca followed by a cvta.local applied in place to the
// same $ptr register. Gated on hasPTX<73> and hasSM<52>.
3871 def DYNAMIC_STACKALLOC32 :
3872 NVPTXInst<(outs Int32Regs:$ptr),
3873 (ins Int32Regs:$size, i32imm:$align),
3874 "alloca.u32 \t$ptr, $size, $align;\n\t"
3875 "cvta.local.u32 \t$ptr, $ptr;",
3876 [(set (i32 Int32Regs:$ptr), (dyn_alloca Int32Regs:$size, (i32 timm:$align)))]>,
3877 Requires<[hasPTX<73>, hasSM<52>]>;
// 64-bit form: same two-instruction sequence with .u64 and Int64Regs; the
// alignment stays an i32 immediate.
3879 def DYNAMIC_STACKALLOC64 :
3880 NVPTXInst<(outs Int64Regs:$ptr),
3881 (ins Int64Regs:$size, i32imm:$align),
3882 "alloca.u64 \t$ptr, $size, $align;\n\t"
3883 "cvta.local.u64 \t$ptr, $ptr;",
3884 [(set Int64Regs:$ptr, (dyn_alloca Int64Regs:$size, (i32 timm:$align)))]>,
3885 Requires<[hasPTX<73>, hasSM<52>]>;
3887 include "NVPTXIntrinsics.td"
3889 //-----------------------------------
3891 //-----------------------------------
3892 // BSWAP is currently expanded. The following would be more efficient:
3893 // - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit registers
3894 // - for sm_20, use prmt (use vector scalar mov to get the pack and
3895 // unpack). sm_20 supports native 32-bit register, but not native 16-bit
// NOTE(review): the patterns below do select bswap via INT_NVVM_PRMT, so the
// "currently expanded" remark above looks stale - confirm and update.
// i32: byte permute with selector 0x0123.
3899 (i32 (bswap i32:$a)),
3900 (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x0123))>;
// v2i16: byte permute with selector 0x2301.
3903 (v2i16 (bswap v2i16:$a)),
3904 (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x2301))>;
// i64: each 32-bit half is permuted with selector 0x0123; the permuted high
// half is listed first and the permuted low half second.
3907 (i64 (bswap i64:$a)),
3909 (INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)),
3910 (INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>;