1 //===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file describes the PTX instructions in TableGen format.
11 //===----------------------------------------------------------------------===//
13 include "NVPTXInstrFormats.td"
// Immediate operand classes for the f16 and bf16 value types. Both defs pick
// up OperandType = "OPERAND_IMMEDIATE" from the enclosing let.
15 let OperandType = "OPERAND_IMMEDIATE" in {
16 def f16imm : Operand<f16>;
17 def bf16imm : Operand<bf16>;
21 // List of vector specific properties
// Each def names one VecInstTypeEnum value. Values 1-7 are assigned
// consecutively; isVecOther uses 15, so 8-14 are not assigned in this list.
22 def isVecLD : VecInstTypeEnum<1>;
23 def isVecST : VecInstTypeEnum<2>;
24 def isVecBuild : VecInstTypeEnum<3>;
25 def isVecShuffle : VecInstTypeEnum<4>;
26 def isVecExtract : VecInstTypeEnum<5>;
27 def isVecInsert : VecInstTypeEnum<6>;
28 def isVecDest : VecInstTypeEnum<7>;
29 def isVecOther : VecInstTypeEnum<15>;
31 //===----------------------------------------------------------------------===//
32 // NVPTX Operand Definitions.
33 //===----------------------------------------------------------------------===//
// Branch-target operand (per its name); declared with value type OtherVT.
35 def brtarget : Operand<OtherVT>;
37 // CVT conversion modes
38 // These must match the enum in NVPTX.h
// Layout of the values below:
//   0x0-0x9  base modes (NONE, RNI, RZI, RMI, RPI, RN, RZ, RM, RP, RNA)
//   *_FTZ    base value + 0x10 (defined for 0x0-0x8 only; there is no RNA_FTZ)
//   CvtSAT   0x20, and CvtSAT_FTZ is 0x30 (0x20 + 0x10)
//   *_RELU   base value + 0x40 (defined for NONE, RN and RZ only)
// NOTE(review): this looks like a 4-bit base mode with flag bits FTZ=0x10,
// SAT=0x20, RELU=0x40 -- confirm against the enum in NVPTX.h.
39 def CvtNONE : PatLeaf<(i32 0x0)>;
40 def CvtRNI : PatLeaf<(i32 0x1)>;
41 def CvtRZI : PatLeaf<(i32 0x2)>;
42 def CvtRMI : PatLeaf<(i32 0x3)>;
43 def CvtRPI : PatLeaf<(i32 0x4)>;
44 def CvtRN : PatLeaf<(i32 0x5)>;
45 def CvtRZ : PatLeaf<(i32 0x6)>;
46 def CvtRM : PatLeaf<(i32 0x7)>;
47 def CvtRP : PatLeaf<(i32 0x8)>;
48 def CvtRNA : PatLeaf<(i32 0x9)>;
50 def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
51 def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
52 def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
53 def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
54 def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
55 def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
56 def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
57 def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
58 def CvtRP_FTZ : PatLeaf<(i32 0x18)>;
60 def CvtSAT : PatLeaf<(i32 0x20)>;
61 def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;
63 def CvtNONE_RELU : PatLeaf<(i32 0x40)>;
64 def CvtRN_RELU : PatLeaf<(i32 0x45)>;
65 def CvtRZ_RELU : PatLeaf<(i32 0x46)>;
// i32 operand class whose printing is delegated to "printCvtMode".
67 def CvtMode : Operand<i32> {
68 let PrintMethod = "printCvtMode";
// Comparison modes.
72 // These must match the enum in NVPTX.h
// Base modes are numbered 0-17. Each *_FTZ leaf is its base value + 0x100
// (e.g. CmpEQU = 10 = 0xA, CmpEQU_FTZ = 0x10A). No *_FTZ leaves are defined
// for CmpLO, CmpLS, CmpHI and CmpHS (values 6-9).
73 def CmpEQ : PatLeaf<(i32 0)>;
74 def CmpNE : PatLeaf<(i32 1)>;
75 def CmpLT : PatLeaf<(i32 2)>;
76 def CmpLE : PatLeaf<(i32 3)>;
77 def CmpGT : PatLeaf<(i32 4)>;
78 def CmpGE : PatLeaf<(i32 5)>;
79 def CmpLO : PatLeaf<(i32 6)>;
80 def CmpLS : PatLeaf<(i32 7)>;
81 def CmpHI : PatLeaf<(i32 8)>;
82 def CmpHS : PatLeaf<(i32 9)>;
83 def CmpEQU : PatLeaf<(i32 10)>;
84 def CmpNEU : PatLeaf<(i32 11)>;
85 def CmpLTU : PatLeaf<(i32 12)>;
86 def CmpLEU : PatLeaf<(i32 13)>;
87 def CmpGTU : PatLeaf<(i32 14)>;
88 def CmpGEU : PatLeaf<(i32 15)>;
89 def CmpNUM : PatLeaf<(i32 16)>;
90 def CmpNAN : PatLeaf<(i32 17)>;
92 def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
93 def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
94 def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
95 def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
96 def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
97 def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
98 def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
99 def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
100 def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
101 def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
102 def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
103 def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
104 def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
105 def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;
// i32 operand class whose printing is delegated to "printCmpMode".
107 def CmpMode : Operand<i32> {
108 let PrintMethod = "printCmpMode";
// i32 operand class whose printing is delegated to "printVecElement".
110 def VecElement : Operand<i32> {
111 let PrintMethod = "printVecElement";
// PRMT modes, numbered consecutively from 0x0 (PrmtNONE) to 0x6 (PrmtRC16).
115 // These must match the enum in NVPTX.h
116 def PrmtNONE : PatLeaf<(i32 0x0)>;
117 def PrmtF4E : PatLeaf<(i32 0x1)>;
118 def PrmtB4E : PatLeaf<(i32 0x2)>;
119 def PrmtRC8 : PatLeaf<(i32 0x3)>;
120 def PrmtECL : PatLeaf<(i32 0x4)>;
121 def PrmtECR : PatLeaf<(i32 0x5)>;
122 def PrmtRC16 : PatLeaf<(i32 0x6)>;
// i32 operand class whose printing is delegated to "printPrmtMode".
124 def PrmtMode : Operand<i32> {
125 let PrintMethod = "printPrmtMode";
129 //===----------------------------------------------------------------------===//
130 // NVPTX Instruction Predicate Definitions
131 //===----------------------------------------------------------------------===//
// Predicates that forward to a query method on Subtarget.
134 def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
135 def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
136 def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
137 def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
138 def hasVote : Predicate<"Subtarget->hasVote()">;
139 def hasDouble : Predicate<"Subtarget->hasDouble()">;
140 def hasLDG : Predicate<"Subtarget->hasLDG()">;
141 def hasLDU : Predicate<"Subtarget->hasLDU()">;
// f32 flush-to-zero selection and its negation.
143 def doF32FTZ : Predicate<"useF32FTZ()">;
144 def doNoF32FTZ : Predicate<"!useF32FTZ()">;
// NOTE(review): unlike its neighbours this predicate string is a bare
// identifier rather than a call -- presumably a variable visible where the
// predicate is evaluated; confirm.
146 def doMulWide : Predicate<"doMulWide">;
// FMA and unsafe-FP-math switches, each paired with its negation.
148 def allowFMA : Predicate<"allowFMA()">;
149 def noFMA : Predicate<"!allowFMA()">;
150 def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
151 def noUnsafeFPMath : Predicate<"!allowUnsafeFPMath()">;
// f32 division: level 0 enables the APPROX predicate, level 1 the FULL one.
153 def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
154 def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
// f32 sqrt: APPROX is the negation of usePrecSqrtF32(), RN is the positive.
156 def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
157 def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
159 def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
160 def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
// Predicate whose condition is the literal "true".
162 def True : Predicate<"true">;
// Parameterized predicates: the integer template argument is pasted onto a
// ">=" comparison against the subtarget's PTX / SM version number.
164 class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
165 class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>;
167 // non-sync shfl instructions are not available on sm_70+ in PTX6.4+
// The two adjacent string literals below form a single predicate expression:
// !(SmVersion >= 70 && PTXVersion >= 64).
168 def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
169 "&& Subtarget->getPTXVersion() >= 64)">;
171 def useShortPtr : Predicate<"useShortPointers()">;
172 def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
173 def hasBF16Math: Predicate<"Subtarget->hasBF16Math()">;
175 // Helper class to aid conversion between ValueType and a matching RegisterClass.
// The ValueType is cast to its string name and looked up in the !cond table;
// the selected register class is exposed as the field `ret`.
// Scalar and packed 16-bit types share integer register classes here:
// i16/f16/bf16 map to Int16Regs and v2i16/v2f16/v2bf16 map to Int32Regs.
177 class ValueToRegClass<ValueType T> {
178 string name = !cast<string>(T);
179 NVPTXRegClass ret = !cond(
180 !eq(name, "i1"): Int1Regs,
181 !eq(name, "i16"): Int16Regs,
182 !eq(name, "v2i16"): Int32Regs,
183 !eq(name, "i32"): Int32Regs,
184 !eq(name, "i64"): Int64Regs,
185 !eq(name, "f16"): Int16Regs,
186 !eq(name, "v2f16"): Int32Regs,
187 !eq(name, "bf16"): Int16Regs,
188 !eq(name, "v2bf16"): Int32Regs,
189 !eq(name, "f32"): Float32Regs,
190 !eq(name, "f64"): Float64Regs,
191 !eq(name, "ai32"): Int32ArgRegs,
192 !eq(name, "ai64"): Int64ArgRegs,
193 !eq(name, "af32"): Float32ArgRegs,
// NOTE(review): the three keys above follow an "a<type>" pattern; "if64"
// breaks it and may be a typo for "af64" -- confirm before relying on it.
194 !eq(name, "if64"): Float64ArgRegs,
199 //===----------------------------------------------------------------------===//
200 // Some Common Instruction Class Templates
201 //===----------------------------------------------------------------------===//
203 // Template for instructions which take three int64, int32, or int16 args.
204 // The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
205 multiclass I3<string OpcStr, SDNode OpNode> {
207 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
208 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
209 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
211 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
212 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
213 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
215 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
216 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
217 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
219 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
220 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
221 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;
223 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
224 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
225 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
227 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
228 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
229 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
// Binary operation on packed v2i16 values held in Int32Regs. The emitted
// mnemonic is "<OpcStr>16x2" (e.g. "add.s16x2" for OpcStr = "add.s").
// Selection is gated on a PTX version value >= 80 and an SM version value >= 90.
class I16x2<string OpcStr, SDNode OpNode>
  : NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # "16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)))]>,
    Requires<[hasPTX<80>, hasSM<90>]>;
238 // Template for instructions which take 3 int args. The instructions are
239 // named "<OpcStr>.s32" (e.g. "addc.cc.s32").
240 multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
241 let hasSideEffects = 1 in {
243 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
244 !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
245 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
247 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
248 !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
249 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;
251 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
252 !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
253 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>,
254 Requires<[hasPTX<43>]>;
256 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
257 !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
258 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>,
259 Requires<[hasPTX<43>]>;
263 // Template for instructions which take three fp64 or fp32 args. The
264 // instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
266 // Also defines ftz (flush subnormal inputs and results to sign-preserving
267 // zero) variants for fp32 functions.
269 // This multiclass should be used for nodes that cannot be folded into FMAs.
270 // For nodes that can be folded into FMAs (i.e. adds and muls), use
272 multiclass F3<string OpcStr, SDNode OpNode> {
274 NVPTXInst<(outs Float64Regs:$dst),
275 (ins Float64Regs:$a, Float64Regs:$b),
276 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
277 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
279 NVPTXInst<(outs Float64Regs:$dst),
280 (ins Float64Regs:$a, f64imm:$b),
281 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
282 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
284 NVPTXInst<(outs Float32Regs:$dst),
285 (ins Float32Regs:$a, Float32Regs:$b),
286 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
287 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
288 Requires<[doF32FTZ]>;
290 NVPTXInst<(outs Float32Regs:$dst),
291 (ins Float32Regs:$a, f32imm:$b),
292 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
293 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
294 Requires<[doF32FTZ]>;
296 NVPTXInst<(outs Float32Regs:$dst),
297 (ins Float32Regs:$a, Float32Regs:$b),
298 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
299 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
301 NVPTXInst<(outs Float32Regs:$dst),
302 (ins Float32Regs:$a, f32imm:$b),
303 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
304 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
307 NVPTXInst<(outs Int16Regs:$dst),
308 (ins Int16Regs:$a, Int16Regs:$b),
309 !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
310 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
311 Requires<[useFP16Math, doF32FTZ]>;
313 NVPTXInst<(outs Int16Regs:$dst),
314 (ins Int16Regs:$a, Int16Regs:$b),
315 !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
316 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
317 Requires<[useFP16Math]>;
320 NVPTXInst<(outs Int32Regs:$dst),
321 (ins Int32Regs:$a, Int32Regs:$b),
322 !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
323 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
324 Requires<[useFP16Math, doF32FTZ]>;
326 NVPTXInst<(outs Int32Regs:$dst),
327 (ins Int32Regs:$a, Int32Regs:$b),
328 !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
329 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
330 Requires<[useFP16Math]>;
332 NVPTXInst<(outs Int16Regs:$dst),
333 (ins Int16Regs:$a, Int16Regs:$b),
334 !strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"),
335 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
336 Requires<[hasBF16Math, doF32FTZ]>;
338 NVPTXInst<(outs Int16Regs:$dst),
339 (ins Int16Regs:$a, Int16Regs:$b),
340 !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
341 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
342 Requires<[hasBF16Math]>;
345 NVPTXInst<(outs Int32Regs:$dst),
346 (ins Int32Regs:$a, Int32Regs:$b),
347 !strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"),
348 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
349 Requires<[hasBF16Math, doF32FTZ]>;
351 NVPTXInst<(outs Int32Regs:$dst),
352 (ins Int32Regs:$a, Int32Regs:$b),
353 !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
354 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
355 Requires<[hasBF16Math]>;
358 // Template for instructions which take three FP args. The
359 // instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
361 // Also defines ftz (flush subnormal inputs and results to sign-preserving
362 // zero) variants for fp32/fp16 functions.
364 // This multiclass should be used for nodes that can be folded to make fma ops.
365 // In this case, we use the ".rn" variant when FMA is disabled, as this behaves
366 // just like the non ".rn" op, but prevents ptxas from creating FMAs.
367 multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
369 NVPTXInst<(outs Float64Regs:$dst),
370 (ins Float64Regs:$a, Float64Regs:$b),
371 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
372 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
373 Requires<[allowFMA]>;
375 NVPTXInst<(outs Float64Regs:$dst),
376 (ins Float64Regs:$a, f64imm:$b),
377 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
378 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
379 Requires<[allowFMA]>;
381 NVPTXInst<(outs Float32Regs:$dst),
382 (ins Float32Regs:$a, Float32Regs:$b),
383 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
384 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
385 Requires<[allowFMA, doF32FTZ]>;
387 NVPTXInst<(outs Float32Regs:$dst),
388 (ins Float32Regs:$a, f32imm:$b),
389 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
390 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
391 Requires<[allowFMA, doF32FTZ]>;
393 NVPTXInst<(outs Float32Regs:$dst),
394 (ins Float32Regs:$a, Float32Regs:$b),
395 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
396 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
397 Requires<[allowFMA]>;
399 NVPTXInst<(outs Float32Regs:$dst),
400 (ins Float32Regs:$a, f32imm:$b),
401 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
402 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
403 Requires<[allowFMA]>;
406 NVPTXInst<(outs Int16Regs:$dst),
407 (ins Int16Regs:$a, Int16Regs:$b),
408 !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
409 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
410 Requires<[useFP16Math, allowFMA, doF32FTZ]>;
412 NVPTXInst<(outs Int16Regs:$dst),
413 (ins Int16Regs:$a, Int16Regs:$b),
414 !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
415 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
416 Requires<[useFP16Math, allowFMA]>;
419 NVPTXInst<(outs Int32Regs:$dst),
420 (ins Int32Regs:$a, Int32Regs:$b),
421 !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
422 [(set (v2f16 Int32Regs:$dst), (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
423 Requires<[useFP16Math, allowFMA, doF32FTZ]>;
425 NVPTXInst<(outs Int32Regs:$dst),
426 (ins Int32Regs:$a, Int32Regs:$b),
427 !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
428 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
429 Requires<[useFP16Math, allowFMA]>;
431 NVPTXInst<(outs Int16Regs:$dst),
432 (ins Int16Regs:$a, Int16Regs:$b),
433 !strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"),
434 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
435 Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
437 NVPTXInst<(outs Int16Regs:$dst),
438 (ins Int16Regs:$a, Int16Regs:$b),
439 !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
440 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
441 Requires<[hasBF16Math, allowFMA]>;
444 NVPTXInst<(outs Int32Regs:$dst),
445 (ins Int32Regs:$a, Int32Regs:$b),
446 !strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"),
447 [(set (v2bf16 Int32Regs:$dst), (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
448 Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
450 NVPTXInst<(outs Int32Regs:$dst),
451 (ins Int32Regs:$a, Int32Regs:$b),
452 !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
453 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
454 Requires<[hasBF16Math, allowFMA]>;
455 // These have strange names so we don't perturb existing mir tests.
457 NVPTXInst<(outs Float64Regs:$dst),
458 (ins Float64Regs:$a, Float64Regs:$b),
459 !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
460 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
463 NVPTXInst<(outs Float64Regs:$dst),
464 (ins Float64Regs:$a, f64imm:$b),
465 !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
466 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
469 NVPTXInst<(outs Float32Regs:$dst),
470 (ins Float32Regs:$a, Float32Regs:$b),
471 !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
472 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
473 Requires<[noFMA, doF32FTZ]>;
475 NVPTXInst<(outs Float32Regs:$dst),
476 (ins Float32Regs:$a, f32imm:$b),
477 !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
478 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
479 Requires<[noFMA, doF32FTZ]>;
481 NVPTXInst<(outs Float32Regs:$dst),
482 (ins Float32Regs:$a, Float32Regs:$b),
483 !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
484 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
487 NVPTXInst<(outs Float32Regs:$dst),
488 (ins Float32Regs:$a, f32imm:$b),
489 !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
490 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
493 NVPTXInst<(outs Int16Regs:$dst),
494 (ins Int16Regs:$a, Int16Regs:$b),
495 !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),
496 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
497 Requires<[useFP16Math, noFMA, doF32FTZ]>;
499 NVPTXInst<(outs Int16Regs:$dst),
500 (ins Int16Regs:$a, Int16Regs:$b),
501 !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
502 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
503 Requires<[useFP16Math, noFMA]>;
505 NVPTXInst<(outs Int32Regs:$dst),
506 (ins Int32Regs:$a, Int32Regs:$b),
507 !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
508 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
509 Requires<[useFP16Math, noFMA, doF32FTZ]>;
511 NVPTXInst<(outs Int32Regs:$dst),
512 (ins Int32Regs:$a, Int32Regs:$b),
513 !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
514 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
515 Requires<[useFP16Math, noFMA]>;
517 NVPTXInst<(outs Int16Regs:$dst),
518 (ins Int16Regs:$a, Int16Regs:$b),
519 !strconcat(OpcStr, ".rn.ftz.bf16 \t$dst, $a, $b;"),
520 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
521 Requires<[hasBF16Math, noFMA, doF32FTZ]>;
523 NVPTXInst<(outs Int16Regs:$dst),
524 (ins Int16Regs:$a, Int16Regs:$b),
525 !strconcat(OpcStr, ".rn.bf16 \t$dst, $a, $b;"),
526 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
527 Requires<[hasBF16Math, noFMA]>;
528 def _rnbf16x2rr_ftz :
529 NVPTXInst<(outs Int32Regs:$dst),
530 (ins Int32Regs:$a, Int32Regs:$b),
531 !strconcat(OpcStr, ".rn.ftz.bf16x2 \t$dst, $a, $b;"),
532 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
533 Requires<[hasBF16Math, noFMA, doF32FTZ]>;
535 NVPTXInst<(outs Int32Regs:$dst),
536 (ins Int32Regs:$a, Int32Regs:$b),
537 !strconcat(OpcStr, ".rn.bf16x2 \t$dst, $a, $b;"),
538 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
539 Requires<[hasBF16Math, noFMA]>;
542 // Template for unary f32 or f64 operations: each instruction has one
543 // destination and one source operand. Provides three instructions:
544 // <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush subnormal inputs and results to zero).
545 multiclass F2<string OpcStr, SDNode OpNode> {
546 def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
547 !strconcat(OpcStr, ".f64 \t$dst, $a;"),
548 [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
// The .ftz form is only available under the doF32FTZ predicate; the plain
// .f32 form that follows carries no predicate.
549 def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
550 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
551 [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
552 Requires<[doF32FTZ]>;
553 def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
554 !strconcat(OpcStr, ".f32 \t$dst, $a;"),
555 [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
558 //===----------------------------------------------------------------------===//
559 // NVPTX Instructions.
560 //===----------------------------------------------------------------------===//
562 //-----------------------------------
564 //-----------------------------------
566 let hasSideEffects = false in {
567 // Generate a cvt to the given type from all possible types. Each instance
568 // takes a CvtMode immediate that defines the conversion mode to use. It can
569 // be CvtNONE to omit a conversion mode.
570 multiclass CVT_FROM_ALL<string ToType, RegisterClass RC, list<Predicate> Preds = []> {
572 NVPTXInst<(outs RC:$dst),
573 (ins Int16Regs:$src, CvtMode:$mode),
574 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
575 ToType, ".s8 \t$dst, $src;"), []>,
578 NVPTXInst<(outs RC:$dst),
579 (ins Int16Regs:$src, CvtMode:$mode),
580 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
581 ToType, ".u8 \t$dst, $src;"), []>,
584 NVPTXInst<(outs RC:$dst),
585 (ins Int16Regs:$src, CvtMode:$mode),
586 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
587 ToType, ".s16 \t$dst, $src;"), []>,
590 NVPTXInst<(outs RC:$dst),
591 (ins Int16Regs:$src, CvtMode:$mode),
592 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
593 ToType, ".u16 \t$dst, $src;"), []>,
596 NVPTXInst<(outs RC:$dst),
597 (ins Int32Regs:$src, CvtMode:$mode),
598 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
599 ToType, ".s32 \t$dst, $src;"), []>,
602 NVPTXInst<(outs RC:$dst),
603 (ins Int32Regs:$src, CvtMode:$mode),
604 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
605 ToType, ".u32 \t$dst, $src;"), []>,
608 NVPTXInst<(outs RC:$dst),
609 (ins Int64Regs:$src, CvtMode:$mode),
610 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
611 ToType, ".s64 \t$dst, $src;"), []>,
614 NVPTXInst<(outs RC:$dst),
615 (ins Int64Regs:$src, CvtMode:$mode),
616 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
617 ToType, ".u64 \t$dst, $src;"), []>,
620 NVPTXInst<(outs RC:$dst),
621 (ins Int16Regs:$src, CvtMode:$mode),
622 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
623 ToType, ".f16 \t$dst, $src;"), []>,
626 NVPTXInst<(outs RC:$dst),
627 (ins Int16Regs:$src, CvtMode:$mode),
628 !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
629 ToType, ".bf16 \t$dst, $src;"), []>,
630 Requires<!if(!eq(ToType, "f32"),
631 // bf16->f32 was introduced early.
632 [hasPTX<71>, hasSM<80>],
633 // bf16->everything else needs sm90/ptx78
634 [hasPTX<78>, hasSM<90>])>;
636 NVPTXInst<(outs RC:$dst),
637 (ins Float32Regs:$src, CvtMode:$mode),
638 !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
639 ToType, ".f32 \t$dst, $src;"), []>,
640 Requires<!if(!eq(ToType, "bf16"),
641 // f32->bf16 was introduced early.
642 [hasPTX<70>, hasSM<80>],
645 NVPTXInst<(outs RC:$dst),
646 (ins Float64Regs:$src, CvtMode:$mode),
647 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
648 ToType, ".f64 \t$dst, $src;"), []>,
652 // Generate cvts from all types to all types.
// One CVT_FROM_ALL instantiation per destination type. The first argument is
// the destination type suffix, the second the destination register class:
// 8- and 16-bit integers, f16 and bf16 all use Int16Regs. CVT_bf16 is the only
// instantiation that passes an explicit predicate list.
653 defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>;
654 defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
655 defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
656 defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
657 defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
658 defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
659 defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
660 defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
661 defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
662 defm CVT_bf16 : CVT_FROM_ALL<"bf16", Int16Regs, [hasPTX<78>, hasSM<90>]>;
663 defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
664 defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
666 // These cvts are different from those above: The source and dest registers
667 // are of the same type.
// Naming is CVT_INREG_<dst type>_<src type>, matching the "cvt.<dst>.<src>"
// mnemonic. These take no mode operand and carry no selection pattern ([]).
668 def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
669 "cvt.s16.s8 \t$dst, $src;", []>;
670 def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
671 "cvt.s32.s8 \t$dst, $src;", []>;
672 def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
673 "cvt.s32.s16 \t$dst, $src;", []>;
674 def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
675 "cvt.s64.s8 \t$dst, $src;", []>;
676 def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
677 "cvt.s64.s16 \t$dst, $src;", []>;
678 def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
679 "cvt.s64.s32 \t$dst, $src;", []>;
681 multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> {
683 NVPTXInst<(outs RC:$dst),
684 (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
685 !strconcat("cvt${mode:base}${mode:relu}.",
686 FromName, ".f32 \t$dst, $src1, $src2;"), []>,
687 Requires<[hasPTX<70>, hasSM<80>]>;
// Instantiate CVT_FROM_FLOAT_V2_SM80 for the packed f16x2 and bf16x2 result
// types; both results are held in Int32Regs.
690 defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>;
691 defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
694 //-----------------------------------
695 // Selection instructions (selp)
696 //-----------------------------------
698 // TODO: Missing slct
700 // selp instructions that don't have any pattern matches; we explicitly use
701 // them within this file.
702 let hasSideEffects = false in {
// selp without selection patterns (every def passes []). Defines the four
// operand-form combinations for $a/$b: rr (reg, reg), ri (reg, imm),
// ir (imm, reg) and ii (imm, imm). $p is always an Int1Regs predicate.
//   TypeStr - type suffix placed after "selp."
//   RC      - register class for $dst and register sources
//   ImmCls  - operand class used for immediate sources
703 multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
704 def rr : NVPTXInst<(outs RC:$dst),
705 (ins RC:$a, RC:$b, Int1Regs:$p),
706 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
707 def ri : NVPTXInst<(outs RC:$dst),
708 (ins RC:$a, ImmCls:$b, Int1Regs:$p),
709 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
710 def ir : NVPTXInst<(outs RC:$dst),
711 (ins ImmCls:$a, RC:$b, Int1Regs:$p),
712 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
713 def ii : NVPTXInst<(outs RC:$dst),
714 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
715 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
718 multiclass SELP_PATTERN<string TypeStr, ValueType T, RegisterClass RC,
719 Operand ImmCls, SDNode ImmNode> {
721 NVPTXInst<(outs RC:$dst),
722 (ins RC:$a, RC:$b, Int1Regs:$p),
723 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
724 [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T RC:$b)))]>;
726 NVPTXInst<(outs RC:$dst),
727 (ins RC:$a, ImmCls:$b, Int1Regs:$p),
728 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
729 [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T ImmNode:$b)))]>;
731 NVPTXInst<(outs RC:$dst),
732 (ins ImmCls:$a, RC:$b, Int1Regs:$p),
733 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
734 [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, (T RC:$b)))]>;
736 NVPTXInst<(outs RC:$dst),
737 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
738 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
739 [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
// selp instantiations. The b16/b32/b64, f16, bf16, f32 and f64 variants use
// SELP_PATTERN; the signed/unsigned integer variants use the pattern-less
// SELP. SELP_f16 and SELP_bf16 are emitted with the "b16" type string.
743 // Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
745 defm SELP_b16 : SELP_PATTERN<"b16", i16, Int16Regs, i16imm, imm>;
746 defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
747 defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
748 defm SELP_b32 : SELP_PATTERN<"b32", i32, Int32Regs, i32imm, imm>;
749 defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
750 defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
751 defm SELP_b64 : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>;
752 defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
753 defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
754 defm SELP_f16 : SELP_PATTERN<"b16", f16, Int16Regs, f16imm, fpimm>;
755 defm SELP_bf16 : SELP_PATTERN<"b16", bf16, Int16Regs, bf16imm, fpimm>;
757 defm SELP_f32 : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>;
758 defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
760 // This does not work as tablegen fails to infer the type of 'imm'.
761 // def v2f16imm : Operand<v2f16>;
762 // defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;
// For each vector type held in Int32Regs, map a register/register select
// onto SELP_b32rr via a standalone pattern.
764 foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
765 def : Pat<(vt (select Int1Regs:$p, (vt Int32Regs:$a), (vt Int32Regs:$b))),
766 (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
769 //-----------------------------------
771 //-----------------------------------
773 def TESTINF_f32r : NVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a),
774 "testp.infinite.f32 \t$p, $a;",
776 def TESTINF_f32i : NVPTXInst<(outs Int1Regs:$p), (ins f32imm:$a),
777 "testp.infinite.f32 \t$p, $a;",
779 def TESTINF_f64r : NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
780 "testp.infinite.f64 \t$p, $a;",
782 def TESTINF_f64i : NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a),
783 "testp.infinite.f64 \t$p, $a;",
786 //-----------------------------------
787 // Integer Arithmetic
788 //-----------------------------------
790 // Template for xor masquerading as int1 arithmetic.
// Whatever OpNode is supplied, both variants emit "xor.pred": _rr takes two
// Int1Regs sources, _ri takes an Int1Regs source and an i1 immediate.
791 multiclass ADD_SUB_i1<SDNode OpNode> {
792 def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
793 "xor.pred \t$dst, $a, $b;",
794 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
795 def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
796 "xor.pred \t$dst, $a, $b;",
797 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
800 // int1 addition and subtraction are both just xor.
801 defm ADD_i1 : ADD_SUB_i1<add>;
802 defm SUB_i1 : ADD_SUB_i1<sub>;
804 // int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
805 // also use these for unsigned arithmetic.
806 defm ADD : I3<"add.s", add>;
807 defm SUB : I3<"sub.s", sub>;
// Packed 16x2 forms of add and sub.
809 def ADD16x2 : I16x2<"add.s", add>;
810 def SUB16x2 : I16x2<"sub.s", sub>;
812 // int32 and int64 addition and subtraction with carry-out.
813 defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
814 defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;
816 // int32 and int64 addition and subtraction with carry-in and carry-out.
817 defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
818 defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;
// Multiplication: mul.lo for the mul node, mul.hi.s / mul.hi.u for the
// mulhs / mulhu nodes.
820 defm MULT : I3<"mul.lo.s", mul>;
822 defm MULTHS : I3<"mul.hi.s", mulhs>;
823 defm MULTHU : I3<"mul.hi.u", mulhu>;
// Signed and unsigned division.
825 defm SDIV : I3<"div.s", sdiv>;
826 defm UDIV : I3<"div.u", udiv>;
828 // The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
830 defm SREM : I3<"rem.s", srem>;
831 defm UREM : I3<"rem.u", urem>;
833 // Integer absolute value. NumBits should be one minus the bit width of RC.
834 // This idiom implements the algorithm at
835 // http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs.
// NOTE(review): the text above appears stale. This multiclass has no NumBits
// parameter and does not expand a bit-twiddling sequence: it defines a single
// anonymous instruction that matches the `abs` node on type T and emits
// "abs<SizeName>" (e.g. "abs.s32").
//   T        - value type matched by the pattern
//   RC       - register class of $dst and $a
//   SizeName - type suffix appended directly to "abs" (includes the dot)
836 multiclass ABS<ValueType T, RegisterClass RC, string SizeName> {
837 def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
838 !strconcat("abs", SizeName, " \t$dst, $a;"),
839 [(set (T RC:$dst), (abs (T RC:$a)))]>;
// Absolute value for 16-, 32- and 64-bit integers.
841 defm ABS_16 : ABS<i16, Int16Regs, ".s16">;
842 defm ABS_32 : ABS<i32, Int32Regs, ".s32">;
843 defm ABS_64 : ABS<i64, Int64Regs, ".s64">;
// Signed and unsigned integer max/min via the I3 template.
846 defm SMAX : I3<"max.s", smax>;
847 defm UMAX : I3<"max.u", umax>;
848 defm SMIN : I3<"min.s", smin>;
849 defm UMIN : I3<"min.u", umin>;
// Packed 16x2 forms of the same four operations.
851 def SMAX16x2 : I16x2<"max.s", smax>;
852 def UMAX16x2 : I16x2<"max.u", umax>;
853 def SMIN16x2 : I16x2<"min.s", smin>;
854 def UMIN16x2 : I16x2<"min.u", umin>;
858 // Wide multiplication
861 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
862 "mul.wide.s32 \t$dst, $a, $b;", []>;
864 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
865 "mul.wide.s32 \t$dst, $a, $b;", []>;
866 def MULWIDES64Imm64 :
867 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
868 "mul.wide.s32 \t$dst, $a, $b;", []>;
871 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
872 "mul.wide.u32 \t$dst, $a, $b;", []>;
874 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
875 "mul.wide.u32 \t$dst, $a, $b;", []>;
876 def MULWIDEU64Imm64 :
877 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
878 "mul.wide.u32 \t$dst, $a, $b;", []>;
881 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
882 "mul.wide.s16 \t$dst, $a, $b;", []>;
884 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
885 "mul.wide.s16 \t$dst, $a, $b;", []>;
886 def MULWIDES32Imm32 :
887 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
888 "mul.wide.s16 \t$dst, $a, $b;", []>;
891 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
892 "mul.wide.u16 \t$dst, $a, $b;", []>;
894 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
895 "mul.wide.u16 \t$dst, $a, $b;", []>;
896 def MULWIDEU32Imm32 :
897 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
898 "mul.wide.u16 \t$dst, $a, $b;", []>;
// Type profile shared by the widening-multiply nodes: one result and two
// operands, with both operands constrained to the same type.
900 def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
// SelectionDAG nodes for NVPTXISD::MUL_WIDE_SIGNED and NVPTXISD::MUL_WIDE_UNSIGNED.
901 def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
902 def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
904 // Matchers for signed, unsigned mul.wide ISD nodes.
// Every pattern in this group is enabled only when the doMulWide predicate
// holds. Each width has a register-register form and a register-immediate form.
//
// i16 x i16 -> i32.
905 def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
906 (MULWIDES32 i16:$a, i16:$b)>,
907 Requires<[doMulWide]>;
908 def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
909 (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
910 Requires<[doMulWide]>;
911 def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
912 (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
913 Requires<[doMulWide]>;
914 def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
915 (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
916 Requires<[doMulWide]>;
// i32 x i32 -> i64.
918 def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
919 (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
920 Requires<[doMulWide]>;
921 def : Pat<(i64 (mul_wide_signed (i32 Int32Regs:$a), imm:$b)),
922 (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
923 Requires<[doMulWide]>;
924 def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
925 (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
926 Requires<[doMulWide]>;
927 def : Pat<(i64 (mul_wide_unsigned (i32 Int32Regs:$a), imm:$b)),
928 (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
929 Requires<[doMulWide]>;
931 // Predicates used for converting some patterns to mul.wide.
932 def SInt32Const : PatLeaf<(imm), [{
933 const APInt &v = N->getAPIntValue();
934 return v.isSignedIntN(32);
937 def UInt32Const : PatLeaf<(imm), [{
938 const APInt &v = N->getAPIntValue();
942 def SInt16Const : PatLeaf<(imm), [{
943 const APInt &v = N->getAPIntValue();
944 return v.isSignedIntN(16);
947 def UInt16Const : PatLeaf<(imm), [{
948 const APInt &v = N->getAPIntValue();
952 def IntConst_0_30 : PatLeaf<(imm), [{
953 // Check if 0 <= v < 31; only then will the result of (x << v) be an int32.
954 const APInt &v = N->getAPIntValue();
955 return v.sge(0) && v.slt(31);
958 def IntConst_0_14 : PatLeaf<(imm), [{
959 // Check if 0 <= v < 15; only then will the result of (x << v) be an int16.
960 const APInt &v = N->getAPIntValue();
961 return v.sge(0) && v.slt(15);
964 def SHL2MUL32 : SDNodeXForm<imm, [{
965 const APInt &v = N->getAPIntValue();
967 return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
970 def SHL2MUL16 : SDNodeXForm<imm, [{
971 const APInt &v = N->getAPIntValue();
973 return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
976 // Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
// The shift amount must satisfy IntConst_0_30 (32-bit source) or IntConst_0_14
// (16-bit source); the SHL2MUL32 / SHL2MUL16 transforms turn it into the
// immediate multiplier operand. All four patterns require doMulWide.
977 def : Pat<(shl (sext Int32Regs:$a), (i32 IntConst_0_30:$b)),
978 (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
979 Requires<[doMulWide]>;
980 def : Pat<(shl (zext Int32Regs:$a), (i32 IntConst_0_30:$b)),
981 (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
982 Requires<[doMulWide]>;
984 def : Pat<(shl (sext Int16Regs:$a), (i16 IntConst_0_14:$b)),
985 (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
986 Requires<[doMulWide]>;
987 def : Pat<(shl (zext Int16Regs:$a), (i16 IntConst_0_14:$b)),
988 (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
989 Requires<[doMulWide]>;
991 // Convert "sign/zero-extend then multiply" to mul.wide.
// Register-register forms require both operands to be extended the same way.
// Register-immediate forms accept a constant filtered by the SInt32Const /
// UInt32Const / SInt16Const / UInt16Const leaves and select the *Imm64 / *Imm32
// instruction variants, whose immediate operand has the width of the result.
// All patterns require doMulWide.
//
// i32 operands, i64 result (signed).
992 def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
993 (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
994 Requires<[doMulWide]>;
995 def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
996 (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
997 Requires<[doMulWide]>;
// i32 operands, i64 result (unsigned).
999 def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
1000 (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
1001 Requires<[doMulWide]>;
1002 def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
1003 (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
1004 Requires<[doMulWide]>;
// i16 operands, i32 result (signed).
1006 def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
1007 (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
1008 Requires<[doMulWide]>;
1009 def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
1010 (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
1011 Requires<[doMulWide]>;
// i16 operands, i32 result (unsigned).
1013 def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
1014 (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
1015 Requires<[doMulWide]>;
1016 def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
1017 (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
1018 Requires<[doMulWide]>;
1021 // Integer multiply-add
1024 SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
1025 SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
1026 def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
1029 NVPTXInst<(outs Int16Regs:$dst),
1030 (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
1031 "mad.lo.s16 \t$dst, $a, $b, $c;",
1032 [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
1034 NVPTXInst<(outs Int16Regs:$dst),
1035 (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
1036 "mad.lo.s16 \t$dst, $a, $b, $c;",
1037 [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
1039 NVPTXInst<(outs Int16Regs:$dst),
1040 (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
1041 "mad.lo.s16 \t$dst, $a, $b, $c;",
1042 [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
1044 NVPTXInst<(outs Int16Regs:$dst),
1045 (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
1046 "mad.lo.s16 \t$dst, $a, $b, $c;",
1047 [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
1050 NVPTXInst<(outs Int32Regs:$dst),
1051 (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
1052 "mad.lo.s32 \t$dst, $a, $b, $c;",
1053 [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>;
1055 NVPTXInst<(outs Int32Regs:$dst),
1056 (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
1057 "mad.lo.s32 \t$dst, $a, $b, $c;",
1058 [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), imm:$c))]>;
1060 NVPTXInst<(outs Int32Regs:$dst),
1061 (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
1062 "mad.lo.s32 \t$dst, $a, $b, $c;",
1063 [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), imm:$b, (i32 Int32Regs:$c)))]>;
1065 NVPTXInst<(outs Int32Regs:$dst),
1066 (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
1067 "mad.lo.s32 \t$dst, $a, $b, $c;",
1068 [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), imm:$b, imm:$c))]>;
1071 NVPTXInst<(outs Int64Regs:$dst),
1072 (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
1073 "mad.lo.s64 \t$dst, $a, $b, $c;",
1074 [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
1076 NVPTXInst<(outs Int64Regs:$dst),
1077 (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
1078 "mad.lo.s64 \t$dst, $a, $b, $c;",
1079 [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
1081 NVPTXInst<(outs Int64Regs:$dst),
1082 (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
1083 "mad.lo.s64 \t$dst, $a, $b, $c;",
1084 [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
1086 NVPTXInst<(outs Int64Regs:$dst),
1087 (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
1088 "mad.lo.s64 \t$dst, $a, $b, $c;",
1089 [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
1092 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
1093 "neg.s16 \t$dst, $src;",
1094 [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
1096 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
1097 "neg.s32 \t$dst, $src;",
1098 [(set (i32 Int32Regs:$dst), (ineg (i32 Int32Regs:$src)))]>;
1100 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
1101 "neg.s64 \t$dst, $src;",
1102 [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
1104 //-----------------------------------
1105 // Floating Point Arithmetic
1106 //-----------------------------------
1109 def FloatConst1 : PatLeaf<(fpimm), [{
1110 return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
1111 N->getValueAPF().convertToFloat() == 1.0f;
1113 // Constant 1.0 (double)
1114 def DoubleConst1 : PatLeaf<(fpimm), [{
1115 return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
1116 N->getValueAPF().convertToDouble() == 1.0;
1119 // Loads FP16 constant into a register.
1121 // ptxas does not have hex representation for fp16, so we can't use
1122 // fp16 immediate values in .f16 instructions. Instead we have to load
1123 // the constant into a register using mov.b16.
1124 def LOAD_CONST_F16 :
1125 NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a),
1126 "mov.b16 \t$dst, $a;", []>;
1127 def LOAD_CONST_BF16 :
1128 NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a),
1129 "mov.b16 \t$dst, $a;", []>;
1130 defm FADD : F3_fma_component<"add", fadd>;
1131 defm FSUB : F3_fma_component<"sub", fsub>;
1132 defm FMUL : F3_fma_component<"mul", fmul>;
1134 defm FMIN : F3<"min", fminnum>;
1135 defm FMAX : F3<"max", fmaxnum>;
1136 // Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
1137 defm FMINNAN : F3<"min.NaN", fminimum>;
1138 defm FMAXNAN : F3<"max.NaN", fmaximum>;
1140 defm FABS : F2<"abs", fabs>;
1141 defm FNEG : F2<"neg", fneg>;
1142 defm FSQRT : F2<"sqrt.rn", fsqrt>;
// Negation of f16 / v2f16 values.
//   OpcStr - PTX mnemonic printed before the "$dst, $src" operands.
//   T      - value type matched by the fneg pattern.
//   RC     - register class of both the source and the destination.
//   Pred   - extra predicate, required in addition to useFP16Math,
//            hasPTX<60> and hasSM<53>.
1147 class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
1148 NVPTXInst<(outs RC:$dst), (ins RC:$src),
1149 !strconcat(OpcStr, " \t$dst, $src;"),
1150 [(set RC:$dst, (fneg (T RC:$src)))]>,
1151 Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>;
// The _ftz variants are predicated on doF32FTZ; the plain variants pass True.
1152 def FNEG16_ftz : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>;
1153 def FNEG16 : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>;
1154 def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
1155 def FNEG16x2 : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>;
// Negation of bf16 / v2bf16 values.
//   OpcStr - PTX mnemonic printed before the "$dst, $src" operands.
//   T      - value type matched by the fneg pattern.
//   RC     - register class of both the source and the destination.
//   Pred   - extra predicate, required in addition to hasBF16Math,
//            hasPTX<70> and hasSM<80>.
1161 class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
1162 NVPTXInst<(outs RC:$dst), (ins RC:$src),
1163 !strconcat(OpcStr, " \t$dst, $src;"),
1164 [(set RC:$dst, (fneg (T RC:$src)))]>,
1165 Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, Pred]>;
// The _ftz variants are predicated on doF32FTZ; the plain variants pass True.
1166 def BFNEG16_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
1167 def BFNEG16 : FNEG_BF16_F16X2<"neg.bf16", bf16, Int16Regs, True>;
1168 def BFNEG16x2_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
1169 def BFNEG16x2 : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>;
1175 NVPTXInst<(outs Float64Regs:$dst),
1176 (ins f64imm:$a, Float64Regs:$b),
1177 "rcp.rn.f64 \t$dst, $b;",
1178 [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
1180 NVPTXInst<(outs Float64Regs:$dst),
1181 (ins Float64Regs:$a, Float64Regs:$b),
1182 "div.rn.f64 \t$dst, $a, $b;",
1183 [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
1185 NVPTXInst<(outs Float64Regs:$dst),
1186 (ins Float64Regs:$a, f64imm:$b),
1187 "div.rn.f64 \t$dst, $a, $b;",
1188 [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
1191 // F32 Approximate reciprocal
1194 NVPTXInst<(outs Float32Regs:$dst),
1195 (ins f32imm:$a, Float32Regs:$b),
1196 "rcp.approx.ftz.f32 \t$dst, $b;",
1197 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1198 Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1200 NVPTXInst<(outs Float32Regs:$dst),
1201 (ins f32imm:$a, Float32Regs:$b),
1202 "rcp.approx.f32 \t$dst, $b;",
1203 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1204 Requires<[do_DIVF32_APPROX]>;
1206 // F32 Approximate division
// All four definitions require do_DIVF32_APPROX. The _ftz forms additionally
// require doF32FTZ and print div.approx.ftz.f32; the others print
// div.approx.f32. "rr" takes the divisor in a register, "ri" as an f32
// immediate.
1208 def FDIV32approxrr_ftz :
1209 NVPTXInst<(outs Float32Regs:$dst),
1210 (ins Float32Regs:$a, Float32Regs:$b),
1211 "div.approx.ftz.f32 \t$dst, $a, $b;",
1212 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1213 Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1214 def FDIV32approxri_ftz :
1215 NVPTXInst<(outs Float32Regs:$dst),
1216 (ins Float32Regs:$a, f32imm:$b),
1217 "div.approx.ftz.f32 \t$dst, $a, $b;",
1218 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1219 Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1220 def FDIV32approxrr :
1221 NVPTXInst<(outs Float32Regs:$dst),
1222 (ins Float32Regs:$a, Float32Regs:$b),
1223 "div.approx.f32 \t$dst, $a, $b;",
1224 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1225 Requires<[do_DIVF32_APPROX]>;
1226 def FDIV32approxri :
1227 NVPTXInst<(outs Float32Regs:$dst),
1228 (ins Float32Regs:$a, f32imm:$b),
1229 "div.approx.f32 \t$dst, $a, $b;",
1230 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1231 Requires<[do_DIVF32_APPROX]>;
1233 // F32 Semi-accurate reciprocal
1235 // rcp.approx gives the same result as div.full(1.0f, a) and is faster.
1237 def FDIV321r_approx_ftz :
1238 NVPTXInst<(outs Float32Regs:$dst),
1239 (ins f32imm:$a, Float32Regs:$b),
1240 "rcp.approx.ftz.f32 \t$dst, $b;",
1241 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1242 Requires<[do_DIVF32_FULL, doF32FTZ]>;
1243 def FDIV321r_approx :
1244 NVPTXInst<(outs Float32Regs:$dst),
1245 (ins f32imm:$a, Float32Regs:$b),
1246 "rcp.approx.f32 \t$dst, $b;",
1247 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1248 Requires<[do_DIVF32_FULL]>;
1250 // F32 Semi-accurate division
1253 NVPTXInst<(outs Float32Regs:$dst),
1254 (ins Float32Regs:$a, Float32Regs:$b),
1255 "div.full.ftz.f32 \t$dst, $a, $b;",
1256 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1257 Requires<[do_DIVF32_FULL, doF32FTZ]>;
1259 NVPTXInst<(outs Float32Regs:$dst),
1260 (ins Float32Regs:$a, f32imm:$b),
1261 "div.full.ftz.f32 \t$dst, $a, $b;",
1262 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1263 Requires<[do_DIVF32_FULL, doF32FTZ]>;
1265 NVPTXInst<(outs Float32Regs:$dst),
1266 (ins Float32Regs:$a, Float32Regs:$b),
1267 "div.full.f32 \t$dst, $a, $b;",
1268 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1269 Requires<[do_DIVF32_FULL]>;
1271 NVPTXInst<(outs Float32Regs:$dst),
1272 (ins Float32Regs:$a, f32imm:$b),
1273 "div.full.f32 \t$dst, $a, $b;",
1274 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1275 Requires<[do_DIVF32_FULL]>;
1277 // F32 Accurate reciprocal
1279 def FDIV321r_prec_ftz :
1280 NVPTXInst<(outs Float32Regs:$dst),
1281 (ins f32imm:$a, Float32Regs:$b),
1282 "rcp.rn.ftz.f32 \t$dst, $b;",
1283 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1284 Requires<[doF32FTZ]>;
1286 NVPTXInst<(outs Float32Regs:$dst),
1287 (ins f32imm:$a, Float32Regs:$b),
1288 "rcp.rn.f32 \t$dst, $b;",
1289 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
1291 // F32 Accurate division
1293 def FDIV32rr_prec_ftz :
1294 NVPTXInst<(outs Float32Regs:$dst),
1295 (ins Float32Regs:$a, Float32Regs:$b),
1296 "div.rn.ftz.f32 \t$dst, $a, $b;",
1297 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1298 Requires<[doF32FTZ]>;
1299 def FDIV32ri_prec_ftz :
1300 NVPTXInst<(outs Float32Regs:$dst),
1301 (ins Float32Regs:$a, f32imm:$b),
1302 "div.rn.ftz.f32 \t$dst, $a, $b;",
1303 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1304 Requires<[doF32FTZ]>;
1306 NVPTXInst<(outs Float32Regs:$dst),
1307 (ins Float32Regs:$a, Float32Regs:$b),
1308 "div.rn.f32 \t$dst, $a, $b;",
1309 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
1311 NVPTXInst<(outs Float32Regs:$dst),
1312 (ins Float32Regs:$a, f32imm:$b),
1313 "div.rn.f32 \t$dst, $a, $b;",
1314 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
1320 multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
1321 def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
1322 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1323 [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
1325 def rri : NVPTXInst<(outs RC:$dst),
1326 (ins RC:$a, RC:$b, ImmCls:$c),
1327 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1328 [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
1330 def rir : NVPTXInst<(outs RC:$dst),
1331 (ins RC:$a, ImmCls:$b, RC:$c),
1332 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1333 [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
1335 def rii : NVPTXInst<(outs RC:$dst),
1336 (ins RC:$a, ImmCls:$b, ImmCls:$c),
1337 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1338 [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
1342 multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
1343 def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
1344 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1345 [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
1346 Requires<[useFP16Math, Pred]>;
1349 multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
1350 def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
1351 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1352 [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
1353 Requires<[hasBF16Math, Pred]>;
// Fused multiply-add instantiations (all use the fma.rn mnemonic family).
// The _ftz variants pass doF32FTZ as the predicate and print the ".ftz"
// mnemonic; the plain variants pass True. FMA64 has no ftz variant.
//
// f16 and packed v2f16.
1356 defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>;
1357 defm FMA16 : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>;
1358 defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
1359 defm FMA16x2 : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>;
// bf16 and packed v2bf16.
1360 defm BFMA16_ftz : FMA_BF16<"fma.rn.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
1361 defm BFMA16 : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>;
1362 defm BFMA16x2_ftz : FMA_BF16<"fma.rn.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
1363 defm BFMA16x2 : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>;
// f32 and f64, with register and immediate operand forms from the FMA multiclass.
1364 defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
1365 defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
1366 defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;
// Approximate f32 sine and cosine. These select fsin / fcos to
// sin.approx.f32 / cos.approx.f32 and are only available when
// allowUnsafeFPMath holds.
1369 def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
1370 "sin.approx.f32 \t$dst, $src;",
1371 [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
1372 Requires<[allowUnsafeFPMath]>;
1373 def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
1374 "cos.approx.f32 \t$dst, $src;",
1375 [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
1376 Requires<[allowUnsafeFPMath]>;
1378 // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
1379 // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches the
1380 // semantics of LLVM's frem.
1383 def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
1384 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
1385 (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
1387 Requires<[doF32FTZ, allowUnsafeFPMath]>;
1388 def : Pat<(frem Float32Regs:$x, fpimm:$y),
1389 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
1390 (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
1392 Requires<[doF32FTZ, allowUnsafeFPMath]>;
1394 def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
1395 (SELP_f32rr Float32Regs:$x,
1396 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
1397 (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
1399 (TESTINF_f32r Float32Regs:$y))>,
1400 Requires<[doF32FTZ, noUnsafeFPMath]>;
1401 def : Pat<(frem Float32Regs:$x, fpimm:$y),
1402 (SELP_f32rr Float32Regs:$x,
1403 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
1404 (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
1406 (TESTINF_f32i fpimm:$y))>,
1407 Requires<[doF32FTZ, noUnsafeFPMath]>;
1410 def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
1411 (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
1412 (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
1414 Requires<[allowUnsafeFPMath]>;
1415 def : Pat<(frem Float32Regs:$x, fpimm:$y),
1416 (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
1417 (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
1419 Requires<[allowUnsafeFPMath]>;
1421 def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
1422 (SELP_f32rr Float32Regs:$x,
1423 (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
1424 (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
1426 (TESTINF_f32r Float32Regs:$y))>,
1427 Requires<[noUnsafeFPMath]>;
1428 def : Pat<(frem Float32Regs:$x, fpimm:$y),
1429 (SELP_f32rr Float32Regs:$x,
1430 (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
1431 (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
1433 (TESTINF_f32i fpimm:$y))>,
1434 Requires<[noUnsafeFPMath]>;
1437 def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
1438 (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
1439 (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
1441 Requires<[allowUnsafeFPMath]>;
1442 def : Pat<(frem Float64Regs:$x, fpimm:$y),
1443 (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
1444 (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
1446 Requires<[allowUnsafeFPMath]>;
1448 def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
1449 (SELP_f64rr Float64Regs:$x,
1450 (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
1451 (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
1453 (TESTINF_f64r Float64Regs:$y))>,
1454 Requires<[noUnsafeFPMath]>;
// frem f64 with an immediate divisor, noUnsafeFPMath variant: the result is $x
// when the divisor tests as infinite, otherwise x - trunc(x / y) * y.
// $y is bound to an fpimm in the source pattern, so the infinity test uses the
// immediate form (TESTINF_f64i fpimm:$y), mirroring TESTINF_f32i in the f32
// immediate patterns, rather than the register form with a Float64Regs operand.
1455 def : Pat<(frem Float64Regs:$x, fpimm:$y),
1456 (SELP_f64rr Float64Regs:$x,
1457 (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
1458 (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
1460 (TESTINF_f64i fpimm:$y))>,
1461 Requires<[noUnsafeFPMath]>;
1463 //-----------------------------------
1464 // Bitwise operations
1465 //-----------------------------------
1467 // Template for three-arg bitwise operations. Takes three args and creates .b16,
1468 // .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
1469 multiclass BITWISE<string OpcStr, SDNode OpNode> {
1471 NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
1472 !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
1473 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
1475 NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
1476 !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
1477 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
1479 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
1480 !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
1481 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
1483 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
1484 !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
1485 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
1487 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
1488 !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
1489 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
1491 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
1492 !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
1493 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;
1495 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
1496 !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
1497 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
1499 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
1500 !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
1501 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
1504 defm OR : BITWISE<"or", or>;
1505 defm AND : BITWISE<"and", and>;
1506 defm XOR : BITWISE<"xor", xor>;
1508 // Lower logical v2i16/v4i8 ops as bitwise ops on b32.
1509 foreach vt = [v2i16, v4i8] in {
1510 def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
1511 (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
1512 def: Pat<(xor (vt Int32Regs:$a), (vt Int32Regs:$b)),
1513 (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
1514 def: Pat<(and (vt Int32Regs:$a), (vt Int32Regs:$b)),
1515 (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;
1517 // The constants get legalized into a bitcast from i32, so that's what we need
1519 def: Pat<(or Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
1520 (ORb32ri Int32Regs:$a, imm:$b)>;
1521 def: Pat<(xor Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
1522 (XORb32ri Int32Regs:$a, imm:$b)>;
1523 def: Pat<(and Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
1524 (ANDb32ri Int32Regs:$a, imm:$b)>;
// Bitwise NOT for predicate (i1), 16-bit, 32-bit and 64-bit registers. Each
// definition matches the `not` pattern on its register class and prints the
// corresponding not.pred / not.b16 / not.b32 / not.b64 instruction.
1527 def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
1528 "not.pred \t$dst, $src;",
1529 [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
1530 def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
1531 "not.b16 \t$dst, $src;",
1532 [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
1533 def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
1534 "not.b32 \t$dst, $src;",
1535 [(set (i32 Int32Regs:$dst), (not (i32 Int32Regs:$src)))]>;
1536 def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
1537 "not.b64 \t$dst, $src;",
1538 [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
1540 // Template for left/right shifts. Takes three operands,
1541 // [dest (reg), src (reg), shift (reg or imm)].
1542 // dest and src may be int64, int32, or int16, but shift is always int32.
1544 // This template also defines a 32-bit shift (imm, imm) instruction.
1545 multiclass SHIFT<string OpcStr, SDNode OpNode> {
1547 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
1548 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
1549 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 Int32Regs:$b)))]>;
1551 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
1552 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
1553 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
1555 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
1556 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1557 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
1559 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
1560 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1561 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 imm:$b)))]>;
1563 NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
1564 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1565 [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
1567 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
1568 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
1569 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 Int32Regs:$b)))]>;
1571 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
1572 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
1573 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
1576 defm SHL : SHIFT<"shl.b", shl>;
1577 defm SRA : SHIFT<"shr.s", sra>;
1578 defm SRL : SHIFT<"shr.u", srl>;
1582 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
1583 "brev.b32 \t$dst, $a;",
1584 [(set Int32Regs:$dst, (bitreverse (i32 Int32Regs:$a)))]>;
1586 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
1587 "brev.b64 \t$dst, $a;",
1588 [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
1591 // Rotate: Use ptx shf instruction if available.
1594 // 32 bit r2 = rotl r1, n
1596 // r2 = shf.l r1, r1, n
1598 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
1599 "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
1600 [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
1601 Requires<[hasHWROT32]>;
1604 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1605 "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
1606 [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
1607 Requires<[hasHWROT32]>;
1609 // 32 bit r2 = rotr r1, n
1611 // r2 = shf.r r1, r1, n
1613 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
1614 "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
1615 [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
1616 Requires<[hasHWROT32]>;
1619 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1620 "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
1621 [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
1622 Requires<[hasHWROT32]>;
1624 // 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1.
1626 NVPTXInst<(outs Int32Regs:$dst),
1627 (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
1629 ".reg .b32 %lhs;\n\t"
1630 ".reg .b32 %rhs;\n\t"
1631 "shl.b32 \t%lhs, $src, $amt1;\n\t"
1632 "shr.b32 \t%rhs, $src, $amt2;\n\t"
1633 "add.u32 \t$dst, %lhs, %rhs;\n\t"
1637 def SUB_FRM_32 : SDNodeXForm<imm, [{
1638 return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
// When noHWROT32 holds, lower 32-bit rotl / rotr by an immediate to
// ROT32imm_sw. SUB_FRM_32 supplies (32 - amt). For rotl the first amount is
// $amt and the second is 32 - $amt; for rotr the two amounts are swapped.
1641 def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)),
1642 (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
1643 Requires<[noHWROT32]>;
1644 def : Pat<(rotr (i32 Int32Regs:$src), (i32 imm:$amt)),
1645 (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
1646 Requires<[noHWROT32]>;
1648 // 32-bit software rotate left by register.
1650 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1652 ".reg .b32 %lhs;\n\t"
1653 ".reg .b32 %rhs;\n\t"
1654 ".reg .b32 %amt2;\n\t"
1655 "shl.b32 \t%lhs, $src, $amt;\n\t"
1656 "sub.s32 \t%amt2, 32, $amt;\n\t"
1657 "shr.b32 \t%rhs, $src, %amt2;\n\t"
1658 "add.u32 \t$dst, %lhs, %rhs;\n\t"
1660 [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
1661 Requires<[noHWROT32]>;
1663 // 32-bit software rotate right by register.
1665 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1667 ".reg .b32 %lhs;\n\t"
1668 ".reg .b32 %rhs;\n\t"
1669 ".reg .b32 %amt2;\n\t"
1670 "shr.b32 \t%lhs, $src, $amt;\n\t"
1671 "sub.s32 \t%amt2, 32, $amt;\n\t"
1672 "shl.b32 \t%rhs, $src, %amt2;\n\t"
1673 "add.u32 \t$dst, %lhs, %rhs;\n\t"
1675 [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
1676 Requires<[noHWROT32]>;
1678 // 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1.
1680 NVPTXInst<(outs Int64Regs:$dst),
1681 (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
1683 ".reg .b64 %lhs;\n\t"
1684 ".reg .b64 %rhs;\n\t"
1685 "shl.b64 \t%lhs, $src, $amt1;\n\t"
1686 "shr.b64 \t%rhs, $src, $amt2;\n\t"
1687 "add.u64 \t$dst, %lhs, %rhs;\n\t"
1691 def SUB_FRM_64 : SDNodeXForm<imm, [{
1692 return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
// Lower 64-bit rotl / rotr by an immediate to ROT64imm_sw. SUB_FRM_64 supplies
// (64 - amt). For rotl the first amount is $amt and the second is 64 - $amt;
// for rotr the two amounts are swapped. Unlike the 32-bit patterns, these carry
// no Requires predicate.
1695 def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
1696 (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
1697 def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
1698 (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
1700 // 64-bit software rotate left by register.
// Inline PTX sequence: shl.b64 by $amt, compute 64 - $amt in a 32-bit scratch
// register, shr.b64 by that amount, then add the two partial results into
// $dst. The data operands are 64-bit; the rotate amount is a 32-bit register.
// No Requires<> predicate is attached to this definition.
1702 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
1704 ".reg .b64 %lhs;\n\t"
1705 ".reg .b64 %rhs;\n\t"
1706 ".reg .u32 %amt2;\n\t"
1707 "shl.b64 \t%lhs, $src, $amt;\n\t"
1708 "sub.u32 \t%amt2, 64, $amt;\n\t"
1709 "shr.b64 \t%rhs, $src, %amt2;\n\t"
1710 "add.u64 \t$dst, %lhs, %rhs;\n\t"
1712 [(set Int64Regs:$dst, (rotl Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
// 64-bit software rotate right by register: identical structure to the
// rotate-left sequence above, with the shr/shl roles exchanged; selected for
// (rotr i64, i32).
1715 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
1717 ".reg .b64 %lhs;\n\t"
1718 ".reg .b64 %rhs;\n\t"
1719 ".reg .u32 %amt2;\n\t"
1720 "shr.b64 \t%lhs, $src, $amt;\n\t"
1721 "sub.u32 \t%amt2, 64, $amt;\n\t"
1722 "shl.b64 \t%rhs, $src, %amt2;\n\t"
1723 "add.u64 \t$dst, %lhs, %rhs;\n\t"
1725 [(set Int64Regs:$dst, (rotr Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
1728 // Funnel shift in clamp mode
1731 // Create SDNodes so they can be used in the DAG code, e.g.
1732 // NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
// Both nodes use the SDTIntShiftDOp type profile and carry no node properties.
1733 def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
1734 def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
// shf.l.clamp.b32: selects FUN_SHFL_CLAMP over three i32 register operands
// ($lo, $hi, $amt) and produces an i32 register result.
1737 NVPTXInst<(outs Int32Regs:$dst),
1738 (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
1739 "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
1740 [(set Int32Regs:$dst,
1741 (FUN_SHFL_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>;
// shf.r.clamp.b32: right-shift counterpart, selecting FUN_SHFR_CLAMP with the
// same operand layout.
1744 NVPTXInst<(outs Int32Regs:$dst),
1745 (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
1746 "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
1747 [(set Int32Regs:$dst,
1748 (FUN_SHFR_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>;
1751 // BFE - bit-field extract
1754 // Template for BFE/BFI instructions.
1755 // Args: [dest (reg), src (reg), start (reg or imm), end (reg or imm)].
1756 // Start may be an imm only if end is also an imm. FIXME: Is this a
1757 // restriction in PTX?
1759 // dest and src may be int32 or int64, but start and end are always int32.
// BFE profile: 1 result, 3 operands. Operand 1 has the result's type, the
// result is an integer, and operands 2 and 3 are i32.
1761 SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
1762 SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
1763 def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>;
// BFI profile: 1 result, 4 operands. The result is an integer, operands 1 and
// 2 share the result's type, and operands 3 and 4 are i32.
1766 SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
1767 SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
1768 def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>;
// PRMT profile: 1 result, 4 operands; the result and every operand are i32.
1771 SDTypeProfile<1, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
1772 SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>,]>;
1773 def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;
// Bit-field extract instruction family.
//   Instr - PTX mnemonic placed in front of the operand list.
//   T     - value type of the destination $d and the source $a.
//   RC    - register class of $d and $a.
// Three operand forms are defined, differing only in whether $b and $c are
// i32 registers or i32 immediates; each selects the `bfe` node.
1775 multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
// $b register, $c register.
1777 : NVPTXInst<(outs RC:$d),
1778 (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
1779 !strconcat(Instr, " \t$d, $a, $b, $c;"),
1780 [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>;
// $b register, $c immediate.
1782 : NVPTXInst<(outs RC:$d),
1783 (ins RC:$a, Int32Regs:$b, i32imm:$c),
1784 !strconcat(Instr, " \t$d, $a, $b, $c;"),
1785 [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 imm:$c)))]>;
// $b immediate, $c immediate.
1787 : NVPTXInst<(outs RC:$d),
1788 (ins RC:$a, i32imm:$b, i32imm:$c),
1789 !strconcat(Instr, " \t$d, $a, $b, $c;"),
1790 [(set (T RC:$d), (bfe (T RC:$a), (i32 imm:$b), (i32 imm:$c)))]>;
// Bit-field insert instruction family.
//   Instr  - PTX mnemonic placed in front of the operand list.
//   T      - value type of the result $f and of operands $a and $b.
//   RC     - register class of $f, $b, and (in the register forms) $a.
//   ImmCls - immediate operand class used when $a is an immediate.
// Six forms are defined: $a is either a register or an immediate, and for each
// of those ($c, $d) are (reg, reg), (reg, imm), or (imm, imm). All select the
// `bfi` node.
1793 multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
// $a register; $c register, $d register.
1795 : NVPTXInst<(outs RC:$f),
1796 (ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
1797 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1798 [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
// $a register; $c register, $d immediate.
1800 : NVPTXInst<(outs RC:$f),
1801 (ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d),
1802 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1803 [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>;
// $a register; $c immediate, $d immediate.
1805 : NVPTXInst<(outs RC:$f),
1806 (ins RC:$a, RC:$b, i32imm:$c, i32imm:$d),
1807 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1808 [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>;
// $a immediate; $c register, $d register.
1810 : NVPTXInst<(outs RC:$f),
1811 (ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
1812 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1813 [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
// $a immediate; $c register, $d immediate.
1815 : NVPTXInst<(outs RC:$f),
1816 (ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d),
1817 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1818 [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>;
// $a immediate; $c immediate, $d immediate.
1820 : NVPTXInst<(outs RC:$f),
1821 (ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d),
1822 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1823 [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>;
// Byte-permute instruction family, printed as "prmt.b32${mode}".
//   T  - value type of $d and $a (also used to type $b in the patterns).
//   RC - register class of $d and $a.
// $mode is a PrmtMode operand matched as an immediate. Three forms are
// defined, differing in whether $b and $c are registers or i32 immediates;
// all select the `prmt` node.
1826 multiclass PRMT<ValueType T, RegisterClass RC> {
// $b register, $c register.
1828 : NVPTXInst<(outs RC:$d),
1829 (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode),
1830 !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
1831 [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), imm:$mode))]>;
// $b register, $c immediate.
1833 : NVPTXInst<(outs RC:$d),
1834 (ins RC:$a, Int32Regs:$b, i32imm:$c, PrmtMode:$mode),
1835 !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
1836 [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 imm:$c), imm:$mode))]>;
// $b immediate, $c immediate.
1838 : NVPTXInst<(outs RC:$d),
1839 (ins RC:$a, i32imm:$b, i32imm:$c, PrmtMode:$mode),
1840 !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
1841 [(set (T RC:$d), (prmt (T RC:$a), (T imm:$b), (i32 imm:$c), imm:$mode))]>;
// Instantiate the bit-field and permute families; every record in this group
// is marked as having no side effects.
1844 let hasSideEffects = false in {
// Signed and unsigned extract, 32- and 64-bit.
1845 defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>;
1846 defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>;
1847 defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>;
1848 defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>;
// Insert, 32- and 64-bit, with the matching immediate class for $a.
1850 defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>;
1851 defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>;
// Permute is only instantiated for 32-bit registers.
1853 defm PRMT_B32 : PRMT<i32, Int32Regs>;
1857 // byte extraction + signed/unsigned extension to i32.
// An 8-bit `bfe` followed by sext_inreg from i8 is selected as a signed BFE;
// an 8-bit `bfe` followed by `and 255` is selected as an unsigned BFE. Both
// register and immediate offsets are covered.
1858 def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), i8)),
1859 (BFE_S32rri Int32Regs:$s, Int32Regs:$o, 8)>;
1860 def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), i8)),
1861 (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
1862 def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), 255)),
1863 (BFE_U32rri Int32Regs:$s, Int32Regs:$o, 8)>;
1864 def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), 255)),
1865 (BFE_U32rii Int32Regs:$s, imm:$o, 8)>;
1867 // byte extraction + signed extension to i16
// The i16 result is produced by feeding the signed BFE into CVT_s8_s32 with
// conversion mode CvtNONE.
1868 def : Pat<(i16 (sext_inreg (trunc (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8)), i8)),
1869 (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
1872 // Byte extraction via shift/trunc/sext
// Low byte of an i32, sign-extended into i16: a single CVT_s8_s32.
1873 def : Pat<(i16 (sext_inreg (trunc Int32Regs:$s), i8)),
1874 (CVT_s8_s32 Int32Regs:$s, CvtNONE)>;
// Byte at immediate bit offset $o of an i32: signed BFE of 8 bits at $o, then
// CVT_s8_s32 when the result type is i16.
1875 def : Pat<(i16 (sext_inreg (trunc (srl (i32 Int32Regs:$s), (i32 imm:$o))), i8)),
1876 (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
1877 def : Pat<(sext_inreg (srl (i32 Int32Regs:$s), (i32 imm:$o)), i8),
1878 (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
// Arithmetic shift right by 8 of the truncated i16: selected as a signed
// 8-bit extract starting at bit 8.
1879 def : Pat<(i16 (sra (i16 (trunc Int32Regs:$s)), (i32 8))),
1880 (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 8, 8), CvtNONE)>;
// 64-bit source variants of the same patterns.
1881 def : Pat<(sext_inreg (srl (i64 Int64Regs:$s), (i32 imm:$o)), i8),
1882 (BFE_S64rii Int64Regs:$s, imm:$o, 8)>;
1883 def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)),
1884 (CVT_s8_s64 Int64Regs:$s, CvtNONE)>;
1885 def : Pat<(i16 (sext_inreg (trunc (srl (i64 Int64Regs:$s), (i32 imm:$o))), i8)),
1886 (CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>;
1888 //-----------------------------------
1889 // Comparison instructions (setp, set)
1890 //-----------------------------------
1892 // FIXME: This doesn't cover versions of set and setp that combine with a
1893 // boolean predicate, e.g. setp.eq.and.b16.
1895 let hasSideEffects = false in {
// Predicate-producing compare family. Each instantiation yields three operand
// forms (register/register, register/immediate, immediate/register) that write
// an Int1Regs result. The comparison is a CmpMode operand printed through the
// `base` and `ftz` modifiers. None of these records carries a selection
// pattern.
//   TypeStr - type suffix appended to the mnemonic.
//   RC      - register class of the register operands.
//   ImmCls  - operand class of the immediate operand.
1896 multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
// $a register, $b register.
1898 NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
1899 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1900 " \t$dst, $a, $b;"), []>;
// $a register, $b immediate.
1902 NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
1903 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1904 " \t$dst, $a, $b;"), []>;
// $a immediate, $b register.
1906 NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
1907 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1908 " \t$dst, $a, $b;"), []>;
// Integer (bit/signed/unsigned at 16, 32, 64 bits) and f32/f64 instantiations.
1912 defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
1913 defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
1914 defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
1915 defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
1916 defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
1917 defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
1918 defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
1919 defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
1920 defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
1921 defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
1922 defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
// f16 compare: register/register only, operands held in Int16Regs, gated by
// useFP16Math.
1924 NVPTXInst<(outs Int1Regs:$dst),
1925 (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
1926 "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
1927 []>, Requires<[useFP16Math]>;
// f16x2 compare: operands are Int32Regs and two predicate results are written,
// printed as "$p|$q"; gated by useFP16Math.
1930 NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
1931 (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
1932 "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
1934 Requires<[useFP16Math]>;
// bf16 compare: same shape as the f16 form, gated by hasBF16Math.
1936 NVPTXInst<(outs Int1Regs:$dst),
1937 (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
1938 "setp${cmp:base}${cmp:ftz}.bf16 \t$dst, $a, $b;",
1939 []>, Requires<[hasBF16Math]>;
// bf16x2 compare: same shape as the f16x2 form, gated by hasBF16Math.
1942 NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
1943 (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
1944 "setp${cmp:base}${cmp:ftz}.bf16x2 \t$p|$q, $a, $b;",
1946 Requires<[hasBF16Math]>;
1949 // FIXME: This doesn't appear to be correct. The "set" mnemonic has the form
1950 // "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
1951 // reg, either u32, s32, or f32. Anyway these aren't used at the moment.
1953 let hasSideEffects = false in {
// Compare family writing an Int32Regs result. The mnemonic is printed as
// "set$cmp.<TypeStr>", i.e. the CmpMode operand is printed without the
// base/ftz modifiers and only a single type suffix is emitted. Three operand
// forms are defined (rr, ri, ir); none carries a selection pattern.
//   TypeStr - type suffix appended to the mnemonic.
//   RC      - register class of the register operands.
//   ImmCls  - operand class of the immediate operand.
1954 multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
1955 def rr : NVPTXInst<(outs Int32Regs:$dst),
1956 (ins RC:$a, RC:$b, CmpMode:$cmp),
1957 !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
1958 def ri : NVPTXInst<(outs Int32Regs:$dst),
1959 (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
1960 !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
1961 def ir : NVPTXInst<(outs Int32Regs:$dst),
1962 (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
1963 !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
// Instantiations for integer, f16/bf16 (held in Int16Regs), f32 and f64.
1967 defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
1968 defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
1969 defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
1970 defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
1971 defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
1972 defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
1973 defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
1974 defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
1975 defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
1976 defm SET_f16 : SET<"f16", Int16Regs, f16imm>;
1977 defm SET_bf16 : SET<"bf16", Int16Regs, bf16imm>;
1978 defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
1979 defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
1981 //-----------------------------------
1982 // Data Movement (Load / Store, Move)
1983 //-----------------------------------
// Address-matching complex patterns. ADDRri / ADDRri64 produce two operands
// via SelectADDRri / SelectADDRri64 and are rooted at frameindex; ADDRvar
// produces one operand via SelectDirectAddr.
1985 def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
1987 def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
1989 def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;
// Memory operands made of a register plus an immediate, printed with
// printMemOperand; 32-bit and 64-bit flavours.
1991 def MEMri : Operand<i32> {
1992 let PrintMethod = "printMemOperand";
1993 let MIOperandInfo = (ops Int32Regs, i32imm);
1995 def MEMri64 : Operand<i64> {
1996 let PrintMethod = "printMemOperand";
1997 let MIOperandInfo = (ops Int64Regs, i64imm);
// Pointer-typed operands printed with the generic printOperand.
2000 def imem : Operand<iPTR> {
2001 let PrintMethod = "printOperand";
2004 def imemAny : Operand<iPTRAny> {
2005 let PrintMethod = "printOperand";
// i32 operands with dedicated printers (printLdStCode / printMmaCode).
2008 def LdStCode : Operand<i32> {
2009 let PrintMethod = "printLdStCode";
2012 def MmaCode : Operand<i32> {
2013 let PrintMethod = "printMmaCode";
// Wrapper node: one result and one operand of the same, pointer, type.
2016 def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
2017 def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
2019 // Load a memory address into a u32 or u64 register.
// Both forms select (Wrapper tglobaladdr) and differ only in the destination
// register class and the mov type suffix.
2020 def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
2021 "mov.u32 \t$dst, $a;",
2022 [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
2023 def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
2024 "mov.u64 \t$dst, $a;",
2025 [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
2027 // Get pointer to local stack.
// The immediate $num is appended to the "__local_depot" symbol name in the
// printed instruction. No selection pattern is attached.
2028 let hasSideEffects = false in {
2029 def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
2030 "mov.u32 \t$d, __local_depot$num;", []>;
2031 def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
2032 "mov.u64 \t$d, __local_depot$num;", []>;
2036 // copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
// Register-to-register moves. All are flagged IsSimpleMove with no side
// effects and have no selection patterns.
2037 let IsSimpleMove=1, hasSideEffects=0 in {
// Predicate and unsigned-typed integer moves.
2038 def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
2039 "mov.pred \t$dst, $sss;", []>;
2040 def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
2041 "mov.u16 \t$dst, $sss;", []>;
2042 def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
2043 "mov.u32 \t$dst, $sss;", []>;
2044 def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
2045 "mov.u64 \t$dst, $sss;", []>;
// Untyped (.b16/.b32/.b64) integer moves.
2047 def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
2048 "mov.b16 \t$dst, $sss;", []>;
2049 def IMOVB32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
2050 "mov.b32 \t$dst, $sss;", []>;
2051 def IMOVB64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
2052 "mov.b64 \t$dst, $sss;", []>;
// Floating-point moves; the 16-bit form lives in Int16Regs.
2054 def FMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
2055 // We have to use .b16 here as there's no mov.f16.
2056 "mov.b16 \t$dst, $src;", []>;
2057 def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
2058 "mov.f32 \t$dst, $src;", []>;
2059 def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
2060 "mov.f64 \t$dst, $src;", []>;
// Immediate-to-register moves. The IMOV*ri forms select a plain `imm` of the
// matching width.
2063 def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
2064 "mov.pred \t$dst, $src;",
2065 [(set Int1Regs:$dst, imm:$src)]>;
2066 def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
2067 "mov.u16 \t$dst, $src;",
2068 [(set Int16Regs:$dst, imm:$src)]>;
2069 def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
2070 "mov.u32 \t$dst, $src;",
2071 [(set (i32 Int32Regs:$dst), imm:$src)]>;
2072 def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
2073 "mov.u64 \t$dst, $src;",
2074 [(set Int64Regs:$dst, imm:$src)]>;
// Untyped immediate moves; no selection patterns are attached.
2076 def IMOVB16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
2077 "mov.b16 \t$dst, $src;", []>;
2078 def IMOVB32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
2079 "mov.b32 \t$dst, $src;", []>;
2080 def IMOVB64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
2081 "mov.b64 \t$dst, $src;", []>;
// Floating-point immediate moves select `fpimm`.
2083 def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
2084 "mov.f32 \t$dst, $src;",
2085 [(set Float32Regs:$dst, fpimm:$src)]>;
2086 def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
2087 "mov.f64 \t$dst, $src;",
2088 [(set Float64Regs:$dst, fpimm:$src)]>;
// A wrapped external symbol is materialized with the integer immediate move of
// the matching pointer width.
2090 def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
2091 def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>;
2093 //---- Copy Frame Index ----
// Materializes an ADDRri / ADDRri64 address into a register with an add; the
// memory operand is printed through its `add` modifier.
2094 def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
2095 "add.u32 \t$dst, ${addr:add};",
2096 [(set Int32Regs:$dst, ADDRri:$addr)]>;
2097 def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
2098 "add.u64 \t$dst, ${addr:add};",
2099 [(set Int64Regs:$dst, ADDRri64:$addr)]>;
2101 //-----------------------------------
2102 // Comparison and Selection
2103 //-----------------------------------
// Integer comparison selection patterns for one comparison operator.
//   OpNode      - the setcc-style PatFrag being matched.
//   Mode        - comparison-mode leaf passed as the instruction's last operand.
//   setp_* / set_* - the instructions to use for each width (16/32/64) and
//                 operand form (rr, ri, ir).
// For every width and operand form two patterns are emitted: an i1-typed
// result selects the corresponding setp_* instruction, and an i32-typed result
// selects the corresponding set_* instruction.
2105 multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
2106 Instruction setp_16rr,
2107 Instruction setp_16ri,
2108 Instruction setp_16ir,
2109 Instruction setp_32rr,
2110 Instruction setp_32ri,
2111 Instruction setp_32ir,
2112 Instruction setp_64rr,
2113 Instruction setp_64ri,
2114 Instruction setp_64ir,
2115 Instruction set_16rr,
2116 Instruction set_16ri,
2117 Instruction set_16ir,
2118 Instruction set_32rr,
2119 Instruction set_32ri,
2120 Instruction set_32ir,
2121 Instruction set_64rr,
2122 Instruction set_64ri,
2123 Instruction set_64ir> {
// i1 result, 16-bit operands.
2125 def : Pat<(i1 (OpNode i16:$a, i16:$b)),
2126 (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
2127 def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
2128 (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
2129 def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
2130 (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
// i1 result, 32-bit operands.
2132 def : Pat<(i1 (OpNode i32:$a, i32:$b)),
2133 (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
2134 def : Pat<(i1 (OpNode (i32 Int32Regs:$a), imm:$b)),
2135 (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
2136 def : Pat<(i1 (OpNode imm:$a, (i32 Int32Regs:$b))),
2137 (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
// i1 result, 64-bit operands.
2139 def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
2140 (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
2141 def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
2142 (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
2143 def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
2144 (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
// i32 result, 16-bit operands.
2147 def : Pat<(i32 (OpNode i16:$a, i16:$b)),
2148 (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
2149 def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
2150 (set_16ri Int16Regs:$a, imm:$b, Mode)>;
2151 def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
2152 (set_16ir imm:$a, Int16Regs:$b, Mode)>;
// i32 result, 32-bit operands.
2154 def : Pat<(i32 (OpNode i32:$a, i32:$b)),
2155 (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
2156 def : Pat<(i32 (OpNode (i32 Int32Regs:$a), imm:$b)),
2157 (set_32ri Int32Regs:$a, imm:$b, Mode)>;
2158 def : Pat<(i32 (OpNode imm:$a, (i32 Int32Regs:$b))),
2159 (set_32ir imm:$a, Int32Regs:$b, Mode)>;
// i32 result, 64-bit operands.
2161 def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
2162 (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
2163 def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
2164 (set_64ri Int64Regs:$a, imm:$b, Mode)>;
2165 def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
2166 (set_64ir imm:$a, Int64Regs:$b, Mode)>;
// ISET_FORMAT specialised to the signed (.s16/.s32/.s64) setp/set instructions.
2169 multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
2170 : ISET_FORMAT<OpNode, Mode,
2171 SETP_s16rr, SETP_s16ri, SETP_s16ir,
2172 SETP_s32rr, SETP_s32ri, SETP_s32ir,
2173 SETP_s64rr, SETP_s64ri, SETP_s64ir,
2174 SET_s16rr, SET_s16ri, SET_s16ir,
2175 SET_s32rr, SET_s32ri, SET_s32ir,
2176 SET_s64rr, SET_s64ri, SET_s64ir> {
2177 // TableGen doesn't like empty multiclasses.
2178 def : PatLeaf<(i32 0)>;
// ISET_FORMAT specialised to the unsigned (.u16/.u32/.u64) setp/set
// instructions.
2181 multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
2182 : ISET_FORMAT<OpNode, Mode,
2183 SETP_u16rr, SETP_u16ri, SETP_u16ir,
2184 SETP_u32rr, SETP_u32ri, SETP_u32ir,
2185 SETP_u64rr, SETP_u64ri, SETP_u64ir,
2186 SET_u16rr, SET_u16ri, SET_u16ir,
2187 SET_u32rr, SET_u32ri, SET_u32ir,
2188 SET_u64rr, SET_u64ri, SET_u64ir> {
2189 // TableGen doesn't like empty multiclasses.
2190 def : PatLeaf<(i32 0)>;
// Signed operators (and plain eq/ne) use the signed instructions; the
// setu* operators use the unsigned instructions. Both groups pass the same
// Cmp* mode leaves, so signedness comes from the instruction's type suffix.
2193 defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
2194 defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
2195 defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
2196 defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
2197 defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
2198 defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
2199 defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
2200 defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
2201 defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
2202 defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
2203 defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
2204 defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
// Comparisons between two predicate (i1) registers.
// Inequality (setne / setune) is selected as an XOR of the two predicates.
2207 def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
2208 (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
2209 def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
2210 (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
// Equality (seteq / setueq) is selected as the negation of that XOR.
2212 def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
2213 (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
2214 def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
2215 (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
2217 // comparisons of i8 extracted with BFE as i32
// NOTE(review): every replacement below compares the full 32-bit registers
// $a and $b, while the matched DAG only looks at their low 8 bits (via
// trunc + sext_inreg i8, or trunc + and 255). This is equivalent only if the
// upper bits of both registers already hold the corresponding sign/zero
// extension - presumably guaranteed by how these values are produced; confirm.
// Signed orderings: operands are sign-extended bytes, compared with setp.s32.
2218 def: Pat<(setgt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
2219 (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGT)>;
2220 def: Pat<(setge (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
2221 (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGE)>;
2222 def: Pat<(setlt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
2223 (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLT)>;
2224 def: Pat<(setle (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
2225 (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLE)>;
// Unsigned orderings and (in)equality: operands are bytes masked with 255,
// compared with setp.u32 using the CmpHI/CmpHS/CmpLO/CmpLS/CmpEQ/CmpNE modes.
2227 def: Pat<(setugt (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
2228 (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHI)>;
2229 def: Pat<(setuge (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
2230 (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHS)>;
2231 def: Pat<(setult (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
2232 (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLO)>;
2233 def: Pat<(setule (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
2234 (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLS)>;
2235 def: Pat<(seteq (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
2236 (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpEQ)>;
2237 def: Pat<(setne (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
2238 (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpNE)>;
2240 // i1 compare -> i32
// The XOR of the two predicates is true exactly when they differ, so it is
// used as the select condition for both comparisons:
//   setne -> -1 when the predicates differ, 0 otherwise.
2241 def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
2242 (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
//   seteq -> 0 when the predicates differ, -1 otherwise (select arms swapped).
// This pattern previously matched setne a second time, which made it an
// unreachable duplicate and left i1 seteq with an i32 result unselected.
2243 def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
2244 (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
// Floating-point comparison selection patterns for one comparison operator.
//   OpNode  - the setcc-style PatFrag being matched.
//   Mode    - comparison-mode leaf used by default.
//   ModeFTZ - comparison-mode leaf used when the doF32FTZ predicate holds.
// The first half emits i1-typed results through the SETP_* instructions, the
// second half emits i32-typed results through the SET_* instructions. For
// f16, bf16 and f32 every shape appears twice: once with ModeFTZ guarded by
// doF32FTZ and once with Mode. f64 only has the Mode form.
2248 multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
// f16 -> i1. Immediates are first passed through LOAD_CONST_F16 and the
// register/register instruction is used. Requires useFP16Math.
2250 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
2251 (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
2252 Requires<[useFP16Math,doF32FTZ]>;
2253 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
2254 (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
2255 Requires<[useFP16Math]>;
2256 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
2257 (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
2258 Requires<[useFP16Math,doF32FTZ]>;
2259 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
2260 (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
2261 Requires<[useFP16Math]>;
2262 def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
2263 (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
2264 Requires<[useFP16Math,doF32FTZ]>;
2265 def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
2266 (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
2267 Requires<[useFP16Math]>;
// bf16 -> i1. Same structure with LOAD_CONST_BF16; requires hasBF16Math.
2270 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
2271 (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
2272 Requires<[hasBF16Math,doF32FTZ]>;
2273 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
2274 (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
2275 Requires<[hasBF16Math]>;
2276 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
2277 (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
2278 Requires<[hasBF16Math,doF32FTZ]>;
2279 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
2280 (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
2281 Requires<[hasBF16Math]>;
2282 def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
2283 (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
2284 Requires<[hasBF16Math,doF32FTZ]>;
2285 def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
2286 (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
2287 Requires<[hasBF16Math]>;
// f32 -> i1. Immediates are used directly through the ri / ir instructions.
2290 def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
2291 (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
2292 Requires<[doF32FTZ]>;
2293 def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
2294 (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
2295 def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
2296 (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
2297 Requires<[doF32FTZ]>;
2298 def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
2299 (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
2300 def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
2301 (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
2302 Requires<[doF32FTZ]>;
2303 def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
2304 (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
// f64 -> i1. No FTZ variant.
2307 def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
2308 (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
2309 def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
2310 (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
2311 def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
2312 (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
// f16 -> i32.
// NOTE(review): the immediate-first patterns below use SET_f16ir while feeding
// it a LOAD_CONST_F16 result as the first operand, whereas the i1 patterns for
// the same shape use the rr instruction. The `ir` form declares its first
// operand as an immediate class - confirm this operand kind is intended.
2315 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
2316 (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
2317 Requires<[useFP16Math, doF32FTZ]>;
2318 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
2319 (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
2320 Requires<[useFP16Math]>;
2321 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
2322 (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
2323 Requires<[useFP16Math, doF32FTZ]>;
2324 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
2325 (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
2326 Requires<[useFP16Math]>;
2327 def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
2328 (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
2329 Requires<[useFP16Math, doF32FTZ]>;
2330 def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
2331 (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
2332 Requires<[useFP16Math]>;
// bf16 -> i32.
// NOTE(review): same observation as for f16 - SET_bf16ir receives a
// LOAD_CONST_BF16 result as its first operand; confirm.
2335 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
2336 (SET_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
2337 Requires<[hasBF16Math, doF32FTZ]>;
2338 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
2339 (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
2340 Requires<[hasBF16Math]>;
2341 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
2342 (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
2343 Requires<[hasBF16Math, doF32FTZ]>;
2344 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
2345 (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
2346 Requires<[hasBF16Math]>;
2347 def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
2348 (SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
2349 Requires<[hasBF16Math, doF32FTZ]>;
2350 def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
2351 (SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
2352 Requires<[hasBF16Math]>;
// f32 -> i32.
2355 def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
2356 (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
2357 Requires<[doF32FTZ]>;
2358 def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
2359 (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
2360 def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
2361 (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
2362 Requires<[doF32FTZ]>;
2363 def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
2364 (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
2365 def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
2366 (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
2367 Requires<[doF32FTZ]>;
2368 def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
2369 (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
// f64 -> i32. No FTZ variant.
2372 def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
2373 (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
2374 def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
2375 (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
2376 def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
2377 (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
// Ordered comparisons use the plain Cmp* modes.
2380 defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
2381 defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
2382 defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
2383 defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
2384 defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
2385 defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
// Unordered comparisons use the Cmp*U modes.
2387 defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
2388 defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
2389 defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
2390 defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
2391 defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
2392 defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
// Comparisons without an ordered/unordered qualifier are mapped to the same
// modes as the ordered ones.
2394 defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
2395 defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
2396 defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
2397 defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
2398 defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
2399 defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
// seto / setuo map to the CmpNUM / CmpNAN modes.
2401 defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
2402 defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
2404 // FIXME: What is this doing here? Can it be deleted?
2405 // def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
2406 // [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
// Type profiles for the call/parameter/return-value SDNodes defined below.
// Each SDTypeProfile lists <number of results, number of operands,
// constraints>; constraint indices count results first, then operands.
// Parameter declarations: no result, three integer operands.
2408 def SDTDeclareParamProfile :
2409 SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
2410 def SDTDeclareScalarParamProfile :
2411 SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
// Parameter loads: 1, 2 or 4 results followed by two integer operands; the V2
// form additionally requires both results to have the same type.
2412 def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
2413 def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
2414 def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
// Call printing: no result, one integer operand.
2415 def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
2416 def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
// Parameter stores: no result; the first two operands are integers and the
// remaining 1, 2 or 4 operands are unconstrained.
2417 def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
2418 def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
2419 def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
2420 def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
// Call-sequence markers and argument nodes.
2421 def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
2422 def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
2423 def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
2424 def SDTCallValProfile : SDTypeProfile<1, 0, []>;
2425 def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
// Return-value stores: no result; first operand is an integer, followed by
// 1, 2 or 4 unconstrained operands.
2426 def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
2427 def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
2428 def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
2429 def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
2430 def SDTProxyRegProfile : SDTypeProfile<1, 1, []>;
2433 SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
2434 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2435 def DeclareScalarParam :
2436 SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
2437 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2438 def DeclareRetParam :
2439 SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
2440 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2442 SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
2443 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2445 SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
2446 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
2448 SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
2449 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
2451 SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
2452 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
2454 SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
2455 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2456 def PrintConvergentCall :
2457 SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
2458 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2460 SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
2461 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2462 def PrintConvergentCallUni :
2463 SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
2464 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// SelectionDAG nodes for the NVPTXISD::StoreParam*, CallArg*, CallVoid,
// Prototype and CallVal opcodes. All of them carry the same property list:
// SDNPHasChain, SDNPOutGlue, SDNPInGlue and SDNPSideEffect.
// StoreParamU32 / StoreParamS32 share SDTStoreParam32Profile; CallArg and
// LastCallArg share SDTCallArgProfile; CallArgEnd, CallVoid and Prototype
// share SDTCallVoidProfile.
2466 SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
2467 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2469 SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
2470 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2472 SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
2473 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2475 SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
2476 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2478 SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
2479 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2481 SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
2482 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2484 SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
2485 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2487 SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
2488 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2490 SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
2491 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2493 SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
2494 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2496 SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
2497 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2499 SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
2500 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// SelectionDAG nodes for MoveParam, StoreRetval*, PseudoUseParam, RETURN and
// ProxyReg. Unlike the call-sequence nodes, the property lists differ here:
//  - MoveParam has an empty property list (no chain, no glue).
//  - StoreRetval / StoreRetvalV2 / StoreRetvalV4 and RETURN are chained and
//    have side effects, but are not glued.
//  - PseudoUseParam and ProxyReg are chained, glued in and out, and have
//    side effects.
2502 SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
2504 SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
2505 [SDNPHasChain, SDNPSideEffect]>;
2507 SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
2508 [SDNPHasChain, SDNPSideEffect]>;
2510 SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
2511 [SDNPHasChain, SDNPSideEffect]>;
2512 def PseudoUseParam :
2513 SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
2514 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// The RETURN node reuses SDTCallArgMarkProfile, the profile also used by
// CallArgBegin.
2516 SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
2517 [SDNPHasChain, SDNPSideEffect]>;
2519 SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
2520 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Instruction classes that read from the `retval` symbols.
// The *MemInst classes print `ld.param[.v2|.v4]<opstr>` with the address
// `[retval0+$b]`; the scalar form writes one register, the V2 form two and
// the V4 form four registers of the same class. `mayLoad = true` is applied to
// the definitions enclosed by the `let` that opens here.
2522 let mayLoad = true in {
2523 class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
2524 NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
2525 !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"),
2528 class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
2529 NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
2530 !strconcat("ld.param.v2", opstr,
2531 " \t{{$dst, $dst2}}, [retval0+$b];"), []>;
2533 class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
2534 NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
2537 !strconcat("ld.param.v4", opstr,
2538 " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
// Register form: prints `mov<opstr> $dst, retval$b;` ($b is appended to the
// symbol name rather than used as an offset) and is selected for
// (LoadParam (i32 0), imm:$b).
2542 class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
2543 NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
2544 !strconcat("mov", opstr, " \t$dst, retval$b;"),
2545 [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
// Instruction classes that print `st.param[.v2|.v4]<opstr>` stores;
// `mayStore = true` is applied by the `let` that opens here.
//  - StoreParam*Inst address the destination as `[param$a+$b]`, i.e. $a is
//    appended to the `param` symbol name and $b is the offset.
//  - StoreRetval*Inst address the destination as `[func_retval0+$a]`, with
//    $a as the offset.
// The V2 / V4 forms take 2 / 4 source registers of the same register class and
// print them as one braced list.
2547 let mayStore = true in {
2548 class StoreParamInst<NVPTXRegClass regclass, string opstr> :
2549 NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
2550 !strconcat("st.param", opstr, " \t[param$a+$b], $val;"),
2553 class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
2554 NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
2555 i32imm:$a, i32imm:$b),
2556 !strconcat("st.param.v2", opstr,
2557 " \t[param$a+$b], {{$val, $val2}};"),
2560 class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
2561 NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3,
2562 regclass:$val4, i32imm:$a,
2564 !strconcat("st.param.v4", opstr,
2565 " \t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
2568 class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
2569 NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
2570 !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"),
2573 class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
2574 NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
2575 !strconcat("st.param.v2", opstr,
2576 " \t[func_retval0+$a], {{$val, $val2}};"),
2579 class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
2581 (ins regclass:$val, regclass:$val2, regclass:$val3,
2582 regclass:$val4, i32imm:$a),
2583 !strconcat("st.param.v4", opstr,
2584 " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
// Defines the instructions that print the head of a call.
//   OpcStr - the call mnemonic that starts the printed text.
//   OpNode - the SDNode each instruction is selected for.
// The single i32 operand of OpNode picks the variant: PrintCallNoRetInst
// matches (OpNode (i32 0)) and prints just the mnemonic, while
// PrintCallRetInst<N> matches (OpNode (i32 N)) and prints the mnemonic followed
// by a parenthesized `retval0, ..., retval<N-1>` list and a trailing ", ".
// None of the instructions has register operands (both `outs` and `ins` are
// empty); variants are provided for N = 1..8.
2589 multiclass CALL<string OpcStr, SDNode OpNode> {
2590 def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
2591 !strconcat(OpcStr, " "), [(OpNode (i32 0))]>;
2592 def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
2593 !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>;
2594 def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
2595 !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>;
2596 def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
2597 !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>;
2598 def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
2599 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
2600 [(OpNode (i32 4))]>;
2601 def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
2602 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "),
2603 [(OpNode (i32 5))]>;
2604 def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
2605 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
2607 [(OpNode (i32 6))]>;
2608 def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
2609 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
2610 "retval5, retval6), "),
2611 [(OpNode (i32 7))]>;
2612 def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
2613 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
2614 "retval5, retval6, retval7), "),
2615 [(OpNode (i32 8))]>;
// Instantiate the CALL multiclass for the "call" and "call.uni" mnemonics,
// selected from the PrintCall and PrintCallUni nodes respectively.
2619 defm Call : CALL<"call", PrintCall>;
2620 defm CallUni : CALL<"call.uni", PrintCallUni>;
2622 // Convergent call instructions. These are identical to regular calls, except
2623 // they have the isConvergent bit set.
// Same mnemonics as above, but selected from the PrintConvergentCall /
// PrintConvergentCallUni nodes.
2624 let isConvergent=1 in {
2625 defm ConvergentCall : CALL<"call", PrintConvergentCall>;
2626 defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
// Concrete `ld.param` instructions, one per (vector width, element type) pair.
// The I8 and I16 variants both use Int16Regs and differ only in the printed
// type suffix (.b8 vs .b16). Integer variants use `.b<N>` suffixes, the
// floating-point variants use `.f32` / `.f64`.
// The V4 forms are instantiated for I32, I16, I8 and F32 only.
2629 def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">;
2630 def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">;
2631 def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">;
2632 def LoadParamMemI8 : LoadParamMemInst<Int16Regs, ".b8">;
2633 def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">;
2634 def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">;
2635 def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">;
2636 def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">;
2637 def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
2638 def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
2639 def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">;
2640 def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">;
2641 def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">;
2642 def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
2643 def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
2644 def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
// Concrete `st.param` instructions that write to `param<N>` symbols, one per
// (vector width, element type) pair. As with the loads, the I8 and I16
// variants both use Int16Regs and differ only in the printed type suffix.
// The V4 forms are instantiated for I32, I16, I8 and F32 only.
2646 def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">;
2647 def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">;
2649 def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">;
2650 def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">;
2651 def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">;
2652 def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">;
2653 def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">;
2654 def StoreParamV2I8 : StoreParamV2Inst<Int16Regs, ".b8">;
2656 def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">;
2657 def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">;
2658 def StoreParamV4I8 : StoreParamV4Inst<Int16Regs, ".b8">;
2660 def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">;
2661 def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">;
2662 def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">;
2663 def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">;
2664 def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
// Concrete `st.param` instructions that write to `func_retval0`, one per
// (vector width, element type) pair. The set of instantiated types mirrors the
// StoreParam* definitions: I8 / I16 share Int16Regs, and the V4 forms exist
// for I32, I16, I8 and F32 only.
2666 def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
2667 def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
2668 def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">;
2669 def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">;
2670 def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">;
2671 def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">;
2672 def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">;
2673 def StoreRetvalV2I8 : StoreRetvalV2Inst<Int16Regs, ".b8">;
2674 def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">;
2675 def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">;
2676 def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">;
2678 def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">;
2679 def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">;
2680 def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
2681 def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
2682 def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
// Operand-less instructions that print the punctuation around a call's
// argument list, plus the return instruction:
//   CallArgBeginInst            -> "("
//   CallArgEndInst1 (operand 1) -> ");"
//   CallArgEndInst0 (operand 0) -> ")"
//   RETURNInst                  -> "ret;"
2684 def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
2685 def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
2686 def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
2687 def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
// Classes for printing one register argument inside a call's argument list.
// CallArg* prints "$a, " (with a trailing separator) and matches the CallArg
// node; LastCallArg* prints "$a" with no separator and matches LastCallArg.
// All four match a first operand of (i32 0).
// The plain classes type the pattern operand by register class, the *VT
// classes by an explicit value type `vt`.
2689 class CallArgInst<NVPTXRegClass regclass> :
2690 NVPTXInst<(outs), (ins regclass:$a), "$a, ",
2691 [(CallArg (i32 0), regclass:$a)]>;
2693 class CallArgInstVT<NVPTXRegClass regclass, ValueType vt> :
2694 NVPTXInst<(outs), (ins regclass:$a), "$a, ",
2695 [(CallArg (i32 0), vt:$a)]>;
2697 class LastCallArgInst<NVPTXRegClass regclass> :
2698 NVPTXInst<(outs), (ins regclass:$a), "$a",
2699 [(LastCallArg (i32 0), regclass:$a)]>;
2700 class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt> :
2701 NVPTXInst<(outs), (ins regclass:$a), "$a",
2702 [(LastCallArg (i32 0), vt:$a)]>;
// Concrete call-argument instructions. The I32 and I16 variants use the *VT
// classes (pattern typed by i32 / i16); the I64, F64 and F32 variants use the
// register-class-typed classes.
2704 def CallArgI64 : CallArgInst<Int64Regs>;
2705 def CallArgI32 : CallArgInstVT<Int32Regs, i32>;
2706 def CallArgI16 : CallArgInstVT<Int16Regs, i16>;
2707 def CallArgF64 : CallArgInst<Float64Regs>;
2708 def CallArgF32 : CallArgInst<Float32Regs>;
2710 def LastCallArgI64 : LastCallArgInst<Int64Regs>;
2711 def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>;
2712 def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>;
2713 def LastCallArgF64 : LastCallArgInst<Float64Regs>;
2714 def LastCallArgF32 : LastCallArgInst<Float32Regs>;
// Immediate i32 argument: first pattern operand is (i32 0), and the immediate
// itself is printed.
2716 def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
2717 [(CallArg (i32 0), (i32 imm:$a))]>;
2718 def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
2719 [(LastCallArg (i32 0), (i32 imm:$a))]>;
// Parameter-symbol argument: first pattern operand is (i32 1), and the
// immediate is printed as the suffix of a `param<N>` symbol name.
2721 def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
2722 [(CallArg (i32 1), (i32 imm:$a))]>;
2723 def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
2724 [(LastCallArg (i32 1), (i32 imm:$a))]>;
// Instructions that print the call target as "$addr, ". CallVoidInst matches a
// wrapped global address; CallVoidInstReg / CallVoidInstReg64 match a target
// held in a 32-bit / 64-bit integer register.
2726 def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
2727 [(CallVoid (Wrapper tglobaladdr:$addr))]>;
2728 def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
2729 [(CallVoid i32:$addr)]>;
2730 def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
2731 [(CallVoid Int64Regs:$addr)]>;
// Prints ", prototype_<val>;" where <val> is the i32 immediate operand.
2732 def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
2733 [(Prototype (i32 imm:$val))]>;
// Instructions that print declarations for `retval<N>` and `param<N>` symbols.
// All operands are i32 immediates that are substituted into the printed text.
//
// Return-value declarations:
//   DeclareRetMemInst    - ".param .align A .b8 retval<num>[size];"
//   DeclareRetScalarInst - ".param .b<size> retval<num>;", first operand 1
//   DeclareRetRegInst    - ".reg .b<size> retval<num>;",   first operand 2
2735 def DeclareRetMemInst :
2736 NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
2737 ".param .align $align .b8 retval$num[$size];",
2738 [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
2739 def DeclareRetScalarInst :
2740 NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
2741 ".param .b$size retval$num;",
2742 [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
2743 def DeclareRetRegInst :
2744 NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
2745 ".reg .b$size retval$num;",
2746 [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
// Parameter declarations:
//   DeclareParamInst       - ".param .align A .b8 param<a>[size];"
//   DeclareScalarParamInst - ".param .b<size> param<a>;", last operand 0
//   DeclareScalarRegInst   - ".reg .b<size> param<a>;",   last operand 1
// Note the operand order differs from the retval forms: here the symbol index
// ($a) precedes the size.
2748 def DeclareParamInst :
2749 NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
2750 ".param .align $align .b8 param$a[$size];",
2751 [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
2752 def DeclareScalarParamInst :
2753 NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
2754 ".param .b$size param$a;",
2755 [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
2756 def DeclareScalarRegInst :
2757 NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
2758 ".reg .b$size param$a;",
2759 [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
// Register-to-register form of MoveParam: prints `mov<asmstr> $dst, $src;`
// and matches (MoveParam src) where source and result both have value type T
// and live in `regclass`.
2761 class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr> :
2762 NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
2763 !strconcat("mov", asmstr, " \t$dst, $src;"),
2764 [(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>;
// Symbol form of MoveParam: same printed text, but the source operand has the
// operand type `srcty` and the pattern matches a `texternalsym` source.
2766 class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt,
2768 NVPTXInst<(outs regclass:$dst), (ins srcty:$src),
2769 !strconcat("mov", asmstr, " \t$dst, $src;"),
2770 [(set vt:$dst, (MoveParam texternalsym:$src))]>;
2772 def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">;
2773 def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">;
2775 def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">;
2776 def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">;
// The i16 form does not use MoveParamInst: it prints a `cvt.u16.u32` rather
// than a `mov` (see the open question in the inline comment).
2779 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
2780 "cvt.u16.u32 \t$dst, $src;", // ??? Why cvt.u16.u32 ?
2781 [(set i16:$dst, (MoveParam i16:$src))]>;
2782 def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">;
2783 def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">;
// Matches (PseudoUseParam src) for a source of value type `vt` in `regclass`.
// It has no outputs, and its printed text starts with "//", i.e. it is printed
// as a comment naming the register rather than as an operation.
2785 class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt> :
2786 NVPTXInst<(outs), (ins regclass:$src),
2787 "// Pseudo use of $src",
2788 [(PseudoUseParam vt:$src)]>;
2790 def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs, i64>;
2791 def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs, i32>;
2792 def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs, i16>;
2793 def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>;
2794 def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>;
// Matches (ProxyReg src) and prints `mov.<SzStr> $dst, $src;`, copying a value
// of type T within a single register class.
//   SzStr    - type suffix printed after "mov.".
//   T        - value type of both the source and the result.
//   regclass - register class of both the source and the result.
2796 class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass> :
2797 NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
2798 !strconcat("mov.", SzStr, " \t$dst, $src;"),
2799 [(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>;
2801 def ProxyRegI1 : ProxyRegInst<"pred", i1, Int1Regs>;
2802 def ProxyRegI16 : ProxyRegInst<"b16", i16, Int16Regs>;
2803 def ProxyRegI32 : ProxyRegInst<"b32", i32, Int32Regs>;
2804 def ProxyRegI64 : ProxyRegInst<"b64", i64, Int64Regs>;
2805 def ProxyRegF32 : ProxyRegInst<"f32", f32, Float32Regs>;
2806 def ProxyRegF64 : ProxyRegInst<"f64", f64, Float64Regs>;
// f16 and bf16 values are routed through the 16-bit integer proxy.
2808 foreach vt = [f16, bf16] in {
2809 def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 Int16Regs:$src)>;
// The 32-bit packed vector types are routed through the 32-bit integer proxy.
2812 foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
2813 def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 Int32Regs:$src)>;
2817 // Load / Store Handling
// Scalar load instructions writing one register of class `regclass`.
// Every variant takes four LdStCode modifier operands ($isVol, $addsp, $Vec,
// $Sign) and an i32 immediate $fromWidth, all of which are spliced into the
// printed mnemonic, followed by the address operands. The six variants differ
// only in how the address is given:
//   _avar    - imem symbol                     -> [$addr]
//   _areg    - Int32Regs register              -> [$addr]
//   _areg_64 - Int64Regs register              -> [$addr]
//   _ari     - Int32Regs register + i32 offset -> [$addr+$offset]
//   _ari_64  - Int64Regs register + i32 offset -> [$addr+$offset]
//   _asi     - imem symbol + i32 offset        -> [$addr+$offset]
// All variants have an empty pattern list.
2819 multiclass LD<NVPTXRegClass regclass> {
2820 def _avar : NVPTXInst<
2821 (outs regclass:$dst),
2822 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2823 i32imm:$fromWidth, imem:$addr),
2824 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2825 "\t$dst, [$addr];", []>;
2826 def _areg : NVPTXInst<
2827 (outs regclass:$dst),
2828 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2829 i32imm:$fromWidth, Int32Regs:$addr),
2830 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2831 "\t$dst, [$addr];", []>;
2832 def _areg_64 : NVPTXInst<
2833 (outs regclass:$dst),
2834 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2835 i32imm:$fromWidth, Int64Regs:$addr),
2836 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2837 "\t$dst, [$addr];", []>;
2838 def _ari : NVPTXInst<
2839 (outs regclass:$dst),
2840 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2841 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
2842 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2843 "\t$dst, [$addr+$offset];", []>;
2844 def _ari_64 : NVPTXInst<
2845 (outs regclass:$dst),
2846 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2847 LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
2848 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2849 "\t$dst, [$addr+$offset];", []>;
2850 def _asi : NVPTXInst<
2851 (outs regclass:$dst),
2852 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2853 LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
2854 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2855 "\t$dst, [$addr+$offset];", []>;
// Instantiate the scalar loads per element type, marked as loads without
// other side effects. LD_i8 and LD_i16 both use Int16Regs as the destination
// register class.
2858 let mayLoad=1, hasSideEffects=0 in {
2859 defm LD_i8 : LD<Int16Regs>;
2860 defm LD_i16 : LD<Int16Regs>;
2861 defm LD_i32 : LD<Int32Regs>;
2862 defm LD_i64 : LD<Int64Regs>;
2863 defm LD_f32 : LD<Float32Regs>;
2864 defm LD_f64 : LD<Float64Regs>;
// Scalar store instructions taking the stored value in one register of class
// `regclass` ($src). As with the loads, four LdStCode modifier operands
// ($isVol, $addsp, $Vec, $Sign) and an i32 immediate $toWidth are spliced into
// the printed mnemonic, followed by the address operands. The six variants
// differ only in how the address is given:
//   _avar    - imem symbol                     -> [$addr]
//   _areg    - Int32Regs register              -> [$addr]
//   _areg_64 - Int64Regs register              -> [$addr]
//   _ari     - Int32Regs register + i32 offset -> [$addr+$offset]
//   _ari_64  - Int64Regs register + i32 offset -> [$addr+$offset]
//   _asi     - imem symbol + i32 offset        -> [$addr+$offset]
// All variants have an empty pattern list.
2867 multiclass ST<NVPTXRegClass regclass> {
2868 def _avar : NVPTXInst<
2870 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2871 LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
2872 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2873 " \t[$addr], $src;", []>;
2874 def _areg : NVPTXInst<
2876 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
2877 LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
2878 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2879 " \t[$addr], $src;", []>;
2880 def _areg_64 : NVPTXInst<
2882 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2883 LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
2884 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2885 " \t[$addr], $src;", []>;
2886 def _ari : NVPTXInst<
2888 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2889 LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
2890 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2891 " \t[$addr+$offset], $src;", []>;
2892 def _ari_64 : NVPTXInst<
2894 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2895 LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
2896 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2897 " \t[$addr+$offset], $src;", []>;
2898 def _asi : NVPTXInst<
2900 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2901 LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
2902 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2903 " \t[$addr+$offset], $src;", []>;
// Instantiate the scalar stores per element type, marked as stores without
// other side effects. ST_i8 and ST_i16 both use Int16Regs as the source
// register class.
2906 let mayStore=1, hasSideEffects=0 in {
2907 defm ST_i8 : ST<Int16Regs>;
2908 defm ST_i16 : ST<Int16Regs>;
2909 defm ST_i32 : ST<Int32Regs>;
2910 defm ST_i64 : ST<Int64Regs>;
2911 defm ST_f32 : ST<Float32Regs>;
2912 defm ST_f64 : ST<Float64Regs>;
2915 // The following is used only in and after vector elementizations. Vector
2916 // elementization happens at the machine instruction level, so the following
2917 // instructions never appear in the DAG.
// Vector loads writing two (_v2_*) or four (_v4_*) registers of class
// `regclass`, printed as one braced destination list. The modifier operands
// and the six addressing variants (_avar, _areg, _areg_64, _ari, _ari_64,
// _asi) are the same as in the scalar LD multiclass, and every variant has an
// empty pattern list.
2918 multiclass LD_VEC<NVPTXRegClass regclass> {
2919 def _v2_avar : NVPTXInst<
2920 (outs regclass:$dst1, regclass:$dst2),
2921 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2922 i32imm:$fromWidth, imem:$addr),
2923 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2924 "\t{{$dst1, $dst2}}, [$addr];", []>;
2925 def _v2_areg : NVPTXInst<
2926 (outs regclass:$dst1, regclass:$dst2),
2927 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2928 i32imm:$fromWidth, Int32Regs:$addr),
2929 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2930 "\t{{$dst1, $dst2}}, [$addr];", []>;
2931 def _v2_areg_64 : NVPTXInst<
2932 (outs regclass:$dst1, regclass:$dst2),
2933 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2934 i32imm:$fromWidth, Int64Regs:$addr),
2935 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2936 "\t{{$dst1, $dst2}}, [$addr];", []>;
2937 def _v2_ari : NVPTXInst<
2938 (outs regclass:$dst1, regclass:$dst2),
2939 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2940 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
2941 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2942 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
2943 def _v2_ari_64 : NVPTXInst<
2944 (outs regclass:$dst1, regclass:$dst2),
2945 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2946 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
2947 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2948 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
2949 def _v2_asi : NVPTXInst<
2950 (outs regclass:$dst1, regclass:$dst2),
2951 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2952 i32imm:$fromWidth, imem:$addr, i32imm:$offset),
2953 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2954 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
2955 def _v4_avar : NVPTXInst<
2956 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2957 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2958 i32imm:$fromWidth, imem:$addr),
2959 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2960 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
2961 def _v4_areg : NVPTXInst<
2962 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2963 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2964 i32imm:$fromWidth, Int32Regs:$addr),
2965 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2966 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
2967 def _v4_areg_64 : NVPTXInst<
2968 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2969 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2970 i32imm:$fromWidth, Int64Regs:$addr),
2971 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2972 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
2973 def _v4_ari : NVPTXInst<
2974 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2975 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2976 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
2977 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2978 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
2979 def _v4_ari_64 : NVPTXInst<
2980 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2981 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2982 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
2983 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2984 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
2985 def _v4_asi : NVPTXInst<
2986 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2987 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2988 i32imm:$fromWidth, imem:$addr, i32imm:$offset),
2989 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2990 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
// Instantiate the vector loads per element type, marked as loads without
// other side effects. LDV_i8 and LDV_i16 both use Int16Regs for the
// destination registers.
2992 let mayLoad=1, hasSideEffects=0 in {
2993 defm LDV_i8 : LD_VEC<Int16Regs>;
2994 defm LDV_i16 : LD_VEC<Int16Regs>;
2995 defm LDV_i32 : LD_VEC<Int32Regs>;
2996 defm LDV_i64 : LD_VEC<Int64Regs>;
2997 defm LDV_f32 : LD_VEC<Float32Regs>;
2998 defm LDV_f64 : LD_VEC<Float64Regs>;
// Vector stores taking two (_v2_*) or four (_v4_*) source registers of class
// `regclass`, printed as one braced source list after the address. The
// modifier operands and the six addressing variants (_avar, _areg, _areg_64,
// _ari, _ari_64, _asi) parallel the scalar ST multiclass, and every variant
// has an empty pattern list.
// Note: the width operand is named $fromWidth here, whereas the scalar ST
// multiclass names it $toWidth.
3001 multiclass ST_VEC<NVPTXRegClass regclass> {
3002 def _v2_avar : NVPTXInst<
3004 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
3005 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
3006 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3007 "\t[$addr], {{$src1, $src2}};", []>;
3008 def _v2_areg : NVPTXInst<
3010 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
3011 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
3012 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3013 "\t[$addr], {{$src1, $src2}};", []>;
3014 def _v2_areg_64 : NVPTXInst<
3016 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
3017 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
3018 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3019 "\t[$addr], {{$src1, $src2}};", []>;
3020 def _v2_ari : NVPTXInst<
3022 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
3023 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
3025 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3026 "\t[$addr+$offset], {{$src1, $src2}};", []>;
3027 def _v2_ari_64 : NVPTXInst<
3029 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
3030 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
3032 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3033 "\t[$addr+$offset], {{$src1, $src2}};", []>;
3034 def _v2_asi : NVPTXInst<
3036 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
3037 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
3039 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3040 "\t[$addr+$offset], {{$src1, $src2}};", []>;
3041 def _v4_avar : NVPTXInst<
3043 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3044 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3045 i32imm:$fromWidth, imem:$addr),
3046 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3047 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
3048 def _v4_areg : NVPTXInst<
3050 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3051 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3052 i32imm:$fromWidth, Int32Regs:$addr),
3053 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3054 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
3055 def _v4_areg_64 : NVPTXInst<
3057 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3058 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3059 i32imm:$fromWidth, Int64Regs:$addr),
3060 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3061 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
3062 def _v4_ari : NVPTXInst<
3064 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3065 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3066 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
3067 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3068 "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
3069 def _v4_ari_64 : NVPTXInst<
3071 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3072 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3073 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
3074 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3075 "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
3076 def _v4_asi : NVPTXInst<
3078 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3079 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
3080 i32imm:$fromWidth, imem:$addr, i32imm:$offset),
3081 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
3082 "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
// Instantiate the vector stores per element type, marked as stores without
// other side effects. STV_i8 and STV_i16 both use Int16Regs for the source
// registers.
3085 let mayStore=1, hasSideEffects=0 in {
3086 defm STV_i8 : ST_VEC<Int16Regs>;
3087 defm STV_i16 : ST_VEC<Int16Regs>;
3088 defm STV_i32 : ST_VEC<Int32Regs>;
3089 defm STV_i64 : ST_VEC<Int64Regs>;
3090 defm STV_f32 : ST_VEC<Float32Regs>;
3091 defm STV_f64 : ST_VEC<Float64Regs>;
3094 //---- Conversion ----
// Matches a `bitconvert` from value type TIn to value type TOut and prints
// `mov.b<SzStr> $d, $a;`.
//   SzStr       - bit width printed after "mov.b".
//   TIn / TOut  - source / result value types.
//   regclassIn / regclassOut - register classes of the operands; by default
//                 they are derived from the value types through
//                 ValueToRegClass<...>.ret.
3096 class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut,
3097 NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret,
3098 NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret> :
3099 NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
3100 !strconcat("mov.b", SzStr, " \t$d, $a;"),
3101 [(set (TOut regclassOut:$d), (bitconvert (TIn regclassIn:$a)))]>;
// Integer <-> floating-point bit casts of equal width (32 and 64 bits).
3103 def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>;
3104 def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>;
3105 def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>;
3106 def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;
// Bit casts between f32 and the 32-bit packed vector types reuse the 32-bit
// f32<->i32 move instructions; the vector values are taken from / produced in
// Int32Regs.
3108 foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
3109 def: Pat<(vt (bitconvert (f32 Float32Regs:$a))),
3110 (BITCONVERT_32_F2I Float32Regs:$a)>;
3111 def: Pat<(f32 (bitconvert (vt Int32Regs:$a))),
3112 (BITCONVERT_32_I2F Int32Regs:$a)>;
// Bit casts between i16 and f16 / bf16: a constant source is materialized with
// IMOVB16ri, and register sources in either direction go through ProxyRegI16
// (all three types use Int16Regs here).
3114 foreach vt = [f16, bf16] in {
3115 def: Pat<(vt (bitconvert (i16 UInt16Const:$a))),
3116 (IMOVB16ri UInt16Const:$a)>;
3117 def: Pat<(vt (bitconvert (i16 Int16Regs:$a))),
3118 (ProxyRegI16 Int16Regs:$a)>;
3119 def: Pat<(i16 (bitconvert (vt Int16Regs:$a))),
3120 (ProxyRegI16 Int16Regs:$a)>;
// Bit casts among i32 and the 32-bit packed vector types: a constant i32
// source is materialized with IMOVB32ri, and every ordered pair of distinct
// types (ta != tb) is handled by ProxyRegI32.
3123 foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
3124 def: Pat<(ta (bitconvert (i32 UInt32Const:$a))),
3125 (IMOVB32ri UInt32Const:$a)>;
3126 foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in {
3127 if !ne(ta, tb) then {
3128 def: Pat<(ta (bitconvert (tb Int32Regs:$a))),
3129 (ProxyRegI32 Int32Regs:$a)>;
3134 // NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
3135 // we cannot specify floating-point literals in isel patterns. Therefore, we
3136 // use an integer selp to select either 1 or 0 and then cvt to floating-point.
// Integer -> f16 / bf16 conversion patterns. The i16, i32 and i64 sources map
// directly to the CVT instruction of matching width and signedness; the i1
// source is first turned into the 32-bit value 1 or 0 with SELP_u32ii. All
// conversions pass CvtRN as the conversion mode.
// NOTE(review): the signed i1 patterns also select 1 (not -1) for a true
// predicate, whereas LLVM's sitofp treats an i1 `true` as -1 -- confirm that
// this is intended.
//
// Signed integer -> f16.
3139 def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
3140 (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
3141 def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
3142 (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
3143 def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
3144 (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
3145 def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
3146 (CVT_f16_s64 Int64Regs:$a, CvtRN)>;
// Unsigned integer -> f16.
3149 def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
3150 (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
3151 def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
3152 (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
3153 def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
3154 (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
3155 def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
3156 (CVT_f16_u64 Int64Regs:$a, CvtRN)>;
// Signed integer -> bf16.
3159 def : Pat<(bf16 (sint_to_fp Int1Regs:$a)),
3160 (CVT_bf16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
3161 def : Pat<(bf16 (sint_to_fp Int16Regs:$a)),
3162 (CVT_bf16_s16 Int16Regs:$a, CvtRN)>;
3163 def : Pat<(bf16 (sint_to_fp Int32Regs:$a)),
3164 (CVT_bf16_s32 Int32Regs:$a, CvtRN)>;
3165 def : Pat<(bf16 (sint_to_fp Int64Regs:$a)),
3166 (CVT_bf16_s64 Int64Regs:$a, CvtRN)>;
// Unsigned integer -> bf16.
3169 def : Pat<(bf16 (uint_to_fp Int1Regs:$a)),
3170 (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
3171 def : Pat<(bf16 (uint_to_fp Int16Regs:$a)),
3172 (CVT_bf16_u16 Int16Regs:$a, CvtRN)>;
3173 def : Pat<(bf16 (uint_to_fp Int32Regs:$a)),
3174 (CVT_bf16_u32 Int32Regs:$a, CvtRN)>;
3175 def : Pat<(bf16 (uint_to_fp Int64Regs:$a)),
3176 (CVT_bf16_u64 Int64Regs:$a, CvtRN)>;
3179 def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
3180 (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
3181 def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
3182 (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
3183 def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
3184 (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
3185 def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
3186 (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
// uint -> f32
// An i1 source is first widened to a 32-bit 1/0 value with SELP, then
// converted; wider sources are converted directly. All use the CvtRN mode.
def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (uint_to_fp Int16Regs:$a)), (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int32Regs:$a)), (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int64Regs:$a)), (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
// sint -> f64
// A true i1 read as a *signed* integer is -1, so the i1 source is
// materialized as -1/0 (not 1/0) before the signed conversion; otherwise
// `sitofp i1 true` would yield +1.0 instead of -1.0.
def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
          (CVT_f64_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
// uint -> f64
// An i1 source is first widened to a 32-bit 1/0 value with SELP, then
// converted; wider sources are converted directly. All use the CvtRN mode.
def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (uint_to_fp Int16Regs:$a)), (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int32Regs:$a)), (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int64Regs:$a)), (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
// f16 -> sint
// Integer results of 16 bits and wider use CVT with the CvtRZI mode; the i1
// result is produced by comparing the raw 16-bit register against 0.
// NOTE(review): the i1 cases use CmpEQ, i.e. the predicate is set when the
// bits equal zero -- confirm this polarity is intended.
def : Pat<(i1 (fp_to_sint (f16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint (f16 Int16Regs:$a))),
          (CVT_s16_f16 (f16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i32 (fp_to_sint (f16 Int16Regs:$a))),
          (CVT_s32_f16 (f16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i64 (fp_to_sint (f16 Int16Regs:$a))),
          (CVT_s64_f16 Int16Regs:$a, CvtRZI)>;

// f16 -> uint
def : Pat<(i1 (fp_to_uint (f16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint (f16 Int16Regs:$a))),
          (CVT_u16_f16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint (f16 Int16Regs:$a))),
          (CVT_u32_f16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint (f16 Int16Regs:$a))),
          (CVT_u64_f16 Int16Regs:$a, CvtRZI)>;
// bf16 -> sint
// Integer results of 16 bits and wider use CVT with the CvtRZI mode; the i1
// result is produced by comparing the raw 16-bit register against 0.
// NOTE(review): the i1 cases use CmpEQ, i.e. the predicate is set when the
// bits equal zero -- confirm this polarity is intended.
def : Pat<(i1 (fp_to_sint (bf16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint (bf16 Int16Regs:$a))),
          (CVT_s16_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i32 (fp_to_sint (bf16 Int16Regs:$a))),
          (CVT_s32_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i64 (fp_to_sint (bf16 Int16Regs:$a))),
          (CVT_s64_bf16 Int16Regs:$a, CvtRZI)>;

// bf16 -> uint
def : Pat<(i1 (fp_to_uint (bf16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint (bf16 Int16Regs:$a))),
          (CVT_u16_bf16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint (bf16 Int16Regs:$a))),
          (CVT_u32_bf16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint (bf16 Int16Regs:$a))),
          (CVT_u64_bf16 Int16Regs:$a, CvtRZI)>;
// f32 -> sint
// Each width has two patterns: the doF32FTZ-guarded one selects the _FTZ
// conversion mode and is listed first; the unguarded one follows it.
// The i1 result compares the bit-converted value against 0.
def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
          (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
          (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;

// f32 -> uint
def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
          (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
          (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
// f64 -> sint
// Results of 16 bits and wider use CVT with the CvtRZI mode; the i1 result
// compares the bit-converted 64-bit value against 0.
def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float64Regs:$a)), (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float64Regs:$a)), (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float64Regs:$a)), (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;

// f64 -> uint
def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float64Regs:$a)), (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float64Regs:$a)), (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float64Regs:$a)), (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
// sext i1: select between -1 and 0.
def : Pat<(i16 (sext Int1Regs:$a)), (SELP_s16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (sext Int1Regs:$a)), (SELP_s32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (sext Int1Regs:$a)), (SELP_s64ii -1, 0, Int1Regs:$a)>;

// zext i1: select between 1 and 0.
def : Pat<(i16 (zext Int1Regs:$a)), (SELP_u16ii 1, 0, Int1Regs:$a)>;
def : Pat<(i32 (zext Int1Regs:$a)), (SELP_u32ii 1, 0, Int1Regs:$a)>;
def : Pat<(i64 (zext Int1Regs:$a)), (SELP_u64ii 1, 0, Int1Regs:$a)>;

// anyext i1: the upper bits are unspecified; this selects between -1 and 0.
def : Pat<(i16 (anyext Int1Regs:$a)), (SELP_u16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (anyext Int1Regs:$a)), (SELP_u32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (anyext Int1Regs:$a)), (SELP_u64ii -1, 0, Int1Regs:$a)>;
// sext i16: signed widening conversions.
def : Pat<(i32 (sext Int16Regs:$a)), (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (sext Int16Regs:$a)), (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;

// zext i16: unsigned widening conversions.
def : Pat<(i32 (zext Int16Regs:$a)), (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (zext Int16Regs:$a)), (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;

// anyext i16: lowered the same way as zext.
def : Pat<(i32 (anyext Int16Regs:$a)), (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (anyext Int16Regs:$a)), (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;

// sext i32
def : Pat<(i64 (sext Int32Regs:$a)), (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;

// zext i32
def : Pat<(i64 (zext Int32Regs:$a)), (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;

// anyext i32: lowered the same way as zext.
def : Pat<(i64 (anyext Int32Regs:$a)), (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
// Integer truncation.
// Narrowing to i16/i32 uses an unsigned CVT; narrowing to i1 tests the low
// bit of the source: (a & 1) == 1.

// truncate i64
def : Pat<(i32 (trunc Int64Regs:$a)), (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i16 (trunc Int64Regs:$a)), (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i1 (trunc Int64Regs:$a)),
          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;

// truncate i32
def : Pat<(i16 (trunc Int32Regs:$a)), (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
def : Pat<(i1 (trunc Int32Regs:$a)),
          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;

// truncate i16
def : Pat<(i1 (trunc Int16Regs:$a)),
          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
// sext_inreg: sign-extend the low i8/i16/i32 part of a register in place,
// one CVT_INREG instruction per (register width, source width) pair.
def : Pat<(sext_inreg Int16Regs:$a, i8),
          (CVT_INREG_s16_s8 Int16Regs:$a)>;
def : Pat<(sext_inreg Int32Regs:$a, i8),
          (CVT_INREG_s32_s8 Int32Regs:$a)>;
def : Pat<(sext_inreg Int32Regs:$a, i16),
          (CVT_INREG_s32_s16 Int32Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i8),
          (CVT_INREG_s64_s8 Int64Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i16),
          (CVT_INREG_s64_s16 Int64Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i32),
          (CVT_INREG_s64_s32 Int64Regs:$a)>;
// Select instructions with 32-bit predicates.
// The i32 predicate is turned into an i1 by testing its low bit,
// (pred & 1) == 1, and that i1 then drives the SELP for each value type.
def : Pat<(select (i32 Int32Regs:$pred), i16:$a, i16:$b),
          (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
            (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), i32:$a, i32:$b),
          (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
            (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), Int64Regs:$a, Int64Regs:$b),
          (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
            (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred),
                  (f16 Int16Regs:$a), (f16 Int16Regs:$b)),
          (SELP_f16rr Int16Regs:$a, Int16Regs:$b,
            (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred),
                  (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)),
          (SELP_bf16rr Int16Regs:$a, Int16Regs:$b,
            (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), Float32Regs:$a, Float32Regs:$b),
          (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
            (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), Float64Regs:$a, Float64Regs:$b),
          (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
            (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
// Vector-style `mov` instructions that pack several narrow registers into one
// wide register, or unpack one wide register into several narrow ones.
// Every definition declares the `$s` source operand its asm string uses,
// carries an (empty) selection pattern list, and the enclosing `let` block is
// closed after the last definition.
let hasSideEffects = false in {
  // pack a set of smaller int registers to a larger int register
  def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
                             (ins Int16Regs:$s1, Int16Regs:$s2,
                                  Int16Regs:$s3, Int16Regs:$s4),
                             "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
  def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
                             (ins Int16Regs:$s1, Int16Regs:$s2),
                             "mov.b32 \t$d, {{$s1, $s2}};", []>;
  def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
                             (ins Int32Regs:$s1, Int32Regs:$s2),
                             "mov.b64 \t$d, {{$s1, $s2}};", []>;
  def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
                             (ins Float32Regs:$s1, Float32Regs:$s2),
                             "mov.b64 \t$d, {{$s1, $s2}};", []>;

  // unpack a larger int register to a set of smaller int registers
  def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
                                   Int16Regs:$d3, Int16Regs:$d4),
                             (ins Int64Regs:$s),
                             "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
  def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
                             (ins Int32Regs:$s),
                             "mov.b32 \t{{$d1, $d2}}, $s;", []>;
  def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
                             (ins Int64Regs:$s),
                             "mov.b64 \t{{$d1, $d2}}, $s;", []>;
  def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
                             (ins Float64Regs:$s),
                             "mov.b64 \t{{$d1, $d2}}, $s;", []>;

  // Extract only one half of a register; the other half is moved into a
  // scratch register declared inside the emitted asm block.
  def I32toI16H  : NVPTXInst<(outs Int16Regs:$high),
                             (ins Int32Regs:$s),
                             "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}",
                             []>;
  def I32toI16L  : NVPTXInst<(outs Int16Regs:$low),
                             (ins Int32Regs:$s),
                             "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}",
                             []>;
  def I64toI32H  : NVPTXInst<(outs Int32Regs:$high),
                             (ins Int64Regs:$s),
                             "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}",
                             []>;
}
// Extracting the upper half of an integer (shift right by half the width,
// then truncate) is selected to a partial vectorized move, which produces
// better SASS than a shift followed by a conversion. Logical and arithmetic
// shifts are handled alike.
def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))), (I32toI16H Int32Regs:$s)>;
def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))), (I32toI16H Int32Regs:$s)>;
def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))), (I64toI32H Int64Regs:$s)>;
def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))), (I64toI32H Int64Regs:$s)>;

// Sign-extending element 0 of a v2i16 is an in-register s16 -> s32 extension
// of the 32-bit register holding the vector.
def : Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))),
          (CVT_INREG_s32_s16 Int32Regs:$src)>;
// Element extraction from 2 x 16-bit vectors held in a 32-bit register:
// element 0 is the low half, element 1 the high half. The loop body is
// closed right after the two patterns, the only ones that use `vt`.
foreach vt = [v2f16, v2bf16, v2i16] in {
  def : Pat<(extractelt (vt Int32Regs:$src), 0),
            (I32toI16L Int32Regs:$src)>;
  def : Pat<(extractelt (vt Int32Regs:$src), 1),
            (I32toI16H Int32Regs:$src)>;
}
// Building a 2 x 16-bit vector packs the two 16-bit registers into one
// 32-bit register, regardless of the element type.
def : Pat<(v2f16 (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))),
          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;

// scalar_to_vector only defines element 0, so an unsigned widening of the
// scalar into the 32-bit register is sufficient.
def : Pat<(v2i16 (scalar_to_vector (i16 Int16Regs:$a))),
          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
// Count leading zeros
// Both forms write a 32-bit result register, including the 64-bit-input one.
let hasSideEffects = false in {
  def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                         "clz.b32 \t$d, $a;", []>;
  def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                         "clz.b64 \t$d, $a;", []>;
}
// 32-bit has a direct PTX instruction
def : Pat<(i32 (ctlz (i32 Int32Regs:$a))), (CLZr32 Int32Regs:$a)>;

// The return type of the ctlz ISD node is the same as its input, but the PTX
// clz instruction always returns a 32-bit value. For ctlz.i64, convert the
// PTX value to 64 bits to match the ISD node's semantics, unless we know we're
// truncating back down to 32 bits.
def : Pat<(i64 (ctlz Int64Regs:$a)),
          (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
def : Pat<(i32 (trunc (ctlz Int64Regs:$a))),
          (CLZr64 Int64Regs:$a)>;

// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
// result back to 16-bits if necessary. We also need to subtract 16 because
// the high-order 16 zeros were counted.
//
// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
// use to save one SASS instruction (on sm_35 anyway):
//
//   mov.b32 $tmp, {0xffff, $a}
//   clz.b32 $result, $tmp
//
// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
// and then clz that value. This way we don't have to subtract 16 from the
// result. Unfortunately today we don't have a way to generate
// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
def : Pat<(i16 (ctlz Int16Regs:$a)),
          (SUBi16ri
            (CVT_u16_u32
              (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE),
            16)>;
def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
          (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
// Population count
// Both forms write a 32-bit result register, including the 64-bit-input one.
let hasSideEffects = false in {
  def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                          "popc.b32 \t$d, $a;", []>;
  def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                          "popc.b64 \t$d, $a;", []>;
}
// 32-bit has a direct PTX instruction
def : Pat<(i32 (ctpop (i32 Int32Regs:$a))), (POPCr32 Int32Regs:$a)>;

// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
// to match the LLVM semantics. Just as with ctlz.i64, we provide a second
// pattern that avoids the type conversion if we're truncating the result to
// i32 anyway.
def : Pat<(ctpop Int64Regs:$a),
          (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
def : Pat<(i32 (trunc (ctpop Int64Regs:$a))),
          (POPCr64 Int64Regs:$a)>;

// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
// If we know that we're storing into an i32, we can avoid the final trunc.
def : Pat<(ctpop Int16Regs:$a),
          (CVT_u16_u32
            (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
          (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
// Floating-point narrowing (fpround). Every pattern uses the CvtRN mode; the
// f64 -> f32 case additionally has a doF32FTZ-guarded variant, listed first,
// that selects CvtRN_FTZ.

// fpround f32 -> f16
def : Pat<(f16 (fpround Float32Regs:$a)), (CVT_f16_f32 Float32Regs:$a, CvtRN)>;

// fpround f32 -> bf16
def : Pat<(bf16 (fpround Float32Regs:$a)), (CVT_bf16_f32 Float32Regs:$a, CvtRN)>;

// fpround f64 -> f16
def : Pat<(f16 (fpround Float64Regs:$a)), (CVT_f16_f64 Float64Regs:$a, CvtRN)>;

// fpround f64 -> bf16
def : Pat<(bf16 (fpround Float64Regs:$a)), (CVT_bf16_f64 Float64Regs:$a, CvtRN)>;

// fpround f64 -> f32
def : Pat<(f32 (fpround Float64Regs:$a)),
          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(f32 (fpround Float64Regs:$a)),
          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
// Floating-point widening (fpextend). The default mode is CvtNONE; patterns
// that read or produce f32 also have a doF32FTZ-guarded variant, listed
// first, that selects CvtNONE_FTZ.

// fpextend f16 -> f32
def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
          (CVT_f32_f16 Int16Regs:$a, CvtNONE_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;

// fpextend bf16 -> f32
def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
          (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
          (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>;

// fpextend f16 -> f64
def : Pat<(f64 (fpextend (f16 Int16Regs:$a))),
          (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;

// fpextend bf16 -> f64
def : Pat<(f64 (fpextend (bf16 Int16Regs:$a))),
          (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>;

// fpextend f32 -> f64
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
// DAG node for NVPTXISD::RET_GLUE: no operands or results (SDTNone), chained,
// with an optional incoming glue.
def retglue :
  SDNode<"NVPTXISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue]>;
// fceil, ffloor, froundeven, ftrunc.

// Emits the selection patterns for one float-to-integral rounding node.
//   OpNode  - the rounding SDNode to match.
//   Mode    - conversion mode used for f16, bf16, f64, and for f32 when
//             doNoF32FTZ holds.
//   ModeFTZ - conversion mode used for f32 when doF32FTZ holds.
// Each type is rounded with a same-type CVT (e.g. CVT_f32_f32).
multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  def : Pat<(OpNode (f16 Int16Regs:$a)),
            (CVT_f16_f16 Int16Regs:$a, Mode)>;
  def : Pat<(OpNode (bf16 Int16Regs:$a)),
            (CVT_bf16_bf16 Int16Regs:$a, Mode)>;
  def : Pat<(OpNode Float32Regs:$a),
            (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>;
  def : Pat<(OpNode Float32Regs:$a),
            (CVT_f32_f32 Float32Regs:$a, Mode)>, Requires<[doNoF32FTZ]>;
  def : Pat<(OpNode Float64Regs:$a),
            (CVT_f64_f64 Float64Regs:$a, Mode)>;
}
// Map each rounding node to its conversion mode:
//   fceil      -> RPI
//   ffloor     -> RMI
//   froundeven -> RNI
//   ftrunc     -> RZI
defm : CVT_ROUND<fceil,      CvtRPI, CvtRPI_FTZ>;
defm : CVT_ROUND<ffloor,     CvtRMI, CvtRMI_FTZ>;
defm : CVT_ROUND<froundeven, CvtRNI, CvtRNI_FTZ>;
defm : CVT_ROUND<ftrunc,     CvtRZI, CvtRZI_FTZ>;

// nearbyint and rint are implemented as rounding to nearest even. This isn't
// strictly correct, because it causes us to ignore the rounding mode. But it
// matches what CUDA's "libm" does.
defm : CVT_ROUND<fnearbyint, CvtRNI, CvtRNI_FTZ>;
defm : CVT_ROUND<frint,      CvtRNI, CvtRNI_FTZ>;
//-----------------------------------
// Control-flow
//-----------------------------------

// All of these end a basic block. The two conditional branches are marked as
// branches just like GOTO, and the terminator block is closed after GOTO.
let isTerminator=1 in {
  let isReturn=1, isBarrier=1 in
    def Return : NVPTXInst<(outs), (ins), "ret;", [(retglue)]>;

  // Branch to $target when predicate $a is true.
  let isBranch=1 in
    def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                            "@$a bra \t$target;",
                            [(brcond Int1Regs:$a, bb:$target)]>;

  // Branch to $target when predicate $a is false ("@!$a"); it has no pattern
  // of its own and is only selected through explicit Pat records.
  let isBranch=1 in
    def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                                 "@!$a bra \t$target;", []>;

  // Unconditional branch.
  let isBranch=1, isBarrier=1 in
    def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
                         "bra.uni \t$target;", [(br bb:$target)]>;
}
// A branch on an i32 condition first compares the value against zero to get
// a predicate, then uses the ordinary conditional branch.
def : Pat<(brcond (i32 Int32Regs:$a), bb:$target),
          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;

// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
// conditional branch if the target block is the next block so that the code
// can fall through to the target block. The inversion is done by 'xor
// condition, 1', which will be translated to (setne condition, -1). Since ptx
// supports '@!pred bra target', we should use it.
def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
          (CBranchOther Int1Regs:$a, bb:$target)>;
// Type profiles for the call-sequence markers: both take two i32 operands.
// The start profile's constraint list is completed with the second i32,
// matching the two operands the end profile already declares.
def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
                                            SDTCisVT<1, i32>]>;
def SDT_NVPTXCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
// DAG nodes marking the start and end of a call sequence. Both are chained,
// produce glue and are flagged as having side effects; the end marker may
// also consume glue.
// NOTE(review): the property list of callseq_end was cut off after
// SDNPOutGlue; SDNPSideEffect is restored to mirror callseq_start -- confirm.
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
                           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                            SDNPSideEffect]>;
// Call node: no results, one i32 operand; chained, with optional incoming
// glue and outgoing glue.
def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def call          : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def calltarget : Operand<i32>;

// The instruction is flagged as a call so target-independent passes treat it
// as one.
let isCall=1 in {
  def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
}

// Calls to global addresses and to external symbols both select CALL.
def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
// Pseudo instructions.
// Thin wrapper that forwards all four arguments unchanged to NVPTXInst.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
  : NVPTXInst<outs, ins, asmstr, pattern>;
// Call-sequence markers. The start marker opens a PTX scope ("{") and
// declares temp_param_reg inside it; the end marker closes the scope ("}").
// NOTE(review): the `def` headers were missing; the record names
// Callseq_Start / Callseq_End are restored from the upstream file -- confirm
// they match what the C++ side references.
def Callseq_Start :
  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
            "\\{ // callseq $amt1, $amt2\n"
            "\t.reg .b32 temp_param_reg;",
            [(callseq_start timm:$amt1, timm:$amt2)]>;
def Callseq_End :
  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
            "\\} // callseq $amt1",
            [(callseq_end timm:$amt1, timm:$amt2)]>;
// `trap` is emitted together with an `exit` so that ptxas understands that
// control does not continue past the trap. This won't be necessary in a
// future version of ptxas.
def trapinst : NVPTXInst<(outs), (ins),
                         "trap; exit;",
                         [(trap)]>;
// Call prototype wrapper
// The node takes one integer operand; the instruction prints that operand
// through the printProtoIdent print method. The SDNode is given the name
// CallPrototype, which the CALL_PROTOTYPE pattern refers to, and the
// ProtoIdent operand body is closed.
def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def CallPrototype :
  SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def ProtoIdent : Operand<i32> {
  let PrintMethod = "printProtoIdent";
}
def CALL_PROTOTYPE :
  NVPTXInst<(outs), (ins ProtoIdent:$ident),
            "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
3716 include "NVPTXIntrinsics.td"
3719 //-----------------------------------
3721 //-----------------------------------
// BSWAP is currently expanded. The following would be more efficient:
// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit registers
// - for sm_20, use prmt (use vector scalar mov to get the pack and
3725 // unpack). sm_20 supports native 32-bit register, but not native 16-bit